コード例 #1
0
ファイル: word_embedding.py プロジェクト: justinnhli/debias
    def __init__(self, dimensions=None, gensim_obj=None, source=None):
        # type: (int, gensim.Word2VecKeyedVectors, Path) -> None
        """Initialize a word embedding.

        At least one of dimensions and gensim_obj must be provided. If both are
        used, dimensions is ignored.

        Parameters:
            dimensions (int): The number of dimensions of the embedding.
            gensim_obj (gensim.Word2VecKeyedVectors): A gensim word embedding or related model.
            source (Path): The path of the source file.

        Raises:
            ValueError:
                If neither dimensions nor gensim_obj is provided.
                If dimensions is not a positive integer.
                If the word vectors in the gensim_obj cannot be determined.
        """
        if dimensions is None and gensim_obj is None:
            raise ValueError(
                'one of dimensions or gensim_obj must be provided')
        if gensim_obj is None:
            # The type check comes first and short-circuits, so a value that
            # cannot be ordered against 0 is rejected with ValueError rather
            # than escaping as a TypeError from the comparison.
            if not isinstance(dimensions, int) or dimensions <= 0:
                raise ValueError('dimensions must be a positive integer')
            self.keyed_vectors = Word2VecKeyedVectors(dimensions)
        elif isinstance(gensim_obj, WordEmbeddingsKeyedVectors):
            # keyed vectors are stored as-is, provided they can be exported
            if not hasattr(gensim_obj, 'save_word2vec_format'):
                raise ValueError(
                    f'gensim_obj {type(gensim_obj)} does not have attribute "save_word2vec_format"'
                )
            self.keyed_vectors = gensim_obj
        elif isinstance(gensim_obj, BaseWordEmbeddingsModel):
            # full models expose their keyed vectors through the "wv" attribute
            if not hasattr(gensim_obj, 'wv'):
                raise ValueError(
                    f'gensim_obj {type(gensim_obj)} does not have attribute "wv"'
                )
            self.keyed_vectors = gensim_obj.wv
        else:
            raise ValueError(
                f'unable to determine word vectors in gensim object {gensim_obj}'
            )
        self.source = source
        # forcefully normalize the vectors
        self.keyed_vectors.vectors = normalize(self.keyed_vectors.vectors)
コード例 #2
0
def _define_pca_bias_subspace(matrix, subspace_dimensions=1, **kwargs):
    # type: (numpy.ndarray, int, **Any) -> numpy.ndarray
    """Estimate a bias subspace by fitting PCA to the given vectors.

    Parameters:
        matrix (numpy.ndarray): The data the PCA is fitted on.
        subspace_dimensions (int): The number of principal components to
            request from the PCA. Defaults to 1.
        **kwargs: Other keyword arguments; accepted but not used here.

    Returns:
        numpy.ndarray: The fitted principal components after normalization.
    """
    decomposition = PCA(n_components=subspace_dimensions)
    decomposition.fit(matrix)
    principal_axes = decomposition.components_
    # FIXME trim down to desired dimensions
    return normalize(principal_axes)
コード例 #3
0
def _define_mean_bias_subspace(embedding, word_pairs, **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], **Any) -> numpy.ndarray
    """Compute the gender direction as the mean of pairwise differences.

    Each pair contributes the vector ``embedding[male] - embedding[female]``;
    pairs with either word missing from the embedding are skipped.

    Parameters:
        embedding (WordEmbedding): A word embedding.
        word_pairs (Iterable[Tuple[str, str]]): Male-female word pairs, in
            (male, female) order.
        **kwargs: Other keyword arguments; accepted but not used here.

    Returns:
        numpy.ndarray: The normalized mean of the male-minus-female
            difference vectors.

    Raises:
        ValueError: If none of the gender pairs are in the embedding.
    """
    differences = [
        embedding[male] - embedding[female]
        for male, female in word_pairs
        if male in embedding and female in embedding
    ]
    if len(differences) == 0:
        raise ValueError('embedding does not contain any gender pairs.')
    stacked = np.array(differences)
    mean_difference = np.mean(stacked, axis=0)
    return normalize(mean_difference)
コード例 #4
0
def bolukbasi_debias_generalized(embedding,
                                 words,
                                 out_file,
                                 excludes=None,
                                 **kwargs):
    # type: (WordEmbedding, Iterable[str], Path, Iterable[str], **Any) -> WordEmbedding
    """Debias an embedding with a generalized form of Bolukbasi's algorithm.

    If out_file already exists, the embedding stored there is loaded and
    returned without recomputing anything.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        words (Iterable[str]): Words that define the bias subspace; words
            missing from the embedding are ignored.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): Words whose original vectors are restored
            after the bias subspace has been rejected.
        **kwargs: Other keyword arguments, forwarded to the PCA subspace helper.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    # a previously saved result short-circuits the whole computation
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # build the bias subspace from the defining words present in the embedding
    defining_vectors = [embedding[word] for word in words if word in embedding]
    centered = recenter(np.array(defining_vectors))
    subspace = _define_pca_bias_subspace(centered, **kwargs)
    subspace = subspace[np.newaxis, :]

    # reject the subspace everywhere, then put the excluded words back
    debiased = reject(embedding.vectors, subspace)
    protected_words = set() if excludes is None else excludes
    for protected in protected_words:
        if protected in embedding:
            row = embedding.index(protected)
            debiased[row] = embedding[protected]
    debiased = normalize(debiased)

    # wrap the new vectors in an embedding and persist it to out_file
    result = WordEmbedding.from_vectors(embedding.words, debiased)
    result.source = out_file
    result.save()
    return result
コード例 #5
0
def bolukbasi_debias_original(embedding,
                              word_pairs,
                              out_file,
                              excludes=None,
                              mirrors=None,
                              **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], Path, Iterable[str], Iterable[Tuple[str, str]], **Any) -> WordEmbedding
    """Debias a word embedding using Bolukbasi's original algorithm.

    Adapted from https://github.com/tolga-b/debiaswe/blob/master/debiaswe/debias.py#L19
    Commit 10277b23e187ee4bd2b6872b507163ef4198686b on 2018-04-02

    If out_file already exists, the embedding stored there is loaded and
    returned without recomputing anything.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        word_pairs (Iterable[Tuple[str, str]]):
            Word pairs that define the bias subspace. Pairs with either word
            missing from the embedding are skipped when building the subspace.
        out_file (Path):
            The path to save the new embedding to.
        excludes (Iterable[str]):
            A collection of words to be excluded from the debiasing
        mirrors (Iterable[Tuple[str, str]]):
            Specific words that should be equidistant. Currently unused,
            because the equalization step is disabled.
        **kwargs: Other keyword arguments, forwarded to define_bias_subspace.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)

    # word_pairs is read here and handed to _align_gender_direction below;
    # materialize it once so a single-pass iterable (e.g. a generator) is not
    # already exhausted by the time it is used again.
    word_pairs = list(word_pairs)

    # define the bias subspace from the individually recentered pairs
    matrix = []
    for male_word, female_word in word_pairs:
        if male_word not in embedding or female_word not in embedding:
            continue
        matrix.extend(
            recenter(np.array([embedding[male_word], embedding[female_word]])))

    bias_subspace = define_bias_subspace(matrix, **kwargs)
    bias_subspace = _align_gender_direction(embedding, bias_subspace,
                                            word_pairs)
    bias_subspace = bias_subspace[np.newaxis, :]

    # debias by rejecting the subspace and reverting the excluded words
    if excludes is None:
        excludes = set()
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)

    # FIXME does equalizing make sense in higher dimensions?
    #new_vectors = _bolukbasi_equalize(embedding, new_vectors, bias_subspace, mirrors)

    # create a word embedding from the new vectors
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding