def __init__(self, dimensions=None, gensim_obj=None, source=None):
    # type: (int, gensim.Word2VecKeyedVectors, Path) -> None
    """Initialize a word embedding.

    At least one of dimensions and gensim_obj must be provided.
    If both are used, dimensions is ignored.

    Parameters:
        dimensions (int): The number of dimensions of the embedding.
        gensim_obj (gensim.Word2VecKeyedVectors): A gensim word embedding
            or related model.
        source (Path): The path of the source file.

    Raises:
        ValueError:
            If neither dimensions nor gensim_obj is provided.
            If dimensions is not a positive integer.
            If the word vectors in the gensim_obj cannot be determined.
    """
    if dimensions is None and gensim_obj is None:
        raise ValueError(
            'one of dimensions or gensim_obj must be provided')
    if gensim_obj is None:
        # BUG FIX: the original test (`not isinstance(dimensions, int)
        # and dimensions > 0`) rejected positive floats but silently
        # accepted zero/negative ints; the intended check is "not a
        # positive integer".
        if not isinstance(dimensions, int) or dimensions <= 0:
            raise ValueError('dimensions must be a positive integer')
        self.keyed_vectors = Word2VecKeyedVectors(dimensions)
    elif isinstance(gensim_obj, WordEmbeddingsKeyedVectors):
        # defensive check: the vectors must be serializable later
        if not hasattr(gensim_obj, 'save_word2vec_format'):
            raise ValueError(
                f'gensim_obj {type(gensim_obj)} does not have attribute "save_word2vec_format"'
            )
        self.keyed_vectors = gensim_obj
    elif isinstance(gensim_obj, BaseWordEmbeddingsModel):
        # full models (e.g. Word2Vec) expose their vectors via `wv`
        if not hasattr(gensim_obj, 'wv'):
            raise ValueError(
                f'gensim_obj {type(gensim_obj)} does not have attribute "wv"'
            )
        self.keyed_vectors = gensim_obj.wv
    else:
        raise ValueError(
            f'unable to determine word vectors in gensim object {gensim_obj}'
        )
    self.source = source
    # forcefully normalize the vectors
    self.keyed_vectors.vectors = normalize(self.keyed_vectors.vectors)
def _define_pca_bias_subspace(matrix, subspace_dimensions=1, **kwargs):
    # type: (numpy.ndarray, int, **Any) -> numpy.ndarray
    """Calculate the gender direction using PCA.

    Parameters:
        matrix (numpy.ndarray): A word embedding.
        subspace_dimensions (int): The number of principle components to
            use. Defaults to 1.
        **kwargs: Other keyword arguments.

    Returns:
        numpy.ndarray: A basis of the bias subspace.

    Raises:
        ValueError: If none of the words are in the embedding.
    """
    decomposition = PCA(n_components=subspace_dimensions)
    decomposition.fit(matrix)
    # unit-length principal components form the subspace basis
    # FIXME trim down to desired dimensions
    # (NOTE(review): n_components already limits the count — confirm the
    # FIXME is still needed)
    return normalize(decomposition.components_)
def _define_mean_bias_subspace(embedding, word_pairs, **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], **Any) -> numpy.ndarray
    """Calculate the gender direction using the Euclidean mean.

    Parameters:
        embedding (WordEmbedding): A word embedding.
        word_pairs (Iterable[Tuple[str, str]]): A list of male-female word
            pairs.
        **kwargs: Other keyword arguments.

    Returns:
        numpy.ndarray: A female->male vector.

    Raises:
        ValueError: If none of the gender pairs are in the embedding.
    """
    # one male-minus-female difference vector per in-vocabulary pair
    differences = [
        embedding[male] - embedding[female]
        for male, female in word_pairs
        if male in embedding and female in embedding
    ]
    if not differences:
        raise ValueError('embedding does not contain any gender pairs.')
    return normalize(np.mean(np.array(differences), axis=0))
def bolukbasi_debias_generalized(embedding, words, out_file, excludes=None, **kwargs):
    # type: (WordEmbedding, Iterable[str], Path, Iterable[str], **Any) -> WordEmbedding
    """Debias a word embedding using a generalized version of Bolukbasi's algorithm.

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        words (Iterable[str]): A list of words that define the bias subspace.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): A collection of words to be excluded from
            the debiasing.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    # reuse a previously-saved result if one exists
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)
    in_vocab = np.array([embedding[word] for word in words if word in embedding])
    bias_subspace = _define_pca_bias_subspace(recenter(in_vocab), **kwargs)
    # NOTE(review): the PCA helper appears to return a 2-D basis already;
    # adding another axis makes this 3-D — confirm reject() expects that.
    bias_subspace = bias_subspace[np.newaxis, :]
    if excludes is None:
        excludes = set()
    # remove the bias component from every vector, then restore any
    # excluded words to their original embeddings
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)
    # package the debiased vectors as a new embedding and persist it
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding
def bolukbasi_debias_original(embedding, word_pairs, out_file, excludes=None, mirrors=None, **kwargs):
    # type: (WordEmbedding, Iterable[Tuple[str, str]], Path, Iterable[str], Iterable[Tuple[str, str]], **Any) -> WordEmbedding
    """Debias a word embedding using Bolukbasi's original algorithm.

    Adapted from https://github.com/tolga-b/debiaswe/blob/master/debiaswe/debias.py#L19
    Commit 10277b23e187ee4bd2b6872b507163ef4198686b on 2018-04-02

    Parameters:
        embedding (WordEmbedding): The word embedding to debias.
        word_pairs (Iterable[Tuple[str, str]]): A list of word pairs that
            define the bias subspace.
        out_file (Path): The path to save the new embedding to.
        excludes (Iterable[str]): A collection of words to be excluded from
            the debiasing.
        mirrors (Iterable[Tuple[str, str]]): Specific words that should be
            equidistant.
        **kwargs: Other keyword arguments.

    Returns:
        WordEmbedding: The debiased word embedding.
    """
    # reuse a previously-saved result if one exists
    if out_file.exists():
        return WordEmbedding.load_word2vec_file(out_file)
    # define the bias subspace: recenter each in-vocabulary pair and
    # collect the recentered rows
    matrix = []
    for male_word, female_word in word_pairs:
        if male_word in embedding and female_word in embedding:
            pair = np.array([embedding[male_word], embedding[female_word]])
            matrix.extend(recenter(pair))
    # NOTE(review): define_bias_subspace is defined elsewhere in this
    # module — confirm it accepts a list of rows
    bias_subspace = define_bias_subspace(matrix, **kwargs)
    bias_subspace = _align_gender_direction(embedding, bias_subspace, word_pairs)
    bias_subspace = bias_subspace[np.newaxis, :]
    if excludes is None:
        excludes = set()
    # remove the bias component from every vector, then restore any
    # excluded words to their original embeddings
    new_vectors = reject(embedding.vectors, bias_subspace)
    for word in excludes:
        if word in embedding:
            new_vectors[embedding.index(word)] = embedding[word]
    new_vectors = normalize(new_vectors)
    # FIXME does equalizing make sense in higher dimensions?
    #new_vectors = _bolukbasi_equalize(embedding, new_vectors, bias_subspace, mirrors)
    # package the debiased vectors as a new embedding and persist it
    new_embedding = WordEmbedding.from_vectors(embedding.words, new_vectors)
    new_embedding.source = out_file
    new_embedding.save()
    return new_embedding