def eval_w2v_wordsim999():
    # Evaluate fastText vectors on SimLex-999: score each pair by the dot
    # product of its word vectors, then report the Pearson and Spearman
    # correlations against the gold similarity ratings.
    fasttext = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/fastText-crawl-300d-2M.vec')
    simlex = get_simlex999('D://Codes/NSE/data/output/vocab_unigram.txt')
    results = []
    for pair in simlex:
        w1, w2 = pair[0], pair[1]
        if w1 not in fasttext.words or w2 not in fasttext.words:
            print("Can't find", w1, w2)
            continue
        w1v = fasttext.vecs[fasttext.index[w1]]
        w2v = fasttext.vecs[fasttext.index[w2]]
        results.append([w1, w2, pair[2], np.dot(w1v, w2v)])
    gold = [i[2] for i in results]
    pred = [i[3] for i in results]
    print(stats.pearsonr(gold, pred)[0], stats.spearmanr(gold, pred).correlation)
def eval_sswe_wordsim999():
    # Same SimLex-999 evaluation, but for the SSWE-r embedding; also prints
    # every pair sorted by predicted score for manual inspection.
    sswe = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/sswe-r.txt')
    simlex = get_simlex999('D://Codes/NSE/data/output/vocab_unigram.txt')
    results = []
    for pair in simlex:
        w1, w2 = pair[0], pair[1]
        if w1 not in sswe.words or w2 not in sswe.words:
            print("Can't find", w1, w2)
            continue
        w1v = sswe.vecs[sswe.index[w1]]
        w2v = sswe.vecs[sswe.index[w2]]
        results.append([w1, w2, pair[2], np.dot(w1v, w2v)])
    gold = [i[2] for i in results]
    pred = [i[3] for i in results]
    print(stats.pearsonr(gold, pred)[0], stats.spearmanr(gold, pred).correlation)
    for i in sorted(results, key=lambda x: x[3]):
        print(i)
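# A minimal, self-contained check of the correlation reporting used in the
# two evaluation functions above, with made-up gold ratings and dot-product
# scores (no embedding files needed). Only numpy and scipy.stats are
# assumed, as in the code above.
import numpy as np
from scipy import stats

gold = [9.5, 7.2, 3.1, 1.0]                      # gold similarity ratings
rng = np.random.default_rng(0)
vecs = [rng.normal(size=300) for _ in range(8)]  # stand-ins for word vectors
pred = [np.dot(vecs[2 * i], vecs[2 * i + 1]) for i in range(4)]
print(stats.pearsonr(gold, pred)[0],
      stats.spearmanr(gold, pred).correlation)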
def eval_w2v_principal_component():
    # PCA over sentiment word pairs: for each (positive, negative) pair,
    # take both words' offsets from the pair's midpoint and check how much
    # variance the leading principal components explain.
    sswe = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/sswe-r.txt')
    with open('D://Codes/NSE/data/used/seeds/wordpairs-greater-than-0.5') as f:
        wordpair = [line.strip().split() for line in f]
    sub_words_p = [line[0] for line in wordpair]
    sub_words_n = [line[1] for line in wordpair]
    matrix = []
    for w1, w2 in zip(sub_words_p, sub_words_n):
        if w1 not in sswe.words or w2 not in sswe.words:
            print("Can't find", w1, w2)
            continue
        w1v = sswe.vecs[sswe.index[w1]]
        w2v = sswe.vecs[sswe.index[w2]]
        center = (w1v + w2v) / 2
        matrix.append(w1v - center)
        matrix.append(w2v - center)
    pca = PCA(n_components=10)
    pca.fit(matrix)
    print(pca.explained_variance_ratio_)
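# A small illustration of the pair-midpoint PCA above, on synthetic data:
# pairs that differ mostly along one shared axis should give a first
# component that dominates explained_variance_ratio_. Uses only numpy and
# scikit-learn; the vectors and the "sentiment" axis here are made up.
import numpy as np
from sklearn.decomposition import PCA

def _demo_pair_pca(dim=50, n_pairs=20, seed=0):
    rng = np.random.default_rng(seed)
    axis = rng.normal(size=dim)
    axis /= np.linalg.norm(axis)                 # shared "sentiment" axis
    offsets = []
    for _ in range(n_pairs):
        base = rng.normal(size=dim)              # pair-specific content
        pos = base + axis + 0.1 * rng.normal(size=dim)
        neg = base - axis + 0.1 * rng.normal(size=dim)
        center = (pos + neg) / 2
        offsets.extend([pos - center, neg - center])
    pca = PCA(n_components=10)
    pca.fit(np.array(offsets))
    print(pca.explained_variance_ratio_)         # first ratio should dominate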
def run():
    # Collect every sentiment-lexicon word that exists in fastText, then
    # dump the word list and the matching vectors to disk.
    new_words = []
    matrix = []
    coll = get_sentiwords()
    fasttext = we.WordEmbedding(
        'D://Codes/NSE/data/raw/embeddings/fastText-crawl-300d-2M.vec')
    for i in coll:
        word = i[0]
        if word not in fasttext.words:
            # print(word, 'can not be found in fasttext')
            continue
        if word not in new_words:
            new_words.append(word)
            matrix.append(fasttext.vecs[fasttext.index[word]])
    with open('D://Codes/NSE/data/used/embeddings/word-picked', 'w') as f1:
        f1.writelines("\n".join(new_words))
    with open('D://Codes/NSE/data/used/embeddings/word-picked.vec', 'w') as f2:
        f2.write("\n".join(
            w + " " + " ".join(str(x) for x in v)
            for w, v in zip(new_words, matrix)))
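# get_sentiwords() is not defined in this excerpt; a minimal sketch of what
# it plausibly returns, assuming a tab-separated lexicon of (word, score)
# rows. The path and column layout here are illustrative, not the project's
# actual lexicon format.
def get_sentiwords(path='D://Codes/NSE/data/raw/sentiwords.tsv'):
    rows = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split('\t')
            if parts:
                rows.append(parts)   # run() only uses parts[0], the word
    return rows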
import we
import numpy as np

# Reload the vectors written out by run() above.
picked = we.WordEmbedding('D://Codes/NSE/data/used/embeddings/word-picked.vec')
import argparse
import random

import we
from tqdm import tqdm

if __name__ == "__main__":
    # Convert an embedding into word2vec text format:
    # a "<vocab_size> <dim>" header followed by "word v1 v2 ..." lines.
    parser = argparse.ArgumentParser()
    parser.add_argument('--f', default='embeddings/glove', type=str)
    parser.add_argument('--o', default='my_emb', type=str)
    parser.add_argument('--dim', default=300, type=int)
    args = parser.parse_args()

    if args.o == 'my_emb':
        output_path = f'my_emb_{random.randint(1, 10000)}.txt'
    else:
        output_path = args.o

    print(f"Converting: {args.f}")
    E = we.WordEmbedding(args.f)
    lines = [f'{len(E.words)} {args.dim}']
    for word in tqdm(E.words):
        vector = ' '.join(str(v) for v in E.v(word))
        word = ' '.join(word.split())   # collapse any whitespace inside the token
        lines.append(word + ' ' + vector)
    with open(output_path, 'w') as handle:
        handle.write('\n'.join(lines))
    print(f"File saved in {output_path}")
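# The output above is standard word2vec text format ("<count> <dim>" header,
# then "word v1 v2 ..."), so it can be read back with gensim if that library
# is available. A usage sketch; 'my_emb_42.txt' and 'good' are placeholders
# for whatever output_path the script printed and a word in its vocabulary.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('my_emb_42.txt', binary=False)
print(kv.most_similar('good', topn=5))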
parser.add_argument(
    '--orig_emb',
    type=str,
    help="Original (non-debiased) embedding; idb is calculated using the "
         "original and the debiased embedding, therefore you need to "
         "provide it here.")
parser.add_argument(
    '--dictf',
    default="../data/glove_ns.pkl",
    type=str,
    help="pkl file for neighbour dict (see `make_neighbours.py` for this)")
parser.add_argument(
    '--bias_list',
    default="../data/debias.pkl",
    type=str,
    help="pkl file for list of words to debias (V_d, debias set in paper).")
args = parser.parse_args()

E_orig = we.WordEmbedding(args.orig_emb)
X = we.WordEmbedding(args.f)
g = utils.get_g(E_orig)
biased_words = get_words(args.bias_list)
vals, wts = gipe(biased_words, X, E_orig, g, args.dictf,
                 args.idb_thresh, args.n)
gipe_score = score(vals, wts)
print(gipe_score)

fname = "gipe-results.txt"
with open(fname, 'a+') as handle:
    handle.write(
        "=========================================================\n")
    # Completion assumed: record which embedding was scored and its value.
    handle.write(f"{args.f}: GIPE = {gipe_score}\n")
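# score() is defined elsewhere in the repo; a hypothetical sketch of the
# kind of aggregation it might perform (a weight-normalised average of the
# per-word values returned by gipe()). This is an assumption, not the
# repository's actual definition.
import numpy as np

def score(vals, wts):
    vals = np.asarray(vals, dtype=float)
    wts = np.asarray(wts, dtype=float)
    return float(np.sum(vals * wts) / np.sum(wts))  # weighted mean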
parser.add_argument(
    "gendered_words_filename",
    help="File containing words not to neutralize (one per line)")
parser.add_argument("equalize_filename", help="???.bin")
parser.add_argument("debiased_filename", help="???.bin")
args = parser.parse_args()
print(args)

with open(args.definitional_filename, "r") as f:
    defs = json.load(f)
with open(args.equalize_filename, "r") as f:
    equalize_pairs = json.load(f)
with open(args.gendered_words_filename, "r") as f:
    gender_specific_words = json.load(f)

E = we.WordEmbedding(args.embedding_filename)

print("Debiasing...")
debias(E, gender_specific_words, defs, equalize_pairs)

print("Saving to file...")
if args.embedding_filename[-4:] == args.debiased_filename[-4:] == ".bin":
    E.save_w2v(args.debiased_filename)
else:
    E.save(args.debiased_filename)

print("\n\nDone!\n")
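# Example invocation; the add_argument calls for the embedding and
# definitional-pairs files fall outside this excerpt, so their position as
# the first two positional arguments is an assumption, and all paths are
# placeholders:
#
#   python debias.py embeddings/w2v_small.txt definitional_pairs.json \
#       gender_specific_words.json equalize_pairs.json debiased_out.txt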
def get_embedding_and_g(filename):
    # Load an embedding and its gender direction g as a torch tensor.
    E = we.WordEmbedding(filename)
    g = utils.get_g(E)
    g = torch.Tensor(g)
    return E, g
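# A usage sketch for the helper above: load an embedding and rank words by
# the magnitude of their component along g. The path is a placeholder.
import torch

E, g = get_embedding_and_g("../embeddings/glove")
vecs = torch.Tensor(E.vecs)
proj = vecs @ g / torch.norm(g)          # signed component along g, per word
top = torch.argsort(proj.abs(), descending=True)[:10]
print([E.words[i] for i in top])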
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--emb_file', default="../embeddings/glove", type=str) parser.add_argument('--dict_dir', default="output", type=str) parser.add_argument('--out_dir', default="finals", type=str) parser.add_argument('--out_fname', default="RAN-GloVe", type=str) args = parser.parse_args() emb_file = args.emb_file dict_dir = args.dict_dir out_fname = args.out_fname out_dir = args.out_dir E = we.WordEmbedding(emb_file) fnames = glob.glob(f'{dict_dir}/*.dict.pickle') D = {} for fname in fnames: print(f"Reading {fname}...") d = read(fname) for k in d: D[k] = d[k] new_embs = copy.deepcopy(E.vecs) for w in tqdm(D): new_embs[E.index[w]] = D[w] if not os.path.exists(out_dir):
# Hyperparameters and other config
EMB = 'data/glove'  # path to embedding in vector-vocab format; see we.py for how this format works
words_list_file = conf.get('WORDS_LIST_FILE')  # pickle file with the list of words to debias (small list from Bolukbasi et al.)
bs = conf.getint('BS')
nz = conf.getint('NZ')
lrd = conf.getfloat('LRD')
lrg = conf.getfloat('LRG')
beta1 = conf.getfloat('BETA1')
epochs = conf.getint('EPOCHS')
PATH_D = conf.get('PATH_D')
PATH_G = conf.get('PATH_G')

E = we.WordEmbedding(EMB)      # load embedding
g = gender_subspace_simple(E)  # get gender direction
with open(words_list_file, 'rb') as handle:
    words = pickle.load(handle)  # load word list to debias
words = clean_vocab(words, E)    # clean word list (see sketch below)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# init dataset and dataloader
train_dataset = gender_set(words, E, g, device)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)

netG = Generator(nz).to(device)
netD = SexistDiscriminator().to(device)
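# clean_vocab() is defined elsewhere; a minimal sketch under the assumption
# that it simply drops words absent from the embedding's vocabulary:
def clean_vocab(words, E):
    return [w for w in words if w in E.index]   # keep only in-vocabulary words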