Example #1
import numpy as np
import we
from scipy import stats


def eval_w2v_wordsim999():
    """Score fastText vectors against SimLex-999 similarity judgements."""
    fasttext = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/fastText-crawl-300d-2M.vec')
    simlex = get_simlex999('D://Codes/NSE/data/output/vocab_unigram.txt')
    results = []

    for w1, w2, gold_score in simlex:
        if w1 not in fasttext.words or w2 not in fasttext.words:
            print("Can't find", w1, w2)
            continue

        w1v = fasttext.vecs[fasttext.index[w1]]
        w2v = fasttext.vecs[fasttext.index[w2]]
        # Dot product of the two vectors is the predicted similarity.
        results.append([w1, w2, gold_score, np.dot(w1v, w2v)])

    gold = [r[2] for r in results]
    pred = [r[3] for r in results]
    print(stats.pearsonr(gold, pred)[0], stats.spearmanr(gold, pred).correlation)
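Example #1 calls a `get_simlex999` helper that is not shown. A minimal sketch of what it plausibly returns, assuming the standard tab-separated SimLex-999 file (the file name and column positions here are assumptions, not taken from the original):

def get_simlex999(vocab_path, simlex_path='SimLex-999.txt'):
    """Hypothetical helper: return [word1, word2, gold_score] triples
    restricted to words listed in the vocabulary file."""
    with open(vocab_path) as f:
        vocab = set(line.strip() for line in f)
    pairs = []
    with open(simlex_path) as f:
        next(f)  # skip the header row
        for line in f:
            cols = line.rstrip('\n').split('\t')
            # In the standard file, column 3 holds the SimLex999 score.
            w1, w2, score = cols[0], cols[1], float(cols[3])
            if w1 in vocab and w2 in vocab:
                pairs.append([w1, w2, score])
    return pairs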
Example #2
import numpy as np
import we
from scipy import stats


def eval_w2v_wordsim999():
    """Same evaluation as Example #1, but for the SSWE-r embeddings."""
    sswe = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/sswe-r.txt')
    simlex = get_simlex999('D://Codes/NSE/data/output/vocab_unigram.txt')
    results = []

    for w1, w2, gold_score in simlex:
        if w1 not in sswe.words or w2 not in sswe.words:
            print("Can't find", w1, w2)
            continue

        w1v = sswe.vecs[sswe.index[w1]]
        w2v = sswe.vecs[sswe.index[w2]]
        results.append([w1, w2, gold_score, np.dot(w1v, w2v)])

    gold = [r[2] for r in results]
    pred = [r[3] for r in results]
    print(stats.pearsonr(gold, pred)[0], stats.spearmanr(gold, pred).correlation)

    # Print the pairs ordered by predicted similarity for manual inspection.
    for row in sorted(results, key=lambda x: x[3]):
        print(row)
Example #3
import we
from sklearn.decomposition import PCA


def eval_w2v_principal_component():
    """PCA over centered pair offsets to expose a dominant direction."""
    sswe = we.WordEmbedding('D://Codes/NSE/data/raw/embeddings/sswe-r.txt')
    with open('D://Codes/NSE/data/used/seeds/wordpairs-greater-than-0.5') as f:
        wordpair = [line.strip().split() for line in f]
    sub_words_p = [line[0] for line in wordpair]
    sub_words_n = [line[1] for line in wordpair]
    matrix = []

    for w1, w2 in zip(sub_words_p, sub_words_n):
        if w1 not in sswe.words or w2 not in sswe.words:
            print("Can't find", w1, w2)
            continue
        w1v = sswe.vecs[sswe.index[w1]]
        w2v = sswe.vecs[sswe.index[w2]]
        # Center each pair and keep both offsets from the midpoint.
        center = (w1v + w2v) / 2
        matrix.append(w1v - center)
        matrix.append(w2v - center)

    pca = PCA(n_components=10)
    pca.fit(matrix)
    print(pca.explained_variance_ratio_)
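If the first component captures most of the variance printed above, the word pairs share one strong common direction. A quick way to eyeball this, assuming matplotlib is available (it is not used in the original snippet), run right after `pca.fit`:

import matplotlib.pyplot as plt

# Bar chart of the variance captured by each of the 10 components.
plt.bar(range(10), pca.explained_variance_ratio_)
plt.xlabel('principal component')
plt.ylabel('explained variance ratio')
plt.show()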
Example #4
import we


def run():
    """Keep sentiment words that exist in fastText and dump their vectors."""
    new_words = []
    matrix = []
    coll = get_sentiwords()
    fasttext = we.WordEmbedding(
        'D://Codes/NSE/data/raw/embeddings/fastText-crawl-300d-2M.vec')

    for i in coll:
        word = i[0]
        if word not in fasttext.words:
            # print(word, 'cannot be found in fasttext')
            continue
        if word not in new_words:  # keep the first occurrence only
            new_words.append(word)
            matrix.append(fasttext.vecs[fasttext.index[word]])

    with open('D://Codes/NSE/data/used/embeddings/word-picked', 'w') as f1:
        f1.write("\n".join(new_words))
    with open('D://Codes/NSE/data/used/embeddings/word-picked.vec', 'w') as f2:
        f2.write("\n".join(
            w + " " + " ".join(str(x) for x in v)
            for w, v in zip(new_words, matrix)))
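Like Example #1, this relies on an unshown helper, `get_sentiwords`; the loop only requires that each item's first element is a word. One possible shape, sketched purely as an assumption (the path and score column are invented for illustration):

def get_sentiwords(path='D://Codes/NSE/data/raw/sentiwords.txt'):
    """Hypothetical helper: (word, score) tuples from a
    whitespace-separated sentiment lexicon."""
    entries = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                entries.append((parts[0], float(parts[1])))
    return entries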
Example #5
import we
import numpy as np

picked = we.WordEmbedding('D://Codes/NSE/data/used/embeddings/word-picked.vec')
Example #6
import argparse
import random

import we
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--f', default='embeddings/glove', type=str)
    parser.add_argument('--o', default='my_emb', type=str)
    parser.add_argument('--dim', default=300, type=int)

    args = parser.parse_args()

    if args.o == 'my_emb':
        output_path = f'my_emb_{random.randint(1,10000)}.txt'
    else:
        output_path = args.o

    print(f"Converting: {args.f}")

    E = we.WordEmbedding(args.f)
    lines = [f'{len(E.words)} {args.dim}']
    for word in tqdm(E.words):
        vector = ' '.join(str(v) for v in E.v(word))
        word = ' '.join(word.split())  # collapse any whitespace inside the token
        line = word + ' ' + vector
        lines.append(line)

    with open(output_path, 'w') as handle:
        handle.write('\n'.join(lines))

    print(f"File saved in {output_path}")
Example #7
                        idb is calculated using the original debiased embedding, therefore you need to provide it here."
    )
    parser.add_argument(
        '--dictf',
        default="../data/glove_ns.pkl",
        type=str,
        help="pkl file for neighbour dict (see `make_neighbours.py` for this)")
    parser.add_argument(
        '--bias_list',
        default="../data/debias.pkl",
        type=str,
        help="pkl file for list of words to debias (V_d, debias set in paper)."
    )
    args = parser.parse_args()

    E_orig = we.WordEmbedding(args.orig_emb)
    X = we.WordEmbedding(args.f)
    g = utils.get_g(E_orig)

    biased_words = get_words(args.bias_list)

    vals, wts = gipe(biased_words, X, E_orig, g, args.dictf, args.idb_thresh,
                     args.n)
    gipe_score = score(vals, wts)
    print(gipe_score)

    fname = "gipe-results.txt"
    with open(fname, 'a+') as handle:
        handle.write(
            "=========================================================\n")
Example #8
    parser.add_argument(
        "gendered_words_filename",
        help="File containing words not to neutralize (one per line)")
    parser.add_argument("equalize_filename", help="???.bin")
    parser.add_argument("debiased_filename", help="???.bin")

    args = parser.parse_args()
    print(args)

    with open(args.definitional_filename, "r") as f:
        defs = json.load(f)

    with open(args.equalize_filename, "r") as f:
        equalize_pairs = json.load(f)

    with open(args.gendered_words_filename, "r") as f:
        gender_specific_words = json.load(f)

    E = we.WordEmbedding(args.embedding_filename)

    print("Debiasing...")
    debias(E, gender_specific_words, defs, equalize_pairs)

    print("Saving to file...")
    if args.embedding_filename[-4:] == args.debiased_filename[-4:] == ".bin":
        E.save_w2v(args.debiased_filename)
    else:
        E.save(args.debiased_filename)

    print("\n\nDone!\n")
Example #9
import torch

import utils
import we


def get_embedding_and_g(filename):
    """Load a word embedding and the bias direction g as a torch tensor."""
    E = we.WordEmbedding(filename)
    # utils.get_g takes the embedding, as in Example #7's utils.get_g(E_orig).
    g = utils.get_g(E)
    g = torch.Tensor(g)
    return E, g
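A hypothetical usage of the helper (the embedding path and probe word are invented for illustration): project a word vector onto g to measure its component along the bias direction.

E, g = get_embedding_and_g('embeddings/glove')
v = torch.Tensor(E.v('doctor'))
# Scalar projection of the word vector onto the direction g.
print((torch.dot(v, g) / torch.norm(g)).item())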
Example #10

import argparse
import copy
import glob
import os

import we
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--emb_file', default="../embeddings/glove", type=str)
    parser.add_argument('--dict_dir', default="output", type=str)
    parser.add_argument('--out_dir', default="finals", type=str)
    parser.add_argument('--out_fname', default="RAN-GloVe", type=str)
    args = parser.parse_args()

    emb_file = args.emb_file
    dict_dir = args.dict_dir
    out_fname = args.out_fname
    out_dir = args.out_dir

    E = we.WordEmbedding(emb_file)

    fnames = glob.glob(f'{dict_dir}/*.dict.pickle')

    D = {}
    for fname in fnames:
        print(f"Reading {fname}...")
        d = read(fname)  # `read` is a helper defined elsewhere; it loads the pickled dict
        for k in d:
            D[k] = d[k]

    new_embs = copy.deepcopy(E.vecs)
    for w in tqdm(D):
        new_embs[E.index[w]] = D[w]

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
Example #11
# Hyperparameters and other configuration
EMB = 'data/glove'  # Path to the embedding in vector-vocab format; see we.py for how this format works.
words_list_file = conf.get(
    'WORDS_LIST_FILE'
)  # Pickle file with the list of words to debias (small list from Bolukbasi et al.)
bs = conf.getint('BS')
nz = conf.getint('NZ')
lrd = conf.getfloat('LRD')
lrg = conf.getfloat('LRG')
beta1 = conf.getfloat('BETA1')
epochs = conf.getint('EPOCHS')
PATH_D = conf.get('PATH_D')
PATH_G = conf.get('PATH_G')

E = we.WordEmbedding(EMB)  # Load embedding
g = gender_subspace_simple(E)  # Get gender direction

with open(words_list_file, 'rb') as handle:
    words = pickle.load(handle)  # Load word list to debias

words = clean_vocab(words, E)  # Clean word list

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# init dataset and dataloader
train_dataset = gender_set(words, E, g, device)
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)

netG = Generator(nz).to(device)
netD = SexistDiscriminator().to(device)
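The snippet ends before the training loop. Given the hyperparameters read above (lrd, lrg, beta1), a plausible next step is a pair of Adam optimizers for the two networks, sketched here as an assumption rather than the original code:

import torch.optim as optim

# Assumed continuation: one optimizer per network, GAN-style.
optimizerD = optim.Adam(netD.parameters(), lr=lrd, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lrg, betas=(beta1, 0.999))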