Example #1
def create_representation(args):
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])
    
    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        else:
            return PositiveExplicit(path, True, neg)
        
    elif rep_type == 'SVD':
        if w_c:
            return EnsembleEmbedding(SVDEmbedding(path, False, eig, False), SVDEmbedding(path, False, eig, True), True)
        else:
            return SVDEmbedding(path, True, eig)
    elif rep_type == 'GLOVE':
        return GLOVEEmbedding(path, True)        
    else:
        if w_c:
            return EnsembleEmbedding(Embedding(path + '.words', False), Embedding(path + '.contexts', False), True)
        else:
            return Embedding(path + '.words', True)
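For orientation, the dictionary read above is a docopt-style arguments dict; a minimal sketch of such a dict and the resulting call (the values here are illustrative, not from the original script):

# Hypothetical docopt-style args dict, using only the keys create_representation reads above.
args = {
    '<representation>': 'SVD',
    '<representation_path>': 'vectors/1990',
    '--neg': '1',
    '--w+c': False,
    '--eig': '0.5',
}
rep = create_representation(args)  # with these values, returns SVDEmbedding('vectors/1990', True, 0.5)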
Example #2
def intersection_align(embed1, embed2, post_normalize=True):
    """ 
        Get the intersection of two embeddings.
        Returns embeddings with common vocabulary and indices.
    """
    common_vocab = list(filter(set(embed1.iw).__contains__, embed2.iw))
    newvecs1 = np.empty((len(common_vocab), embed1.m.shape[1]))
    newvecs2 = np.empty((len(common_vocab), embed2.m.shape[1]))
    for i in range(len(common_vocab)):
        newvecs1[i] = embed1.m[embed1.wi[common_vocab[i]]]
        newvecs2[i] = embed2.m[embed2.wi[common_vocab[i]]]
    return Embedding(newvecs1, common_vocab,
                     normalize=post_normalize), Embedding(
                         newvecs2, common_vocab, normalize=post_normalize)
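A small usage sketch (emb_a and emb_b are assumed to be Embedding objects loaded elsewhere): after the call, the two returned embeddings share the same vocabulary in the same row order, which is what the Procrustes routines below rely on.

# Sketch under the assumption that emb_a and emb_b were loaded elsewhere.
emb_a_common, emb_b_common = intersection_align(emb_a, emb_b, post_normalize=False)
assert emb_a_common.iw == emb_b_common.iw  # identical vocabulary and ordering
assert emb_a_common.m.shape[0] == len(emb_a_common.iw)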
Example #3
def align_cloud(year, rep_type, main_dir, num, dim, wordlist, **rep_args):
    print "Aligning cloud year:", year
    avg_embed_mat = np.zeros((len(wordlist), dim))
    for i in range(1, num + 1):  # Iterates through the embeddings
        print i
        finname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(
            dim) + "/" + str(year)
        foutname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(
            dim) + "/aligned/" + str(year)
        other_embed = create_representation(
            rep_type, finname, **rep_args)  # Loads the individual embedding
        keep_indices = [other_embed.wi[word] for word in wordlist]
        other_embed = Embedding(
            other_embed.m[keep_indices, :], wordlist,
            normalize=False)  # Synchronize the order of words
        if i == 1:
            base_embed = other_embed
            ortho = np.eye(dim)
        else:
            ortho = alignment.get_procrustes_mat(base_embed, other_embed)
        aligned_embed_mat = (other_embed.m).dot(
            ortho)  # Rotates the embedding to the reference
        avg_embed_mat += aligned_embed_mat / num  # Accumulates the average embedding
        np.save(foutname + "-w.npy", aligned_embed_mat)
        write_pickle(other_embed.iw, foutname + "-vocab.pkl")
    foutname = main_dir + "/embedding_avg/" + str(year)
    np.save(foutname + "-w.npy", avg_embed_mat)
    write_pickle(base_embed.iw, foutname + "-vocab.pkl")
Example #4
def align_years(years, rep_type, main_dir, num, dim, **rep_args):
    print "Aligning years to each other"
    first_iter = True
    base_embed = None
    for year in years:  # Iterates through years
        print year
        year_embed = create_representation(
            rep_type, main_dir + "/embedding_avg/" + str(year),
            **rep_args)  # Loads the individual embedding
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            ortho = alignment.get_procrustes_mat(base_embed, year_embed)
            aligned_embed = Embedding(
                (year_embed.m).dot(ortho), year_embed.iw,
                normalize=False)  # Rotates to the previous year embedding
            for i in range(1, num + 1):  # Align all the embeddings the same way as the average
                finname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(
                    dim) + "/aligned/" + str(year)
                foutname = main_dir + "/embedding_" + str(
                    i) + "/noinit/" + str(dim) + "/aligned/" + str(year)
                mat = np.load(finname + "-w.npy")
                mat = mat.dot(ortho)
                np.save(foutname + "-w.npy", mat)
        base_embed = aligned_embed
        foutname = main_dir + "/embedding_avg/aligned/" + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #5
def linear_align(base_embed, other_embed):
    """
        Align other embedding to base embedding using best linear transform.
        NOTE: Assumes indices are aligned
    """
    basevecs = base_embed.m
    othervecs = other_embed.m
    fixedvecs = othervecs.dot(np.linalg.pinv(othervecs)).dot(basevecs)
    return Embedding(fixedvecs, other_embed.iw)
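The fixedvecs line above is the closed-form least-squares fit: with X = other_embed.m and Y = base_embed.m it computes X·X⁺·Y, i.e. X·W* where W* = X⁺·Y minimizes ||X·W − Y|| in the Frobenius norm. An equivalent formulation, shown here only as a self-contained sketch with random stand-in matrices, uses np.linalg.lstsq:

import numpy as np

# Least-squares view of the pinv line above, with random stand-ins for othervecs / basevecs.
rng = np.random.RandomState(0)
othervecs, basevecs = rng.randn(200, 50), rng.randn(200, 50)
W = np.linalg.lstsq(othervecs, basevecs)[0]       # W* = argmin_W ||othervecs.dot(W) - basevecs||_F
via_lstsq = othervecs.dot(W)
via_pinv = othervecs.dot(np.linalg.pinv(othervecs)).dot(basevecs)
assert np.allclose(via_lstsq, via_pinv)           # both give the same aligned vectors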
Example #6
def smart_procrustes_align(base_embed, other_embed, post_normalize=True):
    in_base_embed, in_other_embed = intersection_align(base_embed,
                                                       other_embed,
                                                       post_normalize=False)
    base_vecs = in_base_embed.m
    other_vecs = in_other_embed.m
    m = other_vecs.T.dot(base_vecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    return Embedding((other_embed.m).dot(ortho),
                     other_embed.iw,
                     normalize=post_normalize)
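A minimal usage sketch, assuming emb_1990 and emb_2000 are Embedding objects loaded elsewhere: the rotation is estimated on the shared vocabulary, but it is applied to all of other_embed's rows, so the result keeps emb_2000's full vocabulary in emb_1990's coordinate system.

# Sketch: align the later embedding onto the earlier one (both loaded elsewhere).
aligned_2000 = smart_procrustes_align(emb_1990, emb_2000, post_normalize=True)
assert aligned_2000.m.shape == emb_2000.m.shape  # same rows, rotated into emb_1990's space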
Example #7
def main():
    args = docopt("""
    Usage:
        sgns2text.py [options] <sgns_path> <output_path>
    
    Options:
        --w+c        Use ensemble of word and context vectors
    """)
    
    sgns_path = args['<sgns_path>']
    output_path = args['<output_path>']
    w_c = args['--w+c']
    
    if w_c:
        sgns = EnsembleEmbedding(Embedding(sgns_path + '.words', False), Embedding(sgns_path + '.contexts', False), True)
    else:
        sgns = Embedding(sgns_path + '.words', True)
    
    with open(output_path, 'w') as f:
        for i, w in enumerate(sgns.iw):
            print >>f, w, ' '.join([str(x) for x in sgns.m[i]])
Example #8
def procrustes_align(base_embed, other_embed):
    """ 
        Align other embedding to base embeddings via Procrustes.
        Returns best distance-preserving aligned version of other_embed
        NOTE: Assumes indices are aligned
    """
    basevecs = base_embed.m - base_embed.m.mean(0)
    othervecs = other_embed.m - other_embed.m.mean(0)
    m = othervecs.T.dot(basevecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)
    fixedvecs = othervecs.dot(ortho)
    return Embedding(fixedvecs, other_embed.iw)
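The rotation used above is the standard orthogonal Procrustes solution R = U·Vᵀ from the SVD of othervecsᵀ·basevecs; because R is orthogonal it preserves lengths and pairwise distances, hence "distance-preserving". A small self-contained check of that property with random stand-in matrices:

import numpy as np

# Random stand-ins for othervecs / basevecs, only to illustrate the property.
rng = np.random.RandomState(0)
A, B = rng.randn(100, 50), rng.randn(100, 50)
u, _, v = np.linalg.svd(A.T.dot(B))
ortho = u.dot(v)
assert np.allclose(ortho.dot(ortho.T), np.eye(50))   # R is orthogonal
assert np.allclose(np.linalg.norm(A.dot(ortho), axis=1),
                   np.linalg.norm(A, axis=1))         # row norms are preserved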
Example #9
 def load(cls, path, years, **kwargs):
     embeds = collections.OrderedDict()
     for year in years:
         for file in os.listdir(path):
             if(re.match(".*-w.npy",file)):
                 year_name = str(year)
                 break
             elif(re.match(".*"+str(year)+".*",file)):
                 year_name = file
                 break
             else:
                 year_name = None
         if(year_name):
             embeds[year] = Embedding.load(path + "/" + year_name, **kwargs)
         else:
             print("Couldn't load data of year"+str(year))
     return SequentialEmbedding(embeds)
Example #10
def main():
    args = docopt("""
        Usage:
            eval_reliability.py [options] <folders>...

        Options:
            --words FILE      Use FILE with list of words (1 per line) to measure reliability
            --ws FILES        Test sets for word similarity evaluation, use "," as separator!
            --ana FILES       Test sets for analogy evaluation, use "," as separator!
            --closest N       Use N closest neighbors to measure reliability [default: 10]
    """)
    folders = args["<folders>"]

    closest = int(args["--closest"])
    word_list = args["--words"]
    ws_test_sets = [read_ws_test_set(path) for path in args["--ws"].split(",")]
    as_test_sets = [
        read_as_test_set(path) for path in args["--ana"].split(",")
    ]
    as_xi_and_ix = [get_vocab_as(test_set) for test_set in as_test_sets]
    representations = []
    for folder in folders:
        representations.append(Embedding(folder + '/vec',
                                         True))  # only works for SGNS!

    words = words_to_evaluate_file(
        word_list) if word_list else words_to_evaluate(representations)

    # comparison over all subsets
    if len(representations) < 2:
        raise Exception("Need multiple models for evaluation")

    evaluated = [
        " ".join([str(evaluate_ws(r, w)) for r in representations])
        for w in ws_test_sets
    ]
    for i, test_set in enumerate(as_test_sets):
        evaluated.append(" ".join([
            str(
                evaluate_as(r, test_set, as_xi_and_ix[i][0],
                            as_xi_and_ix[i][1])) for r in representations
        ]))
    evaluated.append(reliability(representations, words, closest))
    print("\t".join(evaluated))
Example #11
def wiktionary_eval(src_vecs,
                    trg_vecs,
                    wiktionary_file,
                    b_reverse,
                    b_print=False,
                    b_include_oov=False,
                    eval_dir=None,
                    src_lang_code=None,
                    trg_lang_code=None,
                    precision_at_N=1):
    OOV = None
    INV = None

    if eval_dir is not None and src_lang_code is not None and trg_lang_code is not None:
        # These extra params are to do with outputting OOV and INV words during evaluation
        oov_filename = './' + eval_dir + '/wiktionary-eval/oov-' + src_lang_code + '-' + trg_lang_code + '.txt'
        inv_filename = './' + eval_dir + '/wiktionary-eval/inv-' + src_lang_code + '-' + trg_lang_code + '.txt'
        OOV = set()  # Use sets, so that the words get de-duped
        INV = set()

    Es = Embedding(src_vecs, True)
    Et = Embedding(trg_vecs, True)

    BX = [(l.split("|||")[-1].strip(), l.split("|||")[0].strip())
          for l in open(wiktionary_file).readlines()]
    if b_reverse:
        # Reverse the evaluation source/target
        BX = [(t, s) for s, t in BX]
        Es, Et = Et, Es

    BD = []
    for s, t in BX:
        if s in Es.wi and t in Et.wi:
            BD.append((s, t))
            if INV is not None:
                INV.add(s + ' ||| ' + t)
        else:
            if b_include_oov:
                BD.append((s, t))
            if OOV is not None:
                OOV.add(s + ' ||| ' + t)

    p1, tot = 0, 0
    for s, t in BD:
        vs = Es.represent(s)
        scores = vs.dot(Et.m.T)
        if precision_at_N == 1:
            cand = Et.iw[np.nanargmax(scores)]
            if t == cand:
                p1 += 1
        else:
            is_match = False
            indices = np.argsort(scores)[-precision_at_N:]  # positions
            for index in indices:
                cand = Et.iw[index]
                if t == cand:
                    is_match = True
                    break
            if is_match:
                p1 += 1
        tot += 1

    if b_print:
        print '{0:.4f}'.format(p1 / tot)

    if eval_dir is not None and src_lang_code is not None and trg_lang_code is not None:
        inv_f = open(inv_filename, 'w')
        for inv in INV:
            inv_f.write(inv + '\n')
        inv_f.close()

        oov_f = open(oov_filename, 'w')
        for oov in OOV:
            oov_f.write(oov + '\n')
        oov_f.close()

    return p1 / tot
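A hedged usage sketch: the function computes precision@N for a bilingual dictionary, counting a source word as correct when its gold translation is among the N nearest target vectors; the file paths below are placeholders.

# Hypothetical call; the vector files and dictionary path are placeholders.
p_at_5 = wiktionary_eval('vectors/en.words', 'vectors/de.words',
                         'wiktionary.en-de.txt', b_reverse=False,
                         b_print=True, precision_at_N=5)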
Example #12
 def load(cls, path, years, **kwargs):
     embeds = collections.OrderedDict()
     for year in years:
         embeds[year] = Embedding.load(path + "/" + str(year), **kwargs)
     return SequentialEmbedding(embeds)
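This classmethod presumably belongs to SequentialEmbedding (it constructs one); a hypothetical call loads one embedding per year from a directory of per-year files:

# Sketch with placeholder path and decades; each year is loaded from <path>/<year>.
seq = SequentialEmbedding.load('embeddings/eng-all', range(1900, 2000, 10))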
Example #13
def main():
    usage = "%prog sgns.words sgns.words.vocab"
    parser = OptionParser(usage=usage)
    #parser.add_option('-n', dest='n', default=10,
    #                  help='Most similar: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    vec_file = args[0]
    vocab_file = args[1]

    #n = int(options.n)

    emb = Embedding(vec_file)
    subset = {}

    with open(vocab_file, 'r') as f:
        vocab = f.readlines()
    vocab = [v.strip() for v in vocab]
    vocab = [v for v in vocab if v.startswith('msa-')]

    for word in vocab:
        subset[word] = emb.represent(word)

    keys = list(subset.keys())
    n_words = len(keys)
    emb_size = len(subset[keys[0]])
    for word in keys:
        print(word)

    print("Doing dimensionality reduction")
    vectors = np.zeros([n_words, emb_size])
    for word_i, word in enumerate(keys):
        vectors[word_i, :] = subset[word]

    tsne = TSNE(n_components=2)
    proj = tsne.fit_transform(vectors)

    #pca = PCA(n_components=2)
    #proj = pca.fit_transform(vectors)

    #transformer = np.random.randn(emb_size, 2)
    #proj = np.dot(vectors, transformer)

    print("plotting")
    fig, ax = plt.subplots(figsize=(8, 6))

    # Map vocabulary keys to the labels drawn on the plot.
    labels = {
        'msa-orlando-nightclub-massacre': 'Orlando',
        'msa-san-bernardino,-california': 'San Bernardino',
        'msa-amnicola-training-center,-chattanooga': 'Chattanooga',
        'msa-columbine-high-school': 'Columbine',
        'msa-westside-middle-school': 'Westside Middle School',
        'msa-heritage-high-school': 'Heritage High School',
        'msa-virginia-tech--campus': 'Virginia Tech',
        'msa-tucson,-arizona': 'Tucson, Arizona',
        'msa-movie-theater-in-aurora': 'Aurora',
        'msa-mother-emanuel-ame-church': 'Mother Emanuel',
        # note: this key has no 'msa-' prefix in the original script
        'Umpqua Community College': 'Umpqua Community College',
    }

    for i, word in enumerate(keys):
        ax.scatter([proj[i, 0]], [proj[i, 1]],
                   color='k',
                   alpha=0.6,
                   s=1,
                   edgecolors=None)
        if word in labels:
            ax.text(proj[i, 0],
                    proj[i, 1],
                    labels[word],
                    fontsize=8,
                    alpha=0.6,
                    ha='center',
                    va='baseline')

        #if word in target_words:
        #    ax.scatter([proj[i, 0]], [proj[i, 1]], color='k', alpha=0.6, s=1, edgecolors=None)
        #    ax.text(proj[i, 0], proj[i, 1], word, fontsize=8, alpha=0.6, ha='center', va='baseline', color='red')
        #else:
        #    ax.scatter([proj[i, 0]], [proj[i, 1]], color='k', alpha=0.6, s=1, edgecolors=None)
        #    ax.text(proj[i, 0], proj[i, 1], word, fontsize=8, alpha=0.6, ha='center', va='baseline')

    previous_x = None
    previous_y = None
    """
    for i, year in enumerate(years):
        target_word = base_word + '_' + str(year)
        index = keys.index(target_word)
        x = proj[index, 0]
        y = proj[index, 1]
        if previous_x is not None:
            plt.plot([previous_x, x], [previous_y, y], c='b', linewidth=1, alpha=(0.8 * i/len(years) + 0.1))
        ax.scatter([x], [y], color='red', alpha=0.5, s=2, edgecolors=None)
        previous_x = x
        previous_y = y
    """

    plt.savefig('shooters.pdf', bbox_inches='tight')
Example #14
    parser.add_argument("test_path", help="Path to test data")
    parser.add_argument("--word-path",
                        help="Path to sorted list of context words",
                        default="")
    parser.add_argument("--num-context",
                        type=int,
                        help="Number context words to use",
                        default=-1)
    parser.add_argument("--type", default="PPMI")
    args = parser.parse_args()
    if args.type == "PPMI":
        year = int(args.vec_path.split("/")[-1].split(".")[0])
        if args.num_context != -1 and args.word_path == "":
            raise Exception(
                "Must specify path to context word file if the context words are to be restricted!"
            )
        elif args.word_path != "":
            _, context_words = ioutils.load_target_context_words(
                [year], args.word_path, -1, args.num_context)
            context_words = context_words[year]
        else:
            context_words = None
        rep = Explicit.load(args.vec_path, restricted_context=context_words)
    elif args.type == "SVD":
        rep = SVDEmbedding(args.vec_path, eig=0.0)
    else:
        rep = Embedding.load(args.vec_path, add_context=False)
    data = read_test_set(args.test_path)
    correlation = evaluate(rep, data)
    print "Correlation: " + str(correlation)
Example #15
 def __init__(self, model_code):
     model_name = models[model_code]
     year = model_name.split('-')[-1]
     self.embed = Embedding.load('files/sgns/'+year)
     self.embed.normalize()             
Example #16
import os

from representations.embedding import Embedding
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools as it

from viz.scripts import closest_over_time
from viz import mplot, mcommon
from subprocess import check_output

import random
random.seed(1111)

## NOTE: run with Python 2, since the GitHub code imported above is Python 2

model_dir = 'C:/Users/dat/Documents/SemEval2017Task4/4B-English/BertSentiment/'
os.chdir(model_dir)

model_name = 'word_vector768'
tweet2017all = Embedding.load(model_name)

print('model {}'.format(model_name))

word_list = ['misogynistic', 'feminist', 'caitlyn_jenner', 'he', 'she']

for word in word_list:
    print('word {}'.format(word))
    print(tweet2017all.closest(word, 20))
    mplot.plot_one_word(word, tweet2017all)
Example #17
def alignment_eval(src_vecs,
                   trg_vecs,
                   align_file,
                   src_test_file,
                   trg_test_file,
                   b_reverse,
                   b_print=False):
    Es = Embedding(src_vecs, True)
    Et = Embedding(trg_vecs, True)

    poss_aligns = [[int(x) for x in l.strip().split()[:3]]
                   for l in open(align_file).readlines()]
    sure_aligns = [[int(x) for x in l.strip().split()[:3]]
                   for l in open(align_file).readlines()
                   if l.strip().split()[-1] != "P"]

    ssents = [
        l.strip().split(">")[1].split("<")[0].split() for l in codecs.open(
            src_test_file, 'r', "utf8", errors='ignore').readlines()
    ]
    tsents = [
        l.strip().split(">")[1].split("<")[0].split() for l in codecs.open(
            trg_test_file, 'r', "utf8", errors='ignore').readlines()
    ]  #swap 3 and 4

    if b_reverse:
        # Reverse the evaluation source/target
        poss_aligns = [(sid, twid, swid) for sid, swid, twid in poss_aligns]
        sure_aligns = [(sid, twid, swid) for sid, swid, twid in sure_aligns]
        Es, Et = Et, Es
        ssents, tsents = tsents, ssents

    poss_aligns = gb([(sid - 1, (swid - 1, twid - 1))
                      for sid, swid, twid in poss_aligns])
    sure_aligns = gb([(sid - 1, (swid - 1, twid - 1))
                      for sid, swid, twid in sure_aligns])

    size_a = 0.0
    size_s = 0.0
    size_a_and_s = 0.0
    size_a_and_p = 0.0

    for sid, (ssent, tsent) in enumerate(zip(ssents, tsents)):
        alignment = set()
        twords = [tword.split("_")[0].lower() for tword in tsent]
        tvecs = [
            Et.represent(tword) if tword in Et.wi else None for tword in twords
        ]
        for swid, sword in enumerate(ssent):
            sword = sword.split("_")[0].lower()
            if sword in Es.wi:
                svec = Es.represent(sword)
                sims = [
                    svec.dot(tvec) if tvec is not None else -1.0
                    for tvec in tvecs
                ]
                alignment.add((swid, np.argmax(sims)))
                #print(sword,twords[np.argmax(sims)])

        sure = sure_aligns[sid]
        poss = poss_aligns[sid]

        size_a += float(len(alignment))
        size_s += float(len(sure))
        s_a = alignment & sure
        p_a = alignment & poss
        size_a_and_s += float(len(s_a))
        size_a_and_p += float(len(p_a))

    if b_print:
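        # Prints (|A & S| + |A & P|) / (|A| + |S|), i.e. 1 - AER (alignment error rate),
        # where A = predicted, S = sure, and P = possible alignments.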
        print '{0:.4f}'.format(
            (size_a_and_s + size_a_and_p) / (size_a + size_s))

    return (size_a_and_s + size_a_and_p) / (size_a + size_s)
Example #18
import sys
import codecs
import glob
import numpy as np
from scipy import stats
from itertools import groupby
sys.path.append('hyperwords')
from representations.embedding import Embedding


def gb(collection):
    keyfunc = lambda x: x[0]
    groups = groupby(sorted(collection, key=keyfunc), keyfunc)
    return {k: set([v for k_, v in g]) for k, g in groups}


Es = Embedding(sys.argv[1], True)
Et = Embedding(sys.argv[2], True)

poss_aligns = [[int(x) for x in l.strip().split()[:3]]
               for l in open(sys.argv[3]).readlines()]
sure_aligns = [[int(x) for x in l.strip().split()[:3]]
               for l in open(sys.argv[3]).readlines()
               if l.strip().split()[-1] != "P"]

ssents = [
    l.strip().split(">")[1].split("<")[0].split() for l in codecs.open(
        sys.argv[4], 'r', "utf8", errors='ignore').readlines()
]
tsents = [
    l.strip().split(">")[1].split("<")[0].split() for l in codecs.open(
        sys.argv[5], 'r', "utf8", errors='ignore').readlines()
]
Example #19
    actual, expected = zip(*results)
    print "OOV: ", oov
    return spearmanr(actual, expected)[0]


if __name__ == '__main__':
    parser = ArgumentParser("Run word similarity benchmark")
    parser.add_argument("vec_path", help="Path to word vectors")
    parser.add_argument("test_path", help="Path to test data")
    parser.add_argument("--word-path", help="Path to sorted list of context words", default="")
    parser.add_argument("--num-context", type=int, help="Number context words to use", default=-1)
    parser.add_argument("--type", default="PPMI")
    args = parser.parse_args()
    if args.type == "PPMI":
        year = int(args.vec_path.split("/")[-1].split(".")[0])
        if args.num_context != -1 and args.word_path == "":
            raise Exception("Must specify path to context word file if the context words are to be restricted!")
        elif args.word_path != "":
            _, context_words = ioutils.load_target_context_words([year], args.word_path, -1, args.num_context)
            context_words = context_words[year]
        else:
            context_words = None
        rep = Explicit.load(args.vec_path, restricted_context=context_words)
    elif args.type == "SVD":
        rep = SVDEmbedding(args.vec_path, eig=0.0)
    else:
        rep = Embedding.load(args.vec_path, add_context=False)
    data = read_test_set(args.test_path)
    correlation = evaluate(rep, data)
    print "Correlation: " + str(correlation)
Example #20
def main():
    usage = "%prog sgns.words"
    parser = OptionParser(usage=usage)
    #parser.add_option('-n', dest='n', default=10,
    #                  help='Most similar: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()

    vec_file = args[0]

    #n = int(options.n)

    emb = Embedding(vec_file)
    subset = {}

    target_words = []
    target_vectors = []
    words = []

    years = list(range(1987, 2008))

    print("Collecting vectors")
    target_word = 'terrorism_pre911'
    target_words.append(target_word)
    subset[target_word] = emb.represent(target_word)
    target_vectors.append(emb.represent(target_word))

    target_word = 'terrorism_post911'
    target_words.append(target_word)
    subset[target_word] = emb.represent(target_word)
    target_vectors.append(emb.represent(target_word))

    mean_vector = np.mean(target_vectors, axis=0)

    closest = emb.closest_to_vec(mean_vector, n=50)
    words = [pair[1] for pair in closest]
    words = [word for word in words if word != 'terrorism']
    words = [word for word in words if not re.search(r'\d', word)]
    for word in words:
        subset[word] = emb.represent(word)

    keys = list(subset.keys())
    n_words = len(keys)
    emb_size = len(subset[keys[0]])
    for word in keys:
        print(word)

    print("Doing dimensionality reduction")
    vectors = np.zeros([n_words, emb_size])
    dist1 = np.zeros(n_words)
    dist2 = np.zeros(n_words)
    for word_i, word in enumerate(keys):
        vectors[word_i, :] = subset[word]
        dist1[word_i] = np.abs(cosine_similarity(subset[word].reshape(1, -1), subset['terrorism_pre911'].reshape(1, -1))[0][0])
        dist2[word_i] = np.abs(cosine_similarity(subset[word].reshape(1, -1), subset['terrorism_post911'].reshape(1, -1))[0][0])

    tsne = TSNE(n_components=2)
    proj = tsne.fit_transform(vectors)

    #pca = PCA(n_components=2)
    #proj = pca.fit_transform(vectors)

    #transformer = np.random.randn(emb_size, 2)
    #proj = np.dot(vectors, transformer)

    print("plotting")
    fig, ax = plt.subplots(figsize=(8, 6))

    for i, word in enumerate(keys):
        if word in target_words:
            ax.scatter([proj[i, 0]], [proj[i, 1]], color='k', alpha=0.6, s=1, edgecolors=None)
            ax.text(proj[i, 0], proj[i, 1], word, fontsize=8, alpha=0.6, ha='center', va='baseline', color='red')
        else:
            ax.scatter([proj[i, 0]], [proj[i, 1]], color='k', alpha=0.6, s=1, edgecolors=None)
            ax.text(proj[i, 0], proj[i, 1], word, fontsize=8, alpha=0.6, ha='center', va='baseline')

    previous_x = None
    previous_y = None

    """
    for i, year in enumerate(years):
        target_word = base_word + '_' + str(year)
        index = keys.index(target_word)
        x = proj[index, 0]
        y = proj[index, 1]
        if previous_x is not None:
            plt.plot([previous_x, x], [previous_y, y], c='b', linewidth=1, alpha=(0.8 * i/len(years) + 0.1))
        ax.scatter([x], [y], color='red', alpha=0.5, s=2, edgecolors=None)
        previous_x = x
        previous_y = y
    """

    plt.savefig('test.pdf', bbox_inches='tight')


    fig, ax = plt.subplots(figsize=(8, 6))
    for i, word in enumerate(keys):
        if word not in target_words:
            ax.scatter(dist1[i], dist2[i], color='k', alpha=0.6, s=1, edgecolors=None)
            ax.text(dist1[i], dist2[i], word, fontsize=8, alpha=0.6, ha='center', va='baseline')

    plt.savefig('test2.pdf', bbox_inches='tight')
Example #21
 def load(cls, path, years, **kwargs):
     embeds = collections.OrderedDict()
     for year in years:
         embeds[year] = Embedding.load(path + "/" + str(year), **kwargs)
     return SequentialEmbedding(embeds)
Example #22
from __future__ import division
import sys
import numpy as np
sys.path.append('hyperwords')
from representations.embedding import Embedding


Es = Embedding(sys.argv[1], True)
Et = Embedding(sys.argv[2], True)

BX = [(l.split("|||")[-1].strip(), l.split("|||")[0].strip()) for l in open(sys.argv[3]).readlines()]
if sys.argv[-1] == 'R':
    BX = [(t, s) for s, t in BX]
    Es, Et = Et, Es

BD = []
for s, t in BX:
    if s in Es.wi and t in Et.wi:
        BD.append((s, t))

p1, tot = 0, 0
for s, t in BD:
    vs = Es.represent(s)
    scores = vs.dot(Et.m.T)
    cand = Et.iw[np.nanargmax(scores)]
    if t == cand:
        p1 += 1
    tot += 1

print '{0:.4f}'.format(p1/tot)