def worker(proc_num, queue, out_pref, in_dir, target_lists, context_lists,
           displacement_base, thresh, year_inc, type):
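    # Each worker pulls years off the shared queue, builds representations for the
    # year and for (year - year_inc), and writes per-word volatility and displacement
    # scores (cosine deltas) to pickle files.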
    time.sleep(10 * random.random())
    while True:
        if queue.empty():
            print proc_num, "Finished"
            break
        year = queue.get()
        print proc_num, "Loading matrices..."
        base = create_representation(type,
                                     in_dir + str(year - year_inc),
                                     thresh=thresh,
                                     restricted_context=context_lists[year],
                                     normalize=True,
                                     add_context=False)
        delta = create_representation(type,
                                      in_dir + str(year),
                                      thresh=thresh,
                                      restricted_context=context_lists[year],
                                      normalize=True,
                                      add_context=False)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, target_lists[year], type)
        year_disp = get_cosine_deltas(displacement_base, delta,
                                      target_lists[year], type)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
Example #2
def align_cloud(year, rep_type, main_dir, num, dim, wordlist, **rep_args):
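    # Align the `num` bootstrap embeddings of a single year to the first one via
    # orthogonal Procrustes, save each aligned matrix, and also save their average.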
    print "Aligning cloud year:", year
    avg_embed_mat = np.zeros((len(wordlist), dim))
    for i in range(1, num + 1):  # Iterate through the bootstrap embeddings
        print i
        finname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(
            dim) + "/" + str(year)
        foutname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(
            dim) + "/aligned/" + str(year)
        other_embed = create_representation(
            rep_type, finname, **rep_args)  # Loads the individual embedding
        keep_indices = [other_embed.wi[word] for word in wordlist]
        other_embed = Embedding(
            other_embed.m[keep_indices, :], wordlist,
            normalize=False)  # Synchronize the order of words
        if i == 1:
            base_embed = other_embed
            ortho = np.eye(dim)
        else:
            ortho = alignment.get_procrustes_mat(base_embed, other_embed)
        aligned_embed_mat = (other_embed.m).dot(
            ortho)  # Rotates the embedding to the reference
        avg_embed_mat += aligned_embed_mat / num  # Accumulate the average embedding
        np.save(foutname + "-w.npy", aligned_embed_mat)
        write_pickle(other_embed.iw, foutname + "-vocab.pkl")
    foutname = main_dir + "/embedding_avg/" + str(year)
    np.save(foutname + "-w.npy", avg_embed_mat)
    write_pickle(base_embed.iw, foutname + "-vocab.pkl")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count,
                **rep_args):
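    # Align each year's embedding to the previous (already aligned) year with
    # Procrustes, restricting the vocabulary to words above min_count.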
    first_iter = True
    base_embed = None
    for year in years:
        print "Loading year:", year  # for each year
        year_embed = create_representation(rep_type, in_dir + str(year),
                                           **rep_args)  # load in embedding pkl
        year_words = words_above_count(
            count_dir, year,
            min_count)  # from the count pickle, keep only words with count >= min_count
        year_embed = year_embed.get_subembed(
            year_words)  # keep embeddings only for words in year_words that are in vocabulary
        print "Aligning year:", year
        if first_iter:  # the first year needs no alignment; it simply becomes the base
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(
                base_embed, year_embed)
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #4
def worker(proc_num, queue):
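    # Each worker takes years from the queue, builds a vocabulary of the top 5000
    # non-stopword terms that also appear in that year's SVD embedding, and
    # bootstraps word polarity scores from the historical seed words.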
    while True:
        time.sleep(random.random()*10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)

        polarities = polarity_induction_methods.bootstrap(
                 embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
                 positive_seeds, negative_seeds,
                 score_method=polarity_induction_methods.random_walk,
                 num_boots=50, n_procs=20, return_all=True,
                 beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-freq-boot.pkl')
Example #7
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
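    # Turn a raw co-occurrence count matrix into an (optionally smoothed) PPMI matrix
    # and export it as a sparse binary file plus a word-index pickle.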
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat,
                             row_probs,
                             col_probs,
                             smooth,
                             neg=neg,
                             normalize=normalize)
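    # Compile and load the Cython sparse I/O helpers on the fly via pyximport.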
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
Example #8
def worker(proc_num, queue):
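    # Each worker processes one subreddit: builds its term dictionary, keeps the 5000
    # highest-document-frequency tokens, and bootstraps polarity scores from the
    # Twitter seed words.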
    while True:
        #        time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs,
                         key=lambda w: word_dict.dfs[w],
                         reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation(
            "SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(
            set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs,
                                                    pos_seeds,
                                                    neg_seeds,
                                                    return_all=True,
                                                    nn=25,
                                                    beta=0.9,
                                                    num_boots=50,
                                                    n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
Example #9
def align_years(years, rep_type, main_dir, num, dim, **rep_args):
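    # Align the per-year average embeddings to each other chronologically and apply
    # the same rotation to every individual bootstrap embedding for that year.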
    print "Aligning years to each other"
    first_iter = True
    base_embed = None
    for year in years:  # Iterates through years
        print year
        year_embed = create_representation(
            rep_type, main_dir + "/embedding_avg/" + str(year),
            **rep_args)  # Loads the individual embedding
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            ortho = alignment.get_procrustes_mat(base_embed, year_embed)
            aligned_embed = Embedding(
                (year_embed.m).dot(ortho), year_embed.iw,
                normalize=False)  # Rotates to the previous year embedding
            for i in range(1, num + 1):  # Align all the embeddings the same way as the average
                finname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(dim) + "/aligned/" + str(year)
                foutname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(dim) + "/aligned/" + str(year)
                mat = np.load(finname + "-w.npy")
                mat = mat.dot(ortho)
                np.save(foutname + "-w.npy", mat)
        base_embed = aligned_embed
        foutname = main_dir + "/embedding_avg/aligned/" + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #10
def main():
    args = docopt("""
        Usage:
            eval_reliability.py [options] <representation> <file_name> <folders>...

        Options:
            --words FILE      Use FILE with a list of words (one per line) to measure reliability
            --ws FILES        Testsets for word similarity evaluation, use "," as separator!
            --ana FILES       Testsets for analogy evaluation, use "," as separator!
            --closest N       Use N closest neighbors to measure reliability [default: 10]   
    """)
    folders = args["<folders>"]

    closest = int(args["--closest"])
    word_list = args["--words"]
    ws_test_sets = [read_ws_test_set(path) for path in args["--ws"].split(",")]
    as_test_sets = [
        read_as_test_set(path) for path in args["--ana"].split(",")
    ]
    as_xi_and_ix = [get_vocab_as(test_set) for test_set in as_test_sets]

    # good default parameters for SVD
    args["--eig"] = 0
    args["--w+c"] = False
    # not used
    args["--neg"] = 1

    representations = []
    for file in folders:
        if os.path.isfile(file + "/" + args["<file_name>"] + ".words.vocab"):
            x = copy.deepcopy(args)
            x["<representation_path>"] = file + "/" + args["<file_name>"]
            representations.append(create_representation(x))
        else:
            print("Could not find " + file + "/" + args["<file_name>"] +
                  ".words.vocab",
                  file=sys.stderr)
    # comparison over all subsets
    if len(representations) < 2:
        raise Exception("Need multiple models for evaluation")

    words = words_to_evaluate_file(
        word_list) if word_list else words_to_evaluate(representations)

    evaluated = [
        " ".join([str(evaluate_ws(r, w)) for r in representations])
        for w in ws_test_sets
    ]
    for i, test_set in enumerate(as_test_sets):
        evaluated.append(" ".join([
            str(
                evaluate_as(r, test_set, as_xi_and_ix[i][0],
                            as_xi_and_ix[i][1])) for r in representations
        ]))
    evaluated.append(reliability(representations, words, closest))
    print("\t".join(evaluated))
Example #11
def main():
    args = docopt("""
    Usage:
        analogy_eval.py [options] <representation> <representation_path> <task_path>
    
    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
        --normalize  Use row-normalized word vectors
    """)

    print args['--normalize']
    representation = create_representation(args)
    args['--normalize'] = True
    print args['--normalize']
    representation_sim = create_representation(args)
    data = read_test_set(representation, args['<task_path>'])
    xi, ix = get_vocab(data)
    accuracy_add, accuracy_mul = evaluate(representation, representation_sim,
                                          data, xi, ix)
    print args['<representation>'], args[
        '<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
Example #12
def main():
    args = docopt("""
    Usage:
        ws_eval.py [options] <representation> <representation_path> <task_path>
    
    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
    """)
    
    data = read_test_set(args['<task_path>'])
    representation = create_representation(args)
    correlation = evaluate(representation, data)
    print (args['<representation>'] + " " +  args['<task_path>'] + '\t%0.3f' % correlation)
Example #13
def main():
    args = docopt("""
    Usage:
        ws_eval.py [options] <representation> <representation_path> <task_path>
    
    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
    """)

    data = read_test_set(args['<task_path>'])
    representation = create_representation(args)
    correlation = evaluate(representation, data)
    print 'Word Similarity', '\t%0.3f' % correlation
Example #14
def main():
    args = docopt("""
    Usage:
        get_most_similar.py [options] <representation> <representation_path> <test_words>

    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
    """)

    words = args['<test_words>'].split('_')
    representation = create_representation(args)
    for w in words:
        print w
        print representation.closest(w)
Example #15
def main():
    args = docopt("""
    Usage:
        analogy_eval.py [options] <representation> <representation_path> <task_path>
    
    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
    """)
    
    data = read_test_set(args['<task_path>'])
    xi, ix = get_vocab(data)
    representation = create_representation(args)
    accuracy_add, accuracy_mul = evaluate(representation, data, xi, ix)
    print args['<representation>'], args['<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
def main():
    args = docopt("""
    Usage:
        analogy_eval.py [options] <representation> <representation_path> <task_path>
    
    Options:
        --neg NUM    Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c        Use ensemble of word and context vectors (not applicable to PPMI)
        --concatenate        Concatenate left vector and right vector together
        --eig NUM    Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
        --contexts        Use context embedding as word embedding
    """)

    data = read_test_set(args['<task_path>'])
    xi, ix = get_vocab(data)
    representation = create_representation(args)

    accuracy_add, accuracy_mul = evaluate(representation, data, xi, ix)
    print args['<representation>'], args[
        '<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
Example #17
def align_years(years, rep_type, in_dir, out_dir, **rep_args):
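    # Same year-by-year Procrustes chaining as above, but without vocabulary
    # filtering and with post_normalize disabled.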
    first_iter = True
    base_embed = None
    for year in years:  # Iterates through years
        print "Loading year:", year
        year_embed = create_representation(
            rep_type, in_dir + str(year),
            **rep_args)  # Loads the individual embedding
        print "Aligning year:", year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(
                base_embed, year_embed,
                post_normalize=False)  # Rotates to the previous year embedding
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Example #19
def main():
    args = docopt("""
    Usage:
        ws_eval.py [options] <representation> <representation_path> <task_path>

    Options:
        --neg NUM     Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1]
        --w+c         Use ensemble of word and context vectors (not applicable to PPMI)
        --eig NUM     Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5]
        --vocab FILE  Optional: use vocabulary file to determine what is difficult for the embeddings
        --cutoff NUM  Optional: Cutoff proportion for reporting rank mismatches
        --verbose NUM  Specify 1 for extra diagnostic output
    """)

    data = read_test_set(args['<task_path>'])
    representation = create_representation(args)
    #print dir(representation), representation.iw[:3]
    correlation, actual, expected = evaluate(representation, data)
    top_n = 50
    print args['<representation>'], args['<representation_path>'], '\t%0.6f' % correlation
    #print args['--verbose']
    verbose = 1 if args['--verbose'] is not None and args['--verbose'] == '1' else 0
    if args['--vocab'] is not None:
        reconstruct_spearmanr(actual, expected, representation, data, args['--vocab'], cutoff=args['--cutoff'], verbose=verbose)
Example #20
    print "Merging"
    full_word_set = set([])
    for year_words in target_lists.itervalues():
        full_word_set = full_word_set.union(set(year_words))
    merge(out_pref, years, list(full_word_set))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.")
    parser.add_argument("dir", help="path to word vectors")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("out_dir", help="output path")
    parser.add_argument("--target-words", type=int, help="Number of words (of decreasing average frequency) to analyze", default=-1)
    parser.add_argument("--context-words", type=int, help="Number of words (of decreasing average frequency) to include in context. -2 means all regardless of word list", default=-1)
    parser.add_argument("--context-word-file")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800)
    parser.add_argument("--year-inc", type=int, help="year increment", default=10)
    parser.add_argument("--type", default="PPMI")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000)
    parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=2000)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    target_lists, context_lists = ioutils.load_target_context_words(years, args.word_file, args.target_words, args.context_words)
    if args.context_word_file is not None:
        print "Loading context words.."
        _, context_lists = ioutils.load_target_context_words(years, args.context_word_file, -1, args.context_words)
    ioutils.mkdir(args.out_dir)
    displacement_base = create_representation(args.type, args.dir + "/" +  str(args.disp_year), restricted_context=context_lists[args.disp_year], normalize=True, add_context=False)
    run_parallel(args.num_procs, args.out_dir, args.dir + "/", years[1:], target_lists, context_lists, displacement_base, 0, args.year_inc, args.type)       