コード例 #1
0
ファイル: sim_pair.py プロジェクト: cltl/meaning_space
def get_sim_pair_ppmi(corpus, target_word1, target_word2, year, results_dir):
    """Compute the similarity of two words for one year and log it.

    Loads the ``PositiveExplicit`` representation stored at
    ``<corpus>/<year>``, asks it for the similarity of the two target words,
    prints the value and appends a ``word1-word2<TAB>year<TAB>similarity``
    line to ``<results_dir>/<word1>-<word2>-cosines.tsv``. The line is only
    appended when that exact line is not already present in the file.

    Args:
        corpus: Path prefix under which the per-year representations live.
        target_word1: First word of the pair.
        target_word2: Second word of the pair.
        year: Label of the representation to load (converted with ``str``).
        results_dir: Directory of the results file; a trailing path
            separator is optional.
    """
    # os.path.join keeps the file inside results_dir whether or not the
    # caller passed a trailing separator.
    results_path = os.path.join(
        results_dir, target_word1 + '-' + target_word2 + '-cosines.tsv')

    embedd = PositiveExplicit.load(corpus + "/" + str(year))
    cos = embedd.similarity(target_word1, target_word2)

    if os.path.isfile(results_path):
        print('file exists')
        # Explicit UTF-8 so non-ASCII target words round-trip regardless of
        # the platform's default locale encoding.
        with open(results_path, encoding='utf-8') as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []

    result = target_word1 + '-' + target_word2 + '\t' + str(
        year) + '\t' + str(cos) + '\n'

    # Lines read back from the file carry no newline, hence the strip().
    if result.strip() in existing_results:
        print('result already there')
    else:
        with open(results_path, 'a', encoding='utf-8') as outfile:
            outfile.write(result)

    print(cos)
コード例 #2
0
def get_sim_neighbors_ppmi(corpus, target_word1, target_word2, year1, year2, n, results_dir):
    """Compare two word/year settings through their nearest neighbors.

    Exactly one of two modes must be selected by the arguments:

    * two different years and one target word
      (``year1 != year2`` and ``target_word1 == target_word2``), or
    * the same year and two different target words
      (``year1 == year2`` and ``target_word1 != target_word2``).

    For each side the neighbors are fetched with ``get_nearest_neighbors``,
    their union is built with ``get_union`` (and, in the two-year mode,
    passed through ``filter_union``), both targets are converted with
    ``get_second_order_vector`` and the result of ``get_cosine`` on the two
    vectors is printed. If either target's representation has no non-zero
    entries, ``'OOV'`` is used in place of the cosine and the neighbor lists.

    Two files are written below ``results_dir``:

    * ``neighbors/<word1>-<word2>-<year1>-<year2>.tsv`` - the neighbor words
      of both sides, one tab-separated pair per line (overwritten each call);
    * ``cosines-<...>-n-<n>.tsv`` - one line per result, appended only when
      the identical line is not already in the file.

    Args:
        corpus: Path prefix under which the per-year representations live.
        target_word1: First target word.
        target_word2: Second target word (equal to ``target_word1`` in the
            two-year mode).
        year1: Label of the first representation to load.
        year2: Label of the second representation (equal to ``year1`` in the
            two-word mode).
        n: Neighbor count handed to ``get_nearest_neighbors``.
        results_dir: Output directory; a trailing path separator is optional.

    Raises:
        ValueError: If the arguments select neither of the two modes.
    """
    same_word = target_word1 == target_word2
    same_year = year1 == year2
    # Valid calls vary exactly one of (word, year). Reject everything else up
    # front, before any directory or file is touched.
    if same_word == same_year:
        raise ValueError(
            'expected either one target word with two different years or '
            'two different target words with the same year')

    neighbors_dir = os.path.join(results_dir, 'neighbors')
    os.makedirs(neighbors_dir, exist_ok=True)
    words_path = os.path.join(
        neighbors_dir,
        target_word1 + '-' + target_word2 + '-' + str(year1) + '-' + str(year2) + '.tsv')

    if same_word:
        results_cosine = 'cosines-' + target_word1 + '-n-' + str(n) + '.tsv'
        embedd1 = PositiveExplicit.load(corpus + "/" + str(year1))
        embedd2 = PositiveExplicit.load(corpus + "/" + str(year2))
    else:
        results_cosine = ('cosines-' + target_word1 + '-' + target_word2
                          + '-n-' + str(n) + '.tsv')
        # Same year on both sides: load the representation once and share it.
        embedd1 = embedd2 = PositiveExplicit.load(corpus + "/" + str(year1))

    # One path for both the duplicate check and the append, so they can never
    # point at different files.
    cosine_path = os.path.join(results_dir, results_cosine)

    # nnz == 0 means the representation has no non-zero entries; such a
    # target is treated as out of vocabulary.
    if (embedd1.represent(target_word1).nnz != 0) and (embedd2.represent(target_word2).nnz != 0):
        neighbors1 = get_nearest_neighbors(embedd1, target_word1, n)
        neighbors2 = get_nearest_neighbors(embedd2, target_word2, n)

        union = get_union(neighbors1, neighbors2)
        if same_word:
            # Only the two-year comparison filters the union against both
            # representations.
            union = filter_union(union, embedd1, embedd2, target_word1)

        vec1 = get_second_order_vector(embedd1, union, target_word1)
        vec2 = get_second_order_vector(embedd2, union, target_word2)

        neighbor_words1 = get_nearest_neighbor_words(neighbors1)
        neighbor_words2 = get_nearest_neighbor_words(neighbors2)

        cos = get_cosine(vec1, vec2)
    else:
        print('word out of vocab')
        cos = 'OOV'
        neighbor_words1 = ['OOV']
        neighbor_words2 = ['OOV']

    if os.path.isfile(cosine_path):
        print('file exists')
        with open(cosine_path, encoding='utf-8') as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []

    # zip() stops at the shorter list, so only paired rows are written.
    with open(words_path, 'w', encoding='utf-8') as outfile1:
        for word1, word2 in zip(neighbor_words1, neighbor_words2):
            outfile1.write(word1 + '\t' + word2 + '\n')

    result = (target_word1 + '-' + target_word2 + '\t' + str(year1) + '-'
              + str(year2) + '\t' + str(cos) + '\n')
    # Lines read back from the file carry no newline, hence the strip().
    if result.strip() in existing_results:
        print('result already there')
    else:
        with open(cosine_path, 'a', encoding='utf-8') as outfile2:
            outfile2.write(result)
    print(cos)