def get_sim_pair_ppmi(corpus, target_word1, target_word2, year, results_dir):
    """Compute the similarity of two words in one year's space and log it.

    The embedding is loaded with ``PositiveExplicit.load(corpus + "/" + year)``
    and its ``similarity`` method is called on the two target words. The value
    is appended as a tab-separated row (``word1-word2``, year, similarity) to
    ``<results_dir><word1>-<word2>-cosines.tsv`` unless an identical row is
    already present in that file. The similarity is also printed.

    Note that ``results_dir`` is concatenated directly with the file name, so
    it has to end with a path separator.

    Args:
        corpus: Directory prefix under which the per-year embeddings live.
        target_word1: First word of the pair.
        target_word2: Second word of the pair.
        year: Year identifying which embedding to load.
        results_dir: Output directory prefix (including trailing separator).

    Returns:
        None. Results are reported through the output file and stdout.
    """
    pair_label = target_word1 + '-' + target_word2
    out_path = results_dir + pair_label + '-cosines.tsv'

    space = PositiveExplicit.load(corpus + "/" + str(year))
    cos = space.similarity(target_word1, target_word2)

    # Collect the rows written by earlier runs so duplicates can be skipped.
    known_rows = []
    if os.path.isfile(out_path):
        print('file exists')
        with open(out_path) as infile:
            known_rows = infile.read().split('\n')

    row = '\t'.join([pair_label, str(year), str(cos)]) + '\n'
    with open(out_path, 'a') as outfile:
        if row.strip() not in known_rows:
            outfile.write(row)
        else:
            print('result already there')

    print(cos)
def get_sim_neighbors_ppmi(corpus, target_word1, target_word2, year1, year2, n, results_dir):
    """Compare two second-order (neighbor-based) vectors and log their cosine.

    Two modes are supported:

    * one target word (``target_word1 == target_word2``) in two different
      years, or
    * two different target words in the same year.

    For each side, neighbors are obtained with
    ``get_nearest_neighbors(embedding, word, n)``, merged with ``get_union``
    (and, in the two-year mode, passed through ``filter_union``), and each
    side is turned into a vector with ``get_second_order_vector``. The result
    is ``get_cosine`` of those two vectors. If either side's representation
    has no non-zero entries, the result is the string ``'OOV'`` and both
    neighbor lists are ``['OOV']``.

    Side effects:
        * ``<results_dir>/neighbors/<w1>-<w2>-<y1>-<y2>.tsv`` is overwritten
          with the paired neighbor words, one tab-separated pair per line.
        * A row (``w1-w2``, ``y1-y2``, cosine) is appended to
          ``<results_dir>/cosines-...-n-<n>.tsv`` unless an identical row is
          already present.
        * The cosine (or ``'OOV'``) is printed.

    Args:
        corpus: Directory prefix under which the per-year embeddings live.
        target_word1: First (or only) target word.
        target_word2: Second target word; equal to ``target_word1`` in the
            two-year mode.
        year1: First (or only) year.
        year2: Second year; equal to ``year1`` in the two-word mode.
        n: Neighbor count passed to ``get_nearest_neighbors``.
        results_dir: Directory that receives the result files; a trailing
            separator is optional.

    Returns:
        None. Results are reported through the output files and stdout.

    Raises:
        ValueError: If the arguments match neither supported mode.
    """
    same_word = target_word1 == target_word2
    same_year = year1 == year2
    # Exactly one of the two must vary; otherwise there is nothing to compare
    # (previously this fell through and crashed on unbound local variables).
    if same_word == same_year:
        raise ValueError(
            'expected either one target word in two different years or '
            'two different target words in the same year')

    neighbors_dir = os.path.join(results_dir, 'neighbors')
    os.makedirs(neighbors_dir, exist_ok=True)
    words_path = os.path.join(
        neighbors_dir,
        target_word1 + '-' + target_word2 + '-' + str(year1) + '-' + str(year2) + '.tsv')

    # Out-of-vocabulary defaults; replaced below when both sides are known.
    cos = 'OOV'
    neighbor_words1 = ['OOV']
    neighbor_words2 = ['OOV']

    if same_word:
        # One word compared across two different years.
        results_cosine = 'cosines-' + target_word1 + '-n-' + str(n) + '.tsv'
        embedd_year1 = PositiveExplicit.load(corpus + "/" + str(year1))
        embedd_year2 = PositiveExplicit.load(corpus + "/" + str(year2))
        in_vocab = (embedd_year1.represent(target_word1).nnz != 0
                    and embedd_year2.represent(target_word1).nnz != 0)
        if in_vocab:
            neighbors_year1 = get_nearest_neighbors(embedd_year1, target_word1, n)
            neighbors_year2 = get_nearest_neighbors(embedd_year2, target_word1, n)
            union = get_union(neighbors_year1, neighbors_year2)
            filtered_union = filter_union(union, embedd_year1, embedd_year2, target_word1)
            vec1 = get_second_order_vector(embedd_year1, filtered_union, target_word1)
            vec2 = get_second_order_vector(embedd_year2, filtered_union, target_word1)
            neighbor_words1 = get_nearest_neighbor_words(neighbors_year1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_year2)
            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')
    else:
        # Two different words compared within a single year.
        results_cosine = ('cosines-' + target_word1 + '-' + target_word2
                          + '-n-' + str(n) + '.tsv')
        embedd_year = PositiveExplicit.load(corpus + "/" + str(year1))
        in_vocab = (embedd_year.represent(target_word1).nnz != 0
                    and embedd_year.represent(target_word2).nnz != 0)
        if in_vocab:
            neighbors_word1 = get_nearest_neighbors(embedd_year, target_word1, n)
            neighbors_word2 = get_nearest_neighbors(embedd_year, target_word2, n)
            union = get_union(neighbors_word1, neighbors_word2)
            vec1 = get_second_order_vector(embedd_year, union, target_word1)
            vec2 = get_second_order_vector(embedd_year, union, target_word2)
            neighbor_words1 = get_nearest_neighbor_words(neighbors_word1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_word2)
            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')

    # Use one path for both the duplicate check and the append, so the check
    # actually looks at the file that gets written.
    cosine_path = os.path.join(results_dir, results_cosine)
    if os.path.isfile(cosine_path):
        print('file exists')
        with open(cosine_path, encoding='utf-8') as infile:
            existing_results = set(infile.read().split('\n'))
    else:
        existing_results = set()

    with open(words_path, 'w', encoding='utf-8') as outfile1:
        outfile1.writelines(
            word1 + '\t' + word2 + '\n'
            for word1, word2 in zip(neighbor_words1, neighbor_words2))

    result = (target_word1 + '-' + target_word2 + '\t'
              + str(year1) + '-' + str(year2) + '\t' + str(cos) + '\n')
    if result.strip() in existing_results:
        print('result already there')
    else:
        with open(cosine_path, 'a', encoding='utf-8') as outfile2:
            outfile2.write(result)

    print(cos)