def find_inferred(self, lst_instance, tfo):
    # Contexts are extracted either from the dependency parse or from a word window.
    contexts = self.extract_contexts(lst_instance)
    tfo.write("Contexts for target %s are: %s\n" % (lst_instance.target, contexts))
    contexts = [c for c in contexts if c in self.context_vecs]
    tfo.write("Contexts in vocabulary for target %s are: %s\n" % (lst_instance.target, contexts))

    if self.ignore_target:
        target = None
    else:
        if lst_instance.target not in self.word_vecs:
            tfo.write("ERROR: %s not in word embeddings. Trying lemma.\n" % lst_instance.target)
            if lst_instance.target_lemma not in self.word_vecs:
                tfo.write("ERROR: lemma %s also not in word embeddings. Giving up.\n" % lst_instance.target_lemma)
                return None
            else:
                target = lst_instance.target_lemma
        else:
            target = lst_instance.target

    # The 'add' and 'avg' metrics are implemented more efficiently with vector arithmetic:
    # as shown in Omer Levy's linguistic regularities paper, this is equivalent to summing or
    # averaging the individual cosine similarities as long as the vectors are normalized to 1.
    if self.context_math == 'add':
        cs_rep = self.represent(target, contexts, False, tfo)
        if cs_rep is None:
            cs_rep = self.word_vecs.zeros()
        result_vec = self.word_vecs.closest_vec(cs_rep, -1)
    elif self.context_math == 'avg':
        cs_rep = self.represent(target, contexts, True, tfo)
        if cs_rep is None:
            cs_rep = self.word_vecs.zeros()
        result_vec = self.word_vecs.closest_vec(cs_rep, -1)
    elif self.context_math == 'mult':
        result_vec = self.mult(target, contexts, False, tfo)
    elif self.context_math == 'geomean':
        result_vec = self.mult(target, contexts, True, tfo)
    elif self.context_math == 'none' and self.ignore_target is not None:
        result_vec = self.word_vecs.closest(target, -1)
    else:
        raise Exception('Unknown context math: %s' % self.context_math)

    if result_vec is not None:
        tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
    else:
        tfo.write("Top most similar embeddings: " + " contexts: None\n")

    return result_vec
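# A minimal, self-contained sketch (not part of the inferrer above; toy data only) of why the
# 'add'/'avg' branches can work with a single combined vector: because the dot product is
# linear, ranking the vocabulary by w . (t + sum(c_i)) over unit-normalized vectors gives the
# same scores as summing the individual cosine similarities, as the comment above claims.
import numpy as np

def _unit(v):
    return v / np.linalg.norm(v)

rng = np.random.default_rng(0)
vocab = np.array([_unit(v) for v in rng.normal(size=(5, 4))])    # 5 candidate word vectors
target = _unit(rng.normal(size=4))
contexts = [_unit(rng.normal(size=4)) for _ in range(3)]

sum_of_cosines = vocab.dot(target) + sum(vocab.dot(c) for c in contexts)
single_dot = vocab.dot(target + np.sum(contexts, axis=0))
assert np.allclose(sum_of_cosines, single_dot)   # identical scores, hence identical ranking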
def find_inferred(self, lst_instance, tfo):
    if lst_instance.target in self.embeddings:
        result_vec, deltatime = self.embeddings.closest_with_time(lst_instance.target, -1)
    else:
        result_vec, deltatime = None, 0
    tfo.write("\nDeltatime: %f msec\n" % (deltatime * 1000))
    self.inference_time(deltatime)
    if result_vec is not None:
        tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
    else:
        tfo.write("Top most similar embeddings: " + " contexts: None\n")
    return result_vec
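# closest_with_time is a method of the embeddings wrapper and is not defined in this file.
# Below is a hypothetical, standalone sketch of the pattern it appears to implement (nearest
# neighbours plus wall-clock timing); `matrix` and `words` are assumed to be a row-normalized
# embedding matrix and its parallel vocabulary list, not names taken from the original code.
import time
import numpy as np

def closest_with_time(matrix, words, word, n=-1):
    """Return (list of (word, cosine) pairs sorted by similarity, elapsed seconds)."""
    start = time.time()
    query = matrix[words.index(word)]
    scores = matrix.dot(query)              # cosine similarity if rows are unit-normalized
    order = np.argsort(-scores)
    if n > 0:
        order = order[:n]
    result = [(words[i], float(scores[i])) for i in order]
    return result, time.time() - start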
def find_inferred(self, lst_instance, tfo):
    target_v, context_v = self.represent_target_and_context(lst_instance, tfo)

    if target_v is not None and context_v is not None:
        # This combination is not working very well at the moment. Requires more research.
        # ZERO-TO-HALF
        target_sim = (self.target_words.dot(target_v) + 1.0) / 2
        context_sim = (self.target_words.dot(context_v) + 1.0) / 2
        similarity = target_sim * context_sim

        # RANKS
        # target_sim = self.target_words.dot(target_v)
        # context_sim = self.target_words.dot(context_v)
        # for rank, i in enumerate(target_sim.argsort()):
        #     target_sim[i] = float(rank)
        # for rank, i in enumerate(context_sim.argsort()):
        #     context_sim[i] = float(rank)
        # similarity = (target_sim * context_sim) / (len(target_sim) ** 2)

        # POSITIVE SCORES
        # target_sim = self.target_words.dot(target_v)
        # context_sim = self.target_words.dot(context_v)
        # target_sim[target_sim < 0.0] = 0.0
        # context_sim[context_sim < 0.0] = 0.0
        # similarity = target_sim * context_sim

        # EXP
        # target_sim = self.target_words.dot(target_v)
        # target_sim = np.exp(target_sim)
        # context_sim = self.target_words.dot(context_v)
        # context_sim = np.exp(context_sim)

        # NORMALIZE
        # target_sim = self.target_words.dot(target_v)
        # target_sim_mean = np.mean(target_sim)
        # target_sim_std = np.sqrt(np.var(target_sim))
        # target_sim = (target_sim - target_sim_mean) / target_sim_std
        ## target_sim[target_sim < 0.0] = 0.0
        # context_sim = self.target_words.dot(context_v)
        # context_sim_mean = np.mean(context_sim)
        # context_sim_std = np.sqrt(np.var(context_sim))
        # context_sim = (context_sim - context_sim_mean) / context_sim_std
        ## context_sim[context_sim < 0.0] = 0.0
        # similarity = target_sim + context_sim
    else:
        if target_v is not None:
            similarity = (self.target_words.dot(target_v) + 1.0) / 2
        elif context_v is not None:
            similarity = (self.target_words.dot(context_v) + 1.0) / 2
        else:
            raise Exception("Can't find a target or a context.")

    result_vec = sorted(zip(self.index2word, similarity), reverse=True, key=lambda x: x[1])
    tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
    return result_vec
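# A toy illustration (made-up numbers, not from the original code) of the ZERO-TO-HALF
# combination used above: cosines in [-1, 1] are shifted to [0, 1] before multiplying, so a
# negative similarity merely damps a candidate instead of flipping the sign of the product.
import numpy as np

target_sim = np.array([0.8, -0.2, 0.5])     # cosine(candidate, target) for three candidates
context_sim = np.array([0.1, 0.9, 0.4])     # cosine(candidate, context) for the same three

similarity = ((target_sim + 1.0) / 2) * ((context_sim + 1.0) / 2)
print(similarity)   # [0.495, 0.38, 0.525] -> the third candidate ranks highest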
def run_test(inferrer):
    if args.candidatesfile is not None:
        target2candidates = read_candidates(args.candidatesfile)
    else:
        target2candidates = None

    tfi = open(args.testfile, 'r')
    tfo = open(args.resultsfile, 'w')
    tfo_ranked = open(args.resultsfile + '.ranked', 'w')
    tfo_generated_oot = open(args.resultsfile + '.generated.oot', 'w')
    tfo_generated_best = open(args.resultsfile + '.generated.best', 'w')

    lines = 0
    while True:
        context_line = tfi.readline()
        if not context_line:
            break
        lst_instance = ContextInstance(context_line, args.no_pos)
        lines += 1

        if args.debug:
            tfo.write("\nTest context:\n")
            tfo.write("***************\n")
            tfo.write(lst_instance.decorate_context())

        result_vec = inferrer.find_inferred(lst_instance, tfo)

        generated_results = inferrer.generate_inferred(
            result_vec, lst_instance.target, lst_instance.target_lemma, lst_instance.pos)
        tfo.write("\nGenerated lemmatized results\n")
        tfo.write("***************\n")
        tfo.write("GENERATED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id])
                  + " ::: " + vec_to_str_generated(iter(generated_results.items()), args.topgenerated) + "\n")
        tfo_generated_oot.write(' '.join([lst_instance.full_target_key, lst_instance.target_id])
                                + " ::: " + vec_to_str_generated(iter(generated_results.items()), args.topgenerated) + "\n")
        tfo_generated_best.write(' '.join([lst_instance.full_target_key, lst_instance.target_id])
                                 + " :: " + vec_to_str_generated(iter(generated_results.items()), 1) + "\n")

        filtered_results = inferrer.filter_inferred(
            result_vec, target2candidates[lst_instance.target_key], lst_instance.pos)
        tfo.write("\nFiltered results\n")
        tfo.write("***************\n")
        tfo.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id])
                  + "\t" + vec_to_str(iter(filtered_results.items()), len(filtered_results)) + "\n")
        tfo_ranked.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id])
                         + "\t" + vec_to_str(iter(filtered_results.items()), len(filtered_results)) + "\n")

        # print "end %f" % time.time()
        if lines % 10 == 0:
            print("Read %d lines" % lines)

    print("Read %d lines in total" % lines)
    print("Time per word: %f msec" % inferrer.msec_per_word())

    tfi.close()
    tfo.close()
    tfo_ranked.close()
    tfo_generated_oot.close()
    tfo_generated_best.close()
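# run_test reads its options from a module-level `args` object that is built elsewhere. The
# sketch below only lists the attributes this function actually touches; the flag spellings,
# help strings, and defaults are guesses for illustration, not taken from the original code.
import argparse

parser = argparse.ArgumentParser(description="lexical substitution inference")
parser.add_argument('--testfile', help='input file with one context line per test instance')
parser.add_argument('--resultsfile', help='output prefix; .ranked and .generated.* files are derived from it')
parser.add_argument('--candidatesfile', default=None, help='gold substitution candidates per target')
parser.add_argument('--topgenerated', type=int, default=10, help='how many generated substitutes to write')
parser.add_argument('--no-pos', dest='no_pos', action='store_true', help='ignore part of speech in target keys')
parser.add_argument('--debug', action='store_true', help='write decorated contexts and diagnostics to resultsfile')
args = parser.parse_args()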
def candidate_ranking_out(data_f, words2elmo_token, target_ws, sents, position_lst,
                          target_w2candidates, w2index, w2elmo, vocab_all):
    output_f_rank = open(data_f.split('/')[-1] + '.' + model + 'vocab80000.ranked', 'w')
    output_f_oot = open(data_f.split('/')[-1] + '.' + model + 'vocab80000.oot', 'w')
    output_f_best = open(data_f.split('/')[-1] + '.' + model + 'vocab80000.best', 'w')

    print('normalizing vectors')
    w2elmo = normalize(w2elmo)
    words2elmo_token = normalize(words2elmo_token)
    sim_matrix = w2elmo.dot(words2elmo_token.T).T
    # w2elmo = w2elmo / np.sqrt((w2elmo * w2elmo).sum())
    # words2elmo_token = words2elmo_token / np.sqrt((words2elmo_token * words2elmo_token).sum())
    print('normalizing completed')

    for i in range(len(words2elmo_token)):
        if i % 100 == 0 and i >= 100:
            print(i)
        w2elmo_token = words2elmo_token[i]
        target_w_out = target_ws[i]
        target_w = target_w_out.split()[0]
        pos = target_w.split('.')[-1]
        target_w_lemma = target_w.split('.')[0]

        # similarity against the full vocabulary
        # similarity = (w2elmo.dot(w2elmo_token) + 1.0) / 2
        similarity = (sim_matrix[i] + 1.0) / 2
        result_vec = sorted(zip(vocab_all, similarity), reverse=True, key=lambda x: x[1])

        try:
            candidates = target_w2candidates['.'.join(target_w.split('.')[:2])]
        except KeyError as e:
            print('target w does not occur in gold candidates list: {0}'.format(e))
            continue

        # ranked result
        filtered_results = filter_inferred(result_vec, candidates, pos)
        # candis_cos = sorted(filtered_results.items(), key=lambda x: x[1], reverse=True)
        # candis_cos = '\t'.join([res[0] + ' ' + str(res[1]) for res in candis_cos])
        # out_line = 'RANKED\t{0}\t{1}\n'.format(target_w, candis_cos)
        # output_f_rank.write(out_line)
        output_f_rank.write("RANKED\t" + target_w_out + "\t"
                            + vec_to_str(filtered_results.items(), len(filtered_results)) + "\n")

        # generated result
        generated_results = generate_inferred(result_vec, sents[i][position_lst[i]], target_w_lemma, pos)
        output_f_oot.write(target_w_out + " ::: " + vec_to_str_generated(generated_results.items(), 10) + "\n")
        output_f_best.write(target_w_out + " :: " + vec_to_str_generated(generated_results.items(), 1) + "\n")

    output_f_rank.close()
    output_f_best.close()
    output_f_oot.close()
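# `normalize` is called above but defined elsewhere; the dot products only behave as cosine
# similarities if it scales each row to unit L2 norm. A minimal sketch under that assumption:
import numpy as np

def normalize(matrix):
    """Scale every row of `matrix` to unit L2 norm; all-zero rows are left unchanged."""
    matrix = np.asarray(matrix, dtype=np.float32)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return matrix / norms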