def find_inferred(self, lst_instance, tfo):
        """Infer a ranked vector of substitutes for the target in *lst_instance*.

        Contexts are extracted (from dependency parses or a token window),
        filtered to those present in the context-embedding vocabulary, and
        combined with the target according to ``self.context_math``
        ('add', 'avg', 'mult', 'geomean' or 'none').

        Writes a verbose trace to *tfo* and returns the ranked result vector,
        or None when neither the target nor its lemma has a word embedding.
        """
        # At this stage we're grabbing contexts from the dependency parse or
        # via a window around the target.
        contexts = self.extract_contexts(lst_instance)
        tfo.write("Contexts for target %s are: %s\n" %
                  (lst_instance.target, contexts))
        # Keep only contexts that actually have a context embedding.
        contexts = [c for c in contexts if c in self.context_vecs]
        tfo.write("Contexts in vocabulary for target %s are: %s\n" %
                  (lst_instance.target, contexts))

        if self.ignore_target:
            target = None
        else:
            # Fall back to the lemma when the surface form has no embedding.
            if lst_instance.target not in self.word_vecs:
                tfo.write("ERROR: %s not in word embeddings.Trying lemma.\n" %
                          lst_instance.target)
                if lst_instance.target_lemma not in self.word_vecs:
                    tfo.write(
                        "ERROR: lemma %s also not in word embeddings. Giving up.\n"
                        % lst_instance.target_lemma)
                    return None
                target = lst_instance.target_lemma
            else:
                target = lst_instance.target

        # 'add' and 'avg' are implemented with vector arithmetic; as shown in
        # Omer's linguistic regularities paper this is equivalent as long as
        # the vectors are normalized to 1.  The two branches were duplicated
        # and differed only in the averaging flag, so they are merged here.
        if self.context_math in ('add', 'avg'):
            cs_rep = self.represent(target, contexts,
                                    self.context_math == 'avg', tfo)
            if cs_rep is None:
                cs_rep = self.word_vecs.zeros()
            result_vec = self.word_vecs.closest_vec(cs_rep, -1)
        elif self.context_math == 'mult':
            result_vec = self.mult(target, contexts, False, tfo)
        elif self.context_math == 'geomean':
            result_vec = self.mult(target, contexts, True, tfo)
        elif self.context_math == 'none' and self.ignore_target is not None:
            # NOTE(review): 'self.ignore_target is not None' is almost always
            # true for a boolean flag; 'not self.ignore_target' may have been
            # intended.  Behavior kept as-is -- confirm before changing.
            result_vec = self.word_vecs.closest(target, -1)
        else:
            raise Exception('Unknown context math: %s' % self.context_math)

        if result_vec is not None:
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')
        else:
            tfo.write("Top most similar embeddings: " + " contexts: None\n")

        return result_vec
# --- Beispiel #2 (example separator from the original scrape; score: 0) ---
 def find_inferred(self, lst_instance, tfo):
     """Look up the embeddings closest to the instance's target word.

     Falls back to (None, 0) when the target is out of vocabulary; logs the
     lookup wall-time to *tfo* and records it via ``self.inference_time``.
     Returns the result vector, or None for an unknown target.
     """
     known = lst_instance.target in self.embeddings
     if known:
         result_vec, deltatime = self.embeddings.closest_with_time(lst_instance.target, -1)
     else:
         result_vec, deltatime = None, 0

     tfo.write("\nDeltatime: %f msec\n" % (deltatime * 1000))
     self.inference_time(deltatime)

     if result_vec is None:
         tfo.write("Top most similar embeddings: " + " contexts: None\n")
     else:
         tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')

     return result_vec
# --- Beispiel #3 (example separator from the original scrape; score: 0) ---
    def find_inferred(self, lst_instance, tfo):
        """Return the embeddings closest to the target word, or None if OOV.

        Also writes the lookup time to *tfo* and feeds it into the running
        inference-time statistics via ``self.inference_time``.
        """
        result_vec, deltatime = None, 0
        if lst_instance.target in self.embeddings:
            result_vec, deltatime = self.embeddings.closest_with_time(
                lst_instance.target, -1)

        tfo.write("\nDeltatime: %f msec\n" % (deltatime * 1000))
        self.inference_time(deltatime)

        if result_vec is None:
            tfo.write("Top most similar embeddings: " + " contexts: None\n")
        else:
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')

        return result_vec
    def find_inferred(self, lst_instance, tfo):
        """Rank the target vocabulary by similarity to the instance.

        Cosine scores are shifted from [-1, 1] into [0, 1]; when both a
        target and a context representation are available the two shifted
        scores are multiplied (this combination is still experimental).

        Returns a list of (word, score) pairs sorted best-first.
        Raises Exception when neither representation can be built.
        """
        target_v, context_v = self.represent_target_and_context(
            lst_instance, tfo)

        if target_v is not None and context_v is not None:
            # NOTE: multiplying the two shifted cosines is not working very
            # well at the moment and requires more research.  (Alternative
            # schemes -- rank products, clipped / exponentiated / normalized
            # scores -- were experimented with and discarded.)
            target_sim = (self.target_words.dot(target_v) + 1.0) / 2
            context_sim = (self.target_words.dot(context_v) + 1.0) / 2
            similarity = target_sim * context_sim
        elif target_v is not None:
            similarity = (self.target_words.dot(target_v) + 1.0) / 2
        elif context_v is not None:
            similarity = (self.target_words.dot(context_v) + 1.0) / 2
        else:
            raise Exception("Can't find a target nor context.")

        result_vec = sorted(zip(self.index2word, similarity),
                            key=lambda pair: pair[1],
                            reverse=True)

        tfo.write("Top most similar embeddings: " +
                  vec_to_str(result_vec, self.top_inferences_to_analyze) +
                  '\n')

        return result_vec
# --- Beispiel #5 (example separator from the original scrape; score: 0) ---
def run_test(inferrer):
    """Run the lexical-substitution test set through *inferrer*.

    Reads one context per line from ``args.testfile``, writes a verbose
    trace to ``args.resultsfile`` and the generated / ranked substitute
    lists to the '.generated.oot', '.generated.best' and '.ranked' side
    files.  Relies on the module-level ``args`` namespace.
    """
    if args.candidatesfile is not None:
        target2candidates = read_candidates(args.candidatesfile)
    else:
        target2candidates = None

    # 'with' guarantees all five streams are closed even when an inference
    # step raises; the original leaked the handles on error.
    with open(args.testfile, 'r') as tfi, \
            open(args.resultsfile, 'w') as tfo, \
            open(args.resultsfile + '.ranked', 'w') as tfo_ranked, \
            open(args.resultsfile + '.generated.oot', 'w') as tfo_generated_oot, \
            open(args.resultsfile + '.generated.best', 'w') as tfo_generated_best:

        lines = 0
        for context_line in tfi:
            lst_instance = ContextInstance(context_line, args.no_pos)
            lines += 1
            if args.debug:
                tfo.write("\nTest context:\n")
                tfo.write("***************\n")

            tfo.write(lst_instance.decorate_context())

            result_vec = inferrer.find_inferred(lst_instance, tfo)

            generated_results = inferrer.generate_inferred(
                result_vec, lst_instance.target, lst_instance.target_lemma,
                lst_instance.pos)

            # The GENERATED/oot/best lines share the same key prefix and
            # generated string; build them once per instance.
            instance_key = ' '.join(
                [lst_instance.full_target_key, lst_instance.target_id])
            generated_str = vec_to_str_generated(
                iter(generated_results.items()), args.topgenerated)

            tfo.write("\nGenerated lemmatized results\n")
            tfo.write("***************\n")
            tfo.write("GENERATED\t" + instance_key + " ::: " +
                      generated_str + "\n")
            tfo_generated_oot.write(instance_key + " ::: " +
                                    generated_str + "\n")
            tfo_generated_best.write(
                instance_key + " :: " +
                vec_to_str_generated(iter(generated_results.items()), 1) +
                "\n")

            # BUG FIX: the original indexed target2candidates unconditionally
            # and crashed with a TypeError when no candidates file was given;
            # ranking is impossible without candidates, so skip it then.
            if target2candidates is not None:
                filtered_results = inferrer.filter_inferred(
                    result_vec, target2candidates[lst_instance.target_key],
                    lst_instance.pos)

                ranked_line = (
                    "RANKED\t" + instance_key + "\t" +
                    vec_to_str(iter(filtered_results.items()),
                               len(filtered_results)) + "\n")
                tfo.write("\nFiltered results\n")
                tfo.write("***************\n")
                tfo.write(ranked_line)
                tfo_ranked.write(ranked_line)

            if lines % 10 == 0:
                print("Read %d lines" % lines)

    print("Read %d lines in total" % lines)
    print("Time per word: %f msec" % inferrer.msec_per_word())
# --- Beispiel #6 (example separator from the original scrape; score: 0) ---
def run_test(inferrer):
    """Run the lexical-substitution test set through *inferrer*.

    NOTE(review): this variant used Python-2-only syntax (``print``
    statements, ``dict.iteritems``) which is a SyntaxError under Python 3
    while the rest of the file targets Python 3; ported accordingly.
    Output format is unchanged.
    """
    if args.candidatesfile is not None:
        target2candidates = read_candidates(args.candidatesfile)
    else:
        target2candidates = None

    # Context managers close all five streams even if inference raises.
    with open(args.testfile, 'r') as tfi, \
            open(args.resultsfile, 'w') as tfo, \
            open(args.resultsfile + '.ranked', 'w') as tfo_ranked, \
            open(args.resultsfile + '.generated.oot', 'w') as tfo_generated_oot, \
            open(args.resultsfile + '.generated.best', 'w') as tfo_generated_best:

        lines = 0
        for context_line in tfi:
            lst_instance = ContextInstance(context_line, args.no_pos)
            lines += 1
            if args.debug:
                tfo.write("\nTest context:\n")
                tfo.write("***************\n")

            tfo.write(lst_instance.decorate_context())

            result_vec = inferrer.find_inferred(lst_instance, tfo)

            generated_results = inferrer.generate_inferred(result_vec, lst_instance.target, lst_instance.target_lemma, lst_instance.pos)

            tfo.write("\nGenerated lemmatized results\n")
            tfo.write("***************\n")
            tfo.write("GENERATED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " ::: " + vec_to_str_generated(iter(generated_results.items()), args.topgenerated) + "\n")
            tfo_generated_oot.write(' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " ::: " + vec_to_str_generated(iter(generated_results.items()), args.topgenerated) + "\n")
            tfo_generated_best.write(' '.join([lst_instance.full_target_key, lst_instance.target_id]) + " :: " + vec_to_str_generated(iter(generated_results.items()), 1) + "\n")

            # BUG FIX: guard the candidates lookup -- the original crashed
            # with a TypeError when no candidates file was supplied.
            if target2candidates is not None:
                filtered_results = inferrer.filter_inferred(result_vec, target2candidates[lst_instance.target_key], lst_instance.pos)

                tfo.write("\nFiltered results\n")
                tfo.write("***************\n")
                tfo.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + "\t" + vec_to_str(iter(filtered_results.items()), len(filtered_results)) + "\n")
                tfo_ranked.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + "\t" + vec_to_str(iter(filtered_results.items()), len(filtered_results)) + "\n")

            if lines % 10 == 0:
                print("Read %d lines" % lines)

    print("Read %d lines in total" % lines)
    print("Time per word: %f msec" % inferrer.msec_per_word())
# --- Beispiel #7 (example separator from the original scrape; score: 0) ---
    def find_inferred(self, lst_instance, tfo):
        """Score the whole target vocabulary against the instance.

        Dot-product cosines are shifted from [-1, 1] into [0, 1]; when both
        the target and the context representation exist their shifted scores
        are multiplied (experimental -- see note below).

        Returns (word, score) pairs sorted best-first; raises Exception when
        neither representation is available.
        """
        target_v, context_v = self.represent_target_and_context(lst_instance, tfo)

        def shifted(vec):
            # Map a cosine similarity from [-1, 1] to [0, 1].
            return (self.target_words.dot(vec) + 1.0) / 2

        if target_v is not None and context_v is not None:
            # This combination is not working very well at the moment and
            # requires more research.  (Rank / clip / exp / normalize
            # variants were experimented with and discarded.)
            similarity = shifted(target_v) * shifted(context_v)
        elif target_v is not None:
            similarity = shifted(target_v)
        elif context_v is not None:
            similarity = shifted(context_v)
        else:
            raise Exception("Can't find a target nor context.")

        result_vec = sorted(zip(self.index2word, similarity), reverse=True, key=lambda x: x[1])

        tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')

        return result_vec
def candidate_ranking_out(data_f, words2elmo_token, target_ws, sents,
                          position_lst, target_w2candidates, w2index, w2elmo,
                          vocab_all):
    """Rank gold candidates and generate substitutes for every instance.

    For each target occurrence i, the cosine similarity between its
    token-level vector and every vocabulary vector is shifted into [0, 1];
    candidates from the gold list are re-ranked ('.ranked' file) and
    open-vocabulary substitutes are generated ('.oot' and '.best' files).

    NOTE(review): relies on the module-level name ``model`` for the output
    file prefix; ``w2index`` is accepted but unused here -- kept for
    interface compatibility.
    """
    out_prefix = data_f.split('/')[-1] + '.' + model + 'vocab80000.'
    # 'with' ensures the three output files are closed even on error
    # (the original leaked the handles if anything below raised).
    with open(out_prefix + 'ranked', 'w') as output_f_rank, \
            open(out_prefix + 'oot', 'w') as output_f_oot, \
            open(out_prefix + 'best', 'w') as output_f_best:
        print('normalizing vectors')
        w2elmo = normalize(w2elmo)
        words2elmo_token = normalize(words2elmo_token)
        # One matrix product for all instances instead of a per-instance dot.
        sim_matrix = w2elmo.dot(words2elmo_token.T).T
        print('normalizing completed')

        for i in range(len(words2elmo_token)):
            if i % 100 == 0 and i >= 100:
                print(i)  # progress indicator
            target_w_out = target_ws[i]
            target_w = target_w_out.split()[0]
            pos = target_w.split('.')[-1]
            target_w_lemma = target_w.split('.')[0]
            # Shift cosine similarity from [-1, 1] into [0, 1].
            similarity = (sim_matrix[i] + 1.0) / 2
            result_vec = sorted(zip(vocab_all, similarity),
                                reverse=True,
                                key=lambda x: x[1])
            try:
                candidates = target_w2candidates['.'.join(target_w.split('.')[:2])]
            except KeyError as e:
                print(
                    'target w does not occur in gold candidates list: {0}'.format(
                        e))
                continue

            # Candidate re-ranking output.
            filtered_results = filter_inferred(result_vec, candidates, pos)
            output_f_rank.write(
                "RANKED\t" + target_w_out + "\t" +
                vec_to_str(filtered_results.items(), len(filtered_results)) + "\n")

            # Open-vocabulary generation output.
            generated_results = generate_inferred(result_vec,
                                                  sents[i][position_lst[i]],
                                                  target_w_lemma, pos)
            output_f_oot.write(
                target_w_out + " ::: " +
                vec_to_str_generated(generated_results.items(), 10) + "\n")
            output_f_best.write(
                target_w_out + " :: " +
                vec_to_str_generated(generated_results.items(), 1) + "\n")