Code example #1
0
    def __init__(self, vocabfile, ignore_target, context_math, word_path,
                 context_path, conll_filename, window_size,
                 top_inferences_to_analyze):
        """Load word/context embedding spaces and vocabulary counts.

        Exactly one context source may be configured: either a
        non-negative word window, or a CoNLL dependency file used when
        window_size is negative (enforced by the assertion below).
        """
        CsInferrer.__init__(self)

        # Separate embedding spaces for target words and their contexts.
        self.word_vecs = Embedding(word_path)
        self.context_vecs = Embedding(context_path)

        self.ignore_target = ignore_target
        self.context_math = context_math
        self.use_stopwords = False
        self.top_inferences_to_analyze = top_inferences_to_analyze

        # A word window and a CoNLL file are mutually exclusive.
        assert window_size < 0 or conll_filename is None
        self.window_size = window_size
        if window_size < 0:
            self.conll_file = open(conll_filename, 'r')
            self.sents = read_conll(self.conll_file, True)

        # Middle element of the tuple (total count) is not needed here.
        self.w2counts, _, self.stopwords = load_vocabulary_counts(vocabfile)
Code example #2
0
    def __init__(self, path, vocabfile, top_inferences_to_analyze):
        """Load the embedding space and the unigram counts used to rank
        candidates that receive no similarity score."""
        CsInferrer.__init__(self)
        self.top_inferences_to_analyze = top_inferences_to_analyze
        self.embeddings = Embedding(path)
        # Only the word->count mapping is needed from the vocabulary file.
        self.w2counts = load_vocabulary_counts(vocabfile)[0]
Code example #3
0
class EmbeddingInferrer(CsInferrer):
    """Ranks substitution candidates for a target word by similarity in a
    single, context-insensitive embedding space.

    Candidates that receive no similarity score are assigned a negative
    weight derived from their unigram count so they sort last.
    """

    def __init__(self, path, vocabfile, top_inferences_to_analyze):
        CsInferrer.__init__(self)
        self.embeddings = Embedding(path)
        # Number of top inferred words to include in the trace output.
        self.top_inferences_to_analyze = top_inferences_to_analyze

        # Unigram counts; used only to order unscored candidates.
        self.w2counts, ignore1, ignore2 = load_vocabulary_counts(vocabfile)

    def new_target_key(self, target_key):
        # Stateless inferrer: nothing to reset between targets.
        pass

    def find_inferred(self, lst_instance, tfo):
        """Return the embedding neighbours of the target word, or None if
        the target is out of vocabulary.  Writes a trace to tfo."""
        if lst_instance.target in self.embeddings:
            result_vec, deltatime = self.embeddings.closest_with_time(
                lst_instance.target, -1)
        else:
            result_vec, deltatime = None, 0

        tfo.write("\nDeltatime: %f msec\n" % ((deltatime) * 1000))
        self.inference_time(deltatime)

        if result_vec is not None:
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')
        else:
            tfo.write("Top most similar embeddings: " + " contexts: None\n")

        return result_vec

    def filter_inferred(self, result_vec, candidates, pos):
        """Keep only inferred (word, weight) pairs that match an entry in
        `candidates`, matching lemmas and capitalized variants as well.

        Returns a dict mapping candidate -> score; candidates with no
        inferred score get a negative score in (-2, -1] based on their
        unigram count.
        """
        filtered_results = {}
        candidates_found = set()

        if result_vec is not None:  # fixed: was `!= None`
            # Hoisted out of the loop: constructing a WordNetLemmatizer
            # (and looking up the POS) per inferred word is needlessly
            # expensive.
            lemmatizer = WordNetLemmatizer()
            wn_pos = to_wordnet_pos[pos]
            for word, weight in result_vec:
                lemma = lemmatizer.lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(lemma, weight, filtered_results,
                                              candidates_found)
                if lemma.title() in candidates:  # match also capitalized words
                    self.add_inference_result(lemma.title(), weight,
                                              filtered_results,
                                              candidates_found)
                if word in candidates:  # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word, weight, filtered_results,
                                              candidates_found)
                if word.title() in candidates:  # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word.title(), weight,
                                              filtered_results,
                                              candidates_found)

        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count
        candidates_left = candidates - candidates_found
        for candidate in candidates_left:
            count = self.w2counts.get(candidate, 1)
            score = -1 - (1.0 / count)  # between (-1,-2]
            filtered_results[candidate] = score

        return filtered_results
Code example #4
0
class CsEmbeddingInferrer(CsInferrer):
    """Context-sensitive lexical-substitution inferrer.

    Combines a target-word embedding with embeddings of its contexts
    (either a word window or CoNLL dependency contexts) using one of
    several arithmetic schemes ('add', 'avg', 'mult', 'geomean', 'none').
    """

    def __init__(self, vocabfile, ignore_target, context_math, word_path,
                 context_path, conll_filename, window_size,
                 top_inferences_to_analyze):
        """Load embeddings and vocabulary; open the CoNLL file when a
        negative window size selects dependency contexts."""
        CsInferrer.__init__(self)
        self.ignore_target = ignore_target
        self.context_math = context_math
        self.word_vecs = Embedding(word_path)
        self.context_vecs = Embedding(context_path)
        self.use_stopwords = False

        # A word window and a CoNLL dependency file are mutually exclusive.
        assert (not (window_size >= 0 and conll_filename is not None))
        self.window_size = window_size
        if (window_size < 0):
            self.conll_file = open(conll_filename, 'r')
            self.sents = read_conll(self.conll_file, True)
        self.top_inferences_to_analyze = top_inferences_to_analyze

        self.w2counts, ignore, self.stopwords = load_vocabulary_counts(
            vocabfile)

    def close(self):
        """Release the CoNLL file handle, if one was opened.

        (Replaces the previously commented-out close(); guarded because
        conll_file only exists when window_size < 0.)
        """
        if hasattr(self, 'conll_file'):
            self.conll_file.close()

    def represent(self, target, deps, avg_flag, tfo):
        """Build a normalized vector for target-plus-contexts.

        Sums (or averages, when avg_flag) the context embeddings, adds
        the target embedding when target is not None, and normalizes to
        unit length.  Returns None when neither the target nor any
        context contributed a vector.
        """
        target_vec = None if target is None else np.copy(
            self.word_vecs.represent(target))
        dep_vec = None
        deps_found = 0
        for dep in deps:
            if dep in self.context_vecs:
                deps_found += 1
                if dep_vec is None:
                    # Copy so the accumulation below doesn't mutate the
                    # embedding table's row.
                    dep_vec = np.copy(self.context_vecs.represent(dep))
                else:
                    dep_vec += self.context_vecs.represent(dep)
            else:
                tfo.write("NOTICE: %s not in context embeddings. Ignoring.\n" %
                          dep)

        ret_vec = None
        if target_vec is not None:
            ret_vec = target_vec
        if dep_vec is not None:
            if avg_flag:
                dep_vec /= deps_found
            if ret_vec is None:
                ret_vec = dep_vec
            else:
                ret_vec += dep_vec

        # BUGFIX: previously ret_vec.dot(...) raised AttributeError when
        # both the target and every context were missing; callers in
        # find_inferred already handle a None result.
        if ret_vec is None:
            return None

        norm = (ret_vec.dot(ret_vec.transpose()))**0.5
        ret_vec /= norm

        return ret_vec

    def mult(self, target, deps, geo_mean_flag, tfo):
        """Score all words by multiplying target and context similarity
        score vectors (geometric mean over contexts when geo_mean_flag).
        """
        # SUPPORT NONE TARGET
        # NOTE(review): a None target would fail in represent() below —
        # the TODO above appears not to be implemented yet.
        target_vec = self.word_vecs.represent(target)
        scores = self.word_vecs.pos_scores(target_vec)
        for dep in deps:
            if dep in self.context_vecs:
                dep_vec = self.context_vecs.represent(dep)
                mult_scores = self.word_vecs.pos_scores(dep_vec)
                if geo_mean_flag:
                    # len(deps)-th root implements the geometric mean.
                    mult_scores = mult_scores**(1.0 / len(deps))
                scores = np.multiply(scores, mult_scores)
            else:
                tfo.write("NOTICE: %s not in context embeddings. Ignoring.\n" %
                          dep)

        result_vec = self.word_vecs.top_scores(scores, -1)
        return result_vec

    def extract_contexts(self, lst_instance):
        """Return the context words/deps for the instance's target.

        Negative window size: take dependency contexts from the next
        CoNLL sentence, realigning the target index forwards then
        backwards when the recorded form does not match.  Otherwise:
        take a plain word window around the target.
        """
        if self.window_size < 0:
            cur_sent = next(self.sents)
            # +1 because CoNLL tokens are 1-indexed (root occupies 0) —
            # presumed from the indexing below; confirm against read_conll.
            cur_sent_target_ind = lst_instance.target_ind + 1
            while ((cur_sent_target_ind < len(cur_sent)
                    and cur_sent[cur_sent_target_ind].form !=
                    lst_instance.target)):
                sys.stderr.write(
                    "Target word form mismatch in target id %s: %s != %s  Checking next word.\n"
                    %
                    (lst_instance.target_id,
                     cur_sent[cur_sent_target_ind].form, lst_instance.target))
                cur_sent_target_ind += 1
            if cur_sent_target_ind == len(cur_sent):
                # Forward scan failed; retry scanning backwards.
                sys.stderr.write("Start looking backwards.\n")
                cur_sent_target_ind = lst_instance.target_ind
                while ((cur_sent_target_ind > 0)
                       and (cur_sent[cur_sent_target_ind].form !=
                            lst_instance.target)):
                    sys.stderr.write(
                        "Target word form mismatch in target id %s: %s != %s  Checking previous word.\n"
                        % (lst_instance.target_id,
                           cur_sent[cur_sent_target_ind].form,
                           lst_instance.target))
                    cur_sent_target_ind -= 1
            if cur_sent_target_ind == 0:
                # Give up and fall back to the recorded index.
                sys.stderr.write("ERROR: Couldn't find a match for target.")
                cur_sent_target_ind = lst_instance.target_ind + 1
            stopwords = self.stopwords if self.use_stopwords else set()
            contexts = get_deps(cur_sent, cur_sent_target_ind, stopwords)
        else:
            contexts = lst_instance.get_neighbors(self.window_size)

        return contexts

    def find_inferred(self, lst_instance, tfo):
        """Infer substitutes for the instance's target given its contexts.

        Returns the result vector of (word, score) pairs, or None when
        the target (and its lemma) are out of vocabulary.  Writes a
        detailed trace to tfo.
        """
        contexts = self.extract_contexts(lst_instance)
        tfo.write("Contexts for target %s are: %s\n" %
                  (lst_instance.target, contexts))
        contexts = [c for c in contexts if c in self.context_vecs]
        tfo.write("Contexts in vocabulary for target %s are: %s\n" %
                  (lst_instance.target, contexts))
        if self.ignore_target:
            target = None
        else:
            # Fall back to the lemma when the surface form is OOV.
            if lst_instance.target not in self.word_vecs:
                tfo.write("ERROR: %s not in word embeddings.Trying lemma.\n" %
                          lst_instance.target)
                if lst_instance.target_lemma not in self.word_vecs:
                    tfo.write(
                        "ERROR: lemma %s also not in word embeddings. Giving up.\n"
                        % lst_instance.target_lemma)
                    return None
                else:
                    target = lst_instance.target_lemma
            else:
                target = lst_instance.target

        # 'add' and 'avg' metrics are implemented more efficiently with vector representation arithmetics
        # as shown in Omer's linguistic regularities paper, this is equivalent as long as the vectors are normalized to 1
        if self.context_math == 'add':
            cs_rep = self.represent(target, contexts, False, tfo)
            if cs_rep is None:
                cs_rep = self.word_vecs.zeros()
            result_vec = self.word_vecs.closest_vec(cs_rep, -1)
        elif self.context_math == 'avg':
            cs_rep = self.represent(target, contexts, True, tfo)
            if cs_rep is None:
                cs_rep = self.word_vecs.zeros()
            result_vec = self.word_vecs.closest_vec(cs_rep, -1)
        elif self.context_math == 'mult':
            result_vec = self.mult(target, contexts, False, tfo)
        elif self.context_math == 'geomean':
            result_vec = self.mult(target, contexts, True, tfo)
        elif self.context_math == 'none' and self.ignore_target is not None:
            # NOTE(review): ignore_target is assigned a boolean in
            # __init__, so `is not None` is always True here — the extra
            # condition looks vestigial; confirm before simplifying.
            result_vec = self.word_vecs.closest(target, -1)
        else:
            raise Exception('Unknown context math: %s' % self.context_math)

        if (result_vec is not None):
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')
        else:
            tfo.write("Top most similar embeddings: " + " contexts: None\n")

        return result_vec
Code example #5
0
File: embedding_inferrer.py  Project: orenmel/lexsub
 def __init__(self, path, vocabfile, top_inferences_to_analyze):
     """Load the embedding space and the unigram counts used to rank
     unscored candidates."""
     CsInferrer.__init__(self)
     self.top_inferences_to_analyze = top_inferences_to_analyze
     self.embeddings = Embedding(path)
     # Only the word->count mapping is needed from the vocabulary file.
     self.w2counts = load_vocabulary_counts(vocabfile)[0]
Code example #6
0
File: embedding_inferrer.py  Project: orenmel/lexsub
class EmbeddingInferrer(CsInferrer):
    """Ranks substitution candidates for a target word by similarity in a
    single, context-insensitive embedding space.

    Candidates that receive no similarity score are assigned a negative
    weight derived from their unigram count so they sort last.
    """

    def __init__(self, path, vocabfile, top_inferences_to_analyze):
        CsInferrer.__init__(self)
        self.embeddings = Embedding(path)
        # Number of top inferred words to include in the trace output.
        self.top_inferences_to_analyze = top_inferences_to_analyze

        # Unigram counts; used only to order unscored candidates.
        self.w2counts, ignore1, ignore2 = load_vocabulary_counts(vocabfile)

    def new_target_key(self, target_key):
        # Stateless inferrer: nothing to reset between targets.
        pass

    def find_inferred(self, lst_instance, tfo):
        """Return the embedding neighbours of the target word, or None if
        the target is out of vocabulary.  Writes a trace to tfo."""
        if lst_instance.target in self.embeddings:
            result_vec, deltatime = self.embeddings.closest_with_time(lst_instance.target, -1)
        else:
            result_vec, deltatime = None, 0

        tfo.write("\nDeltatime: %f msec\n" % ((deltatime)*1000))
        self.inference_time(deltatime)

        if result_vec is not None:
            tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
        else:
            tfo.write("Top most similar embeddings: " + " contexts: None\n")

        return result_vec

    def filter_inferred(self, result_vec, candidates, pos, orig_subvec_ignored):
        """Keep only inferred (word, weight) pairs that match an entry in
        `candidates`, matching lemmas and capitalized variants as well.

        `orig_subvec_ignored` is unused; kept for interface compatibility
        with callers.  Returns a dict mapping candidate -> score;
        candidates with no inferred score get a negative score in
        (-2, -1] based on their unigram count.
        """
        filtered_results = {}
        candidates_found = set()

        if result_vec is not None:  # fixed: was `!= None`
            # Hoisted out of the loop: constructing a WordNetLemmatizer
            # (and looking up the POS) per inferred word is needlessly
            # expensive.
            lemmatizer = WordNetLemmatizer()
            wn_pos = to_wordnet_pos[pos]
            for word, weight in result_vec:
                lemma = lemmatizer.lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(lemma, weight, filtered_results, candidates_found)
                if lemma.title() in candidates: # match also capitalized words
                    self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
                if word in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word, weight, filtered_results, candidates_found)
                if word.title() in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word.title(), weight, filtered_results, candidates_found)

        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count
        candidates_left = candidates - candidates_found
        for candidate in candidates_left:
            count = self.w2counts.get(candidate, 1)
            score = -1 - (1.0/count) # between (-1,-2]
            filtered_results[candidate] = score

        return filtered_results