Beispiel #1
0
    def filter_inferred(self, result_vec, candidates, pos):
        """Score ``candidates`` using the inferred words in ``result_vec``.

        Every inferred word is compared against ``candidates`` in four surface
        forms, in this order: its WordNet lemma, the title-cased lemma, the raw
        word and the title-cased raw word.  Each form found in ``candidates``
        is passed, with the word's weight, to ``self.add_inference_result``,
        which is expected to record it in the result dict and in the set of
        found candidates (helper not visible here -- TODO confirm).

        Candidates that were never matched are back-filled with a negative
        score computed from their unigram count in ``self.w2counts``.

        Args:
            result_vec: iterable of ``(word, weight)`` pairs, or ``None`` when
                there is nothing to match.
            candidates: set of candidate strings (membership tests and set
                difference are used on it).
            pos: key into ``to_wordnet_pos`` selecting the WordNet POS tag.

        Returns:
            dict mapping candidate -> score.
        """
        filtered_results = {}
        candidates_found = set()

        # Identity check: ``!= None`` would go through a custom ``__ne__``.
        if result_vec is not None:
            # One lemmatizer for the whole pass instead of one per word.
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                # Looked up per item so an empty result_vec never touches the
                # POS mapping (same as before).
                wn_pos = to_wordnet_pos[pos]
                lemma = lemmatizer.lemmatize(word, wn_pos)
                # Order matters only for the sequence of helper calls:
                # lemma, capitalized lemma, then the raw forms (a few
                # candidates are not lemmatized).
                for form in (lemma, lemma.title(), word, word.title()):
                    if form in candidates:
                        self.add_inference_result(form, weight,
                                                  filtered_results,
                                                  candidates_found)

        # Assign negative weights to candidates with no score; they sort
        # after the scored ones, ordered by their unigram count.
        for candidate in candidates - candidates_found:
            # Missing (or zero) counts fall back to 1, which avoids a
            # division by zero and yields the lowest score, -2.
            count = self.w2counts.get(candidate) or 1
            filtered_results[candidate] = -1 - (1.0 / count)  # in (-1, -2]

        return filtered_results
Beispiel #2
0
    def filter_inferred(self, result_vec, candidates, pos):
        """Keep only the inferred words of ``result_vec`` that are candidates.

        Every inferred word is compared against ``candidates`` in four surface
        forms, in this order: its WordNet lemma, the title-cased lemma, the raw
        word and the title-cased raw word.  Each form found in ``candidates``
        is passed, with the word's weight, to ``self.add_inference_result``,
        which is expected to record it in the result dict (helper not visible
        here -- TODO confirm).

        Candidates that never matched are left out of the result; the
        negative-score back-fill based on unigram counts is disabled here.

        Args:
            result_vec: iterable of ``(word, weight)`` pairs, or ``None`` when
                there is nothing to match.
            candidates: container of candidate strings supporting ``in``.
            pos: key into ``to_wordnet_pos`` selecting the WordNet POS tag.

        Returns:
            dict filled in by ``self.add_inference_result``; empty when
            ``result_vec`` is ``None`` or nothing matched.
        """
        filtered_results = {}
        candidates_found = set()

        # Identity check: ``!= None`` would go through a custom ``__ne__``.
        if result_vec is not None:
            # One lemmatizer for the whole pass instead of one per word.
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                # Looked up per item so an empty result_vec never touches the
                # POS mapping (same as before).
                wn_pos = to_wordnet_pos[pos]
                lemma = lemmatizer.lemmatize(word, wn_pos)
                # Raw forms are tried too because a few candidates are not
                # lemmatized.
                for form in (lemma, lemma.title(), word, word.title()):
                    if form in candidates:
                        self.add_inference_result(form, weight, filtered_results, candidates_found)

        return filtered_results
Beispiel #3
0
    def filter_inferred(self, result_vec, candidates, pos):
        """Keep only the inferred words of ``result_vec`` that are candidates.

        Every inferred word is compared against ``candidates`` in four surface
        forms, in this order: its WordNet lemma, the title-cased lemma, the raw
        word and the title-cased raw word.  Each form found in ``candidates``
        is passed, with the word's weight, to ``self.add_inference_result``,
        which is expected to record it in the result dict (helper not visible
        here -- TODO confirm).

        Candidates that never matched are left out of the result; the
        negative-score back-fill based on unigram counts is disabled here.

        Args:
            result_vec: iterable of ``(word, weight)`` pairs, or ``None`` when
                there is nothing to match.
            candidates: container of candidate strings supporting ``in``.
            pos: key into ``to_wordnet_pos`` selecting the WordNet POS tag.

        Returns:
            dict filled in by ``self.add_inference_result``; empty when
            ``result_vec`` is ``None`` or nothing matched.
        """
        filtered_results = {}
        candidates_found = set()

        # NOTE(review): open question from the original author -- can a
        # composite (multi-word) candidate ever be matched here? Unverified.
        # An earlier experiment matched the raw words only, without
        # lemmatization, to compare implementations; it is not active.

        # Identity check: ``!= None`` would go through a custom ``__ne__``.
        if result_vec is not None:
            # One lemmatizer for the whole pass instead of one per word.
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                # Looked up per item so an empty result_vec never touches the
                # POS mapping (same as before).
                wn_pos = to_wordnet_pos[pos]
                lemma = lemmatizer.lemmatize(word, wn_pos)
                # Raw forms are tried too because a few candidates are not
                # lemmatized.
                for form in (lemma, lemma.title(), word, word.title()):
                    if form in candidates:
                        self.add_inference_result(form, weight,
                                                  filtered_results,
                                                  candidates_found)

        return filtered_results
    def filter_inferred(self, result_vec, candidates, pos):
        """Keep only the inferred words of ``result_vec`` that are candidates.

        Byte-string words are decoded as UTF-8 first; a word that cannot be
        decoded (or whose lemmatization raises ``UnicodeDecodeError``) is
        printed and skipped.  Every remaining word is compared against
        ``candidates`` in four surface forms, in this order: its WordNet
        lemma, the title-cased lemma, the word itself and the title-cased
        word.  Each form found in ``candidates`` is passed, with the word's
        weight, to ``self.add_inference_result``, which is expected to record
        it in the result dict (helper not visible here -- TODO confirm).

        Candidates that never matched are left out of the result; the
        negative-score back-fill based on unigram counts is disabled here.

        Args:
            result_vec: iterable of ``(word, weight)`` pairs, or ``None`` when
                there is nothing to match.  ``word`` may be ``bytes`` or text.
            candidates: container of candidate strings supporting ``in``.
            pos: key into ``to_wordnet_pos`` selecting the WordNet POS tag.

        Returns:
            dict filled in by ``self.add_inference_result``; empty when
            ``result_vec`` is ``None`` or nothing matched.
        """
        filtered_results = {}
        candidates_found = set()

        # Identity check: ``!= None`` would go through a custom ``__ne__``.
        if result_vec is not None:
            # One lemmatizer for the whole pass instead of one per word.
            lemmatizer = WordNetLemmatizer()
            for word, weight in result_vec:
                # Looked up per item so an empty result_vec never touches the
                # POS mapping (same as before).
                wn_pos = to_wordnet_pos[pos]
                try:
                    # Only byte strings need decoding; text has no usable
                    # ``decode`` on Python 3.  Decoding happens inside the
                    # ``try`` so malformed UTF-8 is skipped, not fatal.
                    if isinstance(word, bytes):
                        word = word.decode('utf-8')
                    lemma = lemmatizer.lemmatize(word, wn_pos)
                except UnicodeDecodeError as e:
                    print(word, e)
                    continue
                # Raw forms are tried too because a few candidates are not
                # lemmatized.
                for form in (lemma, lemma.title(), word, word.title()):
                    if form in candidates:
                        self.add_inference_result(form, weight,
                                                  filtered_results,
                                                  candidates_found)

        return filtered_results