def score(self, key, tagged_gram, test_file_path, wv_dict, **kwargs):
    """Score *key* against the test file, restricting word vectors to the
    contexts most WMD-similar to the example contexts of *tagged_gram*.

    tagged_gram example: [['esafetyworld', 'comp'], ['inc', 'end']]

    Returns whatever OneShotTestDoc2Vec.score returns for the reduced
    word-vector dict.
    """
    # Contexts previously recorded for this exact tagged gram.
    example_contexts = self.example_tagged_words_contexts_dict[
        tagged_words_to_str(tagged_gram)]
    test_contexts = list(self.context_sized_test_wv_dict.keys())
    # NOTE(review): WMD index is rebuilt on every call; the original had
    # commented-out save/load caching — consider restoring a working cache
    # if this method is hot.
    wmd_instance = WmdSimilarity(test_contexts, self.context_vec_model,
                                 num_best=1)
    # For each example context keep the single most similar test context.
    similar_contexts = []
    for example_context in example_contexts:
        sims = wmd_instance[example_context]
        similar_contexts.append(test_contexts[sims[0][0]])
    logging.info('similar contexts:')
    # Was a bare print(); use logging like the rest of the method.
    logging.info(similar_contexts)
    context_wv_dict = util.subset_dict_by_list2(wv_dict, similar_contexts)
    logging.info('context_wv_dict:')
    logging.info(len(context_wv_dict))
    gram = util.sentence_from_tagged_ngram(tagged_gram)
    return OneShotTestDoc2Vec.score(self, key, gram, test_file_path,
                                    context_wv_dict)
def test_file_processing(self, test_file_path):
    """Run the parent's processing, then cache a word-vector dict built
    from the context-sized tagged ngrams of *test_file_path*."""
    OneShotTestDoc2Vec.test_file_processing(self, test_file_path)
    tagged_ngrams = ex_parsing.ngrams_from_file(
        test_file_path, self.context_size, tagged=True)
    sentences = []
    for tagged_ngram in tagged_ngrams:
        sentences.append(util.sentence_from_tagged_ngram(tagged_ngram))
    self.context_sized_test_wv_dict = self.context_vector_to_dict_by_list(
        self.context_vec_model, sentences)
def test_file_processing(self, test_file_path):
    """Run the parent's processing, then cache a doc-vector dict built
    from the context-sized tagged ngrams of *test_file_path*."""
    super().test_file_processing(test_file_path)
    ngrams = ex_parsing.ngrams_from_file(
        test_file_path, self.context_size, tagged=True)
    sentences = [util.sentence_from_tagged_ngram(t) for t in ngrams]
    # Lazy %-args: formatting is deferred until the record is emitted.
    logging.info("sentences: %d", len(sentences))
    self.context_sized_test_wv_dict = self.doc_vector_to_dict_by_list(
        self.context_vec_model, sentences)
def entity_dict_from_tagged_tokens(tagged_tokens):
    """Map each entity found in *tagged_tokens* to the list of sentences
    reconstructed from its tagged word groups.

    Returns a dict {entity: [sentence, ...]}.
    """
    entity_dict = {}
    entity_tagged_words_dict = entity_tagged_words_dict_from_tagged_tokens(
        tagged_tokens)
    for entity, tagged_words in entity_tagged_words_dict.items():
        # Comprehension replaces the append loop; the original used the
        # ambiguous name 'l' (flagged by E741).
        entity_dict[entity] = [
            utils.sentence_from_tagged_ngram(words) for words in tagged_words]
    logging.info(entity_dict)
    return entity_dict
def score(self, key, tagged_gram, test_file_path, wv_dict, **kwargs):
    """Score *key* using a context-restricted word-vector dict derived
    from the example ngram vectors recorded for *tagged_gram*.

    tagged_gram example: [['esafetyworld', 'comp'], ['inc', 'end']]
    """
    gram_key = tagged_words_to_str(tagged_gram)
    example_ngram_vecs = self.example_tagged_words_contexts_dict[gram_key]
    # Narrow wv_dict to contexts similar enough to the example vectors.
    context_wv_dict, context_similarity_dict = make_context_dict(
        example_ngram_vecs,
        self.context_sized_test_wv_dict,
        wv_dict,
        self.topn * 2,
        self.context_threshold)
    gram = util.sentence_from_tagged_ngram(tagged_gram)
    return OneShotTestDoc2Vec.score(self, key, gram, test_file_path,
                                    context_wv_dict)
def find_ngrams_by_tagged_words(tagged_ngrams, tagged_words, window_size=None):
    """Return the sentences of every ngram in *tagged_ngrams* that contains
    *tagged_words* as a sublist (optionally ignoring *window_size* tokens at
    each end of the ngram).
    """
    # Loop-invariant: build the target token list once, as a real list —
    # the original rebuilt a one-shot filter iterator on every iteration,
    # which a multi-pass sublist check would see as already exhausted.
    wanted = [w for w in utils.flatten_list(tagged_words) if w]
    # Normalize once instead of per iteration.
    if window_size:
        window_size = int(window_size)
    ngrams = []
    for tagged_ngram in tagged_ngrams:
        if window_size:
            candidate = tagged_ngram[window_size:-window_size]
        else:
            candidate = tagged_ngram
        flat = [t for t in utils.flatten_list(candidate) if t]
        if utils.is_sublist_of(wanted, flat):
            ngrams.append(utils.sentence_from_tagged_ngram(tagged_ngram))
    return ngrams
def str_1_to_n_grams_from_file(file_path, n=5, tagged=False):
    """Return a list of n+1-minus-one sublists: for each size 1..n, the
    stringified sequenced ngrams of *file_path* at that size."""
    grams = []
    for size in range(1, n + 1):
        sequenced = sequenced_ngrams_from_file(file_path, size, tagged=tagged)
        if tagged:
            # Strip tags down to plain word tuples before stringifying.
            sequenced[:] = [tuple(utils.sentence_from_tagged_ngram(item))
                            for item in sequenced]
        grams.append([utils.iter_to_string(item) for item in sequenced])
    return grams
def m_to_n_grams_from_file(file_path, m=1, n=5, tagged=False):
    """Return a single flat list of sequenced ngrams of *file_path* for
    every size m..n (inclusive).

    Raises:
        ValueError: if not n >= m > 0.
    """
    # assert is stripped under -O; validate input explicitly.
    if not n >= m > 0:
        raise ValueError("expected n >= m > 0, got m=%r, n=%r" % (m, n))
    grams = []
    for size in range(m, n + 1):
        sequenced_ngrams = sequenced_ngrams_from_file(
            file_path, size, tagged=tagged)
        if tagged:
            # Rebind instead of the original in-place [:] assignment so we
            # never mutate a list the helper might share or cache.
            sequenced_ngrams = [tuple(utils.sentence_from_tagged_ngram(u))
                                for u in sequenced_ngrams]
        grams.extend(sequenced_ngrams)
    return grams
def score(self, key, tagged_gram, test_file_path, wv_dict, **kwargs):
    """Score *key*, restricting *wv_dict* to contexts that match the stored
    example entities/ngrams for this test file.

    tagged_gram example: [['esafetyworld', 'comp'], ['inc', 'end']]
    """
    # Contexts in the test file that resemble the known examples,
    # keyed by context with a similarity value.
    context_similarity_dict = self.find_example_contexts(
        self.example_entity_dict,
        self.example_ngrams,
        test_file_path,
        self.context_size)
    matched_contexts = context_similarity_dict.keys()
    context_wv_dict = util.subset_dict_by_list2(wv_dict, matched_contexts)
    gram = util.sentence_from_tagged_ngram(tagged_gram)
    return super().score(key, gram, test_file_path, context_wv_dict,
                         context_sim_dict=context_similarity_dict)