Example #1
def find_ngrams_by_tagged_words(tagged_ngrams, tagged_words, window_size=None):
    ngrams = []
    for tagged_ngram in tagged_ngrams:
        if window_size:
            window_size = int(window_size)
            # only match against the core of the n-gram, dropping
            # window_size tokens from each end
            t = list(filter(
                None,
                utils.flatten_list(tagged_ngram[window_size:-window_size])))
        else:
            t = list(filter(None, utils.flatten_list(tagged_ngram)))
        # list(...) materializes the filters, which are lazy in Python 3
        w = list(filter(None, utils.flatten_list(tagged_words)))
        if utils.is_sublist_of(w, t):
            ngrams.append(utils.sentence_from_tagged_ngram(tagged_ngram))
    return ngrams
Example #2
 def _load_aaer_test_data(self, doc_length, one_to_n=False):
     # only the test files are loaded here, to save compute and memory
     self.save_dir = const.GENERATED_DATA_DIR
     if one_to_n:
         fname_format = "%s%s_1_to_%d.%s"
     else:
         fname_format = "%s%s_%d.%s"
     self.dict_save_fname = os.path.join(
         self.save_dir, fname_format %
         (const.DL_DOC_DICT_PREFIX, self.__class__.__name__, doc_length,
          const.PICKLE_FILE_EXTENSION))
     try:
         logging.info("loading saved data from %s" % self.dict_save_fname)
         with open(self.dict_save_fname, 'rb') as f:
             self._docvec_dict = pickle.load(f)
     except FileNotFoundError:
         logging.info("%s not found. Start building..." %
                      self.dict_save_fname)
         test_files = ft.list_file_paths_under_dir(const.TEST_DIR, ['txt'])
         docs = []
         for test_file in test_files:
             if one_to_n:
                 docs += utils.flatten_list(
                     ex_parsing.one_to_n_grams_from_file(
                         ft.get_source_file_by_example_file(test_file),
                         n=doc_length))
             else:
                 docs += ex_parsing.ngrams_from_file(
                     ft.get_source_file_by_example_file(test_file),
                     n=doc_length)
         # print(docs[0])
         self._make_docvec_dict(docs)
Example #3
def score_by_rouge(words_found, test_entity_dict, entity_key):
    logging.info('score_by_rouge: words_found: ' + str(words_found))
    score = 0
    targets = 1
    if entity_key in test_entity_dict:
        answers = [util.flatten_list(test_entity_dict[entity_key])]
        words_found = [] if words_found is None else util.flatten_list(
            words_found)
        # targets += len(test_entity_dict[entity_key])
        print('answers:')
        print(answers)
        # print(words_found)
        score += rouge.rouge_1(words_found, answers, alpha=0.5)
        # print(score)
    elif not words_found:
        # neither the reference dict nor the prediction has any words for
        # this key, so count it as a match (rouge-2 returns 0 for single words)
        score += 1
    return score, targets
Example #4
 def score(self, key, gram, test_file_path, wv_dict, **kwargs):
     print('similar to: ' + str(gram))
     # words_found = self.similar_grams_by_gram(gram, gram)
     # use one randomly chosen test token as the predicted words
     answers = [random.choice(util.flatten_list(self.test_tokens))]
     hits, targets = score_by_rouge(answers, self.test_entity_dict, key)
     print("rouge:", hits)
     self.score_dict[test_file_path] += hits
     return targets
Example #5
 def make_wv_dict(self, file_path):
     sentences = ex_parsing.sentences_from_file(
         ft.get_source_file_by_example_file(file_path))
     tokens = list(self.phrases_model.get_bigrams(sentences))
     flat_grams = util.flatten_list(tokens)
     # split merged phrase tokens on the gensim delimiter into word tuples
     flat_grams[:] = [
         tuple(w.split(const.GENSIM_PHRASES_DELIMITER)) for w in flat_grams
     ]
     return self.doc_vector_to_dict_by_list(self.doc_vec_model, flat_grams)
Example #6
def dir_to_file_without_punctuations(dir_path,
                                     extension='txt',
                                     file_name=None):
    file_names = ft.list_file_paths_under_dir(dir_path, [extension])
    tokens = []
    for fname in file_names:
        temp_tokens, _ = parse_file(fname)
        tokens.extend(util.flatten_list(temp_tokens))

    if not file_name:
        file_name = '_'.join(dir_path.split('/')[-2:])
    with open(file_name, 'w') as f:
        print('saving to:', file_name)
        f.write(' '.join(tokens))
Example #7
def tagged_words_to_str(tagged_words):
    return const.UNIQUE_DELIMITER.join(util.flatten_list(tagged_words))
Example #8
def make_vec_file_from_wiki_model(sentences, wiki_aaer_vec_name):
    flatten_tokens = util.flatten_list(sentences)
    ft.filter_vec_file_by_set(const.FASTTEXT_WIKI_PATH, set(flatten_tokens),
                              wiki_aaer_vec_name)
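Every example above relies on the project's flatten_list helper (imported as util or utils), whose implementation is not shown on this page. The sketch below is a minimal, assumed version that matches how the examples use it: nested lists and tuples are collapsed recursively into a single flat list. The real project helper may differ in details.

def flatten_list(nested):
    # Recursively collapse nested lists/tuples into one flat list
    # (assumed behavior, inferred from the usages above).
    flat = []
    for item in nested:
        if isinstance(item, (list, tuple)):
            flat.extend(flatten_list(item))
        else:
            flat.append(item)
    return flat

# e.g. flatten_list([('the', 'DT'), ('fox', 'NN')]) -> ['the', 'DT', 'fox', 'NN']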