Example 1
 def _load_aaer_test_data(self, doc_length, one_to_n=False):
     # data only contains test files, to save computing & memory costs
     self.save_dir = const.GENERATED_DATA_DIR
     if one_to_n:
         self.dict_save_fname = os.path.join(
             self.save_dir, "%s%s_1_to_%d.%s" %
             (const.DL_DOC_DICT_PREFIX, self.__class__.__name__, doc_length,
              const.PICKLE_FILE_EXTENSION))
     else:
         self.dict_save_fname = os.path.join(
             self.save_dir, "%s%s_%d.%s" %
             (const.DL_DOC_DICT_PREFIX, self.__class__.__name__, doc_length,
              const.PICKLE_FILE_EXTENSION))
     try:
         logging.info("loading saved data from %s" % self.dict_save_fname)
         with open(self.dict_save_fname, 'rb') as f:
             self._docvec_dict = pickle.load(f)
     except FileNotFoundError:
         logging.info("%s not found. Start building..." %
                      self.dict_save_fname)
         test_files = ft.list_file_paths_under_dir(const.TEST_DIR, ['txt'])
         docs = []
         for test_file in test_files:
             if one_to_n:
                 docs += utils.flatten_list(
                     ex_parsing.one_to_n_grams_from_file(
                         ft.get_source_file_by_example_file(test_file),
                         n=doc_length))
             else:
                 docs += ex_parsing.ngrams_from_file(
                     ft.get_source_file_by_example_file(test_file),
                     n=doc_length)
         # print(docs[0])
         self._make_docvec_dict(docs)
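The method above caches the n-gram/doc-vector dictionary as a pickle whose file name encodes the class name and n-gram length, and rebuilds it from the test corpus only on a cache miss. A minimal sketch of the same load-or-build idiom, using hypothetical names (load_or_build, build_fn) in place of the class internals:

import logging
import pickle

def load_or_build(cache_path, build_fn):
    # return the cached object if the pickle exists, otherwise build and save it
    try:
        logging.info("loading saved data from %s", cache_path)
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        logging.info("%s not found. Start building...", cache_path)
        obj = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj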
Example 2
 def tokens_from_aaer_corpus(self):
     ngrams = []
     for path in self.path_list_from_dir():
         ng = ex_parsing.ngrams_from_file(path, self.n)
         for i in range(0, len(ng), self.n_skip):
             ngrams += ng[i]
     return ngrams
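Note that `ngrams += ng[i]` extends the result with the tokens of every n_skip-th n-gram, so the method returns a flat token list rather than a list of n-grams. A self-contained illustration of that slicing, with a stand-in n-gram builder in place of ex_parsing.ngrams_from_file:

def ngrams(tokens, n):
    # contiguous n-grams as lists of tokens, mirroring what ngrams_from_file
    # appears to return for a tokenized file
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

tokens = "the company overstated its revenues in fiscal 2004".split()
ng = ngrams(tokens, 3)
n_skip = 3
sampled = []
for i in range(0, len(ng), n_skip):
    sampled += ng[i]  # extend with the tokens of the i-th n-gram
print(sampled)  # ['the', 'company', 'overstated', 'its', 'revenues', 'in']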
Example 3
 def __init__(self,
              example_path,
              test_file_path_list,
              enable_saving=False,
              n_gram=5,
              **kwargs):
     super().__init__(example_path, test_file_path_list, enable_saving,
                      n_gram, **kwargs)
     self.tagged_tokens = ex_parsing.tagged_tokens_from_file(
         self.example_path)
     self.example_ngrams = ex_parsing.ngrams_from_file(self.example_path,
                                                       self.context_size,
                                                       tagged=True)
     self.example_entity_dict = \
         ex_parsing.entity_tagged_words_dict_from_tagged_tokens(self.tagged_tokens)
Example 4
 def test_file_processing(self, test_file_path):
     OneShotTestDoc2Vec.test_file_processing(self, test_file_path)
     ngrams = ex_parsing.ngrams_from_file(test_file_path,
                                          self.context_size,
                                          tagged=True)
     sentences = [util.sentence_from_tagged_ngram(t) for t in ngrams]
     # logging.info(ngrams)
     # logging.info(sentences)
     self.context_sized_test_wv_dict = self.context_vector_to_dict_by_list(
         self.context_vec_model, sentences)
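util.sentence_from_tagged_ngram strips the tags so the context vectors can be computed on plain tokens. A hedged sketch of such a helper, assuming a tagged n-gram is a list of (word, tag) pairs; the actual structure produced by ex_parsing may differ:

def sentence_from_tagged_ngram(tagged_ngram):
    # keep only the word of each (word, tag) pair
    return [word for word, _tag in tagged_ngram]

tagged = [("revenues", "B-FIN"), ("were", "O"), ("overstated", "O")]
print(sentence_from_tagged_ngram(tagged))  # ['revenues', 'were', 'overstated']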
Example 5
 def test_file_processing(self, test_file_path):
     super().test_file_processing(test_file_path)
     ngrams = ex_parsing.ngrams_from_file(test_file_path,
                                          self.context_size,
                                          tagged=True)
     sentences = [util.sentence_from_tagged_ngram(t) for t in ngrams]
     # logging.info(ngrams)
     logging.info("sentences: %d" % len(sentences))
     self.context_sized_test_wv_dict = self.doc_vector_to_dict_by_list(
         self.context_vec_model, sentences)
Example 6
 def test_file_processing(self, test_file_path):
     logging.info('testing file:' + test_file_path)
     self.init_score_dict(test_file_path)
     human_file_path = os.path.join(const.HUMAN_DIR,
                                    test_file_path.split('/')[-1])
     human_tagged_tokens = ex_parsing.tagged_tokens_from_file(
         human_file_path)
     self.human_tagged_ngrams = ex_parsing.ngrams_from_file(
         human_file_path, self.context_size, tagged=True)
     self.human_tagged_entity_dict = \
         ex_parsing.entity_tagged_words_dict_from_tagged_tokens(human_tagged_tokens)
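The human-annotated counterpart of the test file is located by basename under const.HUMAN_DIR, and its tagged tokens are folded into an entity dictionary. A rough sketch of what that dictionary could contain, assuming (word, tag) pairs grouped under their non-'O' entity labels; the real helper in ex_parsing may behave differently:

from collections import defaultdict

def entity_dict_from_tagged_tokens(tagged_tokens):
    # group tagged words under their entity label, skipping untagged tokens
    grouped = defaultdict(list)
    for word, tag in tagged_tokens:
        if tag != "O":
            grouped[tag].append(word)
    return dict(grouped)

tagged_tokens = [("Enron", "ORG"), ("overstated", "O"), ("earnings", "FIN")]
print(entity_dict_from_tagged_tokens(tagged_tokens))  # {'ORG': ['Enron'], 'FIN': ['earnings']}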
Example 7
    def file_ngrams_similarities_by_docs(self, file_path, docs):
        ngram_dict = {}
        doc_similarity_dict = {}
        origin_sources = []
        origin_targets = []
        replaced_sources = []
        replaced_targets = []
        file_path = ft.get_source_file_by_example_file(file_path)
        for doc in docs:
            source_gram_n = len(doc)
            target_gram_n = t2t_make_data_files.get_target_gram_n(
                source_gram_n, self.window_size)
            try:
                target_ngrams = ngram_dict[target_gram_n]
            except KeyError:
                ngram_dict[
                    target_gram_n] = target_ngrams = ex_parsing.ngrams_from_file(
                        file_path, target_gram_n)
            source_ngrams = [
                t2t_make_data_files.source_ngram_from_target_ngram(
                    target_ngram, self.window_size)
                for target_ngram in target_ngrams
            ]

            assert len(source_ngrams) == len(target_ngrams)
            origin_sources += source_ngrams
            origin_targets += target_ngrams
            for target in target_ngrams:
                replaced_target = t2t_make_data_files.replace_by_window_size(
                    target, doc, self.window_size)
                replaced_targets.append(replaced_target)
                replaced_sources.append(doc)
        print("len(replaced_sources):%d" % len(replaced_sources))
        assert len(replaced_sources) == len(origin_sources) == len(
            replaced_targets) == len(origin_targets)
        # feed data into t2t model
        str_sources = [
            " ".join(tokens) for tokens in origin_sources + replaced_sources
        ]
        str_targets = [
            " ".join(tokens) for tokens in origin_targets + replaced_targets
        ]
        loss_model = text_encoding.TextSimilarity(str_sources, str_targets)
        losses = loss_model.encode()
        assert len(losses) == 2 * len(origin_sources)
        origin_losses = np.array(losses[:len(origin_sources)])
        replaced_losses = np.array(losses[len(origin_sources):])
        print(origin_losses)
        print(replaced_losses)
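TextSimilarity returns one flat list of losses for the concatenated origin and replaced pairs, so slicing at len(origin_sources) recovers the two halves in matching order. A small sketch of that bookkeeping with dummy numbers; the comparison after the split is an assumption, since the original snippet stops at printing the two arrays:

import numpy as np

origin_sources = [["alpha"], ["beta"], ["gamma"]]
losses = [0.42, 0.57, 0.33,   # losses for the origin source/target pairs
          0.40, 0.61, 0.30]   # losses for the replaced pairs, same order
assert len(losses) == 2 * len(origin_sources)

origin_losses = np.array(losses[:len(origin_sources)])
replaced_losses = np.array(losses[len(origin_sources):])

# one possible reading: a lower replaced loss means the replacement fits the
# context at least as well as the original target
print(replaced_losses - origin_losses)  # approximately [-0.02  0.04 -0.03]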
Example 8
 def __init__(self,
              example_path,
              test_file_path_list,
              enable_saving=False,
              n_gram=5,
              **kwargs):
     super().__init__(example_path, test_file_path_list, enable_saving,
                      n_gram, **kwargs)
     self.context_vec_model = None
     self.tagged_tokens = ex_parsing.tagged_tokens_from_file(
         self.example_path)
     self.example_entity_dict = \
         ex_parsing.entity_tagged_words_dict_from_tagged_tokens(self.tagged_tokens)
     self.example_ngrams = ex_parsing.ngrams_from_file(self.example_path,
                                                       self.context_size,
                                                       tagged=True)
     self.example_tagged_words_contexts_dict = {}
     self.context_sized_test_wv_dict = None
     self.wmd_save_dir = os.path.join(const.GENERATED_DATA_DIR, "wmdsim")
     os.makedirs(self.wmd_save_dir, exist_ok=True)
Example 9
from t2t_models import text_encoding
import os
import tensorflow as tf
import text_cleaning.example_parsing as ex_parsing
import common.constants as const
import common.file_tools as ft
import common.tf_utilities as tf_utils
import common.utilities as utils
import model_testing.dl_context_models as dl_context

N_GRAMS = 10
test_file_source = ft.get_source_file_by_example_file(const.TEST_FILE)
tokens = ex_parsing.ngrams_from_file(test_file_source, N_GRAMS, tagged=False)
# eval_tokens = []
# for t in tokens:
#     s = t[:-1] + 'profits'.split(' ')
#     eval_tokens.append(s)
# # TRAIN_DIR=$DATA_DIR/train/$PROBLEM/$MODEL-$HPARAMS
# print(eval_tokens)
# t = text_encoding.TextEncoding(tokens, eval_tokens)
# t.encode()
# tokens = tokens[:10]
m_t2t = dl_context.T2TContextModel(load_aaer_test_data=False, docs=tokens)
dv = m_t2t.infer_vectors_dict(tokens)
test_vec = m_t2t.infer_vector('profits')
# print(test_vec)
# print(m_t2t._docvec_dict)
# print(dv)

# print(utils.similar_by_vector(test_vec, dv, topn=3))
print(tf_utils.similar_by_ndarray(test_vec, dv, topn=3))
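similar_by_ndarray is used here to rank the inferred n-gram vectors against the query vector for 'profits'. A cosine-similarity top-n lookup along these lines would be one way to implement such a ranking; this is an assumption, as the real code in common.tf_utilities is not shown here:

import numpy as np

def similar_by_vector(query_vec, vec_dict, topn=3):
    # rank dictionary entries by cosine similarity to the query vector
    q = np.asarray(query_vec, dtype=float)
    q = q / (np.linalg.norm(q) + 1e-12)
    scores = []
    for key, vec in vec_dict.items():
        v = np.asarray(vec, dtype=float)
        scores.append((key, float(np.dot(q, v / (np.linalg.norm(v) + 1e-12)))))
    return sorted(scores, key=lambda kv: kv[1], reverse=True)[:topn]

vec_dict = {"profits rose": [0.9, 0.1], "net loss": [-0.2, 0.8], "income grew": [0.8, 0.3]}
print(similar_by_vector([1.0, 0.2], vec_dict, topn=2))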
Example 10
def make_eval_files(source_file_list, tagged=False):
    n_grams = []
    for path in source_file_list:
        n_grams += ex_parsing.ngrams_from_file(path, N_GRAMS, tagged=tagged)
Example 11
 def tokens_from_aaer_corpus(self):
     ngrams = []
     for path in self.path_list_from_dir():
         ngrams += ex_parsing.ngrams_from_file(path, self.n)
     return ngrams
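Unlike Example 2, which flattens every n_skip-th n-gram into a single token stream, this variant concatenates the n-gram lists themselves, so each element of the result is still one n-gram. The difference in the += semantics, shown on a tiny list:

ng = [["a", "b"], ["b", "c"], ["c", "d"]]

kept = []
kept += ng      # whole n-grams: [['a', 'b'], ['b', 'c'], ['c', 'd']]

flat = []
flat += ng[0]   # tokens of one n-gram: ['a', 'b']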