def _load_aaer_test_data(self, doc_length, one_to_n=False):
    # only test files are loaded here, to save computing & memory costs
    self.save_dir = const.GENERATED_DATA_DIR
    if one_to_n:
        self.dict_save_fname = os.path.join(
            self.save_dir, "%s%s_1_to_%d.%s" % (const.DL_DOC_DICT_PREFIX,
                                                self.__class__.__name__,
                                                doc_length,
                                                const.PICKLE_FILE_EXTENSION))
    else:
        self.dict_save_fname = os.path.join(
            self.save_dir, "%s%s_%d.%s" % (const.DL_DOC_DICT_PREFIX,
                                           self.__class__.__name__,
                                           doc_length,
                                           const.PICKLE_FILE_EXTENSION))
    try:
        # reuse the cached doc-vector dict if it was built in a previous run
        logging.info("loading saved data from %s" % self.dict_save_fname)
        with open(self.dict_save_fname, 'rb') as f:
            self._docvec_dict = pickle.load(f)
    except FileNotFoundError:
        logging.info("%s not found. Start building..." % self.dict_save_fname)
        test_files = ft.list_file_paths_under_dir(const.TEST_DIR, ['txt'])
        docs = []
        for test_file in test_files:
            if one_to_n:
                # gram lengths 1..doc_length, flattened into a single doc list
                docs += utils.flatten_list(
                    ex_parsing.one_to_n_grams_from_file(
                        ft.get_source_file_by_example_file(test_file),
                        n=doc_length))
            else:
                docs += ex_parsing.ngrams_from_file(
                    ft.get_source_file_by_example_file(test_file),
                    n=doc_length)
        # print(docs[0])
        self._make_docvec_dict(docs)
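# Hedged sketch (not part of the project): the load-or-build pickle caching
# pattern used in _load_aaer_test_data above, reduced to a standalone helper.
# The names load_or_build, cache_path and build_fn are hypothetical and
# introduced only for illustration.
import logging
import os
import pickle


def load_or_build(cache_path, build_fn):
    """Return the object pickled at cache_path, building and saving it on a miss."""
    try:
        logging.info("loading saved data from %s", cache_path)
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        logging.info("%s not found. Start building...", cache_path)
        obj = build_fn()
        os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj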
def tokens_from_aaer_corpus(self):
    ngrams = []
    for path in self.path_list_from_dir():
        ng = ex_parsing.ngrams_from_file(path, self.n)
        # keep every n_skip-th n-gram and flatten its tokens into one list
        for i in range(0, len(ng), self.n_skip):
            ngrams += ng[i]
    return ngrams
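# Hedged illustration of the n_skip stride in tokens_from_aaer_corpus: toy data
# only, assuming ngrams_from_file returns consecutive overlapping n-grams.
ng = [['a', 'b'], ['b', 'c'], ['c', 'd'], ['d', 'e']]
n_skip = 2
tokens = []
for i in range(0, len(ng), n_skip):
    tokens += ng[i]  # keep every n_skip-th n-gram, flattened into tokens
print(tokens)        # ['a', 'b', 'c', 'd']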
def __init__(self, example_path, test_file_path_list, enable_saving=False,
             n_gram=5, **kwargs):
    super().__init__(example_path, test_file_path_list, enable_saving, n_gram,
                     **kwargs)
    self.tagged_tokens = ex_parsing.tagged_tokens_from_file(self.example_path)
    self.example_entity_dict = \
        ex_parsing.entity_tagged_words_dict_from_tagged_tokens(self.tagged_tokens)
    self.example_ngrams = ex_parsing.ngrams_from_file(self.example_path,
                                                      self.context_size,
                                                      tagged=True)
def test_file_processing(self, test_file_path):
    OneShotTestDoc2Vec.test_file_processing(self, test_file_path)
    ngrams = ex_parsing.ngrams_from_file(test_file_path, self.context_size,
                                         tagged=True)
    sentences = [util.sentence_from_tagged_ngram(t) for t in ngrams]
    # logging.info(ngrams)
    # logging.info(sentences)
    self.context_sized_test_wv_dict = self.context_vector_to_dict_by_list(
        self.context_vec_model, sentences)
def test_file_processing(self, test_file_path):
    super().test_file_processing(test_file_path)
    ngrams = ex_parsing.ngrams_from_file(test_file_path, self.context_size,
                                         tagged=True)
    sentences = [util.sentence_from_tagged_ngram(t) for t in ngrams]
    # logging.info(ngrams)
    logging.info("sentences: %d" % len(sentences))
    self.context_sized_test_wv_dict = self.doc_vector_to_dict_by_list(
        self.context_vec_model, sentences)
def test_file_processing(self, test_file_path):
    logging.info('testing file:' + test_file_path)
    self.init_score_dict(test_file_path)
    human_file_path = os.path.join(const.HUMAN_DIR,
                                   test_file_path.split('/')[-1])
    human_tagged_tokens = ex_parsing.tagged_tokens_from_file(human_file_path)
    self.human_tagged_ngrams = ex_parsing.ngrams_from_file(human_file_path,
                                                           self.context_size,
                                                           tagged=True)
    self.human_tagged_entity_dict = \
        ex_parsing.entity_tagged_words_dict_from_tagged_tokens(human_tagged_tokens)
def file_ngrams_similarities_by_docs(self, file_path, docs):
    ngram_dict = {}
    doc_similarity_dict = {}
    origin_sources = []
    origin_targets = []
    replaced_sources = []
    replaced_targets = []
    file_path = ft.get_source_file_by_example_file(file_path)
    for doc in docs:
        source_gram_n = len(doc)
        target_gram_n = t2t_make_data_files.get_target_gram_n(
            source_gram_n, self.window_size)
        try:
            target_ngrams = ngram_dict[target_gram_n]
        except KeyError:
            ngram_dict[target_gram_n] = target_ngrams = \
                ex_parsing.ngrams_from_file(file_path, target_gram_n)
        source_ngrams = [
            t2t_make_data_files.source_ngram_from_target_ngram(
                target_ngram, self.window_size)
            for target_ngram in target_ngrams
        ]
        assert len(source_ngrams) == len(target_ngrams)
        origin_sources += source_ngrams
        origin_targets += target_ngrams
        for target in target_ngrams:
            replaced_target = t2t_make_data_files.replace_by_window_size(
                target, doc, self.window_size)
            replaced_targets.append(replaced_target)
            replaced_sources.append(doc)
    print("len(replaced_sources):%d" % len(replaced_sources))
    assert len(replaced_sources) == len(origin_sources) == \
        len(replaced_targets) == len(origin_targets)
    # feed data into t2t model
    str_sources = [
        " ".join(tokens) for tokens in origin_sources + replaced_sources
    ]
    str_targets = [
        " ".join(tokens) for tokens in origin_targets + replaced_targets
    ]
    loss_model = text_encoding.TextSimilarity(str_sources, str_targets)
    losses = loss_model.encode()
    assert len(losses) == 2 * len(origin_sources)
    origin_losses = np.array(losses[:len(origin_sources)])
    replaced_losses = np.array(losses[len(origin_sources):])
    print(origin_losses)
    print(replaced_losses)
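# Hedged illustration of the loss bookkeeping in file_ngrams_similarities_by_docs:
# the first half of `losses` corresponds to the original source/target pairs and
# the second half to the replaced pairs. Dummy numbers only;
# text_encoding.TextSimilarity is not called here.
import numpy as np

origin_sources = [['a'], ['b'], ['c']]
losses = [0.1, 0.2, 0.3, 0.9, 0.8, 0.7]                   # len == 2 * len(origin_sources)
origin_losses = np.array(losses[:len(origin_sources)])    # array([0.1, 0.2, 0.3])
replaced_losses = np.array(losses[len(origin_sources):])  # array([0.9, 0.8, 0.7])
print(replaced_losses - origin_losses)                    # per-pair change after replacement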
def __init__(self, example_path, test_file_path_list, enable_saving=False,
             n_gram=5, **kwargs):
    super().__init__(example_path, test_file_path_list, enable_saving, n_gram,
                     **kwargs)
    self.context_vec_model = None
    self.tagged_tokens = ex_parsing.tagged_tokens_from_file(self.example_path)
    self.example_entity_dict = \
        ex_parsing.entity_tagged_words_dict_from_tagged_tokens(self.tagged_tokens)
    self.example_ngrams = ex_parsing.ngrams_from_file(self.example_path,
                                                      self.context_size,
                                                      tagged=True)
    self.example_tagged_words_contexts_dict = {}
    self.context_sized_test_wv_dict = None
    self.wmd_save_dir = os.path.join(const.GENERATED_DATA_DIR, "wmdsim")
    os.makedirs(self.wmd_save_dir, exist_ok=True)
from t2t_models import text_encoding
import os
import tensorflow as tf
import text_cleaning.example_parsing as ex_parsing
import common.constants as const
import common.file_tools as ft
import common.tf_utilities as tf_utils
import common.utilities as utils
import model_testing.dl_context_models as dl_context

N_GRAMS = 10

test_file_source = ft.get_source_file_by_example_file(const.TEST_FILE)
tokens = ex_parsing.ngrams_from_file(test_file_source, N_GRAMS, tagged=False)
# eval_tokens = []
# for t in tokens:
#     s = t[:-1] + 'profits'.split(' ')
#     eval_tokens.append(s)
#
# TRAIN_DIR=$DATA_DIR/train/$PROBLEM/$MODEL-$HPARAMS
# print(eval_tokens)
# t = text_encoding.TextEncoding(tokens, eval_tokens)
# t.encode()
# tokens = tokens[:10]

m_t2t = dl_context.T2TContextModel(load_aaer_test_data=False, docs=tokens)
dv = m_t2t.infer_vectors_dict(tokens)
test_vec = m_t2t.infer_vector('profits')
# print(test_vec)
# print(m_t2t._docvec_dict)
# print(dv)
# print(utils.similar_by_vector(test_vec, dv, topn=3))
print(tf_utils.similar_by_ndarray(test_vec, dv, topn=3))
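# Hedged sketch of what a top-n lookup such as tf_utils.similar_by_ndarray might
# do: rank the keys of a {doc: vector} dict by cosine similarity to a query
# vector. This is an assumption for illustration, not the project's
# implementation; top_n_by_cosine is a hypothetical name.
import numpy as np


def top_n_by_cosine(query_vec, vec_dict, topn=3):
    query = np.asarray(query_vec, dtype=float)
    scores = {}
    for key, vec in vec_dict.items():
        v = np.asarray(vec, dtype=float)
        denom = np.linalg.norm(query) * np.linalg.norm(v)
        scores[key] = float(np.dot(query, v) / denom) if denom else 0.0
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:topn]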
def make_eval_files(source_file_list, tagged=False):
    n_grams = []
    for path in source_file_list:
        n_grams += ex_parsing.ngrams_from_file(path, N_GRAMS, tagged=tagged)
def tokens_from_aaer_corpus(self):
    ngrams = []
    for path in self.path_list_from_dir():
        ngrams += ex_parsing.ngrams_from_file(path, self.n)
    return ngrams
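# Hedged sketch, for readers without the text_cleaning package: a sliding-window
# n-gram helper with the behaviour the calls above appear to rely on (a list of
# overlapping n-token lists). The real ex_parsing.ngrams_from_file reads and
# tokenises a file and may differ in detail.
def ngrams_from_tokens(tokens, n):
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]


# ngrams_from_tokens(['net', 'income', 'rose', 'sharply'], 2)
# -> [['net', 'income'], ['income', 'rose'], ['rose', 'sharply']]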