def test_when_unk_symbol_is_absent(self):
    data_path = 'mldp/tests/data/small_chunks/'
    data_source = {"data_path": data_path}

    vocab = Vocabulary(self.reader)
    vocab.create(data_source, "first_name",
                 add_default_special_symbols=False)

    with self.assertRaises(Exception):
        vocab["dummy_token"]

def test_when_unk_symbol_is_present(self):
    data_path = 'mldp/tests/data/small_chunks/'
    data_source = {"data_path": data_path}

    vocab = Vocabulary(self.reader)
    vocab.create(data_source, "first_name",
                 add_default_special_symbols=True)

    unk_token = vocab["dummy_token"].token
    self.assertTrue(unk_token == UNK)

def test_creation(self):
    data_path = 'mldp/tests/data/small_chunks/'
    data_source = {"data_path": data_path}

    vocab = Vocabulary(self.reader)
    vocab.create(data_source, "first_name")

    data = read_data_from_csv_file(get_file_paths(data_path))
    unique_first_names = np.unique(data['first_name'])

    for ufn in unique_first_names:
        self.assertTrue(ufn in vocab)

def test_loading(self):
    tmp_f_path = "dummy_vocab.txt"
    sep = '\t'
    entries = [("first", "1"), ("two", "2"), ("three", "3"), ("four", "4"),
               ("five", "5"), ("seven", "7")]

    with open(tmp_f_path, 'w') as f:
        for entry in entries:
            f.write(sep.join(entry) + "\n")

    vocab = Vocabulary()
    vocab.load(tmp_f_path, sep="\t")

    for token, count in entries:
        self.assertTrue(token in vocab)
        self.assertTrue(vocab[token].count == int(count))

    os.remove(tmp_f_path)

def create_vocabulary(vocab_fp, data_path, sep='\t'):
    """Creates a word vocabulary using a simple pipeline."""
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             sep=sep)
    words_vocab = Vocabulary(vocab_pipeline, name_prefix="words")

    # add special symbols before creating the vocab so they appear at the top
    for st in VOCAB_DEFAULT_SYMBOLS:
        if st not in words_vocab:
            words_vocab.add_special_symbol(st)

    words_vocab.create(data_source={'data_path': data_path},
                       data_fnames=InpDataF.REV_TEXT)
    words_vocab.write(vocab_fp, sep=' ')

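# A minimal usage sketch (not part of the original module): the paths below
# are hypothetical, and it assumes the written file can be read back with
# Vocabulary.load(), mirroring test_loading() above.
def _example_create_and_reload_vocabulary():
    create_vocabulary(vocab_fp='runs/words_vocab.txt',
                      data_path='data/reviews/train/')
    reloaded = Vocabulary()
    reloaded.load('runs/words_vocab.txt', sep=' ')
    return reloaded
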
def create_bpe_vocabulary(bpe_vocab_fp, bpe_int_fp, data_path, truecaser_fp):
    """Creates a vocabulary that maps BPEs to ids and vice versa.

    Iterates over the data, tokenizes it with the BPE tokenizer and the
    truecaser, and saves the resulting vocabulary of unique symbols.

    Args:
        bpe_vocab_fp: path to the new vocabulary file that will be created.
        bpe_int_fp: path to the file with the learned BPE merge codes.
        data_path: path to the data with text.
        truecaser_fp: path to a trained Moses truecaser model.
    """
    bpe = BPE(glossaries=SPECIAL_TOKENS)
    bpe.load(bpcodes_fp=bpe_int_fp, merges=-1)
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())

    #   PIPELINES AND VOCAB   #
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False,
                                             tok_func=unsup_tok_func)
    subword_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                               special_tokens=SPECIAL_TOKENS)
    subword_vocab.create(data_source={"data_path": data_path}, max_size=None,
                         data_fnames=InpDataF.REV_TEXT)
    subword_vocab.write(bpe_vocab_fp, sep=' ')

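# Hypothetical invocation of the function above (all paths are placeholders,
# not taken from the source): bpe_int_fp points at the learned BPE codes and
# truecaser_fp at a trained Moses truecaser model.
def _example_create_bpe_vocabulary():
    create_bpe_vocabulary(bpe_vocab_fp='runs/bpe_vocab.txt',
                          bpe_int_fp='artifacts/bpe_codes.txt',
                          data_path='data/reviews/train/',
                          truecaser_fp='artifacts/truecaser.model')
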
def test_vocabulary_mapper(self):
    """Tests whether the mapper can map field values to ids and back."""
    data_path = 'mldp/tests/data/mock_data.csv'
    target_fields = ["first_name", "last_name", "email", "gender"]

    reader = CsvReader(sep=',')
    vocab = Vocabulary(reader)

    for target_field in target_fields:
        vocab.create(data_source={"data_path": data_path},
                     data_fnames=target_field)

        data = read_data_from_csv_file(data_path)
        data_original = copy.deepcopy(data)

        mapper_to = VocabMapper({target_field: vocab}, "id")
        mapper_back = VocabMapper({target_field: vocab}, "token")

        data = mapper_to(data)
        data = mapper_back(data)

        self.assertTrue(
            (data[target_field] == data_original[target_field]).all())

def create_word_vocab(vocab_fp, data_path, truecaser_fp):
    """Creates a vocabulary using a vocabulary-specific pipeline."""
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    tok_func = lambda x: tcase_func(x).split()

    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False,
                                             tok_func=tok_func)
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                            special_tokens=SPECIAL_TOKENS)
    word_vocab.create(data_source={'data_path': data_path},
                      data_fnames=InpDataF.REV_TEXT)
    word_vocab.write(vocab_fp, sep=' ')

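# A follow-up sketch (assumed, not from the source): after create_word_vocab()
# has written the file, the vocabulary can presumably be reloaded and queried
# per symbol, as the tests above do with `token in vocab`, `vocab[token].count`
# and `vocab[...].id`. The token "movie" is an arbitrary example.
def _example_reload_word_vocab(vocab_fp):
    word_vocab = Vocabulary()
    word_vocab.load(vocab_fp, sep=' ')
    if "movie" in word_vocab:
        symbol = word_vocab["movie"]
        print(symbol.id, symbol.token, symbol.count)
    return word_vocab
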
#   KL ANNEALING MECHANISMS   #
c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches, m=run_hp.c_m,
                          r=run_hp.c_r, max_val=run_hp.c_kl_ann_max_val)
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches, m=run_hp.z_m,
                          r=run_hp.c_r, max_val=run_hp.z_kl_ann_max_val)

#   PIPELINES AND VOCAB   #
vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT)
word_vocab = Vocabulary(vocab_pipeline, name_prefix="word")

# add special symbols before creating the vocab so they appear at the top
for st in VOCAB_DEFAULT_SYMBOLS:
    if st not in word_vocab:
        word_vocab.add_special_symbol(st)

word_vocab.load_or_create(run_hp.words_vocab_fp,
                          data_source=vocab_data_source,
                          max_size=model_hp.ext_vocab_size, sep=' ',
                          data_fnames=InpDataF.REV_TEXT)
word_vocab.write(comb_paths(run_hp.output_path, "word_vocab.txt"), sep=' ')

train_pipeline = assemble_train_pipeline(

bpe = BPE(glossaries=SPECIAL_TOKENS)
bpe.load(bpcodes_fp=run_conf.bpe_fp)

unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())
gold_tok_func = lambda x: bpe.tokenize(mt.tokenize(tcase_func(x),
                                                   escape=False))
detok_func = lambda x: dt.detokenize(bpe.detokenize(x), unescape=False)

#   DATA PIPELINES AND VOCAB   #
vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                         lowercase=False,
                                         tok_func=unsup_tok_func)
subword_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                           special_tokens=SPECIAL_TOKENS)
subword_vocab.load(run_conf.bpe_vocab_fp, max_size=None, sep=' ')
subword_vocab.write(comb_paths(run_conf.output_path,
                               "bpe_%d_vocab.txt" % run_conf.subword_num),
                    sep=' ')

worker_num = 3 * device_count
reader_threads = 3 * device_count

uns_train_pipeline = assemble_unsup_pipeline(
    word_vocab=subword_vocab, reader_threads=reader_threads,
    worker_num=worker_num, max_len=run_conf.max_seq_len,
    max_groups_per_batch=run_conf.train_groups_per_batch,
    min_revs_per_group=run_conf.min_rev_per_group,
    max_revs_per_group=run_conf.max_rev_per_group,

def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(
        input_cols_number=3, tweets_indx=2,
        add_header=['ids', 'labels', 'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(
        fnames="tweets",
        tok_func=twitter_tokenizer.tokenize,
        tok_cleaning_func=twitter_text_cleaner,
        lowercase=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)
    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={"tweets": words_vocab,
                                                "labels": labels_vocab})
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader,
                                 preprocessor=preprocessor,
                                 worker_processes_num=1,
                                 name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)

def test_vocabulary_mapper_mixed_field_values(self):
    """Testing whether the mapper can map multi-dim mixed field values."""
    target_field_name = "dummy"
    symbols_attr = "id"

    data_chunk = DataChunk(**{
        target_field_name: np.array(
            [[["one"], np.array(["two", "one"])],
             [["three"], np.array(["four", "five", "six"])]],
            dtype="object")
    })
    expected_output_chunk = DataChunk(**{
        target_field_name: np.array(
            [[[1], np.array([2, 1])],
             [[3], np.array([4, 5, 6])]],
            dtype="object")
    })

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(data_chunk)

    self.assertTrue(actual_output_chunk == expected_output_chunk)

def test_vocabulary_mapper_multidim_lists(self):
    """Testing whether the mapper can map multi-dim lists."""
    target_field_name = "dummy"
    symbols_attr = "id"

    data_chunk = DataChunk(**{
        target_field_name: np.array(
            [[["one"], ["two"]],
             [["three"], ["four", "five", "six"]]],
            dtype="object")
    })

    exp_val = np.empty(2, dtype="object")
    exp_val[0] = np.array([[1], [2]])
    exp_val[1] = np.array([[3], [4, 5, 6]])
    expected_output_chunk = DataChunk(**{target_field_name: exp_val})

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(copy.deepcopy(data_chunk))

    self.assertTrue(actual_output_chunk == expected_output_chunk)

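# An illustrative reverse mapping (an assumption, not one of the original
# tests): based on test_vocabulary_mapper() above, a VocabMapper built with
# symbols_attr="token" should map id values back to their tokens; ids are
# assumed to follow insertion order ("zero" -> 0, "one" -> 1, ...), matching
# the expected chunks in the tests above.
def _example_map_ids_back_to_tokens():
    field = "dummy"
    vocab = Vocabulary()
    for tok in ["zero", "one", "two", "three"]:
        vocab.add_symbol(tok)

    id_chunk = DataChunk(**{field: np.array([[1, 2], [3, 0]], dtype="object")})
    mapper_back = VocabMapper({field: vocab}, symbols_attr="token")
    # expected result (under these assumptions): [["one", "two"], ["three", "zero"]]
    return mapper_back(id_chunk)
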