Example 1
    def test_when_unk_symbol_is_absent(self):
        data_path = 'mldp/tests/data/small_chunks/'
        data_source = {"data_path": data_path}
        vocab = Vocabulary(self.reader)
        vocab.create(data_source,
                     "first_name",
                     add_default_special_symbols=False)

        with self.assertRaises(Exception):
            a = vocab["dummy_token"]
Example 2
    def test_when_unk_symbol_is_present(self):
        data_path = 'mldp/tests/data/small_chunks/'
        data_source = {"data_path": data_path}
        vocab = Vocabulary(self.reader)
        vocab.create(data_source,
                     "first_name",
                     add_default_special_symbols=True)

        unk_token = vocab["dummy_token"].token
        self.assertTrue(unk_token == UNK)
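Taken together, the two tests above pin down the lookup contract for unknown tokens. The condensed sketch below only restates that behaviour, reusing the same (hypothetical) reader and data path as the tests:

# Sketch only: reader and data_path are placeholders mirroring the tests above.
vocab = Vocabulary(reader)
vocab.create({"data_path": data_path}, "first_name",
             add_default_special_symbols=True)
assert vocab["dummy_token"].token == UNK  # unknown tokens fall back to UNK
# with add_default_special_symbols=False, the same lookup raises an exception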
Example 3
    def test_creation(self):
        data_path = 'mldp/tests/data/small_chunks/'
        data_source = {"data_path": data_path}
        vocab = Vocabulary(self.reader)
        vocab.create(data_source, "first_name")

        data = read_data_from_csv_file(get_file_paths(data_path))
        unique_first_names = np.unique(data['first_name'])

        for ufn in unique_first_names:
            self.assertTrue(ufn in vocab)
Example 4
    def test_loading(self):
        tmp_f_path = "dummy_vocab.txt"
        sep = '\t'
        entries = [("first", "1"), ("two", "2"), ("three", "3"), ("four", "4"),
                   ("five", "5"), ("seven", "7")]
        with open(tmp_f_path, 'w') as f:
            for entry in entries:
                f.write(sep.join(entry) + "\n")

        vocab = Vocabulary()
        vocab.load(tmp_f_path, sep="\t")

        for token, count in entries:
            self.assertTrue(token in vocab)
            self.assertTrue(vocab[token].count == int(count))
        os.remove(tmp_f_path)
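As this test suggests, a vocabulary file stores one symbol per line, with the token and its count separated by sep; write() in the later examples produces the same layout with sep=' '. A minimal round-trip sketch, where the file name and counts are made up:

# Sketch only: path and entries are placeholders.
with open("example_vocab.txt", "w") as f:
    f.write("hello 12\n")  # one symbol per line: token, separator, count
    f.write("world 7\n")
vocab = Vocabulary()
vocab.load("example_vocab.txt", sep=' ')
assert vocab["world"].count == 7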
Example 5
def create_vocabulary(vocab_fp, data_path, sep='\t'):
    """Creates a word vocabulary using a simple pipeline."""
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             sep=sep)
    words_vocab = Vocabulary(vocab_pipeline, name_prefix="words")
    # adding special symbols before creating the vocab so they appear at the top
    for st in VOCAB_DEFAULT_SYMBOLS:
        if st not in words_vocab:
            words_vocab.add_special_symbol(st)

    words_vocab.create(data_source={'data_path': data_path},
                       data_fnames=InpDataF.REV_TEXT)
    words_vocab.write(vocab_fp, sep=' ')
Example 6
def create_bpe_vocabulary(bpe_vocab_fp, bpe_int_fp, data_path, truecaser_fp):
    """Creates vocabulary that is used to map BPEs to ids and vice-versa.

    It iterates over data, performs tokenization with the BPE tokenizer and
    true caser and creates a vocabulary of unique symbols that is saved.

    Args:
        bpe_vocab_fp: path to the new vocabulary that will be created.
        data_path: path to data with text.
        bpe_int_fp: internal file with BPEs.
        truecaser_fp: self-explanatory.
    """
    bpe = BPE(glossaries=SPECIAL_TOKENS)
    bpe.load(bpcodes_fp=bpe_int_fp, merges=-1)

    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())

    #   PIPELINES AND VOCAB   #

    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False,
                                             tok_func=unsup_tok_func)
    subword_vocab = Vocabulary(vocab_pipeline,
                               name_prefix="word",
                               special_tokens=SPECIAL_TOKENS)
    subword_vocab.create(data_source={"data_path": data_path},
                         max_size=None,
                         data_fnames=InpDataF.REV_TEXT)
    subword_vocab.write(bpe_vocab_fp, sep=' ')
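The mapping of BPEs to ids and back that the docstring mentions happens through the symbol attributes and the VocabMapper used elsewhere in these examples; the short sketch below is illustrative only (the subword string and field name are placeholders):

# Sketch only: "hel@@" and "rev_text" are hypothetical names.
if "hel@@" in subword_vocab:
    print(subword_vocab["hel@@"].id)  # token -> id, as in Example 11
to_ids = VocabMapper({"rev_text": subword_vocab}, "id")        # tokens -> ids
to_tokens = VocabMapper({"rev_text": subword_vocab}, "token")  # ids -> tokens (see Example 7)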
Example 7
    def test_vocabulary_mapper(self):
        """Testing whether the mapper allows to map back and forth field values.
        """
        data_path = 'mldp/tests/data/mock_data.csv'
        target_fields = ["first_name", "last_name", "email", "gender"]

        reader = CsvReader(sep=',')
        vocab = Vocabulary(reader)

        for target_field in target_fields:
            vocab.create(data_source={"data_path": data_path},
                         data_fnames=target_field)

            data = read_data_from_csv_file(data_path)
            data_original = copy.deepcopy(data)

            mapper_to = VocabMapper({target_field: vocab}, "id")
            mapper_back = VocabMapper({target_field: vocab}, "token")

            data = mapper_to(data)
            data = mapper_back(data)

            self.assertTrue(
                (data[target_field] == data_original[target_field]).all())
Example 8
def create_word_vocab(vocab_fp, data_path, truecaser_fp):
    """Creates a vocabulary using a vocabulary specific pipeline."""
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    tok_func = lambda x: tcase_func(x).split()
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False, tok_func=tok_func)
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                            special_tokens=SPECIAL_TOKENS)

    word_vocab.create(data_source={'data_path': data_path},
                      data_fnames=InpDataF.REV_TEXT)
    word_vocab.write(vocab_fp, sep=' ')
Example 9
#   KL ANNEALING MECHANISMS  #

c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches,
                          m=run_hp.c_m,
                          r=run_hp.c_r,
                          max_val=run_hp.c_kl_ann_max_val)
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches,
                          m=run_hp.z_m,
                          r=run_hp.c_r,
                          max_val=run_hp.z_kl_ann_max_val)

#   PIPELINES AND VOCAB   #

vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT)
word_vocab = Vocabulary(vocab_pipeline, name_prefix="word")

# adding special symbols before creating the vocab so they appear at the top
for st in VOCAB_DEFAULT_SYMBOLS:
    if st not in word_vocab:
        word_vocab.add_special_symbol(st)

word_vocab.load_or_create(run_hp.words_vocab_fp,
                          data_source=vocab_data_source,
                          max_size=model_hp.ext_vocab_size,
                          sep=' ',
                          data_fnames=InpDataF.REV_TEXT)

word_vocab.write(comb_paths(run_hp.output_path, "word_vocab.txt"), sep=' ')

train_pipeline = assemble_train_pipeline(
Example 10
bpe = BPE(glossaries=SPECIAL_TOKENS)
bpe.load(bpcodes_fp=run_conf.bpe_fp)

unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())
gold_tok_func = lambda x: bpe.tokenize(
    mt.tokenize(tcase_func(x), escape=False))
detok_func = lambda x: dt.detokenize(bpe.detokenize(x), unescape=False)

#   DATA PIPELINES AND VOCAB   #

vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                         lowercase=False,
                                         tok_func=unsup_tok_func)
subword_vocab = Vocabulary(vocab_pipeline,
                           name_prefix="word",
                           special_tokens=SPECIAL_TOKENS)
subword_vocab.load(run_conf.bpe_vocab_fp, max_size=None, sep=' ')
subword_vocab.write(comb_paths(run_conf.output_path,
                               "bpe_%d_vocab.txt" % run_conf.subword_num),
                    sep=' ')
worker_num = 3 * device_count
reader_threads = 3 * device_count
uns_train_pipeline = assemble_unsup_pipeline(
    word_vocab=subword_vocab,
    reader_threads=reader_threads,
    worker_num=worker_num,
    max_len=run_conf.max_seq_len,
    max_groups_per_batch=run_conf.train_groups_per_batch,
    min_revs_per_group=run_conf.min_rev_per_group,
    max_revs_per_group=run_conf.max_rev_per_group,
Example 11
    def test_how_to_apply_run(self):

        data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

        # paths where vocabs will be saved and later loaded from
        words_vocab_file_path = os.path.join(self.tutorials_path,
                                             "data/vocabs/words.txt")
        labels_vocab_file_path = os.path.join(self.tutorials_path,
                                              'data/vocabs/labels.txt')

        # creating step objects
        twitter_tokenizer = TweetTokenizer()
        preprocessor = TwitterFilesPreprocessor(
            input_cols_number=3,
            tweets_indx=2,
            add_header=['ids', 'labels', 'tweets'])
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(
            fnames="tweets",
            tok_func=twitter_tokenizer.tokenize,
            tok_cleaning_func=twitter_text_cleaner,
            lowercase=True)

        # data pipeline for vocabularies creation
        vocab_data_pipeline = Pipeline(reader=csv_reader,
                                       preprocessor=preprocessor,
                                       worker_processes_num=0,
                                       name_prefix="vocabs")
        vocab_data_pipeline.add_step(fields_selector)
        vocab_data_pipeline.add_step(token_processor)

        # creating or loading vocabs
        words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
        words_vocab.load_or_create(words_vocab_file_path,
                                   data_source={"data_path": data_path},
                                   data_fnames="tweets")

        labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
        labels_vocab.load_or_create(labels_vocab_file_path,
                                    data_source={"data_path": data_path},
                                    data_fnames="labels")

        print(words_vocab)

        print(labels_vocab)

        print(vocab_data_pipeline)

        # extra steps for training and evaluation
        mapper = VocabMapper(field_names_to_vocabs={
            "tweets": words_vocab,
            "labels": labels_vocab
        })
        padder = Padder(fname="tweets",
                        new_mask_fname="tweets_mask",
                        pad_symbol=words_vocab[PAD].id)
        formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                            labels_field_name="labels",
                                            classes_number=len(labels_vocab))

        # building the actual pipeline
        dev_data_pipeline = Pipeline(reader=csv_reader,
                                     preprocessor=preprocessor,
                                     worker_processes_num=1,
                                     name_prefix="dev")
        dev_data_pipeline.add_step(fields_selector)
        dev_data_pipeline.add_step(token_processor)
        dev_data_pipeline.add_step(mapper)
        dev_data_pipeline.add_step(padder)
        dev_data_pipeline.add_step(formatter)

        print(dev_data_pipeline)

        epochs = 2

        i_model = ISentiLSTM(dev_data_pipeline)
        i_model.init_model(words_vocab_size=len(words_vocab),
                           input_dim=50,
                           lstm_hidden_dim=120,
                           number_of_classes=len(labels_vocab),
                           mask_symbol=words_vocab[PAD].id)
Example 12
    def test_vocabulary_mapper_mixed_field_values(self):
        """Testing whether the mapper can map multi-dim mixed field values."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(
            **{
                target_field_name:
                np.array(
                    [[["one"], np.array(["two", "one"])],
                     [["three"], np.array(["four", "five", "six"])]],
                    dtype="object")
            })
        expected_output_chunk = DataChunk(
            **{
                target_field_name:
                np.array(
                    [[[1], np.array([2, 1])], [[3], np.array([4, 5, 6])]],
                    dtype="object")
            })

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(data_chunk)

        self.assertTrue(actual_output_chunk == expected_output_chunk)
Example 13
    def test_vocabulary_mapper_multidim_lists(self):
        """Testing whether the mapper can map multi-dim lists."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(
            **{
                target_field_name:
                np.array(
                    [[["one"], ["two"]], [["three"], ["four", "five", "six"]]],
                    dtype="object")
            })
        exp_val = np.empty(2, dtype="object")
        exp_val[0] = np.array([[1], [2]])
        exp_val[1] = np.array([[3], [4, 5, 6]])
        expected_output_chunk = DataChunk(**{target_field_name: exp_val})

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(copy.deepcopy(data_chunk))

        self.assertTrue(actual_output_chunk == expected_output_chunk)