def test_readme_example(self):
    from mltoolkit.mldp.pipeline import Pipeline
    from mltoolkit.mldp.steps.readers import CsvReader
    from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
    from mltoolkit.mldp.steps.transformers.field import FieldSelector

    data_path = "mltoolkit/mldp/tests/data/tweets.csv"

    # creating steps
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tok_func=lambda x: x.split(),
                                     lowercase=True)
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol="<PAD>")

    # creating the pipeline
    pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
    pipeline.add_step(fields_selector)
    pipeline.add_step(token_processor)
    pipeline.add_step(padder)

    # iterate over data chunks
    for data_chunk in pipeline.iter(data_path=data_path):
        pass

    # generate documentation and print it
    print(pipeline)
def test_invalid_steps(self):
    """Testing whether an error is raised if an invalid step is present."""
    data_path = 'mldp/tests/data/news.csv'
    data_source = {'data_path': data_path}

    inv_reader = InvalidCsvReader()
    val_reader = CsvReader()

    val_transf1 = FieldSelector("text")
    val_transf2 = TokenProcessor(fnames='text')
    inv_transf1 = InvalidTransformer()
    accum = ChunkAccumulator(new_size=3)
    formatter = PandasFormatter()

    # try only the invalid reader and valid steps
    dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
    for vs in [val_transf1, val_transf2, accum, formatter]:
        dp.add_step(vs)
    with self.assertRaises(DataChunkError):
        for _ in dp.iter(**data_source):
            pass

    # try valid reader and invalid steps
    steps = [val_transf1, val_transf2, inv_transf1, accum]
    for st in permutations(steps):
        dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
        for s in st:
            dp.add_step(s)
        dp.add_step(formatter)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass
def test_output(self):
    """
    Testing a simple scenario where a token matching function, cleaner, and
    a simple token splitter are used.
    """
    field_name = "dummy"
    special_token = "<ANIMAL>"
    lower_case = True
    tok_mat_func = lambda x: token_matching_func(x, special_token)
    token_cleaning_func = lambda x: re.sub(r'[?!,.]', '', x)
    tokenization_func = lambda x: x.split()

    input_seqs = ["Hello, this is my dog!",
                  "A dummy sentence for tokenization.",
                  "What a lovely puppy!"]
    input_data_chunk = DataChunk(**{field_name: input_seqs})

    expect_seqs = [["hello", "this", "is", "my", special_token],
                   ["a", "dummy", "sentence", "for", "tokenization"],
                   ["what", "a", "lovely", special_token]]
    expected_data_chunk = DataChunk(**{field_name: expect_seqs})

    tokenizer = TokenProcessor(field_name,
                               tokenization_func=tokenization_func,
                               token_cleaning_func=token_cleaning_func,
                               token_matching_func=tok_mat_func,
                               lower_case=lower_case)

    actual_data_chunk = tokenizer(input_data_chunk)
    self.assertTrue(expected_data_chunk == actual_data_chunk)
def assemble_vocab_pipeline(text_fname, sep='\t', encoding='utf-8',
                            tok_func=None, lowercase=True):
    """Assembler for the vocabulary pipeline based on a CSV reader."""
    reader = CsvReader(sep=sep, encoding=encoding, quoting=QUOTE_NONE)
    token_processor = TokenProcessor(fnames=text_fname, lowercase=lowercase,
                                     tok_func=tok_func)

    # creating vocabulary pipeline
    vocab_pipeline = Pipeline(reader=reader)
    vocab_pipeline.add_step(token_processor)

    return vocab_pipeline
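# Illustrative usage sketch, not part of the original module: builds a word
# vocabulary from the pipeline assembled above, mirroring the Vocabulary API
# used in the tutorial test further below. The file paths and the 'text'
# field name are hypothetical, and the module's existing imports are assumed.
def _example_build_word_vocab():
    vocab_pipeline = assemble_vocab_pipeline(text_fname='text')
    word_vocab = Vocabulary(vocab_pipeline, name_prefix='words')
    # creates the vocab file on the first run, loads it on subsequent runs
    word_vocab.load_or_create('data/vocabs/words.txt',
                              data_source={'data_path': 'data/train.csv'},
                              data_fnames='text')
    return word_vocab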
def assemble_train_pipeline(word_vocab, max_groups_per_batch=1,
                            min_revs_per_group=None, max_revs_per_group=10,
                            seed=None, workers=1):
    """
    This pipeline is specific to the preprocessed Amazon and Yelp reviews.
    Creates a flow of transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    :param word_vocab: vocabulary object with words/tokens.
    :param max_groups_per_batch: number of groups each batch should have.
    :param min_revs_per_group: minimum number of reviews a group should have
        in order not to be discarded.
    :param max_revs_per_group: self-explanatory.
    :param seed: pass a fixed value to get the same data subsamples/shuffles
        every epoch; leave it unset if multi-processing is used and a
        different sequence of batches is desired every epoch. This has to do
        purely with multi-processing issues in combination with numpy.
    :param workers: number of worker processes that run the pipeline.
    """
    assert START in word_vocab
    assert END in word_vocab

    group_files_shuffler = GroupFileShuffler()

    reader = CsvReader(sep='\t', engine='python', chunk_size=None,
                       encoding='utf-8', quoting=QUOTE_NONE, timeout=None,
                       worker_threads_num=1)

    fname_renamer = FieldRenamer({InpDataF.REV_TEXT: ModelF.REV,
                                  InpDataF.GROUP_ID: ModelF.GROUP_ID})

    unit_sampler = UnitSampler(id_fname=ModelF.GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.GROUP_ID)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(ChunkShuffler(buffer_size=500))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    sorter = ChunkSorter(ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = SummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID, category_fname=ModelF.CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=group_files_shuffler,
                               worker_processes_num=workers, seed=seed,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(chunk_accum)

    # entry transformations
    pipeline.add_step(tokenizer)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(sorter)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    return pipeline
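# Illustrative usage sketch, not part of the original module: iterates over
# training batches produced by the pipeline assembled above. The data path is
# hypothetical; iteration relies on the Pipeline.iter(data_path=...) interface
# shown in the README test at the top of this section.
def _example_iterate_train_batches(word_vocab):
    train_pipeline = assemble_train_pipeline(word_vocab,
                                             max_groups_per_batch=4,
                                             max_revs_per_group=10,
                                             workers=1)
    for batch in train_pipeline.iter(data_path='data/train'):
        # each batch arrives as a data-chunk of padded PyTorch tensors
        pass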
def assemble_tuning_pipeline(word_vocab, max_groups_per_batch=1, tok_func=None,
                             lowercase=False):
    """
    The pipeline yields tokenized reviews and summaries that can be used for
    training (fine-tuning of the model).
    """
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=None, use_lists=True, quoting=QUOTE_NONE)

    chunk_accum = ChunkAccumulator(new_size=max_groups_per_batch)

    ama_spec_trans = AmazonTransformer(fnames_to_copy=[GoldDataF.PROD_ID,
                                                       GoldDataF.CAT])

    summ_mapper = SummMapper(fname=ModelF.SUMMS,
                             new_indx_fname=ModelF.SUMM_GROUP_INDX)

    token_processor = TokenProcessor(fnames=[ModelF.REV, ModelF.SUMM],
                                     tok_func=tok_func, lowercase=lowercase)

    vocab_mapper = VocabMapper({ModelF.REV: word_vocab,
                                ModelF.SUMM: word_vocab})

    fname_renamer = FieldRenamer({GoldDataF.PROD_ID: ModelF.GROUP_ID,
                                  GoldDataF.CAT: ModelF.CAT,
                                  ModelF.SUMMS: ModelF.SUMM})

    seq_wrapper = SeqWrapper(fname=[ModelF.REV, ModelF.SUMM],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV, ModelF.SUMM],
                    new_mask_fname=[ModelF.REV_MASK, ModelF.SUMM_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    # rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
    #                        group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
    #                        rev_mask_fname=ModelF.REV_MASK)

    # props
    len_prop = SummLenProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                           group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                           new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.SUMM, new_fname=ModelF.POV_PROP)
    rouge_prop = SummRougeProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                               group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                               summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                               new_fname=ModelF.ROUGE_PROP)
    rating_prop = DummyProp(fname=ModelF.SUMM, new_fname=ModelF.RATING_PROP,
                            fval=0.)

    np_formatter = NumpyFormatter([ModelF.LEN_PROP, ModelF.RATING_PROP,
                                   ModelF.POV_PROP, ModelF.ROUGE_PROP])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    # pipeline.add_step(shuffler)
    pipeline.add_step(chunk_accum)
    pipeline.add_step(ama_spec_trans)
    pipeline.add_step(summ_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(indxs_creator)

    # props
    pipeline.add_step(rating_prop)
    pipeline.add_step(rouge_prop)

    pipeline.add_step(token_processor)

    # the props below require tokenization
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, tok_func=None,
                           lowercase=False):
    """Assembles a data-pipeline for evaluation against gold summaries."""
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=max_groups_per_chunk, use_lists=True,
                       quoting=QUOTE_NONE)

    rouge_prop = SummEvalRougeKnob(hyp_fnames=[GoldDataF.SUMM1,
                                               GoldDataF.SUMM2,
                                               GoldDataF.SUMM3],
                                   ref_fnames=GoldDataF.REVS,
                                   new_fname=ModelF.ROUGE_PROP)

    field_dupl = FieldDuplicator({GoldDataF.SUMM1: TOK_SUMM1,
                                  GoldDataF.SUMM2: TOK_SUMM2,
                                  GoldDataF.SUMM3: TOK_SUMM3})

    tokenizer = TokenProcessor(fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3] +
                                      GoldDataF.REVS,
                               tok_func=tok_func, lowercase=lowercase)

    field_dropper = FieldDropper([TOK_SUMM1, TOK_SUMM2, TOK_SUMM3])

    rating_prop = DummyProp(fname=GoldDataF.PROD_ID,
                            new_fname=ModelF.RATING_PROP, fval=0.)

    len_prop = SummEvalLenProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               rev_fnames=GoldDataF.REVS,
                               new_fname=ModelF.LEN_PROP)

    pov_prop = SummEvalPOVProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               new_fname=ModelF.POV_PROP)

    # summaries are not converted to ids (only reviews are mapped)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    dataset_spec_trans = AmazonTransformer([GoldDataF.PROD_ID, GoldDataF.CAT,
                                            ModelF.ROUGE_PROP,
                                            ModelF.LEN_PROP,
                                            ModelF.RATING_PROP,
                                            ModelF.POV_PROP])

    fname_renamer = FieldRenamer({GoldDataF.PROD_ID: ModelF.GROUP_ID,
                                  GoldDataF.CAT: ModelF.CAT})

    seq_wrapper = SeqWrapper(fname=[ModelF.REV],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV], new_mask_fname=[ModelF.REV_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    np_formatter = NumpyFormatter([ModelF.ROUGE_PROP, ModelF.LEN_PROP,
                                   ModelF.RATING_PROP, ModelF.POV_PROP])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rouge_prop)
    pipeline.add_step(rating_prop)

    # props that require tokenization
    pipeline.add_step(field_dupl)
    pipeline.add_step(tokenizer)
    pipeline.add_step(pov_prop)
    pipeline.add_step(len_prop)
    pipeline.add_step(field_dropper)

    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, dataset='yelp',
                           tokenization_func=lambda x: x.split()):
    """Assembles the pipeline for evaluation on the Yelp and Amazon eval sets."""
    assert dataset in ['yelp', 'amazon']

    if dataset == 'yelp':
        fields_obj = YelpEvalF
        fname_renamer = FieldRenamer({fields_obj.BUSINESS_ID: ModelF.GROUP_ID})
        dataset_spec_trans = YelpTransformer()
    else:
        fields_obj = AmazonEvalF
        fname_renamer = FieldRenamer({fields_obj.PROD_ID: ModelF.GROUP_ID,
                                      fields_obj.CAT: ModelF.CAT})
        dataset_spec_trans = AmazonTransformer()

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)

    # notice that I do not tokenize summaries, I leave them as they are!
    token_processor = TokenProcessor(fnames=fields_obj.REVS,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({fn: word_vocab for fn in fields_obj.REVS})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.REV_MASK,
                                         ModelF.CAT, ModelF.GROUP_ID])

    indxs_creator = GoldSummRevIndxsCreator(group_id_fname=ModelF.GROUP_ID)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    formatter = PyTorchFormatter()

    pipeline = Pipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(formatter)

    return pipeline
def assemble_infer_pipeline(word_vocab, max_groups_per_chunk=1, max_reviews=10,
                            tokenization_func=lambda x: x.split()):
    """Assembles a simple inference pipeline for summary generation.

    Assumes that csv files are read where reviews have the following column
    names: 'rev1', 'rev2', ..., 'revN', each review separated by \t.

    Args:
        word_vocab: word vocabulary to convert words to ids.
        max_groups_per_chunk: self-explanatory.
        max_reviews: the maximum number of reviews to load per group. Columns
            in the CSV file should be `rev1`, ..., `revN`.
        tokenization_func: self-explanatory.
    """
    rev_fnames = [f'{InfDataF.REV_PREFIX}{i}'
                  for i in range(1, max_reviews + 1)]

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)

    rev_flattener = ReviewFlattener(group_id_fname=InfDataF.GROUP_ID,
                                    rev_fnames=rev_fnames)

    token_processor = TokenProcessor(fnames=ModelF.REV,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.GROUP_ID])

    # re-using the step
    summ_rev_indx_creator = GoldSummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID)

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rev_flattener)
    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(summ_rev_indx_creator)

    return pipeline
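# Illustrative usage sketch, not part of the original module: runs the
# inference pipeline assembled above on a hypothetical tab-separated file with
# 'rev1' ... 'rev10' review columns, as described in the docstring. The file
# path is hypothetical and the module's existing imports are assumed.
def _example_iterate_infer_chunks(word_vocab):
    infer_pipeline = assemble_infer_pipeline(word_vocab,
                                             max_groups_per_chunk=1,
                                             max_reviews=10)
    for chunk in infer_pipeline.iter(data_path='data/input_reviews.csv'):
        # each chunk contains id-mapped, padded reviews ready for the model
        pass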
def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(input_cols_number=3,
                                            tweets_indx=2,
                                            add_header=['ids', 'labels',
                                                        'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tok_func=twitter_tokenizer.tokenize,
                                     tok_cleaning_func=twitter_text_cleaner,
                                     lowercase=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)

    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={"tweets": words_vocab,
                                                "labels": labels_vocab})
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader,
                                 preprocessor=preprocessor,
                                 worker_processes_num=1,
                                 name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)
def assemble_unsup_pipeline(word_vocab, max_groups_per_batch=1,
                            reader_threads=5, min_revs_per_group=None,
                            max_revs_per_group=10, worker_num=1, seed=None,
                            tok_func=None, lowercase=True, max_len=None,
                            shuffler_buffer_size=250):
    """Creates a data-pipeline that yields batches to train the unsup. model.

    Creates a flow of data transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    Args:
        word_vocab: vocabulary object with words/tokens.
        max_groups_per_batch: number of groups each batch should have.
        min_revs_per_group: minimum number of reviews a group should have in
            order not to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to get the same data subsamples/shuffles every epoch.
        max_len: if passed, will filter out all reviews that are longer than
            the threshold.

    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    assert START in word_vocab and END in word_vocab

    file_shuffler = FileShuffler()

    # TODO: explain how grouping works here - each file has reviews of a group
    reader = CsvReader(sep='\t', engine='c', chunk_size=None, encoding='utf-8',
                       quoting=QUOTE_NONE, buffer_size=200, timeout=None,
                       worker_threads_num=reader_threads, use_lists=True)

    fname_renamer = FieldRenamer({InpDataF.REV_TEXT: ModelF.REV,
                                  InpDataF.GROUP_ID: ModelF.REV_GROUP_ID,
                                  InpDataF.RATING: ModelF.REV_RATING,
                                  InpDataF.RATING_DEV: ModelF.RATING_PROP,
                                  InpDataF.CAT: ModelF.REV_CAT})

    unit_sampler = UnitSampler(id_fname=ModelF.REV_GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.REV_GROUP_ID)

    # property and related steps
    len_prop = LenProp(len_fname=ModelF.REV_LEN, new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.REV, new_fname=ModelF.POV_PROP)
    rouge_field_merger = FieldMerger(merge_fnames=[InpDataF.ROUGE1,
                                                   InpDataF.ROUGE2,
                                                   InpDataF.ROUGEL],
                                     new_fname=ModelF.ROUGE_PROP)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(
        ChunkShuffler(buffer_size=shuffler_buffer_size))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch, strict=True)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV, tok_func=tok_func,
                               lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].token,
                             end_el=word_vocab[END].token)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = GroupRevIndxsCreator(
        rev_group_id_fname=ModelF.REV_GROUP_ID, rev_cat_fname=ModelF.REV_CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    # extra steps for the loss associated with probability mass
    un_word_cal = UniqueWordCalc(
        new_fname=ModelF.OTHER_REV_UWORDS,
        rev_fname=ModelF.REV,
        other_rev_indxs_fname=ModelF.OTHER_REV_INDXS,
        other_rev_indxs_mask_fname=ModelF.OTHER_REV_INDXS_MASK)
    un_word_padder = Padder(fname=ModelF.OTHER_REV_UWORDS,
                            new_mask_fname=ModelF.OTHER_REV_UWORDS_MASK,
                            pad_symbol=word_vocab[PAD].id,
                            padding_mode='right')

    numpy_formatter = NumpyFormatter(fnames=[ModelF.ROUGE_PROP,
                                             ModelF.RATING_PROP,
                                             ModelF.LEN_PROP,
                                             ModelF.POV_PROP])

    pipeline = PyTorchPipeline(reader=reader, preprocessor=file_shuffler,
                               worker_processes_num=worker_num, seed=seed,
                               output_buffer_size=50,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(rouge_field_merger)
    pipeline.add_step(tokenizer)

    if max_len:
        pipeline.add_step(TextLenFilter(fname=ModelF.REV, max_len=max_len))

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)

    # properties
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(chunk_accum)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    # adding steps for word count computation
    pipeline.add_step(un_word_cal)
    pipeline.add_step(un_word_padder)

    pipeline.add_step(numpy_formatter)

    return pipeline
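# Illustrative usage sketch, not part of the original module: iterates over
# batches from the unsupervised training pipeline assembled above. The data
# path is hypothetical; each input file is assumed to contain the reviews of
# a single group, as noted in the TODO comment above.
def _example_iterate_unsup_batches(word_vocab):
    unsup_pipeline = assemble_unsup_pipeline(word_vocab,
                                             max_groups_per_batch=4,
                                             worker_num=1, max_len=100)
    for batch in unsup_pipeline.iter(data_path='data/unsup_train'):
        pass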