def test_vocabulary_mapper_multidim_lists(self):
    """Testing whether the mapper can map multi-dim lists."""
    target_field_name = "dummy"
    symbols_attr = "id"

    data_chunk = DataChunk(**{
        target_field_name: np.array(
            [[["one"], ["two"]], [["three"], ["four", "five", "six"]]],
            dtype="object")
    })

    exp_val = np.empty(2, dtype="object")
    exp_val[0] = np.array([[1], [2]])
    exp_val[1] = np.array([[3], [4, 5, 6]])
    expected_output_chunk = DataChunk(**{target_field_name: exp_val})

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(copy.deepcopy(data_chunk))

    self.assertTrue(actual_output_chunk == expected_output_chunk)
def test_vocabulary_mapper_mixed_field_values(self):
    """Testing whether the mapper can map multi-dim mixed field values."""
    target_field_name = "dummy"
    symbols_attr = "id"

    data_chunk = DataChunk(**{target_field_name: np.array([
        [["one"], np.array(["two", "one"])],
        [["three"], np.array(["four", "five", "six"])]
    ], dtype="object")})
    expected_output_chunk = DataChunk(**{target_field_name: np.array([
        [[1], np.array([2, 1])],
        [[3], np.array([4, 5, 6])]
    ], dtype="object")})

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(data_chunk)

    self.assertTrue(actual_output_chunk == expected_output_chunk)
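# The two tests above exercise VocabMapper on nested (and mixed) object
# structures. As a self-contained illustration of the expected behaviour --
# not the library's implementation -- the mapping boils down to a recursive
# token -> id lookup that preserves nesting. `map_nested` and `toy_vocab`
# below are hypothetical names used only for this sketch; plain lists stand
# in for the object arrays used in the tests.
def map_nested(value, vocab):
    """Recursively replace string tokens with their integer ids."""
    if isinstance(value, str):
        return vocab[value]
    # works for both lists and numpy object arrays, since both are iterable
    return [map_nested(v, vocab) for v in value]

toy_vocab = {"zero": 0, "one": 1, "two": 2, "three": 3,
             "four": 4, "five": 5, "six": 6}
nested = [[["one"], ["two"]], [["three"], ["four", "five", "six"]]]
print(map_nested(nested, toy_vocab))  # [[[1], [2]], [[3], [4, 5, 6]]]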
def test_vocabulary_mapper(self):
    """Testing whether the mapper allows mapping field values back and
    forth (tokens to ids and ids back to tokens).
    """
    data_path = 'mldp/tests/data/mock_data.csv'
    target_fields = ["first_name", "last_name", "email", "gender"]

    reader = CsvReader(sep=',')
    vocab = Vocabulary(reader)

    for target_field in target_fields:
        vocab.create(data_source={"data_path": data_path},
                     data_fnames=target_field)

        data = read_data_from_csv_file(data_path)
        data_original = copy.deepcopy(data)

        mapper_to = VocabMapper({target_field: vocab}, "id")
        mapper_back = VocabMapper({target_field: vocab}, "token")

        data = mapper_to(data)
        data = mapper_back(data)

        self.assertTrue(
            (data[target_field] == data_original[target_field]).all())
def assemble_train_pipeline(word_vocab, max_groups_per_batch=1,
                            min_revs_per_group=None, max_revs_per_group=10,
                            seed=None, workers=1):
    """
    This pipeline is specific to the preprocessed Amazon and Yelp reviews.

    Creates a flow of transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    :param word_vocab: vocabulary object with words/tokens.
    :param max_groups_per_batch: number of groups each batch should have.
    :param min_revs_per_group: minimum number of reviews a group should have
        in order not to be discarded.
    :param max_revs_per_group: maximum number of reviews to keep per group.
    :param seed: fix to obtain the same data subsamples/shuffles every epoch.
        This has to do purely with multi-processing issues in combination
        with numpy.
    :param workers: number of worker processes used by the pipeline.
    """
    assert START in word_vocab
    assert END in word_vocab

    group_files_shuffler = GroupFileShuffler()

    reader = CsvReader(sep='\t', engine='python', chunk_size=None,
                       encoding='utf-8', quoting=QUOTE_NONE, timeout=None,
                       worker_threads_num=1)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.GROUP_ID
    })

    unit_sampler = UnitSampler(id_fname=ModelF.GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.GROUP_ID)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(ChunkShuffler(buffer_size=500))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alternation of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    sorter = ChunkSorter(ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = SummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID, category_fname=ModelF.CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=group_files_shuffler,
                               worker_processes_num=workers, seed=seed,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(chunk_accum)

    # entry transformations
    pipeline.add_step(tokenizer)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(sorter)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    return pipeline
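# A minimal usage sketch (not part of the original code base): it assumes the
# names above are importable from this module, that `word_vocab` was built
# beforehand, and that the reader accepts a `data_path` data-source key, as
# the other pipelines in this repository do. The path is a placeholder.
def _train_pipeline_demo(word_vocab, train_data_path="data/train"):
    pipeline = assemble_train_pipeline(word_vocab,
                                       max_groups_per_batch=8,
                                       min_revs_per_group=2,
                                       max_revs_per_group=10,
                                       seed=42, workers=1)
    # each yielded chunk should contain padded review tensors plus the index
    # fields added by SummRevIndxsCreator / RevMapper
    for batch in pipeline.iter(data_path=train_data_path):
        print(batch[ModelF.REV].shape, batch[ModelF.REV_MASK].shape)
        break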
def assemble_tuning_pipeline(word_vocab, max_groups_per_batch=1,
                             tok_func=None, lowercase=False):
    """
    The pipeline yields tokenized reviews and summaries that can be used
    for training (fine-tuning of the model).
    """
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=None, use_lists=True, quoting=QUOTE_NONE)

    chunk_accum = ChunkAccumulator(new_size=max_groups_per_batch)

    ama_spec_trans = AmazonTransformer(fnames_to_copy=[
        GoldDataF.PROD_ID,
        GoldDataF.CAT,
    ])

    summ_mapper = SummMapper(fname=ModelF.SUMMS,
                             new_indx_fname=ModelF.SUMM_GROUP_INDX)

    token_processor = TokenProcessor(fnames=[ModelF.REV, ModelF.SUMM],
                                     tok_func=tok_func, lowercase=lowercase)
    vocab_mapper = VocabMapper({
        ModelF.REV: word_vocab,
        ModelF.SUMM: word_vocab
    })

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT,
        ModelF.SUMMS: ModelF.SUMM
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV, ModelF.SUMM],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV, ModelF.SUMM],
                    new_mask_fname=[ModelF.REV_MASK, ModelF.SUMM_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    # rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
    #                        group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
    #                        rev_mask_fname=ModelF.REV_MASK)

    # props
    len_prop = SummLenProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                           group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                           new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.SUMM, new_fname=ModelF.POV_PROP)
    rouge_prop = SummRougeProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                               group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                               summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                               new_fname=ModelF.ROUGE_PROP)
    rating_prop = DummyProp(fname=ModelF.SUMM,
                            new_fname=ModelF.RATING_PROP, fval=0.)

    np_formatter = NumpyFormatter([
        ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP,
        ModelF.ROUGE_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    # pipeline.add_step(shuffler)
    pipeline.add_step(chunk_accum)
    pipeline.add_step(ama_spec_trans)
    pipeline.add_step(summ_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(indxs_creator)

    # props
    pipeline.add_step(rating_prop)
    pipeline.add_step(rouge_prop)

    pipeline.add_step(token_processor)

    # the props below require tokenization
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, tok_func=None,
                           lowercase=False):
    """Assembles a data-pipeline for evaluation against gold summaries."""
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=max_groups_per_chunk, use_lists=True,
                       quoting=QUOTE_NONE)

    rouge_prop = SummEvalRougeKnob(
        hyp_fnames=[GoldDataF.SUMM1, GoldDataF.SUMM2, GoldDataF.SUMM3],
        ref_fnames=GoldDataF.REVS, new_fname=ModelF.ROUGE_PROP)

    field_dupl = FieldDuplicator({
        GoldDataF.SUMM1: TOK_SUMM1,
        GoldDataF.SUMM2: TOK_SUMM2,
        GoldDataF.SUMM3: TOK_SUMM3
    })

    tokenizer = TokenProcessor(fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3] +
                               GoldDataF.REVS,
                               tok_func=tok_func, lowercase=lowercase)
    field_dropper = FieldDropper([TOK_SUMM1, TOK_SUMM2, TOK_SUMM3])

    rating_prop = DummyProp(fname=GoldDataF.PROD_ID,
                            new_fname=ModelF.RATING_PROP, fval=0.)
    len_prop = SummEvalLenProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               rev_fnames=GoldDataF.REVS,
                               new_fname=ModelF.LEN_PROP)
    pov_prop = SummEvalPOVProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               new_fname=ModelF.POV_PROP)

    # summaries are not converted to token ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    dataset_spec_trans = AmazonTransformer([
        GoldDataF.PROD_ID, GoldDataF.CAT, ModelF.ROUGE_PROP,
        ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP
    ])

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV], new_mask_fname=[ModelF.REV_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    np_formatter = NumpyFormatter([
        ModelF.ROUGE_PROP, ModelF.LEN_PROP, ModelF.RATING_PROP,
        ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rouge_prop)
    pipeline.add_step(rating_prop)

    # props that require tokenization
    pipeline.add_step(field_dupl)
    pipeline.add_step(tokenizer)
    pipeline.add_step(pov_prop)
    pipeline.add_step(len_prop)
    pipeline.add_step(field_dropper)

    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, dataset='yelp',
                           tokenization_func=lambda x: x.split()):
    """Assembles the pipeline for evaluation on the Yelp and Amazon eval sets."""
    assert dataset in ['yelp', 'amazon']

    if dataset == 'yelp':
        fields_obj = YelpEvalF
        fname_renamer = FieldRenamer({fields_obj.BUSINESS_ID: ModelF.GROUP_ID})
        dataset_spec_trans = YelpTransformer()
    else:
        fields_obj = AmazonEvalF
        fname_renamer = FieldRenamer({
            fields_obj.PROD_ID: ModelF.GROUP_ID,
            fields_obj.CAT: ModelF.CAT
        })
        dataset_spec_trans = AmazonTransformer()

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)

    # notice that I do not tokenize summaries, I leave them as they are!
    token_processor = TokenProcessor(fnames=fields_obj.REVS,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summ tokens to ids
    vocab_mapper = VocabMapper({fn: word_vocab for fn in fields_obj.REVS})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.REV_MASK,
                                         ModelF.CAT, ModelF.GROUP_ID])

    indxs_creator = GoldSummRevIndxsCreator(group_id_fname=ModelF.GROUP_ID)
    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    formatter = PyTorchFormatter()

    pipeline = Pipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(formatter)

    return pipeline
def assemble_infer_pipeline(word_vocab, max_groups_per_chunk=1, max_reviews=10,
                            tokenization_func=lambda x: x.split()):
    """Assembles a simple inference pipeline for summary generation.

    Assumes that csv files are read where reviews are stored in tab-separated
    columns named 'rev1', 'rev2', ..., 'revN'.

    Args:
        word_vocab: word vocabulary to convert words to ids.
        max_groups_per_chunk: self-explanatory.
        max_reviews: the maximum number of reviews to load per group.
            Columns in the CSV file should be `rev1`, ..., `revN`.
        tokenization_func: self-explanatory.
    """
    rev_fnames = [
        f'{InfDataF.REV_PREFIX}{i}' for i in range(1, max_reviews + 1)
    ]

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)

    rev_flattener = ReviewFlattener(group_id_fname=InfDataF.GROUP_ID,
                                    rev_fnames=rev_fnames)
    token_processor = TokenProcessor(fnames=ModelF.REV,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summ tokens to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.GROUP_ID])

    # re-using the step
    summ_rev_indx_creator = GoldSummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID)

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rev_flattener)
    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(summ_rev_indx_creator)

    return pipeline
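# A minimal usage sketch (an illustration, not code from the original repo):
# it writes a tiny tab-separated file in the layout the docstring above
# describes and iterates the resulting pipeline. The column names, file
# contents, and `data_path` data-source key are assumptions (the latter
# mirrors the other pipelines in this repository), and a trained
# `word_vocab` is expected to exist.
def _infer_pipeline_demo(word_vocab, tmp_path="groups.csv"):
    header = ["group_id", "rev1", "rev2", "rev3"]
    row = ["prod_1", "great battery life", "arrived quickly",
           "works as advertised"]
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write("\t".join(header) + "\n")
        f.write("\t".join(row) + "\n")

    pipeline = assemble_infer_pipeline(word_vocab, max_groups_per_chunk=1,
                                       max_reviews=3)
    for chunk in pipeline.iter(data_path=tmp_path):
        print(chunk[ModelF.REV].shape)  # padded matrix of review token ids
        break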
def gen_and_save_summs(self, data_source, output_file_path):
    """
    Generates summaries by running the model and writes them along with
    other attributes to a json file.

    :param data_source: self-explanatory.
    :param output_file_path: self-explanatory.
    """
    safe_mkfdir(output_file_path)
    start_id = self.word_vocab[START].id
    end_id = self.word_vocab[END].id
    pad_id = self.word_vocab[PAD].id

    output_file = open(output_file_path, encoding='utf-8', mode='w')
    vocab_mapper = VocabMapper(
        {
            ModelF.REV: self.word_vocab,
            ModelF.GEN_SUMM: self.word_vocab,
            ModelF.GEN_REV: self.word_vocab
        }, symbols_attr='token')

    chunk_coll = []
    for i, dc in enumerate(self.val_data_pipeline.iter(**data_source), 1):
        gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

        # converting to a data-chunk to use the internal writing mechanism
        new_dc = DataChunk()
        for fn in [ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                   ModelF.GROUP_ID]:
            new_dc[fn] = dc[fn]

        new_dc[ModelF.GEN_REV] = gen_revs
        new_dc[ModelF.GEN_SUMM] = gen_summ

        seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]

        # converting PyTorch tensors to numpy arrays if present
        new_dc = convert_tensors_to_numpy(new_dc)

        for fn in seq_fnames:
            new_dc[fn] = format_seqs(new_dc[fn], start_id=start_id,
                                     end_id=end_id, pad_id=pad_id)

        new_dc = vocab_mapper(new_dc)

        # convert all seqs to strings
        for fn in seq_fnames:
            new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

        # group by product ids
        indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                   new_dc[ModelF.GROUP_ID]).values()
        for fn in [ModelF.GEN_REV, ModelF.REV]:
            new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

        del new_dc[ModelF.GROUP_ID]

        chunk_coll.append(new_dc)

    output_chunk = concat_chunks(*chunk_coll)
    output_chunk.to_json(
        f=output_file,
        grouping_fnames=[ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID])

    logger.info("Generated summaries and saved to: '%s'." % output_file_path)

    # analytics for repetition checking
    # because generated summaries are lists of strings, they need to be
    # merged together before running analytics
    all_gen_summ_strs = [
        " ".join(sents) for sents in output_chunk[ModelF.GEN_SUMM]
    ]
    an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                    tokenizer=self.tok_func,
                                    sent_splitter=self.sent_split_func,
                                    n_grams_to_comp=(2, 3, 4))

    logger.info("Ran analytics of generated summaries.")
    metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
    logger.info(metrs_str)
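# For intuition only: `format_seqs` above is given the start/end/pad ids,
# which suggests it trims special symbols before ids are mapped back to
# tokens. The helper below (`strip_special_ids`, a hypothetical name, not the
# repo's implementation) sketches that kind of post-processing on plain lists.
def strip_special_ids(seqs, start_id, end_id, pad_id):
    """Drop a leading start id and cut each sequence at the end/pad ids."""
    cleaned = []
    for seq in seqs:
        seq = list(seq)
        if seq and seq[0] == start_id:
            seq = seq[1:]
        out = []
        for sym in seq:
            if sym in (end_id, pad_id):
                break
            out.append(sym)
        cleaned.append(out)
    return cleaned

# e.g. strip_special_ids([[1, 7, 8, 2, 0, 0]], start_id=1, end_id=2, pad_id=0)
# -> [[7, 8]]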
def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(
        input_cols_number=3, tweets_indx=2,
        add_header=['ids', 'labels', 'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(
        fnames="tweets",
        tok_func=twitter_tokenizer.tokenize,
        tok_cleaning_func=twitter_text_cleaner,
        lowercase=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)

    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={
        "tweets": words_vocab,
        "labels": labels_vocab
    })
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader,
                                 preprocessor=preprocessor,
                                 worker_processes_num=1,
                                 name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)
def assemble_unsup_pipeline(word_vocab, max_groups_per_batch=1,
                            reader_threads=5, min_revs_per_group=None,
                            max_revs_per_group=10, worker_num=1, seed=None,
                            tok_func=None, lowercase=True, max_len=None,
                            shuffler_buffer_size=250):
    """Creates a data-pipeline that yields batches to train the unsup. model.

    Creates a flow of data transformation steps that modify the data until
    the final form is reached in terms of PyTorch tensors.

    Args:
        word_vocab: vocabulary object with words/tokens.
        max_groups_per_batch: number of groups each batch should have.
        min_revs_per_group: minimum number of reviews a group should have in
            order not to be discarded.
        max_revs_per_group: self-explanatory.
        seed: fix to obtain the same data subsamples/shuffles every epoch.
        max_len: if passed, filters out all reviews that are longer than the
            threshold.

    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    assert START in word_vocab and END in word_vocab

    file_shuffler = FileShuffler()

    # TODO: explain how grouping works here - each file has reviews of a group
    reader = CsvReader(sep='\t', engine='c', chunk_size=None,
                       encoding='utf-8', quoting=QUOTE_NONE, buffer_size=200,
                       timeout=None, worker_threads_num=reader_threads,
                       use_lists=True)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.REV_GROUP_ID,
        InpDataF.RATING: ModelF.REV_RATING,
        InpDataF.RATING_DEV: ModelF.RATING_PROP,
        InpDataF.CAT: ModelF.REV_CAT
    })

    unit_sampler = UnitSampler(id_fname=ModelF.REV_GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.REV_GROUP_ID)

    # property and related steps
    len_prop = LenProp(len_fname=ModelF.REV_LEN, new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.REV, new_fname=ModelF.POV_PROP)
    rouge_field_merger = FieldMerger(
        merge_fnames=[InpDataF.ROUGE1, InpDataF.ROUGE2, InpDataF.ROUGEL],
        new_fname=ModelF.ROUGE_PROP)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(
        ChunkShuffler(buffer_size=shuffler_buffer_size))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch, strict=True)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alternation of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV, tok_func=tok_func,
                               lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].token,
                             end_el=word_vocab[END].token)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = GroupRevIndxsCreator(
        rev_group_id_fname=ModelF.REV_GROUP_ID, rev_cat_fname=ModelF.REV_CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    # extra steps for the loss associated with probability mass
    un_word_cal = UniqueWordCalc(
        new_fname=ModelF.OTHER_REV_UWORDS, rev_fname=ModelF.REV,
        other_rev_indxs_fname=ModelF.OTHER_REV_INDXS,
        other_rev_indxs_mask_fname=ModelF.OTHER_REV_INDXS_MASK)
    un_word_padder = Padder(fname=ModelF.OTHER_REV_UWORDS,
                            new_mask_fname=ModelF.OTHER_REV_UWORDS_MASK,
                            pad_symbol=word_vocab[PAD].id,
                            padding_mode='right')

    numpy_formatter = NumpyFormatter(fnames=[
        ModelF.ROUGE_PROP, ModelF.RATING_PROP, ModelF.LEN_PROP,
        ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, preprocessor=file_shuffler,
                               worker_processes_num=worker_num, seed=seed,
                               output_buffer_size=50,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(rouge_field_merger)
    pipeline.add_step(tokenizer)

    if max_len:
        pipeline.add_step(TextLenFilter(fname=ModelF.REV, max_len=max_len))

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)

    # properties
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(chunk_accum)

    pipeline.add_step(vocab_mapper)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    # adding steps for word count computation
    pipeline.add_step(un_word_cal)
    pipeline.add_step(un_word_padder)

    pipeline.add_step(numpy_formatter)

    return pipeline