def test_order(self):
    """Tests that chunks are produced in a different order than they
    arrive from the stream."""
    data_sizes = [200, 545]
    data_attrs_numbers = [5, 8, 2, 1, 15]
    inp_chunk_sizes = [1, 2, 3, 4, 5]
    buffer_sizes = [2, 38, 1000]

    for data_size, data_attrs_number, buffer_size, input_chunk_size in \
            itertools.product(data_sizes, data_attrs_numbers, buffer_sizes,
                              inp_chunk_sizes):
        data = generate_data_chunk(data_attrs_number, data_size)
        inp_data_chunks = create_list_of_data_chunks(data, input_chunk_size)

        chunk_collector = ChunkShuffler(buffer_size=buffer_size)
        accum = ChunkAccumulator(collector=chunk_collector)

        actual_chunks = list(accum.iter(inp_data_chunks))
        actual_ds = concat_chunks(*actual_chunks)

        # shuffling should change the order of the data but preserve its size
        self.assertNotEqual(data, actual_ds)
        self.assertEqual(len(data), len(actual_ds))
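# For illustration, a minimal sketch of the buffer-based shuffling the test
# above exercises. `buffered_shuffle` is a hypothetical standalone helper,
# not the project's ChunkShuffler: items accumulate in a fixed-size buffer
# that is shuffled and flushed whenever it fills up, so the output order
# differs from the input order while the overall contents stay the same.
import random


def buffered_shuffle(items, buffer_size):
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) == buffer_size:
            # flush the full buffer in a random order
            random.shuffle(buffer)
            yield from buffer
            buffer = []
    # flush whatever remains at the end of the stream
    random.shuffle(buffer)
    yield from buffer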
def gen_and_save_summs(self, data_source, output_file_path):
    """
    Generates summaries by running the model and writes them along with
    other attributes to a JSON file.

    :param data_source: kwargs that are passed to the validation data
        pipeline's `iter` method.
    :param output_file_path: path of the output JSON file.
    """
    safe_mkfdir(output_file_path)

    start_id = self.word_vocab[START].id
    end_id = self.word_vocab[END].id
    pad_id = self.word_vocab[PAD].id

    output_file = open(output_file_path, encoding='utf-8', mode='w')
    vocab_mapper = VocabMapper({ModelF.REV: self.word_vocab,
                                ModelF.GEN_SUMM: self.word_vocab,
                                ModelF.GEN_REV: self.word_vocab},
                               symbols_attr='token')
    chunk_coll = []
    for dc in self.val_data_pipeline.iter(**data_source):
        gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

        # converting to a data-chunk to use the internal writing mechanism
        new_dc = DataChunk()
        for fn in [ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                   ModelF.GROUP_ID]:
            new_dc[fn] = dc[fn]
        new_dc[ModelF.GEN_REV] = gen_revs
        new_dc[ModelF.GEN_SUMM] = gen_summ

        seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]

        # converting PyTorch tensors to numpy arrays if present
        new_dc = convert_tensors_to_numpy(new_dc)

        # formatting sequences: stripping start/end/pad special tokens
        for fn in seq_fnames:
            new_dc[fn] = format_seqs(new_dc[fn], start_id=start_id,
                                     end_id=end_id, pad_id=pad_id)

        # mapping token ids back to tokens
        new_dc = vocab_mapper(new_dc)

        # converting all sequences to strings
        for fn in seq_fnames:
            new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

        # grouping by product ids
        indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                   new_dc[ModelF.GROUP_ID]).values()
        for fn in [ModelF.GEN_REV, ModelF.REV]:
            new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

        del new_dc[ModelF.GROUP_ID]

        chunk_coll.append(new_dc)

    output_chunk = concat_chunks(*chunk_coll)
    output_chunk.to_json(f=output_file,
                         grouping_fnames=[ModelF.SUMM_CAT,
                                          ModelF.SUMM_GROUP_ID])
    output_file.close()

    logger.info("Generated summaries and saved to: '%s'." % output_file_path)

    # analytics for repetition checking; generated summaries are lists of
    # sentence strings, so they are merged before running analytics
    all_gen_summ_strs = [" ".join(sents)
                         for sents in output_chunk[ModelF.GEN_SUMM]]
    an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                    tokenizer=self.tok_func,
                                    sent_splitter=self.sent_split_func,
                                    n_grams_to_comp=(2, 3, 4))
    logger.info("Ran analytics of generated summaries.")
    metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
    logger.info(metrs_str)
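# For reference, a minimal sketch of the grouping step used above. This is a
# hypothetical re-implementation of group_vals_by_keys, assuming it maps each
# distinct key to the list of values that share it, preserving the order in
# which keys first appear:
from collections import OrderedDict


def group_vals_by_keys_sketch(vals, keys):
    groups = OrderedDict()
    for val, key in zip(vals, keys):
        groups.setdefault(key, []).append(val)
    return groups

# e.g. group_vals_by_keys_sketch(range(4), ['a', 'b', 'a', 'b'])
# -> OrderedDict([('a', [0, 2]), ('b', [1, 3])])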
def _merge_chunks(self):
    """Merges the collected data-chunks into a single chunk and returns it."""
    merged_dc = concat_chunks(*self._coll)
    return merged_dc
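# A minimal sketch of what concat_chunks is assumed to do (hypothetical
# re-implementation for dict-like chunks): the per-field values of the input
# chunks are concatenated into one merged chunk.
def concat_chunks_sketch(*chunks):
    merged = {}
    for chunk in chunks:
        for field, values in chunk.items():
            merged.setdefault(field, []).extend(list(values))
    return merged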