def init_logger(logger_name=__name__, output_path=None, level=logging.INFO):
    """
    Initializes a logger for console and file writing.

    :param logger_name: self-explanatory.
    :param output_path: directory or file path where the logs should be saved.
        By default it will not store file logs.
    :param level: self-explanatory.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(level)
    formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")

    # adding console output
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    if output_path:
        if is_file_path(output_path):
            safe_mkfdir(output_path)
            if os.path.exists(output_path):
                os.remove(output_path)
        else:
            safe_mkdir(output_path)
            # using the default name of the logger
            default_file_name = "log_" + strftime("%b_%d_%H_%M_%S") + '.txt'
            output_path = os.path.join(output_path, default_file_name)
        file_handler = logging.FileHandler(output_path)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
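# A minimal usage sketch for init_logger (hypothetical names and paths; assumes
# the standard `logging` module and the helpers above are importable). Passing
# a directory as `output_path` makes the logger also write to a timestamped
# file inside it; omitting it keeps console-only logging.

example_logger = init_logger(logger_name="experiment",
                             output_path="runs/exp1/logs",
                             level=logging.DEBUG)
example_logger.info("Written both to the console and the log file.")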
def delete_attr_from_params(input_fp, output_fp, attr_names, match_start=False):
    """Removes particular attrs from the dictionary of params, saves back.

    Args:
        input_fp: path to the input file with model parameters.
        output_fp: path where the modified parameters should be saved.
        attr_names: names of the attributes to delete.
        match_start: if set to True will match based on the beginning of the
            string. E.g., `_encoder` matches `_encoder.param1.linear.weights`.
    """
    # TODO: explain how regex works
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    if match_start:
        for attr_name in attr_names:
            r_aname = re.compile("^" + attr_name)
            for param_name in list(model_params.keys()):
                if r_aname.match(param_name):
                    del model_params[param_name]
                    logger.info("Deleting: %s." % param_name)
    else:
        for attr_name in attr_names:
            if attr_name in model_params:
                del model_params[attr_name]
                logger.info("Deleting: %s." % attr_name)

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
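# A hypothetical call illustrating the effect of `match_start`: with prefix
# matching enabled, every parameter whose name starts with "_encoder" (e.g.
# "_encoder.param1.linear.weights") is removed from the checkpoint. Paths and
# attribute names are placeholders.

delete_attr_from_params(input_fp="checkpoints/model.pt",
                        output_fp="checkpoints/model_no_encoder.pt",
                        attr_names=["_encoder"],
                        match_start=True)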
def run(self):
    """Tokenizes review texts from the input CSV file and writes the result
    to the output CSV file.
    """
    safe_mkfdir(self.out_file_path)
    data_units = []
    for data_unit in read_csv_file(self.inp_file_path, sep='\t'):
        tok_str = " ".join(tokenizer(data_unit[OutputFields.REV_TEXT]))
        data_unit[OutputFields.REV_TEXT] = tok_str
        data_units.append(data_unit)
    write_group_to_csv(self.out_file_path, data_units, sep='\t')
def save(self, file_path, encoding='utf-8'):
    """Saves hyper-params object as a json file."""
    safe_mkfdir(file_path)
    params = self.to_dict(types_whitelist=OVERRIDABLE_ATTR_TYPES)
    with codecs.open(file_path, encoding=encoding, mode='w') as f:
        json.dump(params, f, indent=2)
    logger.debug("Extracted the following hparams: '%s'."
                 "" % " ".join(params.keys()))
    logger.info("Saved hyper-parameters to '%s'." % file_path)
def rename_params(input_fp, output_fp, old_attr_names, new_attr_names):
    """Renames a model's parameters, saves them to an output file."""
    assert len(old_attr_names) == len(new_attr_names)
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    for old_name, new_name in zip(old_attr_names, new_attr_names):
        model_params[new_name] = model_params[old_name]
        del model_params[old_name]

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
def extract_params(input_fp, output_fp, attr_names, device='cpu'):
    """Extracts a subset of parameters from the file and saves them to a new
    file.
    """
    model_params = T.load(input_fp, device)[MODEL_PARAMS]
    params_to_save = {}
    for attr_name in attr_names:
        params_to_save[attr_name] = model_params[attr_name]

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: params_to_save}, f=output_fp)
def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE, sep='\t', engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
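# Assuming the truecaser above is sacremoses' MosesTruecaser, a trained model
# can later be loaded back and applied roughly as in this hedged sketch; the
# path is a placeholder and the exact keyword names may differ across
# sacremoses versions.

from sacremoses import MosesTruecaser

loaded_mtr = MosesTruecaser(load_from="models/truecaser.pkl", is_asr=True)
print(loaded_mtr.truecase("THE BATTERY LASTS ALL DAY", return_str=True))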
def rename_params_by_prefix(input_fp, output_fp, old_prefixes, new_prefixes):
    """Renames a model's parameters by matching prefixes, saves them to an
    output file.

    Renaming is performed in multiple iterations if multiple prefixes are
    given.
    """
    assert len(old_prefixes) == len(new_prefixes)
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    for old_prefix, new_prefix in zip(old_prefixes, new_prefixes):
        tmp_params = dict()
        for curr_name, param in model_params.items():
            if curr_name.startswith(old_prefix):
                new_name = new_prefix + curr_name[len(old_prefix):]
                print(f"{curr_name} => {new_name}")
                curr_name = new_name
            tmp_params[curr_name] = param
        model_params = tmp_params

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
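# A hypothetical call: every parameter whose name starts with "_decoder." is
# renamed to start with "_generator.", while non-matching parameters are
# copied untouched. Paths and prefixes below are placeholders.

rename_params_by_prefix(input_fp="checkpoints/model.pt",
                        output_fp="checkpoints/model_renamed.pt",
                        old_prefixes=["_decoder."],
                        new_prefixes=["_generator."])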
def write_group_to_csv(out_file_path, units, sep="\t"):
    """Writes data units into a CSV file.

    Args:
        out_file_path (str): self-explanatory.
        units (list): list with dicts (review texts and other attributes).
        sep (str): separator used in the output csv file.

    Returns:
        None.
    """
    safe_mkfdir(out_file_path)
    with open(out_file_path, 'w', encoding='utf-8') as f:
        header = None
        for du in units:
            if header is None:
                header = du.keys()
                f.write(sep.join(header) + "\n")
            str_to_write = sep.join([str(du[attr]) for attr in header])
            f.write(str_to_write + '\n')
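# A small sketch of the expected input: `units` is a list of dicts that share
# the same keys; the keys of the first dict become the header row. The field
# names and output path below are illustrative placeholders.

example_units = [
    {"group_id": "p1", "review_text": "great battery life"},
    {"group_id": "p1", "review_text": "screen is too dim"},
]
write_group_to_csv("out/reviews.csv", example_units, sep="\t")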
def save_setup_str(self, dir_path, exper_descr=None):
    """
    Logs/saves the setup of the experiment, namely 3 main components:

    1. Logs the experiment's description (if `exper_descr` is provided).
    2. Train and val data pipelines' and vocab's blueprints, saved to
       dp_vocabs.txt.
    3. Model's blueprint/summary saved to model.txt.
    """
    logger.info("Experiment's output will be saved to: '%s'." % dir_path)

    # 1. experiment
    if exper_descr:
        form_exp = format_big_box(exper_descr)
        logger.info(form_exp)

    # 2. data pipeline + vocabs
    dp_fp = os.path.join(dir_path, 'dp_vocabs.txt')
    safe_mkfdir(dp_fp)
    try:
        with open(dp_fp, 'w') as f:
            f.write(str(self.word_vocab))
            f.write('===========================')
            f.write(str(self.train_data_pipeline))
            f.write('===========================')
            f.write(str(self.val_data_pipeline))
    except Exception:
        os.remove(dp_fp)
        warnings.warn("Could not get the str of the data pipelines' setup.")

    # 3. model and its interface
    m_fp = os.path.join(dir_path, 'model.txt')
    try:
        with open(m_fp, 'w') as f:
            f.write(str(self.imodel))
    except Exception:
        os.remove(m_fp)
        warnings.warn("Could not get the str of the model's setup.")
def write(self, file_path, sep=' ', encoding='utf-8'):
    """
    Writes the vocabulary to a plain text file where each line is of the
    form: {token}{sep}{count}. Default special symbols are not written.

    :param file_path: self-explanatory.
    :param sep: self-explanatory.
    :param encoding: self-explanatory.
    """
    safe_mkfdir(file_path)
    with codecs.open(file_path, 'w', encoding=encoding) as f:
        for symbol in self:
            token = symbol.token
            count = str(symbol.count)
            try:
                str_entry = sep.join([token, count])
                f.write(str_entry)
                f.write("\n")
            except Exception:
                logger.fatal("Below entry produced a fatal error in write().")
                logger.fatal(symbol.token)
                raise ValueError("Could not process a token.")
    logger.info("Vocabulary is written to: '%s'." % file_path)
def eval(self, data_source, output_file_path=None):
    """
    Assumes that batches contain SUMMS that are lists of sublists, where each
    sublist contains a fixed number of summary strings. I.e. summaries should
    not be tokenized.

    :param data_source: self-explanatory.
    :param output_file_path: optional path where the eval output is dumped
        as json.
    """
    output_dc = DataChunk(**{OutputF.GOLD_SUMMS: [], OutputF.GEN_SUMM: [],
                             OutputF.GROUP_ID: [], OutputF.CAT: [],
                             OutputF.ROUGE: [], OutputF.REV: []})
    rouge_evaluator = Rouge()
    skipped_summs = 0

    for batch in self.data_pipeline.iter(**data_source):
        # notice that each product has K true summaries created by
        # annotators
        true_summs = batch[ModelF.SUMMS]
        prod_ids = batch[ModelF.SUMM_GROUP_ID]
        cats = batch[ModelF.SUMM_CAT]

        # getting group reviews that were used as input to produce summaries
        inp_revs = self.revs_formatter_func(batch[ModelF.REV])
        group_rev_indxs = batch[ModelF.GROUP_REV_INDXS]
        group_rev_indxs_mask = batch[ModelF.GROUP_REV_INDXS_MASK]
        group_revs = get_group_reviews(inp_revs, group_rev_indxs,
                                       group_rev_indxs_mask)

        gen_summs = self.summs_gen_func(batch)

        assert (len(true_summs) == len(gen_summs))

        # accumulating ROUGE statistics
        res = []
        for gen_summ, _true_summs in zip(gen_summs, true_summs):
            if len(gen_summ) == 0:
                skipped_summs += 1
                res.append(None)
                continue
            # extra [] wrapping is needed as the accum method is batch based
            r_avg, _, r_max, _ = rouge_evaluator.accum(hypotheses=[gen_summ],
                                                       references=[_true_summs])
            if self.avg_rouge:
                curr_rouge = r_avg
            else:
                curr_rouge = r_max
            res.append(curr_rouge)

        # splitting by the sentence for better visualization
        if self.sent_splitter:
            group_revs = self.split_group_seqs_by_sents(group_revs)
            true_summs = self.split_group_seqs_by_sents(true_summs)
            gen_summs = [self.sent_splitter(summ) for summ in gen_summs]

        # storing the output batch for later dumping
        output_dc[OutputF.GOLD_SUMMS] += true_summs
        output_dc[OutputF.GEN_SUMM] += gen_summs
        output_dc[OutputF.REV] += group_revs
        output_dc[OutputF.CAT] += list(cats)
        output_dc[OutputF.GROUP_ID] += list(prod_ids)
        output_dc[OutputF.ROUGE] += res

    # running analytics
    if self.analytics_func:
        if self.sent_splitter:
            # performing a preliminary merge of sentences
            summs_to_analyze = [" ".join(sents)
                                for sents in output_dc[ModelF.GEN_SUMM]]
        else:
            summs_to_analyze = output_dc[ModelF.GEN_SUMM]
        res = self.analytics_func(summs_to_analyze)
        logger.info("Ran analytics of generated summaries.")
        logger.info(" ".join("%s: %.3f" % (k, v) for k, v in res))

    final_metrs = rouge_evaluator.aggr(avg=self.avg_rouge)
    logger.info("ROUGE scores (avg_rouge=%s):" % self.avg_rouge)
    for k, v in final_metrs.items():
        logger.info("%s based avg. %s." % (k, metrics_to_str(v)))

    # this is a safe way to make proper arrays
    for k in output_dc:
        l = len(output_dc[k])
        cont = np.zeros(l, dtype='object')
        for indx in range(l):
            cont[indx] = output_dc[k][indx]
        output_dc[k] = cont

    if output_file_path:
        gr_fields = [OutputF.CAT, OutputF.GROUP_ID]
        safe_mkfdir(output_file_path)
        output_file = codecs.open(output_file_path, 'w')
        output_dc.to_json(f=output_file, grouping_fnames=gr_fields)
        logger.info("Wrote the eval output to: '%s'." % output_file_path)

    logger.info("Did not generate %d summaries." % skipped_summs)
def eval(self, data_source, out_file_path=None):
    """
    Assumes that batches contain SUMMS that are lists of sublists, where each
    sublist contains a fixed number of summary strings. I.e. summaries should
    not be tokenized.
    """
    output_dc = DataChunk(**{OutDataF.GOLD_SUMMS: [], OutDataF.GEN_SUMM: [],
                             OutDataF.GROUP_ID: [], OutDataF.CAT: [],
                             OutDataF.ROUGE: [], OutDataF.PROPS: [],
                             OutDataF.PRED_PROPS: [], OutDataF.INP_REV: []})
    rouge_evaluator = GoogleRouge()
    prop_old_new_fnames = {ModelF.LEN_PROP: OutDataF.LEN_PROP,
                           ModelF.RATING_PROP: OutDataF.RATING_PROP,
                           ModelF.ROUGE_PROP: OutDataF.ROUGE_PROP,
                           ModelF.POV_PROP: OutDataF.POV_PROP}
    skipped_summs = 0
    gen_summs_coll = []

    for batch in self.data_pipeline.iter(**data_source):
        # notice that each product has K true summaries created by
        # annotators
        inp_revs = batch[ModelF.REV].numpy()
        true_summs = batch[ModelF.SUMMS]
        group_ids = list(batch[ModelF.GROUP_ID])
        cats = list(batch[ModelF.CAT])
        group_rev_indxs = batch[ModelF.GROUP_REV_INDXS].numpy()
        group_rev_indxs_mask = batch[ModelF.GROUP_REV_INDXS_MASK].numpy()

        gen_summ, pred_props = self.summs_gen_func(batch)

        assert (len(true_summs) == len(gen_summ))

        # below one will be used for analytics
        gen_summs_coll += gen_summ

        # props
        props = {n: batch[o].tolist()
                 for o, n in prop_old_new_fnames.items()}
        props = dct_list_to_list_dict(props)
        output_dc[OutDataF.PROPS] += props
        if len(pred_props):
            pred_props = dct_list_to_list_dict(pred_props)
            output_dc[OutDataF.PRED_PROPS] += pred_props

        # accumulating ROUGE statistics
        rouge_scores = []
        for _gen_summ, _tr_summs in zip(gen_summ, true_summs):
            if len(_gen_summ) == 0:
                skipped_summs += 1
                rouge_scores.append(None)
                continue
            # extra [] wrapping is needed as the accum method is batch based
            r_score = rouge_evaluator.accum(hyp=[_gen_summ],
                                            refs=[_tr_summs])[0]
            rouge_scores.append(r_score)

        # grouping and formatting
        if self.rev_formatter_func is not None:
            inp_revs = [self.rev_formatter_func(seq) for seq in inp_revs]
        group_revs = get_group_reviews(inp_revs, group_rev_indxs,
                                       group_rev_indxs_mask)
        if self.summ_formatter_func is not None:
            true_summs = [[self.summ_formatter_func(s) for s in _summs]
                          for _summs in true_summs]
            gen_summ = [self.summ_formatter_func(s) for s in gen_summ]

        # storing the output batch for later dumping
        output_dc[OutDataF.GOLD_SUMMS] += true_summs
        output_dc[OutDataF.GEN_SUMM] += gen_summ
        output_dc[OutDataF.INP_REV] += group_revs
        output_dc[OutDataF.CAT] += cats
        output_dc[OutDataF.GROUP_ID] += group_ids
        output_dc[OutDataF.ROUGE] += rouge_scores

    # some models don't output predicted props; this condition deals with it
    if not len(output_dc[OutDataF.PRED_PROPS]):
        del output_dc[OutDataF.PRED_PROPS]

    # running analytics
    if self.analytics_func:
        formatted_true_summs = []
        for seq_coll in output_dc[OutDataF.GOLD_SUMMS]:
            for seq in seq_coll:
                seq = seq if not isinstance(seq, list) else " ".join(seq)
                formatted_true_summs.append(seq)
        an_scores = self.analytics_func(formatted_true_summs)
        form_an_scores = format_stats(an_scores, title="True Text Analytics")
        for s in form_an_scores:
            logger.info(s)

        an_scores = self.analytics_func(gen_summs_coll)
        form_an_scores = format_stats(an_scores, title="Gen. Text Analytics")
        for s in form_an_scores:
            logger.info(s)

    final_metrs = rouge_evaluator.aggr()
    form_final_metrs = format_stats(final_metrs, title="ROUGE Scores")
    for s in form_final_metrs:
        logger.info(s)

    if out_file_path:
        gr_fields = [OutDataF.CAT, OutDataF.GROUP_ID]
        safe_mkfdir(out_file_path)
        output_file = codecs.open(out_file_path, 'w')
        output_dc.to_json(f=output_file, grouping_fnames=gr_fields)
        logger.info("Wrote the eval output to: '%s'." % out_file_path)

    logger.info("Did not generate %d summaries." % skipped_summs)
def gen_and_save_summs(self, data_source, output_file_path):
    """
    Generates summaries by running the model and writes them along with
    other attributes to a json file.

    :param data_source: self-explanatory.
    :param output_file_path: self-explanatory.
    """
    safe_mkfdir(output_file_path)
    start_id = self.word_vocab[START].id
    end_id = self.word_vocab[END].id
    pad_id = self.word_vocab[PAD].id
    output_file = open(output_file_path, encoding='utf-8', mode='w')
    vocab_mapper = VocabMapper({ModelF.REV: self.word_vocab,
                                ModelF.GEN_SUMM: self.word_vocab,
                                ModelF.GEN_REV: self.word_vocab},
                               symbols_attr='token')
    chunk_coll = []
    for i, dc in enumerate(self.val_data_pipeline.iter(**data_source), 1):
        gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

        # converting to the data-chunk to use the internal writing
        # mechanism
        new_dc = DataChunk()
        for fn in [ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                   ModelF.GROUP_ID]:
            new_dc[fn] = dc[fn]
        new_dc[ModelF.GEN_REV] = gen_revs
        new_dc[ModelF.GEN_SUMM] = gen_summ

        seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]

        # converting PyTorch tensors to numpy arrays if present
        new_dc = convert_tensors_to_numpy(new_dc)

        for fn in seq_fnames:
            new_dc[fn] = format_seqs(new_dc[fn], start_id=start_id,
                                     end_id=end_id, pad_id=pad_id)
        new_dc = vocab_mapper(new_dc)

        # convert all seqs to strings
        for fn in seq_fnames:
            new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

        # group by product ids
        indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                   new_dc[ModelF.GROUP_ID]).values()
        for fn in [ModelF.GEN_REV, ModelF.REV]:
            new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

        del new_dc[ModelF.GROUP_ID]
        chunk_coll.append(new_dc)

    output_chunk = concat_chunks(*chunk_coll)
    output_chunk.to_json(f=output_file,
                         grouping_fnames=[ModelF.SUMM_CAT,
                                          ModelF.SUMM_GROUP_ID])
    logger.info("Generated summaries and saved to: '%s'." % output_file_path)

    # analytics for repetitions checking
    # because gen summs contain lists of strings, they need to be merged
    # together before running analytics
    all_gen_summ_strs = [" ".join(sents)
                         for sents in output_chunk[ModelF.GEN_SUMM]]
    an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                    tokenizer=self.tok_func,
                                    sent_splitter=self.sent_split_func,
                                    n_grams_to_comp=(2, 3, 4))
    logger.info("Ran analytics of generated summaries.")
    metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
    logger.info(metrs_str)
def gen_seqs(self, data_source, out_file_path, **kwargs):
    """Generates sequences and saves them to a file."""
    assert self.seq_gen_pipeline is not None
    empty_cache()
    safe_mkfdir(out_file_path)
    logger.info(f'Generating conditional sequences/summaries for '
                f'{data_source}.')

    # storing to a data-chunk and dumping to the storage
    output_dc = DataChunk()
    fields_to_init = [OutDataF.GROUP_ID, OutDataF.INP_REV, OutDataF.GEN_REV,
                      OutDataF.REV_INDX, OutDataF.INP_REV_RATING,
                      OutDataF.INP_REV_LEN, OutDataF.GEN_REV_LEN,
                      OutDataF.PROPS, OutDataF.PRED_PROPS]
    for fname in fields_to_init:
        output_dc[fname] = []

    prop_old_new_fnames = {ModelF.LEN_PROP: OutDataF.LEN_PROP,
                           ModelF.RATING_PROP: OutDataF.RATING_PROP,
                           ModelF.ROUGE_PROP: OutDataF.ROUGE_PROP,
                           ModelF.POV_PROP: OutDataF.POV_PROP}

    for batch in self.seq_gen_pipeline.iter(**data_source):
        rev_group_id = batch[ModelF.REV_GROUP_ID]
        rev_indx = _comp_rev_indx(rev_group_id)
        rating = batch[ModelF.REV_RATING]
        inp_rev = batch[ModelF.REV].numpy()

        gen_rev, pred_props = self.imodel.generate(batch=batch, **kwargs)

        # post-processing
        if self.gen_seq_postproc:
            inp_rev = [self.gen_seq_postproc(seq) for seq in inp_rev]
            gen_rev = [self.gen_seq_postproc(seq) for seq in gen_rev]

        output_dc[OutDataF.GROUP_ID] += rev_group_id
        output_dc[OutDataF.INP_REV] += inp_rev
        output_dc[OutDataF.GEN_REV] += gen_rev
        output_dc[OutDataF.INP_REV_RATING] += rating
        output_dc[OutDataF.REV_INDX] += rev_indx

        # for analytics
        inp_seq_len = [_comp_seq_len(seq) for seq in inp_rev]
        gen_seq_len = [_comp_seq_len(seq) for seq in gen_rev]
        output_dc[OutDataF.INP_REV_LEN] += inp_seq_len
        output_dc[OutDataF.GEN_REV_LEN] += gen_seq_len

        # props
        props = {n: batch[o].tolist()
                 for o, n in prop_old_new_fnames.items()}
        props = dct_list_to_list_dict(props)
        output_dc[OutDataF.PROPS] += props
        if len(pred_props):
            pred_props = dct_list_to_list_dict(pred_props)
            output_dc[OutDataF.PRED_PROPS] += pred_props

    # some models don't output predicted props; this condition deals with it
    if not len(output_dc[OutDataF.PRED_PROPS]):
        del output_dc[OutDataF.PRED_PROPS]

    # running analytics of text
    if self.seq_analytics:
        for text_fname in [OutDataF.INP_REV, OutDataF.GEN_REV]:
            formatted_seqs = []
            for seq in output_dc[text_fname]:
                seq = seq if not isinstance(seq, list) else " ".join(seq)
                formatted_seqs.append(seq)
            fscores = format_stats(self.seq_analytics(formatted_seqs),
                                   f"`{text_fname.upper()}` Text Analytics")
            for s in fscores:
                logger.info(s)

    output_dc.to_json(f=codecs.open(out_file_path, 'w', 'utf-8'),
                      grouping_fnames=[OutDataF.GROUP_ID, OutDataF.REV_INDX])
    logger.info("Generated sequences and saved to: '%s'." % out_file_path)