def __init__(self):
    """Base run configuration: general knobs, data paths, BPE/truecaser
    artifacts and decoding settings. Paths are derived from `dataset`,
    so assignment order matters.
    """
    super(BaseRunConfig, self).__init__(excl_print_attrs=['exper_descr'])

    # GENERAL #
    self.exper_descr = 'Description of the experiment.'
    self.seed = 42
    self.cuda_device_ids = [0]
    self.training_logging_step = 100
    self.shuffler_buffer_size = 50
    self.grads_clip = 0.25
    self.strict_load = True

    # GENERAL DATA RELATED #
    self.dataset = 'amazon'
    self.min_rev_per_group = 9
    self.max_rev_per_group = 9

    # DATA SOURCES #
    self.train_early_term = None
    self.val_early_term = None

    # DATA PATHS #
    self.base_data_path = f"artifacts/{self.dataset}"
    self.train_fp = comb_paths(self.base_data_path, "reviews/train/")
    self.val_fp = comb_paths(self.base_data_path, 'reviews/val/')
    self.gold_train_fp = comb_paths(self.base_data_path, 'gold_summs/train.csv')
    self.gold_val_fp = comb_paths(self.base_data_path, 'gold_summs/val.csv')
    self.gold_test_fp = comb_paths(self.base_data_path, 'gold_summs/test.csv')
    self.word_vocab_fp = comb_paths(self.base_data_path, "misc/tc_word_train.txt")
    self.checkpoint_fn = 'checkpoint.tar'

    # FREEZING AND UNFREEZING #
    self.modules_to_unfreeze = []

    # BPE AND TRUECASER #
    self.subword_num = 32000
    self.bpe_fp = comb_paths(self.base_data_path,
                             f'misc/bpe_{self.subword_num}_train.int')
    self.bpe_vocab_fp = comb_paths(self.base_data_path,
                                   f'misc/bpe_{self.subword_num}_train.txt')
    self.tcaser_model_path = comb_paths(self.base_data_path, 'misc',
                                        'tcaser.model')

    # DECODING / GENERATION #
    self.min_seq_len = 20
    self.max_seq_len = 105
    # NOTE(review): `seq_max_len` duplicates `max_seq_len`; kept because
    # external code may read either name -- confirm before removing.
    self.seq_max_len = 105
    self.beam_size = 50
    self.block_ngram_repeat = 3
    self.ngram_mirror_window = 3
    self.mirror_conjs = ["and", "or", ",", "but"]
    self.block_consecutive = True
def after_ep_func(epoch=None):
    """Post-epoch hook: creates an output folder, saves a model checkpoint
    and evaluates generated summaries against gold ones.

    :param epoch: epoch number; if falsy, a generic `out` folder is used
        and no checkpoint is written.
    """
    new_out_path = comb_paths(out_dir_path, "out_ep%d" % epoch) \
        if epoch else comb_paths(out_dir_path, "out")
    safe_mkdir(new_out_path)
    # saving the state; inside this guard `epoch` is guaranteed not None,
    # so the name is always epoch-prefixed (the original re-tested
    # `epoch is None` here in a dead branch)
    if checkpoint_fn is not None and epoch is not None:
        new_checkpoint_fn = "ep%d_%s" % (epoch, checkpoint_fn)
        out_fp = comb_paths(out_dir_path, new_checkpoint_fn)
        self.imodel.save_state(out_fp)
    # running evaluation against gold summaries
    if summ_eval_data_source is not None:
        self.summ_eval(new_out_path, summ_eval_data_source,
                       **summ_eval_kwargs)
def after_ep_func(epoch):
    """Checkpoints the model after `epoch`, then runs summary evaluation
    and sequence generation into an epoch-specific output folder.
    """
    checkpoint_fp = comb_paths(run_hp.output_path,
                               'ep%d_%s' % (epoch, run_hp.checkpoint_full_fn))
    imodel.save_state(checkpoint_fp)

    gen_folder_path = comb_paths(run_hp.output_path, "output_ep%d" % epoch)
    summ_gen = partial(idev.summ_generator, summ_post_proc=summ_post_proc)
    summ_eval(output_folder=gen_folder_path,
              data_pipeline=eval_data_pipeline,
              eval_data_source=eval_dev_data_source,
              summ_gen_func=summ_gen,
              rev_formatter_func=idev.format_revs,
              avg_rouge=True,
              sent_splitter=run_hp.sent_split_func,
              analytics_func=run_hp.analytics_func)
    gen_seqs(data_sources=gen_data_sources,
             output_folder=gen_folder_path,
             gen_func=partial(idev.gen_and_save_summs))
def run(self):
    """Reads the raw input file and writes each group's data-units to a
    tab-separated `<group_id>.csv` under `self.act_out_dir_path`.
    """
    init_logger(LOGGER_NAME)
    logger.info("Preparing `%s`." % self.inp_file_path)
    # use a distinct name -- the original shadowed the `iter` builtin
    if GlobalConfig().dataset == 'amazon':
        data_iter = read_amazon_data(self.inp_file_path, replace_xml=True)
    else:
        data_iter = read_yelp_data(self.inp_file_path)
    for group_id, dus in data_iter:
        full_file_name = "%s.csv" % group_id
        out_file_path = comb_paths(self.act_out_dir_path, full_file_name)
        write_group_to_csv(out_file_path, dus, sep='\t')
def gen_seqs(output_folder, gen_func, data_sources):
    """
    Generates the output (reconstructed sequences and summaries) based on
    the provided data-sources.
    """
    sources = data_sources if isinstance(data_sources, list) else [data_sources]
    logger.info("Performing generation of sequences/summaries.")
    for src in sources:
        fn = get_file_name(src['data_path'])
        if not fn:
            # the data path points at a directory; fall back to its name
            _, fn = os.path.split(os.path.dirname(src['data_path']))
        gen_func(src, comb_paths(output_folder, "%s.json" % fn))
def summ_eval(self, out_dir_path, data_source, **kwargs):
    """Runs evaluation of summaries."""
    assert self.eval_pipeline is not None
    empty_cache()
    out_file_path = comb_paths(
        out_dir_path,
        "%s_eval.json" % get_file_name(data_source['data_path']))
    logger.info("Performing summary evaluation on %s." % data_source)
    eval_proc = SummEvalProc(
        self.eval_pipeline,
        summs_gen_func=partial(self.summ_gen_wrapper, **kwargs),
        rev_formatter_func=self.gen_seq_postproc,
        summ_formatter_func=self.summ_postproc,
        analytics_func=self.seq_analytics)
    eval_proc.eval(data_source, out_file_path=out_file_path)
def postprocess_data(data_path, out_dir_path, min_revs_per_file=None,
                     workers=1, max_revs_per_file=9, early_term=None,
                     logging_period=1000):
    """
    Creates `K` reviews per group files, computes ROUGE 1 vs rest.
    In this case, avoids an expensive online computation of ROUGE.

    :param data_path: input data to post-process.
    :param out_dir_path: directory receiving per-group CSV chunks.
    :param min_revs_per_file: minimum reviews a group must have.
    :param workers: number of pipeline workers.
    :param max_revs_per_file: maximum reviews per written chunk.
    :param early_term: limits the number of chunks (debugging aid).
    :param logging_period: log progress every N written chunks.
    """
    logger = init_logger("", output_path=os.path.dirname(out_dir_path))
    dt = MosesDetokenizer()
    detok_func = lambda x: [dt.detokenize(_x.split(" "), unescape=False)
                            for _x in x]
    # NOTE(review): `seed` is not defined in this function's scope; it is
    # presumably a module-level global -- verify it exists before calling.
    data_pipeline = assemble_postproc_pipeline(
        text_prep_func=detok_func, seed=seed,
        min_revs_per_group=min_revs_per_file,
        max_revs_per_group=max_revs_per_file, workers=workers)
    logger.info("Writing chunks to: '%s'." % out_dir_path)
    safe_mkdir(out_dir_path)
    chunks_count = 0
    start = time()
    unique_groups = set()
    review_count = 0
    min_rev_per_chunk = float('inf')
    max_rev_per_chunk = float('-inf')
    for dc in data_pipeline.iter(data_path=data_path, early_term=early_term):
        # each chunk must belong to exactly one group
        assert len(np.unique(dc[InpDataF.GROUP_ID])) == 1
        group_id = dc[0, InpDataF.GROUP_ID].split("_")[0]
        unique_groups.add(group_id)
        review_count += len(dc)
        min_rev_per_chunk = min(min_rev_per_chunk, len(dc))
        max_rev_per_chunk = max(max_rev_per_chunk, len(dc))
        fp = comb_paths(out_dir_path, "%s.csv" % dc[0][InpDataF.GROUP_ID])
        # the original passed a bare `open(...)` and leaked the handle
        with open(fp, encoding='utf-8', mode='w') as f:
            dc.to_csv(f)
        chunks_count += 1
        if chunks_count % logging_period == 0:
            logger.info("Wrote %d chunks." % chunks_count)
    logger.info("Totally wrote %d chunks." % chunks_count)
    logger.info("Total time elapsed: %f." % (time() - start))
    logger.info("Unique groups: %d." % len(unique_groups))
    logger.info("Total reviews: %d." % review_count)
    if chunks_count:
        # guard: with zero chunks the min/max are +/-inf and '%d' would
        # raise OverflowError in the original
        logger.info("Min reviews per chunk: %d." % min_rev_per_chunk)
        logger.info("Max reviews per chunk: %d." % max_rev_per_chunk)
def summ_eval(output_folder, data_pipeline, eval_data_source, summ_gen_func,
              rev_formatter_func, sent_splitter=None, avg_rouge=True,
              analytics_func=None):
    """Performs evaluation based on Amazon and YELP summaries."""
    src_fn = get_file_name(eval_data_source['data_path'])
    output_path = comb_paths(output_folder, "%s_eval.json" % src_fn)
    logger.info("Performing the summary evaluation on %s." % eval_data_source)
    evaluator = SummEvalProc(data_pipeline,
                             avg_rouge=avg_rouge,
                             summs_gen_func=summ_gen_func,
                             sent_splitter=sent_splitter,
                             revs_formatter_func=rev_formatter_func,
                             analytics_func=analytics_func)
    evaluator.eval(eval_data_source, output_file_path=output_path)
def run(self):
    """Tokenizes every group CSV file in the input folder, writing the
    results to `self.act_out_dir_path`; files that fail CSV parsing are
    counted and reported instead of aborting the run.
    """
    # TODO: it seems that if I assign a file tokenizer to each file in a loop
    # TODO: as a dependency, it makes the whole system super slow if the
    # TODO: number of groups (=files) is very large
    init_logger(LOGGER_NAME)
    inp_folder = self.input()[0]
    failed_file_count = 0
    logger.info("Tokenizing `%s`." % inp_folder.path)
    for file_path in iter_file_paths(inp_folder.path):
        out_file_path = comb_paths(self.act_out_dir_path,
                                   "%s.csv" % get_file_name(file_path))
        try:
            # TODO: `_csv.Error: line contains NULL byte` was encountered in
            # TODO: a small number of files; the cause needs to be
            # TODO: investigated
            TokenizeFile(inp_file_path=file_path,
                         out_file_path=out_file_path).run()
        except CsvError:
            failed_file_count += 1
    logger.info('Failed to tokenize `%d` files in `%s`.'
                % (failed_file_count, inp_folder.path))
# data sources for evaluation and generation
eval_dev_data_source = {"data_path": run_hp.eval_dev_fp}
eval_test_data_source = {"data_path": run_hp.eval_test_fp}
gen_data_sources = [
    {"data_path": fp, 'early_term': run_hp.gener_early_term}
    for fp in (run_hp.train_fp, run_hp.val_fp)
]

os.environ['CUDA_VISIBLE_DEVICES'] = str(run_hp.cuda_device_id)

logger = init_logger(logger_name="", level=INFO,
                     output_path=comb_paths(run_hp.output_path, "log.txt"))
logger.info('CUDA_VISIBLE_DEVICES=%s'
            % os.environ.get('CUDA_VISIBLE_DEVICES'))

# KL ANNEALING MECHANISMS #
c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches, m=run_hp.c_m,
                          r=run_hp.c_r, max_val=run_hp.c_kl_ann_max_val)
# NOTE(review): `r=run_hp.c_r` is reused for the z-annealer although RunHP
# defines a `z_c` attribute -- confirm this is intentional.
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches, m=run_hp.z_m,
                          r=run_hp.c_r, max_val=run_hp.z_kl_ann_max_val)

# PIPELINES AND VOCAB #
# data sources for evaluation and generation
eval_test_data_source = {"data_path": run_hp.eval_test_fp}
gen_data_sources = [
    {"data_path": fp, 'early_term': run_hp.gener_early_term}
    for fp in (run_hp.train_fp, run_hp.val_fp)
]

os.environ['CUDA_VISIBLE_DEVICES'] = str(run_hp.cuda_device_id)

experiments_descr = 'My first experiment with the model.'

logger = init_logger(logger_name="", level=INFO,
                     output_path=comb_paths(run_hp.output_path, "log.txt"))
logger.info('CUDA_VISIBLE_DEVICES=%s'
            % os.environ.get('CUDA_VISIBLE_DEVICES'))

# KL ANNEALING MECHANISMS #
c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches, m=run_hp.c_m,
                          r=run_hp.c_r, max_val=run_hp.c_kl_ann_max_val)
# NOTE(review): `r=run_hp.c_r` is reused for the z-annealer although RunHP
# defines a `z_c` attribute -- confirm this is intentional.
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches, m=run_hp.z_m,
                          r=run_hp.c_r, max_val=run_hp.z_kl_ann_max_val)

# PIPELINES AND VOCAB #
def __init__(self, *args, **kwargs):
    """Initializes the config and nests the output dir under the dataset."""
    super(GlobalConfig, self).__init__(*args, **kwargs)
    # write all outputs into a per-dataset sub-folder
    dataset_out_dir = comb_paths(self.out_dir_path, self.dataset)
    self.out_dir_path = dataset_out_dir
def __init__(self): super(RunHP, self).__init__() # GENERAL # self.seed = 42 self.cuda_device_id = 6 self.device = 'cpu' # 'cuda' or 'cpu' self.training_logging_step = 50 # how often to print internal metrics self.epochs = 10 # if set to 0 will immediately just to evaluation self.learning_rate = 0.0005 self.grads_clip = 0.25 # GENERAL DATA RELATED # self.dataset = 'yelp' self.train_max_groups_per_batch = 6 self.val_max_groups_per_batch = 13 self.eval_max_groups_per_batch = 20 self.max_rev_per_group = 8 # DATA SOURCES # # `early_term` limits the number of chunks per epoch self.train_early_term = None self.val_early_term = None self.gener_early_term = 2 # GENERAL PATHS # self.root_path = 'copycat' self.experiments_folder = 'first_run' self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}' self.checkpoint_full_fn = 'checkpoint.tar' epc = ExperimentsPathController() self.output_path = epc(self.output_dir) self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar' self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model' # DATA PATHS # self.base_data_path = f'data/{self.dataset}/' self.train_fp = comb_paths(self.base_data_path, "split/train/") self.val_fp = comb_paths(self.base_data_path, 'split/val/') self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt' self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv') self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv') # ANNEALING # self.c_m = 8. self.c_r = 0.8 self.c_kl_ann_max_val = 1. self.c_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000 self.z_m = 8. self.z_c = 0.8 self.z_kl_ann_max_val = 1. 
self.z_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000 # DECODING/GENERATION # self.beam_size = 5 self.beam_len_norm = True self.beam_excl_words = [] self.block_ngram_repeat = 3 # or None self.ngram_mirror_window = 3 # or None self.mirror_conjs = ["and", 'or', ',', 'but'] # or None self.block_consecutive = True self.min_gen_seq_len = 20 # POST-PROCESSING AND ANALYTICS # mt = MosesTokenizer() self.tok_func = partial(mt.tokenize, escape=False) self.sent_split_func = nltk.sent_tokenize dt = MosesDetokenizer() self.detok_func = partial(dt.detokenize, unescape=False) true_caser = MosesTruecaser(load_from=self.tcaser_model_path, is_asr=True) self.true_case_func = partial(true_caser.truecase, return_str=True, use_known=True) self.analytics_func = partial(ngram_seq_analysis, tokenizer=self.tok_func, sent_splitter=self.sent_split_func, n_grams_to_comp=(2, 3, 4))
help='Sets the regime of training/inference.', required=True) parser.add_argument( '--inference', action='store_true', help='If set, will perform inference/summary generation otherwise training.' ) regime = parser.parse_args().regime inference = parser.parse_args().inference run_conf = RUN_CONFIG_REGISTRY[regime]() logger = init_logger(logger_name="", level=INFO, output_path=comb_paths(run_conf.output_path, "log.txt")) # ENV and hyper-params handling # manual_seed(run_conf.seed) np.random.seed(run_conf.seed) cuda_visible_devices = str(run_conf.cuda_device_ids) \ if isinstance(run_conf.cuda_device_ids, int) else \ ",".join([str(dev_id) for dev_id in run_conf.cuda_device_ids]) os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices device_count = 1 if not isinstance(run_conf.cuda_device_ids, list) \ else max(1, len(run_conf.cuda_device_ids)) device = 'cuda' if len(run_conf.cuda_device_ids) > 0 else 'cpu' logger.info('CUDA_VISIBLE_DEVICES=%s' % cuda_visible_devices) # DATA SOURCES #
def run(self):
    """Subsamples review groups.

    Steps: (1) read per-group CSVs, dropping reviews outside the
    [min_rev_len, max_rev_len] word range and groups with fewer than
    `min_revs` surviving reviews; (2) keep only groups below the
    `percentile`-th size percentile, capped by `max_total_revs`;
    (3) dump the kept groups to CSV; (4) log statistics.

    :raises ValueError: when filtering or subsampling removes everything.
    """
    log_file_path = comb_paths(
        GlobalConfig().out_dir_path, "logs", FOLDER_NAME,
        "%s.txt" % get_file_name(self.inp_file_path))
    init_logger(LOGGER_NAME, output_path=log_file_path)
    init_unit_count = 0
    init_group_count = 0
    group_id_to_units = {}
    group_unit_counts = []  # per-group review counts after filtering
    # 1. reading data and filtering out short/long reviews and
    # unpopular groups
    inp_dir = self.input()[0]
    logger.info("Subsampling `%s`." % inp_dir.path)
    for inp_file_path in iter_file_paths(inp_dir.path):
        group_id = get_file_name(inp_file_path)
        group_units = []
        init_group_count += 1
        for data_unit in read_csv_file(inp_file_path, sep='\t'):
            init_unit_count += 1
            rev_text = data_unit[OutputFields.REV_TEXT].split()
            # removing too short and long reviews
            if len(rev_text) < self.min_rev_len or \
                    len(rev_text) > self.max_rev_len:
                continue
            group_units.append(data_unit)
        # removing unpopular groups
        if len(group_units) < self.min_revs:
            continue
        group_id_to_units[group_id] = group_units
        group_unit_counts.append(len(group_units))
    if not len(group_id_to_units):
        raise ValueError("No groups to proceed.")
    # 2. filtering by percentile
    perc = np.percentile(group_unit_counts, self.percentile)
    # removing above a kth percentile groups
    subs_group_id_to_units = {}
    subs_units_count = 0
    subs_units_max_count = 0
    for group_id, group_units in group_id_to_units.items():
        # `perc == 1.` means most groups have just one review; in that
        # case the percentile filter is bypassed
        if len(group_units) < perc or perc == 1.:
            # making sure that the subsampled number of reviews does not
            # exceed a threshold unless most of the businesses only have
            # one review
            if self.max_total_revs is not None \
                    and (subs_units_count + len(
                        group_units)) > self.max_total_revs:
                break
            subs_units_count += len(group_units)
            subs_units_max_count = max(subs_units_max_count,
                                       len(group_units))
            subs_group_id_to_units[group_id] = group_units
    if subs_units_count == 0:
        raise ValueError("All units were subsampled out. "
                         "Please adjust the parameters.")
    # 3. dumping to files
    write_groups_to_csv(self.act_out_dir_path, subs_group_id_to_units,
                        sep='\t')
    # 4. logging statistics
    stats = OrderedDict()
    stats['General'] = OrderedDict()
    stats['General']['inp dir'] = inp_dir.path
    stats['Initial'] = OrderedDict()
    stats['Initial']['group count'] = init_group_count
    stats['Initial']['unit count'] = init_unit_count
    stats['After Filtering'] = OrderedDict()
    stats['After Filtering']['group count'] = len(group_id_to_units)
    stats['After Filtering']['unit count'] = np.sum(group_unit_counts)
    stats['After Filtering']['percentile count'] = perc
    stats['After Subsampling'] = OrderedDict()
    stats['After Subsampling']['group count'] = len(subs_group_id_to_units)
    stats['After Subsampling']['unit count'] = subs_units_count
    stats['After Subsampling'][
        'max units per group'] = subs_units_max_count
    stats_str = format_stats(stats)
    logger.info(stats_str)
def write_groups_to_csv(out_dir_path, group_id_to_units, sep='\t'):
    """Writes each group's data-units to `<group_id>.csv` in `out_dir_path`."""
    for group_id, units in group_id_to_units.items():
        target_fp = comb_paths(out_dir_path, "%s.csv" % group_id)
        write_group_to_csv(target_fp, units, sep=sep)