Example #1
    def __init__(self):
        super(BaseRunConfig, self).__init__(excl_print_attrs=['exper_descr'])

        #   GENERAL  #
        self.exper_descr = 'Description of the experiment.'
        self.seed = 42
        self.cuda_device_ids = [0]
        self.training_logging_step = 100
        self.shuffler_buffer_size = 50
        self.grads_clip = 0.25
        self.strict_load = True

        #   GENERAL DATA RELATED  #
        self.dataset = 'amazon'
        self.min_rev_per_group = 9
        self.max_rev_per_group = 9

        #   DATA SOURCES  #
        self.train_early_term = None
        self.val_early_term = None

        #   DATA PATHS  #
        self.base_data_path = f"artifacts/{self.dataset}"
        self.train_fp = comb_paths(self.base_data_path, "reviews/train/")
        self.val_fp = comb_paths(self.base_data_path, 'reviews/val/')

        self.gold_train_fp = comb_paths(self.base_data_path,
                                        'gold_summs/train.csv')
        self.gold_val_fp = comb_paths(self.base_data_path,
                                      'gold_summs/val.csv')
        self.gold_test_fp = comb_paths(self.base_data_path,
                                       'gold_summs/test.csv')

        self.word_vocab_fp = comb_paths(self.base_data_path,
                                        "misc/tc_word_train.txt")
        self.checkpoint_fn = 'checkpoint.tar'

        #   FREEZING AND UNFREEZING   #
        self.modules_to_unfreeze = []

        #     BPE AND TRUECASER     #
        self.subword_num = 32000
        self.bpe_fp = comb_paths(self.base_data_path,
                                 'misc/bpe_%d_train.int' % self.subword_num)
        self.bpe_vocab_fp = comb_paths(
            self.base_data_path, 'misc/bpe_%d_train.txt' % self.subword_num)
        self.tcaser_model_path = comb_paths(self.base_data_path, 'misc',
                                            'tcaser.model')

        #   DECODING / GENERATION  #
        self.min_seq_len = 20
        self.max_seq_len = 105
        self.beam_size = 50
        self.block_ngram_repeat = 3
        self.ngram_mirror_window = 3
        self.mirror_conjs = ["and", "or", ",", "but"]
        self.block_consecutive = True
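
A minimal sketch of specializing such a config; `BaseRunConfig` and `comb_paths` come from the example above, while the subclass name and the overridden values are purely illustrative:

class YelpRunConfig(BaseRunConfig):
    """Hypothetical subclass that overrides a few of the defaults above."""

    def __init__(self):
        super(YelpRunConfig, self).__init__()
        # fields derived from `dataset` must be recomputed after overriding it
        self.dataset = 'yelp'
        self.base_data_path = f"artifacts/{self.dataset}"
        self.train_fp = comb_paths(self.base_data_path, "reviews/train/")
        self.val_fp = comb_paths(self.base_data_path, "reviews/val/")
        self.beam_size = 10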
Example #2
        def after_ep_func(epoch=None):
            new_out_path = comb_paths(out_dir_path, "out_ep%d" % epoch) \
                if epoch is not None else comb_paths(out_dir_path, "out")
            safe_mkdir(new_out_path)

            # saving the state
            if checkpoint_fn is not None and epoch is not None:
                new_checkpoint_fn = "ep%d_%s" % (epoch, checkpoint_fn)
                out_fp = comb_paths(out_dir_path, new_checkpoint_fn)
                self.imodel.save_state(out_fp)

            # running evaluation against gold summaries
            if summ_eval_data_source is not None:
                self.summ_eval(new_out_path, summ_eval_data_source,
                               **summ_eval_kwargs)
Example #3
    def after_ep_func(epoch):
        out_fp = comb_paths(run_hp.output_path,
                            'ep%d_%s' % (epoch, run_hp.checkpoint_full_fn))
        imodel.save_state(out_fp)

        gen_folder_path = comb_paths(run_hp.output_path, "output_ep%d" % epoch)
        summ_eval(output_folder=gen_folder_path,
                  data_pipeline=eval_data_pipeline,
                  eval_data_source=eval_dev_data_source,
                  summ_gen_func=partial(idev.summ_generator,
                                        summ_post_proc=summ_post_proc),
                  rev_formatter_func=idev.format_revs,
                  avg_rouge=True,
                  sent_splitter=run_hp.sent_split_func,
                  analytics_func=run_hp.analytics_func)
        gen_seqs(data_sources=gen_data_sources,
                 output_folder=gen_folder_path,
                 gen_func=idev.gen_and_save_summs)
Example #4
    def run(self):
        init_logger(LOGGER_NAME)
        logger.info("Preparing `%s`." % self.inp_file_path)
        if GlobalConfig().dataset == 'amazon':
            data_iter = read_amazon_data(self.inp_file_path, replace_xml=True)
        else:
            data_iter = read_yelp_data(self.inp_file_path)
        for group_id, dus in data_iter:
            full_file_name = "%s.csv" % group_id
            out_file_path = comb_paths(self.act_out_dir_path, full_file_name)
            write_group_to_csv(out_file_path, dus, sep='\t')
Example #5
def gen_seqs(output_folder, gen_func, data_sources):
    """
    Generates the output (reconstructed sequences and summaries) based on the
    provided data-sources.
    """
    if not isinstance(data_sources, list):
        data_sources = [data_sources]
    logger.info("Performing generation of sequences/summaries.")
    for data_source in data_sources:
        fn = get_file_name(data_source['data_path'])
        if fn == '':
            _, fn = os.path.split(os.path.dirname(data_source['data_path']))
        ofp = comb_paths(output_folder, "%s.json" % fn)
        gen_func(data_source, ofp)
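
The `fn == ''` fallback above handles data paths that end in a trailing slash, where the basename is empty. A self-contained illustration of that behavior with `os.path` directly (`get_file_name` itself is the repo's own helper):

import os

path = "artifacts/amazon/reviews/val/"
fn = os.path.basename(path)  # '' because of the trailing slash
if fn == '':
    _, fn = os.path.split(os.path.dirname(path))
print(fn)  # -> 'val'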
Example #6
    def summ_eval(self, out_dir_path, data_source, **kwargs):
        """Runs evaluation of summaries."""
        assert self.eval_pipeline is not None
        empty_cache()
        summ_gen_func = partial(self.summ_gen_wrapper, **kwargs)
        output_fn = "%s_eval.json" % get_file_name(data_source['data_path'])
        out_file_path = comb_paths(out_dir_path, output_fn)
        logger.info("Performing summary evaluation on %s." % data_source)
        eval_proc = SummEvalProc(self.eval_pipeline,
                                 summs_gen_func=summ_gen_func,
                                 rev_formatter_func=self.gen_seq_postproc,
                                 summ_formatter_func=self.summ_postproc,
                                 analytics_func=self.seq_analytics)
        eval_proc.eval(data_source, out_file_path=out_file_path)
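
A hypothetical call of the method above; any extra keyword arguments are frozen into `summ_gen_wrapper` via `functools.partial` before evaluation starts (the instance name and argument values are illustrative):

dev.summ_eval(out_dir_path="runs/out_ep3",
              data_source={"data_path": "data/gold/val.csv"},
              beam_size=5)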
Example #7
def postprocess_data(data_path,
                     out_dir_path,
                     min_revs_per_file=None,
                     workers=1,
                     max_revs_per_file=9,
                     early_term=None,
                     logging_period=1000,
                     seed=None):
    """
    Creates `K` reviews per group files, computes ROUGE 1 vs rest. In this case,
    avoids an expensive online computation of ROUGE.
    """
    logger = init_logger("", output_path=os.path.dirname(out_dir_path))
    dt = MosesDetokenizer()
    detok_func = lambda x: [
        dt.detokenize(_x.split(" "), unescape=False) for _x in x
    ]
    data_pipeline = assemble_postproc_pipeline(
        text_prep_func=detok_func,
        seed=seed,
        min_revs_per_group=min_revs_per_file,
        max_revs_per_group=max_revs_per_file,
        workers=workers)
    logger.info("Writing chunks to: '%s'." % out_dir_path)
    safe_mkdir(out_dir_path)
    chunks_count = 0
    start = time()
    unique_groups = set()
    review_count = 0
    min_rev_per_chunk = float('inf')
    max_rev_per_chunk = float('-inf')
    for dc in data_pipeline.iter(data_path=data_path, early_term=early_term):
        assert len(np.unique(dc[InpDataF.GROUP_ID])) == 1
        group_id = dc[0, InpDataF.GROUP_ID].split("_")[0]
        unique_groups.add(group_id)
        review_count += len(dc)
        min_rev_per_chunk = min(min_rev_per_chunk, len(dc))
        max_rev_per_chunk = max(max_rev_per_chunk, len(dc))
        fp = comb_paths(out_dir_path, "%s.csv" % dc[0][InpDataF.GROUP_ID])
        with open(fp, encoding='utf-8', mode='w') as f:
            dc.to_csv(f)
        chunks_count += 1
        if chunks_count % logging_period == 0:
            logger.info("Wrote %d chunks." % chunks_count)
    logger.info("Totally wrote %d chunks." % chunks_count)
    logger.info("Total time elapsed: %f." % (time() - start))
    logger.info("Unique groups: %d." % len(unique_groups))
    logger.info("Total reviews: %d." % review_count)
    logger.info("Min reviews per chunk: %d." % min_rev_per_chunk)
    logger.info("Max reviews per chunk: %d." % max_rev_per_chunk)
Example #8
def summ_eval(output_folder,
              data_pipeline,
              eval_data_source,
              summ_gen_func,
              rev_formatter_func,
              sent_splitter=None,
              avg_rouge=True,
              analytics_func=None):
    """Performs evaluation based on Amazon and YELP summaries."""
    output_fn = "%s_eval.json" % get_file_name(eval_data_source['data_path'])
    output_path = comb_paths(output_folder, output_fn)
    logger.info("Performing the summary evaluation on %s." % eval_data_source)
    eval_proc = SummEvalProc(data_pipeline,
                             avg_rouge=avg_rouge,
                             summs_gen_func=summ_gen_func,
                             sent_splitter=sent_splitter,
                             revs_formatter_func=rev_formatter_func,
                             analytics_func=analytics_func)
    eval_proc.eval(eval_data_source, output_file_path=output_path)
Example #9
    def run(self):
        # TODO: it seems that if I assign a file tokenizer to each file in a loop
        # TODO: as a dependency, it makes the whole system super slow if the
        # TODO: number of groups (=files) is very large

        init_logger(LOGGER_NAME)
        inp_folder = self.input()[0]
        failed_file_count = 0
        logger.info("Tokenizing `%s`." % inp_folder.path)
        for file_path in iter_file_paths(inp_folder.path):
            file_name = get_file_name(file_path)
            out_file_path = comb_paths(self.act_out_dir_path,
                                       "%s.csv" % file_name)
            try:
                # TODO: `_csv.Error: line contains NULL byte` was encountered in
                # TODO: a small number of files; the cause needs to be
                # TODO: investigated
                TokenizeFile(inp_file_path=file_path,
                             out_file_path=out_file_path).run()
            except CsvError:
                failed_file_count += 1
        logger.info('Failed to tokenize `%d` files in `%s`.' %
                    (failed_file_count, inp_folder.path))
Example #10
eval_dev_data_source = {"data_path": run_hp.eval_dev_fp}
eval_test_data_source = {"data_path": run_hp.eval_test_fp}

gen_data_sources = [{
    "data_path": run_hp.train_fp,
    'early_term': run_hp.gener_early_term
}, {
    "data_path": run_hp.val_fp,
    'early_term': run_hp.gener_early_term
}]

os.environ['CUDA_VISIBLE_DEVICES'] = str(run_hp.cuda_device_id)

logger = init_logger(logger_name="",
                     level=INFO,
                     output_path=comb_paths(run_hp.output_path, "log.txt"))
logger.info('CUDA_VISIBLE_DEVICES=%s' % os.environ.get('CUDA_VISIBLE_DEVICES'))

#   KL ANNEALING MECHANISMS  #

c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches,
                          m=run_hp.c_m,
                          r=run_hp.c_r,
                          max_val=run_hp.c_kl_ann_max_val)
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches,
                          m=run_hp.z_m,
                          r=run_hp.c_r,
                          max_val=run_hp.z_kl_ann_max_val)

#   PIPELINES AND VOCAB   #
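
`KlCycAnnealing`'s internals are not shown in these examples. As a point of reference, a minimal sketch of the standard cyclical annealing schedule (Fu et al., 2019) that the constructor arguments suggest, where `t` is the total number of batches, `m` the number of cycles, `r` the ramp-up fraction of each cycle, and `max_val` the cap:

def kl_cyc_weight(step, t, m=8, r=0.8, max_val=1.0):
    """Ramps the KL weight up linearly over the first `r` fraction of each
    cycle, then holds it at `max_val` until the cycle restarts."""
    cycle_len = max(1, int(t // m))
    pos = (step % cycle_len) / float(cycle_len)  # position within the cycle
    return max_val * min(1.0, pos / r)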
Example #11
eval_test_data_source = {"data_path": run_hp.eval_test_fp}

gen_data_sources = [{
    "data_path": run_hp.train_fp,
    'early_term': run_hp.gener_early_term
}, {
    "data_path": run_hp.val_fp,
    'early_term': run_hp.gener_early_term
}]

os.environ['CUDA_VISIBLE_DEVICES'] = str(run_hp.cuda_device_id)
experiments_descr = 'My first experiment with the model.'

logger = init_logger(logger_name="",
                     level=INFO,
                     output_path=comb_paths(run_hp.output_path, "log.txt"))
logger.info('CUDA_VISIBLE_DEVICES=%s' % os.environ.get('CUDA_VISIBLE_DEVICES'))

#   KL ANNEALING MECHANISMS  #

c_kl_ann = KlCycAnnealing(t=run_hp.c_kl_ann_batches,
                          m=run_hp.c_m,
                          r=run_hp.c_r,
                          max_val=run_hp.c_kl_ann_max_val)
z_kl_ann = KlCycAnnealing(t=run_hp.z_kl_ann_batches,
                          m=run_hp.z_m,
                          r=run_hp.c_r,
                          max_val=run_hp.z_kl_ann_max_val)

#   PIPELINES AND VOCAB   #
Example #12
    def __init__(self, *args, **kwargs):
        super(GlobalConfig, self).__init__(*args, **kwargs)
        self.out_dir_path = comb_paths(self.out_dir_path, self.dataset)
Example #13
    def __init__(self):
        super(RunHP, self).__init__()

        #   GENERAL  #
        self.seed = 42
        self.cuda_device_id = 6
        self.device = 'cpu'  # 'cuda' or 'cpu'
        self.training_logging_step = 50  # how often to print internal metrics
        self.epochs = 10  # if set to 0, skips training and goes straight to evaluation
        self.learning_rate = 0.0005
        self.grads_clip = 0.25

        # GENERAL DATA RELATED #
        self.dataset = 'yelp'
        self.train_max_groups_per_batch = 6
        self.val_max_groups_per_batch = 13
        self.eval_max_groups_per_batch = 20
        self.max_rev_per_group = 8

        #   DATA SOURCES  #
        # `early_term` limits the number of chunks per epoch
        self.train_early_term = None
        self.val_early_term = None
        self.gener_early_term = 2

        #  GENERAL PATHS   #
        self.root_path = 'copycat'
        self.experiments_folder = 'first_run'
        self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
        self.checkpoint_full_fn = 'checkpoint.tar'
        epc = ExperimentsPathController()
        self.output_path = epc(self.output_dir)
        self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
        self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

        #   DATA PATHS  #
        self.base_data_path = f'data/{self.dataset}/'
        self.train_fp = comb_paths(self.base_data_path, "split/train/")
        self.val_fp = comb_paths(self.base_data_path, 'split/val/')
        self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
        self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
        self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

        #   ANNEALING   #
        self.c_m = 8.
        self.c_r = 0.8
        self.c_kl_ann_max_val = 1.
        self.c_kl_ann_batches = self.epochs * self.train_early_term \
            if self.train_early_term else self.epochs * 10000
        self.z_m = 8.
        self.z_c = 0.8
        self.z_kl_ann_max_val = 1.
        self.z_kl_ann_batches = self.epochs * self.train_early_term \
            if self.train_early_term else self.epochs * 10000

        #   DECODING/GENERATION  #
        self.beam_size = 5
        self.beam_len_norm = True
        self.beam_excl_words = []
        self.block_ngram_repeat = 3  # or None
        self.ngram_mirror_window = 3  # or None
        self.mirror_conjs = ["and", "or", ",", "but"]  # or None
        self.block_consecutive = True
        self.min_gen_seq_len = 20

        #   POST-PROCESSING AND ANALYTICS #
        mt = MosesTokenizer()
        self.tok_func = partial(mt.tokenize, escape=False)
        self.sent_split_func = nltk.sent_tokenize
        dt = MosesDetokenizer()
        self.detok_func = partial(dt.detokenize, unescape=False)
        true_caser = MosesTruecaser(load_from=self.tcaser_model_path,
                                    is_asr=True)
        self.true_case_func = partial(true_caser.truecase,
                                      return_str=True,
                                      use_known=True)
        self.analytics_func = partial(ngram_seq_analysis,
                                      tokenizer=self.tok_func,
                                      sent_splitter=self.sent_split_func,
                                      n_grams_to_comp=(2, 3, 4))
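
A small sketch of how the post-processing partials above chain together on a generated token sequence; the token list and outputs are illustrative (`detokenize` and `truecase` are real `sacremoses` methods):

tokens = ["the", "battery", "lasts", "long", "."]
detok = run_hp.detok_func(tokens)     # e.g. "the battery lasts long."
final = run_hp.true_case_func(detok)  # e.g. "The battery lasts long."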
Example #14
parser = argparse.ArgumentParser()
parser.add_argument('--regime',
                    help='Sets the regime of training/inference.',
                    required=True)
parser.add_argument(
    '--inference',
    action='store_true',
    help='If set, will perform inference/summary generation otherwise training.'
)

args = parser.parse_args()
regime = args.regime
inference = args.inference

run_conf = RUN_CONFIG_REGISTRY[regime]()

logger = init_logger(logger_name="",
                     level=INFO,
                     output_path=comb_paths(run_conf.output_path, "log.txt"))

#   ENV and hyper-params handling  #
manual_seed(run_conf.seed)
np.random.seed(run_conf.seed)
cuda_visible_devices = str(run_conf.cuda_device_ids) \
    if isinstance(run_conf.cuda_device_ids, int) else \
    ",".join([str(dev_id) for dev_id in run_conf.cuda_device_ids])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
device_count = 1 if not isinstance(run_conf.cuda_device_ids, list) \
    else max(1, len(run_conf.cuda_device_ids))
# `cuda_visible_devices` is an empty string only when no device ids were given
device = 'cuda' if cuda_visible_devices else 'cpu'
logger.info('CUDA_VISIBLE_DEVICES=%s' % cuda_visible_devices)

#   DATA SOURCES   #
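
The snippet ends just as the data sources section begins. Based on the config fields in Example #1 and the pattern in Example #10, they are plausibly dictionaries of the following shape (an assumption, mirroring the other run scripts):

train_data_source = {"data_path": run_conf.train_fp,
                     "early_term": run_conf.train_early_term}
val_data_source = {"data_path": run_conf.val_fp,
                   "early_term": run_conf.val_early_term}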
Example #15
    def run(self):
        log_file_path = comb_paths(
            GlobalConfig().out_dir_path, "logs", FOLDER_NAME,
            "%s.txt" % get_file_name(self.inp_file_path))
        init_logger(LOGGER_NAME, output_path=log_file_path)

        init_unit_count = 0
        init_group_count = 0

        group_id_to_units = {}
        group_unit_counts = []

        # 1. reading data and filtering out short/long reviews and
        # unpopular groups
        inp_dir = self.input()[0]
        logger.info("Subsampling `%s`." % inp_dir.path)
        for inp_file_path in iter_file_paths(inp_dir.path):
            group_id = get_file_name(inp_file_path)
            group_units = []
            init_group_count += 1
            for data_unit in read_csv_file(inp_file_path, sep='\t'):
                init_unit_count += 1
                rev_text = data_unit[OutputFields.REV_TEXT].split()

                # removing too short and long reviews
                if len(rev_text) < self.min_rev_len or \
                        len(rev_text) > self.max_rev_len:
                    continue

                group_units.append(data_unit)

            # removing unpopular groups
            if len(group_units) < self.min_revs:
                continue

            group_id_to_units[group_id] = group_units
            group_unit_counts.append(len(group_units))

        if not group_id_to_units:
            raise ValueError("No groups to proceed.")

        # 2. filtering by percentile
        perc = np.percentile(group_unit_counts, self.percentile)

        # removing groups above the k-th percentile
        subs_group_id_to_units = {}
        subs_units_count = 0
        subs_units_max_count = 0

        for group_id, group_units in group_id_to_units.items():
            if len(group_units) < perc or perc == 1.:

                # making sure that the subsampled number of reviews does not
                # exceed a threshold unless most of the businesses only have
                # one review
                if self.max_total_revs is not None and \
                        (subs_units_count + len(group_units)) > self.max_total_revs:
                    break

                subs_units_count += len(group_units)
                subs_units_max_count = max(subs_units_max_count,
                                           len(group_units))

                subs_group_id_to_units[group_id] = group_units

        if subs_units_count == 0:
            raise ValueError("All units were subsampled out. "
                             "Please adjust the parameters.")

        # 3. dumping to files
        write_groups_to_csv(self.act_out_dir_path,
                            subs_group_id_to_units,
                            sep='\t')

        # 4. logging statistics
        stats = OrderedDict()
        stats['General'] = OrderedDict()
        stats['General']['inp dir'] = inp_dir.path

        stats['Initial'] = OrderedDict()
        stats['Initial']['group count'] = init_group_count
        stats['Initial']['unit count'] = init_unit_count

        stats['After Filtering'] = OrderedDict()
        stats['After Filtering']['group count'] = len(group_id_to_units)
        stats['After Filtering']['unit count'] = np.sum(group_unit_counts)
        stats['After Filtering']['percentile cutoff'] = perc

        stats['After Subsampling'] = OrderedDict()
        stats['After Subsampling']['group count'] = len(subs_group_id_to_units)
        stats['After Subsampling']['unit count'] = subs_units_count
        stats['After Subsampling'][
            'max units per group'] = subs_units_max_count

        stats_str = format_stats(stats)

        logger.info(stats_str)
def write_groups_to_csv(out_dir_path, group_id_to_units, sep='\t'):
    for group_id, group_units in group_id_to_units.items():
        full_file_name = "%s.csv" % group_id
        out_file_path = comb_paths(out_dir_path, full_file_name)
        write_group_to_csv(out_file_path, group_units, sep=sep)
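
Returning to step 2 of Example #15, a tiny worked example of the percentile cut. With linear interpolation (NumPy's default), the 90th percentile of the counts below is 18.3, so only the 40-review group is filtered out:

import numpy as np

group_unit_counts = [1, 2, 2, 3, 4, 5, 9, 40]
perc = np.percentile(group_unit_counts, 90)        # -> 18.3
kept = [n for n in group_unit_counts if n < perc]  # drops only the 40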