Exemple #1
0
def gen_seqs(output_folder, gen_func, data_sources):
    """Invokes `gen_func` once per data-source with a per-source JSON path.

    Args:
        output_folder: folder that is combined with the output file name to
            form each output file path.
        gen_func: callable invoked as `gen_func(data_source, output_path)`.
        data_sources: a single data-source or a list of them; each one is
            indexed with the 'data_path' key. A non-list value is wrapped
            into a one-element list.
    """
    sources = data_sources if isinstance(data_sources, list) \
        else [data_sources]
    logger.info("Performing generation of sequences/summaries.")
    for source in sources:
        src_path = source['data_path']
        base_name = get_file_name(src_path)
        if base_name == '':
            # no file name was obtained from the path, so the last component
            # of the path's directory part is used instead
            base_name = os.path.split(os.path.dirname(src_path))[1]
        gen_func(source, comb_paths(output_folder, "%s.json" % base_name))
Exemple #2
0
 def summ_eval(self, out_dir_path, data_source, **kwargs):
     """Evaluates summaries on `data_source` and stores the result.

     The output file is named `<file name of data_source['data_path']>_eval.json`
     and is combined with `out_dir_path` via `comb_paths`.

     Args:
         out_dir_path: directory part of the output file path.
         data_source: data-source indexed with the 'data_path' key; it is
             passed on to the evaluation procedure.
         **kwargs: bound to `self.summ_gen_wrapper` to form the summary
             generation function.
     """
     assert self.eval_pipeline is not None
     empty_cache()
     gen_func = partial(self.summ_gen_wrapper, **kwargs)
     src_name = get_file_name(data_source['data_path'])
     eval_file_path = comb_paths(out_dir_path, "%s_eval.json" % src_name)
     logger.info("Performing summary evaluation on %s." % data_source)
     evaluator = SummEvalProc(
         self.eval_pipeline,
         summs_gen_func=gen_func,
         rev_formatter_func=self.gen_seq_postproc,
         summ_formatter_func=self.summ_postproc,
         analytics_func=self.seq_analytics)
     evaluator.eval(data_source, out_file_path=eval_file_path)
def read_amazon_data(path, max_revs=None, replace_xml=False):
    """Reads Amazon reviews and yields them grouped by consecutive product id.

    Each parsed record is reduced to the group id, review text and rating
    output fields, and enriched with a category attribute taken from the
    lower-cased file name of `path`. Records missing any of the required
    Amazon fields are skipped. A group is emitted whenever the product id
    changes between two consecutive kept records.

    Args:
        path (str): data path to a file with Amazon reviews.
        max_revs (int): reading stops once the index of the current kept
            record (counted over all parsed records, skipped ones included)
            reaches `max_revs - 1`. A falsy value disables the limit.
        replace_xml (bool): if set to True, the review text is passed through
            `unescape` before further processing.

    Returns: an iterator over pairs of group_id and list of data-units (reviews
        with attributes).

    """
    field_map = {
        AmazonFields.PROD_ID: OutputFields.GROUP_ID,
        AmazonFields.REV_TEXT: OutputFields.REV_TEXT,
        AmazonFields.OVERALL: OutputFields.RATING
    }
    group_units = []
    last_group_id = None
    for rec_idx, record in enumerate(parse(path)):
        # records lacking any of the required fields are ignored
        if not all(src_attr in record for src_attr in field_map):
            continue

        curr_group_id = record[AmazonFields.PROD_ID]

        if replace_xml:
            record[AmazonFields.REV_TEXT] = unescape(
                record[AmazonFields.REV_TEXT])

        # keeping only the mapped attributes under their output names
        unit = {out_attr: record[src_attr]
                for src_attr, out_attr in field_map.items()}

        # the category attribute is based on the file name
        unit[OutputFields.CAT] = get_file_name(path).lower()
        unit[OutputFields.REV_TEXT] = clean_text(unit[OutputFields.REV_TEXT])

        # a different product id closes the group collected so far
        if last_group_id is not None and curr_group_id != last_group_id:
            yield last_group_id, group_units
            group_units = []

        last_group_id = curr_group_id
        group_units.append(unit)

        if max_revs and rec_idx >= max_revs - 1:
            break

    # emitting the last (possibly only) group
    if group_units:
        yield last_group_id, group_units
Exemple #4
0
def summ_eval(output_folder,
              data_pipeline,
              eval_data_source,
              summ_gen_func,
              rev_formatter_func,
              sent_splitter=None,
              avg_rouge=True,
              analytics_func=None):
    """Runs the summary evaluation procedure on one data-source.

    The result file is named
    `<file name of eval_data_source['data_path']>_eval.json` and is combined
    with `output_folder` via `comb_paths`.

    Args:
        output_folder: folder part of the output file path.
        data_pipeline: passed as the first argument to `SummEvalProc`.
        eval_data_source: data-source indexed with the 'data_path' key.
        summ_gen_func: forwarded as `summs_gen_func`.
        rev_formatter_func: forwarded as `revs_formatter_func`.
        sent_splitter: forwarded as `sent_splitter`.
        avg_rouge: forwarded as `avg_rouge`.
        analytics_func: forwarded as `analytics_func`.
    """
    src_name = get_file_name(eval_data_source['data_path'])
    eval_file_path = comb_paths(output_folder, "%s_eval.json" % src_name)
    logger.info("Performing the summary evaluation on %s." % eval_data_source)
    evaluator = SummEvalProc(
        data_pipeline,
        avg_rouge=avg_rouge,
        summs_gen_func=summ_gen_func,
        sent_splitter=sent_splitter,
        revs_formatter_func=rev_formatter_func,
        analytics_func=analytics_func)
    evaluator.eval(eval_data_source, output_file_path=eval_file_path)
Exemple #5
0
    def run(self):
        """Tokenizes every file found under the first input folder.

        For each input file a `TokenizeFile` step is run with an output path
        built from `self.act_out_dir_path` and `<file name>.csv`. Files that
        raise `CsvError` are counted and skipped; the count is logged at the
        end.
        """
        # TODO: assigning a file tokenizer to each file in a loop as a
        # TODO: dependency seems to make the whole system super slow when the
        # TODO: number of groups (=files) is very large

        init_logger(LOGGER_NAME)
        src_dir_path = self.input()[0].path
        logger.info("Tokenizing `%s`." % src_dir_path)

        n_failed = 0
        for src_file_path in iter_file_paths(src_dir_path):
            dst_file_name = "%s.csv" % get_file_name(src_file_path)
            dst_file_path = comb_paths(self.act_out_dir_path, dst_file_name)
            try:
                # TODO: `_csv.Error: line contains NULL byte` was encountered in
                # TODO: a small number of files; the cause needs to be
                # TODO: investigated
                tokenize_step = TokenizeFile(inp_file_path=src_file_path,
                                             out_file_path=dst_file_path)
                tokenize_step.run()
            except CsvError:
                n_failed += 1

        logger.info('Failed to tokenize `%d` files in `%s`.' %
                    (n_failed, src_dir_path))
                                    summ_post_proc=summ_post_proc),
              data_pipeline=eval_data_pipeline,
              rev_formatter_func=idev.format_revs,
              avg_rouge=True,
              sent_splitter=run_hp.sent_split_func,
              analytics_func=run_hp.analytics_func)

    gen_seqs(data_sources=gen_data_sources,
             output_folder=gen_folder_path,
             gen_func=partial(idev.gen_and_save_summs))
else:
    # inference procedure where summaries are generated for reviews in CSV
    # files
    infer_bsz = parser_args.infer_batch_size
    infer_inp_file_path = parser_args.infer_input_file_path
    out_file_name = get_file_name(infer_inp_file_path)
    infer_out_file_path = comb_paths(run_hp.output_path,
                                     f'{out_file_name}.out.txt')

    assert infer_inp_file_path is not None
    rev_num = get_rev_number(infer_inp_file_path)

    logger.info("Performing inference/summary generation")
    infer_data_pipeline = assemble_infer_pipeline(
        word_vocab,
        max_reviews=rev_num,
        tokenization_func=run_hp.tok_func,
        max_groups_per_chunk=infer_bsz)
    summ_pproc = SeqPostProcessor(tokenizer=lambda x: x.split(),
                                  detokenizer=run_hp.detok_func,
                                  tcaser=run_hp.true_case_func)
Exemple #7
0
    def run(self):
        """Subsamples groups of reviews and writes the retained ones to CSV.

        Steps, in order:
            1. Reads every file under the first input directory as one group
               (tab-separated), dropping reviews whose whitespace-token count
               is below `self.min_rev_len` or above `self.max_rev_len`, and
               groups left with fewer than `self.min_revs` reviews.
            2. Keeps groups whose review count is below the
               `self.percentile`-th percentile of the per-group counts (or
               every group when that percentile equals 1), stopping at the
               first group that would push the total number of reviews over
               `self.max_total_revs` (when it is not None).
            3. Writes the retained groups via `write_groups_to_csv`.
            4. Logs statistics of the initial, filtered and subsampled data.

        Raises:
            ValueError: if no group survives step 1, or if no unit is
                retained in step 2.
        """
        log_file_name = "%s.txt" % get_file_name(self.inp_file_path)
        log_path = comb_paths(GlobalConfig().out_dir_path, "logs",
                              FOLDER_NAME, log_file_name)
        init_logger(LOGGER_NAME, output_path=log_path)

        total_units = 0
        total_groups = 0

        kept_groups = {}
        kept_group_sizes = []

        # 1. reading data; too short/long reviews and unpopular groups are
        # filtered out
        src_dir = self.input()[0]
        logger.info("Subsampling `%s`." % src_dir.path)
        for group_file_path in iter_file_paths(src_dir.path):
            gid = get_file_name(group_file_path)
            total_groups += 1

            units = []
            for unit in read_csv_file(group_file_path, sep='\t'):
                total_units += 1
                n_tokens = len(unit[OutputFields.REV_TEXT].split())
                # only reviews within the length bounds are kept
                if self.min_rev_len <= n_tokens <= self.max_rev_len:
                    units.append(unit)

            # only sufficiently popular groups are kept
            if len(units) >= self.min_revs:
                kept_groups[gid] = units
                kept_group_sizes.append(len(units))

        if not kept_groups:
            raise ValueError("No groups to proceed.")

        # 2. filtering by percentile
        perc = np.percentile(kept_group_sizes, self.percentile)

        sampled_groups = {}
        sampled_unit_count = 0
        largest_sampled_group = 0

        for gid, units in kept_groups.items():
            # groups at or above the kth percentile are removed, unless the
            # percentile itself is one review
            if not (len(units) < perc or perc == 1.):
                continue

            group_size = len(units)

            # making sure that the subsampled number of reviews does not
            # exceed a threshold unless most of the businesses only have
            # one review
            if self.max_total_revs is not None and \
                    sampled_unit_count + group_size > self.max_total_revs:
                break

            sampled_unit_count += group_size
            largest_sampled_group = max(largest_sampled_group, group_size)
            sampled_groups[gid] = units

        if sampled_unit_count == 0:
            raise ValueError("All units were subsampled out. "
                             "Please adjust the parameters.")

        # 3. dumping to files
        write_groups_to_csv(self.act_out_dir_path, sampled_groups, sep='\t')

        # 4. logging statistics
        stats = OrderedDict([
            ('General', OrderedDict([
                ('inp dir', src_dir.path),
            ])),
            ('Initial', OrderedDict([
                ('group count', total_groups),
                ('unit count', total_units),
            ])),
            ('After Filtering', OrderedDict([
                ('group count', len(kept_groups)),
                ('unit count', np.sum(kept_group_sizes)),
                ('percentile count', perc),
            ])),
            ('After Subsampling', OrderedDict([
                ('group count', len(sampled_groups)),
                ('unit count', sampled_unit_count),
                ('max units per group', largest_sampled_group),
            ])),
        ])

        logger.info(format_stats(stats))
def get_act_out_dir_path(out_dir_path, inp_file_path, middle_path):
    """Builds the actual output directory path specific to a step.

    The path is `out_dir_path` joined with `middle_path` and the file name
    obtained from `inp_file_path`. Only the path string is produced; nothing
    is created on the file system here.
    """
    step_dir_name = get_file_name(inp_file_path)
    return os.path.join(out_dir_path, middle_path, step_dir_name)
Exemple #9
0
    def run(self):
        """Partitions groups into train/val/test and writes them to CSV files.

        Group files are read (tab-separated) from every input directory.
        A group id seen more than once is read only the first time, and
        groups for which `self._is_excluded` is true are skipped. The
        remaining groups are split by `partition` using `self.train_part`,
        `self.val_part` and `self.test_part`; each group is then written to
        `<title>/<group id>.csv` combined with `self.act_out_dir_path`.
        Statistics are logged at the end.
        """
        log_path = comb_paths(GlobalConfig().out_dir_path, "logs",
                              "partitioning.txt")
        init_logger(LOGGER_NAME, output_path=log_path)

        n_excluded = 0
        n_duplicates = 0
        all_groups = []
        src_dir_paths = []

        # one group can be in multiple categories, so ids are tracked to
        # detect duplicates
        seen_ids = set()

        # reading data and excluding some groups
        for src_dir in self.input():
            src_dir_paths.append(src_dir.path)
            for group_file_path in get_file_paths(src_dir.path):
                gid = get_file_name(group_file_path)
                if gid in seen_ids:
                    n_duplicates += 1
                    continue
                seen_ids.add(gid)
                if self._is_excluded(gid):
                    n_excluded += 1
                else:
                    all_groups.append(
                        list(read_csv_file(group_file_path, sep='\t')))

        # partitioning
        logger.info("Partitioning `%s`." % " ".join(src_dir_paths))
        train_groups, val_groups, test_groups = partition(
            all_groups, train_part=self.train_part, val_part=self.val_part,
            test_part=self.test_part)

        # dumping to the storage
        splits = (('train', train_groups),
                  ('val', val_groups),
                  ('test', test_groups))
        for title, split_groups in splits:
            for units in split_groups:
                # the group id is taken from the first unit of the group
                gid = units[0][OutputFields.GROUP_ID]
                dst_path = comb_paths(self.act_out_dir_path, title,
                                      '%s.csv' % gid)
                write_group_to_csv(dst_path, units, sep='\t')

        # logging stats
        n_train_revs = np.sum([len(units) for units in train_groups])
        n_val_revs = np.sum([len(units) for units in val_groups])
        n_test_revs = np.sum([len(units) for units in test_groups])

        stats = OrderedDict([
            ('General', OrderedDict([
                ('excluded_group_count', n_excluded),
                ('duplicate_group_count', n_duplicates),
                ('train_groups', len(train_groups)),
                ('train_rev_count', n_train_revs),
                ('val_groups', len(val_groups)),
                ('val_rev_count', n_val_revs),
                ('test_groups', len(test_groups)),
                ('test_rev_count', n_test_revs),
            ])),
        ])

        logger.info(format_stats(stats))