Example #1
def init_logger(logger_name=__name__, output_path=None, level=logging.INFO):
    """
    Initializes a logger for console and file writing.

    :param logger_name: self-explanatory.
    :param output_path: directory or file path where the logs should be saved.
                        By default it will not store file logs.
    :param level: self-explanatory.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")

    # adding console output
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    if output_path:
        if is_file_path(output_path):
            safe_mkfdir(output_path)
            if os.path.exists(output_path):
                os.remove(output_path)
        else:
            safe_mkdir(output_path)
            # using the default name of the logger
            default_file_name = "log_" + strftime("%b_%d_%H_%M_%S") + '.txt'
            output_path = os.path.join(output_path, default_file_name)
        file_handler = logging.FileHandler(output_path)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
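
All of the snippets on this page rely on small filesystem helpers such as safe_mkfdir, safe_mkdir, and is_file_path that are defined elsewhere in the project and are not shown here. A minimal sketch of what they plausibly look like (an assumption, not the project's actual implementation):

import os

def safe_mkdir(dir_path):
    # hypothetical helper: create a directory if it does not exist yet
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def safe_mkfdir(file_path):
    # hypothetical helper: create the parent directory of a file path
    dir_path = os.path.dirname(file_path)
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)

def is_file_path(path):
    # hypothetical helper: treat paths that have an extension as file paths
    return bool(os.path.splitext(path)[1])

With such helpers in place, a call like init_logger("my_app", output_path="logs/", level=logging.DEBUG) (names hypothetical) logs to the console and to a timestamped file under logs/.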
Example #2
def delete_attr_from_params(input_fp,
                            output_fp,
                            attr_names,
                            match_start=False):
    """Removes a particular attrs from the dictionary of params, saves back.

    Args:
        input_fp:
        output_fp:
        attr_names:
        match_start: if set to True, matches based on the beginning of the
            string. E.g., _encoder matches _encoder.param1.linear.weights.
    """
    # TODO: explain how regex works
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    if match_start:
        for attr_name in attr_names:
            r_aname = re.compile("^" + attr_name)
            for param_name in list(model_params.keys()):
                if r_aname.match(param_name):
                    del model_params[param_name]
                    logger.info("Deleting: %s." % param_name)
    else:
        for attr_name in attr_names:
            if attr_name in model_params:
                del model_params[attr_name]
                logger.info("Deleting: %s." % attr_name)

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
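
The match_start branch anchors the regex at the start of each parameter name, so a single prefix removes a whole sub-module. A self-contained sketch of that logic on a plain dict (parameter names are made up for illustration):

import re

params = {
    "_encoder.layer1.weight": 1,
    "_encoder.layer1.bias": 2,
    "_decoder.layer1.weight": 3,
}
r_aname = re.compile("^" + "_encoder")
for param_name in list(params.keys()):
    if r_aname.match(param_name):
        del params[param_name]
print(params)  # {'_decoder.layer1.weight': 3}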
Example #3
    def run(self):
        safe_mkfdir(self.out_file_path)
        data_units = []
        for data_unit in read_csv_file(self.inp_file_path, sep='\t'):
            tok_str = " ".join(tokenizer(data_unit[OutputFields.REV_TEXT]))
            data_unit[OutputFields.REV_TEXT] = tok_str
            data_units.append(data_unit)
        write_group_to_csv(self.out_file_path, data_units, sep='\t')
Example #4
    def save(self, file_path, encoding='utf-8'):
        """Saves hyper-params object as a json file."""
        safe_mkfdir(file_path)
        params = self.to_dict(types_whitelist=OVERRIDABLE_ATTR_TYPES)
        # using a context manager so the file is flushed and closed
        with codecs.open(file_path, encoding=encoding, mode='w') as f:
            json.dump(params, f, indent=2)
        logger.debug("Extracted the following hparams: '%s'."
                     "" % " ".join(params.keys()))
        logger.info("Saved hyper-parameters to '%s'." % file_path)
Example #5
def rename_params(input_fp, output_fp, old_attr_names, new_attr_names):
    """Renames a model's parameters, saves them to an output file."""
    assert len(old_attr_names) == len(new_attr_names)
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    for old_name, new_name in zip(old_attr_names, new_attr_names):
        model_params[new_name] = model_params[old_name]
        del model_params[old_name]
    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
Example #6
def extract_params(input_fp, output_fp, attr_names, device='cpu'):
    """Extract a subset parameters from the file. Saves to a new file."""
    model_params = T.load(input_fp, device)[MODEL_PARAMS]
    params_to_save = {}
    for attr_name in attr_names:
        params_to_save[attr_name] = model_params[attr_name]

    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: params_to_save}, f=output_fp)
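
T in these snippets is presumably PyTorch (import torch as T) and MODEL_PARAMS the key under which the state dict is stored in the checkpoint. Under those assumptions, the same extraction can be sketched with plain torch calls (paths, key, and parameter names are hypothetical):

import torch

ckpt = torch.load("model.ckpt", map_location="cpu")
state = ckpt["model_params"]
names_to_keep = ["embedding.weight", "encoder.rnn.weight_ih_l0"]
subset = {name: state[name] for name in names_to_keep if name in state}
torch.save({"model_params": subset}, "model_subset.ckpt")

Unlike the function above, the sketch silently skips absent names instead of raising a KeyError.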
Example #7
def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE,
                       sep='\t',
                       engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
Example #8
def rename_params_by_prefix(input_fp, output_fp, old_prefixes, new_prefixes):
    """Renames a model's parameters by matching prefixes, saves them to an
    output file. Renaming is performed in multiple iterations if multiple
    prefixes are given.
    """
    assert len(old_prefixes) == len(new_prefixes)
    model_params = T.load(input_fp, 'cpu')[MODEL_PARAMS]
    for old_prefix, new_prefix in zip(old_prefixes, new_prefixes):
        tmp_params = dict()
        for curr_name, param in model_params.items():
            if curr_name.startswith(old_prefix):
                new_name = new_prefix + curr_name[len(old_prefix):]
                print(f"{curr_name} => {new_name}")
                curr_name = new_name
            tmp_params[curr_name] = param
        model_params = tmp_params
    # dumping to the disk
    safe_mkfdir(output_fp)
    T.save({MODEL_PARAMS: model_params}, f=output_fp)
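
A typical call (paths and prefixes are hypothetical, and the checkpoint is assumed to store its state dict under MODEL_PARAMS) remaps every encoder.-prefixed parameter to rev_encoder. while leaving the rest untouched:

rename_params_by_prefix(input_fp="old_model.ckpt",
                        output_fp="new_model.ckpt",
                        old_prefixes=["encoder."],
                        new_prefixes=["rev_encoder."])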
Example #9
def write_group_to_csv(out_file_path, units, sep="\t"):
    """Writes data units into a CSV file.

    Args:
        out_file_path (str): self-explanatory.
        units (list): list with dicts (review texts and other attributes).
        sep (str): separator used in the output csv file.

    Returns: None.

    """
    safe_mkfdir(out_file_path)
    with open(out_file_path, 'w', encoding='utf-8') as f:
        header = None
        for du in units:
            if header is None:
                header = du.keys()
                f.write(sep.join(header) + "\n")
            str_to_write = sep.join([str(du[attr]) for attr in header])
            f.write(str_to_write + '\n')
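
For illustration, the function can be fed a list of dicts with identical keys; the header is taken from the first unit (field names and path below are hypothetical):

units = [
    {"group_id": "p1", "rev_text": "great battery life", "rating": 5},
    {"group_id": "p1", "rev_text": "screen scratches easily", "rating": 2},
]
write_group_to_csv("out/reviews.tsv", units, sep="\t")

Every unit is assumed to expose the same keys as the first one, since only the first unit's keys are written to the header.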
Example #10
    def save_setup_str(self, dir_path, exper_descr=None):
        """
        Logs/saves the setup of the experiment, namely 3 main components:
        1. The experiment's description (logged if `exper_descr` is provided).
        2. The train and val data pipelines' and vocab's blueprints, saved
            to dp_vocabs.txt.
        3. The model's blueprint/summary, saved to model.txt.
        """
        logger.info("Experiment's output will be saved to: '%s'." % dir_path)
        # 1. experiment
        if exper_descr:
            form_exp = format_big_box(exper_descr)
            logger.info(form_exp)

        # 2. data pipeline + vocabs
        dp_fp = os.path.join(dir_path, 'dp_vocabs.txt')
        safe_mkfdir(dp_fp)
        try:
            with open(dp_fp, 'w') as f:
                f.write(str(self.word_vocab))
                f.write('===========================')
                f.write(str(self.train_data_pipeline))
                f.write('===========================')
                f.write(str(self.val_data_pipeline))

        except Exception:
            os.remove(dp_fp)
            warnings.warn(
                "Could not get the str of the dev_data_pipeline's setup.")

        # 3. model and its interface
        m_fp = os.path.join(dir_path, 'model.txt')
        try:
            with open(m_fp, 'w') as f:
                f.write(str(self.imodel))
        except Exception:
            os.remove(m_fp)
            warnings.warn("Could not get the str of the model's setup.")
Example #11
    def write(self, file_path, sep=' ', encoding='utf-8'):
        """
        Writes the vocabulary to a plain text file where each line is of the
        form: {token}{sep}{count}. Default special symbols are not written.

        :param file_path: self-explanatory.
        :param sep: self-explanatory.
        :param encoding: self-explanatory.
        """
        safe_mkfdir(file_path)
        with codecs.open(file_path, 'w', encoding=encoding) as f:
            for symbol in self:
                token = symbol.token
                count = str(symbol.count)
                try:
                    str_entry = sep.join([token, count])
                    f.write(str_entry)
                    f.write("\n")
                except Exception:
                    logger.fatal(
                        "Below entry produced a fatal error in write().")
                    logger.fatal(symbol.token)
                    raise ValueError("Could not process a token.")
        logger.info("Vocabulary is written to: '%s'." % file_path)
Example #12
    def eval(self, data_source, output_file_path=None):
        """
        Assumes that batches contain SUMMS that are lists of sublists, where
        each sublist contains a fixed number of summary strings. I.e.,
        summaries should not be tokenized.

        :param data_source:
        :param output_file_path:
        """
        output_dc = DataChunk(
            **{
                OutputF.GOLD_SUMMS: [],
                OutputF.GEN_SUMM: [],
                OutputF.GROUP_ID: [],
                OutputF.CAT: [],
                OutputF.ROUGE: [],
                OutputF.REV: []
            })
        rouge_evaluator = Rouge()
        skipped_summs = 0

        for batch in self.data_pipeline.iter(**data_source):
            # notice that each product has K true summaries created by
            # annotators
            true_summs = batch[ModelF.SUMMS]
            prod_ids = batch[ModelF.SUMM_GROUP_ID]
            cats = batch[ModelF.SUMM_CAT]

            # getting group reviews that were used as input to produce summaries
            inp_revs = self.revs_formatter_func(batch[ModelF.REV])
            group_rev_indxs = batch[ModelF.GROUP_REV_INDXS]
            group_rev_indxs_mask = batch[ModelF.GROUP_REV_INDXS_MASK]
            group_revs = get_group_reviews(inp_revs, group_rev_indxs,
                                           group_rev_indxs_mask)

            gen_summs = self.summs_gen_func(batch)

            assert (len(true_summs) == len(gen_summs))

            # accumulating ROUGE statistics
            res = []
            for gen_summ, _true_summs in zip(gen_summs, true_summs):

                if len(gen_summ) == 0:
                    skipped_summs += 1
                    res.append(None)
                    continue

                # extra [] wrapping is needed as the accum method is batch based
                r_avg, _, r_max, _ = rouge_evaluator.accum(
                    hypotheses=[gen_summ], references=[_true_summs])
                if self.avg_rouge:
                    curr_rouge = r_avg
                else:
                    curr_rouge = r_max
                res.append(curr_rouge)

            # splitting by sentence for better visualization
            if self.sent_splitter:
                group_revs = self.split_group_seqs_by_sents(group_revs)
                true_summs = self.split_group_seqs_by_sents(true_summs)
                gen_summs = [self.sent_splitter(summ) for summ in gen_summs]

            # storing the output batch for later dumping
            output_dc[OutputF.GOLD_SUMMS] += true_summs
            output_dc[OutputF.GEN_SUMM] += gen_summs
            output_dc[OutputF.REV] += group_revs
            output_dc[OutputF.CAT] += list(cats)
            output_dc[OutputF.GROUP_ID] += list(prod_ids)
            output_dc[OutputF.ROUGE] += res

        # running analytics
        if self.analytics_func:
            if self.sent_splitter:
                # performing a preliminary merge of sentences
                summs_to_analyze = [
                    " ".join(sents) for sents in output_dc[ModelF.GEN_SUMM]
                ]
            else:
                summs_to_analyze = output_dc[ModelF.GEN_SUMM]
            res = self.analytics_func(summs_to_analyze)
            logger.info("Ran analytics of generated summaries.")
            logger.info(" ".join("%s: %.3f" % (k, v) for k, v in res))

        final_metrs = rouge_evaluator.aggr(avg=self.avg_rouge)
        logger.info("ROUGE scores (avg_rouge=%s): " % self.avg_rouge)
        for k, v in final_metrs.items():
            logger.info("%s based avg. %s." % (k, metrics_to_str(v)))

        # this is a safe way to make proper arrays
        for k in output_dc:
            l = len(output_dc[k])
            cont = np.zeros(l, dtype='object')
            for indx in range(l):
                cont[indx] = output_dc[k][indx]
            output_dc[k] = cont

        if output_file_path:
            gr_fields = [OutputF.CAT, OutputF.GROUP_ID]
            safe_mkfdir(output_file_path)
            output_file = codecs.open(output_file_path, 'w')
            output_dc.to_json(f=output_file, grouping_fnames=gr_fields)
            logger.info("Wrote the eval output to: "
                        "'%s'." % output_file_path)
        logger.info("Not generated %d summaries." % skipped_summs)
Example #13
    def eval(self, data_source, out_file_path=None):
        """
        Assumes that batches contain SUMMS that are lists of sublists, where
        each sublist contains a fixed number of summary strings. I.e.,
        summaries should not be tokenized.
        """
        output_dc = DataChunk(
            **{
                OutDataF.GOLD_SUMMS: [],
                OutDataF.GEN_SUMM: [],
                OutDataF.GROUP_ID: [],
                OutDataF.CAT: [],
                OutDataF.ROUGE: [],
                OutDataF.PROPS: [],
                OutDataF.PRED_PROPS: [],
                OutDataF.INP_REV: []
            })
        rouge_evaluator = GoogleRouge()

        prop_old_new_fnames = {
            ModelF.LEN_PROP: OutDataF.LEN_PROP,
            ModelF.RATING_PROP: OutDataF.RATING_PROP,
            ModelF.ROUGE_PROP: OutDataF.ROUGE_PROP,
            ModelF.POV_PROP: OutDataF.POV_PROP
        }

        skipped_summs = 0
        gen_summs_coll = []

        for batch in self.data_pipeline.iter(**data_source):
            # notice that each product has K true summaries created by
            # annotators
            inp_revs = batch[ModelF.REV].numpy()
            true_summs = batch[ModelF.SUMMS]
            group_ids = list(batch[ModelF.GROUP_ID])
            cats = list(batch[ModelF.CAT])
            group_rev_indxs = batch[ModelF.GROUP_REV_INDXS].numpy()
            group_rev_indxs_mask = batch[ModelF.GROUP_REV_INDXS_MASK].numpy()

            gen_summ, pred_props = self.summs_gen_func(batch)
            assert (len(true_summs) == len(gen_summ))

            # the collection below will be used for analytics
            gen_summs_coll += gen_summ

            #  props
            props = {
                n: batch[o].tolist()
                for o, n in prop_old_new_fnames.items()
            }
            props = dct_list_to_list_dict(props)
            output_dc[OutDataF.PROPS] += props
            if len(pred_props):
                pred_props = dct_list_to_list_dict(pred_props)
                output_dc[OutDataF.PRED_PROPS] += pred_props

            # accumulating ROUGE statistics
            rouge_scores = []
            for _gen_summ, _tr_summs in zip(gen_summ, true_summs):
                if len(_gen_summ) == 0:
                    skipped_summs += 1
                    rouge_scores.append(None)
                    continue

                # extra [] wrapping is needed as the accum method is batch based
                r_score = rouge_evaluator.accum(hyp=[_gen_summ],
                                                refs=[_tr_summs])[0]
                rouge_scores.append(r_score)

            # grouping and formatting
            if self.rev_formatter_func is not None:
                inp_revs = [self.rev_formatter_func(seq) for seq in inp_revs]
            group_revs = get_group_reviews(inp_revs, group_rev_indxs,
                                           group_rev_indxs_mask)

            if self.summ_formatter_func is not None:
                true_summs = [[self.summ_formatter_func(s) for s in _summs]
                              for _summs in true_summs]
                gen_summ = [self.summ_formatter_func(s) for s in gen_summ]

            # storing the output batch for later dumping
            output_dc[OutDataF.GOLD_SUMMS] += true_summs
            output_dc[OutDataF.GEN_SUMM] += gen_summ
            output_dc[OutDataF.INP_REV] += group_revs
            output_dc[OutDataF.CAT] += cats
            output_dc[OutDataF.GROUP_ID] += group_ids
            output_dc[OutDataF.ROUGE] += rouge_scores

        # some models don't output predicted props; this condition handles that
        if not len(output_dc[OutDataF.PRED_PROPS]):
            del output_dc[OutDataF.PRED_PROPS]

        # running analytics
        if self.analytics_func:
            formatted_true_summs = []
            for seq_coll in output_dc[OutDataF.GOLD_SUMMS]:
                for seq in seq_coll:
                    seq = seq if not isinstance(seq, list) else " ".join(seq)
                    formatted_true_summs.append(seq)
            an_scores = self.analytics_func(formatted_true_summs)
            form_an_scores = format_stats(an_scores,
                                          title="True Text Analytics")
            for s in form_an_scores:
                logger.info(s)

            an_scores = self.analytics_func(gen_summs_coll)
            form_an_scores = format_stats(an_scores,
                                          title="Gen. Text Analytics")
            for s in form_an_scores:
                logger.info(s)

        final_metrs = rouge_evaluator.aggr()
        form_final_metrs = format_stats(final_metrs, title="ROUGE Scores")
        for s in form_final_metrs:
            logger.info(s)

        if out_file_path:
            gr_fields = [OutDataF.CAT, OutDataF.GROUP_ID]
            safe_mkfdir(out_file_path)
            output_file = codecs.open(out_file_path, 'w')
            output_dc.to_json(f=output_file, grouping_fnames=gr_fields)
            logger.info("Wrote the eval output to: " "'%s'." % out_file_path)
        logger.info("Not generated %d summaries." % skipped_summs)
Example #14
    def gen_and_save_summs(self, data_source, output_file_path):
        """
        Generates summaries by running the model and writes them along with other
        attributes to a json file.

        :param data_source: self-explanatory.
        :param output_file_path: self-explanatory.
        """
        safe_mkfdir(output_file_path)
        start_id = self.word_vocab[START].id
        end_id = self.word_vocab[END].id
        pad_id = self.word_vocab[PAD].id
        output_file = open(output_file_path, encoding='utf-8', mode='w')
        vocab_mapper = VocabMapper(
            {
                ModelF.REV: self.word_vocab,
                ModelF.GEN_SUMM: self.word_vocab,
                ModelF.GEN_REV: self.word_vocab
            },
            symbols_attr='token')
        chunk_coll = []

        for i, dc in enumerate(self.val_data_pipeline.iter(**data_source), 1):
            gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

            # converting to the data-chunk to use the internal writing
            # mechanism
            new_dc = DataChunk()
            for fn in [
                    ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                    ModelF.GROUP_ID
            ]:
                new_dc[fn] = dc[fn]
            new_dc[ModelF.GEN_REV] = gen_revs
            new_dc[ModelF.GEN_SUMM] = gen_summ

            seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]
            # converting PyTorch tensors to numpy arrays if present
            new_dc = convert_tensors_to_numpy(new_dc)
            for fn in seq_fnames:
                new_dc[fn] = format_seqs(new_dc[fn],
                                         start_id=start_id,
                                         end_id=end_id,
                                         pad_id=pad_id)
            new_dc = vocab_mapper(new_dc)

            # convert all seqs to strings
            for fn in seq_fnames:
                new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

            # group by product ids
            indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                       new_dc[ModelF.GROUP_ID]).values()

            for fn in [ModelF.GEN_REV, ModelF.REV]:
                new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

            del new_dc[ModelF.GROUP_ID]

            chunk_coll.append(new_dc)

        output_chunk = concat_chunks(*chunk_coll)

        output_chunk.to_json(
            f=output_file,
            grouping_fnames=[ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID])

        logger.info("Generated summaries and saved to: '%s'."
                    "" % output_file_path)

        # analytics for repetition checking: gen summs contain lists of
        # strings, so they are merged together before running analytics
        all_gen_summ_strs = [
            " ".join(sents) for sents in output_chunk[ModelF.GEN_SUMM]
        ]

        an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                        tokenizer=self.tok_func,
                                        sent_splitter=self.sent_split_func,
                                        n_grams_to_comp=(2, 3, 4))

        logger.info("Ran analytics of generated summaries.")
        metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
        logger.info(metrs_str)
Example #15
    def gen_seqs(self, data_source, out_file_path, **kwargs):
        """Generates sequences and saves them to a file."""
        assert self.seq_gen_pipeline is not None
        empty_cache()

        safe_mkfdir(out_file_path)
        logger.info(f'Generating conditional sequences/summaries for '
                    f'{data_source}.')

        # storing to a data-chunk and dumping to the storage
        output_dc = DataChunk()
        fields_to_init = [
            OutDataF.GROUP_ID, OutDataF.INP_REV, OutDataF.GEN_REV,
            OutDataF.REV_INDX, OutDataF.INP_REV_RATING, OutDataF.INP_REV_LEN,
            OutDataF.GEN_REV_LEN, OutDataF.PROPS, OutDataF.PRED_PROPS
        ]
        for fname in fields_to_init:
            output_dc[fname] = []

        prop_old_new_fnames = {
            ModelF.LEN_PROP: OutDataF.LEN_PROP,
            ModelF.RATING_PROP: OutDataF.RATING_PROP,
            ModelF.ROUGE_PROP: OutDataF.ROUGE_PROP,
            ModelF.POV_PROP: OutDataF.POV_PROP
        }

        for batch in self.seq_gen_pipeline.iter(**data_source):

            rev_group_id = batch[ModelF.REV_GROUP_ID]
            rev_indx = _comp_rev_indx(rev_group_id)
            rating = batch[ModelF.REV_RATING]
            inp_rev = batch[ModelF.REV].numpy()

            gen_rev, pred_props = self.imodel.generate(batch=batch, **kwargs)

            # post-processing
            if self.gen_seq_postproc:
                inp_rev = [self.gen_seq_postproc(seq) for seq in inp_rev]
                gen_rev = [self.gen_seq_postproc(seq) for seq in gen_rev]

            output_dc[OutDataF.GROUP_ID] += rev_group_id
            output_dc[OutDataF.INP_REV] += inp_rev
            output_dc[OutDataF.GEN_REV] += gen_rev
            output_dc[OutDataF.INP_REV_RATING] += rating
            output_dc[OutDataF.REV_INDX] += rev_indx

            # for analytics
            inp_seq_len = [_comp_seq_len(seq) for seq in inp_rev]
            gen_seq_len = [_comp_seq_len(seq) for seq in gen_rev]
            output_dc[OutDataF.INP_REV_LEN] += inp_seq_len
            output_dc[OutDataF.GEN_REV_LEN] += gen_seq_len

            #  props
            props = {
                n: batch[o].tolist()
                for o, n in prop_old_new_fnames.items()
            }
            props = dct_list_to_list_dict(props)
            output_dc[OutDataF.PROPS] += props
            if len(pred_props):
                pred_props = dct_list_to_list_dict(pred_props)
                output_dc[OutDataF.PRED_PROPS] += pred_props

        # some models don't output predicted props; this condition handles that
        if not len(output_dc[OutDataF.PRED_PROPS]):
            del output_dc[OutDataF.PRED_PROPS]

        # running analytics of text
        if self.seq_analytics:
            for text_fname in [OutDataF.INP_REV, OutDataF.GEN_REV]:
                formatted_seqs = []
                for seq in output_dc[text_fname]:
                    seq = seq if not isinstance(seq, list) else " ".join(seq)
                    formatted_seqs.append(seq)
                fscores = format_stats(
                    self.seq_analytics(formatted_seqs),
                    f"`{text_fname.upper()}` Text Analytics")
                for s in fscores:
                    logger.info(s)

        output_dc.to_json(
            f=codecs.open(out_file_path, 'w', 'utf-8'),
            grouping_fnames=[OutDataF.GROUP_ID, OutDataF.REV_INDX])
        logger.info("Generated sequences and saved to: '%s'." % out_file_path)