Example #1
 def load_lexicon(self):
     logger.info("Loading lexicon from file: " + self.lexicon_file)
     assert self.src_vocab.frozen
     assert self.trg_vocab.frozen
     lexicon = [{} for _ in range(len(self.src_vocab))]
     with open(self.lexicon_file, encoding='utf-8') as fp:
         for line in fp:
             try:
                 trg, src, prob = line.rstrip().split()
             except ValueError:
                 logger.warning("Failed to parse 'trg src prob' from: " +
                                line.strip())
                 continue
             trg_id = self.trg_vocab.convert(trg)
             src_id = self.src_vocab.convert(src)
             lexicon[src_id][trg_id] = float(prob)
     # Setting the rest of the weight to the unknown word
     for i in range(len(lexicon)):
         sum_prob = sum(lexicon[i].values())
         if sum_prob < 1.0:
             lexicon[i][self.trg_vocab.convert(
                 self.trg_vocab.unk_token)] = 1.0 - sum_prob
     # Overriding special tokens
     src_unk_id = self.src_vocab.convert(self.src_vocab.unk_token)
     trg_unk_id = self.trg_vocab.convert(self.trg_vocab.unk_token)
     lexicon[self.src_vocab.SS] = {self.trg_vocab.SS: 1.0}
     lexicon[self.src_vocab.ES] = {self.trg_vocab.ES: 1.0}
     # TODO(philip30): Not sure if this is intended
     lexicon[src_unk_id] = {trg_unk_id: 1.0}
     return lexicon
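For intuition, here is a minimal standalone sketch of the same normalization step, using a toy list-of-dicts lexicon and a hard-coded unknown-token id instead of xnmt's Vocab class (illustrative only):

# Hypothetical toy setup, not part of xnmt: redistribute leftover probability mass to <unk>.
UNK_ID = 0  # assumed id of the target unknown token

lexicon = [
    {3: 0.5, 7: 0.25},  # source word 0: translation probabilities sum to 0.75
    {5: 1.0},           # source word 1: already fully covered
]

for entry in lexicon:
    leftover = 1.0 - sum(entry.values())
    if leftover > 0.0:
        entry[UNK_ID] = entry.get(UNK_ID, 0.0) + leftover

print(lexicon)  # [{3: 0.5, 7: 0.25, 0: 0.25}, {5: 1.0}]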
Example #2
def print_cg_conditional() -> None:
    if settings.PRINT_CG_ON_ERROR:
        if xnmt.backend_dynet:
            import dynet as dy
            dy.print_text_graphviz()
        else:
            logger.warning("CG printing not implemented with Torch backend")
Example #3
 def calc_attention(self, state: dy.Expression) -> dy.Expression:
   logger.warning("BilinearAttender does currently not do masking, which may harm training results.")
   Wa = dy.parameter(self.pWa)
   scores = (dy.transpose(state) * Wa) * self.I
   normalized = dy.softmax(scores)
   self.attention_vecs.append(normalized)
   return dy.transpose(normalized)
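The score itself is just a bilinear form between the decoder state and each encoded source position; a small NumPy sketch of that computation (names mirror the snippet, but this is an illustration rather than xnmt code):

import numpy as np

hidden, src_len = 4, 6
state = np.random.randn(hidden)         # decoder state
Wa = np.random.randn(hidden, hidden)    # bilinear parameter matrix
I = np.random.randn(hidden, src_len)    # encoded source, one column per position

scores = state @ Wa @ I                             # unnormalized scores, shape (src_len,)
attention = np.exp(scores) / np.exp(scores).sum()   # softmax over source positions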
Example #4
 def update(self) -> None:
     """
 Update the parameters.
 """
     try:
         if not (self.skip_noisy and self._check_gradients_noisy()):
             self.optimizer.update()
         else:
             logger.info("skipping noisy update")
     except RuntimeError:
         logger.warning(
             "Failed to perform update. Skipping example and clearing gradients."
         )
         for subcol in ParamManager.param_col.subcols.values():
             for param in subcol.parameters_list():
                 param.scale_gradient(0)
Example #5
 def update(self) -> None:
   """
   Update the parameters.
   """
   self.global_step += 1
   if settings.USE_TENSORBOARD:
     tee.tensorboard_writer.add_scalars(name="lr", tag_scalar_dict={"lr": self.optimizer.learning_rate},
                                        global_step=self.global_step)
     if not self.skip_noisy:
       tee.tensorboard_writer.add_scalars(name="grad", tag_scalar_dict={"norm": np.exp(self.grad_log_norm())},
                                           global_step=self.global_step)
   try:
     if not (self.skip_noisy and self.check_gradients_noisy()):
       self.optimizer.update()
     else:
       logger.info("skipping noisy update")
   except RuntimeError:
     logger.warning("Failed to perform update. Skipping example and clearing gradients.")
     for subcol in ParamManager.param_col.subcols.values():
       for param in subcol.parameters_list():
         param.scale_gradient(0)
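Neither update() variant shows what the noisy-gradient check actually does. One common heuristic, given here only as a sketch under that assumption (not necessarily xnmt's exact criterion), is to track the log of the gradient norm and skip updates whose log-norm deviates too far from its recent mean:

import numpy as np

class NoisyGradientFilter:
    """Sketch: flag updates whose gradient log-norm is an outlier w.r.t. recent history."""
    def __init__(self, window=100, num_std=4.0):
        self.history = []
        self.window = window
        self.num_std = num_std

    def is_noisy(self, grad_norm: float) -> bool:
        log_norm = np.log(grad_norm + 1e-10)
        if len(self.history) >= self.window:
            mean, std = np.mean(self.history), np.std(self.history)
            if abs(log_norm - mean) > self.num_std * std:
                return True  # outlier: caller should skip this update
        self.history = (self.history + [log_norm])[-self.window:]
        return False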
Example #6
def main(overwrite_args=None):

    with tee.Tee(), tee.Tee(error=True):
        argparser = argparse.ArgumentParser()
        argparser.add_argument("--dynet-mem", type=str)
        argparser.add_argument("--dynet-seed",
                               type=int,
                               help="set random seed for DyNet and XNMT.")
        argparser.add_argument("--dynet-autobatch", type=int)
        argparser.add_argument("--dynet-devices", type=str)
        argparser.add_argument("--dynet-viz",
                               action='store_true',
                               help="use visualization")
        argparser.add_argument("--dynet-gpu",
                               action='store_true',
                               help="use GPU acceleration")
        argparser.add_argument("--dynet-gpu-ids", type=int)
        argparser.add_argument("--dynet-gpus", type=int)
        argparser.add_argument("--dynet-weight-decay", type=float)
        argparser.add_argument("--dynet-profiling", type=int)
        argparser.add_argument("--settings",
                               type=str,
                               default="standard",
                               help="settings (standard, debug, or unittest)"
                               "must be given in '=' syntax, e.g."
                               " --settings=standard")
        argparser.add_argument("experiments_file")
        argparser.add_argument("experiment_name",
                               nargs='*',
                               help="Run only the specified experiments")
        argparser.set_defaults(generate_doc=False)
        args = argparser.parse_args(overwrite_args)

        if args.dynet_seed:
            random.seed(args.dynet_seed)
            np.random.seed(args.dynet_seed)

        if args.dynet_gpu:
            if settings.CHECK_VALIDITY:
                settings.CHECK_VALIDITY = False
                log_preamble(
                    "disabling CHECK_VALIDITY because it is not supported on GPU currently",
                    logging.WARNING)

        config_experiment_names = YamlPreloader.experiment_names_from_file(
            args.experiments_file)

        results = []

        # Check ahead of time that all experiments exist, to avoid bad surprises
        experiment_names = args.experiment_name or config_experiment_names

        if args.experiment_name:
            nonexistent = set(experiment_names).difference(
                config_experiment_names)
            if len(nonexistent) != 0:
                raise Exception("Experiments {} do not exist".format(",".join(
                    list(nonexistent))))

        log_preamble(
            f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )
        for experiment_name in experiment_names:

            ParamManager.init_param_col()

            uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(
                args.experiments_file, experiment_name)

            logger.info(f"=> Running {experiment_name}")

            glob_args = uninitialized_exp_args.data.exp_global
            log_file = glob_args.log_file

            if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
                logger.warning(
                    f"log file {log_file} already exists, skipping experiment; please delete log file by hand if you want to overwrite it "
                    f"(or activate OVERWRITE_LOG, by either specifying an environment variable as OVERWRITE_LOG=1, "
                    f"or specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)"
                )
                continue

            tee.set_out_file(log_file)

            model_file = glob_args.model_file

            uninitialized_exp_args.data.exp_global.commandline_args = args

            # Create the model
            experiment = initialize_if_needed(uninitialized_exp_args)
            ParamManager.param_col.model_file = experiment.exp_global.model_file
            ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
            ParamManager.populate()

            # Run the experiment
            eval_scores = experiment(save_fct=lambda: save_to_file(
                model_file, experiment, ParamManager.param_col))
            results.append((experiment_name, eval_scores))
            print_results(results)

            tee.unset_out_file()
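Because main() accepts overwrite_args, the same entry point can be driven programmatically; a hedged usage sketch with placeholder file and experiment names:

# Roughly equivalent to running the xnmt command-line entry point with these arguments.
main(overwrite_args=["--settings=debug", "experiments.yaml", "my-experiment"])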
Example #7
 def on_html_report(self, context=None):
     logger.warning("Unimplemented html report for retriever!")
     idx, src_words, scores, kbest = self.html_input
     html = etree.Element('html')
     # TODO(philip30): Write the logic of retriever html here
     return html
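The method currently returns an empty <html> element; a hedged sketch of how a body could be filled in with lxml.etree once the TODO is implemented (the report structure and candidate data are made up for illustration):

from lxml import etree

html = etree.Element('html')
body = etree.SubElement(html, 'body')
etree.SubElement(body, 'h1').text = 'Retrieval report'
ul = etree.SubElement(body, 'ul')
for rank, (cand_id, score) in enumerate([(12, 0.83), (7, 0.61)], start=1):
    etree.SubElement(ul, 'li').text = f'#{rank}: candidate {cand_id} (score {score:.2f})'
print(etree.tostring(html, pretty_print=True).decode())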
Example #8
    def __call__(self,
                 generator,
                 src_file=None,
                 trg_file=None,
                 candidate_id_file=None):
        """
    Args:
      generator (GeneratorModel): the model to be used
      src_file (str): path of input src file to be translated
      trg_file (str): path of file where trg translations will be written
      candidate_id_file (str): if we are doing something like retrieval where we select from fixed candidates, sometimes we want to limit our candidates to a certain subset of the full set. this setting allows us to do this.
    """
        args = dict(src_file=src_file or self.src_file,
                    trg_file=trg_file or self.trg_file,
                    ref_file=self.ref_file,
                    max_src_len=self.max_src_len,
                    post_process=self.post_process,
                    candidate_id_file=candidate_id_file,
                    report_path=self.report_path,
                    report_type=self.report_type,
                    beam=self.beam,
                    max_len=self.max_len,
                    len_norm_type=self.len_norm_type,
                    mode=self.mode)

        is_reporting = issubclass(
            generator.__class__,
            Reportable) and args["report_path"] is not None
        # Corpus
        src_corpus = list(generator.src_reader.read_sents(args["src_file"]))
        # Get reference if it exists and is necessary
        if args["mode"] == "forced" or args["mode"] == "forceddebug" or args[
                "mode"] == "score":
            if args["ref_file"] == None:
                raise RuntimeError(
                    "When performing {} decoding, must specify reference file".
                    format(args["mode"]))
            score_src_corpus = []
            ref_corpus = []
            with open(args["ref_file"], "r", encoding="utf-8") as fp:
                for line in fp:
                    if args["mode"] == "score":
                        nbest = line.split("|||")
                        assert len(
                            nbest
                        ) > 1, "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
                        src_index = int(nbest[0].strip())
                        assert src_index < len(
                            src_corpus
                        ), "The src_file has only {} instances, nbest file has invalid src_index {}".format(
                            len(src_corpus), src_index)
                        score_src_corpus.append(src_corpus[src_index])
                        trg_input = generator.trg_reader.read_sent(
                            nbest[1].strip())
                    else:
                        trg_input = generator.trg_reader.read_sent(line)
                    ref_corpus.append(trg_input)
            if args["mode"] == "score":
                src_corpus = score_src_corpus
            else:
                if self.max_len and any(
                        len(s) > self.max_len for s in ref_corpus):
                    logger.warning(
                        "Forced decoding with some targets being longer than max_len. Increase max_len to avoid unexpected behavior."
                    )
        else:
            ref_corpus = None
        # Vocab
        src_vocab = generator.src_reader.vocab if hasattr(
            generator.src_reader, "vocab") else None
        trg_vocab = generator.trg_reader.vocab if hasattr(
            generator.trg_reader, "vocab") else None
        # Perform initialization
        generator.set_train(False)
        generator.initialize_generator(**args)

        if hasattr(generator, "set_post_processor"):
            generator.set_post_processor(self.get_output_processor())
        if hasattr(generator, "set_trg_vocab"):
            generator.set_trg_vocab(trg_vocab)
        if hasattr(generator, "set_reporting_src_vocab"):
            generator.set_reporting_src_vocab(src_vocab)

        if is_reporting:
            generator.set_report_resource("src_vocab", src_vocab)
            generator.set_report_resource("trg_vocab", trg_vocab)

        # If we're debugging, calculate the loss for each target sentence
        ref_scores = None
        if args["mode"] == 'forceddebug' or args["mode"] == 'score':
            some_batcher = xnmt.batcher.InOrderBatcher(32)  # Arbitrary
            if not isinstance(some_batcher, xnmt.batcher.InOrderBatcher):
                raise ValueError(
                    f"forceddebug requires InOrderBatcher, got: {some_batcher}"
                )
            batched_src, batched_ref = some_batcher.pack(
                src_corpus, ref_corpus)
            ref_scores = []
            for src, ref in zip(batched_src, batched_ref):
                dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                loss_expr = generator.calc_loss(
                    src, ref, loss_calculator=LossCalculator())
                if isinstance(loss_expr.value(), Iterable):
                    ref_scores.extend(loss_expr.value())
                else:
                    ref_scores.append(loss_expr.value())
            ref_scores = [-x for x in ref_scores]

        # Make the parent directory if necessary
        make_parent_dir(args["trg_file"])

        # Perform generation of output
        if args["mode"] != 'score':
            with open(args["trg_file"], 'wt', encoding='utf-8'
                      ) as fp:  # Saving the translated output to a trg file
                src_ret = []
                for i, src in enumerate(src_corpus):
                    # This is necessary when the batcher does some sort of pre-processing, e.g.
                    # when the batcher pads to a particular number of dimensions
                    if self.batcher:
                        self.batcher.add_single_batch(src_curr=[src],
                                                      trg_curr=None,
                                                      src_ret=src_ret,
                                                      trg_ret=None)
                        src = src_ret.pop()[0]
                    # Do the decoding
                    if args["max_src_len"] is not None and len(
                            src) > args["max_src_len"]:
                        output_txt = NO_DECODING_ATTEMPTED
                    else:
                        dy.renew_cg(
                            immediate_compute=settings.IMMEDIATE_COMPUTE,
                            check_validity=settings.CHECK_VALIDITY)
                        ref_ids = ref_corpus[i] if ref_corpus is not None else None
                        output = generator.generate_output(
                            src, i, forced_trg_ids=ref_ids)
                        # If debugging forced decoding, make sure it matches
                        if ref_scores is not None and (
                                abs(output[0].score - ref_scores[i]) /
                                abs(ref_scores[i])) > 1e-5:
                            logger.error(
                                f'Forced decoding score {output[0].score} and loss {ref_scores[i]} do not match at sentence {i}'
                            )
                        output_txt = output[0].plaintext
                    # Printing to trg file
                    fp.write(f"{output_txt}\n")
        else:
            with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
                with open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
                    for nbest, score in zip(nbest_fp, ref_scores):
                        fp.write("{} ||| score={}\n".format(
                            nbest.strip(), score))
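In "score" mode the reference file is read in nbest format, one candidate per line, where the leading index refers to a line of src_file (see the assertion above); a minimal illustrative example of such a file:

0 ||| the cat sat on the mat
0 ||| a cat sits on the mat
1 ||| he opened the window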
Example #9
  def generate_output(self, translator, initial_state,
                      src_length=None, forced_trg_ids=None):
    # TODO(philip30): can only do single decoding, not batched
    assert forced_trg_ids is None or self.beam_size == 1
    if forced_trg_ids is not None and forced_trg_ids.sent_len() > self.max_len:
      logger.warning("Forced decoding with a target longer than max_len. "
                     "Increase max_len to avoid unexpected behavior.")

    active_hyp = [self.Hypothesis(0, None, None, None)]
    completed_hyp = []
    for length in range(self.max_len):
      if len(completed_hyp) >= self.beam_size:
        break
      # Expand hyp
      new_set = []
      for hyp in active_hyp:
        if length > 0:
          prev_word = hyp.word
          prev_state = hyp.output.state
        else:
          prev_word = None
          prev_state = initial_state
        if prev_word == Vocab.ES:
          completed_hyp.append(hyp)
          continue
        current_output = translator.generate_one_step(prev_word, prev_state)
        score = current_output.logsoftmax.npvalue().transpose()
        if self.scores_proc:
          self.scores_proc(score)
        # Next Words
        if forced_trg_ids is None:
          top_words = np.argpartition(score, max(-len(score),-self.beam_size))[-self.beam_size:]
        else:
          top_words = [forced_trg_ids[length]]
        # Queue next states
        for cur_word in top_words:
          new_score = self.len_norm.normalize_partial_topk(hyp.score, score[cur_word], length + 1)
          new_set.append(self.Hypothesis(new_score, current_output, hyp, cur_word))
      # Next top hypothesis
      active_hyp = sorted(new_set, key=lambda x: x.score, reverse=True)[:self.beam_size]
    # No hypothesis reached </s>; fall back to the active hypotheses
    if len(completed_hyp) == 0:
      completed_hyp = active_hyp
    # Length Normalization
    normalized_scores = self.len_norm.normalize_completed(completed_hyp, src_length)
    hyp_and_score = sorted(list(zip(completed_hyp, normalized_scores)), key=lambda x: x[1], reverse=True)
    if self.one_best:
      hyp_and_score = [hyp_and_score[0]]
    # Backtracing + Packing outputs
    results = []
    for end_hyp, score in hyp_and_score:
      logsoftmaxes = []
      word_ids = []
      attentions = []
      states = []
      current = end_hyp
      while current.parent is not None:
        word_ids.append(current.word)
        attentions.append(current.output.attention)
        # TODO(philip30): This should probably be uncommented.
        # These 2 statements are overhead because they are needed only for reinforce and minrisk.
        # Furthermore, the attentions are only needed for reporting.
        # We should have a global flag to indicate whether this is needed or not.
        # The global flag would be set when certain objects are instantiated.
        #logsoftmaxes.append(dy.pick(current.output.logsoftmax, current.word))
        #states.append(translator.get_nobp_state(current.output.state))
        current = current.parent
      results.append(SearchOutput([list(reversed(word_ids))], [list(reversed(attentions))],
                                  [score], list(reversed(logsoftmaxes)),
                                  list(reversed(states)), None))
    return results
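The backtracking loop walks parent pointers from a finished hypothesis back to the root; a self-contained sketch of that idea, with a plain namedtuple standing in for the Hypothesis class used above (word ids are made up):

from collections import namedtuple

Hypothesis = namedtuple('Hypothesis', ['score', 'parent', 'word'])

root = Hypothesis(0.0, None, None)          # initial hypothesis holds no word
h1 = Hypothesis(-0.5, root, 4)
h2 = Hypothesis(-1.2, h1, 9)
end_hyp = Hypothesis(-1.9, h2, 2)           # e.g. 2 == </s>

word_ids = []
current = end_hyp
while current.parent is not None:           # stop at the root
    word_ids.append(current.word)
    current = current.parent
word_ids.reverse()
print(word_ids)  # [4, 9, 2]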
Example #10
def main(overwrite_args: Optional[Sequence[str]] = None) -> None:

    with tee.Tee(), tee.Tee(error=True):
        argparser = argparse.ArgumentParser()
        utils.add_backend_argparse(argparser)
        argparser.add_argument("--settings",
                               type=str,
                               default="standard",
                               help="settings (standard, debug, or unittest)"
                               "must be given in '=' syntax, e.g."
                               " --settings=standard")
        argparser.add_argument(
            "--resume",
            action='store_true',
            help="whether a saved experiment is being resumed, and"
            "locations of output files should be re-used.")
        argparser.add_argument("--backend",
                               type=str,
                               default="dynet",
                               help="backend (dynet or torch)")
        argparser.add_argument("experiments_file")
        argparser.add_argument("experiment_name",
                               nargs='*',
                               help="Run only the specified experiments")
        argparser.set_defaults(generate_doc=False)
        args = argparser.parse_args(overwrite_args)

        if xnmt.backend_dynet and args.dynet_seed: args.seed = args.dynet_seed
        if getattr(args, "seed", None):
            random.seed(args.seed)
            np.random.seed(args.seed)
            if xnmt.backend_torch: torch.manual_seed(0)

        if xnmt.backend_dynet and args.dynet_gpu and settings.CHECK_VALIDITY:
            settings.CHECK_VALIDITY = False
            log_preamble(
                "disabling CHECK_VALIDITY because it is not supported in the DyNet/GPU setting",
                logging.WARNING)

        config_experiment_names = YamlPreloader.experiment_names_from_file(
            args.experiments_file)

        results = []

        # Check ahead of time that all experiments exist, to avoid bad surprises
        experiment_names = args.experiment_name or config_experiment_names

        if args.experiment_name:
            nonexistent = set(experiment_names).difference(
                config_experiment_names)
            if len(nonexistent) != 0:
                raise Exception("Experiments {} do not exist".format(",".join(
                    list(nonexistent))))

        log_preamble(
            f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} with {'DyNet' if xnmt.backend_dynet else 'PyTorch'} on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )
        for experiment_name in experiment_names:

            ParamManager.init_param_col()

            uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(
                args.experiments_file, experiment_name, resume=args.resume)

            logger.info(f"=> Running {experiment_name}")

            glob_args = uninitialized_exp_args.data.exp_global
            log_file = glob_args.log_file

            if not settings.OVERWRITE_LOG:
                log_files_exist = []
                if os.path.isfile(log_file): log_files_exist.append(log_file)
                if os.path.isdir(log_file + ".tb"):
                    log_files_exist.append(log_file + ".tb/")
                if log_files_exist:
                    logger.warning(
                        f"log file(s) {' '.join(log_files_exist)} already exists, skipping experiment; "
                        f"please delete log file by hand if you want to overwrite it "
                        f"(or activate OVERWRITE_LOG, by either specifying an environment variable OVERWRITE_LOG=1, "
                        f"or specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)"
                    )
                    continue
            elif settings.OVERWRITE_LOG and os.path.isdir(log_file + ".tb"):
                shutil.rmtree(
                    log_file + ".tb/"
                )  # remove tensorboard logs from previous run that is being overwritten

            tee.set_out_file(log_file, exp_name=experiment_name)

            try:

                model_file = glob_args.model_file

                uninitialized_exp_args.data.exp_global.commandline_args = vars(
                    args)

                # Create the model
                experiment = initialize_if_needed(uninitialized_exp_args)
                ParamManager.param_col.model_file = experiment.exp_global.model_file
                ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
                ParamManager.populate()

                # Run the experiment
                eval_scores = experiment(
                    save_fct=lambda: save_to_file(model_file, experiment))
                results.append((experiment_name, eval_scores))
                print_results(results)

            except Exception as e:
                file_logger.error(traceback.format_exc())
                raise e
            finally:
                tee.unset_out_file()
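A hedged usage sketch of this newer entry point, exercising the --resume flag it adds over the older one (file and experiment names are placeholders; per the help text above, --resume re-uses the output locations of a saved experiment):

main(overwrite_args=["--resume", "experiments.yaml", "my-experiment"])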