Example #1
    def _forced_decode(
            self,
            generator: 'models.GeneratorModel',
            src_file: str,
            ref_file: str,
            batcher: Optional[batchers.Batcher] = None,
            max_src_len: Optional[int] = None,
            assert_scores: Optional[Sequence[numbers.Real]] = None) -> None:
        """
    Perform forced decoding.

    Args:
      generator: generator model to use
      src_file: a file of src-side inputs to generate outputs for
      ref_file: path of file with reference translations
      batcher: necessary with some cases of input pre-processing such as padding or truncation
      max_src_len: if given, skip inputs that are too long
      assert_scores: if given, raise exception if the scores for generated outputs don't match the given scores
    """
        src_in = generator.src_reader.read_sents(src_file)

        # If we have a "assert scores" list return it, otherwise return "None" infinitely
        assert_in = assert_scores if assert_scores else iter(lambda: None, 1)

        # Reporting is enabled if any reporters are defined
        is_reporting = self.reporter is not None
        event_trigger.set_reporting(is_reporting)

        # Decode the source sentences batch by batch
        src_batch, assert_batch = [], []
        for src_line, assert_line in islice(
                zip(src_in, assert_in), self.max_num_sents):
            src_batch.append(src_line)
            assert_batch.append(assert_line)
            if len(src_batch) == batcher.batch_size:
                self._forced_decode_one_batch(generator, batcher, src_batch,
                                              assert_batch, max_src_len)
                src_batch, assert_batch = [], []
        if len(src_batch) != 0:
            self._forced_decode_one_batch(generator, batcher, src_batch,
                                          assert_batch, max_src_len)

        # Finishing up
        try:
            if is_reporting:
                self._conclude_report()
        finally:
            # Make sure reporting is switched off again once decoding is done
            event_trigger.set_reporting(False)
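
A side note on the iter(lambda: None, 1) idiom above: the two-argument form iter(callable, sentinel) keeps calling the callable until it returns the sentinel, and since lambda: None never returns 1, the result is an endless stream of None values. zip() then pairs every source line with None when no assert scores were supplied. A minimal, self-contained sketch of the idiom (the variable names here are illustrative, not from the codebase):

# iter(callable, sentinel) calls the callable until it returns the sentinel;
# lambda: None never returns 1, so this iterator yields None forever.
src_sents = ['a b c', 'd e', 'f']   # illustrative source lines
assert_scores = None                # no per-sentence scores given

assert_in = assert_scores if assert_scores else iter(lambda: None, 1)

# zip() stops at the shorter iterator, so the finite source list bounds
# the loop and every source line is paired with None.
for src_line, score in zip(src_sents, assert_in):
    print(src_line, score)          # 'a b c None', 'd e None', 'f None'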
Example #2
  def _generate_output(self, generator: 'models.GeneratorModel', src_file: str,
                       trg_file: str, batcher: Optional[batchers.Batcher] = None, max_src_len: Optional[int] = None,
                       forced_ref_file: Optional[str] = None,
                       assert_scores: Optional[Sequence[numbers.Real]] = None) -> None:
    """
    Generate outputs and write them to file.

    Args:
      generator: generator model to use
      src_file: a file of src-side inputs to generate outputs for
      trg_file: file to write outputs to
      batcher: necessary with some cases of input pre-processing such as padding or truncation
      max_src_len: if given, skip inputs that are too long
      forced_ref_file: if given, perform forced decoding with the given file of trg-side inputs
      assert_scores: if given, raise exception if the scores for generated outputs don't match the given scores
    """
    src_in = generator.src_reader.read_sents(src_file)
    # If a reference file / assert scores were given, use them; otherwise yield None indefinitely
    forced_ref_in = generator.trg_reader.read_sents(forced_ref_file) if forced_ref_file else iter(lambda: None, 1)
    assert_in = assert_scores if assert_scores else iter(lambda: None, 1)
    # Reporting is enabled if any reporters are defined
    is_reporting = self.reporter is not None
    event_trigger.set_reporting(is_reporting)
    # Write the translated output to the trg file
    with open(trg_file, 'wt', encoding='utf-8') as fp:
      src_batch, ref_batch, assert_batch = [], [], []
      for curr_sent_i, (src_line, ref_line, assert_line) in enumerate(zip(src_in, forced_ref_in, assert_in)):
        if self.max_num_sents and curr_sent_i >= self.max_num_sents:
          break
        src_batch.append(src_line)
        ref_batch.append(ref_line)
        assert_batch.append(assert_line)
        if len(src_batch) == batcher.batch_size:
          self._generate_one_batch(generator, batcher, src_batch, ref_batch, assert_batch, max_src_len, fp)
          src_batch, ref_batch, assert_batch = [], [], []
      if len(src_batch) != 0:
        self._generate_one_batch(generator, batcher, src_batch, ref_batch, assert_batch, max_src_len, fp)
    # Finishing up
    try:
      if is_reporting:
        self._conclude_report()
    finally:
      # Reporting is done in _generate_output only
      event_trigger.set_reporting(False)
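
Both methods above rely on the same accumulate-and-flush batching pattern: append lines until the batch reaches batcher.batch_size, process and reset it, then flush whatever remains after the loop so the final partial batch isn't dropped. A minimal sketch of the pattern in isolation, with a hypothetical process_batch() standing in for _generate_one_batch():

from typing import List

def process_batch(batch: List[str]) -> None:
    print('processing', batch)        # stand-in for the real decoding call

def batched_process(lines: List[str], batch_size: int) -> None:
    batch: List[str] = []
    for line in lines:
        batch.append(line)
        if len(batch) == batch_size:  # full batch: process it and reset
            process_batch(batch)
            batch = []
    if batch:                         # flush the trailing partial batch
        process_batch(batch)

batched_process(['s1', 's2', 's3', 's4', 's5'], batch_size=2)
# -> ['s1', 's2'], then ['s3', 's4'], then the partial batch ['s5']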
Example #3
    def _generate_output(self,
                         generator: 'models.GeneratorModel',
                         src_file: str,
                         trg_file: str,
                         batcher: Optional[batchers.Batcher] = None,
                         max_src_len: Optional[int] = None) -> None:
        """
    Generate outputs and write them to file.

    Args:
      generator: generator model to use
      src_file: a file of src-side inputs to generate outputs for
      trg_file: file to write outputs to
      batcher: necessary with some cases of input pre-processing such as padding or truncation
      max_src_len: if given, skip inputs that are too long
    """
        src_in = generator.src_reader.read_sents(src_file)

        # Reporting is enabled if any reporters are defined
        is_reporting = self.reporter is not None
        event_trigger.set_reporting(is_reporting)

        # Write the translated output to the trg file
        with open(trg_file, 'wt', encoding='utf-8') as fp:
            src_batch = []
            for src_line in islice(src_in, self.max_num_sents):
                src_batch.append(src_line)
                if len(src_batch) == batcher.batch_size:
                    self._generate_one_batch(generator, batcher, src_batch,
                                             max_src_len, fp)
                    src_batch = []
                if settings.PRETEND: break  # in PRETEND mode, stop after a single sentence
            if len(src_batch) != 0:
                self._generate_one_batch(generator, batcher, src_batch,
                                         max_src_len, fp)

        # Finishing up
        try:
            if is_reporting:
                self._conclude_report()
        finally:
            # Reporting is done in _generate_output only
            event_trigger.set_reporting(False)
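
The islice(...) call that caps the loop works whether or not a limit is configured: itertools.islice(iterable, stop) treats stop=None as "no limit", so an unset max_num_sents simply iterates over everything. A small standalone sketch of that behavior (the names are illustrative):

from itertools import islice

lines = ['l1', 'l2', 'l3', 'l4']

# stop=2 caps the iteration; stop=None means no cap at all.
for max_num_sents in (2, None):
    taken = list(islice(lines, max_num_sents))
    print(max_num_sents, taken)
# -> 2 ['l1', 'l2']
# -> None ['l1', 'l2', 'l3', 'l4']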