Example #1
 def write_hypos(self, all_hypos, sen_indices=None):
     """Writes the hypotheses in ``all_hypos`` to ``path`` """
     if self.f is not None:
         for hypos in all_hypos:
             self.f.write(utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                               self.trg_wmap))
             self.f.write("\n")
             self.f.flush()
     else:
         with codecs.open(self.path, "w", encoding='utf-8') as f:
             for hypos in all_hypos:
                 f.write(utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                              self.trg_wmap))
                 f.write("\n")
                 f.flush()
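Note: utils.apply_trg_wmap is used throughout these examples to turn a list of target word IDs back into a readable string. A minimal sketch of such a helper, assuming trg_wmap is a plain {id: token} dictionary (an illustration, not the SGNMT implementation):

def apply_trg_wmap(trgt_sentence, trg_wmap=None):
    # Hypothetical sketch: map each target word ID to its surface token and
    # join with spaces; fall back to the raw ID if no word map is given or
    # the ID is missing from it.
    if not trg_wmap:
        return ' '.join(str(w) for w in trgt_sentence)
    return ' '.join(trg_wmap.get(w, str(w)) for w in trgt_sentence)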
Example #2
def _process_inputs(sync_symbol=-1):
    """Helper method to support multiple input files."""
    inputfiles = [args.src_test]
    while True:
        inputfile = getattr(args, "src_test%d" % (len(inputfiles) + 1), None)
        if not inputfile:
            break
        inputfiles.append(inputfile)
    # Read all input files
    inputs_tmp = [[] for i in xrange(len(inputfiles))]
    for i in xrange(len(inputfiles)):
        with codecs.open(inputfiles[i], encoding='utf-8') as f:
            for line in f:
                #                logging.debug(u'utils.apply_src_wmap(sync_symbol): {}'.format(utils.apply_trg_wmap([sync_symbol])))
                if utils.apply_trg_wmap([sync_symbol])[0] == " ":
                    inputs_tmp[i].append([
                        c.replace('|', ' ')
                        for c in line.strip().replace('   ', ' | ').split()
                    ])
                    # logging.debug(u'line: {}'.format(line))
                    # logging.debug(u'mapped: {}'.format([c.replace('|',' ') for c in line.strip().replace('   ',' | ').split()]))
                else:
                    inputs_tmp[i].append(line.strip().split())

    # Gather multiple input sentences for each line
    inputs = []
    for i in xrange(len(inputs_tmp[0])):
        input_lst = []
        for j in xrange(len(inputfiles)):
            input_lst.append(inputs_tmp[j][i])
        inputs.append(input_lst)
    return inputs
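The space-handling branch above relies on a small trick: when the sync symbol maps to a literal space, a run of three spaces in the input marks that symbol, so it is first protected as ' | ' and the placeholder is turned back into a space after splitting. A standalone illustration with a made-up character-level line:

line = u"h e l l o   w o r l d"  # three spaces mark the space sync symbol
tokens = [c.replace('|', ' ')
          for c in line.strip().replace('   ', ' | ').split()]
# tokens == [u'h', u'e', u'l', u'l', u'o', u' ', u'w', u'o', u'r', u'l', u'd']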
Example #3
 def write_hypos(self, all_hypos, sen_indices=None):
     """Writes the hypotheses in ``all_hypos`` to ``path`` """
     if self.f is not None:
         for hypos in all_hypos:
             self.f.write(
                 utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                      self.trg_wmap))
             self.f.write("\n")
             self.f.flush()
     else:
         with codecs.open(self.path, "w", encoding='utf-8') as f:
             for hypos in all_hypos:
                 f.write(
                     utils.apply_trg_wmap(hypos[0].trgt_sentence,
                                          self.trg_wmap))
                 f.write("\n")
                 f.flush()
Example #4
 def decode(self, src_sentence):
     """Decodes a single source sentence using beam search. """
     self.initialize_predictors(src_sentence)
     hypos = [PartialHypothesis(self.get_predictor_states())]
     it = 0
     while self.stop_criterion(hypos):
         if it > self.max_len:  # prevent infinite loops
             break
         it = it + 1
         next_hypos = []
         next_scores = []
         self.min_score = utils.NEG_INF
         self.best_scores = []
         print("HYPOS")
         for hypo in hypos:
             print(
                 "it%d: %s (%f)" %
                 (it, utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
         for hypo in hypos:
             print("H: %s (%f)" %
                   (utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
             if hypo.get_last_word() == utils.EOS_ID:
                 next_hypos.append(hypo)
                 next_scores.append(self._get_combined_score(hypo))
                 continue
             for next_hypo in self._expand_hypo(hypo):
                 next_score = self._get_combined_score(next_hypo)
                 if next_score > self.min_score:
                     next_hypos.append(next_hypo)
                     next_scores.append(next_score)
                     self._register_score(next_score)
         if self.hypo_recombination:
             hypos = self._filter_equal_hypos(next_hypos, next_scores)
         else:
             hypos = self._get_next_hypos(next_hypos, next_scores)
     for hypo in hypos:
         if hypo.get_last_word() == utils.EOS_ID:
             self.add_full_hypo(hypo.generate_full_hypothesis())
     if not self.full_hypos:
         logging.warn("No complete hypotheses found for %s" % src_sentence)
         for hypo in hypos:
             self.add_full_hypo(hypo.generate_full_hypothesis())
     return self.get_full_hypos_sorted()
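The pruning in this decoder is delegated to helpers that are not shown in these examples. A rough sketch of what _get_next_hypos plausibly does, keeping only the beam_size highest-scoring expansions (an assumption about the actual SGNMT implementation, shown for illustration only):

import numpy as np

def _get_next_hypos(self, hypos, scores):
    # Hypothetical method sketch: keep the beam_size highest-scoring
    # hypotheses, best first. Details of the real implementation may differ.
    inds = np.argsort(scores)[-self.beam_size:][::-1]
    return [hypos[i] for i in inds]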
Example #5
 def write_hypos(self, all_hypos, sen_indices):
     """Writes the hypotheses in ``all_hypos`` to ``path`` """
     with codecs.open(self.path, "w", encoding='utf-8') as f:
         n_predictors = len(self.predictor_names)
         for idx, hypos in zip(sen_indices, all_hypos):
             for hypo in hypos:
                 f.write("%d ||| %s ||| %s ||| %f" %
                         (idx,
                          utils.apply_trg_wmap(hypo.trgt_sentence,
                                               self.trg_wmap),
                          ' '.join("%s= %f" % (
                               self.predictor_names[i],
                               sum([s[i][0] for s in hypo.score_breakdown]))
                                   for i in xrange(n_predictors)),
                          hypo.total_score))
                 f.write("\n")
             idx += 1
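For illustration, each line written by this handler follows a Moses-style n-best format: sentence index, mapped target sentence, per-predictor scores, and the total score. With purely made-up values and two predictors named nmt and fst, one written line could look like:

0 ||| das ist ein Test </S> ||| nmt= -3.218051 fst= -1.734013 ||| -4.952064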
Example #6
    def consume(self, pred_id):
        """Feeds back ``pred_id`` to the decoder network. This includes
        embedding of ``pred_id``, running the attention network and update
        the recurrent decoder layer.
        """
        logging.debug(u'nmt consumed: {}'.format(
            utils.apply_trg_wmap([pred_id])))  #SGNMT

        self.consumed.append(pred_id)  #SGNMT

        inputs_id = [self.BEGIN] + self.consumed  #SGNMT
        initial_state = self.decoder.initial_state()  #SGNMT
        inputs_emb = [self.VOCAB_LOOKUP[c_id] for c_id in inputs_id]  #SGNMT
        states = initial_state.transduce(inputs_emb)  #SGNMT
        self.output_state = states[-1]  #SGNMT

        #        self.consume_next(pred_id)#NEW
        pass
Example #7
 def write_hypos(self, all_hypos, sen_indices):
     """Writes the hypotheses in ``all_hypos`` to ``path`` """
     with codecs.open(self.path, "w", encoding='utf-8') as f:
         n_predictors = len(self.predictor_names)
         for idx, hypos in zip(sen_indices, all_hypos):
             for hypo in hypos:
                 f.write(
                     "%d ||| %s ||| %s ||| %f" %
                     (idx,
                      utils.apply_trg_wmap(
                          hypo.trgt_sentence, self.trg_wmap), ' '.join(
                              "%s= %f" %
                              (self.predictor_names[i],
                               sum([s[i][0] for s in hypo.score_breakdown]))
                              for i in xrange(n_predictors)),
                      hypo.total_score))
                 f.write("\n")
             idx += 1
Example #8
def _process_input(sync_symbol=-1):
    """Helper method to support multiple input files. Handles sync symbol properly if it is space"""
    # Read the input file
    inputs_tmp = []
    with codecs.open(args.src_test, encoding='utf-8') as f:
        for line in f:
            #            logging.debug(u'utils.apply_src_wmap(sync_symbol): {}'.format(utils.apply_trg_wmap([sync_symbol])))
            if utils.apply_trg_wmap([sync_symbol])[0] == " ":
                inputs_tmp.append([
                    c.replace('|', ' ')
                    for c in line.strip().replace('   ', ' | ').split()
                ])
                # logging.debug(u'line: {}'.format(line))
                # logging.debug(u'mapped: {}'.format([c.replace('|',' ') for c in line.strip().replace('   ',' | ').split()]))
            else:
                inputs_tmp.append(line.strip().split())
    return inputs_tmp
Example #9
    def write_hypos(self, all_hypos):
        """Writes the hypotheses in ``all_hypos`` to ``path`` """
        n_predictors = len(self.predictor_names)
        idx = self.current_sen_id
        for hypos in all_hypos:
            for hypo in hypos:
                self.f.write("%d ||| %s ||| %s ||| %f" % (
                    idx,
                    utils.apply_trg_wmap(hypo.trgt_sentence, self.trg_wmap),
                    #                             ' '.join("%s=%f" % (
                    #                                  self.predictor_names[i],
                    #                                  sum([s[i][0] for s in hypo.score_breakdown]))
                    #                                      for i in xrange(n_predictors)),
                    ' '.join("%s" %
                             (sum([s[i][0] for s in hypo.score_breakdown]))
                             for i in xrange(n_predictors)),
                    hypo.total_score))
                self.f.write("\n")

    #                idx += 1
        self.current_sen_id += 1
        self.f.flush()
Example #10
    def decode(self, src_sentence):
        """This is a generalization to NMT ensembles of ``DynetNMTVanillaDecoder``.
                    
        Args:
        src_sentence (list): List of source word ids without <S> or
        </S> which make up the source sentence
        
        Returns:
        list. A list of ``Hypothesis`` instances ordered by their
        score.
        """
        dy.renew_cg()
        logging.debug(u'src_sentence: {}'.format(src_sentence))
        MAX_PRED_SEQ_LEN = 30  #3*len(src_sentence)
        beam_size = self.beam_size
        nmt_models = self.nmt_models

        #        nmt_vocab = nmt_models[0].vocab # same vocab file for all nmt_models!!
        #        BEGIN   = nmt_vocab.w2i[BEGIN_CHAR]
        BEGIN = utils.GO_ID
        STOP = utils.EOS_ID
        #        STOP   = nmt_vocab.w2i[STOP_CHAR]

        for m in nmt_models:
            m.initialize(src_sentence)
        states = [[m.s] * beam_size
                  for m in nmt_models]  # ensemble x beam matrix of states
        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        all_outputs = np.full(shape=(1, beam_size),
                              fill_value=BEGIN,
                              dtype=int)
        all_masks = np.ones_like(
            all_outputs, dtype=float)  # 1 while the last predicted symbol is not self.STOP
        all_costs = np.zeros_like(
            all_outputs, dtype=float)  # the cumulative cost of predictions

        for i in range(MAX_PRED_SEQ_LEN):
            if all_masks[-1].sum() == 0:
                logging.debug(u'check masks: {}'.format(all_masks[-1]))
                break

            # We carefully hack values of the `logprobs` array to ensure
            # that all finished sequences are continued with `eos_symbol`.
            logprobs_lst = []
            for j, m in enumerate(nmt_models):
                logprobs_m = -np.array([m.predict_next_(s) for s in states[j]
                                        ])  # beam_size x vocab_len
                logprobs_lst.append(logprobs_m)
            logprobs = np.sum(logprobs_lst, axis=0)
            # Take the last row of cumulative costs (beam_size x 1), keep the
            # logprob distributions of unfinished hypos only and add them
            # element-wise: a beam_size x vocab_len matrix of next costs.
            next_costs = (
                all_costs[-1, :, None] + logprobs * all_masks[-1, :, None])
            # Finished hypos keep all their cost on the self.STOP symbol.
            (finished,) = np.where(all_masks[-1] == 0)
            next_costs[finished, :STOP] = np.inf
            next_costs[finished, STOP + 1:] = np.inf

            # indexes - the hypos from the previous step to keep,
            # outputs - the next step predictions,
            # chosen_costs - the cost of each predicted symbol.
            (indexes,
             outputs), chosen_costs = DynetNMTVanillaDecoder._smallest(
                 next_costs, beam_size, only_first_row=i == 0)

            # Rearrange everything
            new_states = []
            for j, m in enumerate(nmt_models):
                new_states.append([states[j][ind] for ind in indexes])

            #        new_states = ((states_m[ind] for ind in indexes) for states_m in states)
            all_outputs = all_outputs[:, indexes]
            all_masks = all_masks[:, indexes]
            all_costs = all_costs[:, indexes]

            # Record chosen output and compute new states
            states = [[
                m.consume_next_(s, pred_id)
                for s, pred_id in zip(m_new_states, outputs)
            ] for m, m_new_states in zip(nmt_models, new_states)]
            all_outputs = np.vstack([all_outputs, outputs[None, :]])
            logging.debug(u'all_outputs: {}'.format(all_outputs))
            logging.debug(u'outputs: {}'.format(
                [utils.apply_trg_wmap([c]) for c in outputs]))
            logging.debug(u'indexes: {}'.format(indexes))
            logging.debug(u'chosen_costs: {}'.format(chosen_costs))
            logging.debug(u'outputs != STOP: {}'.format(outputs != STOP))
            all_costs = np.vstack([all_costs, chosen_costs[None, :]])
            mask = outputs != STOP
            #        if ignore_first_eol: # and i == 0:
            #            mask[:] = 1
            all_masks = np.vstack([all_masks, mask[None, :]])
            logging.debug(u'last masks: {}'.format(all_masks[-1]))

        all_outputs = all_outputs[1:]  # skipping first row of self.BEGIN
        logging.debug(u'outputs: {}'.format(all_outputs))
        all_masks = all_masks[:-1]  #? skipping first row of self.BEGIN and the last row of self.STOP
        logging.debug(u'masks: {}'.format(all_masks))
        # Turn cumulative cost into cost of each step (actually the last row
        # would suffice for us?).
        all_costs = all_costs[1:] - all_costs[:-1]
        result = all_outputs, all_masks, all_costs

        trans, costs = DynetNMTVanillaDecoder.result_to_lists(
            result)  #(nmt_vocab,result)
        logging.debug(u'trans: {}'.format(trans))
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)
            self.apply_predictors_count = max_len * self.beam_size
        logging.debug(u'hypos: {}'.format(all_outputs))
        return hypos
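The ensemble loop above delegates beam pruning to DynetNMTVanillaDecoder._smallest, which is not part of these examples. A plausible sketch of such a helper (an assumption, for illustration only): it picks the k globally smallest entries of the beam_size x vocab_len cost matrix and reports which beam row and which vocabulary column each comes from; only_first_row restricts the search to the first beam at step 0, when all beams are still identical.

import numpy as np

def _smallest(matrix, k, only_first_row=False):
    # Hypothetical sketch, not the original helper.
    if only_first_row:
        matrix = matrix[:1, :]  # all beams are equal at the first step
    flat = matrix.flatten()
    best = np.argpartition(flat, k)[:k]   # k smallest entries, unordered
    best = best[np.argsort(flat[best])]   # order them by cost
    rows, cols = np.unravel_index(best, matrix.shape)
    return (rows, cols), flat[best]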
Example #11
def do_decode(decoder, output_handlers, src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.
    
    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  A list of strings. The strings are the
                               source sentences with word indices to 
                               translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Decoding cancelled because of an error in the "
                      "predictor configuration.")
        return
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    all_hypos = []
    text_output_handler = get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    for sen_idx in _get_sentence_indices(args.range, src_sentences):
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
                if isinstance(src[0], list):
                    src_lst = []
                    for idx in xrange(len(src)):
                        logging.info("Next sentence, input %d (ID: %d): %s" %
                                     (idx, sen_idx + 1, ' '.join(src[idx])))
                        src_lst.append([int(x) for x in src[idx]])
                    src = src_lst
                else:
                    logging.info("Next sentence (ID: %d): %s" %
                                 (sen_idx + 1, ' '.join(src)))
                    src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            if isinstance(src[0], list):
                # don't apply wordmap for multiple inputs
                hypos = [
                    hypo for hypo in decoder.decode(src)
                    if hypo.total_score > args.min_score
                ]
            else:
                hypos = [
                    hypo for hypo in decoder.decode(utils.apply_src_wmap(src))
                    if hypo.total_score > args.min_score
                ]
            if not hypos:
                logging.error("No translation found for ID %d!" %
                              (sen_idx + 1))
                logging.info("Stats (ID: %d): score=<not-found> "
                             "num_expansions=%d "
                             "time=%.2f" %
                             (sen_idx + 1, decoder.apply_predictors_count,
                              time.time() - start_hypo_time))
                if text_output_handler:
                    text_output_handler.write_empty_line()
                continue
            if args.remove_eos:
                for hypo in hypos:
                    if (hypo.trgt_sentence
                            and hypo.trgt_sentence[-1] == utils.EOS_ID):
                        hypo.trgt_sentence = hypo.trgt_sentence[:-1]
            if args.nbest > 0:
                hypos = hypos[:args.nbest]
            if (args.combination_scheme != 'sum'
                    and not args.apply_combination_scheme_to_partial_hypos):
                for hypo in hypos:
                    hypo.total_score = core.breakdown2score_full(
                        hypo.total_score, hypo.score_breakdown)
                hypos.sort(key=lambda hypo: hypo.total_score, reverse=True)
            if utils.trg_cmap:
                hypos = [
                    h.convert_to_char_level(utils.trg_cmap) for h in hypos
                ]
            logging.info(
                "Decoded (ID: %d): %s" %
                (sen_idx + 1,
                 utils.apply_trg_wmap(hypos[0].trgt_sentence, {}
                                      if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx + 1, hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                logging.error(
                    "I/O error %d occurred when creating output files: %s" %
                    (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" %
                          (sen_idx + 1, e, traceback.format_exc()))
        except Exception as e:
            logging.error(
                "An unexpected %s error has occurred at sentence id "
                "%d: %s, Stack trace: %s" %
                (sys.exc_info()[0], sen_idx + 1, e, traceback.format_exc()))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s" %
                      (sys.exc_info()[0], e))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
Example #12
 def _evaluate_model(self):
     """Evaluate model and store checkpoints. """
     logging.info("Started Validation: ")
     val_start_time = time.time()
     total_cost = 0.0
     if self.verbose:
         ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')
     for i, line in enumerate(self.data_stream.get_epoch_iterator()):
         seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                                                                 line[0], self.config['src_vocab_size']))
         if self.src_sparse_feat_map.dim > 1: # sparse src feats
             input_ = numpy.transpose(
                                      numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                                      (2,0,1))
         else: # word ids on the source side
             input_ = numpy.tile(seq, (self.config['beam_size'], 1))
         # draw sample, checking to ensure we don't get an empty string back
         trans, costs = \
             self.beam_search.search(
                                     input_values={self.source_sentence: input_},
                                     max_length=3*len(line[0]), eol_symbol=utils.EOS_ID,
                                     ignore_first_eol=True)
         #            if i < 10:
         #                logging.info("ID: {}".format(i))
         #                logging.info("Source: {}".format(line[0]))
         #                for k, tran in enumerate(trans):
         #                    logging.info(u"{}".format(utils.apply_trg_wmap(tran,self.trg_wmap)))
         #                    logging.info("{}".format(costs[k]))
         # normalize costs according to the sequence lengths
         if self.normalize:
             lengths = numpy.array([len(s) for s in trans])
             costs = costs / lengths
                             
         nbest_idx = numpy.argsort(costs)[:self.n_best]
         for j, best in enumerate(nbest_idx):
             try:
                 total_cost += costs[best]
                 trans = trans[best]
                 if trans and trans[-1] == utils.EOS_ID:
                     trans = trans[:-1]
                 trans_out = ' '.join([str(w) for w in trans])
             except ValueError:
                 logging.info(
                          "Can NOT find a translation for line: {}".format(i+1))
                 trans_out = '<UNK>'
                 trans = 0
             if j == 0:
                 # Write to subprocess and file if it exists
                 ##print(trans_out, file=mb_subprocess.stdin)
                 if self.verbose:
                     print(utils.apply_trg_wmap(trans,self.trg_wmap), file=ftrans)
         if i != 0 and i % 100 == 0:
             logging.info(
                 "Translated {} lines of validation set...".format(i))
                                     
     logging.info("Total cost of the validation: {}".format(total_cost))
     self.data_stream.reset()
     if self.verbose:
         ftrans.close()
     logging.info("Validation Took: {} minutes".format(
                                                        float(time.time() - val_start_time) / 60.))
     logger.info("{} {} {} {}".format(self.config['bleu_script'], self.config['val_set_out'], self.config['val_set_grndtruth'], self.config['results_out']))
     bleu_score = float(subprocess.check_output("python2.7 {} {} {} {}".format(self.config['bleu_script'], self.config['val_set_out'], self.config['val_set_grndtruth'], self.config['results_out']), shell=True).decode("utf-8"))
     self.val_bleu_curve.append(bleu_score)
     logging.info(bleu_score)
     return bleu_score
Example #13
def do_decode(decoder, 
              output_handlers, 
              src_sentences):
    """This method contains the main decoding loop. It iterates through
    ``src_sentences`` and applies ``decoder.decode()`` to each of them.
    At the end, it calls the output handlers to create output files.
    
    Args:
        decoder (Decoder):  Current decoder instance
        output_handlers (list):  List of output handlers, see
                                 ``create_output_handlers()``
        src_sentences (list):  A list of strings. The strings are the
                               source sentences with word indices to 
                               translate (e.g. '1 123 432 2')
    """
    if not decoder.has_predictors():
        logging.fatal("Terminated due to an error in the "
                      "predictor configuration.")
        return
    all_hypos = []
    text_output_handler = _get_text_output_handler(output_handlers)
    if text_output_handler:
        text_output_handler.open_file()
    start_time = time.time()
    logging.info("Start time: %s" % start_time)
    sen_indices = []
    for sen_idx in get_sentence_indices(args.range, src_sentences):
        decoder.set_current_sen_id(sen_idx)
        try:
            if src_sentences is False:
                src = "0"
                logging.info("Next sentence (ID: %d)" % (sen_idx + 1))
            else:
                src = src_sentences[sen_idx]
            if len(src) > 0 and args.per_sentence_predictor_weights:
                # change predictor weights per-sentence
                weights = src[-1].split(',')
                if len(weights) > 1:
                    weights = [float(x) for x in weights]
                    src = src[:-1]
                    logging.info('Changing predictor weights to {}'.format(
                        weights))
                    decoder.change_predictor_weights(weights)
                else:
                    logging.info(
                        'No weights read in {} - leaving unchanged'.format(
                            src))
            logging.info("Next sentence (ID: %d): %s" % (sen_idx + 1, ' '.join(src)))
            src = [int(x) for x in src]
            start_hypo_time = time.time()
            decoder.apply_predictors_count = 0
            hypos = [hypo 
                     for hypo in decoder.decode(utils.apply_src_wmap(src))
                        if hypo.total_score > args.min_score]
            if not hypos:
                logging.error("No translation found for ID %d!" % (sen_idx+1))
                logging.info("Stats (ID: %d): score=<not-found> "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx+1,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
                hypos = [_generate_dummy_hypo(decoder.predictors)]
            hypos = _postprocess_complete_hypos(hypos)
            if utils.trg_cmap:
                hypos = [h.convert_to_char_level(utils.trg_cmap) for h in hypos]
            logging.info("Decoded (ID: %d): %s" % (
                    sen_idx+1,
                    utils.apply_trg_wmap(hypos[0].trgt_sentence, 
                                         {} if utils.trg_cmap else utils.trg_wmap)))
            logging.info("Stats (ID: %d): score=%f "
                         "num_expansions=%d "
                         "time=%.2f" % (sen_idx+1,
                                        hypos[0].total_score,
                                        decoder.apply_predictors_count,
                                        time.time() - start_hypo_time))
            all_hypos.append(hypos)
            sen_indices.append(sen_idx)
            try:
                # Write text output as we go
                if text_output_handler:
                    text_output_handler.write_hypos([hypos])
            except IOError as e:
                logging.error("I/O error %d occurred when creating output files: %s"
                            % (sys.exc_info()[0], e))
        except ValueError as e:
            logging.error("Number format error at sentence id %d: %s, "
                          "Stack trace: %s" % (sen_idx+1, 
                                               e,
                                               traceback.format_exc()))
        except AttributeError as e:
            logging.fatal("Attribute error at sentence id %d: %s. This often "
                          "indicates an error in the predictor configuration "
                          "which could not be detected in initialisation. "
                          "Stack trace: %s" 
                          % (sen_idx+1, e, traceback.format_exc()))
        except Exception as e:
            logging.error("An unexpected %s error has occurred at sentence id "
                          "%d: %s, Stack trace: %s" % (sys.exc_info()[0],
                                                       sen_idx+1,
                                                       e,
                                                       traceback.format_exc()))
    logging.info("Decoding finished. Time: %.2f" % (time.time() - start_time))
    try:
        for output_handler in output_handlers:
            if output_handler == text_output_handler:
                output_handler.close_file()
            else:
                output_handler.write_hypos(all_hypos, sen_indices)
    except IOError as e:
        logging.error("I/O error %s occurred when creating output files: %s"
                      % (sys.exc_info()[0], e))
Example #14
    def decode(self, src_sentence):
        """Decodes a single source sentence. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.
        
        Args:
        src_sentence (list): List of source word ids without <S> or
        </S> which make up the source sentence
        
        Returns:
        list. A list of ``Hypothesis`` instances ordered by their
        score.
        """
        dy.renew_cg()
        logging.debug(u'src_sentence: {}'.format(src_sentence))
        #        MAX_PRED_SEQ_LEN = 30*len(src_sentence)
        MAX_PRED_SEQ_LEN = 30
        logging.debug(u'MAX_PRED_SEQ_LEN: {}'.format(MAX_PRED_SEQ_LEN))
        BEGIN = utils.GO_ID
        STOP = utils.EOS_ID
        logging.debug(u'BEGIN: {}, STOP: {}'.format(BEGIN, STOP))
        beam_size = self.beam_size
        self.nmt_model.initialize(src_sentence)
        #        ignore_first_eol=True
        states = [self.nmt_model.s] * beam_size
        # This array will store all generated outputs, including those from
        # previous step and those from already finished sequences.
        all_outputs = np.full(shape=(1, beam_size),
                              fill_value=BEGIN,
                              dtype=int)
        all_masks = np.ones_like(
            all_outputs, dtype=float)  # 1 while the last predicted symbol is not self.STOP
        all_costs = np.zeros_like(
            all_outputs, dtype=float)  # the cumulative cost of predictions

        for i in range(MAX_PRED_SEQ_LEN):
            if all_masks[-1].sum() == 0:
                logging.debug(u'all_masks: {}'.format(all_masks))
                break

            # We carefully hack values of the `logprobs` array to ensure
            # that all finished sequences are continued with `eos_symbol`.
            logprobs = -np.array(
                [self.nmt_model.predict_next_(s) for s in states])
            #            print logprobs
            #            print all_masks[-1, :, None]
            # Take the last row of cumulative costs (beam_size x 1), keep the
            # logprob distributions of unfinished hypos only and add them
            # element-wise: a beam_size x vocab_len matrix of next costs.
            next_costs = (
                all_costs[-1, :, None] + logprobs * all_masks[-1, :, None])
            # Finished hypos keep all their cost on the self.STOP symbol.
            (finished,) = np.where(all_masks[-1] == 0)
            next_costs[finished, :STOP] = np.inf
            next_costs[finished, STOP + 1:] = np.inf

            # indexes - the hypos from the previous step to keep,
            # outputs - the next step predictions,
            # chosen_costs - the cost of each predicted symbol.
            (indexes,
             outputs), chosen_costs = self._smallest(next_costs,
                                                     beam_size,
                                                     only_first_row=i == 0)
            #            print outputs
            # Rearrange everything
            new_states = (states[ind] for ind in indexes)
            all_outputs = all_outputs[:, indexes]
            all_masks = all_masks[:, indexes]
            all_costs = all_costs[:, indexes]

            # Record chosen output and compute new states
            states = [
                self.nmt_model.consume_next_(s, pred_id)
                for s, pred_id in zip(new_states, outputs)
            ]
            all_outputs = np.vstack([all_outputs, outputs[None, :]])
            logging.debug(u'all_outputs: {}'.format(all_outputs))
            logging.debug(u'outputs: {}'.format(
                [utils.apply_trg_wmap([c]) for c in outputs]))
            logging.debug(u'indexes: {}'.format(indexes))
            logging.debug(u'chosen_costs: {}'.format(chosen_costs))
            logging.debug(u'outputs != STOP: {}'.format(outputs != STOP))
            all_costs = np.vstack([all_costs, chosen_costs[None, :]])
            mask = outputs != STOP
            #            if ignore_first_eol: #and i == 0:
            #                mask[:] = 1
            all_masks = np.vstack([all_masks, mask[None, :]])

        all_outputs = all_outputs[1:]  # skipping first row of self.BEGIN
        logging.debug(u'outputs: {}'.format(all_outputs))
        all_masks = all_masks[:-1]  #? skipping first row of self.BEGIN and the last row of self.STOP
        logging.debug(u'masks: {}'.format(all_masks))
        # Turn cumulative cost into cost of each step (actually the last row
        # would suffice for us?).
        all_costs = all_costs[1:] - all_costs[:-1]
        result = all_outputs, all_masks, all_costs

        trans, costs = self.result_to_lists(
            result)  #self.nmt_model.vocab, result)
        logging.debug(u'trans: {}'.format(trans))
        hypos = []
        max_len = 0
        for idx in xrange(len(trans)):
            max_len = max(max_len, len(trans[idx]))
            hypo = Hypothesis(trans[idx], -costs[idx])
            hypo.score_breakdown = len(trans[idx]) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(-costs[idx], 1.0)]
            hypos.append(hypo)

        logging.debug(u'hypos: {}'.format(all_outputs))
        return hypos
Example #15
    def _expand_hypo_nmt(self, input_hypo):
        """Get the best beam size expansions of ``hypo`` by one MORPHEME based on nmt predictor scores only, i.e. expand hypo until all of the beam size best hypotheses end with ``sync_symb`` or EOS. The implementation relies on '_expand_hypo_nmt' of the parent class BeamDecoderSegm which provides best beam size expansions of ``hypo`` by one CHAR based on nmt predictor scores only.
        
        Args:
        hypo (PartialHypothesis): Hypothesis to expand
        
        Return:
        list. List of expanded hypotheses.
        """
        # The input hypo to be expanded
        logging.debug(u"EXPAND: {} {}".format(
            utils.apply_trg_wmap(input_hypo.trgt_sentence), input_hypo.score))

        # Get initial expansions by one char
        hypos = super(SyncBeamDecoderSegm, self)._expand_hypo_nmt(input_hypo)
        # input_hypo_len = len(input_hypo.score_breakdown)
        # Expand until all hypos are closed
        it = 0
        while self._all_eos_or_eow(hypos):
            if it > self.max_morf_len:  # prevent infinite loops
                break
            logging.debug(u"SYNC BEAM ITER: {}".format(it))
            it = it + 1
            next_hypos = []
            next_scores = []
            for hypo in hypos:
                # Combined predictors score for the chars in a next morpheme (we look for a best morpheme expansion of the input_hypo)
                next_score = sum([
                    sum([
                        char_scores[i][0] for i, s in enumerate(char_scores)
                        if self.predictor_names[i] == "nmt"
                    ]) for char_scores in hypo.score_breakdown
                ])
                #                next_score = sum([sum([char_scores[i][0] for i,s in enumerate(char_scores) if self.predictor_levels[i]=="c"]) for char_scores in hypo.score_breakdown])
                logging.debug(u"CONTINUATION: {} -> {}, {}".format(
                    utils.apply_trg_wmap(hypo.trgt_sentence), next_score,
                    hypo.score))
                if self._is_closed(hypo):
                    next_hypos.append(hypo)
                    next_scores.append(next_score)
                    logging.debug(u"NOT EXPAND: {} -> {}, {}".format(
                        utils.apply_trg_wmap(hypo.trgt_sentence), next_score,
                        hypo.score))
                    continue
                for next_hypo in super(SyncBeamDecoderSegm,
                                       self)._expand_hypo_nmt(hypo):
                    next_hypos.append(next_hypo)
                    next_score = sum([
                        sum([
                            char_scores[i][0]
                            for i, s in enumerate(char_scores)
                            if self.predictor_names[i] == "nmt"
                        ]) for char_scores in next_hypo.score_breakdown
                    ])
                    #                    next_score = sum([sum([char_scores[i][0] for i,s in enumerate(char_scores) if self.predictor_levels[i]=="c"]) for char_scores in next_hypo.score_breakdown])
                    next_scores.append(next_score)
                    logging.debug(u"EXPAND: {} -> {}, {}".format(
                        utils.apply_trg_wmap(next_hypo.trgt_sentence),
                        next_score, next_hypo.score))
            logging.debug(u"BEFORE CUT on ITERATION: {} -> {}".format(
                it, " && ".join(
                    utils.apply_trg_wmap(h.trgt_sentence) + ", " +
                    str(next_scores[i]) for i, h in enumerate(next_hypos))))

            hypos = self._get_next_hypos(next_hypos, next_scores)
            logging.debug(u"CUT: {}".format(" && ".join(
                utils.apply_trg_wmap(h.trgt_sentence) for h in hypos)))

        # Best final expansion of the initial hypo by morphemes
        for hypo in hypos:
            logging.debug(u"SYNCRESULT {} {}".format(
                utils.apply_trg_wmap(hypo.trgt_sentence),
                sum([
                    sum([
                        char_scores[i][0] for i, s in enumerate(char_scores)
                        if self.predictor_names[i] == "nmt"
                    ]) for char_scores in hypo.score_breakdown
                ])))
#            logging.debug(u"SYNCRESULT {} {}".format(utils.apply_trg_wmap(hypo.trgt_sentence), sum([sum([char_scores[i][0]  for i,s in enumerate(char_scores) if self.predictor_levels[i]=="c"]) for char_scores in hypo.score_breakdown])))

        return hypos
Example #16
    def decode(self, src_sentence):
        """Decodes a single source sentence using beam search.
        Expands (beam size) hypotheses based on a sum of nmt predictors scores (_expand_hypo_nmt), cuts (beam size) the resulting continuation based on a combined predictors score."""
        dy.renew_cg()
        self.initialize_predictors(src_sentence)
        hypos = self._get_initial_hypos()
        self.setup_max_len(src_sentence)
        logging.debug(u"Source len {}".format(len(src_sentence)))
        logging.debug(u"MAX-ITER: {}".format(self.max_len))
        # Initial expansion
        for hypo in hypos:
            logging.debug(u"INIT {} {}".format(
                utils.apply_trg_wmap(hypo.trgt_sentence),
                hypo.score_breakdown))
        it = 0
        while self.stop_criterion(hypos):
            logging.debug(u"ITER: {}, MAX-ITER: {}".format(it, self.max_len))
            if it > self.max_len:  # prevent infinite loops
                break
            it = it + 1

            next_hypos = []
            next_scores = []
            self.min_score = utils.NEG_INF
            self.best_scores = []
            for hypo in hypos:
                if hypo.get_last_word() == utils.EOS_ID:
                    next_hypos.append(hypo)
                    next_scores.append(self._get_combined_score(hypo))
                    logging.debug(u"BEAM IT {} HYPO {} NO EXPAND".format(
                        it, utils.apply_trg_wmap(hypo.trgt_sentence)))
                    continue
                for next_hypo in self._expand_hypo_nmt(hypo):
                    next_score = self._get_combined_score(next_hypo)
                    if next_score > self.min_score:
                        next_hypos.append(next_hypo)
                        next_scores.append(next_score)
                        self._register_score(next_score)
                    logging.debug(u"BEAM IT {} HYPO {} -> NEXT HYPO {}".format(
                        it, utils.apply_trg_wmap(hypo.trgt_sentence),
                        utils.apply_trg_wmap(next_hypo.trgt_sentence)))

            # hypo expansions on this iteration which will be cut (to beam size) based on the combined predictor score:
            logging.debug(u"BEAM IT {} NEXT HYPOS BEFORE CUT -> {}".format(
                it, " && ".join(
                    utils.apply_trg_wmap(h.trgt_sentence) + ", " +
                    str(next_scores[i]) for i, h in enumerate(next_hypos))))
            logging.debug(u"BEAM IT {} Min score: {}".format(
                it, self.min_score))

            if self.hypo_recombination:
                hypos = self._filter_equal_hypos(next_hypos, next_scores)
            else:
                hypos = self._get_next_hypos(next_hypos, next_scores)

            # Best (beam size) expansions of the hypo on this iteration...
            logging.debug(u"BEAM IT {} CUT: {}".format(
                it, " && ".join(
                    utils.apply_trg_wmap(h.trgt_sentence) for h in hypos)))

            # ... with detailed scores per char
            for i, hypo in enumerate(hypos):
                logging.debug(u"BEAM IT {} :{}".format(
                    utils.apply_trg_wmap(hypo.trgt_sentence), hypo.score))
                for i, score_char in enumerate(hypo.score_breakdown):
                    logging.debug(u"{}: {}".format(
                        utils.apply_trg_wmap([hypo.trgt_sentence[i]]),
                        ", ".join("{:.10f}".format(s) + ":" +
                                  "{:.2f}".format(w) for s, w in score_char)))


#        # final hypos
#        final_scores = []
#        final_hypos = []
#        for hypo in hypos:
#            final_hypos.append(hypo)
#            final_scores.append(hypo.score)
#        hypos = self._get_next_hypos(final_hypos, final_scores)
#
#        # Best final hypos
#        logging.debug(u"BEAM FINAL: {}".format(" && ".join(utils.apply_trg_wmap(h.trgt_sentence) for h in hypos)))

        for hypo in hypos:
            if hypo.get_last_word() == utils.EOS_ID:
                self.add_full_hypo(hypo.generate_full_hypothesis())
        if not self.full_hypos:
            logging.warn("No complete hypotheses found for %s" % src_sentence)
            for hypo in hypos:
                self.add_full_hypo(hypo.generate_full_hypothesis())

        return self.get_full_hypos_sorted()