Example #1
def evaluate_with_attention(
        sess,
        eval_op,
        feeding_data,
        attention_op=None,
        output_filename_prefix=None):
    """ Evaluates data by loss.

    Args:
        sess: `tf.Session`.
        eval_op: Tensorflow operation, computing the loss.
        feeding_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        attention_op: Tensorflow operation for output attention.
        output_filename_prefix: A string.

    Returns: Total loss averaged by number of data samples.
    """
    losses = 0.
    total_size = 0
    attentions = {}
    for ss_strs, tt_strs, feed_dict in feeding_data:
        if attention_op is None:
            loss = _evaluate(sess, feed_dict, eval_op)
        else:
            loss, atts = _evaluate(sess, feed_dict, [eval_op, attention_op])
            attentions.update(pack_batch_attention_dict(
                total_size, ss_strs, tt_strs, atts))
        losses += loss * float(len(ss_strs))
        total_size += len(ss_strs)
    loss = losses / float(total_size)
    if attention_op is not None:
        dump_attentions(output_filename_prefix, attentions)
    return loss
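
The `_evaluate` helper is referenced throughout these examples but not shown. Presumably it is a thin wrapper around `sess.run`; a minimal sketch under that assumption (TensorFlow 1.x Session API, not the project's actual implementation):

import tensorflow as tf  # assumes the TensorFlow 1.x Session API used above


def _evaluate(sess, feed_dict, eval_ops):
    """ Hypothetical sketch: run a single op or a list of ops under `feed_dict`. """
    # sess.run returns one value for a single op, or a list for a list of ops,
    # which matches how the callers above unpack the result.
    return sess.run(eval_ops, feed_dict=feed_dict)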
Example #2
def evaluate_with_attention(sess,
                            loss_op,
                            eval_data,
                            vocab_source,
                            vocab_target,
                            attention_op=None,
                            output_filename_prefix=None):
    """ Evaluates data by loss.

    Args:
        sess: `tf.Session`.
        loss_op: Tensorflow operation, computing the loss.
        eval_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        attention_op: Tensorflow operation for output attention.
        output_filename_prefix: A string.

    Returns: Total loss averaged by number of data samples.
    """
    losses = 0.
    weights = 0.
    num_of_samples = 0
    attentions = {}
    for data in eval_data:
        _n_samples = len(data["feature_ids"])
        parallels = data["feed_dict"].pop("parallels")
        avail = sum(numpy.array(parallels) > 0)
        if attention_op is None:
            loss = _evaluate(sess, data["feed_dict"], loss_op[:avail])
        else:
            loss, atts = _evaluate(sess, data["feed_dict"],
                                   [loss_op[:avail], attention_op[:avail]])
            ss_strs = [
                vocab_source.convert_to_wordlist(ss, bpe_decoding=False)
                for ss in data["feature_ids"]
            ]
            tt_strs = [
                vocab_target.convert_to_wordlist(tt,
                                                 bpe_decoding=False,
                                                 reverse_seq=False)
                for tt in data["label_ids"]
            ]
            _attentions = sum(
                repeat_n_times(avail, select_attention_sample_by_sample, atts),
                [])
            attentions.update(
                pack_batch_attention_dict(num_of_samples, ss_strs, tt_strs,
                                          _attentions))
        data["feed_dict"]["parallels"] = parallels
        losses += sum([_l[0] for _l in loss])
        weights += sum([_l[1] for _l in loss])
        num_of_samples += _n_samples
    loss = losses / weights
    if attention_op is not None:
        dump_attentions(output_filename_prefix, attentions)
    return loss
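
Examples #2 and #3 assume each `eval_data` element is a dict carrying `feature_ids`, `label_ids` and a `feed_dict` with `parallels`, and they accumulate per-device `(loss_sum, weight)` pairs returned by `loss_op[:avail]`. A self-contained sketch of that weighted averaging with invented numbers:

# Illustration only: the (loss_sum, weight) pairs below are made up; in the
# code above each pair comes from one data-parallel device via loss_op[:avail].
batch_losses = [
    [(12.5, 10.0), (6.3, 5.0)],  # batch 1: two devices received data
    [(8.0, 8.0)],                # batch 2: only one device received data
]

losses = 0.
weights = 0.
for loss in batch_losses:
    losses += sum(_l[0] for _l in loss)
    weights += sum(_l[1] for _l in loss)

print(losses / weights)  # weight-averaged loss, as returned above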
Example #3
def evaluate_with_attention(
        sess,
        loss_op,
        eval_data,
        vocab_source,
        vocab_target,
        attention_op=None,
        output_filename_prefix=None):
    """ Evaluates data by loss.

    Args:
        sess: `tf.Session`.
        loss_op: Tensorflow operation, computing the loss.
        eval_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        attention_op: Tensorflow operation for output attention.
        output_filename_prefix: A string.

    Returns: Total loss averaged by number of data samples.
    """
    losses = 0.
    weights = 0.
    num_of_samples = 0
    attentions = {}
    for data in eval_data:
        _n_samples = len(data["feature_ids"])
        parallels = data["feed_dict"].pop("parallels")
        avail = sum(numpy.array(parallels) > 0)
        if attention_op is None:
            loss = _evaluate(sess, data["feed_dict"], loss_op[:avail])
        else:
            loss, atts = _evaluate(sess, data["feed_dict"],
                                   [loss_op[:avail], attention_op[:avail]])
            ss_strs = [vocab_source.convert_to_wordlist(ss, bpe_decoding=False)
                       for ss in data["feature_ids"]]
            tt_strs = [vocab_target.convert_to_wordlist(
                tt, bpe_decoding=False, reverse_seq=False)
                       for tt in data["label_ids"]]
            _attentions = sum(repeat_n_times(avail, select_attention_sample_by_sample,
                                             atts), [])
            attentions.update(pack_batch_attention_dict(
                num_of_samples, ss_strs, tt_strs, _attentions))
        data["feed_dict"]["parallels"] = parallels
        losses += sum([_l[0] for _l in loss])
        weights += sum([_l[1] for _l in loss])
        num_of_samples += _n_samples
    loss = losses / weights
    if attention_op is not None:
        dump_attentions(output_filename_prefix, attentions)
    return loss
Example #4
def evaluate_with_attention(sess,
                            eval_op,
                            eval_data,
                            vocab_source,
                            vocab_target,
                            attention_op=None,
                            output_filename_prefix=None):
    """ Evaluates data by loss.

    Args:
        sess: `tf.Session`.
        eval_op: Tensorflow operation, computing the loss.
        eval_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        attention_op: Tensorflow operation for output attention.
        output_filename_prefix: A string.

    Returns: Total loss averaged by number of data samples.
    """
    losses = 0.
    total_size = 0
    attentions = {}
    for data in eval_data:
        # Batch size from the packed data dict; defined for both branches below.
        _n_samples = len(data["feature_ids"])
        if attention_op is None:
            loss = _evaluate(sess, data["feed_dict"], eval_op)
        else:
            loss, atts = _evaluate(sess, data["feed_dict"],
                                   [eval_op, attention_op])
            ss_strs = [
                vocab_source.convert_to_wordlist(ss, bpe_decoding=False)
                for ss in data["feature_ids"]
            ]
            tt_strs = [
                vocab_target.convert_to_wordlist(tt,
                                                 bpe_decoding=False,
                                                 reverse_seq=False)
                for tt in data["label_ids"]
            ]
            attentions.update(
                pack_batch_attention_dict(total_size, ss_strs, tt_strs, atts))
        losses += loss * float(_n_samples)
        total_size += _n_samples
    loss = losses / float(total_size)
    if attention_op is not None:
        dump_attentions(output_filename_prefix, attentions)
    return loss
Example #5
def evaluate_with_attention(sess,
                            eval_op,
                            feeding_data,
                            vocab_source=None,
                            vocab_target=None,
                            attention_op=None,
                            output_filename_prefix=None):
    """ Evaluates data by loss.

    Args:
        sess: `tf.Session`.
        eval_op: Tensorflow operation, computing the loss.
        feeding_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        vocab_source: A `Vocab` instance for source side feature map. For highlighting UNK.
        vocab_target: A `Vocab` instance for target side feature map. For highlighting UNK.
        attention_op: Tensorflow operation for output attention.
        output_filename_prefix: A string.

    Returns: Total loss averaged by number of data samples.
    """
    losses = 0.
    total_size = 0
    attentions = {}
    for ss_strs, tt_strs, feed_dict in feeding_data:
        if attention_op is None:
            loss = _evaluate(sess, feed_dict, eval_op)
        else:
            loss, atts = _evaluate(sess, feed_dict, [eval_op, attention_op])
            if vocab_source is not None and vocab_target is not None:
                ss_strs = [
                    vocab_source.decorate_with_unk(ss) for ss in ss_strs
                ]
                tt_strs = [
                    vocab_target.decorate_with_unk(tt) for tt in tt_strs
                ]
            attentions.update(
                pack_batch_attention_dict(total_size, ss_strs, tt_strs, atts))
        losses += loss * float(len(ss_strs))
        total_size += len(ss_strs)
    loss = losses / float(total_size)
    if attention_op is not None:
        dump_attentions(output_filename_prefix, attentions)
    return loss
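
Examples #1, #4 and #5, by contrast, weight each batch's mean loss by its number of samples and divide by the total sample count. A purely illustrative sketch with invented numbers:

# Illustration only: (mean batch loss, batch size) pairs are invented.
batch_results = [(2.4, 32), (2.1, 32), (3.0, 17)]

losses = 0.
total_size = 0
for loss, n in batch_results:
    losses += loss * float(n)
    total_size += n

print(losses / float(total_size))  # loss averaged over all data samples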
Example #6
def infer(sess,
          prediction_op,
          infer_data,
          output,
          vocab_source,
          vocab_target,
          delimiter=" ",
          output_attention=False,
          tokenize_output=False,
          verbose=True):
    """ Infers data and save the prediction results.

    Args:
        sess: `tf.Session`.
        prediction_op: Tensorflow operation for inference.
        infer_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        output: Output file name, `str`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        delimiter: The delimiter of output token sequence.
        output_attention: Whether to output attention information.
        tokenize_output: Whether to split words into characters
          (only for Chinese).
        verbose: Print inference information if set True.

    Returns: A tuple `(sources, hypothesis)`, two lists of
      strings.
    """
    attentions = dict()
    hypothesis = []
    sources = []
    cnt = 0
    for data in infer_data:
        source_tokens = [
            vocab_source.convert_to_wordlist(x, bpe_decoding=False)
            for x in data["feature_ids"]
        ]
        x_str = [delimiter.join(x) for x in source_tokens]
        prediction, att = _infer(sess,
                                 data["feed_dict"],
                                 prediction_op,
                                 len(x_str),
                                 top_k=1,
                                 output_attention=output_attention)

        sources.extend(x_str)
        hypothesis.extend([
            delimiter.join(
                vocab_target.convert_to_wordlist(prediction[sample_idx]))
            for sample_idx in range(prediction.shape[0])
        ])
        if output_attention and att is not None:
            candidate_tokens = [
                vocab_target.convert_to_wordlist(prediction[idx, :],
                                                 bpe_decoding=False,
                                                 reverse_seq=False)
                for idx in range(len(x_str))
            ]

            attentions.update(
                pack_batch_attention_dict(cnt, source_tokens, candidate_tokens,
                                          att))
        cnt += len(x_str)
        if verbose:
            tf.logging.info(cnt)
    if tokenize_output:
        hypothesis = to_chinese_char(hypothesis)
    if output:
        with gfile.GFile(output, "w") as fw:
            fw.write("\n".join(hypothesis) + "\n")
    if output_attention:
        dump_attentions(output, attentions)
    return sources, hypothesis
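
`to_chinese_char` is not shown here; per the docstring, `tokenize_output` splits words into characters (only for Chinese). A rough, hypothetical sketch of that kind of post-processing, not the project's actual implementation:

# Hypothetical sketch: split CJK words into single characters while keeping
# other tokens intact. This is only an assumption of what to_chinese_char does.
def split_chinese_chars(lines, delimiter=" "):
    out = []
    for line in lines:
        tokens = []
        for word in line.split(delimiter):
            if any(u'\u4e00' <= ch <= u'\u9fff' for ch in word):
                tokens.extend(list(word))  # one token per Chinese character
            else:
                tokens.append(word)        # keep non-Chinese tokens as-is
        out.append(delimiter.join(tokens))
    return out


print(split_chinese_chars([u"我们 喜欢 NMT"]))  # ['我 们 喜 欢 NMT']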
Example #7
def infer(
        sess,
        prediction_op,
        feeding_data,
        output,
        vocab_target,
        alpha=None,
        delimiter=" ",
        output_attention=False,
        tokenize_output=False,
        tokenize_script="./njunmt/tools/tokenizeChinese.py",
        verbose=True):
    """ Infers data and save the prediction results.

    Args:
        sess: `tf.Session`.
        prediction_op: Tensorflow operation for inference.
        feeding_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        output: Output file name, `str`.
        vocab_target: A `Vocab` instance for target side feature map.
        alpha: A scalar number, length penalty rate. If not provided
          or < 0, simply average each beam by length of predicted
          sequence.
        delimiter: The delimiter of output token sequence.
        output_attention: Whether to output attention information.
        tokenize_output: Whether to split words into characters
          (only for Chinese).
        tokenize_script: The script for `tokenize_output`.
        verbose: Print inference information if set True.

    Returns: A tuple `(samples_src, samples_trg)`, two lists of
      strings sampled from `feeding_data`.
    """
    attentions = dict()
    samples_src = []
    samples_trg = []
    with gfile.GFile(output, "w") as fw:
        cnt = 0
        for x_str, x_len, feeding_batch in feeding_data:
            prediction, att = _infer(sess, feeding_batch, prediction_op,
                                     len(x_str), alpha=alpha, top_k=1,
                                     output_attention=output_attention)
            y_str = [delimiter.join(vocab_target.convert_to_wordlist(prediction[sample_idx]))
                     for sample_idx in range(prediction.shape[0])]
            fw.write('\n'.join(y_str) + "\n")
            # random sample
            if random.random() < 0.3 and len(samples_src) < 5:
                for sample_idx in range(len(x_str)):
                    samples_src.append(x_str[sample_idx])
                    samples_trg.append(y_str[sample_idx])
                    if len(samples_src) >= 5:
                        break

            # output attention
            if output_attention and att is not None:
                source_tokens = [x.strip().split() for x in x_str]
                candidate_tokens = [vocab_target.convert_to_wordlist(
                    prediction[idx, :], bpe_decoding=False, reverse_seq=False)
                                    for idx in range(len(x_str))]

                attentions.update(pack_batch_attention_dict(
                    cnt, source_tokens, candidate_tokens, att))
            cnt += len(x_str)
            if verbose:
                tf.logging.info(cnt)
    if tokenize_output:
        tmp_output_file = output + ''.join(
            random.sample(string.digits + string.ascii_letters, 10))
        os.system("python %s %s %s" %
                  (tokenize_script, output, tmp_output_file))
        os.system("mv %s %s" % (tmp_output_file, output))
    if output_attention:
        dump_attentions(output, attentions)
    return samples_src, samples_trg
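
Example #7's `alpha` controls the length penalty: per the docstring, a missing or negative value means each beam is simply averaged by predicted sequence length. A hedged sketch of such score normalization; the positive-`alpha` branch uses the GNMT-style penalty as an assumption, not necessarily this project's exact formula:

# Hedged sketch of beam-score normalization controlled by alpha.
def normalize_score(log_prob_sum, length, alpha=None):
    if alpha is None or alpha < 0:
        # Documented fallback: average the accumulated log-prob by length.
        return log_prob_sum / float(length)
    # Assumed GNMT-style length penalty (illustrative only).
    penalty = ((5.0 + length) / 6.0) ** alpha
    return log_prob_sum / penalty


print(normalize_score(-12.0, 10))             # plain length average: -1.2
print(normalize_score(-12.0, 10, alpha=0.6))  # with length penalty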
Example #8
def infer(
        sess,
        prediction_op,
        infer_data,
        output,
        vocab_source,
        vocab_target,
        delimiter=" ",
        output_attention=False,
        tokenize_output=False,
        verbose=True):
    """ Infers data and save the prediction results.

    Args:
        sess: `tf.Session`.
        prediction_op: Tensorflow operation for inference.
        infer_data: An iterable whose elements are packed
          feeding dictionaries for `sess`.
        output: Output file name, `str`.
        vocab_source: A `Vocab` instance for source side feature map.
        vocab_target: A `Vocab` instance for target side feature map.
        delimiter: The delimiter of output token sequence.
        output_attention: Whether to output attention information.
        tokenize_output: Whether to split words into characters
          (only for Chinese).
        verbose: Print inference information if set True.

    Returns: A tuple `(sources, hypothesis)`, two lists of
      strings.
    """
    attentions = dict()
    hypothesis = []
    sources = []
    cnt = 0
    for data in infer_data:
        source_tokens = [vocab_source.convert_to_wordlist(x, bpe_decoding=False)
                         for x in data["feature_ids"]]
        x_str = [delimiter.join(x) for x in source_tokens]
        prediction, att = _infer(sess, data["feed_dict"], prediction_op,
                                 len(x_str), top_k=1, output_attention=output_attention)

        sources.extend(x_str)
        hypothesis.extend([delimiter.join(vocab_target.convert_to_wordlist(prediction[sample_idx]))
                           for sample_idx in range(len(prediction))])
        if output_attention and att is not None:
            candidate_tokens = [vocab_target.convert_to_wordlist(
                prediction[idx], bpe_decoding=False, reverse_seq=False)
                                for idx in range(len(x_str))]

            attentions.update(pack_batch_attention_dict(
                cnt, source_tokens, candidate_tokens, att))
        cnt += len(x_str)
        if verbose:
            tf.logging.info(cnt)
    if tokenize_output:
        hypothesis = to_chinese_char(hypothesis)
    if output:
        with gfile.GFile(output, "w") as fw:
            fw.write("\n".join(hypothesis) + "\n")
    if output_attention:
        dump_attentions(output, attentions)
    return sources, hypothesis