Example #1
def check_vocab(vocab_file,
                out_dir,
                check_special_token=True,
                sos=None,
                eos=None,
                unk=None):
    """Check if vocab_file doesn't exist, create from corpus_file."""
    if tf.gfile.Exists(vocab_file):
        utils.print_out("# Vocab file %s exists" % vocab_file)
        vocab, vocab_size = load_vocab(vocab_file)
        if check_special_token:
            # Verify that the vocab starts with unk, sos, eos;
            # if not, prepend those tokens & generate a new vocab file.
            if not unk: unk = UNK
            if not sos: sos = SOS
            if not eos: eos = EOS
            assert len(vocab) >= 3
            if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
                utils.print_out("The first 3 vocab words [%s, %s, %s]"
                                " are not [%s, %s, %s]" %
                                (vocab[0], vocab[1], vocab[2], unk, sos, eos))
                vocab = [unk, sos, eos] + vocab
                new_vocab_file = os.path.join(out_dir,
                                              os.path.basename(vocab_file))
                with codecs.getwriter("utf-8")(tf.gfile.GFile(
                        new_vocab_file, "wb")) as f:
                    for word in vocab:
                        f.write("%s\n" % word)
                vocab_file = new_vocab_file
    else:
        raise ValueError("vocab_file '%s' does not exist." % vocab_file)

    vocab_size = len(vocab)  # recompute so any prepended tokens are counted
    return vocab_size, vocab_file
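
# A standalone sketch of the same special-token check using plain file I/O
# (hypothetical helper, not part of the original module; assumes the usual
# TF NMT defaults UNK = "<unk>", SOS = "<s>", EOS = "</s>").
import codecs
import os

UNK, SOS, EOS = "<unk>", "<s>", "</s>"

def ensure_special_tokens(vocab_path, out_dir, unk=UNK, sos=SOS, eos=EOS):
    """Prepend unk/sos/eos to the vocab file if they are not already first."""
    with codecs.open(vocab_path, "r", "utf-8") as f:
        vocab = [line.strip() for line in f]
    if vocab[:3] != [unk, sos, eos]:
        # Write the fixed vocab under out_dir, mirroring check_vocab above.
        vocab = [unk, sos, eos] + vocab
        vocab_path = os.path.join(out_dir, os.path.basename(vocab_path))
        with codecs.open(vocab_path, "w", "utf-8") as f:
            f.write("\n".join(vocab) + "\n")
    return len(vocab), vocab_path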
Example #2
def load_embed_txt(embed_file):
    """Load embed_file into a python dictionary.

    Note: the embed_file should be a Glove/word2vec formatted txt file.
    Here is an example assuming embed_size=5:

    the -0.071549 0.093459 0.023738 -0.090339 0.056123
    to 0.57346 0.5417 -0.23477 -0.3624 0.4037
    and 0.20327 0.47348 0.050877 0.002103 0.060547

    For word2vec format, the first line will be: <num_words> <emb_size>.

    Args:
      embed_file: file path to the embedding file.
    Returns:
      a dictionary that maps each word to its vector, and the embedding size.
    """
    emb_dict = dict()
    emb_size = None

    is_first_line = True
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
        for line in f:
            tokens = line.rstrip().split(" ")
            if is_first_line:
                is_first_line = False
                if len(tokens) == 2:  # header line
                    emb_size = int(tokens[1])
                    continue
            word = tokens[0]
            vec = list(map(float, tokens[1:]))
            emb_dict[word] = vec
            if emb_size:
                if emb_size != len(vec):
                    utils.print_out(
                        "Ignoring %s since its embedding size is inconsistent."
                        % word)
                    del emb_dict[word]
            else:
                emb_size = len(vec)
    return emb_dict, emb_size
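
# Hypothetical usage sketch for load_embed_txt, reusing the toy GloVe-format
# rows from the docstring (assumes a TF 1.x environment, since the loader
# reads through tf.gfile; the /tmp path is made up for the example).
import codecs

with codecs.open("/tmp/toy_embed.txt", "w", "utf-8") as f:
    f.write("the -0.071549 0.093459 0.023738 -0.090339 0.056123\n")
    f.write("to 0.57346 0.5417 -0.23477 -0.3624 0.4037\n")

emb_dict, emb_size = load_embed_txt("/tmp/toy_embed.txt")
assert emb_size == 5
assert len(emb_dict["the"]) == 5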
Example #3
def decode_and_evaluate(name,
                        model,
                        sess,
                        trans_file,
                        ref_file,
                        metrics,
                        subword_option,
                        beam_width,
                        tgt_eos,
                        num_translations_per_input=1,
                        decode=True,
                        infer_mode="greedy",
                        index_pair=None):
    """Decode a test set and compute a score according to the evaluation task."""
    if index_pair is None:  # avoid the mutable-default-argument pitfall
        index_pair = []
    # Decode
    end_time = None
    num_sentences = None
    if decode:
        utils.print_out("  decoding to output %s" % trans_file)

        start_time = time.time()
        num_sentences = 0
        with codecs.getwriter("utf-8")(tf.gfile.GFile(trans_file,
                                                      mode="wb")) as trans_f:
            trans_f.write("")  # Write empty string to ensure file is created.

            if infer_mode == "greedy":
                num_translations_per_input = 1
            elif infer_mode == "beam_search":
                num_translations_per_input = min(num_translations_per_input,
                                                 beam_width)
            translation = []
            while True:
                try:
                    nmt_outputs, _ = model.decode(sess)
                    if infer_mode != "beam_search":
                        nmt_outputs = np.expand_dims(nmt_outputs, 0)

                    batch_size = nmt_outputs.shape[1]
                    num_sentences += batch_size

                    for sent_id in range(batch_size):
                        for beam_id in range(num_translations_per_input):
                            translation.append(
                                get_translation(nmt_outputs[beam_id],
                                                sent_id,
                                                tgt_eos=tgt_eos,
                                                subword_option=subword_option))
                except tf.errors.OutOfRangeError:
                    end_time = time.time()
                    utils.print_time(
                        "  done, num sentences %d, num translations per input %d"
                        % (num_sentences, num_translations_per_input),
                        start_time)
                    break
            if not index_pair:
                for sentence in translation:
                    trans_f.write((sentence + b"\n").decode("utf-8"))
            else:
                # reorder the output using index_pair (e.g. to undo
                # length-based sorting of the input sentences)
                for i in index_pair:
                    trans_f.write(
                        (translation[index_pair[i]] + b"\n").decode("utf-8"))

    # Evaluation
    evaluation_scores = {}
    if ref_file and tf.gfile.Exists(trans_file):
        for metric in metrics:
            score = evaluation_utils.evaluate(ref_file,
                                              trans_file,
                                              metric,
                                              subword_option=subword_option)
            evaluation_scores[metric] = score
            utils.print_out("  %s %s: %.1f" % (metric, name, score))

    return evaluation_scores, end_time, num_sentences
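
# The `while True` / tf.errors.OutOfRangeError pattern above is the standard
# TF 1.x idiom for draining a dataset iterator. A minimal standalone sketch
# (toy dataset instead of the NMT model; assumes TF 1.x APIs):
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5]).batch(2)
next_batch = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    while True:
        try:
            print(sess.run(next_batch))  # one "decode" step per iteration
        except tf.errors.OutOfRangeError:
            break  # input exhausted, mirroring the decode loop above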