Example #1
File: train.py  Project: Sissipei/caffe2
# Imports needed by this excerpt; seq2seq_util ships with the caffe2
# seq2seq examples.
import random

from caffe2.python.models.seq2seq import seq2seq_util

def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = seq2seq_util.get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = seq2seq_util.get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            # Keep only pairs where both sides are non-empty and, when
            # max_length is set, both fit within it.
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
    # Sort by (source length, target length) so each batch groups
    # similarly sized pairs, minimizing padding within a batch.
    parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    # Slice the sorted pairs into fixed-size batches.
    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    if len(batch) > 0:
        # Pad the final partial batch up to batch_size by repeating
        # its last sentence pair.
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    # Shuffle batch order so training does not proceed strictly from
    # shortest to longest sentences.
    random.shuffle(batches)
    return batches
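
A minimal usage sketch for gen_batches, assuming one tokenized sentence per line in each corpus file. The file names, threshold, and batch settings below are hypothetical stand-ins; the gen_vocab(corpus, unk_threshold) call shape matches its use in Example #2.

# Hypothetical corpus paths and unk threshold.
source_vocab = seq2seq_util.gen_vocab('train.de.txt', 50)
target_vocab = seq2seq_util.gen_vocab('train.en.txt', 50)

batches = gen_batches(
    source_corpus='train.de.txt',
    target_corpus='train.en.txt',
    source_vocab=source_vocab,
    target_vocab=target_vocab,
    batch_size=64,
    max_length=50,
)
# Every batch holds exactly batch_size (source_ids, target_ids) pairs;
# the last one is padded by repeating its final pair.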
Example #2
File: translate.py  Project: jusalun/caffe2
# Imports needed by this excerpt. Seq2SeqModelCaffe2EnsembleDecoder is
# defined earlier in the same translate.py this excerpt comes from.
import logging
import sys

from future.utils import viewitems

from caffe2.python.models.seq2seq import seq2seq_util

logger = logging.getLogger(__name__)

def run_seq2seq_beam_decoder(args, model_params, decoding_params):
    # Build vocabularies from the corpora; tokens rarer than
    # unk_threshold are presumably collapsed to the unknown token.
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    # Invert the target vocab so numeric IDs map back to tokens.
    inversed_target_vocab = {v: k for (k, v) in viewitems(target_vocab)}
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    # Wrap the single checkpoint as a one-model ensemble for the decoder.
    decoder = Seq2SeqModelCaffe2EnsembleDecoder(
        translate_params=dict(
            ensemble_models=[dict(
                source_vocab=source_vocab,
                target_vocab=target_vocab,
                model_params=model_params,
                model_file=args.checkpoint,
            )],
            decoding_params=decoding_params,
        ),
    )
    decoder.load_models()

    # Translate one sentence per line read from stdin.
    for line in sys.stdin:
        numerized_source_sentence = seq2seq_util.get_numberized_sentence(
            line,
            source_vocab,
        )
        # Budget the output length at twice the source length plus a
        # small constant; the returned alignment is unused here.
        translation, alignment, _ = decoder.decode(
            numerized_source_sentence,
            2 * len(numerized_source_sentence) + 5,
        )
        # Map token IDs back to target tokens and print the translation.
        print(' '.join([inversed_target_vocab[tid] for tid in translation]))
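
For reference, a self-contained sketch of the ID-to-token round trip performed in the stdin loop above. The toy vocabulary is made up for illustration; real vocabularies come from seq2seq_util.gen_vocab.

# Toy vocabulary, made up for illustration.
target_vocab = {'<GO>': 0, '<EOS>': 1, 'hello': 2, 'world': 3}
inversed_target_vocab = {v: k for (k, v) in target_vocab.items()}

translation = [2, 3, 1]  # token IDs, as returned by decoder.decode()
print(' '.join(inversed_target_vocab[tid] for tid in translation))
# prints: hello world <EOS>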