Code example #1
File: nmt.py Project: chubbymaggie/pytorch_basic_nmt
def decode(args: Dict[str, str]):
    """
    performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """

    print(f"load test source sentences from [{args['TEST_SOURCE_FILE']}]", file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print(f"load test target sentences from [{args['TEST_TARGET_FILE']}]", file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
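Note: read_corpus itself is not shown in any of these excerpts. As a point of reference, here is a minimal sketch consistent with how the NMT examples call it (whitespace-tokenized lines, with <s>/</s> markers added on the target side, as implied by the `s[1:]` and `.value.split()[1:-1]` usages elsewhere in this listing); the real implementation varies from project to project:

def read_corpus(file_path, source):
    """Read a tokenized corpus file into a list of word lists (sketch)."""
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            sent = line.strip().split(' ')
            if source == 'tgt':
                # wrap target sentences in begin-/end-of-sentence markers
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)
    return data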
Code example #2
File: eval.py Project: Koritskaya/tag-cloud
def compute_pr(model):
    '''Takes in a model to produce tags, then evaluates PR of model'''
    print "Evaluating %s" % (model)
    (vocab, tf) = utils.read_corpus()
    top_words = models.run_model(model, vocab, tf)
    average_recall = avg_recall(top_words)
    average_precision = avg_precision(top_words)
    print "Average Precision: %f" % (average_precision)
    print "Average Recall: %f" % (average_recall)
Code example #3
File: sanity_check.py Project: marcelotallis/nmt
def main():
    """ Main func.
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)
    assert(torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    # grab a single batch of training examples
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
        break
    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') 

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

    if args['1d']:
        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1e']:
        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1f']:
       # generate_outputs(model, src_sents, tgt_sents, vocab)
        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
    else:
        raise RuntimeError('invalid run mode')
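batch_iter is likewise assumed rather than shown. A plausible sketch, assuming it yields (src_sents, tgt_sents) batches with each batch sorted by source length (the usual setup for packed RNN encoders):

import math
import numpy as np

def batch_iter(data, batch_size, shuffle=False):
    """Yield batches of (src_sents, tgt_sents) from a list of sentence pairs (sketch)."""
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))
    if shuffle:
        np.random.shuffle(index_array)
    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = sorted((data[idx] for idx in indices),
                          key=lambda e: len(e[0]), reverse=True)
        yield [e[0] for e in examples], [e[1] for e in examples]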
Code example #4
File: main.py Project: MEATsir/Dominic
def build_vocab(args):
    if not os.path.exists(args.vocab_path):
        src_sents, labels = read_corpus(args.train_data_dir)
        labels = {label: idx for idx, label in enumerate(labels)}
        vocab = Vocab.build(src_sents, labels, args.max_vocab_size,
                            args.min_freq)
        vocab.save(args.vocab_path)
    else:
        vocab = Vocab.load(args.vocab_path)
    return vocab
Code example #5
File: run.py Project: MezentsevIlya/CS224n
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    output_file = args['OUTPUT_FILE']
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w+') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write('source sentence: ' + ' '.join(src_sent) + '\n')
            f.write('translation: ' + hyp_sent + '\n')
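compute_corpus_level_bleu_score is not defined in any excerpt either. A sketch using NLTK's corpus_bleu, assuming references still carry the <s>/</s> markers that read_corpus adds on the target side:

from nltk.translate.bleu_score import corpus_bleu

def compute_corpus_level_bleu_score(references, hypotheses):
    """Corpus-level BLEU of the top hypotheses against gold references (sketch)."""
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]  # drop <s> ... </s>
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])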
Code example #6
def test(args):
    """ Testing the model
    Args:
        args: dict that contains options in command
    """
    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    sentences, tags = utils.read_corpus(args.TEST)
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    device = torch.device('cuda:0' if args.cuda else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)
    print('start testing...')
    print('using device', device)

    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0

    model.eval()
    with torch.no_grad():
        for sentences, tags in utils.batch_iter(test_data,
                                                batch_size=int(
                                                    args.batch_size),
                                                shuffle=False):
            sentences, sent_lengths = utils.pad(sentences,
                                                sent_vocab[sent_vocab.PAD],
                                                device)
            predicted_tags = model.predict(sentences, sent_lengths)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(
                    tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % int(args.log_every) == 0:
                print(
                    'log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec'
                    % (n_iter, num_words / (time.time() - start), tp /
                       (tp + fp), tp / (tp + fn),
                       (2 * tp) / (2 * tp + fp + fn), time.time() - start))
                num_words = 0
                start = time.time()
    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = (2 * tp) / (2 * tp + fp + fn)
    print('Precision: %f, Recall: %f, F1 score: %f' %
          (precision, recall, f1_score))
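The final metrics follow the standard formulas. A quick sanity check with toy counts (illustrative numbers only):

tp, fp, fn = 8, 2, 4
precision = tp / (tp + fp)            # 0.8
recall = tp / (tp + fn)               # ~0.667
f1 = (2 * tp) / (2 * tp + fp + fn)    # ~0.727
# F1 is the harmonic mean of precision and recall
assert abs(f1 - 2 * precision * recall / (precision + recall)) < 1e-9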
Code example #7
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model1 = NMT.load(args['MODEL_PATH'])
    model2 = DPPNMT.load(args['MODEL_PATH'])

    if args['INDEX']:
        index = int(args['INDEX'])
        beam_search2(
            model1,
            model2,
            [test_data_src[index]],
            5,
            70,
            [test_data_tgt[index]],
        )
    else:
        # beam size (5) and max decoding steps (70) are hard-coded here
        # rather than taken from --beam-size / --max-decoding-time-step
        beam_search2(
            model1,
            model2,
            test_data_src,
            5,
            70,
            test_data_tgt,
        )
Code example #8
File: run.py Project: justinxu421/224n-project
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)

        # accuracy (unigrams)
        perfectly_correct = 0
        for index, hyp in enumerate(top_hypotheses):
            if hyp.value[0] == test_data_tgt[index][1]:
                perfectly_correct += 1
        print('Ignore accuracy for non unigrams')
        print('Accuracy: {}'.format(perfectly_correct / len(test_data_tgt)), file=sys.stderr)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Code example #9
def decode(load_from=None):
    """
    performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """

    if gconfig.test:
        data_src = read_corpus(paths.test_source, source='src')
        data_tgt = read_corpus(paths.test_target, source='tgt')
        data_tgt_path = paths.test_target
    else:
        data_src = read_corpus(paths.dev_source, source='src')
        data_tgt = read_corpus(paths.dev_target, source='tgt')
        data_tgt_path = paths.dev_target

    print(f"load model from {paths.model}", file=sys.stderr)
    if load_from is not None:
        model_load_path = load_from
    else:
        model_load_path = paths.model
    model = NMTModel.load(model_load_path)
    if gconfig.cuda:
        model.to_gpu()
    model.eval()
    max_step = dconfig.max_decoding_time_step
    if gconfig.sanity:
        max_step = 2

    hypotheses = routine.beam_search(model,
                                     data_src,
                                     max_step,
                                     replace=dconfig.replace)

    lines = []
    for src_sent, hyps in zip(data_src, hypotheses):
        top_hyp = hyps[0]
        lines.append(top_hyp.value)
    write_sents(lines, paths.decode_output)

    bleu_command = "perl scripts/multi-bleu.perl " + data_tgt_path + " < " + paths.decode_output
    os.system(bleu_command)
Code example #10
File: test.py Project: ZLBryant/NMT
def translate(model):
    if model.args.en_cn:
        test_src, test_tgt = read_en_cn_corpus(
            model.args.test_data, source=model.args.source_language)
    elif model.args.en_es:
        test_src = read_corpus(model.args.test_src_data, False)
        test_tgt = read_corpus(model.args.test_tgt_data, True)
    else:
        print("invalid input")
        exit(1)
    test_data = list(zip(test_src, test_tgt))
    with open(model.args.translate_result_save_path, 'w') as f:
        for src_sent, tgt_sent in batch_iter(test_data, 1, shuffle=False):
            tgt_predict = model.translate(src_sent[0])
            tgt_predict = ' '.join(tgt_predict[0]['sentence'])
            tgt_sent = tgt_sent[0]
            tgt = ' '.join(tgt_sent[1:-1])  # drop boundary tokens
            line = "translate: " + tgt_predict + " " * 5 + "target: " + tgt + '\r\n'
            f.write(line)
Code example #11
def filter_subs(input_data_path: str, output_data_path: str):
    print('Reading data...')
    subs = read_corpus(input_data_path)
    # use [A-Za-z]: the original [A-z] range also matches [ \ ] ^ _ and backtick
    pattern = re.compile(r'^(?:[A-Za-z]|[А-я]|[ёЁ\d\s.,!:?\-––\'"%$()`])+$')
    print('Filtering...')
    filtered = [s for s in tqdm(subs) if pattern.match(s)]
    print('Removing too long sentences...')
    short = [s for s in tqdm(filtered) if len(s.split()) <= 50 and len(s) <= 250]
    print('Saving...')
    save_corpus(short, output_data_path)
    print('Done!')
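save_corpus, the counterpart of read_corpus in these preprocessing utilities, is also not shown; presumably something along these lines (sketch, one entry per line):

def save_corpus(lines, file_path):
    """Write one string per line (sketch; callers that pass token lists would join them first)."""
    with open(file_path, 'w') as f:
        for line in lines:
            f.write(line + '\n')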
Code example #12
def decodeOT(args: Dict[str, str]):
    """
    performs decoding on a test set, and save the best-scoring decoding results. 
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    if os.path.exists(args['MODEL_PATH']):
        model = NMT.load(args['MODEL_PATH'])
    else:
        model = NMT(256, 256, pickle.load(open('data/vocab.bin', 'rb')))

    # Set models to eval (disables dropout)
    model.encoder.eval()
    model.decoder.eval()

    hypotheses = beam_searchOT(model,
                               test_data_src,
                               beam_size=int(args['--beam-size']),
                               max_decoding_time_step=int(
                                   args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value.split()[1:-1])
            f.write(hyp_sent + '\n')

    # Back to train (not really necessary for now)
    model.encoder.train()
    model.decoder.train()
Code example #13
def decode(args: Dict[str, str]):
    """
    performs decoding on a test set, and save the best-scoring decoding results. 
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """

    vocab = pickle.load(open(args['--vocab'], 'rb'))
    srcEntry = vocab.src  # VocabEntry for src
    tgtEntry = vocab.tgt  # VocabEntry for tgt

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    test_data_src = read_corpus(args['--test-src'], source='src')

    test_data = Testset(srcEntry.words2indices(test_data_src))
    test_loader = DataLoader(test_data, 1, shuffle=False)

    if args['--test-tgt']:
        test_data_tgt = read_corpus(args['--test-tgt'], source='tgt')

    print(f"load model from {args['--model-path']}", file=sys.stderr)

    model = torch.load(args['--model-path'])

    hypotheses = beam_search(model,
                             test_loader,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']),
                             tgtEntry=tgtEntry,
                             device=device)

    if args['--test-tgt']:
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['--output-path'], 'w') as f:
        for src_sent, top_hyp in zip(test_data_src, hypotheses):
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Code example #14
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    if args['--dpp'] and args['--sampling']:
        sys.exit("Error: cannot specify --dpp and --sampling simultaneously")
    elif args['--dpp']:
        print("Loading DPP Model")
        model = DPPNMT.load(args['MODEL_PATH'])
    else:
        model = NMT.load(args['MODEL_PATH'])
        if args['--sampling']:
            model.sampling = True

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
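Across all of these decode variants, beam_search is assumed to return one ranked hypothesis list per source sentence, each hypothesis exposing .value (a token list) and .score; hyps[0] is the best candidate. A minimal sketch of that contract (the concrete type differs per project):

from collections import namedtuple

# beam_search(model, src_sents, beam_size, max_decoding_time_step)
# -> List[List[Hypothesis]], one list per source sentence, best first
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

def top_sentences(hypotheses):
    """Join the best hypothesis of each source sentence into a string (sketch)."""
    return [' '.join(hyps[0].value) for hyps in hypotheses]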
Code example #15
def detokenize(input_file, output_file):
    print('Reading data...')
    texts = read_corpus(input_file)
    print('Data reading finished!')

    print('Detokenizing...')
    detokenized = [detokenizer.detokenize(s.split()) for s in tqdm(texts)]
    print('Detokenized!')

    print('Saving...')
    save_corpus(detokenized, output_file)
    print('Saved!')
Code example #16
def decode(args: Dict[str, str]):

    test_data_src, failed_ids_src = read_corpus(args['TEST_SOURCE_FILE'],
                                                source='src')
    test_data_tgt, failed_ids_tgt = read_corpus(args['TEST_TARGET_FILE'],
                                                source='tgt')

    total_failed_ids = set(failed_ids_src).union(failed_ids_tgt)

    test_data_src = [
        test_data_src[i] for i in range(len(test_data_src))
        if i not in total_failed_ids
    ]
    test_data_tgt = [
        test_data_tgt[i] for i in range(len(test_data_tgt))
        if i not in total_failed_ids
    ]

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:2"))
    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Code example #17
File: translate.py Project: Kaixin-Wu/myTransformer
def greedy_test(args):
    """ Test function """

    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    translator = Transformer(args, vocab)
    translator.eval()

    # load parameters
    translator.load_state_dict(torch.load(args.decode_model_path))
    if args.cuda:
        translator = translator.cuda()

    test_data = read_corpus(args.decode_from_file, source="src")
    # one buffer per sentence: ['<BOS>', '<PAD>', '<PAD>', ..., '<PAD>']
    # (a list comprehension, not `n * [[...]]`, which would alias one shared list
    # across all sentences and leak predictions between them via `pred[i + 1] = ...`)
    pred_data = [[
        constants.PAD_WORD if i else constants.BOS_WORD
        for i in range(args.decode_max_steps)
    ] for _ in range(len(test_data))]

    output_file = codecs.open(args.decode_output_file, "w", encoding="utf-8")
    for test, pred in zip(test_data, pred_data):
        pred_output = [constants.PAD_WORD] * args.decode_max_steps
        test_var = to_input_variable([test], vocab.src, cuda=args.cuda)

        # only need one time
        enc_output = translator.encode(test_var[0], test_var[1])
        for i in range(args.decode_max_steps):
            pred_var = to_input_variable([pred[:i + 1]],
                                         vocab.tgt,
                                         cuda=args.cuda)

            scores = translator.translate(enc_output, test_var[0], pred_var)

            _, argmax_idxs = torch.max(scores, dim=-1)
            one_step_idx = argmax_idxs[-1].item()

            pred_output[i] = vocab.tgt.id2word[one_step_idx]
            if (one_step_idx
                    == constants.EOS) or (i == args.decode_max_steps - 1):
                print("[Source] %s" % " ".join(test))
                print("[Predict] %s" % " ".join(pred_output[:i]))
                print()

                output_file.write(" ".join(pred_output[:i]) + "\n")
                output_file.flush()
                break
            pred[i + 1] = vocab.tgt.id2word[one_step_idx]

    output_file.close()
Code example #18
def decode(args, test_data_src, test_data_tgt=None):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """

    print("load test source sentences from [{}]".format(test_data_src))
    test_data_src = read_corpus(test_data_src, source='src')
    if test_data_tgt:
        print("load test target sentences from [{}]".format(test_data_tgt))
        test_data_tgt = read_corpus(test_data_tgt, source='tgt')

    print("load model from {}".format(args.model_path))
    model = NMT.load(args.model_path)

    if args.cuda:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=args.beam_size,
        max_decoding_time_step=args.max_decoding_time_step)

    if test_data_tgt:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100))

    if args.output_file:
        print("Saving predictions to " + args.output_file)
        with open(args.output_file, 'w') as f:
            for src_sent, hyps in zip(test_data_src, hypotheses):
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                f.write(hyp_sent + '\n')
    else:
        print("No output_file given, not saving predictions")
Code example #19
def main(args):
    if os.path.isfile(os.path.join(args.data_path, args.vocab_file)):
        print('vocab file exist in %s.' %
              os.path.join(args.data_path, args.vocab_file))
    else:
        print('Reading in source sentence : %s ...' % args.train_src)
        src_sents = read_corpus(os.path.join(args.data_path, args.train_src),
                                source='src')
        print('Reading in target sentence : %s ...' % args.train_tgt)
        tgt_sents = read_corpus(os.path.join(args.data_path, args.train_tgt),
                                source='tgt')

        vocab = Vocab.build(src_sents,
                            tgt_sents,
                            vocab_size=args.size,
                            freq_cutoff=args.freq_cutoff)
        print('generated vocabulary, source %d words, target %d words' %
              (len(vocab.src), len(vocab.tgt)))

        vocab.save(os.path.join(args.data_path, args.vocab_file))
        print('vocabulary saved to %s' %
              os.path.join(args.data_path, args.vocab_file))
Code example #20
File: nmt.py Project: mvpcom/robust_mtnt
def decode(args: Dict[str, str], vocab):
    """
    performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print(f"load model from {args['MODEL_PATH']}", file=sys.stderr)
    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                n_layers=int(args['--n_layers']),
                vocab=vocab,
                tie_weights=int(args['--tie-weights']),
                mha=int(args['--mha']))
    if torch.cuda.is_available():
        model = model.cuda()
    model.load(args['MODEL_PATH'])

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Code example #21
File: baseline.py Project: jonntd/draft
def load_data(xml_file, vectorizer):

    texts, labels = [], []
    for values, label in utils.read_corpus(xml_file):
        texts.append(values[0])
        labels.append(label)
    feas = vectorizer.fit_transform(texts)
    # pdb.set_trace()
    labeled_data = {"data": feas, "label": labels}

    print "the feature of data:", feas.shape

    return labeled_data
Code example #22
def main():
    args = docopt(__doc__)
    sentences, tags = read_corpus(args['TRAIN'])
    sent_vocab = Vocab.build(sentences,
                             int(args['--max-size']),
                             int(args['--freq-cutoff']),
                             is_tags=False)
    tag_vocab = Vocab.build(tags,
                            int(args['--max-size']),
                            int(args['--freq-cutoff']),
                            is_tags=True)
    sent_vocab.save(args['SENT_VOCAB'])
    tag_vocab.save(args['TAG_VOCAB'])
Code example #23
def generate_sentiment_words(neg_input_path:str, pos_input_path:str,
                             neg_output_path:str, pos_output_path:str,
                             keep_n_most_popular_words:int=3000):
    print('Reading data...')
    neg_lines = read_corpus(neg_input_path)
    pos_lines = read_corpus(pos_input_path)

    print('Counting words')
    neg_counter = Counter([w.lower() for s in tqdm(neg_lines) for w in s.split()])
    pos_counter = Counter([w.lower() for s in tqdm(pos_lines) for w in s.split()])

    print('Getting most popular')
    neg_top_words = set(w for w, _ in neg_counter.most_common(keep_n_most_popular_words))
    pos_top_words = set(w for w, _ in pos_counter.most_common(keep_n_most_popular_words))

    only_neg_top_words = neg_top_words - pos_top_words
    only_pos_top_words = pos_top_words - neg_top_words

    print('Saving')
    save_corpus(list(only_neg_top_words), neg_output_path)
    save_corpus(list(only_pos_top_words), pos_output_path)
    print('Done!')
Code example #24
def main():
    head_funcs = {}
    pairs = parse_pairs(sys.stdin)
    head_np, np_head = parse_NPs(open(sys.argv[1]), pairs)
    id_for_sent = []
    c = Counter()
    for token in read_corpus(open(sys.argv[2])):
        if token.id == 'SENT':
            for i in id_for_sent:
                head_funcs[i]['verbs'] = c['V']
                head_funcs[i]['oids'] = c['o']
                head_funcs[i]['conjs'] = c['C']
                head_funcs[i]['nouns'] = c['N']
            id_for_sent = []
            c.clear()
            continue
        add_pos_count(token, c)
        if token.id in head_np.keys():
            in_pair = _in_pair(token.id, head_np, pairs)
            if in_pair:
                id_for_sent.append(token.id)
                hf = {}
                hf['isNomn'], hf['isNpro'], hf['gnc'] = [hhf(token) for hhf in _funcs]
                hf['text'] = token.orig_text.encode('utf-8')
                hf['id'] = token.id
                hf['POS'] = '_'.join([str(_pos_codes[x]) for x in set(token.getPOStags())])
                # a token may have several parses; that is the bad case
                hf['gnc'] = gnc(token)
                hf['gender'], hf['number'], hf['case'] = hf['gnc']
                head_funcs[token.id] = hf
                if in_pair[0]:
                    antc = np_head[in_pair[1]]
                    anph = np_head[in_pair[2]]
                    hf['c_agr'] = agreement(head_funcs[antc]['case'], head_funcs[anph]['case'])
                    hf['g_agr'] = agreement(head_funcs[antc]['gender'], head_funcs[anph]['gender'])
                    hf['n_agr'] = agreement(head_funcs[antc]['number'], head_funcs[anph]['number'])
                    hf['agreement'] = hf['g_agr'] and hf['c_agr'] and hf['n_agr']
                    hf['gn_agr'] = hf['g_agr'] and hf['n_agr']
                    hf['match'] = head_funcs[antc]['text'] == head_funcs[anph]['text']
                    head_funcs[token.id] = hf
    func = 'POS isNomn isNpro g_agr n_agr c_agr agreement verbs oids conjs nouns'.split()
    for i, f in enumerate(pairs[0].keys()):
        hhf = head_funcs[np_head[f]]
        for a in pairs[0][f]:
            ana = head_funcs[np_head[a]]
            # keep only the selected features, sorted by name
            feats = sorted((x for x in hhf.items() + ana.items() if x[0] in func),
                           key=lambda x: x[0])
            if not i:  # header row before the first pair
                print 'id' + '\t' + '\t'.join(str(k) for k, v in feats)
            print str(f) + '_' + str(a) + '\t' + '\t'.join(str(v) for k, v in feats)
Code example #25
def cut_long_dialogs(data_path:str, output_path:str, max_len:int=128):
    print('Reading data...')
    data = read_corpus(data_path)

    print('Splitting dialogs...')
    dialogs = [d.split('|') for d in data]
    print('Cutting data...')
    dialogs = [cut_dialog(d, max_len) for d in tqdm(dialogs)]
    num_dialogs_before = len(dialogs)

    print('Saving data...')
    dialogs = list(filter(len, dialogs))
    save_corpus(dialogs, output_path)
    print('Done! Num dialogs reduced: {} -> {}'.format(num_dialogs_before, len(dialogs)))
Code example #26
def dialogs_from_lines(input_data_path:str, output_data_path:str, n_lines: int, eos:str, n_dialogs:int):
    n_lines, n_dialogs = int(n_lines), int(n_dialogs) # TODO: argparse?

    print('Reading data...')
    lines = read_corpus(input_data_path)
    lines = lines[:n_dialogs * n_lines]

    print('Generating dialogs')
    # step by n_lines so dialogs do not overlap; with the truncation above
    # this yields exactly n_dialogs dialogs
    dialogs = [lines[i:i + n_lines] for i in range(0, len(lines), n_lines)]
    dialogs = [eos.join(d) for d in dialogs]

    print('Saving corpus')
    save_corpus(dialogs, output_data_path)
    print('Done!')
Code example #27
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
Code example #28
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # build the detokenizer once instead of once per sentence
    detokenizer = TreebankWordDetokenizer()
    detokenizer.DOUBLE_DASHES = (re.compile(r'--'), r'--')
    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = detokenizer.detokenize(top_hyp.value)
            f.write(hyp_sent + '\n')
Code example #29
def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus')
    parser.add_argument('dev_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if args.output == '':
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    vocab = Vocabulary()
    train_corpus = read_corpus(args.corpus, vocab)
    dev_corpus = read_corpus(args.dev_corpus, vocab)
    print('Vocab size:', len(vocab), file=sys.stderr)

    with open(args.output + '.vocab', 'w') as f:
        for word in vocab.i2w:
            print(word, file=f)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab),
                  args.tied)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)

    harness.train(model, train_corpus, dev_corpus, optimizer, args)
Code example #30
def multi_parameter_tuning(args):
    lrs = [1e-2, 1e-3, 5e-3, 1e-4, 5e-4]
    hidden_sizes = [128, 256, 512]
    lr_decays = [0.9, 0.7, 0.5]
    trial = 0  # avoid shadowing the builtin `iter`
    valid_metric = {}  # dev ppl of each trained model
    dev_data_src = read_corpus(args['dev_source'], source='src')
    dev_data_tgt = read_corpus(args['dev_target'], source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))
    for i in lrs:
        for j in hidden_sizes:
            for k in lr_decays:
                print('trial %d =================================================' % trial)
                arg_test = dict(args)  # copy, so trials do not mutate the shared args
                arg_test['lr'], arg_test['hidden_size'], arg_test['lr_decay'] = i, j, k
                arg_test['save_to'] = 'model_' + 'lr_' + str(i) + 'hd_size_' + str(j) + 'lr_dys_' + str(k) + '.bin'
                run.train(arg_test)
                model = NMT.load(arg_test['save_to'])
                dev_ppl = run.evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric[arg_test['save_to']] = dev_ppl
                print(arg_test['save_to'], '  validation: trial %d, dev. ppl %f' % (trial, dev_ppl), file=sys.stderr)
                trial += 1
    best_model = min(valid_metric, key=valid_metric.get)  # pass the method, not its result
    print('best model is %s, ppl is %f' % (best_model, valid_metric[best_model]))
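run.evaluate_ppl is referenced here but not shown. A sketch consistent with the full training loop later in this listing, assuming model(src_sents, tgt_sents) returns per-example log-likelihoods:

import numpy as np
import torch

def evaluate_ppl(model, dev_data, batch_size=32):
    """Perplexity over dev data (sketch)."""
    was_training = model.training
    model.eval()
    cum_loss, cum_tgt_words = 0., 0.
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            loss = -model(src_sents, tgt_sents).sum()
            cum_loss += loss.item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # skip leading <s>
    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)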
Code example #31
def decode(args: Dict[str, str]):
    """
    performs decoding on a test set, and save the best-scoring decoding results. 
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    """
    threshold = 2.0
    test_data = read_corpus(args['TEST_SOURCE_FILE'], source='src')

    print(f"load model from {args['MODEL_PATH_I']}", file=sys.stderr)
    model_I = NMT.load(args['MODEL_PATH_I'])
    model_I.encoder.dropout = nn.Dropout(0.)

    ces_I = []
    with torch.no_grad():
        for sent in tqdm(test_data, desc='Decoding', file=sys.stdout):
            loss = model_I([sent]).item()
            ce = loss / len(sent)
            ces_I.append(ce)

    print(f"load model from {args['MODEL_PATH_N']}", file=sys.stderr)
    model_N = NMT.load(args['MODEL_PATH_N'])
    model_N.encoder.dropout = nn.Dropout(0.)

    ces_N = []
    with torch.no_grad():
        for sent in tqdm(test_data, desc='Decoding', file=sys.stdout):
            loss = model_N([sent]).item()
            ce = loss / len(sent)
            ces_N.append(ce)

    ces_diff = []
    for ce_I, ce_N in zip(ces_I, ces_N):
        ces_diff.append(ce_I - ce_N)

    selected = 0
    with open(args['OUTPUT_FILE'], 'w') as f:
        for words, ce in zip(test_data, ces_diff):
            if ce < threshold:
                selected += 1
                words = words[1:-1]  # drop boundary tokens
                sent = ("".join(words)).replace("▁", " ▁").strip()
                # f.write(str(ce) + ' ')
                f.write(sent + '\n')

    print("%d out of %d sentences selected." % (selected, len(test_data)))
Code example #32
    def save(self, file_path):
        json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), open(file_path, 'w'), indent=2)

    @staticmethod
    def load(file_path):
        entry = json.load(open(file_path, 'r'))
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


if __name__ == '__main__':
    args = docopt(__doc__)

    print('read in source sentences: %s' % args['--train-src'])
    print('read in target sentences: %s' % args['--train-tgt'])

    src_sents = read_corpus(args['--train-src'], source='src')
    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff']))
    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

    vocab.save(args['VOCAB_FILE'])
    print('vocabulary saved to %s' % args['VOCAB_FILE'])
Code example #33
                                try:
                                    a = stat[i][distance][context]
                                except KeyError:
                                    continue
                                bestrule = curr_rule
                                top_rules[bestrule] = a
    return top_rules, bestscore, a


def random_choice(corpus):
    for s in corpus:
        for token in s:
            try:
                if token.has_ambig():
                    token.disambiguate(random.choice(token.getPOStags().split('_')))
            except:
                pass


if __name__ == '__main__':
    inc = read_corpus(sys.stdin)
    #s = context_stats(inc, join_context=True)
    #print s
    #rs = scores(s, [])
    #pprint(rs[1:3])
    '''for x in rs[1].keys():
        print x.display()
        print rs[1][x]
    print rs[2]'''
    r = Rule('ADVB NOUN', 'NOUN', (1, 'PNCT'), 0)
Code example #34
File: models.py Project: gloria310/tag-cloud
    #idf = np.log(float(N)/appCorpus)
    weights = np.zeros(tf.shape, dtype='float') # init array for weights
    for i in range(0, tf.shape[0]):
        lengthD = np.sum(tf[i,:])
        weights[i,:] = idf * tf[i,:] * (k1+1) / (tf[i,:] + k1 * (1 - b + b * lengthD/avgD))
    return weights

def run_model(model_name, vocab, tf):
    speech_info = utils.read_speech_info()
    N = len(speech_info) # number of docs in corpus
    weights = []
    if model_name == "bm25":
        weights = bm25(tf)
    elif model_name == "tfidf":
        weights = tfidf(tf)

    top_words = {}
    for i in range(N):
        gen_tags = utils.get_tags(vocab, weights, i)
        top_words[speech_info[i]] = gen_tags
    return top_words

if __name__ == '__main__':
    (vocab,tf) = utils.read_corpus()
    #weights = tfidf(tf)
    #topWords = utils.get_tags(vocab, weights, 0) # get top 20 words for Obama's Acceptance Speech
    #print topWords
    run_model("tfidf", vocab, tf)
    #speech_info = utils.read_speech_info()
    #for si in speech_info:
        #print speech_info[si]
Code example #35
File: nmt.py Project: chubbymaggie/pytorch_basic_nmt
def train(args: Dict):
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                input_feed=args['--input-feed'],
                label_smoothing=float(args['--label-smoothing']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            # (batch_size)
            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
Code example #36
File: pictures.py Project: OpenCorpora/opencorpora
def t(f):
    f = read_corpus(f)
    return n(f)[0]
Code example #37
File: learn.py Project: OpenCorpora/opencorpora
    args = p.parse_args()

    #name = os.path.split(args.corpus)[1]
    name = args.corpus
    path = '.'
    write = False
    n = 0
    #out = sys.stdout
    copyfile(name, '%s.orig' % name)
    out = open('%s.rules' % name, 'w')
    i = 0
    best_rules = []
    best_score = 0

    orig = open(args.corpus, 'r')
    inc = list(read_corpus(orig))
    orig.close()

    while True:
        context_freq = context_stats(inc, f=args.f)
        scores_rule = scores(context_freq, best_rules, f=args.f)
        #ss = scores_rule[0]
        best_rule = scores_rule[0]

        for r in best_rule.keys():
            best_rules.append(r)
        best_score = scores_rule[1]
        applied = scores_rule[2]

        if best_score <= 0:
            output = open('%s.final' % name, 'w')