Example #1
import os
import pickle

import torch
from omegaconf import DictConfig

# Assumed project-local imports (not shown in this listing): Vocabulary,
# BatchedIterator, get_processed_data, pad_data, and a `device` string such
# as 'cuda' or 'cpu'.


def main(cfg: DictConfig):
    model_file = os.path.join(cfg.exp_dir, 'model.pt')
    # Note: on PyTorch >= 2.6 torch.load defaults to weights_only=True, so
    # unpickling a full model object needs weights_only=False.
    model = torch.load(model_file,
                       map_location=torch.device(device)).to(device)

    model.eval()

    vocab_file = os.path.join(cfg.exp_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.exp_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)

    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    if cfg.use_file:
        source = get_processed_data(cfg.file, vocab)
        predicted = []
        test_iter = BatchedIterator(source, batch_size=128)

        for bi, src in enumerate(test_iter.iterate_once()):
            src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)

            # Run inference without tracking gradients.
            with torch.no_grad():
                outputs = model(src_padded)

            outputs_pred = outputs.argmax(-1)

            for output in outputs_pred:
                predicted.append(vocab.decode_output(output.tolist()))

        pred_file = os.path.join(cfg.exp_dir,
                                 f'inference/{cfg.lang}_predicted.txt')
        os.makedirs(os.path.dirname(pred_file), exist_ok=True)

        with open(pred_file, 'w') as file:
            file.write('\n'.join(predicted))
    else:
        sentence = input("Sentence: ")
        while sentence != "exit":
            sentence = sentence.lower()
            encoded = vocab.encode(sentence)
            encoded = torch.tensor(encoded)
            encoded = torch.unsqueeze(encoded, 0).to(device)
            with torch.no_grad():
                output = model(encoded)
            output = output.argmax(-1).to('cpu').tolist()
            decoded = vocab.decode_output(output[0])
            print(f"Restored diacritics version: {decoded}")
            sentence = input("Sentence: ")
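
Both examples call project-local helpers (Vocabulary, pad_data,
BatchedIterator) whose definitions are not part of this listing. The sketch
below is only meant to make their calling conventions concrete; the names
appear in the examples, but every body here is inferred from usage, not taken
from the actual project.

import torch

class Vocabulary:
    """Sketch: maps characters to ids and back."""
    def __init__(self, vocab, vocab_dec):
        self.vocab = vocab          # e.g. {'a': 0, ..., '<PAD>': n}
        self.vocab_dec = vocab_dec  # inverse mapping, id -> character

    def encode(self, sentence):
        # The real helper presumably also handles unknown characters.
        return [self.vocab[ch] for ch in sentence]

    def decode_output(self, ids):
        # The real helper presumably also strips padding positions.
        return ''.join(self.vocab_dec[i] for i in ids)

def pad_data(sequences, pad_id):
    """Sketch: right-pad a batch of id lists to a common length."""
    max_len = max(len(seq) for seq in sequences)
    return torch.tensor([list(seq) + [pad_id] * (max_len - len(seq))
                         for seq in sequences])

class BatchedIterator:
    """Sketch: yields fixed-size slices of the data in one pass.

    Each yielded item is a 1-tuple, which is why the examples index src[0].
    """
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size

    def iterate_once(self):
        for start in range(0, len(self.data), self.batch_size):
            yield (self.data[start:start + self.batch_size],)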

Example #2

import os
import pickle

import numpy as np
import pandas as pd
import torch
from omegaconf import DictConfig

# Assumed project-local imports (not shown): Vocabulary, BatchedIterator,
# pad_data, remove_diacritics, and `device`.


def main(cfg: DictConfig):
    model_file = os.path.join(cfg.model_dir, 'model.pt')
    model = torch.load(model_file,
                       map_location=torch.device(device)).to(device)
    model.eval()

    vocab_file = os.path.join(cfg.model_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.model_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)
    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    # Load the dev set (one sentence per line), keeping only two rows here
    # for a quick check.
    eval_df = pd.read_table(cfg.dev_file, header=None, names=['target'])
    eval_df = eval_df.iloc[100:102]
    # Derive the diacritics-free source side from the target, then encode it
    # to id sequences.
    eval_df['source'] = eval_df.apply(lambda x: remove_diacritics(x.target),
                                      axis=1)
    eval_df['src_encoded'] = eval_df.apply(lambda x: vocab.encode(x.source),
                                           axis=1)

    target = eval_df.target.to_numpy(dtype=str)

    # Split the targets into words and drop one-character tokens.
    target_words = np.hstack(np.char.split(target, sep=' '))
    target_words = np.array(list(filter(lambda x: len(x) > 1, target_words)))

    print(eval_df.iloc[0].source)
    print(eval_df.iloc[1].source)

    X_dev = eval_df.src_encoded.to_numpy()

    predicted = []
    test_iter = BatchedIterator(X_dev, batch_size=10)

    for bi, src in enumerate(test_iter.iterate_once()):
        src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)

        # Run inference without tracking gradients.
        with torch.no_grad():
            outputs = model(src_padded)
        print(outputs.shape)
        outputs_pred = outputs.argmax(-1)

        for output in outputs_pred:
            decoded_sentence = vocab.decode_output(output.tolist())
            print(decoded_sentence)
            predicted.append(decoded_sentence)

    # Apply the same word split/filter to the predictions so the two arrays
    # align.
    predicted = np.hstack(np.char.split(predicted, sep=' '))
    predicted = np.array(list(filter(lambda x: len(x) > 1, predicted)))

    print(predicted.shape)
    print(target_words.shape)
    # Word-level accuracy over the kept tokens; the elementwise comparison
    # assumes the two arrays line up one-to-one.
    correct = (target_words == predicted).sum()
    accuracy = correct / len(predicted)
    print(accuracy)
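
Example #2 additionally relies on a remove_diacritics helper that the listing
does not include. Given the task (stripping diacritics so the model can
restore them), a plausible implementation uses Unicode NFD decomposition; the
sketch below is written under that assumption and is not the project's actual
code.

import unicodedata

def remove_diacritics(text):
    """Sketch: drop combining marks after NFD decomposition.

    For example, 'árvíztűrő' becomes 'arvizturo'. The real helper may treat
    some letters differently depending on the target language.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))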