Code example #1
import os
import pickle

import numpy as np
import pandas as pd
import torch
from omegaconf import DictConfig

# Project-local helpers not shown in this snippet: Vocabulary,
# BatchedIterator, pad_data and remove_diacritics (a sketch of the
# iterator helpers follows this example).
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # assumed module-level setup


def main(cfg: DictConfig):
    model_file = os.path.join(cfg.model_dir, 'model.pt')
    model = torch.load(model_file,
                       map_location=torch.device(device)).to(device)
    model.eval()

    vocab_file = os.path.join(cfg.model_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.model_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)
    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    eval_df = pd.read_table(cfg.dev_file, header=None, names=['target'])
    eval_df = eval_df.iloc[100:102]  # restrict evaluation to a two-row slice (rows 100-101)
    eval_df['source'] = eval_df.apply(lambda x: remove_diacritics(x.target),
                                      axis=1)
    eval_df['src_encoded'] = eval_df.apply(lambda x: vocab.encode(x.source),
                                           axis=1)

    target = eval_df.target.to_numpy(dtype=str)

    target_words = np.hstack(np.char.split(target, sep=' '))
    target_words = np.array(list(filter(lambda x: len(x) > 1, target_words)))  # drop one-character tokens

    print(eval_df.iloc[0].source)
    print(eval_df.iloc[1].source)

    X_dev = eval_df.src_encoded.to_numpy()

    predicted = []
    test_iter = BatchedIterator(X_dev, batch_size=10)

    for bi, src in enumerate(test_iter.iterate_once()):
        src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)

        outputs = model(src_padded)
        print(outputs.shape)
        outputs_pred = outputs.argmax(-1)

        for output in outputs_pred:
            decoded_sentence = vocab.decode_output(output.tolist())
            print(decoded_sentence)
            predicted.append(decoded_sentence)

    predicted = np.hstack(np.char.split(predicted, sep=' '))
    predicted = np.array(list(filter(lambda x: len(x) > 1, predicted)))

    print(predicted.shape)
    print(target_words.shape)
    correct = (target_words == predicted).sum()  # assumes predictions align word-for-word with the targets
    accuracy = correct / len(predicted)
    print(accuracy)
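Examples #1 and #2 call BatchedIterator and pad_data without showing them. Below is a minimal sketch of what those helpers might look like, inferred only from how they are used here; the project's real implementations may differ.

import torch


class BatchedIterator:
    # Sketch: wraps one or more aligned arrays and yields them in
    # fixed-size batches, one tuple of slices per step.
    def __init__(self, *arrays, batch_size=128):
        self.arrays = arrays
        self.batch_size = batch_size

    def iterate_once(self):
        n = len(self.arrays[0])
        for start in range(0, n, self.batch_size):
            yield tuple(arr[start:start + self.batch_size] for arr in self.arrays)


def pad_data(batch, pad_id):
    # Sketch: right-pads variable-length ID sequences to the longest one in
    # the batch and stacks them into a (batch, max_len) LongTensor.
    max_len = max(len(seq) for seq in batch)
    padded = [list(seq) + [pad_id] * (max_len - len(seq)) for seq in batch]
    return torch.tensor(padded, dtype=torch.long)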
Code example #2
import os
import pickle

import torch
from omegaconf import DictConfig

# Project-local helpers not shown here: Vocabulary, BatchedIterator,
# pad_data and get_processed_data (a plausible sketch of the latter
# follows this example).
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # assumed module-level setup


def main(cfg: DictConfig):
    model_file = os.path.join(cfg.exp_dir, 'model.pt')
    model = torch.load(model_file,
                       map_location=torch.device(device)).to(device)

    model.eval()

    vocab_file = os.path.join(cfg.exp_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.exp_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)

    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    if cfg.use_file:
        source = get_processed_data(cfg.file, vocab)
        predicted = []
        test_iter = BatchedIterator(source, batch_size=128)

        for bi, src in enumerate(test_iter.iterate_once()):
            src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)

            outputs = model(src_padded)

            outputs_pred = outputs.argmax(-1)

            for output in outputs_pred:
                predicted.append(vocab.decode_output(output.tolist()))

        pred_file = os.path.join(cfg.exp_dir,
                                 f'inference/{cfg.lang}_predicted.txt')
        os.makedirs(os.path.dirname(pred_file), exist_ok=True)

        with open(pred_file, 'w') as file:
            file.write('\n'.join(predicted))
    else:
        sentence = input("Sentence: ")
        while sentence != "exit":
            sentence = sentence.lower()
            encoded = vocab.encode(sentence)
            encoded = torch.tensor(encoded)
            encoded = torch.unsqueeze(encoded, 0).to(device)
            output = model(encoded)
            output = output.argmax(-1).to('cpu').tolist()
            decoded = vocab.decode_output(output[0])
            print(f"Restored diacritics version: {decoded}")
            sentence = input("Sentence: ")
Code example #3
''' Generates time series latent and observed states for the HMMesque models '''
import pandas as pd
# The star imports below are expected to supply the species, cells, genes,
# relations and context DataFrames used throughout this script.
from analytics import *
from vocabulary import Vocabulary
from fillin_heuristics import *

### Build vocabularies
obs_voc = Vocabulary()

for val in species.text.drop_duplicates():
    obs_voc.encode(val, 'species')

for val in cells.text.drop_duplicates():
    obs_voc.encode(val, 'cells')

for val in genes.text.drop_duplicates():
    obs_voc.encode(val, 'genes')

for _, row in relations[['first', 'second', 'type']].drop_duplicates().iterrows():
    val = '%s|%s|%s' % (row['first'], row['second'], row['type'])
    obs_voc.encode(val, 'relations')


lat_voc = Vocabulary()

for ix, t in context[['type', 'text']].drop_duplicates().iterrows():
    kind, val = t
    lat_voc.encode(val, kind)
#####################
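The Vocabulary used in this example takes a second kind argument, unlike the ones in the other snippets. A minimal sketch consistent with these calls, in which each (kind, value) pair gets a stable integer ID on first sight; this is an assumption about the vocabulary module, not its actual code.

from collections import defaultdict


class Vocabulary:
    # Sketch: one independent value -> ID table per kind.
    def __init__(self):
        self._ids = defaultdict(dict)

    def encode(self, value, kind):
        ids = self._ids[kind]
        if value not in ids:
            ids[value] = len(ids)  # assign the next free ID on first sight
        return ids[value]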
Code example #4
from vocabulary import Vocabulary
from collections import Counter
review = [
    "The", "pizza", "is", "excellent", ".", "The", "wine", "is", "not", "."
]
count = Counter(review)  # token -> frequency
print(count)
vocabulary = Vocabulary(count)
print(vocabulary)
print(vocabulary.encode(review))                     # tokens -> integer IDs
print(vocabulary.decode(vocabulary.encode(review)))  # round-trips back to the tokens
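For reference, a minimal Counter-based Vocabulary that would make this snippet run as written: IDs are assigned by descending frequency, and encode/decode map token sequences to ID sequences and back. This is a sketch under those assumptions, not the actual vocabulary module.

class Vocabulary:
    def __init__(self, counter):
        # Most frequent tokens get the smallest IDs.
        tokens = [tok for tok, _ in counter.most_common()]
        self.token_to_id = {tok: i for i, tok in enumerate(tokens)}
        self.id_to_token = tokens

    def encode(self, tokens):
        return [self.token_to_id[tok] for tok in tokens]

    def decode(self, ids):
        return [self.id_to_token[i] for i in ids]

    def __repr__(self):
        return 'Vocabulary(%d tokens)' % len(self.id_to_token)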