def main(cfg: DictConfig):
    """Evaluate a diacritics-restoration model on the dev file.

    Loads the serialized model and encoder/decoder vocabularies from
    ``cfg.model_dir``, strips diacritics from the dev sentences to build
    the model input, runs batched inference, and prints word-level
    accuracy (computed over words longer than one character).
    """
    model_file = os.path.join(cfg.model_dir, 'model.pt')
    model = torch.load(model_file, map_location=torch.device(device)).to(device)
    model.eval()

    vocab_file = os.path.join(cfg.model_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.model_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)
    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    eval_df = pd.read_table(cfg.dev_file, header=None, names=['target'])
    # NOTE(review): removed a leftover debugging slice (iloc[100:102]) that
    # restricted evaluation to two sentences; the whole dev set is scored now.
    eval_df['source'] = eval_df.apply(lambda x: remove_diacritics(x.target), axis=1)
    eval_df['src_encoded'] = eval_df.apply(lambda x: vocab.encode(x.source), axis=1)

    target = eval_df.target.to_numpy(dtype=str)
    target_words = np.hstack(np.char.split(target, sep=' '))
    # Skip one-character tokens (punctuation etc.) when scoring.
    target_words = np.array(list(filter(lambda x: len(x) > 1, target_words)))

    X_dev = eval_df.src_encoded.to_numpy()
    predicted = []
    test_iter = BatchedIterator(X_dev, batch_size=10)
    # Inference only: no_grad avoids building autograd graphs.
    with torch.no_grad():
        for bi, src in enumerate(test_iter.iterate_once()):
            src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)
            outputs = model(src_padded)
            outputs_pred = outputs.argmax(-1)
            for output in outputs_pred:
                predicted.append(vocab.decode_output(output.tolist()))

    predicted = np.hstack(np.char.split(predicted, sep=' '))
    predicted = np.array(list(filter(lambda x: len(x) > 1, predicted)))

    # Guard: a length mismatch would previously make the elementwise
    # comparison fail (or silently broadcast); warn and score the overlap.
    if len(predicted) != len(target_words):
        print(f'Warning: token count mismatch: {len(predicted)} predicted '
              f'vs {len(target_words)} target')
    n = min(len(predicted), len(target_words))
    correct = (target_words[:n] == predicted[:n]).sum()
    accuracy = correct / len(predicted)
    print(accuracy)
def main(cfg: DictConfig):
    """Run diacritics restoration, either over a file or interactively.

    With ``cfg.use_file`` set, restores diacritics for every line of
    ``cfg.file`` in batches and writes the predictions to
    ``<exp_dir>/inference/<lang>_predicted.txt``. Otherwise reads
    sentences from stdin until the user types "exit".
    """
    model_file = os.path.join(cfg.exp_dir, 'model.pt')
    model = torch.load(model_file, map_location=torch.device(device)).to(device)
    model.eval()

    vocab_file = os.path.join(cfg.exp_dir, 'vocab.pkl')
    vocab_dec_file = os.path.join(cfg.exp_dir, 'vocab_dec.pkl')
    with open(vocab_file, 'rb') as file:
        vocab_enc = pickle.load(file)
    with open(vocab_dec_file, 'rb') as file:
        vocab_dec = pickle.load(file)
    vocab = Vocabulary(vocab=vocab_enc, vocab_dec=vocab_dec)

    if cfg.use_file:
        source = get_processed_data(cfg.file, vocab)
        predicted = []
        test_iter = BatchedIterator(source, batch_size=128)
        # Inference only: no_grad avoids building autograd graphs.
        with torch.no_grad():
            for bi, src in enumerate(test_iter.iterate_once()):
                src_padded = pad_data(src[0], vocab_enc['<PAD>']).to(device)
                outputs = model(src_padded)
                outputs_pred = outputs.argmax(-1)
                for output in outputs_pred:
                    predicted.append(vocab.decode_output(output.tolist()))

        pred_file = os.path.join(cfg.exp_dir, f'inference/{cfg.lang}_predicted.txt')
        os.makedirs(os.path.dirname(pred_file), exist_ok=True)
        with open(pred_file, 'w') as file:
            file.write('\n'.join(predicted))
    else:
        # Interactive loop: one sentence per prompt until the user types "exit".
        sentence = input("Sentence: ")
        while sentence != "exit":
            sentence = sentence.lower()
            encoded = vocab.encode(sentence)
            encoded = torch.tensor(encoded)
            encoded = torch.unsqueeze(encoded, 0).to(device)
            with torch.no_grad():
                output = model(encoded)
            output = output.argmax(-1).to('cpu').tolist()
            decoded = vocab.decode_output(output[0])
            print(f"Restored diacritics version: {decoded}")
            sentence = input("Sentence: ")
''' Generates time series latent and observed state for the HMMesque models '''
import pandas as pd

from analytics import *
from vocabulary import Vocabulary
from fillin_heuristics import *

### Build vocabularies

# Observed-state vocabulary: one entry per distinct species/cell/gene
# mention and per distinct (first, second, type) relation triple.
obs_voc = Vocabulary()
for val in species.text.drop_duplicates():
    obs_voc.encode(val, 'species')
for val in cells.text.drop_duplicates():
    obs_voc.encode(val, 'cells')
for val in genes.text.drop_duplicates():
    obs_voc.encode(val, 'genes')

# itertuples avoids positional integer indexing on a label-indexed Series
# (t[0]/t[1]/t[2]), which is deprecated in pandas 2.x and removed in 3.0.
for first, second, rel_type in relations[['first', 'second', 'type']].drop_duplicates().itertuples(index=False):
    obs_voc.encode('%s|%s|%s' % (first, second, rel_type), 'relations')

# Latent-state vocabulary: one entry per distinct (type, text) context pair.
lat_voc = Vocabulary()
for ix, t in context[['type', 'text']].drop_duplicates().iterrows():
    kind, val = t
    lat_voc.encode(val, kind)

#####################
from vocabulary import Vocabulary
from collections import Counter

# Smoke test: build a vocabulary from token counts, then round-trip a
# tokenized review through encode/decode and print each stage.
review = [
    "The", "pizza", "is", "excellent", ".",
    "The", "wine", "is", "not", ".",
]

token_counts = Counter(review)
print(token_counts)

vocab = Vocabulary(token_counts)
print(vocab)

print(vocab.encode(review))
print(vocab.decode(vocab.encode(review)))