def run(in_model_folder):
    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = load(
            in_model_folder, sess)
        # Reverse mappings (id -> word, id -> label) for decoding model output
        rev_vocab = {word_id: word for word, word_id in vocab.iteritems()}
        rev_label_vocab = {
            label_id: label for label, label_id in label_vocab.iteritems()
        }
        print 'Done loading'
        # Interactive loop: read utterances from stdin until EOF or an empty line
        try:
            line = raw_input().strip()
            while line:
                print filter_line(line,
                                  model,
                                  [(vocab, label_vocab, rev_label_vocab),
                                   (vocab, vocab, rev_vocab)],
                                  actual_config,
                                  sess)
                line = raw_input().strip()
        except EOFError:
            pass
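
# A minimal sketch of a command-line entry point for run(); argparse and the
# 'model_folder' argument name are assumptions, not part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Interactively tag utterances read from stdin')
    parser.add_argument('model_folder', help='folder containing a saved model')
    args = parser.parse_args()
    run(args.model_folder)
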
def init_model(trainset, in_model_folder, resume, in_config, in_session):
    model = None
    if not resume:
        # Build the input vocabulary, optionally augmenting tokens with POS tags
        if in_config['use_pos_tags']:
            utterances = []
            for utterance, postags in zip(trainset['utterance'], trainset['pos']):
                utterance_augmented = ['{}_{}'.format(token, pos)
                                       for token, pos in zip(utterance, postags)]
                utterances.append(utterance_augmented)
        else:
            utterances = trainset['utterance']
        vocab, _ = make_vocabulary(utterances, in_config['max_vocabulary_size'])
        char_vocab = make_char_vocabulary()
        label_vocab, _ = make_vocabulary(trainset['tags'].values,
                                         in_config['max_vocabulary_size'],
                                         special_tokens=[])
        # One output head per task: tagging predicts labels, the LM predicts words
        task_output_dimensions = []
        for task in in_config['tasks']:
            if task == 'tag':
                task_output_dimensions.append(len(label_vocab))
            elif task == 'lm':
                task_output_dimensions.append(len(vocab))
            else:
                raise NotImplementedError
        model = create_model(len(vocab),
                             in_config['embedding_size'],
                             in_config['max_input_length'],
                             task_output_dimensions)
        init = tf.global_variables_initializer()
        in_session.run(init)
        save(in_config, vocab, char_vocab, label_vocab, in_model_folder, in_session)
    # Whether freshly created or resumed, read the model and vocabularies back from disk
    model, actual_config, vocab, char_vocab, label_vocab = load(
        in_model_folder, in_session, existing_model=model)
    return model, actual_config, vocab, char_vocab, label_vocab
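
# A minimal sketch of the config keys init_model() reads; the keys come from the
# function above, but the example values below are assumptions for illustration only.
EXAMPLE_CONFIG = {
    'use_pos_tags': False,         # if True, tokens become 'token_POS'
    'max_vocabulary_size': 20000,  # cap for the word and label vocabularies
    'tasks': ['tag', 'lm'],        # one output head per task
    'embedding_size': 128,
    'max_input_length': 50,
}

# Hypothetical usage (trainset is a pandas DataFrame with 'utterance', 'pos', 'tags'):
# with tf.Session() as sess:
#     model, config, vocab, char_vocab, label_vocab = init_model(
#         trainset, 'model_folder', resume=False,
#         in_config=EXAMPLE_CONFIG, in_session=sess)
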
def main(in_dataset, in_model_folder, in_trainset_size, in_epochs_number, in_result_folder):
    model, vocab, label_vocab = load(in_model_folder)
    # Shuffle the dataset, then split it into a training and a test part
    in_dataset = in_dataset.sample(frac=1).reset_index(drop=True)
    trainset, testset = in_dataset[:in_trainset_size], in_dataset[in_trainset_size:]
    train_data_points = [(tokens, tags)
                         for tokens, tags in zip(trainset['utterance'], trainset['tags'])]
    test_data_points = [(tokens, tags)
                        for tokens, tags in zip(testset['utterance'], testset['tags'])]
    train_data = make_dataset(train_data_points, vocab, label_vocab)
    test_data = make_dataset(test_data_points, vocab, label_vocab)
    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)
    # The test data also serves as the development set here
    train(model,
          train_data,
          test_data,
          test_data,
          os.path.join(in_result_folder, MODEL_NAME),
          epochs=in_epochs_number,
          batch_size=1)
    save(model, vocab, label_vocab, in_result_folder, save_model=False)
    print 'Testset accuracy: {:.3f}'.format(evaluate(model, *test_data))
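
# A minimal sketch of a command-line driver for this training entry point; argparse,
# pandas, the argument names, and the default values are assumptions, not part of
# the original script.
if __name__ == '__main__':
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser(
        description='Retrain a saved tagger on a JSON dataset')
    parser.add_argument('dataset_file')   # hypothetical: JSON readable with pd.read_json
    parser.add_argument('model_folder')
    parser.add_argument('result_folder')
    parser.add_argument('--trainset_size', type=int, default=1000)  # assumed default
    parser.add_argument('--epochs', type=int, default=10)           # assumed default
    args = parser.parse_args()

    main(pd.read_json(args.dataset_file), args.model_folder,
         args.trainset_size, args.epochs, args.result_folder)
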
def main(in_dataset_file, in_model_folder, in_result_file):
    dataset = pd.read_json(in_dataset_file)
    with tf.Session() as sess:
        model, vocab, char_vocab, label_vocab = load(in_model_folder, sess)
        rev_label_vocab = {
            label_id: label for label, label_id in label_vocab.iteritems()
        }
        print 'Done loading'
        X, y = make_dataset(dataset, vocab, label_vocab)
        y_pred = predict(model, (X, y), rev_label_vocab, sess)
        # Split the flat prediction sequence back into per-utterance tag sequences
        tags_predicted = []
        tag_idx = 0
        for tag_seq in dataset['tags']:
            tags_predicted.append(y_pred[tag_idx:tag_idx + len(tag_seq)])
            tag_idx += len(tag_seq)
        result = pd.DataFrame({
            'utterance': dataset['utterance'],
            'tags_gold': dataset['tags'],
            'tags_predicted': tags_predicted
        })
        result.to_json(in_result_file)
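
# A minimal sketch of reading the prediction file back in and computing token-level
# accuracy from the 'tags_gold'/'tags_predicted' columns written above; this helper
# and the accuracy metric are assumptions, not part of the original script.
def token_accuracy(in_result_file):
    result = pd.read_json(in_result_file)
    correct, total = 0, 0
    for gold_seq, pred_seq in zip(result['tags_gold'], result['tags_predicted']):
        for gold, pred in zip(gold_seq, pred_seq):
            correct += int(gold == pred)
            total += 1
    return correct / float(total) if total else 0.0
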
def main(in_dataset_file, in_model_folder, in_mode):
    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = load(
            in_model_folder, sess)
        rev_vocab = {word_id: word for word, word_id in vocab.iteritems()}
        rev_label_vocab = {
            label_id: label for label, label_id in label_vocab.iteritems()
        }
        # Pick the evaluation routine matching the requested dataset format
        if in_mode == 'deep_disfluency':
            eval_result = eval_deep_disfluency(
                model,
                [(vocab, label_vocab, rev_label_vocab), (vocab, vocab, rev_vocab)],
                in_dataset_file,
                actual_config,
                sess)
        elif in_mode == 'babi':
            eval_result = eval_babi(
                model,
                [(vocab, label_vocab, rev_label_vocab), (vocab, vocab, rev_vocab)],
                in_dataset_file,
                actual_config,
                sess)
        else:
            raise NotImplementedError
        for key, value in eval_result.iteritems():
            print '{}:\t{}'.format(key, value)
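
# A minimal sketch of a command-line entry point for this evaluation script; argparse
# and the argument names are assumptions, while the two mode values come from the
# branches above.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Evaluate a saved model on a dataset')
    parser.add_argument('dataset_file')
    parser.add_argument('model_folder')
    parser.add_argument('mode', choices=['deep_disfluency', 'babi'])
    args = parser.parse_args()
    main(args.dataset_file, args.model_folder, args.mode)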