def inference(params):
    embedding_size = params['embedding_size']
    vocab_size = params['vocab_size']
    sentence_len = params['num_words_before'] + params['num_words_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd', 0.0)
    embedding_device = utils.get_dict_value(params, 'embedding_device', None)
    embedding_initializer = utils.get_dict_value(params, 'embedding_initializer', None)
    # default to None (no dropout); a default of 0.0 would trip the keep-prob
    # check below and zero out the entire embedding matrix
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob', None)
    print("USING EMBEDDING DEVICE %s" % embedding_device)
    if embedding_device is not None:
        with tf.device(embedding_device):
            embedding_matrix = nlp.variable_with_weight_decay(
                'embedding_matrix', [vocab_size, embedding_size],
                initializer=embedding_initializer, wd=embedding_wd)
    else:
        embedding_matrix = nlp.variable_with_weight_decay(
            'embedding_matrix', [vocab_size, embedding_size],
            initializer=embedding_initializer, wd=embedding_wd)
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [embedding_matrix], _ = core.dropout([embedding_matrix], [embedding_keep_prob])
    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    # name must be passed by keyword; the third positional argument of
    # tf.nn.embedding_lookup is partition_strategy, not name
    emb_sentence = tf.nn.embedding_lookup(embedding_matrix, input_sentence,
                                          name='emb_sentence')
    enc_sentence, _ = sentence_encoder(emb_sentence, params)
    return enc_sentence, None
def optimizer(optimizer_param, loss_nodes, learning_rate, var_lists=None):
    # this version has gradient clipping
    optimizer_nodes = []
    max_grad_norm = utils.get_dict_value(optimizer_param, 'max_grad_norm', 5)
    # if var_lists is None, then make it a list of None matching the # of loss nodes
    if var_lists is None:
        var_lists = [None] * len(loss_nodes)
    # just create adam optimizers
    for loss_node, var_list in zip(loss_nodes, var_lists):
        loss = loss_node
        if utils.get_dict_value(optimizer_param,
                                trainer.ENABLE_REGULARIZATION_PARAM_NAME, False):
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_constant = 1.0  # already have wd, make this parametrizable
            loss += reg_constant * sum(reg_losses)
        if utils.get_dict_value(optimizer_param, 'optimizer', 'adam') == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        else:
            print("USING SGD OPTIMIZER WITH LR OF %s" % learning_rate)
            opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        grads_vars = opt.compute_gradients(loss, var_list=var_list)
        grads = [g for (g, v) in grads_vars]
        tvars = [v for (g, v) in grads_vars]
        # clip only when a positive norm is configured; otherwise apply the
        # gradients unmodified
        if max_grad_norm > 0:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        train_op = opt.apply_gradients(zip(grads, tvars))
        optimizer_nodes.append(train_op)
    return optimizer_nodes
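# Hedged usage sketch (illustrative, not from the repo): wire a toy loss
# through the gradient-clipping optimizer above. The dict keys match the ones
# optimizer() reads; max_grad_norm > 0 enables tf.clip_by_global_norm.
def _example_clipped_optimizer_usage():
    x = tf.placeholder(tf.float32, [None, 10], 'example_x')
    w = tf.get_variable('example_w', [10, 1])
    loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))
    [train_op] = optimizer({'max_grad_norm': 5.0, 'optimizer': 'adam'},
                           [loss], learning_rate=0.001)
    return train_op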
def _gen_data(params, sentence, num_before, num_after, null_sample_factor=0):
    global max_value
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    start_char = utils.get_dict_value(params, 'start_char', 1)
    slen = len(sentence)
    z = [ord(x) for x in sentence if ord(x) > vocab_size - 1]
    if len(z) > 0 and max(z) > max_value:
        max_value = max(z)
        print('max_value = %s' % max_value)
    sentence = [min(ord(x), vocab_size - 1) for x in sentence]
    sentence = [0] * (num_before - 1) + [start_char] + sentence + [0] * (num_after - 1)
    null_list = []
    pos_list = []
    keychars = [ord(',')]
    for i in range(num_before, num_before + slen):
        if sentence[i] in keychars:
            # positive example: context window with the keyword character removed
            pos_list.append(sentence[i - num_before:i] + sentence[i + 1:i + num_after + 1])
        else:
            # null example: window around a position holding no keyword
            null_list.append(sentence[i - num_before:i] + sentence[i:i + num_after])
    if null_sample_factor < 0:
        random.shuffle(null_list)
        null_list = null_list[:(len(pos_list) + 1)]
    elif null_sample_factor > 0:
        random.shuffle(null_list)
        null_list = null_list[:(null_sample_factor * (len(pos_list) + 1))]
    result = [[x, 0] for x in null_list] + [[x, 1] for x in pos_list]
    for x in result:
        yield x
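# Hedged illustration of the generator above (assumes a module-level
# max_value exists, as the generator references it): the comma in the
# sentence yields one positive window (label 1, comma removed), while every
# other character position yields a candidate null window (label 0),
# subsampled to about null_sample_factor * (len(pos_list) + 1) examples.
def _example_gen_data_usage():
    params = {'vocab_size': 256, 'start_char': 1}
    return list(_gen_data(params, 'we ate, then left', 3, 3, null_sample_factor=1))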
def sentence_encoder(emb_sentence, params, name='encoded_sentence'):
    """
    Convolutional sentence encoder: a bank of 1-D convolutions over the
    embedded sentence, concatenated and fed through a fully connected network.

    @param emb_sentence: embedded input sentence tensor
    @param params: parameter dict (conv and mlp configuration, see defaults below)
    @return: ([encoder output node], {})
    """
    conv_num_features = utils.get_dict_value(params, 'conv_num_features',
                                             [[100, 100, 100], [100]])
    conv_widths = utils.get_dict_value(params, 'conv_widths', [[2, 3, 4], [3]])
    conv_keep_probs = utils.get_dict_value(params, 'conv_keep_probs', 0.5)
    mlp_config = utils.get_dict_value(params, 'mlp_config', [512])
    bipass_conv = utils.get_dict_value(params, 'bipass_conv', False)
    mlp_activations = utils.get_dict_value(params, 'mlp_activations', 'sigmoid')
    mlp_dropout_keep_probs = utils.get_dict_value(params, 'mlp_keep_probs', 0.9)
    use_no_conv_path = utils.get_dict_value(params, 'use_no_conv_path', False)
    weight_wd_regularization = utils.get_dict_value(params, 'weight_wd_regularization', 0.0)
    bias_wd_regularization = utils.get_dict_value(params, 'bias_wd_regularization', 0.0)
    if bipass_conv:
        conv_group = [emb_sentence]
    else:
        if use_no_conv_path:
            conv_group = [emb_sentence]
        else:
            conv_group = []
        for i, (conv_num_feature, conv_width) in enumerate(
                zip(conv_num_features, conv_widths)):
            conv_out = nlp.conv1d_array(emb_sentence, conv_num_feature, conv_width,
                                        name='conv%s' % str(i),
                                        w_wds=weight_wd_regularization,
                                        b_wds=bias_wd_regularization,
                                        keep_probs=conv_keep_probs)
            conv_group.append(conv_out)
    conv_out, _ = misc.concat(conv_group)
    mlp_out, _ = mlp.fully_connected_network(conv_out, mlp_config,
                                             layer_activations=mlp_activations,
                                             dropout_keep_probs=mlp_dropout_keep_probs)
    return [tf.identity(mlp_out[0], name=name)], {}
def load(self, model_dir):
    self._model_dir = model_dir
    self._paramsfile = os.path.join(self._model_dir, 'params.py')
    self._params = utils.load_param_file(self._paramsfile)
    ckpt = os.path.join(utils.get_dict_value(self._params, 'output_location'),
                        utils.get_dict_value(self._params, 'model_name') + '.ckpt')
    self._e = Evaluator.load2(ckpt)
def __init__(self, file_list, indexer=None, params=None):
    self._file_list = file_list
    self._cur_list = []
    self._next_file = 0
    self._keywords = [',']
    self._num_before = utils.get_dict_value(params, 'num_words_before', 5)
    self._num_after = utils.get_dict_value(params, 'num_words_after', 5)
    self.load_next_file()
    self._indexer = indexer
    self._current_epoch = 0
    self._current_index = 0
def inference(params):
    feature_count = utils.get_dict_value(params, 'feature_count')
    mlp_config = utils.get_dict_value(params, 'mlp_config')
    mlp_activations = utils.get_dict_value(params, 'mlp_activations')
    mlp_dropout_keep_probs = utils.get_dict_value(params, 'mlp_dropout_keep_probs')
    x = tf.placeholder(tf.float32, [None, feature_count], 'features')
    mlp_out, _ = mlp.fully_connected_network(
        [x], mlp_config,
        layer_activations=mlp_activations,
        dropout_keep_probs=mlp_dropout_keep_probs)
    return mlp_out, _
def load(self, model_dir):
    self._model_dir = model_dir
    self._paramsfile = os.path.join(self._model_dir, 'params.py')
    self._params = utils.load_param_file(self._paramsfile)
    self._num_before = utils.get_dict_value(self._params, "num_words_before")
    self._num_after = utils.get_dict_value(self._params, "num_words_after")
    ckpt = os.path.join(utils.get_dict_value(self._params, 'output_location'),
                        utils.get_dict_value(self._params, 'model_name') + '.ckpt')
    vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'),
                              'vocab.pkl')
    self._e = Evaluator.load2(ckpt)
    self._i = TextIndexer.from_file(vocab_file)
    self._keywords = self._params['keywords']
    self._id_to_word = self._params['id_to_keyword']
def contrastive(network, name='contrastive_loss', params=None):
    """
    Implement contrastive loss as given in LeCun's paper.
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    @param network: input network that contains 3 nodes (left, right, label)
    @return: ([loss node], {})
    """
    # assert(isinstance(network, list) and (len(network) == 3),
    #        'losses.contrastive: input must contain 3 nodes')
    # note: tf.mul/tf.sub/tf.div are the pre-1.0 TensorFlow names for
    # tf.multiply/tf.subtract/tf.divide
    with tf.variable_scope(name):
        margin = common_utils.get_dict_value(params, 'contrastive_loss_margin', 128)
        l2_name = common_utils.get_dict_value(params, 'l2_name', 'l2')
        l2_norm_name = common_utils.get_dict_value(params, 'l2_norm_name', 'l2_norm')
        left_feature = network[0]
        right_feature = network[1]
        label = network[2]
        one = tf.constant(1.0, dtype=tf.float32)
        zero = tf.constant(0.0, dtype=tf.float32)
        margin_constant = tf.constant(margin, dtype=tf.float32)
        # rebalance the labels so positive (dissimilar) pairs carry full batch
        # weight when any are present
        label_sum = tf.reduce_sum(label)
        Y = tf.cond(
            label_sum > 0,
            lambda: tf.mul(
                tf.div(tf.cast(tf.size(label), dtype=tf.float32), label_sum),
                label),
            lambda: label)
        one_minus_y = tf.sub(one, Y, name='one_minus_y')
        N = tf.constant(1 / float(right_feature.get_shape().as_list()[1]))
        Dw2 = tf.reduce_sum(tf.square(tf.sub(left_feature, right_feature)), 1,
                            name=l2_name)
        # margin term for dissimilar pairs (Y > 0)
        right_term = tf.mul(
            Y,
            tf.square(
                tf.maximum(
                    zero,
                    tf.sub(margin_constant,
                           tf.mul(N, tf.sqrt(Dw2, name=l2_norm_name))))),
            name='right_term')
        # attraction term for similar pairs (Y = 0)
        left_term = tf.mul(one_minus_y, tf.mul(tf.square(N), Dw2), name='left_term')
        loss = tf.mul(tf.constant(0.5),
                      tf.reduce_mean(tf.add(right_term, left_term)),
                      name=name)
    return [loss], {}
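# For reference, the loss above follows Hadsell, Chopra & LeCun (2006).
# With f_l, f_r the two feature vectors, d = dim(f), N = 1/d,
# D_W = ||f_l - f_r||_2, margin m, and Y the class-rebalanced label
# (Y > 0 for dissimilar pairs):
#
#     L = 1/2 * mean[ (1 - Y) * (N * D_W)^2 + Y * max(0, m - N * D_W)^2 ]
#
# so similar pairs (Y = 0) are pulled together, while dissimilar pairs are
# pushed apart until their scaled distance N * D_W exceeds the margin.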
def _adam_optimizer(optimizer_param, loss_nodes, learning_rate, var_lists=None):
    """
    Default optimizer uses adam optimizer
    @param optimizer_param: dict
    @param loss_nodes: list of tensorflow nodes
    @param var_lists: list of list of variables to optimize
    @return: a list of tensorflow optimizer nodes. This list has equal length
             as loss_nodes.
    """
    optimizer_nodes = []
    # if var_lists is None, then make it a list of None matching the # of loss nodes
    if var_lists is None:
        var_lists = [None] * len(loss_nodes)
    # just create adam optimizers
    for loss_node, var_list in zip(loss_nodes, var_lists):
        loss = loss_node
        if utils.get_dict_value(optimizer_param, ENABLE_REGULARIZATION_PARAM_NAME, False):
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_constant = 1.0  # already have wd, make this parametrizable
            loss += reg_constant * sum(reg_losses)
        # var_list must be passed by keyword: the second positional argument
        # of minimize() is global_step
        min_node = tf.train.AdamOptimizer(learning_rate).minimize(loss, var_list=var_list)
        optimizer_nodes.append(min_node)
    return optimizer_nodes
def _default_train_iteration_done(trainer, epoch, index, iteration_count,
                                  loss_value, training_done, run_results, params):
    stats = params['stats']
    next_batch_time = np.mean(stats['next_batch_time_list'])
    training_time = np.mean(stats['training_time_list'])
    overhead_time = np.mean(stats['overhead_time_list'])
    if iteration_count == 1:
        trainer._training_log_file = open(
            os.path.join(utils.get_dict_value(params, 'output_location'),
                         'training_log.txt'), 'w')
        trainer._training_log_file.write(
            "%s,%s,%s,%s,%s,%s,%s,%s\n" %
            ('epoch', 'iteration', 'time', 'loss', 'next_batch_time',
             'training_time', 'overhead_time', 'efficiency'))
    msg = ("%02d, %04d, %s, %s, %0.4f, %0.5f, %0.5f, %0.5f" %
           (epoch, iteration_count, time(), loss_value, next_batch_time,
            training_time, overhead_time,
            training_time / sum([next_batch_time, training_time, overhead_time])))
    if "eval_results" in params:
        eval_results = params['eval_results']
        for x in eval_results:
            msg += ", %0.4f" % x
    print('%s' % msg)
    trainer._training_log_file.write('%s\n' % msg)
    trainer._training_log_file.flush()
    if trainer._model_log_db is not None:
        trainer._model_log_db.on_update(epoch, index, iteration_count, loss_value, msg)
    return False
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if iteration_count == 1:
        trainer._out_file = open(
            os.path.join(utils.get_dict_value(params, 'output_location'),
                         'training_log.txt'), 'w')
    msg = ("%s, %s" % (time(), loss_value))
    print('%s: %s' % (iteration_count, msg))
    trainer._out_file.write('%s\n' % msg)
    trainer._out_file.flush()
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if hasattr(trainer, 'last_epoch') and trainer.last_epoch != epoch:
        # lr decay
        lrd_epoch_start = utils.get_dict_value(params,
                                               'learning_rate_decay_start_epoch', -1)
        if epoch > lrd_epoch_start:
            lr_decay = utils.get_dict_value(params, 'learning_rate_decay', -1)
            lr = utils.get_dict_value(params, 'learning_rate', 0.001)
            if lr_decay > 0:
                new_lr = lr * (lr_decay ** (epoch - lrd_epoch_start))
                print("NEW LEARNING RATE %s" % new_lr)
                trainer.set_learning_rate(new_lr)
    params['eval_results'] = [run_results['tpp']]
    trainer.last_epoch = epoch
    return framework.trainer._default_train_iteration_done(
        trainer, epoch, index, iteration_count, loss_value, training_done,
        run_results, params)
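# Worked example of the decay schedule above (hypothetical values): with
# learning_rate = 0.001, learning_rate_decay = 0.9 and
# learning_rate_decay_start_epoch = 0, entering epoch 3 gives
#     new_lr = 0.001 * 0.9 ** (3 - 0) = 0.000729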
def main(argv):
    try:
        argv = FLAGS(argv)  # parse flags
    except gflags.FlagsError as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)
    print(FLAGS.paramsfile)
    params = utils.load_param_file(FLAGS.paramsfile)
    data = load_results(params)
    # column 1 of the training log is the iteration count; 8192 appears to be
    # the per-iteration batch size, used to convert iterations to records seen
    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[8])
    ax.set(xlabel='Million Records Seen', ylabel='Accuracy @ 1',
           title=params['model_name'])
    ax.grid()
    max_value = np.max(data[8])
    # plt.ylim((.75, math.ceil(max_value * 10) / 10))
    # plt.ylim((.75, 1))
    fig.savefig(os.path.join(utils.get_dict_value(params, 'output_location'),
                             "accuracy.png"))
    plt.show(block=False)
    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[3])
    ax.set(xlabel='Million Records Seen', ylabel='Loss',
           title=params['model_name'])
    ax.grid()
    min_value = np.min(data[3])
    # plt.ylim((math.floor(min_value * 10) / 10, 1))
    # plt.ylim((.75, 1))
    fig.savefig(os.path.join(utils.get_dict_value(params, 'output_location'),
                             "loss.png"))
    plt.show(block=False)
    input("Press enter to exit...")
def gen_data(dataobj, tokens, keywords, num_before=5, num_after=5, pad_tok="<pad>",
             null_sample_factor=0, add_redundant_keyword_data=True,
             use_negative_only_data=True, ignore_negative_data=False,
             add_keyword_removal_data=False):
    dataobj._mean = np.mean(dataobj._y_count)
    dataobj._std = np.std(dataobj._y_count)
    dataobj._max = np.max(dataobj._y_count)
    dataobj._min = np.min(dataobj._y_count)
    params = dataobj._params
    sampling = utils.get_dict_value(params, 'data_sampling', 'freeform')
    tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    class_offset = 1
    if ignore_negative_data:
        class_offset = 0
    results = []
    unk_list = []
    no_insert_list = []
    for toki in range(num_before, len(tokens) - num_before - 4):
        tok0 = tokens[toki]
        if tok0 in keywords:
            ki = keywords[tok0]
            if (sampling == 'freeform') \
                    or ((sampling == 'uniform')
                        and (dataobj._y_count[ki + class_offset] < dataobj._min + 5)):
                # dataobj._y_count[ki+class_offset] < dataobj._min + 5  # + dataobj._std * 1.0
                results.append(
                    (tokens[(toki - num_before):toki]
                     + tokens[(toki + 1):(toki + num_after + 1)],
                     ki + class_offset))
            # else:  # add unk
            #     if 'unk' in keywords:
            #         ki = keywords.index('unk')
            #         unk_list.append((tokens[(toki-num_before):toki]
            #                          + tokens[(toki+1):(toki+num_after+1)], ki + class_offset))
            #     no_insert_list.append((tokens[(toki-num_before):toki]
            #                            + tokens[(toki):(toki+num_after)], 0))
    # num_to_add = min([int(len(results)/len(keywords)), len(no_insert_list), len(unk_list)])
    # if num_to_add == 0 and len(results) > 0:
    #     num_to_add = 1
    # random.shuffle(no_insert_list)
    # random.shuffle(unk_list)
    # if num_to_add > 0:
    #     results += no_insert_list[:num_to_add]
    #     results += unk_list[:num_to_add]
    return results
def load_results(params):
    traininglog_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                                    'training_log.txt')
    data = []
    with open(traininglog_file, 'r') as f:
        csvdata = csv.reader(f, delimiter=',')
        for row in csvdata:
            for i, col in enumerate(row):
                if i >= len(data):
                    data.append([])
                data[i].append(float(col))
    return data
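# Hedged usage sketch (not from the repo): pull the iteration and loss
# columns out of the column-major structure returned by load_results().
# Assumes the log contains numeric rows only; a header line such as the one
# written by _default_train_iteration_done would make float() raise and would
# need to be skipped first.
def _example_load_results_usage(params):
    data = load_results(params)
    iterations, losses = data[1], data[3]
    return list(zip(iterations, losses))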
def inference(params):
    embedding_size = params['embedding_size']
    sentence_len = params['num_before'] + params['num_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd')
    embedding_device = utils.get_dict_value(params, 'embedding_device')
    embedding_initializer = utils.get_dict_value(params, 'embedding_initializer')
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob')
    word_embedding_size = utils.get_dict_value(params, 'word_embedding_size',
                                               embedding_size)
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    if embedding_device is not None:
        with tf.device(embedding_device):
            word_embedding_matrix = nlp.variable_with_weight_decay(
                'word_embedding_matrix', [vocab_size, word_embedding_size],
                initializer=embedding_initializer, wd=embedding_wd)
    else:
        word_embedding_matrix = nlp.variable_with_weight_decay(
            'word_embedding_matrix', [vocab_size, word_embedding_size],
            initializer=embedding_initializer, wd=embedding_wd)
    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    # name must be passed by keyword; the third positional argument of
    # tf.nn.embedding_lookup is partition_strategy, not name
    emb_sentence = tf.nn.embedding_lookup(word_embedding_matrix, input_sentence,
                                          name='emb_word')
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [emb_sentence], _ = core.dropout([emb_sentence], [embedding_keep_prob])
    enc_sentence, _ = encoder(emb_sentence, params)
    return enc_sentence, None
def __init__(self, params=None,
             files=[['features_000.npy', 'scores_000.npy'],
                    ['features_001.npy', 'scores_001.npy'],
                    ['features_002.npy', 'scores_002.npy'],
                    ['features_003.npy', 'scores_003.npy']]):
    self._files = files
    self._current_file = 0
    self._current_index = 0
    self._current_epoch = 0
    self._num_records_seen = 0
    self._separate_epochs = True
    self._num_minibatches = 0
    self._data_dir = utils.get_dict_value(params, 'data_dir')
    self.load_next_file()
def generate_model_input_sentences(tokens, params):
    pad_tok = '<pad>'
    num_before = params['num_words_before']
    num_after = params['num_words_after']
    start_token = utils.get_dict_value(params, 'start_token')
    if start_token is not None and len(start_token) > 0:
        tokens = [pad_tok] * (num_before - 1) + [start_token] \
                 + tokens + [pad_tok] * (num_after + 5)
    else:
        tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    result = []
    for toki in range(num_before, len(tokens) - num_before - 5):
        result.append(tokens[toki - num_before:toki] + tokens[toki:toki + num_after])
    return result
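# Hedged example (hypothetical parameter values): with two context words on
# each side and no start_token, every input token produces one window made of
# the two tokens before it, the token itself, and the one after it.
def _example_generate_model_input_sentences():
    params = {'num_words_before': 2, 'num_words_after': 2, 'start_token': None}
    windows = generate_model_input_sentences(['i', 'ate', 'an', 'apple'], params)
    # windows[0] == ['<pad>', '<pad>', 'i', 'ate']
    # windows[1] == ['<pad>', 'i', 'ate', 'an']
    return windows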
def load(self, model_dir):
    self._model_dir = model_dir
    self._paramsfile = os.path.join(self._model_dir, 'params.py')
    self._params = utils.load_param_file(self._paramsfile)
    self._num_before = utils.get_dict_value(self._params, "num_words_before")
    self._num_after = utils.get_dict_value(self._params, "num_words_after")
    ckpt = os.path.join(utils.get_dict_value(self._params, 'output_location'),
                        utils.get_dict_value(self._params, 'model_name') + '.ckpt')
    vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'),
                              'vocab.pkl')
    self._e = Evaluator.load2(ckpt)
    self._i = TextIndexer.from_file(vocab_file)
    with open(os.path.join(utils.get_dict_value(self._params, 'output_location'),
                           'keywords.pkl'), 'rb') as f:
        keywords = pickle.load(f)
    self._params['keywords'] = keywords
    self._keywords = self._params['keywords']
    self._keyword_map, self._keyword_list = gen_keywords(self._params)
def __init__(self, tellme_datadir='/mnt/work/tellme/data',
             datafiles=['trn1.npy', 'trn2.npy'], params={}):
    self._tcid_count = 2
    ticid_mapfile = os.path.join(tellme_datadir, "tcid.map")
    with open(ticid_mapfile, "r") as f:
        for line in f:
            value = line.strip("\r\n").split("\t")
            if int(value[1]) > 1:
                self._tcid_count += 1
    self._tcid_count += 1
    self._data_chunks = []
    print("self._tcid_count = %s" % self._tcid_count)
    print("Loading data...")
    self._separate_epochs = utils.get_dict_value(params, "separate_epochs", False)
    self._tcids_data = np.load(os.path.join(tellme_datadir, datafiles[0]))
    self._timing_info_data = np.load(os.path.join(tellme_datadir, datafiles[1]))
    print("done loading data!")
    self._current_epoch = 0
    self._current_index = 0
    self._num_minibatches = 0
def eval(params, save_accuracy_file=True, batch_size=5000, num_batches=20,
         topn=1, verbose=True):
    ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                        utils.get_dict_value(params, 'model_name') + '.ckpt')
    accuracy_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                                 'accuracy.txt')
    e = Evaluator.load2(ckpt)
    if verbose:
        e.dump_variable_sizes()
    training_data = TellmeData(tellme_datadir='/mnt/work/tellme/data',
                               datafiles=['tst1.npy', 'tst2.npy'])
    correct_list = []
    incorrect_list = []
    dt_list = []
    # num_test_records = 0000  # 22596471
    for i in range(num_batches):
        batch = training_data.next_batch(batch_size=batch_size)
        batch_y = batch['y']
        del batch['y']
        bef = time()
        [r] = e.eval(batch, {'sm_decision'})
        aft = time()
        dt_list.append(aft - bef)
        ccorrect = 0
        cincorrect = 0
        for model, gt in zip(r, batch_y):
            topn_idx = np.argpartition(model, -topn)[-topn:]
            # model_predict = np.argmax(model)
            if gt in topn_idx:  # model_predict == int(gt)
                ccorrect += 1
            else:
                cincorrect += 1
        correct_list.append(ccorrect)
        incorrect_list.append(cincorrect)
        if verbose:
            print('accuracy = %0.4f' % (ccorrect / (ccorrect + cincorrect)))
    accuracy_list = [c / (c + ic) for c, ic in zip(correct_list, incorrect_list)]
    correct = np.sum(correct_list)
    incorrect = np.sum(incorrect_list)
    total_accuracy = np.mean(accuracy_list)  # (correct / (correct + incorrect))
    accuracy_std = np.std(accuracy_list)
    accuracy_sem = accuracy_std / np.sqrt(len(correct_list))
    if save_accuracy_file:
        with open(accuracy_file, 'a') as f:
            f.write('%s %s\n' % (time(), total_accuracy))
    if verbose:
        print('accuracy = %0.4f +/- %0.4f (std=%0.4f)' %
              (total_accuracy, accuracy_sem, accuracy_std))
    dt_mean = np.mean(dt_list)
    dt_std = np.std(dt_list)
    if verbose:
        print("dt_mean = %0.4f dt_std = %0.4f" % (dt_mean, dt_std))
    return total_accuracy, accuracy_sem, accuracy_std
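# Note on the top-n test above (illustrative example, not from the repo):
# np.argpartition(model, -topn)[-topn:] returns the indices of the topn
# largest scores in arbitrary order, which is all a membership test needs.
# For example:
#     np.argpartition(np.array([0.1, 0.5, 0.2, 0.9]), -2)[-2:]
# contains the indices 1 and 3, in some order.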
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
import data
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import pickle
import model
import os
import shutil
import copy
import numpy as np

param_file = 'params.py'
params = utils.load_param_file(param_file)
params['num_classes'] = len(params['keywords']) + 1
indexer = TextIndexer.from_txt_file(
    utils.get_dict_value(params, 'vocab_file'),
    max_size=utils.get_dict_value(params, 'max_vocab_size', -1))
indexer.add_token('<pad>')
indexer.add_token('unk')
output_indexer = copy.deepcopy(indexer)
output_indexer.add_token('<blank>')
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(os.path.join(utils.get_dict_value(params, 'output_location'),
                                       'vocab.pkl'))
files_to_copy = [param_file]
for file in files_to_copy:
    shutil.copyfile(file, os.path.join(utils.get_dict_value(params, 'output_location'),
                                       file))
params['vocab_size'] = indexer.vocab_size()
if 'training_data_dir' in params:
    training_data = ClassifierData.get_training_data(
        base_dir=params['training_data_dir'], indexer=indexer, params=params)
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
if not utils.get_dict_value(params, 'ignore_negative_data', False):
    params['num_classes'] = len(params['keywords']) + 1
else:
    params['num_classes'] = len(params['keywords'])
indexer = TextIndexer.from_txt_file(utils.get_dict_value(params, 'vocab_file'))
indexer.add_token('<pad>')
indexer.add_token('unk')
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(os.path.join(utils.get_dict_value(params, 'output_location'),
                                       'vocab.pkl'))
shutil.copyfile(param_file,
                os.path.join(utils.get_dict_value(params, 'output_location'), param_file))
params['vocab_size'] = indexer.vocab_size()
training_data = ClassifierData.get_monolingual_training(
    base_dir=params['monolingual_dir'], indexer=indexer, params=params)
def eval(params, save_accuracy_file=True, batch_size=5000, num_batches=20,
         topn=1, verbose=True):
    num_before = utils.get_dict_value(params, "num_words_before")
    num_after = utils.get_dict_value(params, "num_words_after")
    ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                        utils.get_dict_value(params, 'model_name') + '.ckpt')
    accuracy_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                                 'accuracy.txt')
    vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                              'vocab.pkl')
    keywords_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                                 'keywords.pkl')
    e = Evaluator.load2(ckpt)
    i = TextIndexer.from_file(vocab_file)
    # earlier hand-written test sentences, kept for reference:
    # test_sentence = "<S> ___ quick brown fox jumped over the lazy dog"
    test_sentence = "<S> ___ is no way to know whether it will work"
    # test_sentence = "<S> ___ house is on fire"
    # test_sentence = "<S> ___ in your best interest to lie"
    # test_sentence = "<S> ___ yours and I cannot touch it"
    # test_sentence = "<S> I ate a ___ and an apple"
    # test_sentence = "<S> I have to take ___ life away"
    # test_sentence = "<S> ___ may and it is raining"
    # test_sentence = "<S> This will take ___ before it will actually work"
    # test_sentence = "<S> this is probably bigger ___ that"
    # test_sentence = "<S> ___ is no place like home"
    # test_sentence = "I have ___ of money"
    # test_sentence = "<S> I think I ___ have it"
    test_sentence = "<S> don 't forget to get orange , banana , and ___ ."
    # test_sentence = "<S> in the heat ___ the night"
    # test_sentence = "<S> in the river , ___ the boat"
    # test_sentence = "<S> nothing can be ___ from the truth"
    # test_sentence = "<S> the ___ knot will unwind"
    # test_sentence = "<S> if you keep playing, you will ___ ."
    test_sentence = "<s> I ate a ___ of oranges ."
    # test_sentence = "<s> I ate a ___ and oranges ."
    # test_sentence = "<s> I live in a ___ ."
    # test_sentence = "<s> I ate a ___ of oranges ."
    test_sentence = "<s> I ate a ___ and oranges ."
    test_sentence = "<s> I live in a ___ ."
    test_sentence = "<s> I have seen it on him , and can ___ to it ."
    test_sentence = "<s> the thieves ___ the library and got very little for their pains ."
    # input data
    with open('/mnt/work/NeuralRewriting/eval/small_eval_data.json') as f:
        data = json.load(f)
    with open(keywords_file, 'rb') as f:
        k = pickle.load(f)
    unk_list = []
    for q in data:
        query_word = q['query_word']
        orig_sent = q['orig_sent']
        options = q['options']
        orig_sent = orig_sent.replace(query_word, "___")
        orig_sent = "<s> " + orig_sent
        test_sentence = orig_sent.lower()
        split_sentence = list(split_sentence_for_eval(test_sentence.split(),
                                                      ["___"], num_before, num_after))
        # print(split_sentence[0][0])
        _, sentence, _, _ = i.index_wordlist(split_sentence[0][0])
        bef = time()
        r = e.eval({'sentence': [sentence]}, {'sm_decision'})
        aft = time()
        sm = r[0][0]
        for o in options:
            synonym = o['synonym']
            if synonym not in k:
                score = -1000
                unk_list += [synonym]
            else:
                score = math.log(sm[k.index(synonym)])
            o['clmtV1'] = score
            print(score)
    # save output
    with open('/mnt/work/NeuralRewriting/eval/small_eval_data_out.json', 'w') as f:
        json.dump(data, f)
# params['logfile'].write(msg)
# params['logfile'].write('\n')
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if iteration_count == 1:
        trainer._out_file = open('output.txt', 'w')
    msg = ("%s, %s" % (time(), loss_value))
    print('%s: %s' % (iteration_count, msg))
    trainer._out_file.write('%s\n' % msg)
    # flush so the log stays readable while training is still running
    trainer._out_file.flush()


trainer = Trainer(inference=model.inference,
                  batch_size=utils.get_dict_value(params, 'batch_size', 128),
                  loss=losses.softmax_xentropy,
                  model_output_location="./output",
                  name=MODEL_NAME,
                  training_data=training_data,
                  train_iteration_done=train_iteration_done,
                  params=params)
trainer.run(restore_latest_ckpt=False, save_network=True, save_ckpt=True,
            mini_batches_between_checkpoint=utils.get_dict_value(
                params, 'mini_batches_between_checkpoint', 1000),
            additional_nodes_to_evaluate=['encoded_sentence'],
            on_checkpoint_saved=on_checkpoint_saved)
def get_model_name(self):
    return utils.get_dict_value(self._params, 'model_name', '_UNKNOWN_MODEL_')
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import word_classifier.data as data
import os
import urllib.parse
from time import time
from urllib.parse import urlparse
from http.server import BaseHTTPRequestHandler, HTTPServer

run_server = True
paramsfile = "params.py"
data_base_dir = ""
http_port = 8080
params = utils.load_param_file(paramsfile)
vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                    utils.get_dict_value(params, 'model_name') + '.ckpt')
print(ckpt)
e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)
num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')


def split_sentence_for_eval(sentence, keywords, num_before, num_after):
    result = data.gen_data(sentence, keywords, num_before=num_before,
                           num_after=num_after, ignore_negative_data=True,
                           add_redundant_keyword_data=False)
    return result
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import os
import numpy as np

run_server = False
params = utils.load_param_file('params.py')
vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                    utils.get_dict_value(params, 'model_name') + '.ckpt')
sentences = ['The apple , which is rotten is not edible']
e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)
num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')
sentence = "In simple terms , high precision means that an algorithm " \
           "returned substantially more relevant results than irrelevant ones , while" \
           " high recall means that an algorithm returned most of the relevant results ."
sentence = "<S> In simple terms , high precision means that algorithm " \
           "returned substantially more relevant results than irrelevant ones , while" \
           " high recall means that algorithm returned most of relevant results ."
# sentence = "<S> Precision can be seen as measure of exactness or quality , "\
from framework.utils.data.text_indexer import TextIndexer
from tellme.data import TellmeData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer
from eval import eval
from time import time
import numpy as np
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
shutil.copyfile(param_file,
                os.path.join(utils.get_dict_value(params, 'output_location'), param_file))
training_data = TellmeData()
params['vocab_size'] = training_data.get_tcid_count()
params['num_classes'] = training_data.get_tcid_count()


def on_checkpoint_saved(trainer, params, save_path):
    msg = 'saved checkpoint: ' + save_path
    print(msg)
    accuracy, accuracy_sem, accuracy_std = eval(params)
    params['eval_results'] = [accuracy, accuracy_sem, accuracy_std]