Example No. 1
def inference(params):
    embedding_size = params['embedding_size']
    vocab_size = params['vocab_size']
    sentence_len = params['num_words_before'] + params['num_words_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd', 0.0)
    embedding_device = utils.get_dict_value(params, 'embedding_device', None)
    embedding_initializer = utils.get_dict_value(params,
                                                 'embedding_initializer', None)
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob',
                                               1.0)  # 1.0 = no embedding dropout
    print("USING EMBEDDING DEVICE %s" % embedding_device)
    if embedding_device is not None:
        with tf.device(embedding_device):
            embedding_matrix = nlp.variable_with_weight_decay(
                'embedding_matrix', [vocab_size, embedding_size],
                initializer=embedding_initializer,
                wd=embedding_wd)
    else:
        embedding_matrix = nlp.variable_with_weight_decay(
            'embedding_matrix', [vocab_size, embedding_size],
            initializer=embedding_initializer,
            wd=embedding_wd)
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [embedding_matrix], _ = core.dropout([embedding_matrix],
                                             [embedding_keep_prob])
    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    emb_sentence = tf.nn.embedding_lookup(embedding_matrix, input_sentence,
                                          name='emb_sentence')
    enc_sentence, _ = sentence_encoder(emb_sentence, params)

    return enc_sentence, None
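
For reference, a hypothetical params dict covering every key read by this inference function (defaults mirror the get_dict_value calls above; the embedding_size value is purely illustrative):

params = {
    'embedding_size': 100,         # illustrative
    'vocab_size': 256,
    'num_words_before': 5,
    'num_words_after': 5,
    'embedding_wd': 0.0,
    'embedding_device': None,      # e.g. '/cpu:0' to pin the embedding matrix
    'embedding_initializer': None,
    'embedding_keep_prob': 1.0,    # 1.0 skips the dropout branch
}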
Example No. 2
def optimizer(optimizer_param, loss_nodes, learning_rate, var_lists=None):
    # this version has gradient clipping
    optimizer_nodes = []
    max_grad_norm = utils.get_dict_value(optimizer_param, 'max_grad_norm', 5)

    # if var_lists is None, then make it a list of None matching the # of loss nodes
    if var_lists is None:
        var_lists = [None] * len(loss_nodes)

    # just create adam optimizers
    for loss_node, var_list in zip(loss_nodes, var_lists):
        loss = loss_node

        if utils.get_dict_value(optimizer_param,
                                trainer.ENABLE_REGULARIZATION_PARAM_NAME,
                                False):
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_constant = 1.0  # already have wd, make this parametrizable
            loss += reg_constant * sum(reg_losses)
        if utils.get_dict_value(optimizer_param, 'optimizer',
                                'adam') == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        else:
            print("USING SGD OPTIMIZER WITH LR OF %s" % learning_rate)
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        grads_vars = optimizer.compute_gradients(loss, var_list=var_list)
        grads = [g for (g, v) in grads_vars]
        grad_vars = [v for (g, v) in grads_vars]
        if max_grad_norm > 0:
            # clip the global gradient norm; otherwise apply the raw gradients
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        train_op = optimizer.apply_gradients(zip(grads, grad_vars))
        optimizer_nodes.append(train_op)
    return optimizer_nodes
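
The clip-then-apply pattern used above, shown in isolation (a minimal TensorFlow 1.x sketch on a toy variable, independent of the framework helpers):

import tensorflow as tf

w = tf.Variable([1.0, -2.0])
loss = tf.reduce_sum(tf.square(w))
opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = opt.compute_gradients(loss)
grads, variables = zip(*grads_and_vars)
clipped, _ = tf.clip_by_global_norm(list(grads), 5.0)  # cap the global gradient norm at 5
train_op = opt.apply_gradients(zip(clipped, variables))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)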
Example No. 3
def _gen_data(params, sentence, num_before, num_after, null_sample_factor=0):
    global max_value
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    start_char = utils.get_dict_value(params, 'start_char', 1)
    slen = len(sentence)
    z = [ord(x) for x in sentence if ord(x) > vocab_size - 1]
    if len(z) > 0 and max(z) > max_value:
        max_value = max(z)
        print('max_value = %s' % max_value)
    sentence = [min(ord(x), vocab_size - 1) for x in sentence]
    sentence = [0] * (num_before - 1) + [start_char] + sentence + [0] * (num_after - 1)

    null_list = []
    pos_list = []
    keychars = [ord(',')]
    for i in range(num_before, num_before + slen):
        if sentence[i] in keychars:
            pos_list.append(sentence[i - num_before:i] +
                            sentence[i + 1:i + num_after + 1])
        null_list.append(sentence[i - num_before:i] +
                         sentence[i:i + num_after])

    if null_sample_factor < 0:
        random.shuffle(null_list)
        null_list = null_list[:((len(pos_list) + 1))]
    elif null_sample_factor > 0:
        random.shuffle(null_list)
        null_list = null_list[:(null_sample_factor * (len(pos_list) + 1))]
    result = [[x, 0] for x in null_list] + [[x, 1] for x in pos_list]
    for x in result:
        yield x
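
A hypothetical call, assuming _gen_data and its utils helper are in scope. Each yielded item is [window, label]: a window of num_before + num_after character codes around a position, with label 1 when that position holds a comma (the comma itself is dropped from its window) and 0 otherwise:

max_value = 0  # module-level counter referenced by _gen_data
for window, label in _gen_data({'vocab_size': 256, 'start_char': 1},
                               'red, green', num_before=3, num_after=3):
    print(label, window)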
Example No. 4
def sentence_encoder(emb_sentence, params, name='encoded_sentence'):
	"""
	@param emb_sentence:
	@param params:
	@return:
	"""
	conv_num_features = utils.get_dict_value(params, 'conv_num_features', [[100,100,100], [100]])
	conv_widths = utils.get_dict_value(params, 'conv_widths', [[2,3,4],[3]])
	conv_keep_probs = utils.get_dict_value(params, 'conv_keep_probs', 0.5)
	mlp_config = utils.get_dict_value(params, 'mlp_config', [512])
	bipass_conv = utils.get_dict_value(params, 'bipass_conv', False)
	mlp_activations = utils.get_dict_value(params, 'mlp_activations', 'sigmoid')
	mlp_dropout_keep_probs = utils.get_dict_value(params, 'mlp_keep_probs', 0.9)
	use_no_conv_path = utils.get_dict_value(params, 'use_no_conv_path', False)

	weight_wd_regularization = utils.get_dict_value(params, 'weight_wd_regularization', 0.0)
	bias_wd_regularization = utils.get_dict_value(params, 'bias_wd_regularization', 0.0)

	if bipass_conv:
		conv_group = [emb_sentence]
	else:
		if use_no_conv_path:
			conv_group = [emb_sentence]
		else:
			conv_group = []
		for i, (conv_num_feature, conv_width) in enumerate(zip(conv_num_features, conv_widths)):
			conv_out = nlp.conv1d_array(emb_sentence, conv_num_feature, conv_width,name='conv%s'%(str(i)),
										w_wds=weight_wd_regularization,
										b_wds=bias_wd_regularization, keep_probs=conv_keep_probs)
			conv_group.append(conv_out)
	conv_out, _ = misc.concat(conv_group)
	mlp_out, _ = mlp.fully_connected_network(conv_out, mlp_config, layer_activations=mlp_activations, dropout_keep_probs=mlp_dropout_keep_probs)
	return [tf.identity(mlp_out[0], name=name)], {}
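
For reference, a params dict that simply reproduces the defaults read by sentence_encoder (every value below is taken from the get_dict_value calls above):

params = {
    'conv_num_features': [[100, 100, 100], [100]],
    'conv_widths': [[2, 3, 4], [3]],
    'conv_keep_probs': 0.5,
    'mlp_config': [512],
    'mlp_activations': 'sigmoid',
    'mlp_keep_probs': 0.9,
    'bipass_conv': False,
    'use_no_conv_path': False,
    'weight_wd_regularization': 0.0,
    'bias_wd_regularization': 0.0,
}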
Example No. 5
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		self._e = Evaluator.load2(ckpt)
Example No. 6
 def __init__(self, file_list, indexer=None, params=None):
     self._file_list = file_list
     self._cur_list = []
     self._next_file = 0
     self._keywords = [',']
     self._num_before = utils.get_dict_value(params, 'num_words_before', 5)
     self._num_after = utils.get_dict_value(params, 'num_words_after', 5)
     self.load_next_file()
     self._indexer = indexer
     self._current_epoch = 0
     self._current_index = 0
Example No. 7
def inference(params):
    feature_count = utils.get_dict_value(params, 'feature_count')
    mlp_config = utils.get_dict_value(params, 'mlp_config')
    mlp_activations = utils.get_dict_value(params, 'mlp_activations')
    mlp_dropout_keep_probs = utils.get_dict_value(params,
                                                  'mlp_dropout_keep_probs')
    x = tf.placeholder(tf.float32, [None, feature_count], 'features')
    mlp_out, _ = mlp.fully_connected_network(
        [x],
        mlp_config,
        layer_activations=mlp_activations,
        dropout_keep_probs=mlp_dropout_keep_probs)
    return mlp_out, _
Example No. 8
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		self._num_before = utils.get_dict_value(self._params, "num_words_before")
		self._num_after = utils.get_dict_value(self._params, "num_words_after")
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'), 'vocab.pkl')
		self._e = Evaluator.load2(ckpt)
		self._i = TextIndexer.from_file(vocab_file)
		self._keywords = self._params['keywords']
		self._id_to_word = self._params['id_to_keyword']
Example No. 9
def contrastive(network, name='contrastive_loss', params=None):
    """
    Implement contrastive loss as given in LeCunn's paper.
      http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    @param network: input network that contains 3 nodes (left, right, label)
    @return:
    #label, margin, y_weight=1,l2_name=None, l2_norm_name=None, loss_name=None, params=None
    """
    #    assert(isinstance(network, list) and (len(network)==3), 'losses.contrastive: input must contain 3 nodes')
    with tf.variable_scope(name):
        margin = common_utils.get_dict_value(params, 'contrastive_loss_margin',
                                             128)
        l2_name = common_utils.get_dict_value(params, 'l2_name', 'l2')
        l2_norm_name = common_utils.get_dict_value(params, 'l2_norm_name',
                                                   'l2_norm')
        left_feature = network[0]
        right_feature = network[1]
        label = network[2]
        one = tf.constant(1.0, dtype=tf.float32)
        zero = tf.constant(0.0, dtype=tf.float32)
        margin_constant = tf.constant(margin, dtype=tf.float32)
        label_sum = tf.reduce_sum(label)
        # rescale positive labels so their total weight matches the batch size
        # even when positives are rare; leave labels untouched if there are none
        Y = tf.cond(
            label_sum > 0, lambda: tf.multiply(
                tf.divide(tf.cast(tf.size(label), dtype=tf.float32), label_sum),
                label), lambda: label)
        one_minus_y = tf.subtract(one, Y, name='one_minus_y')
        N = tf.constant(1 / float(right_feature.get_shape().as_list()[1]))
        Dw2 = tf.reduce_sum(tf.square(tf.subtract(left_feature, right_feature)),
                            1,
                            name=l2_name)
        # dissimilar pairs are pushed to at least `margin` apart
        right_term = tf.multiply(Y,
                                 tf.square(
                                     tf.maximum(
                                         zero,
                                         tf.subtract(
                                             margin_constant,
                                             tf.multiply(N,
                                                         tf.sqrt(Dw2,
                                                                 name=l2_norm_name))))),
                                 name='right_term')
        # similar pairs are pulled together
        left_term = tf.multiply(one_minus_y,
                                tf.multiply(tf.square(N), Dw2),
                                name='left_term')
        loss = tf.multiply(tf.constant(0.5),
                           tf.reduce_mean(tf.add(right_term, left_term)),
                           name=name)
    return [loss], {}
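
The loss in the cited paper is

L(W, Y, X_1, X_2) = (1 - Y) \cdot \tfrac{1}{2} D_W^2 + Y \cdot \tfrac{1}{2} \max(0,\, m - D_W)^2

where D_W is the distance between the two feature vectors and m is the margin; the snippet above additionally scales D_W by 1/feature_dim (the constant N) and rescales Y by the inverse frequency of positive labels in the batch.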
Example No. 10
    def _adam_optimizer(optimizer_param,
                        loss_nodes,
                        learning_rate,
                        var_lists=None):
        """
		Default optimizer uses adam optimizer

		@param optimizer_param: dict
		@param loss_nodes: list of tensorflow nodes
		@param var_lists: list of list of variables to optimize
		@return:
		a list of tensorflow optimizer nodes.  This list has equal length as loss_nodes.
		"""
        optimizer_nodes = []

        # if var_lists is None, then make it a list of None matching the # of loss nodes
        if var_lists is None:
            var_lists = [None] * len(loss_nodes)

        # just create adam optimizers
        for loss_node, var_list in zip(loss_nodes, var_lists):
            loss = loss_node

            if utils.get_dict_value(optimizer_param,
                                    ENABLE_REGULARIZATION_PARAM_NAME, False):
                reg_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                reg_constant = 1.0  # already have wd, make this parametrizable
                loss += reg_constant * sum(reg_losses)
            min_node = tf.train.AdamOptimizer(learning_rate).minimize(
                loss, var_list=var_list)
            optimizer_nodes.append(min_node)
        return optimizer_nodes
Example No. 11
def _default_train_iteration_done(trainer, epoch, index, iteration_count,
                                  loss_value, training_done, run_results,
                                  params):

    stats = params['stats']
    next_batch_time = np.mean(stats['next_batch_time_list'])
    training_time = np.mean(stats['training_time_list'])
    overhead_time = np.mean(stats['overhead_time_list'])

    if iteration_count == 1:
        trainer._training_log_file = open(
            os.path.join(utils.get_dict_value(params, 'output_location'),
                         'training_log.txt'), 'w')
        trainer._training_log_file.write(
            "%s,%s,%s,%s,%s,%s,%s,%s\n" %
            ('epoch', 'iteration', 'time', 'loss', 'next_batch_time',
             'training_time', 'overhead_time', 'efficiency'))

    msg = ("%02d, %04d, %s, %s, %0.4f, %0.5f, %0.5f, %0.5f" %
           (epoch, iteration_count, time(), loss_value, next_batch_time,
            training_time, overhead_time, training_time /
            sum([next_batch_time, training_time, overhead_time])))
    if "eval_results" in params:
        eval_results = params['eval_results']
        for x in eval_results:
            msg += ", %0.4f" % x

    print('%s' % msg)
    trainer._training_log_file.write('%s\n' % msg)
    trainer._training_log_file.flush()

    if trainer._model_log_db is not None:
        trainer._model_log_db.on_update(epoch, index, iteration_count,
                                        loss_value, msg)
    return False
Example No. 12
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value, training_done, run_results, params):
	if iteration_count == 1:
		trainer._out_file = open(os.path.join(utils.get_dict_value(params,'output_location'), 'training_log.txt'), 'w')

	msg = ("%s, %s"%(time(), loss_value))
	print('%s: %s' % (iteration_count, msg))
	trainer._out_file.write('%s\n'%msg)
	trainer._out_file.flush()
Example No. 13
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if hasattr(trainer, 'last_epoch') and trainer.last_epoch != epoch:
        # lr decay
        lrd_epoch_start = utils.get_dict_value(
            params, 'learning_rate_decay_start_epoch', -1)
        if epoch > lrd_epoch_start:
            lr_decay = utils.get_dict_value(params, 'learning_rate_decay', -1)
            lr = utils.get_dict_value(params, 'learning_rate', 0.001)
            if lr_decay > 0:
                new_lr = lr * (lr_decay**(epoch - lrd_epoch_start))
                print("NEW LEARNING RATE %s" % new_lr)
                trainer.set_learning_rate(new_lr)

    params['eval_results'] = [run_results['tpp']]
    trainer.last_epoch = epoch
    return framework.trainer._default_train_iteration_done(
        trainer, epoch, index, iteration_count, loss_value, training_done,
        run_results, params)
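
A quick numeric check of the decay rule applied above (the values are illustrative, not taken from any params file):

lr, lr_decay, lrd_epoch_start = 0.001, 0.9, 2
for epoch in range(3, 6):
    print(epoch, round(lr * lr_decay ** (epoch - lrd_epoch_start), 6))
# 3 0.0009
# 4 0.00081
# 5 0.000729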
Example No. 14
def main(argv):
    try:
        argv = FLAGS(argv)  # parse flags
    except gflags.FlagsError as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)
    print(FLAGS.paramsfile)
    params = utils.load_param_file(FLAGS.paramsfile)
    data = load_results(params)
    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[8])

    ax.set(xlabel='Million Records Seen',
           ylabel='Accuracy @ 1',
           title=params['model_name'])
    ax.grid()

    max_value = np.max(data[8])
    #	plt.ylim((.75,math.ceil(max_value*10)/10))
    #plt.ylim((.75,1))
    fig.savefig(
        os.path.join(utils.get_dict_value(params, 'output_location'),
                     "accuracy.png"))
    plt.show(block=False)

    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[3])

    ax.set(xlabel='Million Records Seen',
           ylabel='Loss',
           title=params['model_name'])
    ax.grid()

    min_value = np.min(data[3])
    #	plt.ylim((math.floor(min_value*10)/10,1))
    #plt.ylim((.75,1))
    fig.savefig(
        os.path.join(utils.get_dict_value(params, 'output_location'),
                     "loss.png"))
    plt.show(block=False)

    input("Press enter to exit...")
Example No. 15
def gen_data(dataobj,
             tokens,
             keywords,
             num_before=5,
             num_after=5,
             pad_tok="<pad>",
             null_sample_factor=0,
             add_redundant_keyword_data=True,
             use_negative_only_data=True,
             ignore_negative_data=False,
             add_keyword_removal_data=False):
    dataobj._mean = np.mean(dataobj._y_count)
    dataobj._std = np.std(dataobj._y_count)
    dataobj._max = np.max(dataobj._y_count)
    dataobj._min = np.min(dataobj._y_count)
    params = dataobj._params
    sampling = utils.get_dict_value(params, 'data_sampling', 'freeform')

    tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    class_offset = 1
    if ignore_negative_data:
        class_offset = 0
    results = []
    unk_list = []
    no_insert_list = []
    for toki in range(num_before, len(tokens) - num_before - 4):
        tok0 = tokens[toki]
        if tok0 in keywords:
            ki = keywords[tok0]
            if (sampling == 'freeform'
                    or (sampling == 'uniform'
                        and dataobj._y_count[ki + class_offset] < dataobj._min + 5)):
                results.append(
                    (tokens[(toki - num_before):toki]
                     + tokens[(toki + 1):(toki + num_after + 1)],
                     ki + class_offset))


#		else:
# add unk
#			if 'unk' in keywords:
#				ki = keywords.index('unk')
#				unk_list.append((tokens[(toki-num_before):toki]+tokens[(toki+1):(toki+num_after+1)], ki + class_offset))
#		no_insert_list.append((tokens[(toki-num_before):toki]+tokens[(toki):(toki+num_after)], 0))
#	num_to_add = min([int(len(results)/len(keywords)),len(no_insert_list),len(unk_list)])
#	if num_to_add == 0 and len(results)>0:
#		num_to_add = 1
#	random.shuffle(no_insert_list)
#	random.shuffle(unk_list)
#	if num_to_add > 0:
#		results += no_insert_list[:num_to_add]
#		results += unk_list[:num_to_add]
    return results
Example No. 16
def load_results(params):
    traininglog_file = os.path.join(
        utils.get_dict_value(params, 'output_location'), 'training_log.txt')
    data = []
    with open(traininglog_file, 'r') as f:
        csvdata = csv.reader(f, delimiter=',')
        for row in csvdata:
            for i, col in enumerate(row):
                if i >= len(data):
                    data.append([])
                data[i].append(float(col))
    return data
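
An illustrative use, assuming params points at a finished run whose log rows are all numeric: data[i] is the i-th column of training_log.txt as floats, so with the column layout read by the plotting script above, data[1] is the iteration count and data[3] is the loss:

data = load_results(params)
print('%d columns, %d rows' % (len(data), len(data[0])))
print('final loss: %s' % data[3][-1])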
Example No. 17
def inference(params):
    embedding_size = params['embedding_size']
    sentence_len = params['num_before'] + params['num_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd')
    embedding_device = utils.get_dict_value(params, 'embedding_device')
    embedding_initializer = utils.get_dict_value(params,
                                                 'embedding_initializer')
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob')
    word_embedding_size = utils.get_dict_value(params, 'word_embedding_size',
                                               embedding_size)
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    if embedding_device is not None:
        with tf.device(embedding_device):
            word_embedding_matrix = nlp.variable_with_weight_decay(
                'word_embedding_matrix', [vocab_size, word_embedding_size],
                initializer=embedding_initializer,
                wd=embedding_wd)
    else:
        word_embedding_matrix = nlp.variable_with_weight_decay(
            'word_embedding_matrix', [vocab_size, word_embedding_size],
            initializer=embedding_initializer,
            wd=embedding_wd)

    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    emb_sentence = tf.nn.embedding_lookup(word_embedding_matrix,
                                          input_sentence, name='emb_word')
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [emb_sentence], _ = core.dropout([emb_sentence], [embedding_keep_prob])
    enc_sentence, _ = encoder(emb_sentence, params)

    return enc_sentence, None
Example No. 18
 def __init__(self,
              params=None,
              files=[['features_000.npy', 'scores_000.npy'],
                     ['features_001.npy', 'scores_001.npy'],
                     ['features_002.npy', 'scores_002.npy'],
                     ['features_003.npy', 'scores_003.npy']]):
     self._files = files
     self._current_file = 0
     self._current_index = 0
     self._current_epoch = 0
     self._num_records_seen = 0
     self._separate_epochs = True
     self._num_minibatches = 0
     self._data_dir = utils.get_dict_value(params, 'data_dir')
     self.load_next_file()
Example No. 19
def generate_model_input_sentences(tokens, params):
    pad_tok = '<pad>'
    num_before = params['num_words_before']
    num_after = params['num_words_after']
    start_token = utils.get_dict_value(params, 'start_token')
    if start_token is not None and len(start_token) > 0:
        tokens = [pad_tok] * (num_before - 1) + [
            start_token
        ] + tokens + [pad_tok] * (num_after + 5)
    else:
        tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    result = []
    for toki in range(num_before, len(tokens) - num_before - 5):
        result.append(tokens[toki - num_before:toki] +
                      tokens[toki:toki + num_after])
    return result
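
An illustrative call (window sizes chosen arbitrarily); each entry of the result is num_words_before tokens of left context followed by the window that starts at the current token:

params = {'num_words_before': 2, 'num_words_after': 2, 'start_token': None}
windows = generate_model_input_sentences('the cat sat on the mat'.split(), params)
print(windows[0])  # ['<pad>', '<pad>', 'the', 'cat']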
Example No. 20
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		self._num_before = utils.get_dict_value(self._params, "num_words_before")
		self._num_after = utils.get_dict_value(self._params, "num_words_after")
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'), 'vocab.pkl')
		self._e = Evaluator.load2(ckpt)
		self._i = TextIndexer.from_file(vocab_file)
		with open(os.path.join(
				utils.get_dict_value(self._params, 'output_location'),
				'keywords.pkl'), 'rb') as f:
			keywords = pickle.load(f)
		self._params['keywords'] = keywords
		self._keywords = self._params['keywords']
		self._keyword_map, self._keyword_list = gen_keywords(self._params)
Example No. 21
 def __init__(self,
              tellme_datadir='/mnt/work/tellme/data',
              datafiles=['trn1.npy', 'trn2.npy'],
              params={}):
     self._tcid_count = 2
     ticid_mapfile = os.path.join(tellme_datadir, "tcid.map")
     with open(ticid_mapfile, "r") as f:
         for line in f:
             value = line.strip("\r\n").split("\t")
             if (int(value[1]) > 1):
                 self._tcid_count += 1
         self._tcid_count += 1
     self._data_chunks = []
     print("self._tcid_count = %s" % self._tcid_count)
     print("Loading data...")
     self._separate_epochs = utils.get_dict_value(params, "separate_epochs",
                                                  False)
     self._tcids_data = np.load(os.path.join(tellme_datadir, datafiles[0]))
     self._timing_info_data = np.load(
         os.path.join(tellme_datadir, datafiles[1]))
     print("done loading data!")
     self._current_epoch = 0
     self._current_index = 0
     self._num_minibatches = 0
Example No. 22
def eval(params,
         save_accuracy_file=True,
         batch_size=5000,
         num_batches=20,
         topn=1,
         verbose=True):
    ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                        utils.get_dict_value(params, 'model_name') + '.ckpt')
    accuracy_file = os.path.join(
        utils.get_dict_value(params, 'output_location'), 'accuracy.txt')
    e = Evaluator.load2(ckpt)
    if verbose:
        e.dump_variable_sizes()
    training_data = TellmeData(tellme_datadir='/mnt/work/tellme/data',
                               datafiles=['tst1.npy', 'tst2.npy'])
    correct_list = []
    incorrect_list = []
    dt_list = []
    #	num_test_records = 0000 #22596471
    for i in range(num_batches):
        batch = training_data.next_batch(batch_size=batch_size)
        batch_y = batch['y']
        del batch['y']
        bef = time()
        [r] = e.eval(batch, {'sm_decision'})
        aft = time()
        dt_list.append(aft - bef)
        ccorrect = 0
        cincorrect = 0
        for model, gt in zip(r, batch_y):
            topn_idx = np.argpartition(model, -topn)[-topn:]
            #model_predict = np.argmax(model)
            if gt in topn_idx:  #model_predict == int(gt):
                ccorrect += 1
            else:
                cincorrect += 1
        correct_list.append(ccorrect)
        incorrect_list.append(cincorrect)
        if verbose:
            print('accuracy = %0.4f' % (ccorrect / (ccorrect + cincorrect)))
    accuracy_list = [
        c / (c + ic) for c, ic in zip(correct_list, incorrect_list)
    ]
    correct = np.sum(correct_list)
    incorrect = np.sum(incorrect_list)
    total_accuracy = np.mean(
        accuracy_list)  #(correct / (correct + incorrect));
    accuracy_std = np.std(accuracy_list)
    accuracy_sem = accuracy_std / np.sqrt(len(correct_list))
    if save_accuracy_file:
        f = open(accuracy_file, 'a')
        f.write('%s %s\n' % (time(), total_accuracy))
        f.close()
    if verbose:
        print('accuracy = %0.4f +/- %0.4f (std=%0.4f)' %
              (total_accuracy, accuracy_sem, accuracy_std))
    dt_mean = np.mean(dt_list)
    dt_std = np.std(dt_list)
    if verbose:
        print("dt_mean = %0.4f dt_std = %0.4f" % (dt_mean, dt_std))
    return total_accuracy, accuracy_sem, accuracy_std
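
How the top-n check above works on a toy score vector: np.argpartition places the indices of the topn largest scores in the last topn slots, in no particular order:

import numpy as np
scores = np.array([0.1, 0.5, 0.2, 0.9, 0.05])
topn = 2
print(np.argpartition(scores, -topn)[-topn:])  # indices 1 and 3, in either order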
Example No. 23
# TextIndexer and ClassifierData are used below; import paths follow the companion training script
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
import data
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import pickle
import model
import os
import shutil
import copy
import numpy as np

param_file = 'params.py'
params = utils.load_param_file(param_file)
params['num_classes'] = len(params['keywords'])+1
indexer = TextIndexer.from_txt_file(
	utils.get_dict_value(params, 'vocab_file'),
	max_size=utils.get_dict_value(params, 'max_vocab_size', -1))
indexer.add_token('<pad>')
indexer.add_token('unk')
output_indexer = copy.deepcopy(indexer)
output_indexer.add_token('<blank>')
os.makedirs(utils.get_dict_value(params,'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(os.path.join(utils.get_dict_value(params,'output_location'), 'vocab.pkl'))

files_to_copy = [param_file]
for file in files_to_copy:
	shutil.copyfile(file,os.path.join(utils.get_dict_value(params,'output_location'), file))

params['vocab_size'] = indexer.vocab_size()

if 'training_data_dir' in params:
	training_data = ClassifierData.get_training_data(base_dir=params['training_data_dir'], indexer=indexer, params=params,
Example No. 24
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
if not utils.get_dict_value(params, 'ignore_negative_data', False):
    params['num_classes'] = len(params['keywords']) + 1
else:
    params['num_classes'] = len(params['keywords'])
indexer = TextIndexer.from_txt_file(utils.get_dict_value(params, 'vocab_file'))
indexer.add_token('<pad>')
indexer.add_token('unk')
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(
    os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl'))
shutil.copyfile(
    param_file,
    os.path.join(utils.get_dict_value(params, 'output_location'), param_file))

params['vocab_size'] = indexer.vocab_size()
training_data = ClassifierData.get_monolingual_training(
    base_dir=params['monolingual_dir'], indexer=indexer, params=params)

Example No. 25
def eval(params,
				 save_accuracy_file=True,
				 batch_size=5000,
				 num_batches=20,
				 topn=1,
				 verbose=True):
	num_before = utils.get_dict_value(params, "num_words_before")
	num_after = utils.get_dict_value(params, "num_words_after")
	ckpt = os.path.join(utils.get_dict_value(params,'output_location'),
											utils.get_dict_value(params, 'model_name') + '.ckpt')
	accuracy_file = os.path.join(utils.get_dict_value(params,'output_location'),
											'accuracy.txt')
	vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl')
	keywords_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'keywords.pkl')
	e = Evaluator.load2(ckpt)
	i = TextIndexer.from_file(vocab_file)
	#test_sentence = "<S> ___ quick brown fox jumped over the lazy dog"
	test_sentence = "<S> ___ is no way to know whether it will work"
	#test_sentence = "<S> ___ house is on fire"
#	test_sentence = "<S> ___ in your best interest to lie"
#	test_sentence = "<S> ___ yours and I cannot touch it"
	#test_sentence = "<S> I ate a ___ and an apple"
	#test_sentence = "<S> I have to take ___ life away"
#	test_sentence = "<S> ___ may and it is raining"
	#test_sentence = "<S> This will take ___ before it will actually work"
	#test_sentence = "<S> this is probably bigger ___ that"
#	test_sentence = "<S> ___ is no place like home"
	#test_sentence = "I have ___ of money"
	#test_sentence = "<S> I think I ___ have it"
	test_sentence = "<S> don 't forget to get orange , banana , and ___ ."
#	test_sentence = "<S> in the heat ___ the night"
#	test_sentence = "<S> in the river , ___ the boat"
#	test_sentence = "<S> nothing can be ___ from the truth"
#	test_sentence = "<S> the ___ knot will unwind"
#	test_sentence = "<S> if you keep playing, you will ___ ."
	test_sentence = "<s> I ate a ___ of oranges ."
#	test_sentence = "<s> I ate a ___ and oranges ."
#	test_sentence = "<s> I live in a ___ ."
#	test_sentence = "<s> I ate a ___ of oranges ."
	test_sentence = "<s> I ate a ___ and oranges ."
	test_sentence = "<s> I live in a ___ ."
	test_sentence = "<s> I have seen it on him , and can ___ to it ."
	test_sentence = "<s> the thieves ___ the library and got very little for their pains ."

	# input data
	with open('/mnt/work/NeuralRewriting/eval/small_eval_data.json') as f:
		data = json.load(f)
	with open(keywords_file, 'rb') as f:
		k = pickle.load(f)

	unk_list = []
	for q in data:
		query_word = q['query_word']
		orig_sent = q['orig_sent']
		options = q['options']
		orig_sent = orig_sent.replace(query_word, "___")
		orig_sent = "<s> " + orig_sent
		test_sentence = orig_sent.lower()
		split_sentence = list(split_sentence_for_eval(test_sentence.split(), ["___"], num_before, num_after))
#		print(split_sentence[0][0])
		_, sentence, _, _ = i.index_wordlist(split_sentence[0][0])
		bef = time()
		r = e.eval({'sentence': [sentence]}, {'sm_decision'})
		aft = time()
		sm = r[0][0]

		for o in options:
			synonym = o['synonym']
			if synonym not in k:
				score = -1000
				unk_list += [synonym]
			else:
				score = math.log(sm[k.index(synonym)])
			o['clmtV1'] = score
			print(score)

	# save output
	with open('/mnt/work/NeuralRewriting/eval/small_eval_data_out.json','w') as f:
		json.dump(data,f)
Example No. 26
#    params['logfile'].write(msg)
#    params['logfile'].write('\n')


def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if iteration_count == 1:
        trainer._out_file = open('output.txt', 'w')

    msg = ("%s, %s" % (time(), loss_value))
    print('%s: %s' % (iteration_count, msg))
    trainer._out_file.write('%s\n' % msg)


trainer = Trainer(inference=model.inference,
                  batch_size=utils.get_dict_value(params, 'batch_size', 128),
                  loss=losses.softmax_xentropy,
                  model_output_location="./output",
                  name=MODEL_NAME,
                  training_data=training_data,
                  train_iteration_done=train_iteration_done,
                  params=params)

trainer.run(restore_latest_ckpt=False,
            save_network=True,
            save_ckpt=True,
            mini_batches_between_checkpoint=utils.get_dict_value(
                params, 'mini_batches_between_checkpoint', 1000),
            additional_nodes_to_evaluate=['encoded_sentence'],
            on_checkpoint_saved=on_checkpoint_saved)
Example No. 27
 def get_model_name(self):
     return utils.get_dict_value(self._params, 'model_name',
                                 '_UNKNOWN_MODEL_')
Example No. 28
# Evaluator and TextIndexer are used below; import paths follow the companion eval script
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import word_classifier.data as data
import os
import urllib.parse
from time import time
from urllib.parse import urlparse
from http.server import BaseHTTPRequestHandler, HTTPServer
run_server = True


paramsfile = "params.py"
data_base_dir = ""
http_port = 8080
params = utils.load_param_file(paramsfile)

vocab_file = os.path.join(utils.get_dict_value(params,'output_location'), 'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params,'output_location'),
										utils.get_dict_value(params, 'model_name') + '.ckpt')
print(ckpt)
e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)

num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')


def split_sentence_for_eval(sentence, keywords, num_before, num_after):
	result = data.gen_data(sentence, keywords, num_before=num_before,
	                       num_after=num_after, ignore_negative_data=True,
	                       add_redundant_keyword_data=False)
	return result
Example No. 29
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import os
import numpy as np
run_server = False

params = utils.load_param_file('params.py')

vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                          'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                    utils.get_dict_value(params, 'model_name') + '.ckpt')

sentences = ['The apple , which is rotten is not edible']

e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)

num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')

sentence = "In simple terms , high precision means that an algorithm " \
   "returned substantially more relevant results than irrelevant ones , while" \
   " high recall means that an algorithm returned most of the relevant results ."

sentence = "<S> In simple terms , high precision means that algorithm " \
   "returned substantially more relevant results than irrelevant ones , while" \
   " high recall means that algorithm returned most of relevant results ."
#sentence = "<S> Precision can be seen as measure of exactness or quality , "\
Example No. 30
from framework.utils.data.text_indexer import TextIndexer
from tellme.data import TellmeData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer
from eval import eval
from time import time
import numpy as np
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
shutil.copyfile(
    param_file,
    os.path.join(utils.get_dict_value(params, 'output_location'), param_file))

training_data = TellmeData()
params['vocab_size'] = training_data.get_tcid_count()
params['num_classes'] = training_data.get_tcid_count()


def on_checkpoint_saved(trainer, params, save_path):
    msg = 'saved checkpoint: ' + save_path
    print(msg)
    accuracy, accuracy_sem, accuracy_std = eval(params)
    params['eval_results'] = [accuracy, accuracy_sem, accuracy_std]