def predict_by_model(preprocess_config, columns_sequences_count_dict1,
                     columns_sequences_count_dict2, vocabulary, model):
    data_path = os.path.join(root_dir, './data/dataset/train_data_part1_test.pickle')
    data = pickle.load(open(data_path, 'rb'))
    sequences_count1 = model.sequences_count1
    sequences_count2 = model.sequences_count2
    sequence_length1 = model.sequence_length1
    sequence_length2 = model.sequence_length2
    logging.info("Start loading testing file from {}".format(data_path))
    data, features_columns, short_features_columns = processor.process_v2(
        data, config=preprocess_config, is_dict=False)
    columns_str_features_dict1 = load_data.process_predict_str_features(
        data[features_columns], columns_sequences_count_dict1, sequence_length1)
    columns_str_features_dict2 = load_data.process_predict_str_features(
        data[short_features_columns], columns_sequences_count_dict2, sequence_length2)
    logging.info("Test data sentence segmentation is complete")
    merged_str_features_list1 = load_data.merge_str_features(
        features_columns, len(data), columns_str_features_dict1,
        sequences_count1, sequence_length1)
    merged_str_features_list2 = load_data.merge_str_features(
        short_features_columns, len(data), columns_str_features_dict2,
        sequences_count2, sequence_length2)
    # Build a trie over the vocabulary for subword lookup
    vocab_tree = trie_tree.Trie()
    for word in vocabulary:
        vocab_tree.insert(word)
    input_x1, input_x1_ratio = load_data.build_str_features_data(
        merged_str_features_list1, vocabulary, vocab_tree)
    input_x2, input_x2_ratio = load_data.build_str_features_data(
        merged_str_features_list2, vocabulary, vocab_tree)
    input_y = np.array(data[preprocess_config.result].fillna(0.0), dtype=np.float32)
    logging.info("Test data preprocessing is complete")
    batches = load_data.batch_iter(
        list(zip(input_x1, input_x2, input_x1_ratio, input_x2_ratio, input_y)),
        model.batch_size, num_epochs=1, shuffle=False)
    total_loss = 0.0
    batch_count = 0
    for batch in batches:
        x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch, y_batch = zip(*batch)
        total_loss += model.dev_step(x_1_batch, x_2_batch, x_1_ratio_batch,
                                     x_2_ratio_batch, y_batch)
        batch_count += 1
    logging.info("Test loss is {}".format(total_loss / batch_count))
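# The vocabulary lookup above relies on trie_tree.Trie, which is only required to
# expose insert() here (build_str_features_data presumably also performs some
# prefix/word lookup on it). A minimal sketch of such a trie follows, assuming a
# simple insert/search/starts_with interface; the real trie_tree module may differ.
class Trie:
    """Minimal prefix tree; a sketch of the interface assumed by the code above."""

    def __init__(self):
        self.children = {}   # char -> Trie
        self.is_word = False

    def insert(self, word):
        node = self
        for ch in word:
            node = node.children.setdefault(ch, Trie())
        node.is_word = True

    def search(self, word):
        """Return True if the exact word was inserted."""
        node = self._walk(word)
        return node is not None and node.is_word

    def starts_with(self, prefix):
        """Return True if any inserted word starts with the given prefix."""
        return self._walk(prefix) is not None

    def _walk(self, chars):
        node = self
        for ch in chars:
            node = node.children.get(ch)
            if node is None:
                return None
        return node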
def predict(data, trained_dir, output='features'):
    params, preprocess_config, columns_sequences_count_dict1, columns_sequences_count_dict2, \
        vocabulary, word_embedding_mat = load_trained_params(trained_dir)
    sequences_count1 = params['sequences_count1']
    sequences_count2 = params['sequences_count2']
    sequence_length1 = params['sequence_length1']
    sequence_length2 = params['sequence_length2']
    data, features_columns, short_features_columns = processor.process_v2(
        data, config=preprocess_config, is_dict=True)
    columns_str_features_dict1 = load_data.process_predict_str_features(
        data[features_columns], columns_sequences_count_dict1, sequence_length1)
    columns_str_features_dict2 = load_data.process_predict_str_features(
        data[short_features_columns], columns_sequences_count_dict2, sequence_length2)
    logging.info("Sentence segmentation is complete")
    merged_str_features_list1 = load_data.merge_str_features(
        features_columns, len(data), columns_str_features_dict1,
        sequences_count1, sequence_length1)
    merged_str_features_list2 = load_data.merge_str_features(
        short_features_columns, len(data), columns_str_features_dict2,
        sequences_count2, sequence_length2)
    conv_filter_sizes1 = list(map(int, params['conv_filter_sizes1'].split(',')))
    conv_filter_sizes2 = list(map(int, params['conv_filter_sizes2'].split(',')))
    # Build a trie over the vocabulary for subword lookup
    vocab_tree = trie_tree.Trie()
    for word in vocabulary:
        vocab_tree.insert(word)
    input_x1, input_x1_ratio = load_data.build_str_features_data(
        merged_str_features_list1, vocabulary, vocab_tree)
    input_x2, input_x2_ratio = load_data.build_str_features_data(
        merged_str_features_list2, vocabulary, vocab_tree)
    logging.info("Preprocessing is complete")
    model = DoubleMultiCnnModel(
        batch_size=params['batch_size'],
        sequences_count=(sequences_count1, sequences_count2),
        sequence_length=(sequence_length1, sequence_length2),
        word_embedding_size=params['word_embedding_size'],
        word_embedding_mat=word_embedding_mat,
        vocabulary_size=len(vocabulary),
        conv_filter_sizes=(conv_filter_sizes1, conv_filter_sizes2),
        num_filters=(params['num_filters1'], params['num_filters2']),
        hidden_size=(params['hidden_size1'], params['hidden_size2']),
        output_size=len(preprocess_config['result']),
        decay_steps=params['decay_steps'],
        learning_rate=params['learning_rate'],
        clip_gradients=params['clip_gradients'],
        l2_lambda=params['l2_lambda'],
        is_training=False,
        subword_length=params['subword_length'])
    checkpoint_file = trained_dir + 'best_model.ckpt'
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(model.sess, checkpoint_file)
    logging.critical('{} has been loaded'.format(checkpoint_file))
    total_size = np.ceil(len(input_x1) / params['batch_size'])
    batches = load_data.batch_iter(
        list(zip(input_x1, input_x2, input_x1_ratio, input_x2_ratio)),
        params['batch_size'], num_epochs=1, shuffle=False)
    batch_count = 0
    output_data = []
    if output == 'result':
        predict_function = model.predict_step
    else:
        predict_function = model.predict_output_features_step
    for batch in batches:
        x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch = zip(*batch)
        output_data_batch = predict_function(x_1_batch, x_2_batch,
                                             x_1_ratio_batch, x_2_ratio_batch)
        output_data.append(output_data_batch)
        batch_count += 1
        if batch_count % 20 == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            logging.info('Progress at : {}% data'.format(progress))
    output_data = np.vstack(output_data)
    return output_data, params, preprocess_config, features_columns, short_features_columns, \
        columns_sequences_count_dict1, columns_sequences_count_dict2
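# A hedged usage sketch for this prediction entry point. The input columns and the
# trained-result directory below are hypothetical; the real column set is defined by
# the preprocess.json written during training, and the directory comes from an
# earlier training run.
import pandas as pd

raw = pd.DataFrame({'title': ['an example record'],
                    'description': ['free-form text']})  # hypothetical columns

(features, params, preprocess_config, features_columns, short_features_columns,
 counts1, counts2) = predict(raw, './trained_result/trained_results_1550000000/',
                             output='features')
print(features.shape)  # one feature vector per input row

# Passing output='result' returns the model's predictions instead of the
# intermediate feature layer.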
def train(data_path, pretrain_path=None):
    # Build the training data
    data_x, data_y, word_embedding, vocabulary, vocabulary_inv, \
        sequences_count, sequence_length, columns_sequences_count_dict \
        = load_data(data_path,
                    word_embedding_size=FLAGS.word_embedding_size,
                    max_count=FLAGS.sequences_count,
                    max_length=FLAGS.sequence_length,
                    shuffle=True,
                    pretrain_embedding_path=pretrain_path)
    # Word embedding matrix
    word_embedding_mat = [word_embedding[word] for word in vocabulary_inv]
    word_embedding_mat = np.array(word_embedding_mat, dtype=np.float32)
    # Split the original dataset into a train set and a test set
    data_x, data_x_test, data_y, data_y_test = train_test_split(data_x, data_y, test_size=0.1)
    # Split the train set into a train set and a dev set
    data_x_train, data_x_dev, data_y_train, data_y_dev = train_test_split(
        data_x, data_y, test_size=0.1)
    logging.info('data_train: {}, data_dev: {}, data_test: {}'.format(
        len(data_x_train), len(data_x_dev), len(data_x_test)))

    timestamp = str(int(time.time()))
    trained_dir = './trained_result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    logging.critical('The trained result is saved in {}'.format(trained_dir))

    params = dict()
    params['learning_rate'] = FLAGS.learning_rate
    params['batch_size'] = FLAGS.batch_size
    params['decay_steps'] = FLAGS.decay_steps
    params['decay_rate'] = FLAGS.decay_rate
    params['num_epochs'] = FLAGS.num_epochs
    params['max_count'] = FLAGS.sequences_count
    params['sequences_count'] = sequences_count
    params['sequence_length'] = sequence_length
    params['word_embedding_size'] = FLAGS.word_embedding_size
    params['conv_filter_sizes'] = FLAGS.conv_filter_sizes
    params['num_filters'] = FLAGS.num_filters
    params['hidden_size'] = FLAGS.hidden_size
    params['dropout_keep_prob'] = FLAGS.dropout_keep_prob
    params['l2_lambda'] = FLAGS.l2_lambda
    params['clip_gradients'] = FLAGS.clip_gradients
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
    with open(trained_dir + 'preprocess.json', 'w') as outfile:
        json.dump(preprocess_yaml, outfile, indent=4, sort_keys=True, ensure_ascii=False)
    with open(trained_dir + 'columns_sequences_count.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict, outfile, indent=4, ensure_ascii=False)

    # Initialize the model
    model = MultiCnnModel(
        # batch size
        batch_size=FLAGS.batch_size,
        # number of sequences
        sequences_count=sequences_count,
        # sequence length
        sequence_length=sequence_length,
        # word embedding dimension
        word_embedding_size=FLAGS.word_embedding_size,
        # word embedding matrix
        word_embedding_mat=word_embedding_mat,
        # vocabulary size
        vocabulary_size=len(vocabulary),
        # convolution filter sizes
        conv_filter_sizes=list(map(int, FLAGS.conv_filter_sizes.split(','))),
        # number of convolution output channels
        num_filters=FLAGS.num_filters,
        # dense layer size
        hidden_size=FLAGS.hidden_size,
        # output size
        output_size=data_y.shape[1],
        # dropout keep probability
        dropout_keep_prob=FLAGS.dropout_keep_prob,
        # whether the model is training
        is_training=FLAGS.is_training,
        # learning rate decay steps
        decay_steps=FLAGS.decay_steps,
        # learning rate decay rate
        decay_rate=FLAGS.decay_rate,
        # learning rate
        learning_rate=FLAGS.learning_rate,
        # gradient clipping
        clip_gradients=FLAGS.clip_gradients,
        # L2 regularization weight
        l2_lambda=FLAGS.l2_lambda,
    )

    # summary_dir = SUMMARY_PATH + 'summary_' + timestamp + '/'
    # if os.path.exists(summary_dir):
    #     shutil.rmtree(summary_dir)
    # os.makedirs(summary_dir)
    # logging.critical('The summary information is saved in {}'.format(summary_dir))
    # train_writer = tf.summary.FileWriter(summary_dir + 'train', model.sess.graph)

    if not os.path.exists(CHECKPOINT_PATH):
        os.makedirs(CHECKPOINT_PATH)
    checkpoint_dir = CHECKPOINT_PATH + 'check_points_' + timestamp + '/'
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

    init = tf.global_variables_initializer()
    model.sess.run(init)
    saver = tf.train.Saver(tf.global_variables())

    # Training starts here
    total_size = np.ceil(len(data_x_train) / FLAGS.batch_size) * FLAGS.num_epochs
    best_loss = 999
    best_step = 0
    batch_count = 0
    train_batches = batch_iter(list(zip(data_x_train, data_y_train)),
                               FLAGS.batch_size, FLAGS.num_epochs)
    for train_batch in train_batches:
        batch_count += 1
        x_train_batch, y_train_batch = zip(*train_batch)
        train_loss = model.train_step(x_train_batch, y_train_batch)
        current_step = tf.train.global_step(model.sess, model.global_step)
        # Evaluate the model
        if current_step % FLAGS.validate_step == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            dev_batches = batch_iter(list(zip(data_x_dev, data_y_dev)), FLAGS.batch_size, 1)
            dev_loss, dev_batch_num = 0.0, 0
            for dev_batch in dev_batches:
                dev_batch_num += 1
                x_dev_batch, y_dev_batch = zip(*dev_batch)
                dev_loss += model.dev_step(x_dev_batch, y_dev_batch)
            dev_loss /= dev_batch_num
            logging.info('Progress at : {}% examples, Train Loss is {}, Dev Loss is {}'.format(
                progress, train_loss, dev_loss))
            if dev_loss < best_loss:
                best_loss, best_step = dev_loss, current_step
                path = saver.save(model.sess, checkpoint_prefix, global_step=current_step)
                logging.critical('Best loss {} at step {}'.format(best_loss, best_step))
        # If the current step exceeds the maximum step, stop training
        if current_step >= FLAGS.max_step:
            break

    logging.critical('Training is complete, testing the best model on x_test and y_test')
    # close summary writer
    # train_writer.close()

    # Evaluate on x_test and y_test
    saver.restore(model.sess, checkpoint_prefix + '-' + str(best_step))
    # Save the model files to trained_dir
    saver.save(model.sess, trained_dir + 'best_model.ckpt')
    test_batches = batch_iter(list(zip(data_x_test, data_y_test)), FLAGS.batch_size, 1)
    test_loss, test_batch_num = 0.0, 0
    for test_batch in test_batches:
        test_batch_num += 1
        x_test_batch, y_test_batch = zip(*test_batch)
        test_loss += model.dev_step(x_test_batch, y_test_batch)
    test_loss /= test_batch_num
    logging.critical('Loss on test set is {}'.format(test_loss))

    with open(trained_dir + 'vocabulary.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    word_embedding_mat = model.sess.run(model.word_embedding_mat)
    with open(trained_dir + 'word_embedding.pickle', 'wb') as outfile:
        pickle.dump(word_embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
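# batch_iter is used throughout but defined in the data-loading module. The sketch
# below captures the behavior the training loops above assume: yield lists of up to
# batch_size examples for num_epochs passes over the data, shuffling each epoch when
# requested. The project's own implementation may differ in details such as
# partial-batch handling.
import random

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield successive batches over the data for the given number of epochs (sketch)."""
    data = list(data)
    for _ in range(num_epochs):
        epoch_data = data[:]
        if shuffle:
            random.shuffle(epoch_data)
        for start in range(0, len(epoch_data), batch_size):
            yield epoch_data[start:start + batch_size]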
def predict(data, trained_dir, output='features'):
    params, preprocess_config, columns_sequences_count_dict, vocabulary, word_embedding_mat = \
        load_trained_params(trained_dir)
    sequences_count = params['sequences_count']
    sequence_length = params['sequence_length']
    data, features_columns = processor.process(data, config=preprocess_config, is_dict=True)
    columns_str_features_dict = load_data.process_predict_str_features(
        data[features_columns], columns_sequences_count_dict, sequence_length)
    merged_str_features_list = load_data.merge_str_features(
        features_columns, len(data), columns_str_features_dict,
        sequences_count, sequence_length)
    input_x = load_data.build_str_features_data(merged_str_features_list, vocabulary)
    model = MultiCnnModel(
        batch_size=params['batch_size'],
        sequences_count=sequences_count,
        sequence_length=sequence_length,
        word_embedding_size=params['word_embedding_size'],
        word_embedding_mat=word_embedding_mat,
        vocabulary_size=len(vocabulary),
        conv_filter_sizes=list(map(int, params['conv_filter_sizes'].split(','))),
        num_filters=params['num_filters'],
        hidden_size=params['hidden_size'],
        output_size=len(preprocess_config['result']),
        dropout_keep_prob=1.0,
        is_training=False,
        decay_steps=params['decay_steps'],
        learning_rate=params['learning_rate'],
        clip_gradients=params['clip_gradients'],
        l2_lambda=params['l2_lambda'],
    )
    checkpoint_file = trained_dir + 'best_model.ckpt'
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(model.sess, checkpoint_file)
    logging.critical('{} has been loaded'.format(checkpoint_file))
    total_size = np.ceil(len(input_x) / params['batch_size'])
    batches = load_data.batch_iter(input_x, params['batch_size'], num_epochs=1, shuffle=False)
    batch_count = 0
    output_data = []
    if output == 'result':
        predict_function = model.predict_step
    else:
        predict_function = model.predict_output_features_step
    for x_batch in batches:
        output_data_batch = predict_function(x_batch)
        output_data.append(output_data_batch)
        batch_count += 1
        if batch_count % 20 == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            logging.info('Progress at : {}% data'.format(progress))
    output_data = np.vstack(output_data)
    return output_data, params, preprocess_config, features_columns, columns_sequences_count_dict
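# load_trained_params is not shown in this excerpt. The sketch below reconstructs a
# plausible single-model variant purely from the artifacts the train() function above
# writes into trained_dir and from the five values this predict() unpacks; the real
# loader may differ.
import json
import os
import pickle

def load_trained_params(trained_dir):
    """Load the artifacts persisted by train() (sketch of the assumed loader)."""
    with open(os.path.join(trained_dir, 'trained_parameters.json')) as f:
        params = json.load(f)
    with open(os.path.join(trained_dir, 'preprocess.json')) as f:
        preprocess_config = json.load(f)
    with open(os.path.join(trained_dir, 'columns_sequences_count.json')) as f:
        columns_sequences_count_dict = json.load(f)
    with open(os.path.join(trained_dir, 'vocabulary.json')) as f:
        vocabulary = json.load(f)
    with open(os.path.join(trained_dir, 'word_embedding.pickle'), 'rb') as f:
        word_embedding_mat = pickle.load(f)
    return params, preprocess_config, columns_sequences_count_dict, vocabulary, word_embedding_mat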
def train(data_path, pretrain_embedding_path=None, pretrain_model_path=None):
    # Build the training data
    data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, \
        word_embedding, vocabulary, vocabulary_inv, \
        sequences_count1, sequences_count2, sequence_length1, sequence_length2, \
        columns_sequences_count_dict1, columns_sequences_count_dict2 \
        = load_data(data_path,
                    word_embedding_size=FLAGS.word_embedding_size,
                    max_count1=FLAGS.sequences_count1,
                    max_length1=FLAGS.sequence_length1,
                    max_count2=FLAGS.sequences_count2,
                    max_length2=FLAGS.sequence_length2,
                    shuffle=True,
                    pretrain_embedding_path=pretrain_embedding_path)
    # Word embedding matrix
    word_embedding_mat = [word_embedding[word] for word in vocabulary_inv]
    word_embedding_mat = np.array(word_embedding_mat, dtype=np.float32)
    # Split the original dataset into a train set and a test set
    data_x_1, data_x_1_test, data_x_2, data_x_2_test, \
        data_x_1_ratio, data_x_1_ratio_test, data_x_2_ratio, data_x_2_ratio_test, \
        data_y, data_y_test = train_test_split(
            data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, test_size=0.1)
    # Split the train set into a train set and a dev set
    data_x_1_train, data_x_1_dev, data_x_2_train, data_x_2_dev, \
        data_x_1_ratio_train, data_x_1_ratio_dev, data_x_2_ratio_train, data_x_2_ratio_dev, \
        data_y_train, data_y_dev = train_test_split(
            data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, test_size=0.1)
    logging.info('data_train: {}, data_dev: {}, data_test: {}'.format(
        len(data_x_1_train), len(data_x_1_dev), len(data_x_1_test)))

    timestamp = str(int(time.time()))
    trained_dir = './trained_result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    logging.critical('The trained result is saved in {}'.format(trained_dir))

    params = dict()
    params['learning_rate'] = FLAGS.learning_rate
    params['batch_size'] = FLAGS.batch_size
    params['decay_steps'] = FLAGS.decay_steps
    params['decay_rate'] = FLAGS.decay_rate
    params['num_epochs'] = FLAGS.num_epochs
    params['max_count1'] = FLAGS.sequences_count1
    params['max_count2'] = FLAGS.sequences_count2
    params['sequences_count1'] = sequences_count1
    params['sequences_count2'] = sequences_count2
    params['sequence_length1'] = sequence_length1
    params['sequence_length2'] = sequence_length2
    params['word_embedding_size'] = FLAGS.word_embedding_size
    params['conv_filter_sizes1'] = FLAGS.conv_filter_sizes1
    params['conv_filter_sizes2'] = FLAGS.conv_filter_sizes2
    params['num_filters1'] = FLAGS.num_filters1
    params['num_filters2'] = FLAGS.num_filters2
    params['hidden_size1'] = FLAGS.hidden_size1
    params['hidden_size2'] = FLAGS.hidden_size2
    params['dropout_keep_prob'] = FLAGS.dropout_keep_prob
    params['l2_lambda'] = FLAGS.l2_lambda
    params['clip_gradients'] = FLAGS.clip_gradients
    params['subword_length'] = FLAGS.subword_length
    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
    with open(trained_dir + 'preprocess.json', 'w') as outfile:
        json.dump(preprocess_yaml, outfile, indent=4, sort_keys=True, ensure_ascii=False)
    with open(trained_dir + 'columns_sequences_count1.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict1, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'columns_sequences_count2.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict2, outfile, indent=4, ensure_ascii=False)

    conv_filter_sizes1 = list(map(int, FLAGS.conv_filter_sizes1.split(',')))
    conv_filter_sizes2 = list(map(int, FLAGS.conv_filter_sizes2.split(',')))

    # Initialize the model
    model = DoubleMultiCnnModel(
        # batch size
        batch_size=FLAGS.batch_size,
        # number of sequences
        sequences_count=(sequences_count1, sequences_count2),
        # sequence length
        sequence_length=(sequence_length1, sequence_length2),
        # word embedding dimension
        word_embedding_size=FLAGS.word_embedding_size,
        # word embedding matrix
        word_embedding_mat=word_embedding_mat,
        # vocabulary size
        vocabulary_size=len(vocabulary),
        # convolution filter sizes
        conv_filter_sizes=(conv_filter_sizes1, conv_filter_sizes2),
        # number of convolution output channels
        num_filters=(FLAGS.num_filters1, FLAGS.num_filters2),
        # dense layer size
        hidden_size=(FLAGS.hidden_size1, FLAGS.hidden_size2),
        # output size
        output_size=data_y.shape[1],
        # learning rate decay steps
        decay_steps=FLAGS.decay_steps,
        # learning rate decay rate
        decay_rate=FLAGS.decay_rate,
        # learning rate
        learning_rate=FLAGS.learning_rate,
        # gradient clipping
        clip_gradients=FLAGS.clip_gradients,
        # L2 regularization weight
        l2_lambda=FLAGS.l2_lambda,
        # whether the model is training
        is_training=True,
        # subword length
        subword_length=FLAGS.subword_length)

    saver = tf.train.Saver(tf.global_variables())
    if pretrain_model_path:
        saver.restore(model.sess, tf.train.latest_checkpoint(pretrain_model_path))

    if not os.path.exists(CHECKPOINT_PATH):
        os.makedirs(CHECKPOINT_PATH)
    checkpoint_dir = CHECKPOINT_PATH + 'check_points_' + timestamp + '/'
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

    # Training starts here
    total_size = np.ceil(len(data_x_1_train) / FLAGS.batch_size) * FLAGS.num_epochs
    epoch_size = np.ceil(total_size / FLAGS.num_epochs)
    batch_size_larger = 0
    best_loss = 999
    best_step = 0
    batch_count = 0
    train_batches = batch_iter(
        list(zip(data_x_1_train, data_x_2_train, data_x_1_ratio_train,
                 data_x_2_ratio_train, data_y_train)),
        FLAGS.batch_size, FLAGS.num_epochs)
    while True:
        try:
            train_batch = next(train_batches)
        except StopIteration:
            break
        # Count progress in units of the original batch size, since the effective
        # batch size may be enlarged during training.
        batch_count += int(model.batch_size / FLAGS.batch_size)
        x_1_train_batch, x_2_train_batch, x_1_ratio_train_batch, \
            x_2_ratio_train_batch, y_train_batch = zip(*train_batch)
        train_loss = model.train_step(x_1_train_batch, x_2_train_batch,
                                      x_1_ratio_train_batch, x_2_ratio_train_batch,
                                      y_train_batch, FLAGS.dropout_keep_prob)
        current_step = tf.train.global_step(model.sess, model.global_step)
        # Evaluate the model
        if current_step % FLAGS.validate_step == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            dev_batches = batch_iter(
                list(zip(data_x_1_dev, data_x_2_dev, data_x_1_ratio_dev,
                         data_x_2_ratio_dev, data_y_dev)),
                FLAGS.batch_size, 1)
            dev_loss, dev_batch_num = 0.0, 0
            for dev_batch in dev_batches:
                dev_batch_num += 1
                x_1_dev_batch, x_2_dev_batch, x_1_ratio_dev_batch, \
                    x_2_ratio_dev_batch, y_dev_batch = zip(*dev_batch)
                dev_loss += model.dev_step(x_1_dev_batch, x_2_dev_batch,
                                           x_1_ratio_dev_batch, x_2_ratio_dev_batch,
                                           y_dev_batch)
            dev_loss /= dev_batch_num
            logging.info('Progress at : {}% examples, Train Loss is {}, Dev Loss is {}'.format(
                progress, train_loss, dev_loss))
            if dev_loss < best_loss:
                best_loss, best_step = dev_loss, current_step
                path = saver.save(model.sess, checkpoint_prefix, global_step=current_step)
                logging.critical('Best loss {} at step {}'.format(best_loss, best_step))
        if batch_count % epoch_size == 0:
            logging.critical('Learning rate is {}'.format(
                model.sess.run(model.learning_rate)))
            # Enlarge the batch size once the best dev loss drops below a threshold
            # that tightens with each enlargement
            changed = True
            if batch_size_larger == 0 and best_loss < 0.0405:
                pass
            elif batch_size_larger == 1 and best_loss < 0.0395:
                pass
            elif batch_size_larger == 2 and best_loss < 0.0385:
                pass
            else:
                changed = False
            if changed:
                model.batch_size = FLAGS.batch_size * np.power(2, batch_size_larger + 1)
                new_epochs = max(
                    int(FLAGS.num_epochs * (1 - batch_count / total_size)),
                    int(FLAGS.num_epochs / 4))
                train_batches = batch_iter(
                    list(zip(data_x_1_train, data_x_2_train, data_x_1_ratio_train,
                             data_x_2_ratio_train, data_y_train)),
                    model.batch_size, new_epochs)
                batch_size_larger += 1
                logging.critical("enlarge batch_size to {}".format(model.batch_size))
            # Change the learning-rate decay steps once the learning rate has decayed
            # below the configured threshold
            scale = int(FLAGS.learning_rate_threshold / model.sess.run(model.learning_rate))
            if scale >= 1:
                model.decay_steps = FLAGS.decay_steps * FLAGS.decay_steps_scale * scale
                logging.critical("Learning rate decay steps changed to {}".format(
                    model.decay_steps))
        # If the current step exceeds the maximum step, stop training
        if current_step >= FLAGS.max_step:
            break

    logging.critical('Training is complete, testing the best model on x_test and y_test')
    # Evaluate on x_test and y_test
    saver.restore(model.sess, checkpoint_prefix + '-' + str(best_step))
    # Save the model files to trained_dir
    saver.save(model.sess, trained_dir + 'best_model.ckpt')
    test_batches = batch_iter(
        list(zip(data_x_1_test, data_x_2_test, data_x_1_ratio_test,
                 data_x_2_ratio_test, data_y_test)),
        FLAGS.batch_size, 1)
    test_loss, test_batch_num = 0.0, 0
    for test_batch in test_batches:
        test_batch_num += 1
        x_1_test_batch, x_2_test_batch, x_1_ratio_test_batch, \
            x_2_ratio_test_batch, y_test_batch = zip(*test_batch)
        test_loss += model.dev_step(x_1_test_batch, x_2_test_batch,
                                    x_1_ratio_test_batch, x_2_ratio_test_batch,
                                    y_test_batch)
    test_loss /= test_batch_num
    logging.critical('Loss on test set is {}'.format(test_loss))

    predict_by_model(preprocess_config, columns_sequences_count_dict1,
                     columns_sequences_count_dict2, vocabulary, model)

    with open(trained_dir + 'vocabulary.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + 'word_embedding.pickle', 'wb') as outfile:
        pickle.dump(word_embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
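# A hedged driver sketch for this training entry point. The dataset pickle and the
# pretrained-embedding path below are hypothetical; the FLAGS values are presumably
# defined elsewhere in the module.
if __name__ == '__main__':
    train('./data/dataset/train_data_part1.pickle',          # hypothetical dataset path
          pretrain_embedding_path='./data/pretrain/word_embedding.txt',  # hypothetical
          pretrain_model_path=None)                          # or a previous checkpoint dir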