Example #1
def predict_by_model(preprocess_config, columns_sequences_count_dict1, columns_sequences_count_dict2, vocabulary, model):

    data_path = os.path.join(root_dir, './data/dataset/train_data_part1_test.pickle')
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)

    sequences_count1 = model.sequences_count1
    sequences_count2 = model.sequences_count2
    sequence_length1 = model.sequence_length1
    sequence_length2 = model.sequence_length2

    logging.info("Start loading Testing file from {}".format(data_path))

    data, features_columns, short_features_columns = processor.process_v2(data, config=preprocess_config, is_dict=False)

    columns_str_features_dict1 = load_data.process_predict_str_features(data[features_columns], columns_sequences_count_dict1,
                                                                        sequence_length1)
    columns_str_features_dict2 = load_data.process_predict_str_features(data[short_features_columns],
                                                                        columns_sequences_count_dict2, sequence_length2)
    logging.info("Test Data Sentence segment is complete")

    merged_str_features_list1 = load_data.merge_str_features(features_columns, len(data), columns_str_features_dict1, sequences_count1,
                                                             sequence_length1)
    merged_str_features_list2 = load_data.merge_str_features(short_features_columns, len(data), columns_str_features_dict2, sequences_count2,
                                                             sequence_length2)

    vocab_tree = trie_tree.Trie()
    for word in vocabulary:
        vocab_tree.insert(word)

    input_x1, input_x1_ratio = load_data.build_str_features_data(merged_str_features_list1, vocabulary, vocab_tree)
    input_x2, input_x2_ratio = load_data.build_str_features_data(merged_str_features_list2, vocabulary, vocab_tree)
    input_y = data[preprocess_config.result].fillna(0.0)
    input_y = np.array(input_y, dtype=np.float32)

    logging.info("Test Data Preprocessing is complete")

    batches = load_data.batch_iter(list(zip(input_x1, input_x2, input_x1_ratio, input_x2_ratio, input_y)), model.batch_size, num_epochs=1, shuffle=False)
    total_loss = 0.0
    batch_count = 0

    for batch in batches:
        x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch, y_batch = zip(*batch)
        total_loss += model.dev_step(x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch, y_batch)
        batch_count += 1
    logging.info("Test Loss is {}".format(total_loss / batch_count))
Example #2
def predict(data, trained_dir, output='features'):
    params, preprocess_config, columns_sequences_count_dict1, columns_sequences_count_dict2, vocabulary, word_embedding_mat = \
        load_trained_params(trained_dir)

    sequences_count1 = params['sequences_count1']
    sequences_count2 = params['sequences_count2']
    sequence_length1 = params['sequence_length1']
    sequence_length2 = params['sequence_length2']

    data, features_columns, short_features_columns = processor.process_v2(data, config=preprocess_config, is_dict=True)

    columns_str_features_dict1 = load_data.process_predict_str_features(data[features_columns], columns_sequences_count_dict1,
                                                                        sequence_length1)
    columns_str_features_dict2 = load_data.process_predict_str_features(data[short_features_columns],
                                                                        columns_sequences_count_dict2, sequence_length2)
    logging.info("Sentence segment is complete")

    merged_str_features_list1 = load_data.merge_str_features(features_columns, len(data), columns_str_features_dict1, sequences_count1,
                                                             sequence_length1)
    merged_str_features_list2 = load_data.merge_str_features(short_features_columns, len(data), columns_str_features_dict2, sequences_count2,
                                                             sequence_length2)

    conv_filter_sizes1 = list(map(int, params['conv_filter_sizes1'].split(',')))
    conv_filter_sizes2 = list(map(int, params['conv_filter_sizes2'].split(',')))

    vocab_tree = trie_tree.Trie()
    for word in vocabulary:
        vocab_tree.insert(word)

    input_x1, input_x1_ratio = load_data.build_str_features_data(merged_str_features_list1, vocabulary, vocab_tree)
    input_x2, input_x2_ratio = load_data.build_str_features_data(merged_str_features_list2, vocabulary, vocab_tree)

    logging.info("Preprocessing is complete")

    model = DoubleMultiCnnModel(
        batch_size=params['batch_size'],
        sequences_count=(sequences_count1, sequences_count2),
        sequence_length=(sequence_length1, sequence_length2),
        word_embedding_size=params['word_embedding_size'],
        word_embedding_mat=word_embedding_mat,
        vocabulary_size=len(vocabulary),
        conv_filter_sizes=(conv_filter_sizes1, conv_filter_sizes2),
        num_filters=(params['num_filters1'], params['num_filters2']),
        hidden_size=(params['hidden_size1'], params['hidden_size2']),
        output_size=len(preprocess_config['result']),
        decay_steps=params['decay_steps'],
        learning_rate=params['learning_rate'],
        clip_gradients=params['clip_gradients'],
        l2_lambda=params['l2_lambda'],
        is_training=False,
        subword_length=params['subword_length']
    )
    checkpoint_file = trained_dir + 'best_model.ckpt'
    # Restore the trained weights into the freshly built graph (re-importing
    # the meta graph here would duplicate the graph, so a plain Saver is used).
    saver = tf.train.Saver(tf.global_variables())
    saver.restore(model.sess, checkpoint_file)
    logging.critical('{} has been loaded'.format(checkpoint_file))

    total_size = np.ceil(len(input_x1) / params['batch_size'])

    batches = load_data.batch_iter(list(zip(input_x1, input_x2, input_x1_ratio, input_x2_ratio)), params['batch_size'], num_epochs=1, shuffle=False)
    batch_count = 0
    output_data = []

    if output == 'result':
        predict_function = model.predict_step
    else:
        predict_function = model.predict_output_features_step

    for batch in batches:
        x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch = zip(*batch)
        output_data_batch = predict_function(x_1_batch, x_2_batch, x_1_ratio_batch, x_2_ratio_batch)
        output_data.append(output_data_batch)
        batch_count += 1
        if batch_count % 20 == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            logging.info('Progress at : {}% data'.format(progress))
    output_data = np.vstack(output_data)

    return output_data, params, preprocess_config, features_columns, short_features_columns, \
           columns_sequences_count_dict1, columns_sequences_count_dict2
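trie_tree.Trie only needs an insert method in the snippets above; presumably load_data.build_str_features_data then queries the tree when it maps words to the vocabulary. A minimal sketch of such a trie, with a lookup method added on that assumption (method names beyond insert are guesses, not the project's API):

class Trie:
    # Minimal character-level prefix tree.

    def __init__(self):
        self.children = {}
        self.is_word = False

    def insert(self, word):
        # Add `word` one character at a time, marking the final node.
        node = self
        for ch in word:
            node = node.children.setdefault(ch, Trie())
        node.is_word = True

    def search(self, word):
        # Return True only if `word` was inserted as a complete word.
        node = self
        for ch in word:
            node = node.children.get(ch)
            if node is None:
                return False
        return node.is_word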
Example #3
def train(data_path, pretrain_path=None):
    # Generate the training data
    data_x, data_y, word_embedding, vocabulary, vocabulary_inv,\
        sequences_count, sequence_length, columns_sequences_count_dict\
        = load_data(data_path,
                    word_embedding_size=FLAGS.word_embedding_size,
                    max_count=FLAGS.sequences_count,
                    max_length=FLAGS.sequence_length,
                    shuffle=True,
                    pretrain_embedding_path=pretrain_path)

    # Word embedding matrix, ordered by vocabulary index
    word_embedding_mat = [
        word_embedding[word] for word in vocabulary_inv
    ]
    word_embedding_mat = np.array(word_embedding_mat, dtype=np.float32)

    # split the original dataset into train set and test set
    data_x, data_x_test, data_y, data_y_test = train_test_split(data_x,
                                                                data_y,
                                                                test_size=0.1)

    # split the train set into train set and dev set
    data_x_train, data_x_dev, data_y_train, data_y_dev = train_test_split(
        data_x, data_y, test_size=0.1)

    logging.info('data_train: {}, data_dev: {}, data_test: {}'.format(
        len(data_x_train), len(data_x_dev), len(data_x_test)))

    timestamp = str(int(time.time()))
    trained_dir = './trained_result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    logging.critical('The trained result is saved in {}'.format(trained_dir))

    params = dict()
    params['learning_rate'] = FLAGS.learning_rate
    params['batch_size'] = FLAGS.batch_size
    params['decay_steps'] = FLAGS.decay_steps
    params['decay_rate'] = FLAGS.decay_rate
    params['num_epochs'] = FLAGS.num_epochs

    params['max_count'] = FLAGS.sequences_count
    params['sequences_count'] = sequences_count
    params['sequence_length'] = sequence_length
    params['word_embedding_size'] = FLAGS.word_embedding_size
    params['conv_filter_sizes'] = FLAGS.conv_filter_sizes
    params['num_filters'] = FLAGS.num_filters
    params['hidden_size'] = FLAGS.hidden_size

    params['dropout_keep_prob'] = FLAGS.dropout_keep_prob
    params['l2_lambda'] = FLAGS.l2_lambda
    params['clip_gradients'] = FLAGS.clip_gradients

    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)

    with open(trained_dir + 'preprocess.json', 'w') as outfile:
        json.dump(preprocess_yaml,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)

    with open(trained_dir + 'columns_sequences_count.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict,
                  outfile,
                  indent=4,
                  ensure_ascii=False)

    # Initialize the model
    model = MultiCnnModel(
        # Batch size
        batch_size=FLAGS.batch_size,
        # Number of sequences
        sequences_count=sequences_count,
        # Sequence length
        sequence_length=sequence_length,
        # Word embedding size
        word_embedding_size=FLAGS.word_embedding_size,
        # Word embedding matrix
        word_embedding_mat=word_embedding_mat,
        # Vocabulary size
        vocabulary_size=len(vocabulary),
        # Convolution filter sizes
        conv_filter_sizes=list(map(int, FLAGS.conv_filter_sizes.split(','))),
        # Number of convolution filters (output channels)
        num_filters=FLAGS.num_filters,
        # Dense (hidden) layer size
        hidden_size=FLAGS.hidden_size,
        # Output size
        output_size=data_y.shape[1],
        # Dropout keep probability
        dropout_keep_prob=FLAGS.dropout_keep_prob,
        # Whether the model is in training mode
        is_training=FLAGS.is_training,
        # Learning rate decay steps
        decay_steps=FLAGS.decay_steps,
        # Learning rate decay rate
        decay_rate=FLAGS.decay_rate,
        # Learning rate
        learning_rate=FLAGS.learning_rate,
        # Gradient clipping threshold
        clip_gradients=FLAGS.clip_gradients,
        # L2 regularization weight
        l2_lambda=FLAGS.l2_lambda,
    )

    # summary_dir = SUMMARY_PATH + 'summary_' + timestamp + '/'
    # if os.path.exists(summary_dir):
    #     shutil.rmtree(summary_dir)
    # os.makedirs(summary_dir)
    # logging.critical('The summary information is saved in {}'.format(summary_dir))
    # train_writer = tf.summary.FileWriter(summary_dir + 'train', model.sess.graph)

    if not os.path.exists(CHECKPOINT_PATH):
        os.makedirs(CHECKPOINT_PATH)

    checkpoint_dir = CHECKPOINT_PATH + 'check_points_' + timestamp + '/'
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

    init = tf.global_variables_initializer()
    model.sess.run(init)

    saver = tf.train.Saver(tf.global_variables())

    # Training starts here
    total_size = np.ceil(
        len(data_x_train) / FLAGS.batch_size) * FLAGS.num_epochs

    best_loss = 999
    best_step = 0
    batch_count = 0

    train_batches = batch_iter(list(zip(data_x_train, data_y_train)),
                               FLAGS.batch_size, FLAGS.num_epochs)

    for train_batch in train_batches:
        batch_count += 1
        x_train_batch, y_train_batch = zip(*train_batch)
        train_loss = model.train_step(x_train_batch, y_train_batch)
        current_step = tf.train.global_step(model.sess, model.global_step)

        #Evaluate the model
        if current_step % FLAGS.validate_step == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            dev_batches = batch_iter(list(zip(data_x_dev, data_y_dev)),
                                     FLAGS.batch_size, 1)
            dev_loss, dev_batch_num = 0.0, 0
            for dev_batch in dev_batches:
                dev_batch_num += 1
                x_dev_batch, y_dev_batch = zip(*dev_batch)
                dev_loss_batch = model.dev_step(x_dev_batch, y_dev_batch)
                dev_loss += dev_loss_batch

            dev_loss /= dev_batch_num
            logging.info(
                'Progress at : {}% examples, Train Loss is {}, Dev Loss is {}'.
                format(progress, train_loss, dev_loss))

            if dev_loss < best_loss:
                best_loss, best_step = dev_loss, current_step
                path = saver.save(model.sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                logging.critical('Best loss {} at step {}'.format(
                    best_loss, best_step))

        # if current step exceeds max step, stop training
        if current_step >= FLAGS.max_step:
            break

    logging.critical(
        'Training is complete, testing the best model on x_test and y_test')

    # close summary writer
    # train_writer.close()

    # evaluate x_test and y_test
    saver.restore(model.sess, checkpoint_prefix + '-' + str(best_step))

    #Save the model files to trained_dir
    saver.save(model.sess, trained_dir + 'best_model.ckpt')

    test_batches = batch_iter(list(zip(data_x_test, data_y_test)),
                              FLAGS.batch_size, 1)
    test_loss, test_batch_num = 0.0, 0
    for test_batch in test_batches:
        test_batch_num += 1
        x_test_batch, y_test_batch = zip(*test_batch)
        test_loss_batch = model.dev_step(x_test_batch, y_test_batch)
        test_loss += test_loss_batch

    test_loss /= test_batch_num
    logging.critical('Loss on test set is {}'.format(test_loss))

    with open(trained_dir + 'vocabulary.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)

    word_embedding_mat = model.sess.run(model.word_embedding_mat)
    with open(trained_dir + 'word_embedding.pickle', 'wb') as outfile:
        pickle.dump(word_embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
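load_trained_params is referenced by the predict functions but not shown. A plausible sketch for the single-model variant, assuming it simply reads back the artifacts that train() above writes into trained_dir; only the file names come from train(), the function body itself is an assumption:

import json
import os
import pickle

def load_trained_params(trained_dir):
    # Read back the JSON/pickle artifacts written at the end of train().
    with open(os.path.join(trained_dir, 'trained_parameters.json')) as f:
        params = json.load(f)
    with open(os.path.join(trained_dir, 'preprocess.json')) as f:
        preprocess_config = json.load(f)
    with open(os.path.join(trained_dir, 'columns_sequences_count.json')) as f:
        columns_sequences_count_dict = json.load(f)
    with open(os.path.join(trained_dir, 'vocabulary.json')) as f:
        vocabulary = json.load(f)
    with open(os.path.join(trained_dir, 'word_embedding.pickle'), 'rb') as f:
        word_embedding_mat = pickle.load(f)
    return params, preprocess_config, columns_sequences_count_dict, \
        vocabulary, word_embedding_mat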
Example #4
def predict(data, trained_dir, output='features'):
    params, preprocess_config, columns_sequences_count_dict, vocabulary, word_embedding_mat = \
        load_trained_params(trained_dir)

    sequences_count = params['sequences_count']
    sequence_length = params['sequence_length']

    data, features_columns = processor.process(data,
                                               config=preprocess_config,
                                               is_dict=True)
    columns_str_features_dict = load_data.process_predict_str_features(
        data[features_columns], columns_sequences_count_dict, sequence_length)

    merged_str_features_list = load_data.merge_str_features(
        features_columns, len(data), columns_str_features_dict,
        sequences_count, sequence_length)

    input_x = load_data.build_str_features_data(merged_str_features_list,
                                                vocabulary)

    model = MultiCnnModel(
        batch_size=params['batch_size'],
        sequences_count=sequences_count,
        sequence_length=sequence_length,
        word_embedding_size=params['word_embedding_size'],
        word_embedding_mat=word_embedding_mat,
        vocabulary_size=len(vocabulary),
        conv_filter_sizes=list(map(int,
                                   params['conv_filter_sizes'].split(','))),
        num_filters=params['num_filters'],
        hidden_size=params['hidden_size'],
        output_size=len(preprocess_config['result']),
        dropout_keep_prob=1.0,
        is_training=False,
        decay_steps=params['decay_steps'],
        learning_rate=params['learning_rate'],
        clip_gradients=params['clip_gradients'],
        l2_lambda=params['l2_lambda'],
    )

    checkpoint_file = trained_dir + 'best_model.ckpt'
    # Restore the trained weights into the freshly built graph (re-importing
    # the meta graph here would duplicate the graph, so a plain Saver is used).
    saver = tf.train.Saver(tf.global_variables())
    saver.restore(model.sess, checkpoint_file)
    logging.critical('{} has been loaded'.format(checkpoint_file))

    total_size = np.ceil(len(input_x) / params['batch_size'])

    batches = load_data.batch_iter(input_x,
                                   params['batch_size'],
                                   num_epochs=1,
                                   shuffle=False)
    batch_count = 0
    output_data = []

    if output == 'result':
        predict_function = model.predict_step
    else:
        predict_function = model.predict_output_features_step

    for batch in batches:
        x_batch = batch
        output_data_batch = predict_function(x_batch)
        output_data.append(output_data_batch)
        batch_count += 1
        if batch_count % 20 == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            logging.info('Progress at : {}% data'.format(progress))
    output_data = np.vstack(output_data)

    return output_data, params, preprocess_config, features_columns, columns_sequences_count_dict
Example #5
def train(data_path, pretrain_embedding_path=None, pretrain_model_path=None):
    # Generate the training data
    data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, word_embedding, vocabulary, vocabulary_inv,\
        sequences_count1, sequences_count2, sequence_length1, sequence_length2, \
        columns_sequences_count_dict1, columns_sequences_count_dict2 \
        = load_data(data_path,
                    word_embedding_size=FLAGS.word_embedding_size,
                    max_count1=FLAGS.sequences_count1,
                    max_length1=FLAGS.sequence_length1,
                    max_count2=FLAGS.sequences_count2,
                    max_length2=FLAGS.sequence_length2,
                    shuffle=True,
                    pretrain_embedding_path=pretrain_embedding_path)

    # Word embedding matrix, ordered by vocabulary index
    word_embedding_mat = [
        word_embedding[word] for word in vocabulary_inv
    ]
    word_embedding_mat = np.array(word_embedding_mat, dtype=np.float32)

    # split the original dataset into train set and test set
    data_x_1, data_x_1_test, data_x_2, data_x_2_test, \
    data_x_1_ratio, data_x_1_ratio_test, data_x_2_ratio, data_x_2_ratio_test, \
    data_y, data_y_test = train_test_split(
        data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, test_size=0.1)

    # split the train set into train set and dev set
    data_x_1_train, data_x_1_dev, data_x_2_train, data_x_2_dev, \
    data_x_1_ratio_train, data_x_1_ratio_dev, data_x_2_ratio_train, data_x_2_ratio_dev, \
    data_y_train, data_y_dev = train_test_split(
        data_x_1, data_x_2, data_x_1_ratio, data_x_2_ratio, data_y, test_size=0.1)

    logging.info('data_train: {}, data_dev: {}, data_test: {}'.format(
        len(data_x_1_train), len(data_x_1_dev), len(data_x_1_test)))

    timestamp = str(int(time.time()))
    trained_dir = './trained_result/trained_results_' + timestamp + '/'
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)
    logging.critical('The trained result is saved in {}'.format(trained_dir))

    params = dict()
    params['learning_rate'] = FLAGS.learning_rate
    params['batch_size'] = FLAGS.batch_size
    params['decay_steps'] = FLAGS.decay_steps
    params['decay_rate'] = FLAGS.decay_rate
    params['num_epochs'] = FLAGS.num_epochs

    params['max_count1'] = FLAGS.sequences_count1
    params['max_count2'] = FLAGS.sequences_count2

    params['sequences_count1'] = sequences_count1
    params['sequences_count2'] = sequences_count2
    params['sequence_length1'] = sequence_length1
    params['sequence_length2'] = sequence_length2
    params['word_embedding_size'] = FLAGS.word_embedding_size

    params['conv_filter_sizes1'] = FLAGS.conv_filter_sizes1
    params['conv_filter_sizes2'] = FLAGS.conv_filter_sizes2
    params['num_filters1'] = FLAGS.num_filters1
    params['num_filters2'] = FLAGS.num_filters2

    params['hidden_size1'] = FLAGS.hidden_size1
    params['hidden_size2'] = FLAGS.hidden_size2

    params['dropout_keep_prob'] = FLAGS.dropout_keep_prob
    params['l2_lambda'] = FLAGS.l2_lambda
    params['clip_gradients'] = FLAGS.clip_gradients

    params['subword_length'] = FLAGS.subword_length

    with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)

    with open(trained_dir + 'preprocess.json', 'w') as outfile:
        json.dump(preprocess_yaml,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)

    with open(trained_dir + 'columns_sequences_count1.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict1,
                  outfile,
                  indent=4,
                  ensure_ascii=False)

    with open(trained_dir + 'columns_sequences_count2.json', 'w') as outfile:
        json.dump(columns_sequences_count_dict2,
                  outfile,
                  indent=4,
                  ensure_ascii=False)

    conv_filter_sizes1 = list(map(int, FLAGS.conv_filter_sizes1.split(',')))
    conv_filter_sizes2 = list(map(int, FLAGS.conv_filter_sizes2.split(',')))

    # Initialize the model
    model = DoubleMultiCnnModel(
        # Batch size
        batch_size=FLAGS.batch_size,
        # Number of sequences
        sequences_count=(sequences_count1, sequences_count2),
        # Sequence length
        sequence_length=(sequence_length1, sequence_length2),
        # Word embedding size
        word_embedding_size=FLAGS.word_embedding_size,
        # Word embedding matrix
        word_embedding_mat=word_embedding_mat,
        # Vocabulary size
        vocabulary_size=len(vocabulary),
        # Convolution filter sizes
        conv_filter_sizes=(conv_filter_sizes1, conv_filter_sizes2),
        # Number of convolution filters (output channels)
        num_filters=(FLAGS.num_filters1, FLAGS.num_filters2),
        # Dense (hidden) layer sizes
        hidden_size=(FLAGS.hidden_size1, FLAGS.hidden_size2),
        # Output size
        output_size=data_y.shape[1],
        # Learning rate decay steps
        decay_steps=FLAGS.decay_steps,
        # Learning rate decay rate
        decay_rate=FLAGS.decay_rate,
        # Learning rate
        learning_rate=FLAGS.learning_rate,
        # Gradient clipping threshold
        clip_gradients=FLAGS.clip_gradients,
        # L2 regularization weight
        l2_lambda=FLAGS.l2_lambda,
        # Whether the model is in training mode
        is_training=True,
        # Subword length
        subword_length=FLAGS.subword_length)
    saver = tf.train.Saver(tf.global_variables())

    if pretrain_model_path:
        saver.restore(model.sess,
                      tf.train.latest_checkpoint(pretrain_model_path))

    if not os.path.exists(CHECKPOINT_PATH):
        os.makedirs(CHECKPOINT_PATH)

    checkpoint_dir = CHECKPOINT_PATH + 'check_points_' + timestamp + '/'
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

    # Training starts here
    total_size = np.ceil(
        len(data_x_1_train) / FLAGS.batch_size) * FLAGS.num_epochs
    epoch_size = np.ceil(total_size / FLAGS.num_epochs)
    batch_size_larger = 0

    best_loss = 999
    best_step = 0
    batch_count = 0

    train_batches = batch_iter(
        list(
            zip(data_x_1_train, data_x_2_train, data_x_1_ratio_train,
                data_x_2_ratio_train, data_y_train)), FLAGS.batch_size,
        FLAGS.num_epochs)

    # Use an explicit while/next loop so that `train_batches` can be swapped
    # for a new generator below when the batch size is enlarged.
    while True:
        try:
            train_batch = next(train_batches)
        except StopIteration:
            break

        # Count progress in units of the original batch size.
        batch_count += int(model.batch_size / FLAGS.batch_size)
        x_1_train_batch, x_2_train_batch, x_1_ratio_train_batch, x_2_ratio_train_batch, y_train_batch = zip(
            *train_batch)
        train_loss = model.train_step(x_1_train_batch, x_2_train_batch,
                                      x_1_ratio_train_batch,
                                      x_2_ratio_train_batch, y_train_batch,
                                      FLAGS.dropout_keep_prob)
        current_step = tf.train.global_step(model.sess, model.global_step)

        #Evaluate the model
        if current_step % FLAGS.validate_step == 0:
            progress = ('%.2f' % (batch_count / total_size * 100))
            dev_batches = batch_iter(
                list(
                    zip(data_x_1_dev, data_x_2_dev, data_x_1_ratio_dev,
                        data_x_2_ratio_dev, data_y_dev)), FLAGS.batch_size, 1)
            dev_loss, dev_batch_num = 0.0, 0
            for dev_batch in dev_batches:
                dev_batch_num += 1
                x_1_dev_batch, x_2_dev_batch, x_1_ratio_dev_batch, x_2_ratio_dev_batch, y_dev_batch = zip(
                    *dev_batch)
                dev_loss_batch = model.dev_step(x_1_dev_batch, x_2_dev_batch,
                                                x_1_ratio_dev_batch,
                                                x_2_ratio_dev_batch,
                                                y_dev_batch)
                dev_loss += dev_loss_batch

            dev_loss /= dev_batch_num
            logging.info(
                'Progress at : {}% examples, Train Loss is {}, Dev Loss is {}'.
                format(progress, train_loss, dev_loss))

            if dev_loss < best_loss:
                best_loss, best_step = dev_loss, current_step
                path = saver.save(model.sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                logging.critical('Best loss {} at step {}'.format(
                    best_loss, best_step))

        if batch_count % epoch_size == 0:

            logging.critical('Learning rate is {}'.format(
                model.sess.run(model.learning_rate)))

            # Decide whether the dev loss has crossed the next threshold,
            # which triggers a batch-size enlargement below.
            if batch_size_larger == 0 and best_loss < 0.0405:
                changed = True
            elif batch_size_larger == 1 and best_loss < 0.0395:
                changed = True
            elif batch_size_larger == 2 and best_loss < 0.0385:
                changed = True
            else:
                changed = False

            if changed:
                model.batch_size = FLAGS.batch_size * np.power(
                    2, batch_size_larger + 1)
                new_epochs = max(
                    int(FLAGS.num_epochs * (1 - batch_count / total_size)),
                    int(FLAGS.num_epochs / 4))
                train_batches = batch_iter(
                    list(
                        zip(data_x_1_train, data_x_2_train,
                            data_x_1_ratio_train, data_x_2_ratio_train,
                            data_y_train)), model.batch_size, new_epochs)
                batch_size_larger += 1
                logging.critical("enlarge batch_size to {}".format(
                    model.batch_size))

            # Slow the learning rate decay once the rate falls below the threshold
            scale = int(FLAGS.learning_rate_threshold /
                        model.sess.run(model.learning_rate))
            if scale >= 1:
                model.decay_steps = FLAGS.decay_steps * FLAGS.decay_steps_scale * scale
            logging.critical("Learning rate decay steps changed to {}".format(
                model.decay_steps))

            # if current step exceeds max step, stop training
            if current_step >= FLAGS.max_step:
                break
    logging.critical(
        'Training is complete, testing the best model on x_test and y_test')

    # evaluate x_test and y_test
    saver.restore(model.sess, checkpoint_prefix + '-' + str(best_step))

    #Save the model files to trained_dir
    saver.save(model.sess, trained_dir + 'best_model.ckpt')

    test_batches = batch_iter(
        list(
            zip(data_x_1_test, data_x_2_test, data_x_1_ratio_test,
                data_x_2_ratio_test, data_y_test)), FLAGS.batch_size, 1)
    test_loss, test_batch_num = 0.0, 0
    for test_batch in test_batches:
        test_batch_num += 1
        x_1_test_batch, x_2_test_batch, x_1_ratio_test_batch, x_2_ratio_test_batch, y_test_batch = zip(
            *test_batch)
        test_loss_batch = model.dev_step(x_1_test_batch, x_2_test_batch,
                                         x_1_ratio_test_batch,
                                         x_2_ratio_test_batch, y_test_batch)
        test_loss += test_loss_batch

    test_loss /= test_batch_num
    logging.critical('Loss on test set is {}'.format(test_loss))

    predict_by_model(preprocess_config, columns_sequences_count_dict1,
                     columns_sequences_count_dict2, vocabulary, model)

    with open(trained_dir + 'vocabulary.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)

    with open(trained_dir + 'word_embedding.pickle', 'wb') as outfile:
        pickle.dump(word_embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
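For context, a hypothetical entry point for the two-tower pipeline above; the dataset path is a placeholder rather than a file from the project:

if __name__ == '__main__':
    # Placeholder path: train() expects the pickled training dataset
    # consumed by load_data() above.
    train('./data/dataset/train_data.pickle')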