Example 1
def to_csv():
    train_path = "../input/processing/spanish_train_dedup.txt"
    test_path = "../input/processing/test_b.txt"
    columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
    train = read_data(train_path)
    test = read_data(test_path)

    all_question = set()
    for line in train + test:
        line = line.strip().split('\t')
        all_question.add(line[1])
        all_question.add(line[2])

    qids = dict(zip(all_question, list(range(1, len(all_question) + 1))))

    # write train.csv
    with open('../input/processing/train.csv', 'wt', encoding='utf-8') as f:
        f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(*columns))
        for i, line in enumerate(train):
            line = line.strip().split('\t')
            q1, q2, is_duplicate = line[1], line[2], line[0]
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(i, qids[q1], qids[q2],
                                                      q1, q2, is_duplicate))

    with open('../input/processing/test.csv', 'wt', encoding='utf-8') as f:
        f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(*columns))
        for i, line in enumerate(test):
            line = line.strip().split('\t')
            q1, q2, is_duplicate = line[1], line[2], line[0]
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(i, qids[q1], qids[q2],
                                                      q1, q2, is_duplicate))
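The read_data and save_data helpers are imported from the surrounding project and are not shown in these examples; a minimal, hypothetical sketch of the behavior the examples appear to assume (read a text file into a list of lines, write a list of lines back out) could look like this:

# Hypothetical sketch only; the real project helpers may differ.
def read_data(path, encoding='utf-8'):
    # return the raw lines of a text file, newlines included
    with open(path, 'rt', encoding=encoding) as f:
        return f.readlines()


def save_data(path, data, encoding='utf-8'):
    # write a list of lines (already newline-terminated) back to disk
    with open(path, 'wt', encoding=encoding) as f:
        f.writelines(data)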
Example 2
def build_spanish_character_vocab():
    print('build spanish character vocab')
    train_path = '../input/processing/spanish_train_dedup.txt'
    test_path = '../input/processing/test.txt'

    vocabs = {}
    train = read_data(train_path)
    test = read_data(test_path)
    for data in [train, test]:
        for line in data:
            line = line.strip().split('\t')
            assert len(line) == 3, len(line)
            for c in line[1] + line[2]:
                if c in vocabs:
                    vocabs[c] += 1
                else:
                    vocabs[c] = 1

    # write the character counts to char_vocab.txt
    char_vocabs = sorted(vocabs.items(), key=lambda x: x[1], reverse=True)
    fw = open('../input/char_vocab.txt', 'wt', encoding='utf-8')

    for (k, v) in char_vocabs:
        fw.write("{}\t{}\n".format(k, v))

    fw.close()
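A hedged sketch of reading char_vocab.txt back in (each line is "<char>\t<count>", highest frequency first); the loader name is illustrative and not part of the original code:

# Hypothetical loader for the char_vocab.txt written above.
def load_char_vocab(path='../input/char_vocab.txt'):
    char_counts = {}
    with open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            char, count = line.rstrip('\n').split('\t')
            char_counts[char] = int(count)
    return char_counts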
Example 3
def concat_english_spanish_vocab():
    english_vocab = read_data("../input/english_word_vocabs.txt")
    spanish_vocab = read_data("../input/words.txt")

    vocabs = spanish_vocab + english_vocab

    save_data(path='../input/processing/multi_task_learn/all_vocab.txt',
              data=vocabs)
Example 4
def processing_data_1_step():
    """对数据集进行切分,先分成英文部分,西班牙文部分,不shuffle"""
    process_base_path = '../input/processing/'
    base_path = '../input/'

    if not os.path.exists(process_base_path):
        os.makedirs(process_base_path)

    train_en = base_path + 'cikm_english_train_20180516.txt'
    train_sp = base_path + 'cikm_spanish_train_20180516.txt'
    unlabel_data = base_path + 'cikm_unlabel_spanish_train_20180516.txt'

    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    unlabel_file = process_base_path + 'unlabel_spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    # split english_train into its English and Spanish halves
    en_train = read_data(train_en)
    ens = []
    sps = []
    for line in en_train:
        line = line.strip()
        line_arr = line.split('\t')
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0],
                                         line_arr[2]))
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1],
                                         line_arr[3]))

    sp_train = read_data(train_sp)

    for line in sp_train:
        # line = punctiation(line)
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0],
                                         line_arr[2]))
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1],
                                         line_arr[3]))

    # save the separated English and Spanish files
    save_data(english_file, data=ens)
    save_data(spanish_file, data=sps)

    print(u'Preprocess the test data; all labels are set to 0')
    test_path = base_path + 'cikm_test_b_20180730.txt'
    test = read_data(test_path)
    sps = []
    for line in test:
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(0, line_arr[0], line_arr[1]))
    save_data(test_file, sps)

    print('Done')
Example 5
def split_train_valid(split_rate, is_use_real=False):
    train_file = config.spanish_train_path
    train_data_file = '../input/processing/train_data.txt'
    valid_data_file = '../input/processing/valid_data.txt'

    train = read_data(train_file)
    # random.shuffle(train)
    if is_use_real:
        valid_size = 1400

        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]

        random.shuffle(train_data)

        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)

    else:
        valid_size = int(split_rate * len(train))
        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]

        # random.shuffle(train_data)

        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)
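A minimal usage sketch, assuming config.spanish_train_path points at the deduplicated Spanish training file; the split rate value is illustrative:

# Illustrative call: hold out 10% of the training lines as validation data.
split_train_valid(split_rate=0.1)
# Or keep the last 1400 lines as a fixed validation set instead:
split_train_valid(split_rate=0.0, is_use_real=True)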
Example 6
  def embedding_batch(sess, model, vocab):
    """
    Fast responses by passing
    - pre-generated model,
    - session
    - vocabulary
    And a batch sentences to produce a output logit to
    """
    checkpoint_path = os.path.join(FLAGS.embedding_dir, FLAGS.embedding_model)
    writer = tf.summary.FileWriter(FLAGS.embedding_dir, sess.graph)
    config = projector.ProjectorConfig()
    embed = config.embeddings.add()
    embed.tensor_name = 'item_embedding'
    embed.metadata_path = data_utils.get_metadata_set_path(FLAGS.embedding_dir)
    projector.visualize_embeddings(writer, config)
    train_path = data_utils.get_train_set_path(FLAGS.train_dir)
    train_ids_path = train_path + ("_ids%d" % FLAGS.vocab_size)
    train_set = data_utils.read_data(train_ids_path, max_size=500000)  # originally FLAGS.max_train_data_size
    state_list = []

    meta_list = []
    for bucket_id in xrange(len(_buckets)):
      meta_list.extend(train_set[bucket_id])
    for i, each in enumerate(meta_list):
      meta_list[i] = [vocab[id] for id in each[0]]
    deduped = {}
    for i, each in enumerate(meta_list):
      deduped[" ".join(each)] = i
    deduped_tuple_list = deduped.items()
    indices = [each[1] for each in deduped_tuple_list]

    metadata_path = embed.metadata_path

    with open(metadata_path, 'w+') as item_file:
      item_file.write('id\tchar\n')
      for i, each in enumerate(deduped_tuple_list):
        item_file.write('{}\t{}\n'.format(i, each[0]))
      print('metadata file created')

    for bucket_id in xrange(len(_buckets)):
      begin = 0
      # some data will be ignored
      while begin < len(train_set[bucket_id]):
        bucket_data = train_set[bucket_id]
        data = bucket_data[begin: begin + FLAGS.batch_size]
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          data, bucket_id, False)
        states, last_states = model.step_encoder_decoder(sess, encoder_inputs, decoder_inputs,
                                                         target_weights, bucket_id, True)
        state_list.append(states)
        begin += FLAGS.batch_size

    concat = np.concatenate(state_list, axis=0)
    embedding_states = concat[indices]

    item_embedding = tf.get_variable(embed.tensor_name, [len(deduped_tuple_list), FLAGS.size])
    assign_op = item_embedding.assign(embedding_states)
    sess.run(assign_op)
    saver = tf.train.Saver([item_embedding])
    saver.save(sess, checkpoint_path, global_step=model.global_step)
Example 7
def build_spanish_vocab(min_freq=2):
    """
    将文件中所有的单词提取出来,用id进行表示
    !包括test的单词
    """
    print('build spanish vocab')

    basepath = '../input/processing/'

    train_data = basepath + 'spanish_train.txt'
    test_data = basepath + 'test_b.txt'

    words = defaultdict(int)
    train1 = read_data(train_data)
    for line in train1:
        line_arr = line.split('\t')[1:]
        for seq in line_arr:
            seq = seq.strip()
            seq = re.split(r' +', seq)
            for w in seq:
                words[w] += 1

    test = read_data(test_data)
    for line in test:
        line_arr = line.split('\t')[1:]
        for seq in line_arr:
            seq = seq.strip()
            seq = re.split(r' +', seq)
            for w in seq:
                words[w] += 1

    print(len(words))
    # sort by frequency in descending order
    words_sorted_count = sorted(words.items(), key=lambda x: -x[1])
    words_dict_list = [w[0] for w in words_sorted_count if w[1] >= min_freq]
    words_dict_list.insert(0, '<UNK>')
    words_dict_list.insert(0, '<PAD>')

    with open('../input/words.txt', 'wt', encoding='utf-8') as f:
        for word in words_dict_list:
            f.write(word + '\n')

    print('build spanish vocab done!')
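A hedged sketch of turning words.txt back into a word-to-id mapping; because the file is written one word per line with <PAD> first and <UNK> second, the line index can serve as the id (the loader name is illustrative, not part of the original code):

# Hypothetical loader: the line index becomes the word id (<PAD> -> 0, <UNK> -> 1).
def load_word_vocab(path='../input/words.txt'):
    word2id = {}
    with open(path, 'rt', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            word2id[line.rstrip('\n')] = idx
    return word2id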
Example 8
def run():
    df_data = data_utils.read_data("iris")
    df_train, df_test = train_test_split(df_data)

    k_list = [i + 1 for i in range(5)]

    for k in k_list:
        i = 0
        for _, test_item in df_test.iterrows():
            test_data = test_item.values[:4]
            test_label = test_item["species"]
            pred = knn(test_data, df_train, k)
            if pred == test_label:
                i += 1

        acc = round(i / len(df_test), 2)
        print(f"k: {k}, accuracy: {acc}")
Example 9
    def build_vocab(self):
        """
            build sents is for build vocab
            during multi-lingual task, there are two kinds of sents
            :return: sents
        """
        if self.test_file is None:
            print('test_file is None')
            file_list = [self.train_file, self.dev_file]
        else:
            file_list = [self.train_file, self.dev_file, self.test_file]

        examples = data_utils.read_data(file_list)
        sents = []
        for example in examples:
            sent = example[0]
            sents.append(sent)
        word_vocab = data_utils.build_word_vocab(sents, self.threshold)
        char_vocab = data_utils.build_char_vocab(sents)
        # compute the maximum and average sentence length
        max_sent_len = 0
        avg_sent_len = 0
        for sent in sents:
            if len(sent) > max_sent_len:
                max_sent_len = len(sent)
            avg_sent_len += len(sent)
        avg_sent_len /= len(sents)
        print('task: max_sent_len: {}'.format(max_sent_len))
        print('task: avg_sent_len: {}'.format(avg_sent_len))
        max_word_len = 0
        avg_word_len = 0
        total_len = 0
        for sent in sents:
            for word in sent:
                word = list(word)
                if len(word) > max_word_len:
                    max_word_len = len(word)
                avg_word_len += len(word)
            total_len += len(sent)
        avg_word_len /= total_len
        print('task: max_word_len: {}'.format(max_word_len))
        print('task: avg_word_len: {}'.format(avg_word_len))
        return word_vocab, char_vocab
Example 10
def de_duplicate():
    filename = '../input/processing/spanish_train.txt'
    data = read_data(filename)
    dumplicate = []
    new_data = []
    help_set = set()
    for line in data:
        new_line = line[1:]
        if new_line in help_set:
            dumplicate.append(line)
        else:
            new_data.append(line)
            help_set.add(new_line)

    save_data(
        '../input/processing/dumplicate_spanish_data.txt',
        dumplicate,
    )
    save_data('../input/processing/spanish_train_dedup.txt', new_data)
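A quick illustration of the de-duplication key: each line starts with a one-character label, so line[1:] keeps only the tab-separated question pair, and two lines with the same pair but different labels collapse to a single entry (the example strings below are made up):

# Illustrative only: both lines share the same key once the label is dropped.
line_a = '1\thola mundo\tbuenos dias\n'
line_b = '0\thola mundo\tbuenos dias\n'
assert line_a[1:] == line_b[1:]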
Example 11
    def build_data(self, data_file):
        """
            return the formatted matrix, which is used as the input to deep learning models
            Args: file_list:
              word_vocab:
        """
        self.examples = data_utils.read_data(data_file)
        y = []
        sent_features = []
        sent_lens = []
        ids = []
        for example in self.examples:
            sents = example[0]
            label = self.config.category2id[example[1]]
            char = data_utils.char_to_matrix(sents, self.char_vocab)
            sent = data_utils.sent_to_index(sents, self.word_vocab)
            one_hot_label = data_utils.onehot_vectorize(label, self.config.num_class)
            y.append(one_hot_label)
            # some sentences have length 0; use the average length instead
            if len(sent) == 0:
                sent = np.ones(8)
            sent_features.append((sent, char))
            sent_lens.append(min(len(sent), self.max_sent_len))
        # char and ner features are added here and processed later
        f_sents = []
        f_chars = []
        char_lens = []
        for feature in sent_features:
            f_sents.append(feature[0])
            f_chars.append(feature[1])
        input_x = data_utils.pad_2d_matrix(f_sents, self.max_sent_len)
        input_x_char = data_utils.pad_3d_tensor(f_chars, self.max_sent_len, self.max_word_len)

        for i in range(len(input_x_char)):
            char_lens.append([min(len(word), self.max_word_len) for word in input_x_char[i]])
        x_len = sent_lens
        x_char_len = char_lens
        self.input_x = np.array(input_x, dtype=np.int32)  # [batch_size, sent_len]
        self.input_x_char = np.array(input_x_char, dtype=np.int32)
        self.x_len = np.array(x_len, dtype=np.int32)  # [batch_size]
        self.x_char_len = np.array(x_char_len, dtype=np.int32)
        self.y = np.array(y, dtype=np.float32)  # [batch_size, class_number]
Example 12
def train_model(config):
    print('[INFO] Preparing data for experiment: {}'.format(
        config['experiment']))
    SRC, TRG, train_data, image_train_data, valid_data, test_data, \
    encoder_embeddings_matrix, decoder_embeddings_matrix = data_utils.read_data()
    x_train, y_train = train_data.src, train_data.trg
    x_val, y_val = valid_data.src, valid_data.trg
    x_test, y_test = test_data.src, test_data.trg

    # Re-calculate the vocab size based on the word_idx dictionary
    config['encoder_vocab'] = len(SRC.vocab)
    config['decoder_vocab'] = len(TRG.vocab)

    config['image_size'] = 32

    model = VarSeq2SeqDetAttnModel(config,
                                   encoder_embeddings_matrix,
                                   decoder_embeddings_matrix,
                                   input_word_index=SRC.vocab,
                                   output_word_index=TRG.vocab)

    model.train(x_train, image_train_data, y_train, x_val, y_val, y_val)
Example 13
def tenfold():
    """split train data for 10 fold"""
    base_path = '../input/processing/'

    train_file = base_path + 'train.txt'

    save_path = base_path + '10fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # split the data into ten folds
    train = read_data(train_file)
    size = len(train)
    one_part_size = int(0.1 * size)
    random.shuffle(train)

    for i in range(10):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 9:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            save_data(save_file_path, train[i * one_part_size:])
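For cross-validation, the ten fold files written above would typically be recombined so that one fold serves as validation and the rest as training; a hedged sketch follows (the helper name is illustrative, not part of the original code):

# Hypothetical sketch: hold out fold `valid_idx`, concatenate the other nine.
def load_fold_split(save_path='../input/processing/10fold', valid_idx=0):
    train_lines, valid_lines = [], []
    for i in range(10):
        fold_file = '{}/train_{}.txt'.format(save_path, i)
        with open(fold_file, 'rt', encoding='utf-8') as f:
            lines = f.readlines()
        if i == valid_idx:
            valid_lines.extend(lines)
        else:
            train_lines.extend(lines)
    return train_lines, valid_lines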
Example 14
def five_fold():
    """split train data for 10 fold"""
    base_path = '../input/processing/'

    train_file = config.spanish_train_path

    save_path = base_path + '5fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # split the data into five folds
    train = read_data(train_file)

    size = len(train)
    one_part_size = int(0.2 * size)
    # random.shuffle(train)

    for i in range(5):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 4:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            save_data(save_file_path, train[i * one_part_size:])
Example 15
def build_english_vocab(min_freq=3):
    print('build english word vocab')
    basepath = '../input/processing/'
    train_data = basepath + 'english_train.txt'

    words = defaultdict(int)
    train1 = read_data(train_data)
    for line in train1:
        line_arr = line.split('\t')[1:]
        for seq in line_arr:
            seq = seq.strip()
            seq = re.split(r' +', seq)
            for w in seq:
                words[w] += 1

    # sort by frequency in descending order
    words_sorted_count = sorted(words.items(), key=lambda x: -x[1])
    words_dict_list = [w[0] for w in words_sorted_count if w[1] >= min_freq]

    with open('../input/english_word_vocabs.txt', 'wt', encoding='utf-8') as f:
        for word in words_dict_list:
            f.write(word + '\n')

    print('build english word vocab done!')
Example 16
from model.model import NERModel
from utils.data_utils import (Batch, convert_dataset, create_vocab, read_data,
                              save_vocab, segment_vocab, update_tag_scheme, add_external_words)
from utils.evaluate import evaluate
from utils.train_utils import get_config_proto

if __name__ == "__main__":
    DATA_DIR = "/data/xueyou/fashion/data/"
    checkpoint_dir = '/data/xueyou/ner/category_ner_lstm_dim128_0208/'

    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # read training data
    train_files = [os.path.join(DATA_DIR,"category.ner.ac.train.txt")]
    train_data = read_data(train_files,lower=True)

    # convert tags to iobes
    update_tag_scheme(train_data)

    # create vocab from training data
    word_vocab,tag_vocab = create_vocab(train_data, lower_case=True, min_cnt = 2)
    segment_vocab = segment_vocab()

    # save vocab
    save_vocab(word_vocab, os.path.join(checkpoint_dir,"word.vocab"))
    save_vocab(tag_vocab,os.path.join(checkpoint_dir,"tag.vocab"))
    save_vocab(segment_vocab,os.path.join(checkpoint_dir,"seg.vocab"))

    # convert word into ids
    train_data = convert_dataset(train_data, word_vocab, tag_vocab, segment_vocab)
Example 17
    epoch_num = 300

    for e in range(1, epoch_num + 1):
        mse, w_g, b_g = grad_step(w, b)
        w = w - (lr_w * w_g)
        b = b - (lr_b * b_g)
        if not e % 10:
            print(f"epoch: {e}, mse:{mse:.2f}, w: {w:.2f}, b: {b:.2f}")

    print(f"w: {w:.2f}, b: {b:.2f}")

    plt.scatter(df_data["cp"], df_data["cp_new"], s=20, c="green", alpha=0.5)
    line_x = np.linspace(0, 630)
    line_y = np.array(b + w * line_x)
    plt.plot(line_x, line_y.T, color='red')
    plt.text(200,
             1080,
             f"y={b:.2f}+{w:.2f}*x",
             rotation=30,
             fontsize=14,
             fontstyle="italic")
    plt.xlabel('cp')
    plt.ylabel('cp_new')
    plt.title('Gradient Descent Result')
    plt.show()


if __name__ == '__main__':
    df_data = data_utils.read_data("pokemon")
    gradient_descent_method(df_data)
Example 18
def processing_data_2_step():
    process_base_path = '../input/processing/'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    english_core_nlp = StanfordCoreNLP(core_nlp_path, lang='en')

    with StanfordCoreNLP(core_nlp_path, lang='es') as client:

        english_processing_file = process_base_path + 'english_train.txt'
        spanish_processing_file = process_base_path + 'spanish_train.txt'
        test_processing_file = process_base_path + 'test_b.txt'
        #
        englishs = read_data(english_file)
        spanishs = read_data(spanish_file)

        english_processing = []
        spanish_processing = []

        # for english

        for line in tqdm(englishs):
            lines = line.strip().split('\t')
            assert len(lines) == 3
            lines[1] = text_processing_english(lines[1], english_core_nlp)
            lines[2] = text_processing_english(lines[2], english_core_nlp)

            english_processing.append("{}\t{}\t{}\n".format(
                lines[0], lines[1], lines[2]))

        save_data(english_processing_file, english_processing)

        # for spanish
        for line in tqdm(spanishs):
            line = line.strip().split('\t')
            assert len(line) == 3, print(line)
            line[1] = text_processing_spanish(line[1], client)
            line[2] = text_processing_spanish(line[2], client)

            spanish_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))

        save_data(spanish_processing_file, spanish_processing)
        #
        # for test data
        test = read_data(test_file)
        test_processing = []
        for line in tqdm(test):
            # ipdb.set_trace()
            line = line.strip().split('\t')
            assert len(line) == 3
            line[2] = text_processing_spanish(line[2], client)
            line[1] = text_processing_spanish(line[1], client)

            test_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))

        save_data(test_processing_file, test_processing)

    english_core_nlp.close()

    print('Done')
Example 19
def test(exp_settings):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Load test data.
        print("Reading data in %s" % FLAGS.data_dir)
        test_set = read_data(FLAGS.data_dir, FLAGS.test_data_prefix,
                             FLAGS.max_list_cutoff)
        find_class(exp_settings['train_input_feed']).preprocess_data(
            test_set, exp_settings['train_input_hparams'], exp_settings)
        exp_settings['max_candidate_num'] = test_set.rank_list_size

        test_set.pad(exp_settings['max_candidate_num'])

        # Create model and load parameters.
        model = create_model(sess, exp_settings, test_set, True)

        # Create input feed
        test_input_feed = find_class(exp_settings['test_input_feed'])(
            model, FLAGS.batch_size, exp_settings['test_input_hparams'], sess)

        test_writer = tf.summary.FileWriter(FLAGS.model_dir + '/test_log')

        rerank_scores = []
        summary_list = []
        # Start testing.

        it = 0
        count_batch = 0.0
        batch_size_list = []
        while it < len(test_set.initial_list):
            input_feed, info_map = test_input_feed.get_next_batch(
                it, test_set, check_validation=False)
            _, output_logits, summary = model.step(sess, input_feed, True)
            summary_list.append(summary)
            batch_size_list.append(len(info_map['input_list']))
            for x in range(batch_size_list[-1]):
                rerank_scores.append(output_logits[x])
            it += batch_size_list[-1]
            count_batch += 1.0
            print("Testing {:.0%} finished".format(
                float(it) / len(test_set.initial_list)),
                  end="\r",
                  flush=True)

        print("\n[Done]")
        test_summary = merge_TFSummary(summary_list, batch_size_list)
        test_writer.add_summary(test_summary, it)
        cprint(
            "[Eval]: %s" % (' '.join([
                '%s: %.3f' % (x.tag, x.simple_value)
                for x in test_summary.value
            ])), 'green')

        # get rerank indexes with new scores
        rerank_lists = []
        for i in range(len(rerank_scores)):
            scores = rerank_scores[i]
            rerank_lists.append(
                sorted(range(len(scores)),
                       key=lambda k: scores[k],
                       reverse=True))

        if not os.path.exists(FLAGS.output_dir):
            os.makedirs(FLAGS.output_dir)
        output_ranklist(test_set, rerank_scores, FLAGS.output_dir,
                        FLAGS.test_data_prefix)

    return
Example 20
def train(exp_settings):
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)
    train_set = read_data(FLAGS.data_dir, FLAGS.train_data_prefix,
                          FLAGS.max_list_cutoff)
    # cprint(train_set, 'green')  # <utils.data_utils.Raw_data object at 0x7f9347482d00>
    find_class(exp_settings['train_input_feed']).preprocess_data(
        train_set, exp_settings['train_input_hparams'], exp_settings)
    valid_set = read_data(FLAGS.data_dir, FLAGS.valid_data_prefix,
                          FLAGS.max_list_cutoff)
    find_class(exp_settings['train_input_feed']).preprocess_data(
        valid_set, exp_settings['train_input_hparams'], exp_settings)

    print("Train Rank list size %d" % train_set.rank_list_size)  # 9
    print("Valid Rank list size %d" % valid_set.rank_list_size)  # 9
    exp_settings['max_candidate_num'] = max(train_set.rank_list_size,
                                            valid_set.rank_list_size)
    test_set = None
    if FLAGS.test_while_train:
        test_set = read_data(FLAGS.data_dir, FLAGS.test_data_prefix,
                             FLAGS.max_list_cutoff)
        find_class(exp_settings['train_input_feed']).preprocess_data(
            test_set, exp_settings['train_input_hparams'], exp_settings)
        print("Test Rank list size %d" % test_set.rank_list_size)
        exp_settings['max_candidate_num'] = max(
            test_set.rank_list_size, exp_settings['max_candidate_num'])
        test_set.pad(exp_settings['max_candidate_num'])

    if 'selection_bias_cutoff' not in exp_settings:  # check if there is a limit on the number of items per training query.
        exp_settings[
            'selection_bias_cutoff'] = FLAGS.selection_bias_cutoff if FLAGS.selection_bias_cutoff > 0 else exp_settings[
                'max_candidate_num']

    exp_settings['selection_bias_cutoff'] = min(
        exp_settings['selection_bias_cutoff'],
        exp_settings['max_candidate_num'])
    print(
        'Users can only see the top %d documents for each query in training.' %
        exp_settings['selection_bias_cutoff'])

    # Pad data
    train_set.pad(exp_settings['max_candidate_num'])
    valid_set.pad(exp_settings['max_candidate_num'])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # tf.get_variable_scope().reuse_variables() # zcr --> useless for the error `ValueError: Variable dnn_W_0 does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=tf.AUTO_REUSE in VarScope?`
        # Create model based on the input layer.
        print("Creating model...")
        model = create_model(sess, exp_settings, train_set, False)
        #model.print_info()

        # Create data feed
        train_input_feed = find_class(exp_settings['train_input_feed'])(
            model, FLAGS.batch_size, exp_settings['train_input_hparams'], sess)
        valid_input_feed = find_class(exp_settings['valid_input_feed'])(
            model, FLAGS.batch_size, exp_settings['valid_input_hparams'], sess)
        test_input_feed = None
        if FLAGS.test_while_train:
            test_input_feed = find_class(exp_settings['test_input_feed'])(
                model, FLAGS.batch_size, exp_settings['test_input_hparams'],
                sess)

        # Create tensorboard summarizations.
        train_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, 'train_log'), sess.graph)
        valid_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, 'valid_log'))
        test_writer = None
        if FLAGS.test_while_train:
            test_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.model_dir, 'test_log'))

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        best_perf = None
        while True:
            # Get a batch and make a step.
            start_time = time.time()
            input_feed, info_map = train_input_feed.get_batch(
                train_set, check_validation=True)
            # cprint('input_feed: {}'.format(input_feed), 'green')
            # cprint('info_map: {}'.format(info_map), 'green')
            # cprint('len(info_map[rank_list_idxs]): {}'.format(len(info_map['rank_list_idxs'])), 'green')    # 256
            # cprint('len(info_map[input_list]): {}'.format(len(info_map['input_list'])), 'green')    # 256
            # cprint('len(info_map[click_list]): {}'.format(len(info_map['click_list'])), 'green')    # 256
            # cprint('len(info_map[letor_features]): {}'.format(len(info_map['letor_features'])), 'green')    # 1479
            cprint(
                'info_map[rank_list_idxs]: {}'.format(
                    info_map['rank_list_idxs']), 'green')
            '''
            [12, 7, 17, 0, 0, 13, 9, 8, 10, 4, 18, 8, 10, 6, 5, 15, 14, 10, 6, 3, 16, 1, 10, 0, 18, 1, 19, 15, 3, 2, 18, 7, 6, 8, 13, 4, 11, 11, 5, 2, 10, 1, 19, 2, 14, 6, 18, 14, 9, 1, 5, 11, 19, 4, 6, 12, 15, 11, 19, 9, 15, 3, 4, 16, 6, 6, 7, 0, 10, 17, 4, 14, 8, 14, 10, 8, 13, 6, 14, 17, 4, 1, 6, 1, 7, 0, 15, 3, 14, 4, 6, 6, 17, 19, 7, 3, 7, 7, 14, 18, 0, 16, 14, 16, 10, 9, 15, 6, 0, 12, 17, 9, 4, 2, 16, 17, 10, 16, 4, 2, 12, 12, 13, 14, 4, 17, 6, 9, 1, 3, 12, 19, 17, 10, 3, 4, 15, 19, 17, 0, 5, 10, 19, 8, 7, 4, 17, 17, 0, 12, 14, 7, 9, 0, 6, 10, 12, 15, 2, 5, 19, 7, 19, 16, 6, 2, 11, 1, 17, 3, 1, 10, 9, 0, 16, 12, 17, 19, 12, 1, 1, 18, 3, 19, 12, 13, 16, 4, 1, 2, 19, 15, 3, 12, 2, 12, 9, 18, 5, 13, 13, 2, 4, 10, 6, 4, 4, 9, 0, 0, 6, 15, 1, 11, 1, 15, 19, 8, 19, 3, 9, 1, 19, 4, 14, 18, 13, 0, 8, 11, 6, 17, 1, 18, 16, 14, 14, 14, 4, 13, 13, 4, 3, 8, 6, 14, 3, 19, 2, 2, 19, 0, 2, 9, 18, 0]
            '''
            cprint('info_map[input_list]: {}'.format(info_map['input_list']),
                   'green')
            '''
            A list of lists; each inner list contains 9 items.
            [[0, 1, 2, 3, 4, 5, 6, 1487, 1487], [7, 8, 9, 10, 11, 12, 13, 1487, 1487], [14, 15, 16, 17, 18, 19, 20, 1487, 1487], [21, 22, 23, 24, 1487, 1487, 1487, 1487, 1487], [25, 26, 27, 28, 1487, 1487, 1487, 1487, 1487], [29, 30, 31, 32, 33, 34, 1487, 1487, 1487], [35, 36, 37, 1487, 1487, 1487, 1487, 1487, 1487], [38, 39, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [40, 41, 42, 43, 44, 45, 46, 47, 48], [49, 50, 51, 52, 53, 1487, 1487, 1487, 1487], [54, 55, 56, 1487, 1487, 1487, 1487, 1487, 1487], [57, 58, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [59, 60, 61, 62, 63, 64, 65, 66, 67], [68, 69, 70, 71, 72, 73, 74, 1487, 1487], [75, 76, 77, 78, 1487, 1487, 1487, 1487, 1487], [79, 80, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [81, 82, 83, 84, 85, 86, 87, 88, 89], [90, 91, 92, 93, 94, 95, 96, 97, 98], [99, 100, 101, 102, 103, 104, 105, 1487, 1487], [106, 107, 108, 1487, 1487, 1487, 1487, 1487, 1487], [109, 110, 111, 112, 113, 114, 115, 116, 1487], [117, 118, 119, 120, 1487, 1487, 1487, 1487, 1487], [121, 122, 123, 124, 125, 126, 127, 128, 129], [130, 131, 132, 133, 1487, 1487, 1487, 1487, 1487], [134, 135, 136, 1487, 1487, 1487, 1487, 1487, 1487], [137, 138, 139, 140, 1487, 1487, 1487, 1487, 1487], [141, 142, 143, 144, 145, 146, 147, 148, 149], [150, 151, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [152, 153, 154, 1487, 1487, 1487, 1487, 1487, 1487], [155, 156, 157, 158, 159, 160, 161, 162, 163], [164, 165, 166, 1487, 1487, 1487, 1487, 1487, 1487], [167, 168, 169, 170, 171, 172, 173, 1487, 1487], [174, 175, 176, 177, 178, 179, 180, 1487, 1487], [181, 182, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [183, 184, 185, 186, 187, 188, 1487, 1487, 1487], [189, 190, 191, 192, 193, 1487, 1487, 1487, 1487], [194, 195, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [196, 197, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [198, 199, 200, 201, 1487, 1487, 1487, 1487, 1487], [202, 203, 204, 205, 206, 207, 208, 209, 210], [211, 212, 213, 214, 215, 216, 217, 218, 219], [220, 221, 222, 223, 1487, 1487, 1487, 1487, 1487], [224, 225, 226, 227, 228, 229, 230, 231, 232], [233, 234, 235, 236, 237, 238, 239, 240, 241], [242, 243, 244, 245, 246, 247, 248, 249, 250], [251, 252, 253, 254, 255, 256, 257, 1487, 1487], [258, 259, 260, 1487, 1487, 1487, 1487, 1487, 1487], [261, 262, 263, 264, 265, 266, 267, 268, 269], [270, 271, 272, 1487, 1487, 1487, 1487, 1487, 1487], [273, 274, 275, 276, 1487, 1487, 1487, 1487, 1487], [277, 278, 279, 280, 1487, 1487, 1487, 1487, 1487], [281, 282, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [283, 284, 285, 286, 287, 288, 289, 290, 291], [292, 293, 294, 295, 296, 1487, 1487, 1487, 1487], [297, 298, 299, 300, 301, 302, 303, 1487, 1487], [304, 305, 306, 307, 308, 309, 310, 1487, 1487], [311, 312, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [313, 314, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [315, 316, 317, 318, 319, 320, 321, 322, 323], [324, 325, 326, 1487, 1487, 1487, 1487, 1487, 1487], [327, 328, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [329, 330, 331, 1487, 1487, 1487, 1487, 1487, 1487], [332, 333, 334, 335, 336, 1487, 1487, 1487, 1487], [337, 338, 339, 340, 341, 342, 343, 344, 1487], [345, 346, 347, 348, 349, 350, 351, 1487, 1487], [352, 353, 354, 355, 356, 357, 358, 1487, 1487], [359, 360, 361, 362, 363, 364, 365, 1487, 1487], [366, 367, 368, 369, 1487, 1487, 1487, 1487, 1487], [370, 371, 372, 373, 374, 375, 376, 377, 378], [379, 380, 381, 382, 383, 384, 385, 1487, 1487], [386, 387, 388, 389, 390, 1487, 1487, 1487, 1487], [391, 392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 1487, 
1487, 1487, 1487, 1487, 1487, 1487], [402, 403, 404, 405, 406, 407, 408, 409, 410], [411, 412, 413, 414, 415, 416, 417, 418, 419], [420, 421, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [422, 423, 424, 425, 426, 427, 1487, 1487, 1487], [428, 429, 430, 431, 432, 433, 434, 1487, 1487], [435, 436, 437, 438, 439, 440, 441, 442, 443], [444, 445, 446, 447, 448, 449, 450, 1487, 1487], [451, 452, 453, 454, 455, 1487, 1487, 1487, 1487], [456, 457, 458, 459, 1487, 1487, 1487, 1487, 1487], [460, 461, 462, 463, 464, 465, 466, 1487, 1487], [467, 468, 469, 470, 1487, 1487, 1487, 1487, 1487], [471, 472, 473, 474, 475, 476, 477, 1487, 1487], [478, 479, 480, 481, 1487, 1487, 1487, 1487, 1487], [482, 483, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [484, 485, 486, 1487, 1487, 1487, 1487, 1487, 1487], [487, 488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 1487, 1487, 1487, 1487], [501, 502, 503, 504, 505, 506, 507, 1487, 1487], [508, 509, 510, 511, 512, 513, 514, 1487, 1487], [515, 516, 517, 518, 519, 520, 521, 1487, 1487], [522, 523, 524, 525, 526, 527, 528, 529, 530], [531, 532, 533, 534, 535, 536, 537, 1487, 1487], [538, 539, 540, 1487, 1487, 1487, 1487, 1487, 1487], [541, 542, 543, 544, 545, 546, 547, 1487, 1487], [548, 549, 550, 551, 552, 553, 554, 1487, 1487], [555, 556, 557, 558, 559, 560, 561, 562, 563], [564, 565, 566, 1487, 1487, 1487, 1487, 1487, 1487], [567, 568, 569, 570, 1487, 1487, 1487, 1487, 1487], [571, 572, 573, 574, 575, 576, 577, 578, 1487], [579, 580, 581, 582, 583, 584, 585, 586, 587], [588, 589, 590, 591, 592, 593, 594, 595, 1487], [596, 597, 598, 599, 600, 601, 602, 603, 604], [605, 606, 607, 1487, 1487, 1487, 1487, 1487, 1487], [608, 609, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [610, 611, 612, 613, 614, 615, 616, 1487, 1487], [617, 618, 619, 620, 1487, 1487, 1487, 1487, 1487], [621, 622, 623, 624, 625, 626, 627, 1487, 1487], [628, 629, 630, 631, 632, 633, 634, 1487, 1487], [635, 636, 637, 1487, 1487, 1487, 1487, 1487, 1487], [638, 639, 640, 641, 642, 1487, 1487, 1487, 1487], [643, 644, 645, 646, 647, 648, 649, 650, 651], [652, 653, 654, 655, 656, 657, 658, 659, 1487], [660, 661, 662, 663, 664, 665, 666, 1487, 1487], [667, 668, 669, 670, 671, 672, 673, 674, 675], [676, 677, 678, 679, 680, 681, 682, 683, 1487], [684, 685, 686, 687, 688, 1487, 1487, 1487, 1487], [689, 690, 691, 692, 693, 694, 695, 696, 697], [698, 699, 700, 701, 702, 703, 704, 1487, 1487], [705, 706, 707, 708, 709, 710, 711, 1487, 1487], [712, 713, 714, 715, 716, 717, 1487, 1487, 1487], [718, 719, 720, 721, 722, 723, 724, 725, 726], [727, 728, 729, 730, 731, 1487, 1487, 1487, 1487], [732, 733, 734, 735, 736, 737, 738, 1487, 1487], [739, 740, 741, 742, 743, 744, 745, 1487, 1487], [746, 747, 748, 1487, 1487, 1487, 1487, 1487, 1487], [749, 750, 751, 752, 1487, 1487, 1487, 1487, 1487], [753, 754, 755, 1487, 1487, 1487, 1487, 1487, 1487], [756, 757, 758, 759, 760, 761, 762, 1487, 1487], [763, 764, 765, 766, 767, 768, 769, 770, 771], [772, 773, 774, 775, 776, 777, 778, 1487, 1487], [779, 780, 781, 782, 783, 784, 785, 786, 787], [788, 789, 790, 1487, 1487, 1487, 1487, 1487, 1487], [791, 792, 793, 794, 795, 1487, 1487, 1487, 1487], [796, 797, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [798, 799, 800, 801, 802, 803, 804, 805, 806], [807, 808, 809, 810, 811, 812, 813, 1487, 1487], [814, 815, 816, 817, 1487, 1487, 1487, 1487, 1487], [818, 819, 820, 821, 1487, 1487, 1487, 1487, 1487], [822, 823, 824, 825, 826, 827, 828, 829, 830], [831, 832, 833, 834, 835, 836, 837, 838, 839], [840, 841, 1487, 1487, 1487, 1487, 
1487, 1487, 1487], [842, 843, 844, 845, 846, 847, 848, 1487, 1487], [849, 850, 851, 852, 853, 1487, 1487, 1487, 1487], [854, 855, 856, 857, 858, 859, 860, 1487, 1487], [861, 862, 863, 864, 865, 866, 867, 1487, 1487], [868, 869, 870, 871, 1487, 1487, 1487, 1487, 1487], [872, 873, 874, 875, 876, 877, 878, 1487, 1487], [879, 880, 881, 882, 883, 884, 885, 886, 887], [888, 889, 890, 891, 892, 893, 894, 1487, 1487], [895, 896, 897, 1487, 1487, 1487, 1487, 1487, 1487], [898, 899, 900, 901, 1487, 1487, 1487, 1487, 1487], [902, 903, 904, 905, 906, 907, 908, 1487, 1487], [909, 910, 911, 912, 913, 914, 915, 916, 917], [918, 919, 920, 921, 922, 923, 924, 1487, 1487], [925, 926, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [927, 928, 929, 930, 931, 932, 933, 934, 935], [936, 937, 938, 939, 1487, 1487, 1487, 1487, 1487], [940, 941, 942, 943, 944, 945, 946, 947, 948], [949, 950, 951, 952, 953, 954, 955, 1487, 1487], [956, 957, 958, 959, 960, 961, 962, 963, 964], [965, 966, 967, 968, 969, 970, 971, 972, 1487], [973, 974, 975, 976, 977, 978, 979, 1487, 1487], [980, 981, 982, 983, 984, 985, 986, 987, 988], [989, 990, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [991, 992, 993, 994, 1487, 1487, 1487, 1487, 1487], [995, 996, 997, 998, 999, 1000, 1001, 1487, 1487], [1002, 1003, 1004, 1487, 1487, 1487, 1487, 1487, 1487], [1005, 1006, 1007, 1008, 1487, 1487, 1487, 1487, 1487], [1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017], [1018, 1019, 1020, 1487, 1487, 1487, 1487, 1487, 1487], [1021, 1022, 1023, 1024, 1487, 1487, 1487, 1487, 1487], [1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1487], [1033, 1034, 1035, 1036, 1037, 1038, 1039, 1487, 1487], [1040, 1041, 1042, 1043, 1044, 1045, 1046, 1487, 1487], [1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055], [1056, 1057, 1058, 1059, 1060, 1061, 1062, 1487, 1487], [1063, 1064, 1065, 1066, 1487, 1487, 1487, 1487, 1487], [1067, 1068, 1069, 1070, 1487, 1487, 1487, 1487, 1487], [1071, 1072, 1073, 1487, 1487, 1487, 1487, 1487, 1487], [1074, 1075, 1076, 1487, 1487, 1487, 1487, 1487, 1487], [1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085], [1086, 1087, 1088, 1089, 1090, 1091, 1092, 1487, 1487], [1093, 1094, 1095, 1096, 1097, 1098, 1487, 1487, 1487], [1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1487], [1107, 1108, 1109, 1110, 1111, 1487, 1487, 1487, 1487], [1112, 1113, 1114, 1115, 1487, 1487, 1487, 1487, 1487], [1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124], [1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133], [1134, 1135, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1136, 1137, 1138, 1487, 1487, 1487, 1487, 1487, 1487], [1139, 1140, 1141, 1142, 1143, 1144, 1145, 1487, 1487], [1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154], [1155, 1156, 1157, 1158, 1159, 1160, 1161, 1487, 1487], [1162, 1163, 1164, 1487, 1487, 1487, 1487, 1487, 1487], [1165, 1166, 1167, 1487, 1487, 1487, 1487, 1487, 1487], [1168, 1169, 1170, 1171, 1487, 1487, 1487, 1487, 1487], [1172, 1173, 1174, 1175, 1176, 1177, 1487, 1487, 1487], [1178, 1179, 1180, 1181, 1182, 1183, 1487, 1487, 1487], [1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192], [1193, 1194, 1195, 1196, 1197, 1487, 1487, 1487, 1487], [1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206], [1207, 1208, 1209, 1210, 1211, 1212, 1213, 1487, 1487], [1214, 1215, 1216, 1217, 1218, 1487, 1487, 1487, 1487], [1219, 1220, 1221, 1222, 1223, 1487, 1487, 1487, 1487], [1224, 1225, 1226, 1487, 1487, 1487, 1487, 1487, 1487], [1227, 1228, 1229, 1230, 1487, 1487, 1487, 1487, 1487], [1231, 1232, 1233, 1234, 1487, 1487, 1487, 1487, 
1487], [1235, 1236, 1237, 1238, 1239, 1240, 1241, 1487, 1487], [1242, 1243, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1244, 1245, 1246, 1247, 1487, 1487, 1487, 1487, 1487], [1248, 1249, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1250, 1251, 1252, 1253, 1487, 1487, 1487, 1487, 1487], [1254, 1255, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264], [1265, 1266, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275], [1276, 1277, 1278, 1487, 1487, 1487, 1487, 1487, 1487], [1279, 1280, 1281, 1487, 1487, 1487, 1487, 1487, 1487], [1282, 1283, 1284, 1285, 1487, 1487, 1487, 1487, 1487], [1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294], [1295, 1296, 1297, 1298, 1299, 1487, 1487, 1487, 1487], [1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308], [1309, 1310, 1311, 1487, 1487, 1487, 1487, 1487, 1487], [1312, 1313, 1314, 1315, 1316, 1317, 1487, 1487, 1487], [1318, 1319, 1320, 1321, 1487, 1487, 1487, 1487, 1487], [1322, 1323, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1324, 1325, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1326, 1327, 1328, 1329, 1330, 1331, 1332, 1487, 1487], [1333, 1334, 1335, 1336, 1337, 1338, 1339, 1487, 1487], [1340, 1341, 1342, 1343, 1487, 1487, 1487, 1487, 1487], [1344, 1345, 1346, 1487, 1487, 1487, 1487, 1487, 1487], [1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1487], [1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363], [1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372], [1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381], [1382, 1383, 1384, 1385, 1386, 1487, 1487, 1487, 1487], [1387, 1388, 1389, 1390, 1391, 1392, 1487, 1487, 1487], [1393, 1394, 1395, 1396, 1397, 1398, 1487, 1487, 1487], [1399, 1400, 1401, 1402, 1403, 1487, 1487, 1487, 1487], [1404, 1405, 1406, 1487, 1487, 1487, 1487, 1487, 1487], [1407, 1408, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1409, 1410, 1411, 1412, 1413, 1414, 1415, 1487, 1487], [1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424], [1425, 1426, 1427, 1487, 1487, 1487, 1487, 1487, 1487], [1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436], [1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445], [1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454], [1455, 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463], [1464, 1465, 1466, 1467, 1487, 1487, 1487, 1487, 1487], [1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476], [1477, 1478, 1479, 1487, 1487, 1487, 1487, 1487, 1487], [1480, 1481, 1482, 1487, 1487, 1487, 1487, 1487, 1487], [1483, 1484, 1485, 1486, 1487, 1487, 1487, 1487, 1487]]
            '''
            cprint('info_map[click_list]: {}'.format(info_map['click_list']),
                   'green')
            '''
            A list of lists; each inner list contains 9 items; a value of 1 means the item was clicked, 0 means it was not.
            [[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 
0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 
0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0]]
            '''
            exit()
            step_loss, _, summary = model.step(sess, input_feed, False)
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1
            train_writer.add_summary(summary, model.global_step.eval())

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                cprint(
                    "global step {} learning rate {:.4f} step-time {:.2f} loss {:.4f}"
                    .format(model.global_step.eval(),
                            model.learning_rate.eval(), step_time, loss),
                    'green')
                previous_losses.append(loss)

                # Validate model
                def validate_model(data_set, data_input_feed):
                    it = 0
                    count_batch = 0.0
                    summary_list = []
                    batch_size_list = []
                    while it < len(data_set.initial_list):
                        input_feed, info_map = data_input_feed.get_next_batch(
                            it, data_set, check_validation=False)
                        _, _, summary = model.step(sess, input_feed, True)
                        summary_list.append(summary)
                        batch_size_list.append(len(info_map['input_list']))
                        it += batch_size_list[-1]
                        count_batch += 1.0
                    return merge_TFSummary(summary_list, batch_size_list)

                valid_summary = validate_model(valid_set, valid_input_feed)
                valid_writer.add_summary(valid_summary,
                                         model.global_step.eval())
                cprint(
                    "[Valid]: %s" % (' '.join([
                        '%s: %.3f' % (x.tag, x.simple_value)
                        for x in valid_summary.value
                    ])), 'green')

                if FLAGS.test_while_train:
                    test_summary = validate_model(test_set, test_input_feed)
                    test_writer.add_summary(test_summary,
                                            model.global_step.eval())
                    cprint(
                        "[Test]: %s" % (' '.join([
                            '%s:%.3f' % (x.tag, x.simple_value)
                            for x in test_summary.value
                        ])), 'green')

                # Save checkpoint if the objective metric on the validation set is better
                if "objective_metric" in exp_settings:
                    for x in valid_summary.value:
                        if x.tag == exp_settings["objective_metric"]:
                            if current_step >= FLAGS.start_saving_iteration:
                                if best_perf == None or best_perf < x.simple_value:
                                    checkpoint_path = os.path.join(
                                        FLAGS.model_dir, "%s.ckpt" %
                                        exp_settings['learning_algorithm'])
                                    model.saver.save(
                                        sess,
                                        checkpoint_path,
                                        global_step=model.global_step)
                                    best_perf = x.simple_value
                                    print('Save model, valid %s:%.3f' %
                                          (x.tag, best_perf))
                                    break
                # Save checkpoint if there is no objective metric
                if best_perf == None and current_step > FLAGS.start_saving_iteration:
                    checkpoint_path = os.path.join(
                        FLAGS.model_dir,
                        "%s.ckpt" % exp_settings['learning_algorithm'])
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=model.global_step)
                if loss == float('inf'):
                    break

                step_time, loss = 0.0, 0.0
                sys.stdout.flush()

                if FLAGS.max_train_iteration > 0 and current_step > FLAGS.max_train_iteration:
                    break
Example 21
def infer(config, test_bleu=True):

    work_space = config["workspace"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    vocab_file = config["inference"]["vocab_file"]

    # Build the model
    (
        encode_num_layers,
        encode_num_units,
        encode_cell_type,
        encode_bidir,
        attn_num_units,
        decode_num_layers,
        decode_num_units,
        decode_cell_type,
        use_user_feat,
        use_gate_memory,
        use_user_desc,
        use_blog_user_coattn,
        use_external_desc_express,
        use_external_feat_express,
        user_feat_dim,
        user_feat_unit,
        user_feat_mem_unit,
        desc_rnn_unit,
        desc_attn_num_units,
        user_map_unit,
    ) = get_pcgn_model_config(config)

    (infer_file, batch_size, is_beam_search, beam_size,
     infer_source_max_length, infer_target_max_length, infer_desc_max_length,
     infer_max_iter, output_path, gpu_fraction,
     gpu_id) = get_pcgn_infer_config(config)

    print("Building model architecture ...")
    pcg_model = PCGNModel(
        mode='infer',
        model_name=name,
        vocab_size=vocab_size,
        embedding_size=embed_size,
        encode_num_layers=encode_num_layers,
        encode_num_units=encode_num_units,
        encode_cell_type=encode_cell_type,
        encode_bidir=encode_bidir,
        attn_num_units=attn_num_units,
        decode_num_layers=decode_num_layers,
        decode_num_units=decode_num_units,
        decode_cell_type=decode_cell_type,
        use_user_feat=use_user_feat,
        use_gate_memory=use_gate_memory,
        use_user_desc=use_user_desc,
        use_blog_user_coattn=use_blog_user_coattn,
        use_external_desc_express=use_external_desc_express,
        use_external_feat_express=use_external_feat_express,
        user_feat_dim=user_feat_dim,
        user_feat_unit=user_feat_unit,
        user_feat_mem_unit=user_feat_mem_unit,
        desc_rnn_unit=desc_rnn_unit,
        desc_attn_num_units=desc_attn_num_units,
        user_map_unit=user_map_unit,
        batch_size=batch_size,
        beam_search=is_beam_search,
        beam_size=beam_size,
        infer_max_iter=infer_max_iter,
        target_max_length=infer_target_max_length,
    )

    print("\tDone.")

    logdir = '%s/nn_models/' % work_space
    # Set up session
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_id = config["training"]["gpu_id"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id,
                                allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))

    init = tf.global_variables_initializer()
    sess.run(init)
    try:
        saved_global_step = load_model(pcg_model.saver, sess, logdir)
        if saved_global_step is None:
            raise ValueError("Cannot find the checkpoint to restore from.")

    except Exception:
        print("Something went wrong while restoring checkpoint. ")
        raise

    # ##### Inference #####
    # Load data
    print("Loading inference data ...")

    # Load vocabularies.
    vocab_table, reverse_vocab_table = create_vocab_tables(vocab_file)

    infer_dataset = read_data(infer_file)
    print(' # infer data:', len(infer_dataset))
    print("\tDone.")

    # Inference
    print("Start inferring ...")
    final_result = pd.DataFrame()
    infer_step = int(len(infer_dataset) / batch_size)
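    # Integer division here means any remainder samples (len(infer_dataset) % batch_size) are not inferred.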
    preds = []
    for ith in range(infer_step):
        print('step:', ith)
        start = ith * batch_size
        end = (ith + 1) * batch_size
        batch = get_pcgn_batch(infer_dataset[start:end], 'infer', -1,
                               infer_source_max_length,
                               infer_target_max_length, infer_desc_max_length)

        result = pcg_model.infer(sess, batch)
        result1 = batch_token_to_str(result[:, 0, :], reverse_vocab_table)
        #result2 = batch_token_to_str(result[:, 1,:], reverse_vocab_table)
        #result3 = batch_token_to_str(result[:, 2,:], reverse_vocab_table)
        #result4 = batch_token_to_str(result[:, 3,:], reverse_vocab_table)
        #result5 = batch_token_to_str(result[:, 4,:], reverse_vocab_table)
        preds += list(result1)

        if test_bleu:
            blog = batch_token_to_str(batch[0], reverse_vocab_table)
            cmt = batch_token_to_str(batch[2], reverse_vocab_table)
            desc = batch_token_to_str(batch[6], reverse_vocab_table)
            feat_df = featinds2df(batch[8])

            df_result = pd.DataFrame({
                'Blog': blog,
                'Comment': cmt,
                'Individual_Description': desc,
                'Prediction': result1,
            })
            df_result = pd.concat([df_result, feat_df], axis=1)
            final_result = pd.concat([final_result, df_result])

    out_path = config["inference"]["output_path"] + 'prediction' + '.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(preds))

    if test_bleu:
        bleu2 = calc_bleu2(final_result['Prediction'].values,
                           final_result['Comment'].values)
        print('test bleu:', bleu2)
        bleurecord = 'test_size:{}\trestore_step:{}\n'.format(
            str(int(infer_step * batch_size)), str(saved_global_step))
        bleurecord += 'bleu2:{}\n\n'.format(str(bleu2[0]))
        with open(logdir + 'bleu.txt', 'a') as f:
            f.write(bleurecord)

        out_path = config["inference"]["output_path"] + 'prediction' + '.csv'
        final_result.to_csv(out_path, index=False)

    print("\tDone.")
Ejemplo n.º 22
0
import argparse
import collections
import itertools
import pickle

import data_utils as du  # assumed import path for the helper module used below (read_data, OOV_TOKEN, PAD_TOKEN)

parser = argparse.ArgumentParser()
parser.add_argument('--vocab_data',  # default path omitted here; pass --vocab_data explicitly
                    type=str, help='Path to file containing paths to the data to use.')
parser.add_argument('--data_root', default='data/repos', type=str,
                    help='Path to the root folder containing the cloned repositories.')
parser.add_argument('--out_path', default='data', type=str, help='Path to save vocabulary object.')
parser.add_argument('--oov_threshold', default=20, type=int, help='Ignore words that appear less than this many times.')
args = parser.parse_args()


def build_vocab(dataset, oov_threshold):
    counter = collections.Counter(itertools.chain(itertools.chain(*dataset)))
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])

    count_pairs = (p for p in count_pairs if p[1] > oov_threshold)

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(2, len(words) + 2)))
    word_to_id[du.OOV_TOKEN] = du.OOV_IDX
    word_to_id[du.PAD_TOKEN] = du.PAD_IDX
    return word_to_id


if __name__ == "__main__":
    data_for_vocab = args.vocab_data
    data_root = args.data_root
    data = du.read_data(data_for_vocab, data_root)
    print("Loaded {} files".format(len(data)))
    vocab = build_vocab(data, args.oov_threshold)
    print("Vocabulary size is:", len(vocab))
    out_file = args.out_path + "/vocab.p"
    pickle.dump(vocab, open(out_file, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
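# Usage sketch for the pickled vocabulary produced above; OOV_IDX is assumed to mirror
# du.OOV_IDX, and 'data/vocab.p' follows the script's default --out_path of 'data'.
import pickle

OOV_IDX = 1  # assumption: the value of du.OOV_IDX

def encode_tokens(tokens, word_to_id):
    """Map tokens to ids, falling back to the OOV index for words pruned by the threshold."""
    return [word_to_id.get(tok, OOV_IDX) for tok in tokens]

with open('data/vocab.p', 'rb') as f:
    vocab = pickle.load(f)
print(encode_tokens(['def', 'some_rare_identifier'], vocab))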
Ejemplo n.º 23
0
  def train():
    """Train a query2vec model"""
    # Prepare train data.
    print("Preparing Seq2seq Model in %s" % FLAGS.train_dir)
    train_data, test_data, _ = data_utils.prepare_data(FLAGS.train_dir, FLAGS.vocab_size)
    checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.seq2seq_model)

    print("Loading training data from %s" % train_data)
    print("Loading development data from %s" % test_data)

    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options,
                                          intra_op_parallelism_threads=20)) as sess:
      # Create model.
      print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
      with tf.device("/gpu:0"):
        model = model_helper.create_model(sess, False)

      # Read data into buckets and compute their sizes.
      print("Reading development and training data (limit: %d)."
            % FLAGS.max_train_data_size)
      test_set = data_utils.read_data(test_data)
      train_set = data_utils.read_data(train_data, max_size=FLAGS.max_train_data_size)
      train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
      train_total_size = float(sum(train_bucket_sizes))

      # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
      # to select a bucket. The length of [scale[i], scale[i+1]] is proportional to
      # the size of the i-th training bucket, as used later.
      train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                             for i in xrange(len(train_bucket_sizes))]

      # This is the training loop.
      step_time, loss = 0.0, 0.0
      current_step = 0
      previous_losses = []
      prev_loss = [1000000] * len(_buckets)

      train_writer = tf.summary.FileWriter(os.path.join("summary/train"), sess.graph)
      test_writer = tf.summary.FileWriter(os.path.join("summary/test"), sess.graph)
      while True:
        # Choose a bucket according to data distribution. We pick a random number
        # in [0, 1] and use the corresponding interval in train_buckets_scale.
        random_number_01 = np.random.random_sample()
        bucket_id = min([i for i in xrange(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number_01])

        # Get a batch and make a step.
        start_time = time.time()
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set[bucket_id], bucket_id)
        summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                target_weights, bucket_id, False)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        current_step += 1
        if current_step % FLAGS.steps_per_summary == 0:
          train_writer.add_summary(summaries, current_step)
          train_writer.flush()
          print('Step: %s' % current_step)
        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
          # Print statistics for the previous epoch.
          perplexity = math.exp(loss) if loss < 300 else float('inf')
          print("global step %d learning rate %.4f step-time %.2f perplexity "
                "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                          step_time, perplexity))
          # Decrease learning rate if no improvement was seen over last 3 times.
          if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
            sess.run(model.learning_rate_decay_op)
          previous_losses.append(loss)
          # Save checkpoint and zero timer and loss.
          step_time, loss = 0.0, 0.0
          # Run evals on development set and print their perplexity.
          count = 0
          for bucket_id in xrange(len(_buckets)):
            if len(test_set[bucket_id]) == 0:
              print("  eval: empty bucket %d" % (bucket_id))
              continue
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              test_set[bucket_id], bucket_id)
            summaries, _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                    target_weights, bucket_id, True)
            test_writer.add_summary(summaries, current_step)
            eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
            if eval_ppx < prev_loss[bucket_id]:
              prev_loss[bucket_id] = eval_ppx
              count += 1
            print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))

          if count > len(_buckets) / 3:
            print("saving model...")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
          sys.stdout.flush()
          test_writer.flush()
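# Standalone sketch of the bucket-sampling scheme described in the comments above: a bucket is
# chosen with probability proportional to its size. The bucket sizes are illustrative numbers.
import numpy as np

train_bucket_sizes = [100, 300, 600]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]   # -> [0.1, 0.4, 1.0]
random_number_01 = np.random.random_sample()                      # uniform in [0, 1)
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print(bucket_id)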
Ejemplo n.º 24
0
    TRAIN_DATA_DIR = '/data/public/NER/ner/'
    #external_words_fname = '/data/xueyou/ner/sogou.words.txt'
    checkpoint_dir = '/data/xueyou/ner/ner_lstm_dim256_no_external_words_0201/'

    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    #add_external_words(external_words_fname)

    # read training data
    train_files = [
        os.path.join(DATA_DIR, "example.train"),
        os.path.join(TRAIN_DATA_DIR, "people.199801.tagged.txt"),
        os.path.join(TRAIN_DATA_DIR, "boson_nlp.tagged.txt")
    ]
    train_data = read_data(train_files, lower=True)

    # convert tags to iobes
    update_tag_scheme(train_data)

    # create vocab from training data
    word_vocab, tag_vocab = create_vocab(train_data,
                                         lower_case=True,
                                         min_cnt=2)
    seg_vocab = segment_vocab()  # renamed so the segment_vocab() helper is not shadowed

    # save vocab
    save_vocab(word_vocab, os.path.join(checkpoint_dir, "word.vocab"))
    save_vocab(tag_vocab, os.path.join(checkpoint_dir, "tag.vocab"))
    save_vocab(seg_vocab, os.path.join(checkpoint_dir, "seg.vocab"))
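# update_tag_scheme() above converts the tags to IOBES; a common form of that conversion,
# shown as a standalone sketch (the project's implementation may differ):
def iob_to_iobes(tags):
    """Convert one sentence's IOB2 tags to IOBES."""
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            new_tags.append(tag if nxt == 'I-' + tag[2:] else 'S-' + tag[2:])
        elif tag.startswith('I-'):
            new_tags.append(tag if nxt == 'I-' + tag[2:] else 'E-' + tag[2:])
        else:
            raise ValueError('unexpected tag: %s' % tag)
    return new_tags

print(iob_to_iobes(['B-PER', 'I-PER', 'O', 'B-LOC']))  # ['B-PER', 'E-PER', 'O', 'S-LOC']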
Ejemplo n.º 25
0
def main():
    parser = argparse.ArgumentParser()
    add_argument(parser)
    args = parser.parse_args()

    config = Config()

    train_data = read_data(config.train_data_files, config.model)
    # Try downsampling the negative samples
    # train_data = sample(train_data)
    eval_data = read_data(config.eval_data_files, config.model)
    # train_data_sen = read_data_sen("data/data_tech.train")
    # eval_data_sen = read_data_sen("data/data_tech.eval")
    # Here the vocabulary of the pretrained word vectors is used as the model's vocabulary
    create_vocab_from_pretrained_w2v(config.w2v_path, config.word_vocab_file)
    create_tag_vocab_from_data(train_data, config.tag_vocab_file)

    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    # convert word into ids
    train_data = convert_dataset(train_data, word2id, tag2id,
                                 config.sentence_length, config.num_classes,
                                 config.model)
    # train_data_sen = convert_dataset_sen(train_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print(train_data[0])
    eval_data = convert_dataset(eval_data, word2id, tag2id,
                                config.sentence_length, config.num_classes,
                                config.model)
    # eval_data_sen = convert_dataset_sen(eval_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print("train_data size: {0}".format(len(train_data)))

    if os.path.exists(os.path.join(config.checkpoint_dir, "config.pkl")):
        config = pickle.load(
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'rb'))
    else:
        pickle.dump(
            config,
            open(os.path.join(config.checkpoint_dir, "config.pkl"), 'wb'))

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.init()

        batch_manager = Batch_self_attention(train_data, config.batch_size)
        batch_manager_eval = Batch_self_attention(eval_data, config.batch_size)
        # batch_manager = Batch(train_data, config.batch_size)
        # batch_manager_eval = Batch(eval_data, config.batch_size)
        epoches = config.epoch
        max_acc = 0
        for i in range(epoches):
            for batch in batch_manager.next_batch():
                # print(batch)
                loss, accuracy, global_step = model.train_one_step(*zip(
                    *batch))
                # key_shape, query_shape = model.test(*zip(*batch))
                # print(key_shape, query_shape)
                # break
            train_accuracy = evaluate(model, batch_manager)
            eval_accuracy = evaluate(model, batch_manager_eval)
            # train_accuracy = evaluate_attention(model, train_data_sen, id2tag)
            # eval_accuracy = evaluate_attention(model, eval_data_sen, id2tag)
            print("epoch - {0}      step - {1}      loss - {2}      train_accuracy - {3}    eval_accuracy - {4}"\
                .format(i, global_step, loss, train_accuracy, eval_accuracy))

            # train_accuracy = evaluate_attention(model, train_data_sen, id2tag)
            # eval_accuracy = evaluate_attention(model, eval_data_sen, id2tag)
            # print("epoch - {0}      step - {1}      loss - {2}      train_accuracy - {3}    eval_accuracy - {4}"\
            #         .format(i, global_step, loss, train_accuracy, eval_accuracy))

            if max_acc < eval_accuracy:
                max_acc = eval_accuracy
                model.save_model()
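# A rough sketch of what an evaluate() helper like the one called above could look like;
# the predict() method and the (features, label) batch layout are assumptions, not the
# project's actual API.
def evaluate_sketch(model, batch_manager):
    """Accuracy over all batches produced by the batch manager."""
    correct, total = 0, 0
    for batch in batch_manager.next_batch():
        features, labels = zip(*batch)
        predictions = model.predict(features)  # assumed model method
        correct += sum(int(p == y) for p, y in zip(predictions, labels))
        total += len(labels)
    return correct / max(total, 1)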
Ejemplo n.º 26
0
def main(config):

	# set up workspace
	work_space = config["workspace"]
	tf_board = config["tf_board"]
	setup_workpath(work_space)
	name = config["Name"]

	# Construct or load embeddings
	print("Initializing embeddings ...")
	vocab_size = config["embeddings"]["vocab_size"]
	embed_size = config["embeddings"]["embed_size"]

	# Build the model and compute losses
	(encode_num_layers, encode_num_units, encode_cell_type, encode_bidir,
	 attn_num_units, decode_num_layers, decode_num_units, decode_cell_type,
	 use_user_feat,use_gate_memory,use_user_desc,use_blog_user_coattn,
	 use_external_desc_express,use_external_feat_express,
	 user_feat_dim,user_feat_unit,user_feat_mem_unit,
	 desc_rnn_unit,desc_attn_num_units,user_map_unit,
	 ) = get_pcgn_model_config(config)

	(train_file, dev_file,
	 source_max_length, target_max_length, desc_max_length,
	 gpu_fraction, gpu_id, train_steps, checkpoint_every, print_every,
	 batch_size,is_beam_search,beam_size,infer_max_iter,
	 l2_regularize,learning_rate,max_checkpoints,max_gradient_norm,
	  ) = get_pcgn_training_config(config)

	train_set=read_data(train_file)
	print(' # train data:',len(train_set))
	dev_set=read_data(dev_file)
	print(' # dev data:',len(dev_set))

	print("Building model architecture ")
	pcg_model = PCGNModel(
		mode='train', model_name=name,
		vocab_size=vocab_size, embedding_size=embed_size,
		encode_num_layers=encode_num_layers, encode_num_units=encode_num_units,
		encode_cell_type=encode_cell_type, encode_bidir=encode_bidir,
		attn_num_units=attn_num_units, decode_num_layers=decode_num_layers,
		decode_num_units=decode_num_units, decode_cell_type=decode_cell_type,
		use_user_feat=use_user_feat, use_gate_memory=use_gate_memory,
		use_user_desc=use_user_desc, use_blog_user_coattn=use_blog_user_coattn,
		use_external_desc_express=use_external_desc_express, use_external_feat_express=use_external_feat_express,

		user_feat_dim=user_feat_dim, user_feat_unit=user_feat_unit, user_feat_mem_unit=user_feat_mem_unit,
		desc_rnn_unit=desc_rnn_unit, desc_attn_num_units=desc_attn_num_units, user_map_unit=user_map_unit,

		batch_size=batch_size, beam_search=is_beam_search, beam_size=beam_size, infer_max_iter=infer_max_iter, target_max_length=target_max_length,
		l2_regularize=l2_regularize, learning_rate=learning_rate, max_to_keep=max_checkpoints, max_gradient_norm=max_gradient_norm,
	)

	print("\tDone.")


	logdir = '%s/nn_models/' % work_space

	# Set up session
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, visible_device_list=gpu_id,allow_growth=True)

	sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
											gpu_options=gpu_options))
	init = tf.global_variables_initializer()
	sess.run(init)

	# tensorboard
	if use_tensorboard:
		train_writer = tf.summary.FileWriter(tf_board + 'train/', sess.graph)
		test_writer = tf.summary.FileWriter(tf_board + 'test/', sess.graph)

	try:
		saved_global_step = load_model(pcg_model.saver, sess, logdir)
		if saved_global_step is None:
			saved_global_step = -1

	except Exception:
		print("Something went wrong while restoring checkpoint. "
			  "Training is terminated to avoid the overwriting.")
		raise

	# ##### Training #####

	# Training
	last_saved_step = saved_global_step
	num_steps = saved_global_step + train_steps
	steps = []
	previous_losses=[]
	lr = pcg_model.learning_rate

	print("Start training ...")
	print('steps per epoch:',len(train_set)//batch_size)
	try:
		for step in range(saved_global_step + 1, num_steps):
			start_time = time.time()

			batch = get_pcgn_batch(train_set,'train', batch_size,source_max_length, target_max_length,desc_max_length)
			loss_value = pcg_model.train(sess, batch)
			previous_losses.append(loss_value)
			lr_decay_step = 10
			if step % 500 == 0 and len(previous_losses)-5 > lr_decay_step and np.mean(previous_losses[-5:]) >= np.mean(previous_losses[-lr_decay_step -5:-5]):
				lr=pcg_model.learning_rate
				if lr > 0.00001:
					pcg_model.learning_rate=lr*0.9
					print('learning rate decay:',lr*0.9)
			duration = (time.time() - start_time)
			if step % print_every == 0 and step != 0:
				# train perplexity
				t_perp = pcg_model.compute_perplexity(sess, batch)
				if use_tensorboard:
					add_summary(train_writer, step, 'train perplexity', t_perp)

				# eval perplexity
				dev_str = ""
				if dev_set is not None:
					eval_batch = get_pcgn_batch(dev_set,'train', batch_size,source_max_length, target_max_length,desc_max_length)
					eval_perp = pcg_model.compute_perplexity(sess, eval_batch)
					with open(logdir+'eval_perp.txt','a',encoding='utf-8') as f:
						f.write('{}\t{}\n'.format(str(step),str(eval_perp)))

					if use_tensorboard:
						add_summary(test_writer, step, 'eval perplexity', eval_perp)
					dev_str += "val_prep: {:.3f}\n".format(eval_perp)

				steps.append(step)
				ep=step//(len(train_set)//batch_size)
				info = 'epoch {:d}, step {:d}, lr: {:.5f}, loss = {:.6f}, perp: {:.3f}\n{} ({:.3f} sec/step)'
				print(info.format(ep,step,lr, loss_value, t_perp, dev_str, duration))

			if step % checkpoint_every == 0:
				save_model(pcg_model.saver, sess, logdir, step)
				last_saved_step = step

	except KeyboardInterrupt:
		# Introduce a line break after ^C so save message is on its own line.
		print()

	finally:
		if step > last_saved_step:
			save_model(pcg_model.saver, sess, logdir, step)
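# Standalone restatement of the plateau-based learning-rate decay inside the loop above:
# every 500 steps, if the mean of the last 5 losses is no better than the mean of the 10
# losses before them, shrink the rate by 10% while it stays above 1e-5.
import numpy as np

def maybe_decay_lr(step, losses, lr, every=500, recent=5, older=10, decay=0.9, floor=1e-5):
    if step % every == 0 and len(losses) > recent + older:
        if np.mean(losses[-recent:]) >= np.mean(losses[-(recent + older):-recent]) and lr > floor:
            lr *= decay
    return lr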
Ejemplo n.º 27
0
from config import Config
from utils.data_utils import read_data, write_scores, write_predictions, get_segment, build_dictionary, remove_low_words
from models.BowModel import BowModel
from models.BocModel import BocModel
from models.Ensemble import Ensemble

if __name__ == "__main__":
    config = Config()

    # read data
    train_set = read_data(config.train_set_file_name)
    dev_set = read_data(config.dev_set_file_name)

    # segmentation
    train_set = get_segment(train_set)
    dev_set = get_segment(dev_set)

    # remove words with low frequency
    dictionary = build_dictionary([train_set, dev_set], config.low_frequency,
                                  config.high_frequency)
    train_set = remove_low_words(train_set, dictionary)
    dev_set = remove_low_words(dev_set, dictionary)

    # get predictions
    ensemble_model = Ensemble(config, [BowModel(config), BocModel(config)])
    scores = ensemble_model.test(dev_set)

    # write predictions
    # write_predictions(dev_set, labels, config.result_file_name)
    write_scores(scores, config.result_file_name)
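# A hedged sketch of one common way an Ensemble wrapper like the one above can combine member
# scores (simple averaging); the project's Ensemble class may weight or combine them differently.
class AveragingEnsemble:
    def __init__(self, config, models):
        self.config = config
        self.models = models

    def test(self, dataset):
        per_model_scores = [model.test(dataset) for model in self.models]
        return [sum(scores) / len(scores) for scores in zip(*per_model_scores)]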
Ejemplo n.º 28
0
nnlm_embedder = hub.load(config['tf_hub_model'])
batch_size = config['batch_size']

print('Processing Train Data...')
train_data, train_tables, in_domain_test = process_train_data(
    config, nnlm_embedder, config['train_data'], config['train_tables'])
train_batches = create_train_batches(train_data, train_tables, config)

train_samples_batches = create_samples_batches(train_data, batch_size)
train_tables_batches = create_tables_batches(train_tables, config)

if config['use_in_domain_test']:
    in_domain_test_batches = create_samples_batches(in_domain_test, batch_size)

print('Processing Dev Data...')
dev_data, dev_tables, _ = read_data(config['dev_data'], config['dev_tables'],
                                    config['real_proxy_token'])
dev_samples_batches = create_samples_batches(dev_data, batch_size)
dev_tables_batches = create_tables_batches(dev_tables, config)

print('Processing Test Data...')
test_data, test_tables, _ = read_data(config['test_data'],
                                      config['test_tables'],
                                      config['real_proxy_token'])
test_samples_batches = create_samples_batches(test_data, batch_size)
test_tables_batches = create_tables_batches(test_tables, config)

all_data = {
    'train_batches': train_batches,
    'dev_samples_batches': dev_samples_batches,
    'dev_tables_batches': dev_tables_batches,
    'test_samples_batches': test_samples_batches,