Example #1
def main():
    # Define argument parsing
    parser = argparse.ArgumentParser(description="Tell RaNNdy what to do.")
    parser.add_argument('--mode', help='Whether to train or infer.', default='train', choices=['train', 'infer'])
    parser.add_argument('--sentence_tokens', help='Tokenized sentences to train on.', default='../data/sentence_tokens.csv')
    parser.add_argument('--vocab', help='Vocabulary to use for training.', default='../data/vocabulary.csv')
    args = parser.parse_args()

    if args.mode == 'train':
        # Step 1: Load Dataset
        data_iterator = DataIterator(args.sentence_tokens, args.vocab)

        # Step 2: Create Auto Encoder in Training Mode
        ranndy = SentenceAutoEncoder(data_iterator, tf.estimator.ModeKeys.TRAIN)

        # Step 3: Train
        ranndy.train()
    else:
        # Step 1: Load Dataset with a small batch size and no shuffling
        data_iterator = DataIterator(args.sentence_tokens, args.vocab, batch_size=2, shuffle=False)

        # Step 2: Create Auto Encoder in Inference Mode
        ranndy = SentenceAutoEncoder(data_iterator, tf.estimator.ModeKeys.PREDICT)

        # Step 3: Infer
        ranndy.infer(num_batch_infer=1)
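A hypothetical invocation of the script above (the file name main.py is an assumption; only the flags come from the argument parser shown):

python main.py --mode train --sentence_tokens ../data/sentence_tokens.csv --vocab ../data/vocabulary.csv
python main.py --mode infer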
Example #2
def train(estimator):
    tf.logging.set_verbosity(tf.logging.INFO)
    train_file_pattern = "./data/part-00000"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern,
                                                    'offline')
    estimator.train(input_fn=train_input_fn, steps=None)
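This example only reaches DataIterator.input_fn through a lambda, so the function body is not shown. A minimal sketch of an input_fn with that shape, assuming newline-delimited shards and a purely hypothetical parse_line (the real parsing logic lives inside DataIterator), could look like this:

import tensorflow as tf

def parse_line(line):
    # Hypothetical parser: comma-separated float features followed by a 0/1 label.
    values = tf.string_to_number(tf.string_split([line], ',').values)
    return values[:-1], values[-1]

def input_fn(file_pattern, mode, batch_size=256):
    # Build a tf.data pipeline over every shard matching the pattern.
    files = tf.data.Dataset.list_files(file_pattern)
    dataset = files.interleave(tf.data.TextLineDataset, cycle_length=4)
    if mode == 'offline':
        # Offline training: shuffle and repeat until the estimator stops.
        dataset = dataset.shuffle(buffer_size=10000).repeat()
    dataset = dataset.map(parse_line).batch(batch_size)
    return dataset  # an Estimator input_fn may return a Dataset of (features, labels)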
Example #3
def evaluate(sess, model, data_set):
    # Run evals on development set and print their perplexity/loss.
    sess.run(model.dropout10_op)  # During evaluation set the dropout keep probability to 1, i.e. disable dropout.
    loss = 0.0
    n_steps = 0
    n_valids = 0
    batch_size = FLAGS.batch_size

    dite = DataIterator(model, data_set, len(FLAGS._buckets), batch_size, None)
    ite = dite.next_sequence(stop=True)

    for inputs, outputs, weights, bucket_id in ite:
        L = model.step(sess,
                       inputs,
                       outputs,
                       weights,
                       bucket_id,
                       forward_only=True)
        loss += L
        n_steps += 1
        n_valids += np.sum(weights)
    loss = loss / (n_valids)
    ppx = math.exp(loss) if loss < 300 else float("inf")
    sess.run(model.dropoutAssign_op)  # After evaluation, restore dropout to its original setting.
    return loss, ppx
Example #4
def predict(data_params):
    meta_path = "./model/dssm.ckpt.meta"
    ckpt_path = "./model/dssm.ckpt"
    data_file = "./data/train.txt.10"
    dssm = DSSM()
    data_iterator = DataIterator(data_params)
    iterator = data_iterator.input_fn(data_file)
    # config
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(meta_path)
        # Initialize first, then restore: running the initializer after
        # saver.restore would overwrite the checkpointed weights.
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, ckpt_path)
        sess.run(iterator.initializer)
        # get_next() must be called once, outside the loop; calling it per
        # iteration would keep adding new ops to the graph.
        (query_features, creative_ids, labels) = iterator.get_next()
        s = time.time()
        while True:
            try:
                (batch_query, batch_creative_ids, batch_labels) = sess.run(
                    [query_features, creative_ids, labels])
                prediction = sess.run(dssm.score,
                                      feed_dict={
                                          dssm.query: batch_query,
                                          dssm.doc: batch_creative_ids
                                      })
                print(prediction)
            except tf.errors.OutOfRangeError:
                break
        e = time.time()
        # roughly 0.0001 s per example on average
        print(e - s)
 def __init__(self,
              train_file="local_train_splitByUser",
              test_file="local_test_splitByUser",
              uid_voc="uid_voc.pkl",
              mid_voc="mid_voc.pkl",
              cat_voc="cat_voc.pkl",
              item_info='item-info',
              reviews_info='reviews-info',
              batch_size=128,
              maxlen=100,
              embedding_dim=18,
              light_embedding_dim=4,
              return_neg=True):
     self.maxlen = maxlen
     self.return_neg = return_neg
     self.train_data = DataIterator(train_file,
                                    uid_voc,
                                    mid_voc,
                                    cat_voc,
                                    item_info,
                                    reviews_info,
                                    batch_size,
                                    maxlen,
                                    shuffle_each_epoch=False)
     self.test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc,
                                   item_info, reviews_info, batch_size,
                                   maxlen)
     self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()
     self.embedding_dim = embedding_dim
     self.light_embedding_dim = light_embedding_dim
def test(depth, p, dataset, num_epochs=200, seed=None):
    if seed is None:
        seed = 0

    np.random.seed(seed)

    data = None
    if dataset == "mnist":
        data = mnist.load().astype(np.float32)
    elif dataset == "cifar10":
        data = cifar10.load().astype(np.float32)

    num_observations, input_dim = data.shape
    data_split_index = int(num_observations * 0.9)
    training_data_iterator = DataIterator(batch_size, data[:data_split_index],
                                          data[:data_split_index])
    validation_data_iterator = DataIterator(batch_size,
                                            data[data_split_index:],
                                            data[data_split_index:])

    # make net
    net = Network(input_dim, input_dim, hidden_layers=([
        1000,
    ] * depth), p=p)
    losses = net.train(training_data_iterator,
                       validation_data_iterator,
                       num_epochs=num_epochs)
    net.close()

    return losses
Example #7
def evaluate(sess, model, data_set, item_sampled_id2idx=None):
    # Run evals on development set and print their perplexity/loss.
    dropoutRateRaw = FLAGS.keep_prob
    sess.run(model.dropout10_op)

    start_id = 0
    loss = 0.0
    n_steps = 0
    n_valids = 0
    batch_size = FLAGS.batch_size

    dite = DataIterator(model, data_set, len(_buckets), batch_size, None)
    ite = dite.next_sequence(stop=True)

    for users, inputs, outputs, weights, bucket_id in ite:
        L = model.step(sess,
                       users,
                       inputs,
                       outputs,
                       weights,
                       bucket_id,
                       forward_only=True)
        loss += L
        n_steps += 1
        n_valids += np.sum(np.sign(weights[0]))

    loss = loss / (n_valids)
    ppx = math.exp(loss) if loss < 300 else float("inf")

    sess.run(model.dropoutAssign_op)

    return loss, ppx
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         seed=2):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                  batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc,
                                 batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat,
                                                EMBEDDING_DIM, HIDDEN_SIZE,
                                                ATTENTION_SIZE)
        model.restore(sess, model_path)
        print(
            'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
            % eval(sess, test_data, model, model_path))
Example #9
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         uid_voc="uid_voc.pkl",
         mid_voc="mid_voc.pkl",
         cat_voc="cat_voc.pkl",
         batch_size=128,
         maxlen=20,
         model_type='DNN',
         seed=2):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                  batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc,
                                 batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                   HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat,
                                             EMBEDDING_DIM, HIDDEN_SIZE,
                                             ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat,
                                             EMBEDDING_DIM, HIDDEN_SIZE,
                                             ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat,
                                               EMBEDDING_DIM, HIDDEN_SIZE,
                                               ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat,
                                                EMBEDDING_DIM, HIDDEN_SIZE,
                                                ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat,
                                                    EMBEDDING_DIM, HIDDEN_SIZE,
                                                    ATTENTION_SIZE)
        elif model_type == 'DMIN':
            model = Model_DNN_Multi_Head(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                         HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print("Invalid model_type : %s", model_type)
            return
        model.restore(sess, model_path)
        print(
            'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
            % eval(sess, test_data, model, model_path, maxlen=maxlen))
def test(
    files,
    batch_size=1024,
    max_len=100,
    seed=2,
    shuffle_each_epoch=False,
):
    train_file, test_file, uid_voc, mid_voc, cat_voc = files[:5]
    if shuffle_each_epoch:
        model_path = "best_model_SGD/ckpt_shuffle" + str(seed)
    else:
        model_path = "best_model_SGD/ckpt_noshuffle" + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc,
                                  batch_size, max_len)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc,
                                 batch_size, max_len)
        n_uid, n_mid, n_cat = train_data.get_n()

        model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE,
                          ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ' %
              eval(sess, test_data, model, model_path))
    def get_batch_data(self):
        r""" 输出一个batch预处理的样本 """
        if self.shuffle:
            random.shuffle(self.data)
        it = DataIterator(self.data, self.batch_size)

        for batch_data in it.get_batch_data():
            texts, emotions = [], []
            for item in batch_data:
                texts.append(item['text'].strip().split())
                emotions.append(item['emotion'])

            id_texts, len_texts = [], []
            for text in texts:
                id_text, len_text = self.sp.word2index(text)
                id_texts.append(id_text)
                len_texts.append(len_text)

            len_texts = [l + 2 for l in len_texts]  # lengths after adding start and end tokens
            maxlen_text = max(len_texts)

            pad_id_texts = [
                self.sp.pad_sentence(t, maxlen_text) for t in id_texts
            ]  # pad every sentence to the same length

            new_batch_data = {
                'str_texts': texts,  # [batch, len]
                'texts': pad_id_texts,  # [batch, len]
                'len_texts': len_texts,  # [batch]
                'emotions': emotions
            }  # [batch]

            yield new_batch_data
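The loop above leans on self.sp.word2index and self.sp.pad_sentence, neither of which is shown. A minimal sketch of a pad_sentence of the kind this code assumes (the start/end/pad ids are hypothetical constants, not taken from the source) might be:

START_ID, END_ID, PAD_ID = 1, 2, 0  # hypothetical vocabulary ids

def pad_sentence(id_text, maxlen):
    # Wrap the token ids with start/end markers, then pad with PAD_ID so every
    # sentence reaches maxlen; maxlen already counts the two extra tokens,
    # matching len_texts = [l + 2 for l in len_texts] above.
    padded = [START_ID] + list(id_text) + [END_ID]
    return padded + [PAD_ID] * (maxlen - len(padded))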
Example #12
def main(params, train_file, test_file):
  label_dict, word_dict, word_freq_dict, train_data = init_required_data(train_file)
  print(len(word_dict))
  train_iterator = DataIterator(params, label_dict, word_dict, train_data)
  params["n_classes"] = len(label_dict)
  deep_match = DeepMatch(params)
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  config.gpu_options.per_process_gpu_memory_fraction = 0.5
  sess = tf.Session(config=config)
  sess.run(tf.global_variables_initializer())
  i = 1
  for (x, y, word_num) in train_iterator:
    # [input, label] = sess.run([deep_match.input, deep_match.label], feed_dict={deep_match.user_input : x, deep_match.label : y}) 
    [_, loss] = sess.run([deep_match.train_op, deep_match.loss], feed_dict={deep_match.user_input : x, deep_match.label : y, deep_match.word_num : word_num}) 
    if i % 100 == 0:
      print("step %d, loss %.6f" % (i, loss[0]))
    i += 1
  # test accuracy
  label_dict, word_dict, word_freq_dict, test_data = init_required_data(test_file)
  test_iterator = DataIterator(params, label_dict, word_dict, test_data)
  acc = 0
  batch_num = len(test_data) // params["batch_size"]
  for (x, y, word_num) in test_iterator:
    batch_acc = sess.run(deep_match.accuracy, feed_dict={deep_match.user_input : x, deep_match.label : y, deep_match.word_num : word_num})
    acc += batch_acc
  print(acc / batch_num)
Example #13
def test(buckets,
        train_file = "local_train_splitByUser",
        test_file = "local_test_splitByUser",
        uid_voc = "uid_voc.pkl",
        mid_voc = "mid_voc.pkl",
        cat_voc = "cat_voc.pkl",
        model_folder = "dnn_best_model/ckpt_noshuff",
        batch_size = 128,
        maxlen = 100,
        model_type = 'DNN',
        seed = 2
):

    model_path = model_folder + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file,buckets, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file,buckets, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIEN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type =='DHAN':
            model = Model_DHAN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print ("Invalid model_type : %s", model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))
Example #14
def beam_search():
    mylog("Reading Data...")
    task = Task(FLAGS.dataset)
    _, _, test_set, embAttr, START_ID, _, _, evaluation, uids = read_data(
        task, test=True)
    test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog(_buckets)
    mylog("Test:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False)) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, embAttr, START_ID, run_options,
                             run_metadata)
        show_all_variables()
        model.init_beam_decoder()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_set, len(_buckets), batch_size, None)
        ite = dite.next_sequence(stop=True, recommend=True)

        n_total_user = len(uids)
        n_recommended = 0
        uid2rank = {}
        for r, uid in enumerate(uids):
            uid2rank[uid] = r
        rec = np.zeros((n_total_user, FLAGS.topk), dtype=int)
        rec_value = np.zeros((n_total_user, FLAGS.topk), dtype=float)
        start = time.time()

        for users, inputs, positions, valids, bucket_id in ite:
            print(inputs)
            print(positions)
            results = model.beam_step(sess,
                                      index=0,
                                      user_input=users,
                                      item_inputs=inputs,
                                      sequence_length=positions,
                                      bucket_id=bucket_id)
            break
Example #15
def test(
        train_file = "local_train",
        test_file = "local_test",
        uid_voc = "uid_voc_large.pkl",
        mid_voc = "mid_voc_large.pkl",
        cat_voc = "cat_voc_large.pkl",
        batch_size = 128,
        maxlen = 100,
        model_type = 'ASVD',
        seed = 2
):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        #Baselines
        if model_type == 'ASVD': 
            model = Model_ASVD(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'LSTM':
            model = Model_LSTM(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'LSTMPP':
            model = Model_LSTMPP(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'NARM':
            model = Model_NARM(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'CARNN':
            model = Model_CARNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Time1LSTM':
            model = Model_Time1LSTM(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'Time2LSTM':
            model = Model_Time2LSTM(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'Time3LSTM':
            model = Model_Time3LSTM(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)   
        elif model_type == 'DIEN':
            model = Model_DIEN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        #Our models  
        elif model_type == 'A2SVD':
            model = Model_A2SVD(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)  
        elif model_type == 'T_SeqRec': 
            model = Model_T_SeqRec(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'TC_SeqRec_I':
            model = Model_TC_SeqRec_I(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'TC_SeqRec_G':
            model = Model_TC_SeqRec_G(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)  
        elif model_type == 'TC_SeqRec':
            model = Model_TC_SeqRec(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'SLi_Rec_Fixed':
            model = Model_SLi_Rec_Fixed(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE) 
        elif model_type == 'SLi_Rec_Adaptive':
            model = Model_SLi_Rec_Adaptive(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print ("Invalid model_type : %s", model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))
Example #16
def eval(estimator):
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline'
                                                   )
    eval_results = estimator.evaluate(input_fn=eval_input_fn)
    auc_score = eval_results["auc"]
    # print(type(auc_score)) # numpy.float32
    print("\nTest auc: %.6f" % auc_score)
    def get_batch_data(self):

        if self.shuffle:
            random.shuffle(self.data)

        it = DataIterator(self.data, self.batch_size)

        for batch_data in it.get_batch_data():

            str_posts, str_responses = [], []

            for item in batch_data:

                str_posts.append(item['post'])
                str_responses.append(item['response'])

            id_posts, id_responses = [], []
            len_posts, len_responses = [], []

            for post in str_posts:

                id_post, len_post = self.sp.word2index(post)

                id_posts.append(id_post)
                len_posts.append(len_post)

            for response in str_responses:

                id_response, len_response = self.sp.word2index(response)

                id_responses.append(id_response)
                len_responses.append(len_response)

            len_posts = [l + 2 for l in len_posts]  # lengths after adding start and end tokens
            len_responses = [l + 2 for l in len_responses]

            maxlen_post = max(len_posts)
            maxlen_response = max(len_responses)

            pad_id_posts = [
                self.sp.pad_sentence(p, maxlen_post) for p in id_posts
            ]  # pad posts to the batch max length
            pad_id_responses = [
                self.sp.pad_sentence(r, maxlen_response) for r in id_responses
            ]

            new_batch_data = {
                'str_posts': str_posts,
                'str_responses': str_responses,
                'posts': pad_id_posts,
                'responses': pad_id_responses,
                'len_posts': len_posts,
                'len_responses': len_responses
            }

            yield new_batch_data
Example #18
def test(train_file="local_train_splitByUser",
         test_file="local_test_splitByUser",
         batch_size=BATCH_SIZE,
         maxlen=MAXLEN,
         model_type='DNN',
         seed=2):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, FEATURE_COUNT, QUERY_COUNT,
                                  voc_list, batch_size, maxlen)
        test_data = DataIterator(test_file, FEATURE_COUNT, QUERY_COUNT,
                                 voc_list, batch_size, maxlen)
        n_query, n = train_data.get_n()

        if model_type == 'DNN':
            model = Model_DNN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE,
                                   ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n, n_query, EMBEDDING_DIM, HIDDEN_SIZE,
                              ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n, n_query, EMBEDDING_DIM,
                                             HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n, n_query, EMBEDDING_DIM,
                                             HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n, n_query, EMBEDDING_DIM,
                                               HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n, n_query, EMBEDDING_DIM,
                                                HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n, n_query, EMBEDDING_DIM,
                                                    HIDDEN_SIZE,
                                                    ATTENTION_SIZE)
        else:
            print("Invalid model_type : %s", model_type)
            return
        with tf.summary.FileWriter('./test_log') as writer:
            writer.add_graph(sess.graph)
            model.restore(sess, model_path)
            print(
                'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
                % eval(sess, test_data, model, model_path))
            writer.flush()
def train(train_file = "data/local_train_splitByUser",
        test_file = "data/local_test_splitByUser",
        uid_voc = "data/uid_voc.pkl",
        mid_voc = "data/mid_voc.pkl",
        cat_voc = "data/cat_voc.pkl",
        batch_size = 128,
        maxlen = 100,
        test_iter = 100,
        save_iter = 100,
        model_type = 'DNN',
        seed = 2,
    ):

    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
    best_model_path = "dnn_bast_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file,uid_voc,mid_voc,cat_voc,batch_size,maxlen)
        test_data = DataIterator(test_file,uid_voc,mid_voc,cat_voc,batch_size,maxlen)

        n_uid,n_mid,n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid,n_mid,n_cat,EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        iter = 0
        lr = 0.001
        for itr in range(3):
            loss_sum = 0.0
            accuracy_sum = 0.0
            aux_loss_sum = 0.0

            for src,tgt in train_data:
                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src,tgt,maxlen,return_neg=True)
                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr,noclk_mids, noclk_cats])
                loss_sum += loss
                accuracy_sum += acc
                aux_loss_sum += aux_loss
                iter += 1
                if (iter % test_iter) == 0:
                    print('iter: %d ----> train_loss: %.8f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' % \
                          (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                    print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' %
                          eval(sess, test_data, model, best_model_path))
                    loss_sum = 0.0
                    accuracy_sum = 0.0
                    aux_loss_sum = 0.0
                if (iter % save_iter) == 0:
                    print('save model iter: %d' % (iter))
                    model.save(sess, model_path + "--" + str(iter))
            lr *= 0.5
def train(train_file = "data/local_train_splitByUser",
        test_file = "data/local_test_splitByUser",
        uid_voc = "data/uid_voc.pkl",
        mid_voc = "data/mid_voc.pkl",
        cat_voc = "data/cat_voc.pkl",
        batch_size = 128,
        maxlen = 100,
        test_iter = 100,
        save_iter = 100,
        model_type = 'DNN',
	    seed = 2,
    ):

    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
    best_model_path = "dnn_bast_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file,uid_voc,mid_voc,cat_voc,batch_size,maxlen)
        test_data = DataIterator(test_file,uid_voc,mid_voc,cat_voc,batch_size,maxlen)

        n_uid,n_mid,n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid,n_mid,n_cat,EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        iter = 0
        lr = 0.001
        for itr in range(3):
            loss_sum = 0.0
            accuracy_sum = 0.0
            aux_loss_sum = 0.0

            for src,tgt in train_data:
                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src,tgt,maxlen,return_neg=True)
                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr,noclk_mids, noclk_cats])
                loss_sum += loss
                accuracy_sum += acc
                aux_loss_sum += aux_loss
                iter += 1
                if (iter % test_iter) == 0:
                    print('iter: %d ----> train_loss: %.8f ---- train_accuracy: %.4f ---- tran_aux_loss: %.4f' % \
                          (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                    print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' %
                          eval(sess, test_data, model, best_model_path))
                    loss_sum = 0.0
                    accuracy_sum = 0.0
                    aux_loss_sum = 0.0
                if (iter % save_iter) == 0:
                    print('save model iter: %d' % (iter))
                    model.save(sess, model_path + "--" + str(iter))
            lr *= 0.5
Example #21
    def run(self, is_test=False, is_log=False):
        if is_test:

            self._test_data_iterator = DataIterator(
                data_path=os.path.abspath(
                    os.path.join(self._base_path, self._conf["test_file"])),
                digit_vocab=self._digit_vocab,
                data_type_vocab=self._data_type_vocab,
                operation_vocab=self._operation_vocab,
                lambda_vocab=self._lambda_vocab,
                batch_size=self._batch_size,
                max_argument_num=self._max_argument_num,
                max_memory_size=self._max_memory_size,
                max_value_length=self._max_value_size,
                case_num=self._case_num)

            test_accuracy, test_opt_accuracy, test_arg_accuracy = self.test(
                self._test_data_iterator, is_log=is_log)
            tqdm.write(
                "Test, accuracy: %f, opt_accuracy: %f, arg_accuracy: %f" %
                (test_accuracy, test_opt_accuracy, test_arg_accuracy))
        else:

            self._train_data_iterator = DataIterator(
                data_path=os.path.abspath(
                    os.path.join(self._base_path, self._conf["train_file"])),
                digit_vocab=self._digit_vocab,
                data_type_vocab=self._data_type_vocab,
                operation_vocab=self._operation_vocab,
                lambda_vocab=self._lambda_vocab,
                batch_size=self._batch_size,
                max_argument_num=self._max_argument_num,
                max_memory_size=self._max_memory_size,
                max_value_length=self._max_value_size,
                case_num=self._case_num)

            self._dev_data_iterator = DataIterator(
                data_path=os.path.abspath(
                    os.path.join(self._base_path, self._conf["dev_file"])),
                digit_vocab=self._digit_vocab,
                data_type_vocab=self._data_type_vocab,
                operation_vocab=self._operation_vocab,
                lambda_vocab=self._lambda_vocab,
                batch_size=self._batch_size,
                max_argument_num=self._max_argument_num,
                max_memory_size=self._max_memory_size,
                max_value_length=self._max_value_size,
                case_num=self._case_num)

            self.train()
Example #22
def predict(estimator):
    predict_file = "./data/part-predict"
    data_iterator = DataIterator(params)
    with open(predict_file, 'r') as infile:
        for line in infile:
            line = line.strip('\n')
            items = line.split('\t')
            dmp_id = items[0]
            ins = "\t".join(items[1:])
            predict_input_fn = lambda: data_iterator.input_fn(ins, 'online')
            predictions = estimator.predict(input_fn=predict_input_fn)
            predictions = itertools.islice(predictions, 1)
            for i, p in enumerate(predictions):
                print("dmp_id %s: logits:%.6f probability:%.6f" %
                      (dmp_id, p["logits"], p["probabilities"]))
Example #23
def kfold_validate(model, k, kwargs):
  """
  This function does something similar to k-fold validation. We train and test
  our model k times, by randomly splitting the entire data set into three parts
  (train, dev and test), and return the average of the k runs.
  Args:
      model (str): What kind of model to use. It can be either lstm or cnn
      k (int): Number of iterations over which to average
      kwargs (dict): The parameters that define the model
  
  Returns:
      dict: A dictionary of results, containing the keys precision, recall and 
        fscore.
  """
  p_1 = 0.0
  r_1 = 0.0
  f_1 = 0.0
  train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
  test_data = ATEDataProcessor(kwargs["test_file"],
                               pos_id=get_count(train_data.annotated_sentences),
                               **kwargs)
  sentences = train_data.annotated_sentences + test_data.annotated_sentences
  for i in range(k):
    print("Run number: {}".format(i))
    # Vary the split each run; a fixed random_state would make all k runs identical.
    train_set, test_set = split(sentences, test_size=0.2, random_state=i)
    train_set, dev_set = split(train_set, test_size=kwargs["test_size"],
                               random_state=i)
    train = DataIterator(train_set, **kwargs)
    dev = DataIterator(dev_set, **kwargs)
    test = DataIterator(test_set, **kwargs)
    if model == "lstm":
      model = LSTMNetwork(**kwargs)
    elif model == "cnn":
      model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                         **kwargs)
    model.build()
    model.train(train, dev)
    results = model.evaluate(test)
    p_1 += float(results["p_1"])
    r_1 += float(results["r_1"])
    f_1 += float(results["f_1"])
    model.close_session()
  print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1/k, r_1/k, f_1/k))
  return {
    "precision": p_1/k,
    "recall": r_1/k,
    "fscore": f_1/k
  }
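A hypothetical call, assuming a kwargs dict shaped like the configuration used in the other examples (only train_file, test_file and test_size are shown here; the real dict carries the rest of the model settings):

results = kfold_validate("lstm", k=5,
                         kwargs={"train_file": "train.xml",   # hypothetical paths
                                 "test_file": "test.xml",
                                 "test_size": 0.1})
print("average fscore over 5 runs: %.4f" % results["fscore"])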
Example #24
 def get_generator():
     print('dddddddddddddddd')
     train_generator = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False)
     for src, tgt in train_generator:
         uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src, tgt, maxlen, return_neg=True)
         features['uids'] = uids
         features['mids'] = mids
         features['cats'] = cats
         features['mid_his'] = mid_his
         features['cat_his'] = cat_his
         features['mid_mask'] = mid_mask
         features['sl'] = sl
         features['noclk_mids'] = noclk_mids
         features['noclk_cats'] = noclk_cats
         '''
         print("features uids:{}".format(features['uids']))
         print("features mids:{}".format(features['uids']))
         print("features cats:{}".format(features['uids']))
         print("features mid_his:{}".format(features['mid_his']))
         print("features cat_his:{}".format(features['cat_his']))
         print("features mid_mask:{}".format(features['mid_mask']))
         print("features sl:{}".format(features['sl']))
         print("features noclk_mids:{}".format(features['noclk_mids']))
         print("features noclk_cats:{}".format(features['noclk_cats']))
         print("target:{}".format(target))
         print("features:{}".format(features))
         '''
         #features =[uids, mids, cats, mid_his, cat_his, mid_mask, sl, noclk_mids, noclk_cats]
         yield features, target
Example #25
def main(network_type):
    if network_type == "cnn":
        print("Testing CNN network")
        from cnn_params import params
    if network_type == "lstm":
        print("Testing LSTM network")
        from params import params
    train_data = ATEDataProcessor(params["train_file"], **params)
    test_data = ATEDataProcessor(params["test_file"],
                                 pos_id=get_count(
                                     train_data.annotated_sentences),
                                 **params)

    test_set = test_data.annotated_sentences
    test = DataIterator(test_set,
                        word_file=params["word_file"],
                        char_file=params["char_file"])
    if network_type == "cnn":
        model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                           **params)
    elif network_type == "lstm":
        model = LSTMNetwork(**params)
    model.build()
    model.restore_session(model.model_directory)
    model.evaluate(test)
Example #26
def test(test_file,
         cate_file,
         item_count,
         dataset="book",
         batch_size=128,
         maxlen=100,
         model_type='DNN',
         lr=0.001):
    exp_name = get_exp_name(dataset,
                            model_type,
                            batch_size,
                            lr,
                            maxlen,
                            save=False)
    best_model_path = "best_model/" + exp_name + '/'
    gpu_options = tf.GPUOptions(allow_growth=True)
    model = get_model(dataset, model_type, item_count, batch_size, maxlen)
    item_cate_map = load_item_cate(cate_file)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.restore(sess, best_model_path)

        test_data = DataIterator(test_file, batch_size, maxlen, train_flag=2)
        metrics = evaluate_full(sess,
                                test_data,
                                model,
                                best_model_path,
                                batch_size,
                                item_cate_map,
                                save=False,
                                coef=args.coef)
        print(', '.join([
            'test ' + key + ': %.6f' % value for key, value in metrics.items()
        ]))
Example #27
def main(params):
    text_cnn = TextCNN(params)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(params["epoch"]):
            train_iterator = DataIterator(params["train_file"],
                                          params["train_data_size"],
                                          params["batch_size"])
            sum_loss = 0.0
            for x, y in train_iterator:
                _, loss, score, label, global_step = sess.run([text_cnn.train_op, \
                                                               text_cnn.loss, \
                                                               text_cnn.score, \
                                                               text_cnn.max_score_label, \
                                                               text_cnn.global_step], \
                                                               feed_dict={text_cnn.input : x, text_cnn.label : y})
                sum_loss += loss
                if global_step % 100 == 0:
                    logging.info("global_step:%d, loss:%.6f" %
                                 (global_step, sum_loss / 100))
                    eval(sess, text_cnn)
                    sum_loss = 0.0
Example #28
def train_and_eval(estimator):
    tf.logging.set_verbosity(tf.logging.INFO)
    # train_file_pattern and eval_file_pattern could be the parameters of FLAGS
    train_file_pattern = "./data/part-00000"
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern,
                                                    'offline')
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline'
                                                   )
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=None)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=100,
                                      start_delay_secs=60,
                                      throttle_secs=30)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #29
def eval(sess, model):
    iterator = DataIterator(params["test_file"], params["test_data_size"],
                            params["test_data_size"])
    for x, y in iterator:
        accuracy, loss = sess.run([model.accuracy, model.loss],
                                  feed_dict={
                                      model.input: x,
                                      model.label: y
                                  })
        logging.info("accuracy:%.6f, test_loss:%.6f" % (accuracy, loss))
Example #30
def main():
    train_file_pattern = "./data/part-00000"
    eval_file_pattern = "./data/part-5"
    data_iterator = DataIterator(params)
    train_input_fn = lambda: data_iterator.input_fn(train_file_pattern,
                                                    'offline')
    eval_input_fn = lambda: data_iterator.input_fn(eval_file_pattern, 'offline'
                                                   )
    predict_input_fn = lambda: data_iterator.input_fn(eval_file_pattern,
                                                      'offline')
    # define estimator
    # estimator = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir="./model")
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       params=params,
                                       model_dir="./model")

    # train(estimator)
    # eval(estimator)
    train_and_eval(estimator)
Example #31
 def __init__(self,
              train_file="local_train_splitByUser",
              test_file="local_test_splitByUser",
              uid_voc="uid_voc.pkl",
              mid_voc="mid_voc.pkl",
              cat_voc="cat_voc.pkl",
              item_info='item-info',
              reviews_info='reviews-info',
              batch_size=128,
              maxlen=100,
              embedding_dim=None,
              return_neg=True):
     self.maxlen = maxlen
     self.embedding_dim = embedding_dim
     self.return_neg = return_neg
     self.train_data = DataIterator(
         train_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info, batch_size, maxlen, shuffle_each_epoch=False)
     self.test_data = DataIterator(
         test_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info, batch_size, maxlen)
     self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()
def test(
        train_file = "local_train_splitByUser",
        test_file = "local_test_splitByUser",
        uid_voc = "uid_voc.pkl",
        mid_voc = "mid_voc.pkl",
        cat_voc = "cat_voc.pkl",
        batch_size = 128,
        maxlen = 100,
        model_type = 'DNN',
        seed = 2
):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print ("Invalid model_type : %s", model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))
def test(
        train_file = "local_train_splitByUser",
        test_file = "local_test_splitByUser",
        uid_voc = "uid_voc.pkl",
        mid_voc = "mid_voc.pkl",
        cat_voc = "cat_voc.pkl",
        batch_size = 128,
        maxlen = 100,
        model_type = 'DNN',
        seed = 2
):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))
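Most of these snippets only assume that DataIterator is iterable as for src, tgt in data and exposes get_n(). A minimal, purely illustrative stand-in with that interface (not the project's actual implementation) could be:

class MinimalDataIterator:
    # Yields (source, target) mini-batches and reports vocabulary sizes,
    # which is all the train/test loops above rely on.
    def __init__(self, samples, batch_size=128):
        self.samples = samples          # list of (src, tgt) pairs
        self.batch_size = batch_size

    def get_n(self):
        # The real iterator returns (n_uid, n_mid, n_cat); placeholders here.
        return 0, 0, 0

    def __iter__(self):
        batch_src, batch_tgt = [], []
        for src, tgt in self.samples:
            batch_src.append(src)
            batch_tgt.append(tgt)
            if len(batch_src) == self.batch_size:
                yield batch_src, batch_tgt
                batch_src, batch_tgt = [], []
        if batch_src:
            yield batch_src, batch_tgt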
Example #34
def test(
        train_file = "local_train_sample_sorted_by_time",
        test_file = "local_test_sample_sorted_by_time",
        uid_voc = "uid_voc.pkl",
        mid_voc = "mid_voc.pkl",
        cat_voc = "cat_voc.pkl",
        batch_size = 128,
        user_maxlen = 50,
        maxlen = 20,
        model_type = 'DNN',
        seed = 2
):

    model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)+ "_"+str(user_maxlen)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'SVDPP':
            model = Model_SVDPP(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'GRU4REC':
            model = Model_GRU4REC(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DUMN':
            model = Model_DUMN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = DIEN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print ("Invalid model_type : %s", model_type)
            return
        model.restore(sess, model_path)
        print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- Logloss: %.4f' % eval(sess, test_data, model, model_path, maxlen,user_maxlen))
Example #35
class SampleIO(object):
    def __init__(self,
                 train_file="local_train_splitByUser",
                 test_file="local_test_splitByUser",
                 uid_voc="uid_voc.pkl",
                 mid_voc="mid_voc.pkl",
                 cat_voc="cat_voc.pkl",
                 item_info='item-info',
                 reviews_info='reviews-info',
                 batch_size=128,
                 maxlen=100,
                 embedding_dim=None,
                 return_neg=True):
        self.maxlen = maxlen
        self.embedding_dim = embedding_dim
        self.return_neg = return_neg
        self.train_data = DataIterator(
            train_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info, batch_size, maxlen, shuffle_each_epoch=False)
        self.test_data = DataIterator(
            test_file, uid_voc, mid_voc, cat_voc, item_info, reviews_info, batch_size, maxlen)
        self.n_uid, self.n_mid, self.n_cat = self.train_data.get_n()

    def get_n(self):
        return self.n_uid, self.n_mid, self.n_cat

    def next_train(self):
        if self.return_neg:
            return self._py_func(self._next_train)
        else:
            return self._py_func(self._next_train, sparse_cnt=5)

    def next_test(self):
        if self.return_neg:
            return self._py_func(self._next_test)
        else:
            return self._py_func(self._next_test, sparse_cnt=5)

    def _next_train(self):
        try:
            src, tgt = self.train_data.next()
        except StopIteration:
            self.src = self.tgt = None
            raise OutOfRange("train end")
        return self.prepare_data(src, tgt, self.maxlen, return_neg=self.return_neg)

    def _next_test(self):
        try:
            src, tgt = self.test_data.next()
        except StopIteration:
            self.src = self.tgt = None
            raise OutOfRange("test end")
        return self.prepare_data(src, tgt, self.maxlen, return_neg=self.return_neg)

    def _py_func(self, fn, sparse_cnt=7):
        types = []
        for _ in range(sparse_cnt):
            types.extend([np.int64, np.float32, np.int32])
        types.extend([np.float32, np.float32, np.int32])
        types.extend([np.int32 for _ in range(5)])
        datas = xdl.py_func(fn, [], output_type=types)
        sparse_tensors = []
        for i in range(sparse_cnt):
            sparse_tensors.append(xdl.SparseTensor(
                    datas[3 * i], datas[3 * i + 1], datas[3 * i + 2]))
        return sparse_tensors + datas[sparse_cnt * 3:]

    def prepare_data(self, input, target, maxlen=None, return_neg=False):
        # x: a list of sentences
        lengths_x = [len(s[4]) for s in input]
        seqs_mid = [inp[3] for inp in input]
        seqs_cat = [inp[4] for inp in input]
        noclk_seqs_mid = [inp[5] for inp in input]
        noclk_seqs_cat = [inp[6] for inp in input]

        if maxlen is not None:
            new_seqs_mid = []
            new_seqs_cat = []
            new_noclk_seqs_mid = []
            new_noclk_seqs_cat = []
            new_lengths_x = []
            for l_x, inp in zip(lengths_x, input):
                if l_x > maxlen:
                    new_seqs_mid.append(inp[3][l_x - maxlen:])
                    new_seqs_cat.append(inp[4][l_x - maxlen:])
                    new_noclk_seqs_mid.append(inp[5][l_x - maxlen:])
                    new_noclk_seqs_cat.append(inp[6][l_x - maxlen:])
                    new_lengths_x.append(maxlen)
                else:
                    new_seqs_mid.append(inp[3])
                    new_seqs_cat.append(inp[4])
                    new_noclk_seqs_mid.append(inp[5])
                    new_noclk_seqs_cat.append(inp[6])
                    new_lengths_x.append(l_x)
            lengths_x = new_lengths_x
            seqs_mid = new_seqs_mid
            seqs_cat = new_seqs_cat
            noclk_seqs_mid = new_noclk_seqs_mid
            noclk_seqs_cat = new_noclk_seqs_cat

            if len(lengths_x) < 1:
                return None, None, None, None

        n_samples = len(seqs_mid)
        maxlen_x = np.max(lengths_x) + 1
        neg_samples = len(noclk_seqs_mid[0][0])

        mid_his = np.zeros((n_samples, maxlen_x)).astype('int64')
        cat_his = np.zeros((n_samples, maxlen_x)).astype('int64')
        noclk_mid_his = np.zeros(
            (n_samples, maxlen_x, neg_samples)).astype('int64')
        noclk_cat_his = np.zeros(
            (n_samples, maxlen_x, neg_samples)).astype('int64')
        mid_mask = np.zeros((n_samples, maxlen_x)).astype('float32')
        for idx, [s_x, s_y, no_sx, no_sy] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat)):
            mid_mask[idx, :lengths_x[idx] + 1] = 1.
            mid_his[idx, :lengths_x[idx]] = s_x
            cat_his[idx, :lengths_x[idx]] = s_y
            noclk_mid_his[idx, :lengths_x[idx], :] = no_sx
            noclk_cat_his[idx, :lengths_x[idx], :] = no_sy

        uids = np.array([inp[0] for inp in input], dtype=np.int64)
        mids = np.array([inp[1] for inp in input], dtype=np.int64)
        cats = np.array([inp[2] for inp in input], dtype=np.int64)

        id_values = np.ones([n_samples], np.float32)
        his_values = np.ones([n_samples * maxlen_x], np.float32)
        neg_his_values = np.ones(
            [n_samples * maxlen_x * neg_samples], np.float32)

        id_seg = np.array([i + 1 for i in range(n_samples)], dtype=np.int32)
        his_seg = np.array(
            [i + 1 for i in range(n_samples * maxlen_x)], dtype=np.int32)
        neg_his_seg = np.array(
            [i + 1 for i in range(n_samples * maxlen_x * neg_samples)], dtype=np.int32)

        results = []
        for e in [uids, mids, cats]:
            results.append(np.reshape(e, (-1)))
            results.append(id_values)
            results.append(id_seg)
        for e in [mid_his, cat_his]:
            results.append(np.reshape(e, (-1)))
            results.append(his_values)
            results.append(his_seg)
        if return_neg:
            for e in [noclk_mid_his, noclk_cat_his]:
                results.append(np.reshape(e, (-1)))
                results.append(neg_his_values)
                results.append(neg_his_seg)
        results.extend(
            [mid_mask, np.array(target, dtype=np.float32), np.array(lengths_x, dtype=np.int32)])
        # for split
        results.append(np.array([n_samples, n_samples], dtype=np.int32))
        # shape
        results.extend([np.array([-1, self.embedding_dim], dtype=np.int32),
                        np.array([-1, maxlen_x, self.embedding_dim],
                                 dtype=np.int32),
                        np.array(
                            [-1, maxlen_x, neg_samples, self.embedding_dim], dtype=np.int32),
                        np.array([-1, maxlen_x], dtype=np.int32)])
        return results
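A hypothetical way to wire SampleIO into an xdl training loop, assuming the default data files exist and xdl is installed:

sample_io = SampleIO(embedding_dim=18)
n_uid, n_mid, n_cat = sample_io.get_n()
# next_train() returns the xdl SparseTensors for uid/mid/cat and the history
# sequences, followed by the mask, target, length, split and shape tensors
# built in prepare_data above.
train_tensors = sample_io.next_train()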
def train(
        train_file = "local_train_splitByUser",
        test_file = "local_test_splitByUser",
        uid_voc = "uid_voc.pkl",
        mid_voc = "mid_voc.pkl",
        cat_voc = "cat_voc.pkl",
        batch_size = 128,
        maxlen = 100,
        test_iter = 100,
        save_iter = 100,
        model_type = 'DNN',
        seed = 2,
):
    model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
    best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False)
        test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
        n_uid, n_mid, n_cat = train_data.get_n()
        if model_type == 'DNN':
            model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'PNN':
            model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'Wide':
            model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN':
            model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-att-gru':
            model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-gru-att':
            model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-qa-attGru':
            model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIN-V2-gru-vec-attGru':
            model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        elif model_type == 'DIEN':
            model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        else:
            print ("Invalid model_type : %s", model_type)
            return
        # model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sys.stdout.flush()
        print('                                                                                      test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, best_model_path))
        sys.stdout.flush()

        start_time = time.time()
        iter = 0
        lr = 0.001
        for itr in range(3):
            loss_sum = 0.0
            accuracy_sum = 0.
            aux_loss_sum = 0.
            for src, tgt in train_data:
                uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src, tgt, maxlen, return_neg=True)
                loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr, noclk_mids, noclk_cats])
                loss_sum += loss
                accuracy_sum += acc
                aux_loss_sum += aux_loss
                iter += 1
                sys.stdout.flush()
                if (iter % test_iter) == 0:
                    print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' % \
                                          (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                    print('                                                                                          test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, best_model_path))
                    loss_sum = 0.0
                    accuracy_sum = 0.0
                    aux_loss_sum = 0.0
                if (iter % save_iter) == 0:
                    print('save model iter: %d' %(iter))
                    model.save(sess, model_path+"--"+str(iter))
            lr *= 0.5