Beispiel #1
0
def get_data(raw_data,
             data_dir=FLAGS.data_dir,
             combine_att=FLAGS.combine_att,
             logits_size_tr=FLAGS.item_vocab_size,
             thresh=FLAGS.vocab_min_thresh,
             use_user_feature=FLAGS.use_user_feature,
             test=FLAGS.test,
             mylog=mylog,
             use_item_feature=FLAGS.use_item_feature,
             recommend=False):

    (data_tr, data_va, u_attr, i_attr, item_ind2logit_ind, logit_ind2item_ind,
     user_index,
     item_index) = read_attributed_data(raw_data_dir=raw_data,
                                        data_dir=data_dir,
                                        combine_att=combine_att,
                                        logits_size_tr=logits_size_tr,
                                        thresh=thresh,
                                        use_user_feature=use_user_feature,
                                        use_item_feature=use_item_feature,
                                        test=test,
                                        mylog=mylog)

    # remove unk
    data_tr = [p for p in data_tr if (p[1] in item_ind2logit_ind)]

    # remove items before week 40
    if FLAGS.after40:
        data_tr = [p for p in data_tr if (to_week(p[2]) >= 40)]

    # item frequency (for sampling)
    item_population, p_item = item_frequency(data_tr, FLAGS.power)

    # UNK and START
    # print(len(item_ind2logit_ind))
    # print(len(logit_ind2item_ind))
    # print(len(item_index))
    START_ID = len(item_index)
    # START_ID = i_attr.get_item_last_index()
    item_ind2logit_ind[START_ID] = 0
    seq_all = form_sequence(data_tr, maxlen=FLAGS.L)
    seq_tr0, seq_va0 = split_train_dev(seq_all, ratio=0.05)

    # calculate buckets
    global _buckets
    _buckets = calculate_buckets(seq_tr0 + seq_va0, FLAGS.L, FLAGS.n_bucket)
    _buckets = sorted(_buckets)

    # split_buckets
    seq_tr = split_buckets(seq_tr0, _buckets)
    seq_va = split_buckets(seq_va0, _buckets)

    # get test data
    if recommend:
        from evaluate import Evaluation as Evaluate
        evaluation = Evaluate(raw_data, test=test)
        uids = evaluation.get_uinds()  # abuse of 'uids'  : actually uinds
        seq_test = form_sequence_prediction(seq_all, uids, FLAGS.L, START_ID)
        _buckets = calculate_buckets(seq_test, FLAGS.L, FLAGS.n_bucket)
        _buckets = sorted(_buckets)
        seq_test = split_buckets(seq_test, _buckets)
    else:
        seq_test = []
        evaluation = None
        uids = []

    # create embedAttr

    devices = get_device_address(FLAGS.N)
    with tf.device(devices[0]):
        u_attr.set_model_size(FLAGS.size)
        i_attr.set_model_size(FLAGS.size)

        # if not FLAGS.use_item_feature:
        #     mylog("NOT using item attributes")
        #     i_attr.num_features_cat = 1
        #     i_attr.num_features_mulhot = 0

        # if not FLAGS.use_user_feature:
        #     mylog("NOT using user attributes")
        #     u_attr.num_features_cat = 1
        #     u_attr.num_features_mulhot = 0

        embAttr = embed_attribute.EmbeddingAttribute(u_attr,
                                                     i_attr,
                                                     FLAGS.batch_size,
                                                     FLAGS.n_sampled,
                                                     _buckets[-1],
                                                     FLAGS.use_sep_item,
                                                     item_ind2logit_ind,
                                                     logit_ind2item_ind,
                                                     devices=devices)

        if FLAGS.loss in ["warp", 'mw']:
            prepare_warp(embAttr, seq_tr0, seq_va0)

    return seq_tr, seq_va, seq_test, embAttr, START_ID, item_population, p_item, evaluation, uids, user_index, item_index, logit_ind2item_ind
Beispiel #2
0
def recommend(raw_data=FLAGS.raw_data, test=FLAGS.test, loss=FLAGS.loss, 
  batch_size=FLAGS.batch_size, topN=FLAGS.top_N_items,
  device_log=FLAGS.device_log):

  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, 
    log_device_placement=device_log)) as sess:
    mylog("reading data")

    (_, items_dev, _, _, u_attributes, i_attributes, item_ind2logit_ind, 
      logit_ind2item_ind, _, user_index, item_index) = get_data(raw_data, 
      data_dir=FLAGS.data_dir)
    
    from evaluate import Evaluation as Evaluate
    
    evaluation = Evaluate(raw_data, test=test)
    
    model = create_model(sess, u_attributes, i_attributes, item_ind2logit_ind,
      logit_ind2item_ind, loss=loss, ind_item=None)

    Uinds = evaluation.get_uinds()
    N = len(Uinds)
    mylog("N = %d" % N)
    Uinds = [p for p in Uinds if p in items_dev]
    mylog("new N = {}, (reduced from original {})".format(len(Uinds), N))
    if len(Uinds) < N:
      evaluation.set_uinds(Uinds)
    N = len(Uinds)
    rec = np.zeros((N, topN), dtype=int)
    count = 0
    time_start = time.time()
    for idx_s in range(0, N, batch_size):
      count += 1
      if count % 100 == 0:
        mylog("idx: %d, c: %d" % (idx_s, count))
        
      idx_e = idx_s + batch_size
      if idx_e <= N:
        users = Uinds[idx_s: idx_e]
        items_input = [items_dev[u] for u in users]
        items_input = map(list, zip(*items_input))
        recs = model.step(sess, users, items_input, forward_only=True, 
          recommend = True, recommend_new = FLAGS.recommend_new)
        rec[idx_s:idx_e, :] = recs
      else:
        users = range(idx_s, N) + [0] * (idx_e - N)
        users = [Uinds[t] for t in users]
        items_input = [items_dev[u] for u in users]
        items_input = map(list, zip(*items_input))
        recs = model.step(sess, users, items_input, forward_only=True, 
          recommend = True, recommend_new = FLAGS.recommend_new)
        idx_e = N
        rec[idx_s:idx_e, :] = recs[:(idx_e-idx_s),:]
    # return rec: i:  uinds[i] --> logid

    time_end = time.time()
    mylog("Time used %.1f" % (time_end - time_start))

    ind2id = {}
    for iid in item_index:
      uind = item_index[iid]
      assert(uind not in ind2id)
      ind2id[uind] = iid
    
    uids = evaluation.get_uids()
    R = {}
    for i in xrange(N):
      uid = uids[i]
      R[uid] = [ind2id[logit_ind2item_ind[v]] for v in list(rec[i, :])]

    evaluation.eval_on(R)
    scores_self, scores_ex = evaluation.get_scores()
    mylog("====evaluation scores (NDCG, RECALL, PRECISION, MAP) @ 2,5,10,20,30====")
    mylog("METRIC_FORMAT (self): {}".format(scores_self))
    mylog("METRIC_FORMAT (ex  ): {}".format(scores_ex))

  return