def process_meta(file):
    """Write an ``asin<TAB>category`` line for every record of a metadata dump.

    The input is the file ``file`` resolved through ``datafilename`` for the
    module-level ``category``; every line is evaluated as a Python literal
    expression yielding a mapping with ``"asin"`` and ``"categories"`` keys.
    The category emitted is the last element of the record's first
    ``categories`` entry.  Output goes to the ``item-info`` file.

    Args:
        file: Name of the metadata file, handed to ``datafilename``.
    """
    # ``with`` makes sure both handles are flushed and closed, including when
    # a malformed record raises half-way through the input.
    with open(datafilename(category, file), "r") as fi:
        with open(datafilename(category, "item-info"), "w") as fo:
            for line in fi:
                # SECURITY: eval() executes whatever expression the line
                # contains.  Only feed this function trusted dumps.
                obj = eval(line)
                cat = obj["categories"][0][-1]
                # write() + "\n" emits the same bytes as ``print >> fo`` and
                # also works on Python 3.
                fo.write(obj["asin"] + "\t" + cat + "\n")
def process_reviews(file):
    """Flatten a review dump into tab-separated ``reviews-info`` lines.

    Every input line is evaluated as a Python literal expression yielding a
    mapping; the fields ``reviewerID``, ``asin``, ``overall`` and
    ``unixReviewTime`` are written, in that order, as one tab-separated line
    of the ``reviews-info`` file.

    Args:
        file: Name of the review file, handed to ``datafilename``.
    """
    # ``with`` closes both handles deterministically (the original leaked
    # them and relied on interpreter shutdown to flush the output).
    with open(datafilename(category, file), "r") as fi:
        with open(datafilename(category, "reviews-info"), "w") as fo:
            for line in fi:
                # SECURITY: eval() executes whatever expression the line
                # contains.  Only feed this function trusted dumps.
                obj = eval(line)
                fields = (
                    obj["reviewerID"],
                    obj["asin"],
                    str(obj["overall"]),
                    str(obj["unixReviewTime"]),
                )
                fo.write("\t".join(fields) + "\n")
コード例 #3
0
def split_test_by_time(cut_time):
    """Split ``local_all_sample_by_time`` into train and test files by time.

    The timestamp of a sample is its last tab-separated column.  Samples with
    a timestamp ``<= cut_time`` go to ``local_train_by_time``, all others to
    ``local_test_by_time``.

    Args:
        cut_time: Largest timestamp that still belongs to the training set.
    """
    # One ``with`` for all three files so none of them is leaked if a line
    # fails to parse.
    with open(datafilename(category, "local_all_sample_by_time"), "r") as fi, \
            open(datafilename(category, "local_train_by_time"), "w") as ftrain, \
            open(datafilename(category, "local_test_by_time"), "w") as ftest:
        for line in fi:
            line = line.strip()
            timestamp = float(line.split("\t")[-1])
            target = ftrain if timestamp <= cut_time else ftest
            # NOTE(review): the last two characters of the stripped line are
            # dropped, exactly as the original did.  That shortens the
            # trailing timestamp column -- confirm this is intended.
            target.write(line[:-2] + "\n")
コード例 #4
0
def get_cut_timestamp(train_percent=0.85):
    """Return the timestamp that separates the training part from the rest.

    Timestamps are read from the last tab-separated column of
    ``local_all_sample_by_time``.  The result is the timestamp found at rank
    ``int(sample_count * train_percent)`` in ascending order, where
    ``sample_count`` comes from ``file_len`` on the same file.

    Args:
        train_percent: Fraction of the samples meant for training.

    Returns:
        The cut timestamp as a float.

    Raises:
        IndexError: If the sample file contains no lines.
    """
    path = datafilename(category, "local_all_sample_by_time")
    train_size = int(file_len(path) * train_percent)

    # ``with`` closes the handle the original left open.
    with open(path, "r") as fi:
        time_list = [float(line.strip().split("\t")[-1]) for line in fi]

    order = np.argsort(time_list, axis=-1)
    # With train_percent >= 1.0 the rank equals the sample count, which used
    # to raise IndexError; clamp to the latest timestamp so that "everything
    # is training data" works.
    rank = min(train_size, len(order) - 1)
    return time_list[order[rank]]
コード例 #5
0
def get_all_samples():
    """Attach each user's click history to the samples of ``jointed-new-by-time``.

    Input lines are tab-separated; the columns read here are 0 (click flag),
    1 (user), 2 (item id), 4 (timestamp), 5 (category), 6 (user list) and
    7 (user time list).  Consecutive lines of the same user are treated as one
    session: a line is written to ``local_all_sample_by_time`` only when that
    user already has at least one clicked item, and carries the history of
    clicked item ids, their categories and, per history entry, the number of
    ``gap`` thresholds that the elapsed time in days (+1) reaches.

    Output columns: click flag, user, item id, category, history item ids,
    history categories, user list, user time list, history time buckets,
    timestamp.
    """
    # NOTE(review): the separator literals in this function were empty
    # strings, so the trailing ``[:-1]`` trim removed the last character of
    # the final id instead of a separator.  "\x02" is assumed to be the
    # intended one-character separator -- confirm against the code that
    # parses these files.
    sep = "\x02"
    gap = np.array(
        [1.1, 1.4, 1.7, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096])

    last_user = "******"
    # Initialised up front so that a first user equal to the sentinel above
    # cannot hit undefined history lists.
    movie_id_list = []
    cate1_list = []
    movie_id_t_list = []

    with open(datafilename(category, "jointed-new-by-time"), "r") as fin, \
            open(datafilename(category, "local_all_sample_by_time"), "w") as fall:
        for line in fin:
            items = line.strip().split("\t")
            clk = int(items[0])
            user = items[1]
            movie_id = items[2]
            dt = items[4]
            cat1 = items[5]
            user_list = items[6]
            user_t_list = items[7]

            if user != last_user:
                # New user: start an empty history; this line is not emitted.
                movie_id_list = []
                cate1_list = []
                movie_id_t_list = []
            elif movie_id_list:
                # Only samples with at least one earlier click are written.
                dt_gap = []
                for t in movie_id_t_list:
                    elapsed_days = float(dt) / 3600.0 / 24.0 - float(
                        t) / 3600.0 / 24.0 + 1.
                    dt_gap.append(str(np.sum(elapsed_days >= gap)))
                fields = [
                    items[0], user, movie_id, cat1,
                    sep.join(movie_id_list), sep.join(cate1_list),
                    user_list, user_t_list, sep.join(dt_gap), dt,
                ]
                fall.write("\t".join(fields) + "\n")

            last_user = user
            if clk:
                # Only clicked items extend the history.
                movie_id_list.append(movie_id)
                cate1_list.append(cat1)
                movie_id_t_list.append(dt)
コード例 #6
0
def split_test_by_seqlen():
    """Bucket the test samples by the length of their item history.

    Reads ``local_test_by_time`` and looks at column 4 (the history item
    ids).  Samples with fewer than 5 history entries go to ``local_test_u1``,
    fewer than 15 to ``local_test_u2`` and the rest to ``local_test_u3``.
    """
    # NOTE(review): the original called ``split("")``, which always raises
    # ``ValueError: empty separator``.  "\x02" is assumed to be the
    # one-character separator used inside the history column -- confirm
    # against the code that writes this file.
    sep = "\x02"

    with open(datafilename(category, "local_test_by_time"), "r") as fi, \
            open(datafilename(category, "local_test_u1"), "w") as ftest_u1, \
            open(datafilename(category, "local_test_u2"), "w") as ftest_u2, \
            open(datafilename(category, "local_test_u3"), "w") as ftest_u3:
        for line in fi:
            line = line.strip()
            item_seq = line.split("\t")[4]
            sl = len(item_seq.split(sep))
            if sl < 5:
                target = ftest_u1
            elif sl < 15:
                target = ftest_u2
            else:
                target = ftest_u3
            target.write(line + "\n")
コード例 #7
0
import cPickle
import random

import numpy as np

# Dataset name; first argument of every datafilename() call below.
category = 'Amazon_Clothing_Shoes_and_Jewelry'

from path import datafilename

# Fixed seeds for both NumPy's and the stdlib's random number generators.
np.random.seed(1234)
random.seed(1234)

# Both splits are loaded fully into memory as lists of lines.
f_train = open(datafilename(category, "local_train_by_time"), "r").readlines()
f_test = open(datafilename(category, "local_test_by_time"), "r").readlines()

# Train lines first, then test lines.
f_all = f_train + f_test

# Lookup tables keyed by user id, item id and category (judging by the names
# and by the membership test below; how they are filled is not visible here).
uid_dict = {}
mid_dict = {}
cat_dict = {}

# Not used in the visible part of this fragment.
iddd = 0
for line in f_all:
    # Tab-separated columns: click flag, user id, item id, category,
    # history item ids, history categories.
    arr = line.strip("\n").split("\t")
    clk = arr[0]
    uid = arr[1]
    mid = arr[2]
    cat = arr[3]
    mid_list = arr[4]
    cat_list = arr[5]
    # NOTE(review): this fragment is cut off here; the body of the following
    # ``if`` is missing from the excerpt.
    if uid not in uid_dict:
def _format_item_clickers(clickers, user_id, cur_time, cur_day, sep):
    """Serialise the users who clicked an item no later than ``cur_time``.

    Args:
        clickers: ``(user_id, unix_time)`` tuples of one item, sorted by time
            in ascending order.
        user_id: The target user; skipped wherever it appears.
        cur_time: Timestamp of the target behaviour, in a form ``int()``
            accepts (the string read from ``reviews-info``).
        cur_day: ``cur_time`` expressed in whole days.
        sep: Separator placed between the entries of each result string.

    Returns:
        ``(user_str, user_t_str)``: the joined user ids and, for each of them,
        the number of ``gap`` thresholds reached by the day distance (+1) to
        ``cur_day``.  ``("default_user", "-1")`` when no user qualifies.
    """
    users = []
    buckets = []
    for u, t in clickers:
        if int(t) > int(cur_time):
            # Sorted input: everything from here on is in the future.
            break
        if u == user_id:
            continue
        users.append(u)
        user_t = float(cur_day) - t // 3600 // 24 + 1.
        # NOTE(review): ``gap`` is not defined in this block; it has to exist
        # at module level (get_all_samples only defines a local of that name).
        buckets.append(str(np.sum(user_t >= gap)))
    user_str = sep.join(users)
    if not user_str:
        return "default_user", "-1"
    return user_str, sep.join(buckets)


def manual_join():
    """Join reviews with item categories and add one negative per review.

    Reads ``reviews-info`` (user, item, rating, time) and ``item-info``
    (item, category) and writes ``jointed-new-by-time``.  For every review,
    taken per user in ascending time order, two tab-separated lines are
    written: first a negative sample (label ``0``) whose item is drawn at
    random from all reviewed items other than the reviewed one, then the
    positive sample (label ``1``).  Each line carries the label, the four
    review columns, the item's category (``default_cat`` when unknown), the
    other users who clicked that item up to the review time, their time
    buckets, and the review time once more.

    Raises:
        ValueError: If the reviews contain exactly one distinct item, because
            no negative item can be drawn then (the original looped forever).
    """
    # NOTE(review): the separator literals in this function were empty
    # strings, so the trailing ``[:-1]`` trim cut the last character of the
    # final user id / bucket.  "\x02" is assumed to be the intended
    # one-character separator -- confirm against the code that parses the
    # output.
    sep = "\x02"

    user_map = {}  # user id -> [(review line, time)]
    item_list = []  # one entry per review, so popular items are drawn more often
    useridToClickItem = {}  # item id -> [(user id, time)]
    # Single pass over the reviews; the original opened the file twice and
    # never closed either handle.
    with open(datafilename(category, "reviews-info"), "r") as f_rev:
        for line in f_rev:
            line = line.strip()
            items = line.split("\t")
            t = float(items[-1])
            user_map.setdefault(items[0], []).append((line, t))
            item_list.append(items[1])
            useridToClickItem.setdefault(items[1], []).append((items[0], t))

    if len(useridToClickItem) == 1:
        raise ValueError(
            "negative sampling needs at least two distinct items")

    # Sort every click list once instead of once per generated sample; the
    # sort is stable, so the resulting order is the same.
    for clickers in useridToClickItem.values():
        clickers.sort(key=lambda x: x[1])

    meta_map = {}  # item id -> category, first occurrence wins
    with open(datafilename(category, "item-info"), "r") as f_meta:
        for line in f_meta:
            arr = line.strip().split("\t")
            if arr[0] not in meta_map:
                meta_map[arr[0]] = arr[1]

    with open(datafilename(category, "jointed-new-by-time"), "w") as fo:
        for key in user_map:
            for line, _ in sorted(user_map[key], key=lambda x: x[1]):
                items = line.split("\t")
                asin = items[1]
                cur_t = float(items[3]) // 3600 // 24

                # Negative sample: redraw until the item differs from the
                # reviewed one.  randint() is kept so that a seeded run draws
                # the same sequence as before.
                while True:
                    asin_neg = item_list[random.randint(0, len(item_list) - 1)]
                    if asin_neg != asin:
                        break
                items[1] = asin_neg
                user_str, user_t_str = _format_item_clickers(
                    useridToClickItem[asin_neg], items[0], items[-1], cur_t,
                    sep)
                fo.write("\t".join(
                    ["0"] + items +
                    [meta_map.get(asin_neg, "default_cat"), user_str,
                     user_t_str, items[3]]) + "\n")

                # Positive sample: the review itself (``line`` still holds the
                # original item id).
                user_str, user_t_str = _format_item_clickers(
                    useridToClickItem[asin], items[0], items[-1], cur_t, sep)
                fo.write("\t".join(
                    ["1", line, meta_map.get(asin, "default_cat"), user_str,
                     user_t_str, items[3]]) + "\n")