Ejemplo n.º 1
0
def ceshi_file(data_dir):
    """Convert the test CSV into the session/item and user/session text files.

    Reads ``test.csv`` from ``data_dir`` and builds, in row order:

    * one ``[session_id, [itemid1], [itemid2]]`` entry per row, and
    * a mapping from ``userid`` to the list of that user's ``session_id``
      values (one entry per row, duplicates kept).

    Both structures are then written through ``p2f`` into the ``test``
    sub-directory of ``data_dir``.

    Args:
        data_dir: Directory holding ``test.csv``; output paths are built by
            appending backslash-separated suffixes to it.
    """
    test = pd.read_csv(data_dir + '\\test.csv')
    test_session_item_list = []
    user_session_dic = {}

    # DataFrame.ix was removed in pandas 1.0, so walk the four columns in
    # lockstep instead of doing a positional lookup per cell.
    rows = zip(test['userid'], test['session_id'],
               test['itemid1'], test['itemid2'])
    for cur_user, cur_session, item_bought, item_clicked in rows:
        user_session_dic.setdefault(cur_user, []).append(cur_session)
        # Row layout: session id, then the itemid1 list, then the itemid2 list.
        test_session_item_list.append(
            [cur_session, [item_bought], [item_clicked]])

    p2f.print_data_lists_to_file(test_session_item_list,
                                 data_dir + '\\test\\session_item.txt')
    p2f.print_list_dict_to_file(user_session_dic,
                                data_dir + '\\test\\user_session.txt')

    print('test file already')
Ejemplo n.º 2
0
    def get_data(file_dir):
        """Load the session/item data and the item-to-session data of a directory.

        The session/item file is parsed by ``get_session_item_and_user_data``.
        The item ids are read from the first line of the items file, which is
        treated as a comma-separated list of integers. The item/session data
        is loaded from its cache file when that file exists; otherwise it is
        computed with ``extract_item_data`` and written to the cache file.

        Args:
            file_dir: Directory containing ``session_item.txt`` and
                ``items.txt``; file names are appended with a backslash.

        Returns:
            A ``(user_sessions_data, data, item_session_data)`` tuple.
        """
        session_item_file_path = file_dir + r"\session_item.txt"
        item_file_path = file_dir + r"\items.txt"
        # ``data`` is the session/item data.
        data, user_sessions_data = get_session_item_and_user_data(
            session_item_file_path)

        all_data_items = list()
        with open(item_file_path, 'r') as item_file:
            try:
                # Only the first line of the file is read.
                for item_str in item_file.readline().split(','):
                    # Strip so a trailing newline or padding around a token
                    # is not handed to int() as a bogus value.
                    item_str = item_str.strip()
                    if item_str:
                        all_data_items.append(int(item_str))
            except (OSError, ValueError) as e:
                # Best effort: keep whatever was parsed before the failure.
                print(e)

        # Load the item/session data, using the cache file when available.
        item_session_file_path = file_dir + r"\item_session.txt"
        if os.path.exists(item_session_file_path):
            item_session_data = rff.get_data_lists(item_session_file_path)
        else:
            item_session_data = extract_item_data(data, all_data_items)
            p2f.print_data_lists_to_file(item_session_data,
                                         item_session_file_path)
        print("finish get item session data")
        return user_sessions_data, data, item_session_data
Ejemplo n.º 3
0
def data_selection(in_file_path, out_file_dir):
    """Keep only the records whose second field has fewer than two entries.

    Records are read with ``rff.get_data_lists``; the kept records and the
    items extracted from them by ``extract_items`` are written to
    ``session_item.txt`` and ``items.txt`` under ``out_file_dir``.

    Args:
        in_file_path: Path of the data-lists file to filter.
        out_file_dir: Directory prefix for the two output files.
    """
    session_item_target = out_file_dir + r'\session_item.txt'
    items_target = out_file_dir + r'\items.txt'

    # record[1] is the list the original code names ``buy_items``.
    kept_records = [
        record
        for record in rff.get_data_lists(in_file_path)
        if len(record[1]) < 2
    ]
    kept_items = extract_items(kept_records)

    p2f.print_data_lists_to_file(kept_records, session_item_target)
    p2f.print_list_to_file(kept_items, items_target)
Ejemplo n.º 4
0
def sample_patition(rate, origin_file_dir, sampling_file_dir):
    """Sample the session/item data of one directory into another directory.

    The full data is read from ``session_item.txt`` under
    ``origin_file_dir``, passed to ``sample_partition_help`` together with
    ``rate``, and the returned sample data and sample items are written to
    ``session_item.txt`` and ``items.txt`` under ``sampling_file_dir``. The
    output directory is created first when it does not exist.

    Args:
        rate: Forwarded unchanged to ``sample_partition_help``.
        origin_file_dir: Directory prefix of the input file.
        sampling_file_dir: Directory prefix of the output files.
    """
    source_path = origin_file_dir + r"\session_item.txt"

    # The output directory must exist before the writers are called.
    if not os.path.exists(sampling_file_dir):
        os.makedirs(sampling_file_dir)
    sampled_data_path = sampling_file_dir + r"\session_item.txt"
    sampled_items_path = sampling_file_dir + r"\items.txt"

    # Read everything, sample, then write both results.
    full_data = rff.get_data_lists(source_path)
    sampled_data, sampled_items = sample_partition_help(full_data, rate)
    p2f.print_data_lists_to_file(sampled_data, sampled_data_path)
    p2f.print_list_to_file(sampled_items, sampled_items_path)
Ejemplo n.º 5
0
def extract_real_data1(click_file_path, buys_file_path, write_file_dir):
    """Build session/item data for every session found in the click file.

    Every session from the click file is kept, including sessions without
    any purchase and sessions in which every clicked item was bought. For
    each session one row ``[session, bought_items, clicked_not_bought_items]``
    is produced. The rows and the set of all clicked items are written to
    ``session_item.txt`` and ``items.txt`` under ``write_file_dir``.

    Args:
        click_file_path: Click log, parsed by ``get_session_itemList``.
        buys_file_path: Purchase log, parsed by ``get_session_itemList``.
        write_file_dir: Directory prefix for the two output files.
    """
    print("processing click file...")
    # dic1 maps each session to the items it clicked.
    dic1, all_sessions, _ = get_session_itemList(click_file_path)

    print("processing buy file...")
    # dic2 maps each buying session to the list of items it bought.
    dic2, all_buy_sessions, _ = get_session_itemList(buys_file_path)

    print("extracting session_item_data...")
    data = list()
    all_data_items_set = set()
    # A set gives constant-time "did this session buy anything?" checks.
    buy_sessions = set(all_buy_sessions)

    # enumerate drives the progress counter; the previous hand-rolled counter
    # was never incremented, so the progress message was never printed.
    for idx, d in enumerate(all_sessions, start=1):
        if idx % 10000 == 0:
            print("processing all_session, idx:", idx)
        click_items_list = dic1[d]
        all_data_items_set.update(click_items_list)
        buy_items_list = dic2[d] if d in buy_sessions else list()
        click_not_buy_items_set = set(click_items_list) - set(buy_items_list)
        # Row layout: session; bought items; clicked-but-not-bought items.
        data.append([d, buy_items_list, list(click_not_buy_items_set)])

    all_data_items = list(all_data_items_set)
    # Write the results to disk.
    session_item_write_path = write_file_dir + r"\session_item.txt"
    item_write_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(data, session_item_write_path)
    p2f.print_list_to_file(all_data_items, item_write_path)
Ejemplo n.º 6
0
def extract_real_data2(click_file_path, buys_file_path, write_file_dir):
    """Build session/item data for sessions that bought exactly one item.

    Only sessions listed by the purchase file are visited, and only those
    whose purchase list has length one are kept. Each kept session yields a
    row ``[session, bought_items, clicked_not_bought_items]``. The rows and
    the clicked items of the kept sessions are written to
    ``session_item.txt`` and ``items.txt`` under ``write_file_dir``.

    Args:
        click_file_path: Click log, parsed by ``get_session_itemList``.
        buys_file_path: Purchase log, parsed by ``get_session_itemList``.
        write_file_dir: Directory prefix for the two output files.
    """
    print("processing click file...")
    # Session -> clicked items.
    clicks_by_session, _, _ = get_session_itemList(click_file_path)

    print("processing buy file...")
    # Session -> bought items, plus the list of sessions that bought.
    buys_by_session, buying_sessions, _ = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    rows = []
    kept_items = set()
    for session in buying_sessions:
        clicked = clicks_by_session[session]
        bought = buys_by_session[session]
        if len(bought) != 1:
            # Sessions with any other number of purchases are skipped.
            continue
        kept_items.update(clicked)
        clicked_only = set(clicked) - set(bought)
        # Row layout: session; bought items; clicked-but-not-bought items.
        rows.append([session, bought, list(clicked_only)])

    # Write the results to disk.
    p2f.print_data_lists_to_file(rows, write_file_dir + r"\session_item.txt")
    p2f.print_list_to_file(list(kept_items), write_file_dir + r"\items.txt")

    print("finish extracting real data")
Ejemplo n.º 7
0
def train_file(data_dir):
    """Convert the training CSV into the text files consumed by the model.

    Reads ``train.csv`` from ``data_dir`` and builds, in row order:

    * one ``[session_id, [itemid1], [itemid2]]`` entry per row,
    * a mapping from ``userid`` to the list of its ``session_id`` values, and
    * a mapping from item id to ``[sessions_as_itemid1, sessions_as_itemid2]``.

    The three structures are written through ``p2f`` into the ``train``
    sub-directory of ``data_dir``.

    Args:
        data_dir: Directory holding ``train.csv``; output paths are built by
            appending backslash-separated suffixes to it.
    """
    train = pd.read_csv(data_dir + '\\train.csv')
    train_session_item_list = []
    train_item_session_dic = {}
    user_session_dic = {}

    # DataFrame.ix was removed in pandas 1.0, so walk the four columns in
    # lockstep instead of doing a positional lookup per cell.
    rows = zip(train['userid'], train['session_id'],
               train['itemid1'], train['itemid2'])
    for cur_user, cur_session, item_bought, item_clicked in rows:
        user_session_dic.setdefault(cur_user, []).append(cur_session)
        train_session_item_list.append(
            [cur_session, [item_bought], [item_clicked]])

        # Slot 0 collects sessions where the item came from itemid1,
        # slot 1 those where it came from itemid2.
        train_item_session_dic.setdefault(
            item_bought, [[], []])[0].append(cur_session)
        train_item_session_dic.setdefault(
            item_clicked, [[], []])[1].append(cur_session)

    p2f.print_data_lists_to_file(train_session_item_list,
                                 data_dir + '\\train\\session_item.txt')
    p2f.print_list_dict_to_file(user_session_dic,
                                data_dir + '\\train\\user_session.txt')
    p2f.print_2lists_dict_to_file(train_item_session_dic,
                                  data_dir + '\\train\\item_session.txt')

    print('train file already')
Ejemplo n.º 8
0
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    """Keep only the test records whose items all occur in the training items.

    The training items are read from ``items.txt`` under ``out_file_dir``.
    A test record is kept when every item in its second and third fields is
    one of those training items. The kept records and the items extracted
    from them by ``extract_items`` are written to ``session_item.txt`` and
    ``items.txt`` under ``out_test_file_dir``.

    Args:
        out_file_dir: Directory prefix of the training ``items.txt``.
        in_test_file_path: Path of the test data-lists file.
        out_test_file_dir: Directory prefix for the two output files.
    """
    train_items_file_path = out_file_dir + r'\items.txt'
    # Build the set once so each membership check is constant-time instead of
    # a scan over the whole training item list.
    train_items = set(rff.get_int_list(train_items_file_path))
    test_data = rff.get_data_lists(in_test_file_path)

    test_data_selected = [
        cur_test_data
        for cur_test_data in test_data
        if all(item in train_items
               for item in cur_test_data[1] + cur_test_data[2])
    ]

    test_items_selected = extract_items(test_data_selected)
    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
Ejemplo n.º 9
0
def extract_real_data(click_file_path, buys_file_path, write_file_dir):
    """Build session/item data for buying sessions and dump summary lists.

    Only sessions listed by the purchase file are visited. A session whose
    click list and purchase list have the same length is set aside as
    "bought everything it clicked"; every other buying session yields a row
    ``[session, bought_items, clicked_not_bought_items]``. The rows go to
    ``session_item.txt`` and eight summary lists (sessions and items by
    category) go to their own files under ``write_file_dir``. All
    intermediate collections are also printed to stdout.

    Args:
        click_file_path: Click log, parsed by ``get_session_itemList``.
        buys_file_path: Purchase log, parsed by ``get_session_itemList``.
        write_file_dir: Directory prefix for all output files.
    """
    print("processing click file...")
    # Session -> clicked items, all sessions, set of all clicked items.
    clicks_by_session, all_sessions, all_items_set = get_session_itemList(
        click_file_path)

    print("processing buy file...")
    # Session -> bought items, buying sessions, set of all bought items.
    buys_by_session, all_buy_sessions, all_buy_items_set = (
        get_session_itemList(buys_file_path))

    print("extracting session_item_data...")
    bought_all_sessions = []   # click count equals purchase count
    mixed_sessions = []        # sessions that end up in the data rows
    rows = []
    row_items = set()          # clicked items of the mixed sessions
    for session in all_buy_sessions:
        clicked = clicks_by_session[session]
        clicked_count = len(clicked)
        bought = buys_by_session[session]
        bought_count = len(bought)
        if clicked_count == bought_count:
            # Set aside: treated as having bought every clicked item.
            bought_all_sessions.append(session)
            continue
        mixed_sessions.append(session)
        row_items.update(clicked)
        clicked_only = set(clicked) - set(bought)
        # Row layout: session; bought items; clicked-but-not-bought items.
        rows.append([session, bought, list(clicked_only)])

    # Sessions that are in neither of the two buying categories.
    bought_none_sessions_set = (
        set(all_sessions) - set(bought_all_sessions) - set(mixed_sessions))
    # Items that were clicked but never bought.
    never_bought_items_set = all_items_set - all_buy_items_set

    print("session item data: ", rows)
    print("0、(data中的)既有购买商品,也有点击不购买商品的session: ", mixed_sessions)
    print("1、all_sessions: ", all_sessions)
    print("2、所有点击商品都购买了的session: ", bought_all_sessions)
    print("3、所有点击商品都没有购买的session: ", bought_none_sessions_set)
    print("4、data中的所有item: ", row_items)
    print("5、all_items: ", all_items_set)
    print("6、所有被购买的item: ", all_buy_items_set)
    print("7、所有点击过但没被买的item: ", never_bought_items_set)
    print("printing to file...")

    # Write the data rows first, then one file per summary list.
    p2f.print_data_lists_to_file(rows, write_file_dir + r"\session_item.txt")
    summary_outputs = [
        ("all_data_sessions.txt", mixed_sessions),
        ("all_sessions.txt", all_sessions),
        ("allClicksBuy_sessions.txt", bought_all_sessions),
        ("allClicksNotBuy_sessions.txt", list(bought_none_sessions_set)),
        ("items.txt", list(row_items)),
        ("all_items.txt", list(all_items_set)),
        ("all_buy_items.txt", list(all_buy_items_set)),
        ("all_clickNotBuy_items.txt", list(never_bought_items_set)),
    ]
    for file_name, values in summary_outputs:
        p2f.print_list_to_file(values, write_file_dir + "\\" + file_name)

    print("finish extracting real data")