Esempio n. 1
0
def data_selection(in_file_path, out_file_dir):
    """Keep the records with fewer than two bought items and write them out.

    Records are loaded with ``rff.get_data_lists(in_file_path)``. A record is
    kept when its field at index 1 (the bought items) has a length below 2.
    The kept records go to ``session_item.txt`` and the result of
    ``extract_items`` on them goes to ``items.txt``.

    Args:
        in_file_path: Path handed to ``rff.get_data_lists``.
        out_file_dir: Output directory; file names are appended with a
            literal backslash separator.
    """
    session_item_target = out_file_dir + r'\session_item.txt'
    items_target = out_file_dir + r'\items.txt'

    records = rff.get_data_lists(in_file_path)
    # record[1] is the list of bought items for that record.
    kept_records = [record for record in records if len(record[1]) < 2]
    kept_items = extract_items(kept_records)

    p2f.print_data_lists_to_file(kept_records, session_item_target)
    p2f.print_list_to_file(kept_items, items_target)
Esempio n. 2
0
def sample_patition(rate, origin_file_dir, sampling_file_dir):
    """Sample the session/item data of one directory into another directory.

    Reads ``session_item.txt`` from ``origin_file_dir``, passes the loaded
    data and ``rate`` to ``sample_partition_help`` and writes the two values
    it returns to ``session_item.txt`` and ``items.txt`` inside
    ``sampling_file_dir``.

    Args:
        rate: Sampling rate, forwarded unchanged to ``sample_partition_help``.
        origin_file_dir: Directory that holds the full ``session_item.txt``.
        sampling_file_dir: Directory that receives the sampled files; it is
            created (including parents) when missing.
    """
    origin_file_path = origin_file_dir + r"\session_item.txt"

    # exist_ok=True replaces the former "exists() then makedirs()" pair, which
    # could fail if the directory appeared between the check and the creation.
    os.makedirs(sampling_file_dir, exist_ok=True)
    data_write_path = sampling_file_dir + r"\session_item.txt"
    items_write_path = sampling_file_dir + r"\items.txt"

    # Load the complete data set.
    all_data = rff.get_data_lists(origin_file_path)
    # Draw the sample.
    sample_data, sample_items = sample_partition_help(all_data, rate)
    # Write the sampled data.
    p2f.print_data_lists_to_file(sample_data, data_write_path)
    p2f.print_list_to_file(sample_items, items_write_path)
Esempio n. 3
0
def extract_real_data1(click_file_path, buys_file_path, write_file_dir):
    """Build session/item records for every clicked session and write them.

    Every session found in the click file is emitted, including sessions
    without purchases and sessions where every clicked item was bought.
    Each record is ``[session, bought_items, clicked_but_not_bought_items]``.
    The records go to ``session_item.txt`` and the set of all clicked items
    goes to ``items.txt``, both under ``write_file_dir``.

    Args:
        click_file_path: Click log, parsed with ``get_session_itemList``.
        buys_file_path: Purchase log, parsed with ``get_session_itemList``.
        write_file_dir: Output directory; file names are appended with a
            literal backslash separator.
    """
    print("processing click file...")
    # dic1 maps each session to the items it clicked.
    dic1, all_sessions, _all_items_set = get_session_itemList(click_file_path)

    print("processing buy file...")
    # dic2 maps each buying session to the list of items it bought.
    dic2, all_buy_sessions, _all_buy_items_set = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    data = []
    all_data_items_set = set()
    # Set lookup tells in O(1) whether a session bought anything.
    buy_sessions = set(all_buy_sessions)

    # enumerate drives the progress counter; the previous hand-kept counter
    # was never incremented, so the progress message could never be printed.
    for idx, d in enumerate(all_sessions, start=1):
        if idx % 10000 == 0:
            print("processing all_session, idx:", idx)
        # All items clicked in the current session.
        click_items_list = dic1[d]
        for item in click_items_list:
            all_data_items_set.add(item)
        # All items bought in the current session (empty when none).
        buy_items_list = dic2[d] if d in buy_sessions else []
        click_not_buy_items_set = set(click_items_list) - set(buy_items_list)
        # Record layout: session; bought items; clicked-but-not-bought items.
        data.append([d, buy_items_list, list(click_not_buy_items_set)])

    all_data_items = list(all_data_items_set)
    # Write the results to disk.
    session_item_write_path = write_file_dir + r"\session_item.txt"
    item_write_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(data, session_item_write_path)
    p2f.print_list_to_file(all_data_items, item_write_path)
Esempio n. 4
0
def extract_real_data2(click_file_path, buys_file_path, write_file_dir):
    """Build session/item records for sessions that bought exactly one item.

    Only sessions listed in the purchase data are visited, and only those
    whose bought-item list has length 1 are kept. Each kept record is
    ``[session, bought_items, clicked_but_not_bought_items]``. The records
    are written to ``session_item.txt`` and the clicked items of the kept
    sessions to ``items.txt``, both under ``write_file_dir``.

    Args:
        click_file_path: Click log, parsed with ``get_session_itemList``.
        buys_file_path: Purchase log, parsed with ``get_session_itemList``.
        write_file_dir: Output directory; file names are appended with a
            literal backslash separator.
    """
    print("processing click file...")
    # Session -> items clicked in that session.
    clicks_by_session, all_sessions, all_items_set = get_session_itemList(
        click_file_path)

    print("processing buy file...")
    # Session -> list of items bought in that session.
    buys_by_session, buying_sessions, all_buy_items_set = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    records = []
    kept_items = set()
    for session in buying_sessions:
        # Both lookups happen before the filter, exactly as a plain
        # "fetch, then test" sequence would do.
        clicked = clicks_by_session[session]
        bought = buys_by_session[session]
        if len(bought) != 1:
            continue

        # Collect the clicked items of single-purchase sessions only.
        for item in clicked:
            kept_items.add(item)
        not_bought = set(clicked) - set(bought)
        # Record layout: session; bought items; clicked-but-not-bought items.
        records.append([session, bought, list(not_bought)])

    kept_items_list = list(kept_items)
    # Write the results to disk.
    records_path = write_file_dir + r"\session_item.txt"
    items_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(records, records_path)
    p2f.print_list_to_file(kept_items_list, items_path)

    print("finish extracting real data")
Esempio n. 5
0
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    """Keep only the test records whose items all occur in the training items.

    The training items are read from ``items.txt`` in ``out_file_dir``. A test
    record is kept when every item in its fields 1 and 2 is among them. The
    kept records are written to ``session_item.txt`` and the result of
    ``extract_items`` on them to ``items.txt``, both in ``out_test_file_dir``.

    Args:
        out_file_dir: Directory containing the training ``items.txt``.
        in_test_file_path: Path of the test data, read with
            ``rff.get_data_lists``.
        out_test_file_dir: Directory receiving the filtered test files.
    """
    train_items_file_path = out_file_dir + r'\items.txt'
    # A set gives O(1) membership tests; scanning the list for every item
    # made the filter quadratic in the data size.
    train_items = set(rff.get_int_list(train_items_file_path))
    test_data = rff.get_data_lists(in_test_file_path)

    # all() stops at the first unknown item, like the former break.
    test_data_selected = [
        cur_test_data
        for cur_test_data in test_data
        if all(item in train_items
               for item in cur_test_data[1] + cur_test_data[2])
    ]

    test_items_selected = extract_items(test_data_selected)
    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
Esempio n. 6
0
def print_partition(out_dir, train_session_item, test_session_item):
    """Write a train/test partition into ``train`` and ``test`` sub-folders.

    For each half, the session/item mapping is written to
    ``session_item.txt`` and the result of ``extract_items2`` on it to
    ``items.txt``. Both sub-folders are created when they do not exist.

    Args:
        out_dir: Parent directory of the ``train`` and ``test`` folders.
        train_session_item: Training data, passed to ``extract_items2`` and
            ``p2f.print_2lists_dict_to_file``.
        test_session_item: Test data, handled the same way.
    """
    train_items_list = extract_items2(train_session_item)
    test_items_list = extract_items2(test_session_item)

    # Output locations. exist_ok=True replaces the former
    # "exists() then makedirs()" pair, which could fail if the directory
    # appeared between the check and the creation.
    out_train_dir = out_dir + r'\train'
    os.makedirs(out_train_dir, exist_ok=True)
    out_train_data_path = out_train_dir + r'\session_item.txt'
    out_train_item_path = out_train_dir + r'\items.txt'

    out_test_dir = out_dir + r'\test'
    os.makedirs(out_test_dir, exist_ok=True)
    out_test_data_path = out_test_dir + r'\session_item.txt'
    out_test_item_path = out_test_dir + r'\items.txt'

    # Write the data.
    p2f.print_2lists_dict_to_file(train_session_item, out_train_data_path)
    p2f.print_list_to_file(train_items_list, out_train_item_path)
    p2f.print_2lists_dict_to_file(test_session_item, out_test_data_path)
    p2f.print_list_to_file(test_items_list, out_test_item_path)
    '''
Esempio n. 7
0
def extract_real_data(click_file_path, buys_file_path, write_file_dir):
    """Build session/item records for buying sessions and dump statistics.

    Only sessions listed in the purchase data are visited. A session whose
    click list and buy list have the same length is set aside as "bought
    everything it clicked"; every other buying session yields a record
    ``[session, bought_items, clicked_but_not_bought_items]``. The records,
    several derived session and item collections, and a console dump of all
    of them are produced; the files are written under ``write_file_dir``.

    Args:
        click_file_path: Click log, parsed with ``get_session_itemList``.
        buys_file_path: Purchase log, parsed with ``get_session_itemList``.
        write_file_dir: Output directory; file names are appended with a
            literal backslash separator.
    """
    print("processing click file...")
    # Session -> items clicked in that session.
    clicks_by_session, all_sessions, all_items_set = get_session_itemList(
        click_file_path)

    print("processing buy file...")
    # Session -> list of items bought in that session.
    buys_by_session, all_buy_sessions, all_buy_items_set = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    bought_all_sessions = []   # sessions whose click and buy counts match
    mixed_sessions = []        # sessions with bought and not-bought items
    mixed_items = set()        # clicked items of the mixed sessions
    records = []               # the session_item data itself

    for session in all_buy_sessions:
        clicked = clicks_by_session[session]
        clicked_count = len(clicked)
        bought = buys_by_session[session]
        bought_count = len(bought)

        # Equal counts are treated as "everything clicked was bought";
        # such sessions are recorded separately and produce no record.
        if clicked_count == bought_count:
            bought_all_sessions.append(session)
            continue

        mixed_sessions.append(session)
        for item in clicked:
            mixed_items.add(item)
        not_bought = set(clicked) - set(bought)
        # Record layout: session; bought items; clicked-but-not-bought items.
        records.append([session, bought, list(not_bought)])

    # Sessions that are neither "bought everything" nor "mixed".
    never_bought_sessions_set = (
        set(all_sessions) - set(bought_all_sessions) - set(mixed_sessions))
    # Items that were clicked but do not appear among the bought items.
    clicked_not_bought_items_set = all_items_set - all_buy_items_set

    print("session item data: ", records)
    print("0、(data中的)既有购买商品,也有点击不购买商品的session: ", mixed_sessions)
    print("1、all_sessions: ", all_sessions)
    print("2、所有点击商品都购买了的session: ", bought_all_sessions)
    print("3、所有点击商品都没有购买的session: ", never_bought_sessions_set)
    print("4、data中的所有item: ", mixed_items)
    print("5、all_items: ", all_items_set)
    print("6、所有被购买的item: ", all_buy_items_set)
    print("7、所有点击过但没被买的item: ", clicked_not_bought_items_set)
    print("printing to file...")

    # Main output: the session_item records.
    p2f.print_data_lists_to_file(
        records, write_file_dir + r"\session_item.txt")

    # Auxiliary outputs, each paired with its file name.
    auxiliary_outputs = [
        ("all_data_sessions.txt", mixed_sessions),
        ("all_sessions.txt", all_sessions),
        ("allClicksBuy_sessions.txt", bought_all_sessions),
        ("allClicksNotBuy_sessions.txt", list(never_bought_sessions_set)),
        ("items.txt", list(mixed_items)),
        ("all_items.txt", list(all_items_set)),
        ("all_buy_items.txt", list(all_buy_items_set)),
        ("all_clickNotBuy_items.txt", list(clicked_not_bought_items_set)),
    ]
    for file_name, values in auxiliary_outputs:
        p2f.print_list_to_file(values, write_file_dir + "\\" + file_name)

    print("finish extracting real data")