def data_selection(in_file_path, out_file_dir):
    """Keep only the records that have fewer than two purchased items.

    Records are loaded with ``rff.get_data_lists``; a record survives when
    its second field (the purchased-item list) has length 0 or 1. The
    surviving records and the items ``extract_items`` derives from them are
    written to ``session_item.txt`` and ``items.txt`` under ``out_file_dir``.

    Args:
        in_file_path: Path of the session/item data file to filter.
        out_file_dir: Directory prefix for the two output files; the file
            names are appended with a literal backslash separator.
    """
    records_out_path = out_file_dir + r'\session_item.txt'
    items_out_path = out_file_dir + r'\items.txt'

    records = rff.get_data_lists(in_file_path)
    # Index 1 of each record holds the purchased items.
    kept_records = [record for record in records if len(record[1]) < 2]
    kept_items = extract_items(kept_records)

    p2f.print_data_lists_to_file(kept_records, records_out_path)
    p2f.print_list_to_file(kept_items, items_out_path)
def sample_patition(rate, origin_file_dir, sampling_file_dir):
    """Sample the session/item data of one directory into another.

    Loads ``session_item.txt`` from ``origin_file_dir``, hands the records
    and ``rate`` to ``sample_partition_help``, and writes the returned sample
    records and sample items to ``session_item.txt`` and ``items.txt`` under
    ``sampling_file_dir`` (created if it does not exist yet).

    Args:
        rate: Sampling rate, forwarded unchanged to ``sample_partition_help``.
        origin_file_dir: Directory prefix holding the full data set.
        sampling_file_dir: Directory prefix that receives the sampled files.
    """
    origin_file_path = origin_file_dir + r"\session_item.txt"

    # exist_ok=True replaces the former exists()-then-makedirs() pair, which
    # could fail if the directory appeared between the check and the create.
    os.makedirs(sampling_file_dir, exist_ok=True)
    data_write_path = sampling_file_dir + r"\session_item.txt"
    items_write_path = sampling_file_dir + r"\items.txt"

    # Read the complete data set.
    all_data = rff.get_data_lists(origin_file_path)

    # Draw the sample.
    sample_data, sample_items = sample_partition_help(all_data, rate)

    # Write the sampled data.
    p2f.print_data_lists_to_file(sample_data, data_write_path)
    p2f.print_list_to_file(sample_items, items_write_path)
def extract_real_data1(click_file_path, buys_file_path, write_file_dir):
    """Build one session/item record for every session in the click data.

    Every session returned for the click file is kept, including sessions
    that bought nothing and sessions that bought everything they clicked.
    Each record is ``[session, bought_items, clicked_but_not_bought_items]``.
    The records go to ``session_item.txt`` and the set of all clicked items
    goes to ``items.txt``, both under ``write_file_dir``.

    Args:
        click_file_path: Path of the click log parsed by
            ``get_session_itemList``.
        buys_file_path: Path of the purchase log parsed by
            ``get_session_itemList``.
        write_file_dir: Directory prefix for the two output files; the file
            names are appended with a literal backslash separator.

    Raises:
        KeyError: If a session listed for the click file is missing from the
            click mapping, or a session listed for the buy file is missing
            from the buy mapping.
    """
    print("processing click file...")
    # dic1 maps each session in the click data to the items it looked at.
    dic1, all_sessions, _ = get_session_itemList(click_file_path)

    print("processing buy file...")
    # dic2 maps each session in the buy data to the list of items it bought.
    dic2, all_buy_sessions, _ = get_session_itemList(buys_file_path)

    print("extracting session_item_data...")
    # `data` is the session_item_data that gets written out.
    data = []
    all_data_items_set = set()
    # Set of buying sessions, so the "did this session buy anything?" test
    # is a constant-time lookup.
    buy_sessions = set(all_buy_sessions)

    # enumerate() drives the progress counter; previously the counter was
    # never advanced, so the progress message below could never be printed.
    for idx, session in enumerate(all_sessions, start=1):
        if idx % 10000 == 0:
            print("processing all_session, idx:", idx)

        # All items clicked in the current session.
        click_items_list = dic1[session]
        all_data_items_set.update(click_items_list)

        # All items bought in the current session (empty if it bought none).
        buy_items_list = dic2[session] if session in buy_sessions else []

        click_not_buy_items_set = set(click_items_list) - set(buy_items_list)
        # Record layout (one per line in the output file):
        # session; bought items; clicked-but-not-bought items.
        data.append([session, buy_items_list, list(click_not_buy_items_set)])

    all_data_items = list(all_data_items_set)

    # Write the results to disk.
    session_item_write_path = write_file_dir + r"\session_item.txt"
    item_write_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(data, session_item_write_path)
    p2f.print_list_to_file(all_data_items, item_write_path)
def extract_real_data2(click_file_path, buys_file_path, write_file_dir):
    """Build session/item records for sessions that bought exactly one item.

    Only sessions listed for the buy file are visited, and of those only the
    ones whose purchase list has length 1 are kept. Each record is
    ``[session, bought_items, clicked_but_not_bought_items]``. The records go
    to ``session_item.txt`` and the items clicked by the kept sessions go to
    ``items.txt``, both under ``write_file_dir``.

    Args:
        click_file_path: Path of the click log parsed by
            ``get_session_itemList``.
        buys_file_path: Path of the purchase log parsed by
            ``get_session_itemList``.
        write_file_dir: Directory prefix for the two output files; the file
            names are appended with a literal backslash separator.

    Raises:
        KeyError: If a buying session is missing from the click mapping or
            from the buy mapping.
    """
    print("processing click file...")
    # Maps each session in the click data to the items it looked at.
    clicks_by_session, _, _ = get_session_itemList(click_file_path)

    print("processing buy file...")
    # Maps each session in the buy data to the list of items it bought.
    buys_by_session, buying_sessions, _ = get_session_itemList(buys_file_path)

    print("extracting session_item_data...")
    records = []
    kept_items = set()

    for session in buying_sessions:
        clicked = clicks_by_session[session]
        bought = buys_by_session[session]

        # Skip every session that did not buy exactly one item.
        if len(bought) != 1:
            continue

        # Only items of the kept sessions end up in the item file.
        kept_items.update(clicked)

        not_bought = set(clicked) - set(bought)
        # Record layout (one per line in the output file):
        # session; bought items; clicked-but-not-bought items.
        records.append([session, bought, list(not_bought)])

    item_list = list(kept_items)

    # Write the results to disk.
    records_path = write_file_dir + r"\session_item.txt"
    items_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(records, records_path)
    p2f.print_list_to_file(item_list, items_path)

    print("finish extracting real data")
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    """Keep only the test records whose items all occur in the training items.

    The training items are read from ``items.txt`` under ``out_file_dir``.
    A test record is kept when every item in its second and third fields is
    one of those training items; a record with no items at all is kept too.
    The kept records and the items ``extract_items`` derives from them are
    written to ``session_item.txt`` and ``items.txt`` under
    ``out_test_file_dir``.

    Args:
        out_file_dir: Directory prefix holding the training ``items.txt``.
        in_test_file_path: Path of the test session/item data file.
        out_test_file_dir: Directory prefix that receives the filtered test
            files.
    """
    train_items_file_path = out_file_dir + r'\items.txt'
    # Build the lookup set once so each membership test is constant time
    # rather than a scan over the whole training item list.
    # Assumes the items are hashable (get_int_list suggests ints) - verify.
    train_items = set(rff.get_int_list(train_items_file_path))

    test_data = rff.get_data_lists(in_test_file_path)
    # Fields 1 and 2 of a record are its two item lists; both must be fully
    # covered by the training items.
    test_data_selected = [
        record
        for record in test_data
        if all(item in train_items for item in record[1] + record[2])
    ]
    test_items_selected = extract_items(test_data_selected)

    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
def print_partition(out_dir, train_session_item, test_session_item): train_items_list = extract_items2(train_session_item) test_items_list = extract_items2(test_session_item) # 输出文件路径 out_train_dir = out_dir + r'\train' if not os.path.exists(out_train_dir): os.makedirs(out_train_dir) out_train_data_path = out_train_dir + r'\session_item.txt' out_train_item_path = out_train_dir + r'\items.txt' out_test_dir = out_dir + r'\test' if not os.path.exists(out_test_dir): os.makedirs(out_test_dir) out_test_data_path = out_test_dir + r'\session_item.txt' out_test_item_path = out_test_dir + r'\items.txt' # 输出数据 p2f.print_2lists_dict_to_file(train_session_item, out_train_data_path) p2f.print_list_to_file(train_items_list, out_train_item_path) p2f.print_2lists_dict_to_file(test_session_item, out_test_data_path) p2f.print_list_to_file(test_items_list, out_test_item_path) '''
def extract_real_data(click_file_path, buys_file_path, write_file_dir): print("processing click file...") # dic1 表示点击数据中,每个session看了哪些商品 dic1, all_sessions, all_items_set = get_session_itemList(click_file_path) print("processing buy file...") # 提取购买数据 # dic2表示购买数据中某个session与"其购买商品的list"的map dic2, all_buy_sessions, all_buy_items_set = get_session_itemList( buys_file_path) print("extracting session_item_data...") # start = time.time() # 整合点击数据与购买数据,获取session_item_data allClicksBuy_sessions = list() # 此处data即为session_item_data data = list() # 提取出那些既有点击商品也有点击不购买商品的session all_data_sessions = list() # 只提取那些“既有点击商品也有点击不购买商品的session”的商品 all_data_items_set = set() # 只考虑那些发生了购买行为的session(再去掉其中“所有点击商品都购买的session”) for d in all_buy_sessions: # 当前session所有点击商品 click_items_list = dic1[d] click_nums = len(click_items_list) # 当前session所有购买商品 buy_items_list = dic2[d] buy_nums = len(buy_items_list) # “去掉”所有点击商品都购买的session if click_nums == buy_nums: allClicksBuy_sessions.append(d) # 该session中既有购买商品,也有点击不购买商品 else: all_data_sessions.append(d) # 只提取那些“既有点击商品也有点击不购买商品的session”的商品 for item in click_items_list: all_data_items_set.add(item) click_items_set = set(click_items_list) buy_items_set = set(buy_items_list) click_not_buy_items_set = click_items_set - buy_items_set # 数据格式(每一行):session;购买items(用逗号隔开);点击不购买items(用逗号隔开) cur_data = [d, buy_items_list, list(click_not_buy_items_set)] # print("current data: ", cur_data) data.append(cur_data) # print2file(d, buy_items_list, click_not_buy_items_set) # 所有session all_sessions_set = set(all_sessions) # 所有点击商品都购买了的session allClicksBuy_sessions_set = set(allClicksBuy_sessions) # 既有购买商品,也有点击不购买商品的session all_data_sessions_set = set(all_data_sessions) # 所有点击商品都没有购买的session allClicksNotBuy_sessions_set = all_sessions_set - allClicksBuy_sessions_set - all_data_sessions_set # 所有"点击过但没被买的item" all_clickNotBuy_items_set = all_items_set - all_buy_items_set print("session item data: ", data) print("0、(data中的)既有购买商品,也有点击不购买商品的session: ", 
all_data_sessions) print("1、all_sessions: ", all_sessions) print("2、所有点击商品都购买了的session: ", allClicksBuy_sessions) print("3、所有点击商品都没有购买的session: ", allClicksNotBuy_sessions_set) print("4、data中的所有item: ", all_data_items_set) print("5、all_items: ", all_items_set) print("6、所有被购买的item: ", all_buy_items_set) print("7、所有点击过但没被买的item: ", all_clickNotBuy_items_set) print("printing to file...") allClicksNotBuy_sessions = list(allClicksNotBuy_sessions_set) all_data_items = list(all_data_items_set) all_items = list(all_items_set) all_buy_items = list(all_buy_items_set) all_clickNotBuy_items = list(all_clickNotBuy_items_set) # 输出数据到文件中 session_item_write_path = write_file_dir + r"\session_item.txt" p2f.print_data_lists_to_file(data, session_item_write_path) # 其他有用信息 print2file_list = [ all_data_sessions, all_sessions, allClicksBuy_sessions, allClicksNotBuy_sessions, all_data_items, all_items, all_buy_items, all_clickNotBuy_items ] file_name_list = [ "all_data_sessions.txt", "all_sessions.txt", "allClicksBuy_sessions.txt", "allClicksNotBuy_sessions.txt", "items.txt", "all_items.txt", "all_buy_items.txt", "all_clickNotBuy_items.txt" ] idx = 0 for cur_list in print2file_list: cur_file_path = write_file_dir + "\\" + file_name_list[idx] p2f.print_list_to_file(cur_list, cur_file_path) idx += 1 print("finish extracting real data")