def ceshi_file(data_dir):
    """Build the test-set session and user files from ``test.csv``.

    Every CSV row becomes one entry ``[session_id, [itemid1], [itemid2]]``
    (``itemid1`` goes into the bought-items slot, ``itemid2`` into the
    clicked-items slot), and each ``userid`` is mapped to the list of
    ``session_id`` values found on its rows, in file order.

    Two files are written below the ``test`` sub-directory of ``data_dir``:
    ``session_item.txt`` and ``user_session.txt``.

    Args:
        data_dir: Directory that contains ``test.csv``. Paths are built by
            string concatenation with backslash separators.
    """
    test = pd.read_csv(data_dir + '\\test.csv')

    test_session_item_list = []
    user_session_dic = {}

    # ``DataFrame.ix`` no longer exists in pandas >= 1.0. Walking the column
    # arrays in lockstep reads the same values row by row without relying on
    # any indexer.
    rows = zip(
        test['userid'].values,
        test['session_id'].values,
        test['itemid1'].values,
        test['itemid2'].values,
    )
    for cur_user, cur_session, item_bought, item_clicked in rows:
        user_session_dic.setdefault(cur_user, []).append(cur_session)
        test_session_item_list.append(
            [cur_session, [item_bought], [item_clicked]])

    p2f.print_data_lists_to_file(test_session_item_list,
                                 data_dir + '\\test\\session_item.txt')
    p2f.print_list_dict_to_file(user_session_dic,
                                data_dir + '\\test\\user_session.txt')
    print('test file already')
def get_data(file_dir):
    """Load the user-session, session-item and item-session data of a directory.

    The session-item and user-session data come from ``session_item.txt``
    via ``get_session_item_and_user_data``. The item list is parsed from the
    first line of ``items.txt`` (comma-separated integers). The item-session
    data is read from ``item_session.txt`` when that file exists; otherwise
    it is derived with ``extract_item_data`` and written to that path so the
    next call can reuse it.

    Args:
        file_dir: Directory holding the data files. Paths are built by
            string concatenation with backslash separators.

    Returns:
        A tuple ``(user_sessions_data, data, item_session_data)`` where
        ``data`` is the session-item data.
    """
    session_item_file_path = file_dir + r"\session_item.txt"
    item_file_path = file_dir + r"\items.txt"

    # ``data`` is the session-item data.
    data, user_sessions_data = get_session_item_and_user_data(
        session_item_file_path)

    all_data_items = []
    with open(item_file_path, 'r') as item_file:
        try:
            first_line = item_file.readline()
            for item_str in first_line.split(','):
                # Strip so that the trailing newline (or padding around a
                # comma) is skipped instead of reaching int() as a bogus token.
                item_str = item_str.strip()
                if item_str:
                    all_data_items.append(int(item_str))
        except (OSError, ValueError) as e:
            # Best effort: report the problem and keep the items parsed so far.
            print(e)

    item_session_file_path = file_dir + r"\item_session.txt"
    if os.path.exists(item_session_file_path):
        item_session_data = rff.get_data_lists(item_session_file_path)
    else:
        # Not cached yet: derive it and store it for later runs.
        item_session_data = extract_item_data(data, all_data_items)
        p2f.print_data_lists_to_file(item_session_data,
                                     item_session_file_path)
    print("finish get item session data")
    return user_sessions_data, data, item_session_data
def data_selection(in_file_path, out_file_dir):
    """Filter a session-item file down to sessions with fewer than two buys.

    Records are read with ``rff.get_data_lists``; a record is kept when the
    list at index 1 (its bought items) has length 0 or 1. The kept records
    and the items extracted from them by ``extract_items`` are written to
    ``session_item.txt`` and ``items.txt`` inside ``out_file_dir``.

    Args:
        in_file_path: Path of the session-item file to filter.
        out_file_dir: Directory receiving the two output files.
    """
    session_item_target = out_file_dir + r'\session_item.txt'
    items_target = out_file_dir + r'\items.txt'

    records = rff.get_data_lists(in_file_path)
    kept_records = [record for record in records if len(record[1]) < 2]
    kept_items = extract_items(kept_records)

    p2f.print_data_lists_to_file(kept_records, session_item_target)
    p2f.print_list_to_file(kept_items, items_target)
def sample_patition(rate, origin_file_dir, sampling_file_dir):
    """Sample the session-item data of one directory into another directory.

    Reads ``session_item.txt`` from ``origin_file_dir``, passes the records
    and ``rate`` to ``sample_partition_help``, and writes the resulting
    sample data and sample items to ``session_item.txt`` and ``items.txt``
    in ``sampling_file_dir``. The target directory is created if missing.

    Args:
        rate: Sampling rate forwarded unchanged to ``sample_partition_help``.
        origin_file_dir: Directory containing the full ``session_item.txt``.
        sampling_file_dir: Directory receiving the sampled files.
    """
    source_path = origin_file_dir + r"\session_item.txt"

    target_dir_missing = not os.path.exists(sampling_file_dir)
    if target_dir_missing:
        os.makedirs(sampling_file_dir)

    sampled_data_path = sampling_file_dir + r"\session_item.txt"
    sampled_items_path = sampling_file_dir + r"\items.txt"

    # Load everything, then let the helper pick the sample.
    full_data = rff.get_data_lists(source_path)
    sampled_data, sampled_items = sample_partition_help(full_data, rate)

    p2f.print_data_lists_to_file(sampled_data, sampled_data_path)
    p2f.print_list_to_file(sampled_items, sampled_items_path)
def extract_real_data1(click_file_path, buys_file_path, write_file_dir):
    """Build session-item data for every clicked session and write it out.

    All sessions of the click file are kept, including sessions without any
    purchase and sessions whose clicked items were all bought. Each output
    record is ``[session, bought_items, clicked_but_not_bought_items]``.
    The records go to ``session_item.txt`` and the set of all clicked items
    to ``items.txt`` inside ``write_file_dir``.

    Args:
        click_file_path: Path of the click log parsed by
            ``get_session_itemList``.
        buys_file_path: Path of the buy log parsed by
            ``get_session_itemList``.
        write_file_dir: Directory receiving the two output files.
    """
    print("processing click file...")
    # dic1 maps a session to the items clicked in it.
    dic1, all_sessions, _all_items_set = get_session_itemList(click_file_path)

    print("processing buy file...")
    # dic2 maps a session to the items bought in it.
    dic2, all_buy_sessions, _all_buy_items_set = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    data = []
    all_data_items_set = set()

    # Constant-time test for "did this session buy anything".
    buy_sessions = set(all_buy_sessions)

    # enumerate drives the progress counter; it was previously initialised
    # to 1 and never advanced, so the progress line could never be printed.
    for idx, d in enumerate(all_sessions, start=1):
        if idx % 10000 == 0:
            print("processing all_session, idx:", idx)

        click_items_list = dic1[d]
        all_data_items_set.update(click_items_list)

        buy_items_list = dic2[d] if d in buy_sessions else []
        click_not_buy_items_set = set(click_items_list) - set(buy_items_list)

        # Record layout: session; bought items; clicked-but-not-bought items.
        data.append([d, buy_items_list, list(click_not_buy_items_set)])

    all_data_items = list(all_data_items_set)

    session_item_write_path = write_file_dir + r"\session_item.txt"
    item_write_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(data, session_item_write_path)
    p2f.print_list_to_file(all_data_items, item_write_path)
def extract_real_data2(click_file_path, buys_file_path, write_file_dir):
    """Build session-item data for sessions whose buy list has one entry.

    Only sessions listed in the buy file are considered, and of those only
    the ones whose bought-items list has length 1. Each output record is
    ``[session, bought_items, clicked_but_not_bought_items]``; the items
    file holds every item clicked in a kept session.

    Args:
        click_file_path: Path of the click log parsed by
            ``get_session_itemList``.
        buys_file_path: Path of the buy log parsed by
            ``get_session_itemList``.
        write_file_dir: Directory receiving ``session_item.txt`` and
            ``items.txt``.
    """
    print("processing click file...")
    # Session -> clicked items.
    clicks_by_session, _, _ = get_session_itemList(click_file_path)

    print("processing buy file...")
    # Session -> bought items.
    buys_by_session, buying_sessions, _ = get_session_itemList(buys_file_path)

    print("extracting session_item_data...")
    records = []
    kept_items = set()

    for session in buying_sessions:
        # NOTE(review): every buy session is looked up in the click data,
        # so it is presumably expected to be present there - confirm.
        clicked = clicks_by_session[session]
        bought = buys_by_session[session]
        if len(bought) != 1:
            continue

        kept_items.update(clicked)
        clicked_only = set(clicked) - set(bought)
        # Record layout: session; bought items; clicked-but-not-bought items.
        records.append([session, bought, list(clicked_only)])

    item_list = list(kept_items)

    records_path = write_file_dir + r"\session_item.txt"
    items_path = write_file_dir + r"\items.txt"
    p2f.print_data_lists_to_file(records, records_path)
    p2f.print_list_to_file(item_list, items_path)
    print("finish extracting real data")
def train_file(data_dir):
    """Build the training session, user and item files from ``train.csv``.

    Every CSV row becomes one entry ``[session_id, [itemid1], [itemid2]]``
    (``itemid1`` goes into the bought-items slot, ``itemid2`` into the
    clicked-items slot). Two lookup tables are built alongside:

    * user -> list of that user's session ids, in file order;
    * item -> ``[sessions_for_itemid1_rows, sessions_for_itemid2_rows]``,
      i.e. the sessions where the item appeared in the bought column and
      the sessions where it appeared in the clicked column.

    Three files are written below the ``train`` sub-directory of
    ``data_dir``: ``session_item.txt``, ``user_session.txt`` and
    ``item_session.txt``.

    Args:
        data_dir: Directory that contains ``train.csv``. Paths are built by
            string concatenation with backslash separators.
    """
    train = pd.read_csv(data_dir + '\\train.csv')

    train_session_item_list = []
    train_item_session_dic = {}
    user_session_dic = {}

    # ``DataFrame.ix`` no longer exists in pandas >= 1.0. Walking the column
    # arrays in lockstep reads the same values row by row without relying on
    # any indexer.
    rows = zip(
        train['userid'].values,
        train['session_id'].values,
        train['itemid1'].values,
        train['itemid2'].values,
    )
    for cur_user, cur_session, item_bought, item_clicked in rows:
        user_session_dic.setdefault(cur_user, []).append(cur_session)
        train_session_item_list.append(
            [cur_session, [item_bought], [item_clicked]])

        # Slot 0 collects sessions that bought the item, slot 1 sessions
        # that clicked it.
        train_item_session_dic.setdefault(
            item_bought, [[], []])[0].append(cur_session)
        train_item_session_dic.setdefault(
            item_clicked, [[], []])[1].append(cur_session)

    p2f.print_data_lists_to_file(train_session_item_list,
                                 data_dir + '\\train\\session_item.txt')
    p2f.print_list_dict_to_file(user_session_dic,
                                data_dir + '\\train\\user_session.txt')
    p2f.print_2lists_dict_to_file(train_item_session_dic,
                                  data_dir + '\\train\\item_session.txt')
    print('train file already')
def test_data_selection(out_file_dir, in_test_file_path, out_test_file_dir):
    """Keep only the test sessions whose items all occur in the training items.

    The training items are read from ``items.txt`` in ``out_file_dir``. A
    test record is kept when every item in its lists at index 1 and index 2
    is one of those training items (a record with no items is kept). The
    kept records and the items extracted from them by ``extract_items`` are
    written to ``session_item.txt`` and ``items.txt`` in
    ``out_test_file_dir``.

    Args:
        out_file_dir: Directory holding the training ``items.txt``.
        in_test_file_path: Path of the test session-item file to filter.
        out_test_file_dir: Directory receiving the filtered test files.
    """
    train_items_file_path = out_file_dir + r'\items.txt'
    # A set makes each membership test constant time; scanning the list for
    # every item of every session was quadratic.
    train_items = set(rff.get_int_list(train_items_file_path))

    test_data = rff.get_data_lists(in_test_file_path)
    test_data_selected = [
        cur_test_data
        for cur_test_data in test_data
        if all(item in train_items
               for item in cur_test_data[1] + cur_test_data[2])
    ]
    test_items_selected = extract_items(test_data_selected)

    out_test_data_file_path = out_test_file_dir + r'\session_item.txt'
    out_test_items_file_path = out_test_file_dir + r'\items.txt'
    p2f.print_data_lists_to_file(test_data_selected, out_test_data_file_path)
    p2f.print_list_to_file(test_items_selected, out_test_items_file_path)
def extract_real_data(click_file_path, buys_file_path, write_file_dir):
    """Build session-item data for sessions with both bought and unbought clicks.

    Only sessions listed in the buy file are examined. A session whose
    click list and buy list have the same length is classified as
    "everything clicked was bought" and left out of the data; every other
    buy session yields a record
    ``[session, bought_items, clicked_but_not_bought_items]``.

    Besides ``session_item.txt``, eight auxiliary list files (session and
    item groupings) are written to ``write_file_dir``, and all intermediate
    collections are printed to stdout.

    Args:
        click_file_path: Path of the click log parsed by
            ``get_session_itemList``.
        buys_file_path: Path of the buy log parsed by
            ``get_session_itemList``.
        write_file_dir: Directory receiving all output files.
    """
    print("processing click file...")
    # Session -> clicked items, plus every session / item seen in clicks.
    clicks_by_session, all_sessions, all_items_set = get_session_itemList(
        click_file_path)

    print("processing buy file...")
    # Session -> bought items, plus every session / item seen in buys.
    buys_by_session, all_buy_sessions, all_buy_items_set = get_session_itemList(
        buys_file_path)

    print("extracting session_item_data...")
    fully_bought_sessions = []  # sessions where #clicks == #buys
    mixed_sessions = []         # sessions that end up in the data
    records = []                # the session-item data itself
    record_items = set()        # items clicked in the mixed sessions

    for session in all_buy_sessions:
        clicked = clicks_by_session[session]
        bought = buys_by_session[session]

        if len(clicked) == len(bought):
            # Everything clicked was bought: excluded from the data.
            fully_bought_sessions.append(session)
            continue

        mixed_sessions.append(session)
        record_items.update(clicked)
        clicked_only = set(clicked) - set(bought)
        # Record layout: session; bought items; clicked-but-not-bought items.
        records.append([session, bought, list(clicked_only)])

    # Sessions that appear in the click data but in neither group above.
    never_bought_sessions_set = (
        set(all_sessions) - set(fully_bought_sessions) - set(mixed_sessions))
    # Items that were clicked somewhere but never bought.
    never_bought_items_set = all_items_set - all_buy_items_set

    print("session item data: ", records)
    print("0、(data中的)既有购买商品,也有点击不购买商品的session: ",
          mixed_sessions)
    print("1、all_sessions: ", all_sessions)
    print("2、所有点击商品都购买了的session: ", fully_bought_sessions)
    print("3、所有点击商品都没有购买的session: ", never_bought_sessions_set)
    print("4、data中的所有item: ", record_items)
    print("5、all_items: ", all_items_set)
    print("6、所有被购买的item: ", all_buy_items_set)
    print("7、所有点击过但没被买的item: ", never_bought_items_set)

    print("printing to file...")
    p2f.print_data_lists_to_file(records,
                                 write_file_dir + r"\session_item.txt")

    # Auxiliary outputs, paired with the file each one is written to.
    extra_outputs = [
        ("all_data_sessions.txt", mixed_sessions),
        ("all_sessions.txt", all_sessions),
        ("allClicksBuy_sessions.txt", fully_bought_sessions),
        ("allClicksNotBuy_sessions.txt", list(never_bought_sessions_set)),
        ("items.txt", list(record_items)),
        ("all_items.txt", list(all_items_set)),
        ("all_buy_items.txt", list(all_buy_items_set)),
        ("all_clickNotBuy_items.txt", list(never_bought_items_set)),
    ]
    for file_name, values in extra_outputs:
        p2f.print_list_to_file(values, write_file_dir + "\\" + file_name)

    print("finish extracting real data")