@classmethod
def get_topsku(cls, folder, file_in, file_out_topsku, file_out_lines,
               min_sku_cnt):
    """ Keep only items (SKUs) that occur at least min_sku_cnt times. """
    FileTool.func_begin("get_topsku")
    listlist = FileTool.read_file_to_list_list(
        os.path.join(folder, file_in), Config.file_sep)

    # Count how often each SKU appears across all sessions.
    dict_sku = {}
    for line in listlist:
        for unit in line:
            item = SessionItemBase(unit)  # parse the raw unit
            sku = item.sku  # item id
            StructureTool.addDict(dict_sku, sku)
    print "dict_sku:", len(dict_sku)  # number of distinct items

    # Drop SKUs that occur fewer than min_sku_cnt times.
    valid_sku = set()
    for sku in dict_sku:
        if dict_sku[sku] >= min_sku_cnt:
            valid_sku.add(sku)
    valid_sku_list = list(valid_sku)
    print "valid_sku: ", len(valid_sku)  # item count after filtering
    # Write the frequent SKUs to the topsku file.
    FileTool.write_file_listStr(
        os.path.join(folder, file_out_topsku), valid_sku_list)

    valid_lines = []
    for line in listlist:
        if Config.get_exp_label() != "tianchi":
            # Keep a session only if every one of its items is frequent.
            flag = True
            for unit in line:
                item = SessionItemBase(unit)
                if item.sku not in valid_sku:
                    flag = False
                    break
            if flag:
                valid_lines.append(line)
        else:
            # Prune infrequent items; keep the session if enough remain.
            newLine = []
            for unit in line:
                item = SessionItemBase(unit)
                if item.sku in valid_sku:
                    newLine.append(unit)
            if len(newLine) >= Config.min_cnt_line_items_top:
                valid_lines.append(newLine)
    print "valid_lines: ", len(valid_lines)  # sessions after filtering

    # Write the filtered sessions; same structure as the raw data file
    # (e.g. JD_Computers.topsku).
    FileTool.write_file_list_list(
        os.path.join(folder, file_out_lines), valid_lines, Config.file_sep)
    FileTool.func_end("get_topsku")
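# A minimal, self-contained sketch of the filtering pass above (hypothetical
# helper, not part of this module): collections.Counter stands in for the
# StructureTool.addDict counting, and min_session_len plays the role of
# Config.min_cnt_line_items_top. Assumes sessions are lists of hashable ids.
from collections import Counter

def filter_rare_items_sketch(sessions, min_item_cnt, min_session_len):
    # Pass 1: count how often each item id occurs across all sessions.
    counts = Counter(item for session in sessions for item in session)
    valid = set(item for item, c in counts.items() if c >= min_item_cnt)
    # Pass 2: prune rare items, keep sessions that remain long enough.
    kept = []
    for session in sessions:
        pruned = [item for item in session if item in valid]
        if len(pruned) >= min_session_len:
            kept.append(pruned)
    return valid, kept

# Example: filter_rare_items_sketch([["a", "b"], ["a", "c"], ["a"]], 2, 1)
# keeps only "a" and returns the sessions [["a"], ["a"], ["a"]].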
@classmethod
def get_data_raw_data_norm(cls, file_in, file_out):
    # Normalize every event in every session line and rewrite the file.
    with open(file_in, "r") as f, open(file_out, "w") as fout:
        for l in f:
            events = l.strip().split(' ')
            listNew = []
            for e in events:
                sessionItem = SessionItemBase(e)
                if Config.get_exp_label() != "tianchi":
                    sessionItem.normItem()
                itemNew = sessionItem.toNormString(mode="SBCD")
                listNew.append(itemNew)
            fout.write(' '.join(listNew) + '\n')
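# Sketch of the streaming rewrite pattern used by get_data_raw_data_norm
# (hypothetical helper): read one session per line, transform each
# space-separated token, and write one transformed line per input line.
def rewrite_lines_sketch(file_in, file_out, transform):
    with open(file_in, "r") as fin, open(file_out, "w") as fout:
        for line in fin:
            tokens = line.strip().split(' ')
            fout.write(' '.join(transform(t) for t in tokens) + '\n')

# Usage with an identity transform (paths are hypothetical):
# rewrite_lines_sketch("session.raw", "session.norm", lambda t: t)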
@classmethod
def get_data_session_train_filt_data_to_itemid(cls, file_in, mode):
    print "get_data_session_train_filt_data_to_itemid %s" % file_in
    file_in = Preprocess.add_folder_file(file_in)
    micro_item_list = Config.get_micro_item_list(mode)
    print "micro_item_list:", micro_item_list
    micro_item2vec = Data.load_micro_item_vec_mode(mode)
    micro_item2id = Data.load_micro_itemInt_idInt(micro_item_list)
    list_lines = []
    cnt = 0
    with open(file_in, 'r') as f:
        for l in f:
            events = l.strip().split()
            w = []
            for e in events:
                sessionItem = SessionItemBase(e)
                sessionItem.normItem()
                # Skip items that lack a vector for any micro-item part.
                flagItemValid = sessionItem.checkItemAllPartHasItemVec(
                    micro_item2vec, micro_item_list)
                if not flagItemValid:
                    continue
                try:
                    sessionItem_idStr = sessionItem.toIdString(
                        micro_item2id, micro_item_list)
                    w.append(sessionItem_idStr)
                except Exception:
                    print "exception: sessionItem.toIdString"
            if len(w) > 0:
                list_lines.append(w)
            cnt += 1
            FileTool.print_info_pro(cnt)
    print "get_data_session_train_filt_data_to_itemid done!"
    return list_lines
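# Simplified sketch of the filter-and-map pass above (hypothetical helper):
# drop tokens that have no embedding, map the rest to ids, and keep only
# non-empty sessions. item2vec and item2id stand in for the micro_item2vec
# and micro_item2id lookups loaded from Data.
def sessions_to_ids_sketch(lines, item2vec, item2id):
    result = []
    for tokens in lines:
        ids = [item2id[t] for t in tokens if t in item2vec and t in item2id]
        if ids:
            result.append(ids)
    return result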
@classmethod
def get_data_desc(cls, folder, file_in, file_out):
    """ Write descriptive statistics of a session file. """
    FileTool.func_begin("get_data_desc")
    file_in = FileTool.add_folder(folder, file_in)
    file_out = FileTool.add_folder(folder, file_out)
    set_sku = set()
    set_bh = set()
    set_cid3 = set()
    list_gap = []
    list_dwell = []
    list_itemCnt = []
    list_timeCnt = []
    sessionCnt = 0
    listlist = FileTool.read_file_to_list_list(file_in)
    microCnt = 0
    list_res = []
    for line in listlist:
        cnt = len(line)
        sessionCnt += 1
        microCnt += cnt
        list_itemCnt.append(cnt)
        seconds = 0
        for item in line:
            sessionItem = SessionItemBase(item)
            set_sku.add(sessionItem.sku)
            set_bh.add(sessionItem.bh)
            set_cid3.add(sessionItem.cid3)
            gap = int(sessionItem.gap)
            dwell = int(sessionItem.dwell)
            # Ignore gaps over one day and dwells over ten minutes.
            if gap < 86400:
                list_gap.append(gap)
                seconds += gap
            if dwell < 60 * 10:
                list_dwell.append(dwell)
                seconds += dwell
        list_timeCnt.append(seconds)
    list_res.append("session: %d" % sessionCnt)
    list_res.append("sku: %d" % len(set_sku))
    list_res.append("bh: %d" % len(set_bh))
    list_res.append("cid3: %d" % len(set_cid3))
    list_res.append("microCnt: %d" % microCnt)
    arr_gap = np.array(list_gap)
    list_res.append("gap: min:%d, max:%d, avg:%f" %
                    (np.min(arr_gap), np.max(arr_gap), np.average(arr_gap)))
    arr_dwell = np.array(list_dwell)
    list_res.append("dwell: min:%d, max:%d, avg:%f" %
                    (np.min(arr_dwell), np.max(arr_dwell),
                     np.average(arr_dwell)))
    arr_itemCnt = np.array(list_itemCnt)
    list_res.append("itemCnt: min:%d, max:%d, avg:%f, total:%d" %
                    (np.min(arr_itemCnt), np.max(arr_itemCnt),
                     np.average(arr_itemCnt), np.sum(arr_itemCnt)))
    arr_timeCnt = np.array(list_timeCnt)
    list_res.append("timeCnt: min:%d, max:%d, avg:%f" %
                    (np.min(arr_timeCnt), np.max(arr_timeCnt),
                     np.average(arr_timeCnt)))
    FileTool.write_file_listStr(file_out, list_res)
    FileTool.func_end("get_data_desc")
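# Sketch of the summary pattern above (hypothetical helper). One pitfall the
# original does not guard against: np.min/np.max raise ValueError on an empty
# array, so an input with no in-range gaps or dwells would crash; check the
# list before reducing.
import numpy as np

def describe_sketch(name, values):
    if not values:
        return "%s: (no samples)" % name
    arr = np.array(values)
    return "%s: min:%d, max:%d, avg:%f" % (name, arr.min(), arr.max(),
                                           arr.mean())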