예제 #1
0
    def get_topsku(cls, folder, file_in, file_out_topsku, file_out_lines,
                   min_sku_cnt):
        """
        筛选出现次数大于x的物品
        """
        FileTool.func_begin("get_topsku")
        listlist = FileTool.read_file_to_list_list(
            os.path.join(folder, file_in), Config.file_sep)

        dict_sku = {}  # 统计item出现次数
        for line in listlist:
            for unit in line:
                item = SessionItemBase(unit)  # 构造对象
                sku = item.sku  # item id
                StructureTool.addDict(dict_sku, sku)

        print "dict_sku:", len(dict_sku)  # 物品个数

        # 筛去出现次数小于min_sku_cnt的物品
        valid_sku = set()
        for sku in dict_sku:
            if (dict_sku[sku] >= min_sku_cnt):
                valid_sku.add(sku)

        valid_sku_list = list(valid_sku)

        print "valid_sku: ", len(valid_sku)  # 筛完以后物品个数
        FileTool.write_file_listStr(os.path.join(folder, file_out_topsku),
                                    valid_sku_list)  # 把符合出现次数的物品写入文件topsku

        valid_lines = []
        valid_lines_len = []
        for line in listlist:
            flag = True
            if (Config.get_exp_label() != "tianchi"):
                for unit in line:
                    item = SessionItemBase(unit)
                    sku = item.sku
                    if (sku not in valid_sku):
                        flag = False
                        break
                if (flag):
                    valid_lines.append(line)
            else:
                newLine = []
                for unit in line:
                    item = SessionItemBase(unit)
                    sku = item.sku
                    if (sku in valid_sku):
                        newLine.append(unit)
                if (len(newLine) >= Config.min_cnt_line_items_top):
                    valid_lines.append(newLine)

        print "valid_lines: ", len(valid_lines)  # 筛选完出现次数少于x的session
        # 写入文件 结构与初始数据文件一样 文件名JD_Computers.topsku
        FileTool.write_file_list_list(os.path.join(folder, file_out_lines),
                                      valid_lines, Config.file_sep)

        FileTool.func_end("get_topsku")
예제 #2
0
    def get_topsku(cls, folder, file_in, file_out_topsku, file_out_lines,
                   min_sku_cnt):
        FileTool.func_begin("get_topsku")
        listlist = FileTool.read_file_to_list_list(
            os.path.join(folder, file_in), Config.file_sep)

        dict_sku = {}
        for line in listlist:
            for unit in line:
                item = SessionItemBase(unit)
                sku = item.sku
                StructureTool.addDict(dict_sku, sku)

        print "dict_sku:", len(dict_sku)

        valid_sku = set()
        for sku in dict_sku:
            if (dict_sku[sku] >= min_sku_cnt):
                valid_sku.add(sku)

        valid_sku_list = list(valid_sku)

        print "valid_sku: ", len(valid_sku)
        FileTool.write_file_listStr(os.path.join(folder, file_out_topsku),
                                    valid_sku_list)

        valid_lines = []
        valid_lines_len = []
        for line in listlist:
            flag = True
            if (Config.get_exp_label() != "tianchi"):
                for unit in line:
                    item = SessionItemBase(unit)
                    sku = item.sku
                    if (sku not in valid_sku):
                        flag = False
                        break
                if (flag):
                    valid_lines.append(line)
            else:
                newLine = []
                for unit in line:
                    item = SessionItemBase(unit)
                    sku = item.sku
                    if (sku in valid_sku):
                        newLine.append(unit)
                if (len(newLine) >= Config.min_cnt_line_items_top):
                    valid_lines.append(newLine)

        print "valid_lines: ", len(valid_lines)
        FileTool.write_file_list_list(os.path.join(folder, file_out_lines),
                                      valid_lines, Config.file_sep)

        FileTool.func_end("get_topsku")
예제 #3
0
    def get_data_raw_data_norm(cls, file_in, file_out):
        """
        Rewrite every event of ``file_in`` in its normalized string form.

        Each input line is stripped and split on single spaces. Every token
        is parsed as a ``SessionItemBase``, normalized with ``normItem()``
        unless the experiment label is ``"tianchi"``, and serialized with
        ``toNormString(mode="SBCD")``. The serialized tokens of a line are
        joined with single spaces and written to ``file_out`` followed by a
        newline.

        :param file_in: path of the raw session file to read
        :param file_out: path of the normalized file to write (overwritten)
        """
        # Context managers close both handles even when parsing an event
        # raises. The output file is still opened first, as before.
        with open(file_out, 'w') as fout:
            with open(file_in, "r") as f:
                # Assumed constant for the whole call, so evaluated once
                # instead of once per event.
                need_norm = (Config.get_exp_label() != "tianchi")
                for l in f:
                    listNew = []
                    for e in l.strip().split(' '):
                        sessionItem = SessionItemBase(e)
                        if need_norm:
                            sessionItem.normItem()
                        listNew.append(sessionItem.toNormString(mode="SBCD"))
                    fout.write(' '.join(listNew) + '\n')
예제 #4
0
    def get_data_session_train_filt_data_to_itemid(cls, file_in, mode):
        print "get_data_session_train_filt_data_to_itemid %s" % file_in

        file_in = Preprocess.add_folder_file(file_in)

        #dict_sessionItem_to_id = {}
        sessionItem_id = 1

        micro_item_list = Config.get_micro_item_list(mode)
        print "micro_item_list:", micro_item_list

        micro_item2vec = Data.load_micro_item_vec_mode(mode)
        micro_item2id = Data.load_micro_itemInt_idInt(micro_item_list)

        list_lines = []
        cnt = 0
        with open(file_in, 'r') as f:
            for l in f:
                events = l.strip().split()
                w = []

                for e in events:
                    sessionItem = SessionItemBase(e)
                    sessionItem.normItem()
                    #print "e:", e
                    # check valid
                    flagItemValid = sessionItem.checkItemAllPartHasItemVec(
                        micro_item2vec, micro_item_list)

                    if not flagItemValid:
                        continue

                    try:
                        #print "valid\n"
                        sessionItem_idStr = sessionItem.toIdString(
                            micro_item2id, micro_item_list)

                        #if(sessionItem_idStr not in dict_sessionItem_to_id):
                        #    dict_sessionItem_to_id[sessionItem_idStr] = sessionItem_id
                        #    sessionItem_id += 1

                        curAppend = sessionItem_idStr
                        #curAppend = str(dict_sessionItem_to_id[sessionItem_idStr])
                        w.append(curAppend)
                    except:
                        print "excepttion: sessionItem.toIdString"

                if (len(w) > 0):
                    list_lines.append(w)

                cnt += 1
                FileTool.print_info_pro(cnt)

                #if(cnt > 100):
                #    break

        print "get_data_session_train_filt_data_to_itemid done!"
        return list_lines
예제 #5
0
    def get_data_desc(cls, folder, file_in, file_out):
        """
        Write descriptive statistics of a session file to ``file_out``.

        The report, one entry per output line, contains: the number of
        sessions, the number of distinct ``sku`` / ``bh`` / ``cid3`` values,
        the total number of events (``microCnt``), and min/max/avg of

        * ``gap``     - only gaps below 86400 are counted,
        * ``dwell``   - only dwell values below 600 are counted,
        * ``itemCnt`` - events per session (plus the total),
        * ``timeCnt`` - per session, the sum of its counted gaps and dwells.

        When a statistic has no samples (e.g. an empty input file) its values
        are reported as ``n/a`` instead of raising the ValueError that
        ``np.min`` / ``np.max`` produce for an empty array.

        :param folder: directory containing the input and output files
        :param file_in: name of the session file to read
        :param file_out: name of the report file to write
        """
        FileTool.func_begin("get_data_desc")
        file_in = FileTool.add_folder(folder, file_in)
        file_out = FileTool.add_folder(folder, file_out)

        def _min_max_avg(label, values):
            # Format "<label>: min, max, avg"; empty input gives n/a fields.
            if len(values) == 0:
                return "%s: min:n/a, max:n/a, avg:n/a" % label
            arr = np.array(values)
            return "%s: min:%d, max:%d, avg:%f" % (
                label, np.min(arr), np.max(arr), np.average(arr))

        set_sku = set()
        set_bh = set()
        set_cid3 = set()
        list_gap = []
        list_dwell = []
        list_itemCnt = []
        list_timeCnt = []

        listlist = FileTool.read_file_to_list_list(file_in)

        for line in listlist:
            list_itemCnt.append(len(line))

            seconds = 0
            for item in line:
                sessionItem = SessionItemBase(item)
                gap = int(sessionItem.gap)
                dwell = int(sessionItem.dwell)

                set_sku.add(sessionItem.sku)
                set_bh.add(sessionItem.bh)
                set_cid3.add(sessionItem.cid3)

                # Values at or above the cut-offs are left out of both the
                # gap/dwell statistics and the per-session time total.
                if gap < 86400:
                    list_gap.append(gap)
                    seconds += gap

                if dwell < 60 * 10:
                    list_dwell.append(dwell)
                    seconds += dwell

            list_timeCnt.append(seconds)

        sessionCnt = len(list_itemCnt)
        microCnt = sum(list_itemCnt)

        list_res = [
            "session: %d" % sessionCnt,
            "sku: %d" % len(set_sku),
            "bh: %d" % len(set_bh),
            "cid3: %d" % len(set_cid3),
            "microCnt: %d" % microCnt,
            _min_max_avg("gap", list_gap),
            _min_max_avg("dwell", list_dwell),
            # itemCnt additionally reports the total number of events.
            "%s, total:%d" % (_min_max_avg("itemCnt", list_itemCnt),
                              microCnt),
            _min_max_avg("timeCnt", list_timeCnt),
        ]

        FileTool.write_file_listStr(file_out, list_res)

        FileTool.func_end("get_data_desc")