def outputSvmFormatFile(fReader, LAST_ROW, outF, for_test, idxDic):
    """Aggregate raw item rows per visit and write SVM-light formatted lines.

    Rows are assumed to arrive grouped by VisitNumber.  Each row's scan
    count is accumulated into ``vnFeatDic`` (feature index -> value) under
    UPC-key, department (DD) and fineline (FLN) feature slots.  When the
    VisitNumber changes, the visit's custom statistic features are added
    and one line ``label idx:val idx:val ...`` (indices ascending) is
    written to ``outF``.

    Args:
        fReader: iterator of row lists (e.g. ``csv.reader``); the first
            row is skipped as a header.
        LAST_ROW: 1-based count of the final data row; forces a flush at
            end of input.
        outF: writable text file receiving the SVM-formatted output.
        for_test: when True the leading column is the VisitNumber,
            otherwise the TripType label (train mode).
        idxDic: maps 'VN_IDX', 'TT_IDX', 'W_IDX', 'SC_IDX', 'UPC_IDX',
            'DD_IDX' and 'FLN_IDX' to column positions in each row.

    Relies on module-level feature-layout globals (emStart, ddStart,
    flnStart, upcStart, upcLenStart, wStart, spAllStart, the *List
    vocabularies, the adjacency matrices) and the helpers returnUpcKeys,
    get_sp_weights_from_matrix, get_sp_length_from_dict and
    add_metrics_to_main_dict.
    """

    def _em(name):
        # Feature index of a named extra metric.
        return emStart + extraMetricsList.index(name)

    count = 0
    preVN = -1
    firstCol = -1
    w = -1
    vnFeatDic = {}
    upcVnList = []
    scVnList = []
    ddVnList = []
    flnVnList = []
    next(fReader, None)  # skip header
    for row in fReader:
        curVN = row[idxDic['VN_IDX']]
        count += 1
        if count % 10000 == 0:  # progress trace
            print('--count -> ' + str(count))
            print('VisitNumber ->' + str(curVN))
            print(vnFeatDic)
        # Flush the previous visit when the VisitNumber changes (or at EOF).
        # NOTE(review): when count == LAST_ROW the flush happens BEFORE the
        # final row is merged, so the last row's items are not included in
        # any visit -- confirm this is intended.
        if (preVN != curVN and len(vnFeatDic) > 0) or count == LAST_ROW:
            ## ************* CUSTOM FEATURES START ************* ##
            vnFeatDic[_em('records')] = len(ddVnList)
            if len(ddVnList) > 1:
                vnFeatDic[_em('uniqDD')] = len(np.unique(ddVnList))
                vnFeatDic[_em('uniqFLN')] = len(np.unique(flnVnList))
            vnFeatDic[_em('sumSC')] = sum(scVnList)

            # SC-VN stat features (spread stats need more than one sample).
            if len(scVnList) > 1:
                vnFeatDic[_em('meanSC')] = np.mean(scVnList)
                vnFeatDic[_em('medianSC')] = np.median(scVnList)
                vnFeatDic[_em('maxSC')] = max(scVnList)
                vnFeatDic[_em('sdSC')] = np.std(scVnList, ddof=1)
                vnFeatDic[_em('skewSC')] = stats.skew(scVnList)
                vnFeatDic[_em('kurtSC')] = stats.kurtosis(scVnList)
                vnFeatDic[_em('iqrSC')] = np.subtract(
                    *np.percentile(scVnList, [75, 25]))

            # DD-VN-SC stat features over the populated DD feature slots.
            ddVnScList = [vnFeatDic.get(x) for x in range(ddStart, flnStart)
                          if vnFeatDic.get(x) is not None]
            if len(ddVnScList) > 1:
                vnFeatDic[_em('meanDDSC')] = np.mean(ddVnScList)
                vnFeatDic[_em('medianDDSC')] = np.median(ddVnScList)
                vnFeatDic[_em('maxDDSC')] = max(ddVnScList)
                vnFeatDic[_em('sdDDSC')] = np.std(ddVnScList, ddof=1)
                vnFeatDic[_em('skewDDSC')] = stats.skew(ddVnScList)
                vnFeatDic[_em('kurtDDSC')] = stats.kurtosis(ddVnScList)
                vnFeatDic[_em('iqrDDSC')] = np.subtract(
                    *np.percentile(ddVnScList, [75, 25]))

            # FLN-VN stat features over the populated FLN feature slots.
            flnVnScList = [vnFeatDic.get(x) for x in range(flnStart, upcStart)
                           if vnFeatDic.get(x) is not None]
            if len(flnVnScList) > 1:
                vnFeatDic[_em('meanFLNSC')] = np.mean(flnVnScList)
                vnFeatDic[_em('medianFLNSC')] = np.median(flnVnScList)
                vnFeatDic[_em('maxFLNSC')] = max(flnVnScList)
                vnFeatDic[_em('sdFLNSC')] = np.std(flnVnScList, ddof=1)
                vnFeatDic[_em('skewFLNSC')] = stats.skew(flnVnScList)
                vnFeatDic[_em('kurtFLNSC')] = stats.kurtosis(flnVnScList)
                vnFeatDic[_em('iqrFLNSC')] = np.subtract(
                    *np.percentile(flnVnScList, [75, 25]))

            # DDs with the top two SC; Cond: SC1>1 and SC2>SC1/3.
            ddVnFeatDicSub = {k: vnFeatDic[k] for k in range(ddStart, flnStart)
                              if vnFeatDic.get(k) is not None}
            if len(ddVnFeatDicSub) > 1:
                sortedSub = sorted(ddVnFeatDicSub.items(),
                                   key=operator.itemgetter(1))
                ddVnMax1 = sortedSub[-1]
                if ddVnMax1[1] > 1:  # Cond1: SC1 > 1
                    vnFeatDic[_em('DDMaxSC1')] = ddVnMax1[0]
                    if len(ddVnFeatDicSub) > 2:
                        ddVnMax2 = sortedSub[-2]
                        # Cond2: SC2 > 1 and SC2 > SC1/3
                        if ddVnMax2[1] > 1 and ddVnMax2[1] > ddVnMax1[1] / 3:
                            vnFeatDic[_em('DDMaxSC2')] = ddVnMax2[0]

            # DD adjacency metrics; rebase keys to 0 for the matrix lookups.
            ddVnFeatNewDic = {k - ddStart: v
                              for k, v in ddVnFeatDicSub.items()}
            cosList = get_sp_weights_from_matrix(ddVnFeatNewDic,
                                                 cosine_dist_all,
                                                 multiply_10=True)
            add_metrics_to_main_dict(vnFeatDic, cosList, spAllStart,
                                     spMetricList)
            spK1List = get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_k_1)
            add_metrics_to_main_dict(vnFeatDic, spK1List,
                                     spAllStart + len(spMetricList),
                                     spMetricList)
            spK3List = get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_k_3)
            add_metrics_to_main_dict(vnFeatDic, spK3List,
                                     spAllStart + 2 * len(spMetricList),
                                     spMetricList)
            splK1List = get_sp_length_from_dict(ddVnFeatNewDic, spl_all_k_1)
            add_metrics_to_main_dict(vnFeatDic, splK1List,
                                     spAllStart + 3 * len(spMetricList),
                                     spMetricList)
            splK3List = get_sp_length_from_dict(ddVnFeatNewDic, spl_all_k_3)
            add_metrics_to_main_dict(vnFeatDic, splK3List,
                                     spAllStart + 4 * len(spMetricList),
                                     spMetricList)

            # Three spread features for UPC string length, plus a
            # length-frequency feature per distinct length.
            upcVnLenList = [len(j) for j in upcVnList]
            if len(upcVnLenList) > 1:
                vnFeatDic[_em('sdUpcLen')] = np.std(upcVnLenList, ddof=1)
                vnFeatDic[_em('skewUpcLen')] = stats.skew(upcVnLenList)
                vnFeatDic[_em('kurtUpcLen')] = stats.kurtosis(upcVnLenList)
            if upcVnLenList:
                vnFeatDic.update(
                    {(upcLenStart + upcLenList.index(i)): upcVnLenList.count(i)
                     for i in set(upcVnLenList)})
            ## ************* CUSTOM FEATURES END ************* ##

            vnFeatDic[wStart + wList.index(w)] = 1  # weekday one-hot

            # Emit 'label idx:val idx:val ...' with ascending indices.
            parts = ['']
            for kk, vv in sorted(vnFeatDic.items()):
                parts.append(' %s:%s' % (str(kk), str(vv)))
            outF.write(str(firstCol) + ''.join(parts) + '\n')

            # Reset the per-visit accumulators for the next VN.
            vnFeatDic = {}
            upcVnList = []
            scVnList = []
            ddVnList = []
            flnVnList = []
            if count == LAST_ROW:
                break

        if for_test:
            firstCol = row[idxDic['VN_IDX']]  # test output keyed by VN
        else:
            firstCol = row[idxDic['TT_IDX']]  # train label (TripType)
        w = row[idxDic['W_IDX']]  # weekday; constant within a visit
        sc = int(row[idxDic['SC_IDX']])
        scVnList.append(sc)
        upc = row[idxDic['UPC_IDX']]
        if upc.isdigit():
            upcVnList.append(upc)
            for ke in returnUpcKeys(upc):
                # FIX: in test mode skip UPC keys absent from the training
                # vocabulary -- upcList.index(ke) would raise ValueError.
                if (not for_test) or (ke in upcList):
                    keIndex = upcStart + upcList.index(ke)
                    vnFeatDic[keIndex] = vnFeatDic.get(keIndex, 0) + sc
        dd = row[idxDic['DD_IDX']]
        if dd == 'MENSWEAR':  # normalize the one known spelling variant
            dd = 'MENS WEAR'
        ddVnList.append(dd)
        # FIX: same unseen-category guard for department descriptions.
        if (not for_test) or (dd in ddList):
            ddIdx = ddStart + ddList.index(dd)
            vnFeatDic[ddIdx] = vnFeatDic.get(ddIdx, 0) + sc
        fln = row[idxDic['FLN_IDX']]
        if fln:  # skip empty FinelineNumber
            flnVnList.append(fln)
            # FIX: same unseen-category guard for fineline numbers.
            if (not for_test) or (fln in flnList):
                flnIdx = flnStart + flnList.index(fln)
                vnFeatDic[flnIdx] = vnFeatDic.get(flnIdx, 0) + sc
        preVN = curVN
def outputSvmFormatFile(fReader, LAST_ROW, outF, for_test, idxDic):
    """Turn item rows, grouped by VisitNumber, into SVM-light lines.

    Scan counts are accumulated per visit into a feature dictionary
    (UPC-key, department and fineline slots); at each VisitNumber change
    the visit's statistic, adjacency and kNN-smoothed UPC features are
    added and a ``label idx:val idx:val ...`` line is written to ``outF``.
    In test mode (``for_test``) the label column is the VisitNumber and
    categories missing from the training vocabularies are skipped.

    Uses module-level feature-layout globals (emStart, ddStart, flnStart,
    upcStart, nextStart, upcLenStart, wStart, spAllStart, the *List
    vocabularies, the adjacency/kNN matrices) plus the helpers
    returnUpcKeys, get_sp_weights_from_matrix, get_sp_length_from_dict
    and add_metrics_to_main_dict.
    """

    def metricIdx(name):
        # Feature slot of a named extra metric.
        return emStart + extraMetricsList.index(name)

    count = 0
    preVN = -1
    firstCol = -1
    w = -1
    vnFeatDic = {}
    upcVnList = []
    scVnList = []
    ddVnList = []
    flnVnList = []
    next(fReader, None)  # skip header
    for row in fReader:
        curVN = row[idxDic['VN_IDX']]
        count += 1
        if count % 10000 == 0:  # progress trace
            print('--count -> ' + str(count))
            print('VisitNumber ->' + str(curVN))
            print(vnFeatDic)
        # Flush the previous visit on a VisitNumber change (or at EOF).
        if (preVN != curVN and len(vnFeatDic) > 0) or count == LAST_ROW:
            ## ************* CUSTOM FEATURES START ************* ##
            vnFeatDic[metricIdx('records')] = len(ddVnList)
            if len(ddVnList) > 1:
                vnFeatDic[metricIdx('uniqDD')] = len(np.unique(ddVnList))
                vnFeatDic[metricIdx('uniqFLN')] = len(np.unique(flnVnList))
            vnFeatDic[metricIdx('sumSC')] = sum(scVnList)

            # SC-VN stat features (need more than one sample).
            if len(scVnList) > 1:
                vnFeatDic[metricIdx('meanSC')] = np.mean(scVnList)
                vnFeatDic[metricIdx('medianSC')] = np.median(scVnList)
                vnFeatDic[metricIdx('maxSC')] = max(scVnList)
                vnFeatDic[metricIdx('sdSC')] = np.std(scVnList, ddof=1)
                vnFeatDic[metricIdx('skewSC')] = stats.skew(scVnList)
                vnFeatDic[metricIdx('kurtSC')] = stats.kurtosis(scVnList)
                vnFeatDic[metricIdx('iqrSC')] = np.subtract(
                    *np.percentile(scVnList, [75, 25]))

            # DD-VN-SC stat features over the populated DD slots.
            ddVnScList = [vnFeatDic.get(x) for x in range(ddStart, flnStart)
                          if vnFeatDic.get(x) is not None]
            if len(ddVnScList) > 1:
                vnFeatDic[metricIdx('meanDDSC')] = np.mean(ddVnScList)
                vnFeatDic[metricIdx('medianDDSC')] = np.median(ddVnScList)
                vnFeatDic[metricIdx('maxDDSC')] = max(ddVnScList)
                vnFeatDic[metricIdx('sdDDSC')] = np.std(ddVnScList, ddof=1)
                vnFeatDic[metricIdx('skewDDSC')] = stats.skew(ddVnScList)
                vnFeatDic[metricIdx('kurtDDSC')] = stats.kurtosis(ddVnScList)
                vnFeatDic[metricIdx('iqrDDSC')] = np.subtract(
                    *np.percentile(ddVnScList, [75, 25]))

            # FLN-VN stat features over the populated FLN slots.
            flnVnScList = [vnFeatDic.get(x) for x in range(flnStart, upcStart)
                           if vnFeatDic.get(x) is not None]
            if len(flnVnScList) > 1:
                vnFeatDic[metricIdx('meanFLNSC')] = np.mean(flnVnScList)
                vnFeatDic[metricIdx('medianFLNSC')] = np.median(flnVnScList)
                vnFeatDic[metricIdx('maxFLNSC')] = max(flnVnScList)
                vnFeatDic[metricIdx('sdFLNSC')] = np.std(flnVnScList, ddof=1)
                vnFeatDic[metricIdx('skewFLNSC')] = stats.skew(flnVnScList)
                vnFeatDic[metricIdx('kurtFLNSC')] = stats.kurtosis(flnVnScList)
                vnFeatDic[metricIdx('iqrFLNSC')] = np.subtract(
                    *np.percentile(flnVnScList, [75, 25]))

            # DDs with the top two SC; Cond: SC1>1 and SC2>SC1/3.
            ddVnFeatDicSub = {k: vnFeatDic[k]
                              for k in range(ddStart, flnStart)
                              if vnFeatDic.get(k) is not None}
            if len(ddVnFeatDicSub) > 1:
                bySc = sorted(ddVnFeatDicSub.items(),
                              key=operator.itemgetter(1))
                ddVnMax1 = bySc[-1]
                if ddVnMax1[1] > 1:  # Cond1: SC1 > 1
                    vnFeatDic[metricIdx('DDMaxSC1')] = ddVnMax1[0]
                    if len(ddVnFeatDicSub) > 2:
                        ddVnMax2 = bySc[-2]
                        # Cond2: SC2 > 1 and SC2 > SC1/3
                        if ddVnMax2[1] > 1 and ddVnMax2[1] > ddVnMax1[1] / 3:
                            vnFeatDic[metricIdx('DDMaxSC2')] = ddVnMax2[0]

            # DD adjacency metrics (slots 0-4 after spAllStart).
            ddVnFeatNewDic = {k - ddStart: v
                              for k, v in ddVnFeatDicSub.items()}
            ddMetricLists = [
                get_sp_weights_from_matrix(ddVnFeatNewDic, cosine_dist_all_dd,
                                           multiply_10=True),
                get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_dd_k_1),
                get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_dd_k_3),
                get_sp_length_from_dict(ddVnFeatNewDic, spl_all_dd_k_1),
                get_sp_length_from_dict(ddVnFeatNewDic, spl_all_dd_k_3),
            ]
            for slot, metricList in enumerate(ddMetricLists):
                add_metrics_to_main_dict(
                    vnFeatDic, metricList,
                    spAllStart + slot * len(spMetricList), spMetricList)

            # UPC adjacency metrics (slots 5-9 after spAllStart).
            upcVnFeatDicSub = {k: vnFeatDic[k]
                               for k in range(upcStart, nextStart)
                               if vnFeatDic.get(k) is not None}
            upcVnFeatNewDic = {k - upcStart: v
                               for k, v in upcVnFeatDicSub.items()}
            upcMetricLists = [
                get_sp_weights_from_matrix(upcVnFeatNewDic,
                                           cosine_dist_all_upc,
                                           multiply_10=True),
                get_sp_weights_from_matrix(upcVnFeatNewDic, sp_all_upc_k_1),
                get_sp_weights_from_matrix(upcVnFeatNewDic, sp_all_upc_k_3),
                get_sp_length_from_dict(upcVnFeatNewDic, spl_all_upc_k_1),
                get_sp_length_from_dict(upcVnFeatNewDic, spl_all_upc_k_3),
            ]
            for slot, metricList in enumerate(upcMetricLists, start=5):
                add_metrics_to_main_dict(
                    vnFeatDic, metricList,
                    spAllStart + slot * len(spMetricList), spMetricList)

            # Spread each positive UPC weight to its kNN neighbours: the
            # source keeps (1 - ll/10) of its value and each of the ll
            # neighbours gains 0.1 of it (knnmatrix[i].nonzero()[1] lists
            # the neighbours of the i'th element).
            smoothed = {}
            for upcKey, upcVal in upcVnFeatNewDic.items():
                if upcVal > 0:
                    neighbours = knn_upc_k_3[upcKey].nonzero()[1].tolist()
                    if upcKey in neighbours:
                        neighbours.remove(upcKey)  # drop self-neighbour
                    ll = len(neighbours)
                    smoothed[upcKey] = (smoothed.get(upcKey, 0)
                                        + (1 - ll / 10) * upcVal)
                    for neigh in neighbours:
                        smoothed[neigh] = smoothed.get(neigh, 0) + 0.1 * upcVal
            vnFeatDic.update({k + upcStart: v for k, v in smoothed.items()})

            # Spread features for UPC string length, plus a frequency
            # feature per distinct length.
            upcVnLenList = [len(j) for j in upcVnList]
            if len(upcVnLenList) > 1:
                vnFeatDic[metricIdx('sdUpcLen')] = np.std(upcVnLenList,
                                                          ddof=1)
                vnFeatDic[metricIdx('skewUpcLen')] = stats.skew(upcVnLenList)
                vnFeatDic[metricIdx('kurtUpcLen')] = stats.kurtosis(
                    upcVnLenList)
            if upcVnLenList:
                vnFeatDic.update(
                    {(upcLenStart + upcLenList.index(i)): upcVnLenList.count(i)
                     for i in set(upcVnLenList)})
            ## ************* CUSTOM FEATURES END ************* ##

            vnFeatDic[wStart + wList.index(w)] = 1  # weekday one-hot

            # Emit 'label idx:val idx:val ...' with ascending indices.
            fields = [str(firstCol)]
            fields.extend('%s:%s' % (k, v)
                          for k, v in sorted(vnFeatDic.items()))
            outF.write(' '.join(fields) + '\n')

            # Reset the per-visit accumulators.
            vnFeatDic = {}
            upcVnList = []
            scVnList = []
            ddVnList = []
            flnVnList = []
            if count == LAST_ROW:
                break

        if for_test:
            firstCol = row[idxDic['VN_IDX']]  # test output keyed by VN
        else:
            firstCol = row[idxDic['TT_IDX']]  # train label (TripType)
        w = row[idxDic['W_IDX']]  # weekday; constant within a visit
        sc = int(row[idxDic['SC_IDX']])
        scVnList.append(sc)
        upc = row[idxDic['UPC_IDX']]
        if upc.isdigit():
            upcVnList.append(upc)
            for ke in returnUpcKeys(upc):
                # In test mode, skip UPC keys unseen at train time.
                if (not for_test) or (ke in upcList):
                    keIndex = upcStart + upcList.index(ke)
                    vnFeatDic[keIndex] = vnFeatDic.get(keIndex, 0) + sc
        dd = row[idxDic['DD_IDX']]
        if dd == 'MENSWEAR':  # normalize the spelling variant
            dd = 'MENS WEAR'
        ddVnList.append(dd)
        if (not for_test) or (dd in ddList):
            ddIdx = ddStart + ddList.index(dd)
            vnFeatDic[ddIdx] = vnFeatDic.get(ddIdx, 0) + sc
        fln = row[idxDic['FLN_IDX']]
        if fln:  # skip empty FinelineNumber
            flnVnList.append(fln)
            if (not for_test) or (fln in flnList):
                flnIdx = flnStart + flnList.index(fln)
                vnFeatDic[flnIdx] = vnFeatDic.get(flnIdx, 0) + sc
        preVN = curVN
def outputSvmFormatFile(fReader, LAST_ROW, outF, for_test, idxDic):
    """Write one SVM-light formatted line per visit from grouped rows.

    Scan counts are accumulated per VisitNumber into a feature dict
    under department (DD) and fineline (FLN) slots; raw UPC strings are
    collected only for the UPC-length features.  On each VisitNumber
    change the statistic and adjacency features are added and a line
    ``label idx:val idx:val ...`` is written to ``outF`` (label is the
    VisitNumber in test mode, the TripType otherwise).

    Uses module-level feature-layout globals (emStart, ddStart, flnStart,
    nextStart, upcLenStart, wStart, spAllStart, the *List vocabularies,
    the adjacency matrices) and the helpers get_sp_weights_from_matrix,
    get_sp_length_from_dict and add_metrics_to_main_dict.
    """

    def feat(name):
        # Feature slot of a named extra metric.
        return emStart + extraMetricsList.index(name)

    count = 0
    preVN = -1
    firstCol = -1
    w = -1
    vnFeatDic = {}
    visitUpcs = []  # raw UPC strings for this visit
    scVnList = []
    ddVnList = []
    flnVnList = []
    next(fReader, None)  # skip header
    for row in fReader:
        curVN = row[idxDic['VN_IDX']]
        count += 1
        if count % 10000 == 0:  # progress trace
            print('--count -> ' + str(count))
            print('VisitNumber ->' + str(curVN))
            print(vnFeatDic)
        # Flush the previous visit on a VisitNumber change (or at EOF).
        if (preVN != curVN and len(vnFeatDic) > 0) or count == LAST_ROW:
            ## ************* CUSTOM FEATURES START ************* ##
            vnFeatDic[feat('records')] = len(ddVnList)
            vnFeatDic[feat('uniqDD')] = len(np.unique(ddVnList))
            vnFeatDic[feat('uniqFLN')] = len(np.unique(flnVnList))
            vnFeatDic[feat('sumSC')] = sum(scVnList)

            # SC-VN stat features (need more than one sample).
            if len(scVnList) > 1:
                vnFeatDic[feat('meanSC')] = np.mean(scVnList)
                vnFeatDic[feat('medianSC')] = np.median(scVnList)
                vnFeatDic[feat('maxSC')] = max(scVnList)
                vnFeatDic[feat('sdSC')] = np.std(scVnList, ddof=1)
                vnFeatDic[feat('skewSC')] = stats.skew(scVnList)
                vnFeatDic[feat('kurtSC')] = stats.kurtosis(scVnList)
                vnFeatDic[feat('iqrSC')] = np.subtract(
                    *np.percentile(scVnList, [75, 25]))

            # DD-VN-SC stat features over the populated DD slots.
            ddVnScList = [vnFeatDic.get(x) for x in range(ddStart, flnStart)
                          if vnFeatDic.get(x) is not None]
            if len(ddVnScList) > 1:
                vnFeatDic[feat('meanDDSC')] = np.mean(ddVnScList)
                vnFeatDic[feat('medianDDSC')] = np.median(ddVnScList)
                vnFeatDic[feat('maxDDSC')] = max(ddVnScList)
                vnFeatDic[feat('sdDDSC')] = np.std(ddVnScList, ddof=1)
                vnFeatDic[feat('skewDDSC')] = stats.skew(ddVnScList)
                vnFeatDic[feat('kurtDDSC')] = stats.kurtosis(ddVnScList)
                vnFeatDic[feat('iqrDDSC')] = np.subtract(
                    *np.percentile(ddVnScList, [75, 25]))

            # FLN-VN stat features over the populated FLN slots.
            flnVnScList = [vnFeatDic.get(x) for x in range(flnStart, nextStart)
                           if vnFeatDic.get(x) is not None]
            if len(flnVnScList) > 1:
                vnFeatDic[feat('meanFLNSC')] = np.mean(flnVnScList)
                vnFeatDic[feat('medianFLNSC')] = np.median(flnVnScList)
                vnFeatDic[feat('maxFLNSC')] = max(flnVnScList)
                vnFeatDic[feat('sdFLNSC')] = np.std(flnVnScList, ddof=1)
                vnFeatDic[feat('skewFLNSC')] = stats.skew(flnVnScList)
                vnFeatDic[feat('kurtFLNSC')] = stats.kurtosis(flnVnScList)
                vnFeatDic[feat('iqrFLNSC')] = np.subtract(
                    *np.percentile(flnVnScList, [75, 25]))

            # DDs with the top two SC; Cond: SC1>1 and SC2>SC1/3.
            ddVnFeatDicSub = {k: vnFeatDic[k]
                              for k in range(ddStart, flnStart)
                              if vnFeatDic.get(k) is not None}
            if len(ddVnFeatDicSub) > 1:
                bySc = sorted(ddVnFeatDicSub.items(),
                              key=operator.itemgetter(1))
                ddVnMax1 = bySc[-1]
                if ddVnMax1[1] > 1:  # Cond1: SC1 > 1
                    vnFeatDic[feat('DDMaxSC1')] = ddVnMax1[0]
                    if len(ddVnFeatDicSub) > 2:
                        ddVnMax2 = bySc[-2]
                        # Cond2: SC2 > 1 and SC2 > SC1/3
                        if ddVnMax2[1] > 1 and ddVnMax2[1] > ddVnMax1[1] / 3:
                            vnFeatDic[feat('DDMaxSC2')] = ddVnMax2[0]

            # DD adjacency metrics (slots 0-4 after spAllStart); keys are
            # rebased to 0 for the matrix lookups.
            ddVnFeatNewDic = {k - ddStart: v
                              for k, v in ddVnFeatDicSub.items()}
            adjacencyLists = [
                get_sp_weights_from_matrix(ddVnFeatNewDic, cosine_dist_all,
                                           multiply_10=True),
                get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_k_1),
                get_sp_weights_from_matrix(ddVnFeatNewDic, sp_all_k_3),
                get_sp_length_from_dict(ddVnFeatNewDic, spl_all_k_1),
                get_sp_length_from_dict(ddVnFeatNewDic, spl_all_k_3),
            ]
            for slot, metricList in enumerate(adjacencyLists):
                add_metrics_to_main_dict(
                    vnFeatDic, metricList,
                    spAllStart + slot * len(spMetricList), spMetricList)

            # Spread features for UPC string length, plus a frequency
            # feature per distinct length.
            upcVnLenList = [len(j) for j in visitUpcs]
            if len(upcVnLenList) > 1:
                vnFeatDic[feat('sdUpcLen')] = np.std(upcVnLenList, ddof=1)
                vnFeatDic[feat('skewUpcLen')] = stats.skew(upcVnLenList)
                vnFeatDic[feat('kurtUpcLen')] = stats.kurtosis(upcVnLenList)
            if upcVnLenList:
                vnFeatDic.update(
                    {(upcLenStart + upcLenList.index(i)): upcVnLenList.count(i)
                     for i in set(upcVnLenList)})
            ## ************* CUSTOM FEATURES END ************* ##

            vnFeatDic[wStart + wList.index(w)] = 1  # weekday one-hot

            # Emit 'label idx:val idx:val ...' with ascending indices.
            outF.write(str(firstCol)
                       + ''.join(' %s:%s' % (k, v)
                                 for k, v in sorted(vnFeatDic.items()))
                       + '\n')

            # Reset the per-visit accumulators.
            vnFeatDic = {}
            visitUpcs = []
            scVnList = []
            ddVnList = []
            flnVnList = []
            if count == LAST_ROW:
                break

        if for_test:
            firstCol = row[idxDic['VN_IDX']]  # test output keyed by VN
        else:
            firstCol = row[idxDic['TT_IDX']]  # train label (TripType)
        w = row[idxDic['W_IDX']]  # weekday; constant within a visit
        visitUpcs.append(row[idxDic['UPC_IDX']])
        sc = int(row[idxDic['SC_IDX']])
        scVnList.append(sc)
        dd = row[idxDic['DD_IDX']]
        if dd == 'MENSWEAR':  # normalize the spelling variant
            dd = 'MENS WEAR'
        ddVnList.append(dd)
        ddIdx = ddStart + ddList.index(dd)
        vnFeatDic[ddIdx] = vnFeatDic.get(ddIdx, 0) + sc
        fln = row[idxDic['FLN_IDX']]
        if fln:  # skip empty FinelineNumber
            flnVnList.append(fln)
            flnIdx = flnStart + flnList.index(fln)
            vnFeatDic[flnIdx] = vnFeatDic.get(flnIdx, 0) + sc
        preVN = curVN