def _score_feature_subset(DATA, GMM_pairwise, MI_pairwise, f_subset, N, K):
    """Score one feature subset and return ``(subset_score, subset_seq)``.

    Thin wrapper around the four GAD calls that are made for every candidate;
    the size handed to ``GAD.get_subset_score`` is always the size of the
    subset that was actually evaluated.
    """
    # get the subset matrices
    DATA_subset, GMM_subset, MI_subset = GAD.get_subset_DATA_GMM_MI(
        DATA, GMM_pairwise, MI_pairwise, f_subset)
    # learn a DT on the feature subset
    DT = GAD.get_DT(MI_subset)
    # calculate DT p-value for each sample
    data_logpval = GAD.calculate_logpval_DT(DATA_subset, GMM_subset, DT)
    # calculate the subset score
    return GAD.get_subset_score(DT, data_logpval, N, K, len(f_subset))


def _keep_best_candidates(candidates, TA_num):
    """Return the ``TA_num`` lowest-scoring ``[f_subset, score]`` entries."""
    # sorted() is stable, so equally scored subsets keep evaluation order
    return sorted(candidates, key=lambda entry: entry[1])[0:TA_num]


def _trial_add_subsets(trial_subset, K):
    """Yield every distinct subset made by adding one unused feature.

    Each entry of ``trial_subset`` is ``[f_subset, score]``.  Results are
    ascending tuples, produced in the order (entry, added feature) with
    duplicates skipped, so ties are broken deterministically.
    """
    seen = set()
    for entry in trial_subset:
        base = list(entry[0])
        used = set(base)
        for feature in range(K):
            if feature in used:
                continue
            # turn into a sorted tuple for consistency with combinations()
            candidate = tuple(sorted(base + [feature]))
            if candidate in seen:
                continue
            seen.add(candidate)
            yield candidate


def get_top_anomaly(DATA,GMM_pairwise,MI_pairwise,max_order,
                    top_list=500,all=True,start_TA=float('inf'),TA_num=500):
    """Greedily collect anomalous samples, one best feature subset per round.

    Every round evaluates feature subsets of order 2..max_order, keeps the
    subset with the lowest score (only scores below 0 qualify), appends the
    samples reported for that subset to the result and removes them from the
    data before the next round.

    Parameters
    ----------
    DATA : 2-D numpy array, one row per sample and one column per feature.
        The caller's array is not modified.
    GMM_pairwise, MI_pairwise : passed unchanged to
        ``GAD.get_subset_DATA_GMM_MI``.
    max_order : largest subset size to evaluate (coerced with ``int``).
    top_list : stop once at least this many samples have been collected.
    all : when true, every order is searched exhaustively.  (The name shadows
        the builtin; kept for interface compatibility.)
    start_TA : with ``all`` false, orders above ``start_TA`` are searched by
        "trial-add": only the best subsets of the previous order, each
        extended by one feature, are evaluated.
    TA_num : number of best subsets carried from one order to the next.

    Returns
    -------
    (SEQ, BEST) : ``SEQ`` lists the original row indices of the collected
        samples in detection order; ``BEST`` holds one
        ``[n_samples, order, score, f_subset]`` entry per round.
    """
    BEST = []
    SEQ = []
    N, K = DATA.shape
    index_set = np.arange(N)      # maps current row numbers to original ones
    feature_set = range(K)
    max_order = int(max_order)    # tolerate a float such as K/2

    while len(SEQ) < top_list:
        N, K = DATA.shape  # N samples and K features; N shrinks every round
        if N == 0:
            break

        best_score = 0
        best_seq = best_order = best_fsubset = None
        trial_subset = []

        for order in range(2, max_order + 1):
            print('evaluating order '+str(order)+'\n')
            exhaustive = order <= start_TA or all
            if exhaustive:
                # combinations() yields each subset in ascending order
                candidates = itertools.combinations(feature_set, order)
            else:
                # trial-add one feature to the best order-1 candidates
                candidates = _trial_add_subsets(trial_subset, K)
            # scores are only kept when they seed or continue the trial-add
            remember = (order == start_TA) or not exhaustive

            scored = []
            for f_subset in candidates:
                subset_score, subset_seq = _score_feature_subset(
                    DATA, GMM_pairwise, MI_pairwise, f_subset, N, K)
                if subset_score < best_score:
                    best_score = subset_score
                    best_seq = subset_seq
                    best_order = order
                    best_fsubset = f_subset
                if remember:
                    scored.append([f_subset, subset_score])
            if remember:
                trial_subset = _keep_best_candidates(scored, TA_num)

        if best_seq is None or len(best_seq) == 0:
            # no subset scored below zero, or nothing was selected: another
            # round would repeat the same search forever, so stop here
            break

        BEST.append([len(best_seq), best_order, best_score, best_fsubset])
        print(str(len(best_seq))+' samples added into the list\n')
        SEQ.extend([index_set[idx] for idx in best_seq])
        # remove these samples; np.delete returns a new array, so rebind
        DATA = np.delete(DATA, best_seq, axis=0)
        index_set = np.delete(index_set, best_seq)

    return SEQ, BEST
if __name__ == '__main__':
    # Script entry point: fit the pairwise models on TRAIN.txt, rank the
    # samples of TEST.txt and report the ROC value computed against LABEL.txt.
    TRAIN = np.loadtxt('TRAIN.txt')
    DATA = np.loadtxt('TEST.txt')
    LABEL = np.loadtxt('LABEL.txt')
    # category value handed to GAD.calculate_roc as the "normal" one
    normal_cat = -1
    # max number of component per gmm models
    M_max = 50
    _, K = TRAIN.shape
    # get pairwise gmm clusters from TRAIN
    num_comp, GMM_pair = GAD.get_all_pairwise_gmm(TRAIN, M_max)
    # get mutual info for each gm pair, by mc sampling
    MI_pair = GAD.get_all_pairwise_MI(GMM_pair, 1e4)
    # the maximum order is half the number of features; floor division keeps
    # it an integer under both Python 2 and Python 3
    anomaly_list, BEST = get_top_anomaly(DATA, GMM_pair, MI_pair, K//2)
    roc_auc = GAD.calculate_roc(anomaly_list, LABEL, normal_cat)
    print('the final roc is ' + str(roc_auc))