def mrmrTest(cutMethod=1, method=0, runs=3):
    """Benchmark feature ranking + rank-cutting on a list of datasets.

    For each CSV, prints the accuracy with the full feature set, then ranks
    the features via rankExtraction, cuts the ranking with the selected cut
    strategy, and prints the accuracy with only the selected features.

    cutMethod -- 0: cuts.greatestDiffCut, 1: cuts.monotonicValidationCut
    method    -- ranking method code forwarded to rankExtraction
    runs      -- repetitions forwarded to the accuracy judges
    """
    # Artificial datasets; `buenos` holds the known relevant-feature indices.
    files = ['data1000-f1.csv', 'data1000-f2.csv', 'data1000-f3.csv',
             'data1000-f4.csv', 'data5000-f1.csv', 'data5000-f2.csv',
             'data5000-f3.csv', 'data5000-f4.csv', 'data20000-f1.csv',
             'data20000-f2.csv', 'data20000-f3.csv', 'data20000-f4.csv',
             'data1000-f1-r500.csv', 'data5000-f1-r500.csv',
             'data20000-f1-r500.csv']
    buenos = [[0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 8, 9], [0, 1, 6, 7],
              [0, 1, 3, 2], [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 8, 9],
              [0, 1, 6, 7], [0, 1, 3, 2], [0, 1, 2, 3, 4, 5, 6, 13, 14],
              [0, 1, 8, 9], [0, 1, 6, 7], [0, 1, 3, 2],
              [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 2, 3, 4, 5, 6, 13, 14],
              [0, 1, 2, 3, 4, 5, 6, 13, 14]]
    modelsType = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    # Real datasets (relevant features unknown):
    #files = ['real/sonar_scale.csv', 'real/splice_scale.csv', 'real/colon-cancer.csv', 'real/leu.csv', 'real/duke.csv', 'real/BH20000.csv', 'real/madelon-test.csv']
    #buenos = [['?'],['?'],['?'],['?'],['?'],['?'],['?']]
    #modelsType = [0,0,0,0,0,0,0]

    # BUG FIX: the original initialized i=0 but never incremented it, so every
    # dataset was evaluated against modelsType[0]/buenos[0]. enumerate keeps
    # the index in lockstep with the file list.
    for i, f in enumerate(files):
        modelType = modelsType[i]
        filepath = 'Data/' + f
        filepath2 = 'Data2/' + f
        data = read_csv(filepath)
        # .ix was removed from pandas; .iloc is the positional equivalent.
        X = np.array(data.iloc[:, 0:-1])
        y = np.array(data.iloc[:, -1])
        print(filepath, buenos[i])

        # Baseline accuracy on the full feature set.
        startTime = time.time()
        if modelType == 0:
            acc = ml.clasificationJudge(X=X, y=y, testPerc=0.5, runs=runs)
        else:
            acc = ml.regresionJudge(X=X, y=y, testPerc=0.5, runs=runs)
        endTime = time.time()
        print("original:", acc, X.shape[1],
              str(round(endTime - startTime, 3)) + "s")

        # Rank the features and cut the ranking.
        startTime = time.time()
        [rank, featureImportance] = rankExtraction(filepath2, method)
        if cutMethod == 0:
            cutpos = cuts.greatestDiffCut(weights=featureImportance)
        elif cutMethod == 1:
            cutpos = cuts.monotonicValidationCut(X=X, y=y, modelType=modelType,
                                                 rank=rank, consecutives=5,
                                                 runs=runs)
        else:
            # Previously an unknown cutMethod crashed later with a NameError
            # on `cutpos`; fail early with a clear message instead.
            raise ValueError("unsupported cutMethod: %r" % (cutMethod,))
        rank = rank[0:cutpos]
        endTime = time.time()
        timefs = round(endTime - startTime, 3)

        # Accuracy restricted to the selected features only.
        X = np.array(data.iloc[:, rank])
        startTime = time.time()
        if modelType == 0:
            acc = ml.clasificationJudge(X=X, y=y, testPerc=0.5, runs=runs)
        else:
            acc = ml.regresionJudge(X=X, y=y, testPerc=0.5, runs=runs)
        endTime = time.time()
        timeml = round(endTime - startTime, 3)
        print("result: ", acc, timefs, timeml, len(rank), rank[0:5])
        print()
def featureSelection(X, y, modelType=0, runs=3, processes=0, measure=1,
                     binMethod=0, cutMethod=1, minRed=0, rrThreshold=0.9,
                     debug=False):
    """Rank features by (possibly combined) correlation weights, then cut.

    measure   -- 0..4: a single correlation measure; 5: combine measures [0,1];
                 6: combine measures [1,3,4]
    binMethod -- 0: static binning, 1: binary-search binning
    cutMethod -- -1: keep the top 20; 0: greatestDiffCut;
                 1: monotonicValidationCut; 2: searchValidationCut over all
                 features; 3: searchValidationCut with 5 consecutives
    minRed    -- 1: additionally drop redundant features (threshold rrThreshold)
    Returns the list of selected feature indices.
    """
    # Expand the combined-measure codes. `measure` is deliberately rebound to
    # the list form because parallelRemoveRedundant below receives it as-is.
    if isinstance(measure, int) and measure <= 4:
        corrMethod = measure
    elif measure == 5:
        measure = [0, 1]
    elif measure == 6:
        measure = [1, 3, 4]

    wlist = []
    # BUG FIX: the original re-tested `measure <= 4` after `measure` may have
    # become a list; on Python 3 `list <= int` raises TypeError (the code only
    # ran under Python 2's cross-type ordering). Test the type explicitly.
    if not isinstance(measure, list):
        if binMethod == 0:
            weights = p.binStatic(X=X, y=y, processes=processes,
                                  measure=corrMethod)
        elif binMethod == 1:
            weights = p.binarySearchBins(X=X, y=y, processes=processes,
                                         measure=corrMethod, split=0,
                                         useSteps=2, normalizeResult=False,
                                         debug=False)
    else:
        # One weight vector per elementary measure, then combine them.
        for corrMethod in measure:
            if binMethod == 0:
                wlist.append(p.binStatic(X=X, y=y, processes=processes,
                                         measure=corrMethod))
            elif binMethod == 1:
                wlist.append(p.binarySearchBins(X=X, y=y, processes=processes,
                                                measure=corrMethod, split=0,
                                                useSteps=2,
                                                normalizeResult=False,
                                                debug=False))
        weights = ut.sumMixedCorrelation(wlist)

    rank = ut.getOrderRank(weights)
    if cutMethod == -1:
        rank = rank[0:20]
    if cutMethod == 0:
        rank = rank[0:cuts.greatestDiffCut(weights=weights)]
    elif cutMethod == 1:
        rank = rank[0:cuts.monotonicValidationCut(X=X, y=y,
                                                  modelType=modelType,
                                                  rank=rank, consecutives=5,
                                                  runs=runs)]
    elif cutMethod == 2:
        [rank, originalRankPositions] = cuts.searchValidationCut(
            X=X, y=y, modelType=modelType, rank=rank,
            consecutives=X.shape[1], runs=runs)
    elif cutMethod == 3:
        [rank, originalRankPositions] = cuts.searchValidationCut(
            X=X, y=y, modelType=modelType, rank=rank, consecutives=5,
            runs=runs)
    if debug:
        # BUG FIX: these were Python 2 print statements — SyntaxError on
        # Python 3, which the rest of this file targets.
        print("cutted", rank)
    if minRed == 1:
        rank = p.parallelRemoveRedundant(X=X, rank=rank, processes=processes,
                                         measure=measure,
                                         threshold=rrThreshold)
    if debug:
        print("mrmr", rank)
    return rank
def artificialTest():
    """Exercise the binning, cutting and redundancy-removal stages end to end.

    For each dataset: computes correlation weights (serial vs parallel static
    binning, plus a combined-measure run), ranks the features, compares three
    cut strategies, and finally removes redundant features, printing the
    accuracy and wall-clock time of every step.
    """
    # Synthetic classification datasets:
    #files = ['data1000-f1.csv', 'data1000-f2.csv','data1000-f3.csv','data1000-f4.csv','data5000-f1.csv', 'data5000-f2.csv','data5000-f3.csv','data5000-f4.csv','data20000-f1.csv', 'data20000-f2.csv','data20000-f3.csv','data20000-f4.csv','data1000-f1-r500.csv','data5000-f1-r500.csv','data20000-f1-r500.csv']
    #buenos = [[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,2,3,4,5,6,13,14],[0,1,2,3,4,5,6,13,14]]
    #modelsType = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    # Synthetic regression datasets:
    #files = ['regression/reg1000-f1.csv']
    #buenos = [[0,1,2,3,4,5]]
    #modelsType = [1]
    # Real datasets (relevant features unknown):
    files = ['real/sonar_scale.csv', 'real/splice_scale.csv',
             'real/colon-cancer.csv', 'real/leu.csv', 'real/duke.csv',
             'real/BH20000.csv', 'real/madelon-test.csv']
    buenos = [['?'], ['?'], ['?'], ['?'], ['?'], ['?'], ['?']]
    modelsType = [0, 0, 0, 0, 0, 0, 0]

    i = 0
    for f in files:
        modelType = modelsType[i]
        filename = 'Data/' + f

        ########### Separate Data ###########
        print(filename, buenos[i])
        data = read_csv(filename)
        # .ix was removed from pandas; .iloc is the positional equivalent.
        X = np.array(data.iloc[:, 0:-1])
        y = np.array(data.iloc[:, -1])

        ########### Search ###########
        # Static binning: serial vs parallel, then a combined-measure run.
        startTime = time.time()
        weights = bs.binStatic(X, y, 2)
        endTime = time.time()
        print("Serial static " + str(round(endTime - startTime, 3))
              + " seconds.")
        print("weights:", weights[0:20])

        startTime = time.time()
        weights = p.binStatic(X, y, 0, 2)
        endTime = time.time()
        print("Parallel static " + str(round(endTime - startTime, 3))
              + " seconds.")
        print("weights:", weights[0:20])

        weights = ut.sumMixedCorrelation(
            [bs.binStatic(X, y, 0), bs.binStatic(X, y, 1)])
        print("Combined Static:", weights)
        # NOTE: a dynamic (binarySearchBins) variant existed here as disabled
        # Python-2 code inside a string literal; it was removed as dead code.

        ########### Cuts ###########
        print("\nCuts:")
        rank = ut.getOrderRank(weights)
        print("rank:", rank[0:20])

        # Baseline: classify with every feature.
        startTime = time.time()
        print("Full features Accurracy:",
              ml.clasificationJudge(X=X, y=y, testPerc=0.5, runs=3))
        endTime = time.time()
        print("Full classification time: "
              + str(round(endTime - startTime, 3)) + " seconds.")

        # Cut 1: greatest weight-difference cut.
        startTime = time.time()
        cutpos1 = cuts.greatestDiffCut(weights)
        print(rank[0:cutpos1])
        endTime = time.time()
        print("\nCut greatestDiffCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")
        startTime = time.time()
        print("greatestDiffCut Accurracy:",
              ml.clasificationJudge(X=X[:, rank[0:cutpos1]], y=y,
                                    testPerc=0.5, runs=3),
              " #features:", cutpos1)
        endTime = time.time()
        print("Classification greatestDiffCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")

        # Cut 2: monotonic validation cut (stop after 5 non-improving steps).
        startTime = time.time()
        cutpos2 = cuts.monotonicValidationCut(X=X, y=y, rank=rank,
                                              modelType=modelType,
                                              consecutives=5, runs=3)
        endTime = time.time()
        print("\nCut MonotonicValidationCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")
        startTime = time.time()
        print("MonotonicValidation Accurracy:",
              ml.clasificationJudge(X=X[:, rank[0:cutpos2]], y=y,
                                    testPerc=0.5, runs=3),
              " #features:", cutpos2)
        endTime = time.time()
        print("Classification MonotonicValidationCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")

        # Cut 3: full validation cut (consecutives spans all features).
        startTime = time.time()
        cutpos3 = cuts.monotonicValidationCut(X=X, y=y, rank=rank,
                                              modelType=modelType,
                                              consecutives=X.shape[1], runs=3)
        endTime = time.time()
        print("Cut FullValidationCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")
        startTime = time.time()
        print("FullValidationCut Accurracy:",
              ml.clasificationJudge(X=X[:, rank[0:cutpos3]], y=y,
                                    testPerc=0.5, runs=3),
              " #features:", cutpos3)
        endTime = time.time()
        print("Classification FullValidationCut time: "
              + str(round(endTime - startTime, 3)) + " seconds.")

        ########### Removing redundant ###########
        originalRank = list(rank)
        print("\nFinding redundant features:")
        # NOTE: a serial removeRedundant variant existed here as disabled
        # Python-2 code inside a string literal; it was removed as dead code.
        startTime = time.time()
        rank = list(originalRank)
        rank = set(p.parallelRemoveRedundant(X, rank))
        print("Parallel mode")
        print("Original Rank:", originalRank)
        print("Not redundant:", rank)
        print("Redundant:", set(originalRank).difference(set(rank)))
        endTime = time.time()
        rank = list(rank)
        print("Time finding redundant: "
              + str(round(endTime - startTime, 3)) + " seconds.")
        print("Final not redundant features Accurracy:",
              ml.clasificationJudge(X=X[:, rank], y=y, testPerc=0.5, runs=3))

        i = i + 1
        print("-------------------------------------\n")