def mrmrTest(cutMethod=1, method=0, runs=3):
    """Compare model scores before and after feature selection on each dataset.

    For every CSV in ``files`` (read from ``Data/``) the function:

    1. scores a model on all feature columns (every column but the last is
       treated as a feature, the last column as the target);
    2. obtains a feature ranking from ``rankExtraction`` (which is given the
       matching path under ``Data2/`` and ``method``);
    3. truncates the ranking at the position chosen by ``cutMethod``;
    4. scores the model again on the retained columns only.

    Scores, timings and the first retained features are printed; nothing is
    returned.

    Args:
        cutMethod: 0 cuts with ``cuts.greatestDiffCut`` on the feature
            importances, 1 cuts with ``cuts.monotonicValidationCut``.
        method: forwarded unchanged to ``rankExtraction``.
        runs: forwarded to the judge functions and to
            ``cuts.monotonicValidationCut``.

    Raises:
        ValueError: if ``cutMethod`` is neither 0 nor 1.
    """
    # Fail before any expensive work: an unknown cut method used to surface
    # only as an unbound ``cutpos`` after the baseline model had been trained.
    if cutMethod not in (0, 1):
        raise ValueError(
            "cutMethod must be 0 (greatestDiffCut) or 1 "
            "(monotonicValidationCut), got %r" % (cutMethod,))

    # Artificial datasets
    files = [
        'data1000-f1.csv', 'data1000-f2.csv', 'data1000-f3.csv', 'data1000-f4.csv',
        'data5000-f1.csv', 'data5000-f2.csv', 'data5000-f3.csv', 'data5000-f4.csv',
        'data20000-f1.csv', 'data20000-f2.csv', 'data20000-f3.csv', 'data20000-f4.csv',
        'data1000-f1-r500.csv', 'data5000-f1-r500.csv', 'data20000-f1-r500.csv',
    ]
    # Printed next to each file name; presumably the indices of the truly
    # relevant features of each synthetic dataset -- TODO confirm.
    buenos = [
        [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 8, 9], [0, 1, 6, 7], [0, 1, 3, 2],
        [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 8, 9], [0, 1, 6, 7], [0, 1, 3, 2],
        [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 8, 9], [0, 1, 6, 7], [0, 1, 3, 2],
        [0, 1, 2, 3, 4, 5, 6, 13, 14], [0, 1, 2, 3, 4, 5, 6, 13, 14],
        [0, 1, 2, 3, 4, 5, 6, 13, 14],
    ]
    # 0 selects ml.clasificationJudge, anything else ml.regresionJudge.
    modelsType = [0] * len(files)

    # Real datasets (swap in for the three lists above):
    # files = ['real/sonar_scale.csv', 'real/splice_scale.csv', 'real/colon-cancer.csv', 'real/leu.csv', 'real/duke.csv', 'real/BH20000.csv', 'real/madelon-test.csv']
    # buenos = [['?'],['?'],['?'],['?'],['?'],['?'],['?']]
    # modelsType = [0,0,0,0,0,0,0]

    # Iterate the three lists in lockstep. The previous manual index was
    # never incremented, so every file was paired with entry 0.
    for f, expected, modelType in zip(files, buenos, modelsType):
        filepath = 'Data/' + f
        filepath2 = 'Data2/' + f
        data = read_csv(filepath)
        # Positional indexing: DataFrame.ix no longer exists in pandas >= 1.0.
        X = np.array(data.iloc[:, 0:-1])
        y = np.array(data.iloc[:, -1])
        print(filepath, expected)

        judge = ml.clasificationJudge if modelType == 0 else ml.regresionJudge

        # Baseline: all features.
        startTime = time.time()
        acc = judge(X=X, y=y, testPerc=0.5, runs=runs)
        endTime = time.time()
        print("original:", acc, X.shape[1], str(round(endTime - startTime, 3)) + "s")

        # Feature selection: ranking followed by the chosen cut.
        startTime = time.time()
        rank, featureImportance = rankExtraction(filepath2, method)
        if cutMethod == 0:
            cutpos = cuts.greatestDiffCut(weights=featureImportance)
        else:
            cutpos = cuts.monotonicValidationCut(
                X=X, y=y, modelType=modelType, rank=rank,
                consecutives=5, runs=runs)
        rank = rank[0:cutpos]
        endTime = time.time()
        timefs = round(endTime - startTime, 3)

        # Re-evaluate on the retained columns only.
        X = np.array(data.iloc[:, rank])
        startTime = time.time()
        acc = judge(X=X, y=y, testPerc=0.5, runs=runs)
        endTime = time.time()
        timeml = round(endTime - startTime, 3)
        print("result: ", acc, timefs, timeml, len(rank), rank[0:5])
        print()
def featureSelection(X, y, modelType=0, runs=3, processes=0, measure=1, binMethod=0, cutMethod=1, minRed=0, rrThreshold=0.9, debug=False):
    """Rank the features of ``X`` against ``y`` and return the selected ones.

    The function computes one weight per feature, orders the features with
    ``ut.getOrderRank``, optionally truncates that ranking, and optionally
    passes the result through ``p.parallelRemoveRedundant``.

    Args:
        X: feature matrix; ``X.shape[1]`` is read for ``cutMethod == 2``.
        y: target values, forwarded to the weighting and cut helpers.
        modelType: forwarded to the validation-based cut helpers.
        runs: forwarded to the validation-based cut helpers.
        processes: forwarded to the ``p.*`` helpers.
        measure: a value ``<= 4`` is forwarded as the single ``measure`` of
            the weighting helper; 5 combines measures ``[0, 1]`` and 6
            combines ``[1, 3, 4]`` through ``ut.sumMixedCorrelation``. An
            explicit list/tuple of measures is combined the same way.
        binMethod: 0 weights with ``p.binStatic``, 1 with
            ``p.binarySearchBins``.
        cutMethod: -1 keeps the first 20 ranked features; 0 cuts with
            ``cuts.greatestDiffCut``; 1 with ``cuts.monotonicValidationCut``
            (``consecutives=5``); 2 with ``cuts.searchValidationCut``
            (``consecutives=X.shape[1]``); 3 with
            ``cuts.searchValidationCut`` (``consecutives=5``). Any other
            value keeps the full ranking.
        minRed: when 1, the ranking is filtered by
            ``p.parallelRemoveRedundant``.
        rrThreshold: passed as ``threshold`` to ``p.parallelRemoveRedundant``.
        debug: when true, prints the ranking after the cut and after the
            redundancy step.

    Returns:
        The resulting ranking (whatever sequence the last applied helper
        produced).

    Raises:
        ValueError: if ``measure`` or ``binMethod`` is not a supported value.
    """
    # Resolve composite measure codes up front. The old code rebound
    # ``measure`` to a list and then evaluated ``measure <= 4`` on it, which
    # only worked through Python 2's cross-type ordering and raises TypeError
    # on Python 3.
    compositeMeasures = {5: [0, 1], 6: [1, 3, 4]}
    if isinstance(measure, (list, tuple)):
        # Explicit measure lists took the combined path under Python 2.
        combined = True
    elif measure in compositeMeasures:
        measure = compositeMeasures[measure]
        combined = True
    elif measure <= 4:
        combined = False
    else:
        raise ValueError(
            "measure must be <= 4, 5 or 6 (or a list of measures), got %r"
            % (measure,))

    if binMethod not in (0, 1):
        raise ValueError(
            "binMethod must be 0 (binStatic) or 1 (binarySearchBins), got %r"
            % (binMethod,))

    def computeWeights(corrMethod):
        """Weight every feature with one correlation measure."""
        if binMethod == 0:
            return p.binStatic(X=X, y=y, processes=processes, measure=corrMethod)
        return p.binarySearchBins(
            X=X, y=y, processes=processes, measure=corrMethod,
            split=0, useSteps=2, normalizeResult=False, debug=False)

    if combined:
        weights = ut.sumMixedCorrelation(
            [computeWeights(corrMethod) for corrMethod in measure])
    else:
        weights = computeWeights(measure)

    rank = ut.getOrderRank(weights)

    if cutMethod == -1:
        rank = rank[0:20]
    elif cutMethod == 0:
        rank = rank[0:cuts.greatestDiffCut(weights=weights)]
    elif cutMethod == 1:
        rank = rank[0:cuts.monotonicValidationCut(
            X=X, y=y, modelType=modelType, rank=rank,
            consecutives=5, runs=runs)]
    elif cutMethod == 2:
        # searchValidationCut also returns the original rank positions,
        # which are not needed here.
        rank, _ = cuts.searchValidationCut(
            X=X, y=y, modelType=modelType, rank=rank,
            consecutives=X.shape[1], runs=runs)
    elif cutMethod == 3:
        rank, _ = cuts.searchValidationCut(
            X=X, y=y, modelType=modelType, rank=rank,
            consecutives=5, runs=runs)

    if debug:
        print("cutted", rank)

    if minRed == 1:
        # ``measure`` is passed in its resolved form (a list for 5/6), exactly
        # as the previous implementation did.
        rank = p.parallelRemoveRedundant(
            X=X, rank=rank, processes=processes, measure=measure,
            threshold=rrThreshold)

    if debug:
        print("mrmr", rank)
    return rank
Example #3
0
def artificialTest():
    """Benchmark the weighting, cut and redundancy stages on each dataset.

    For every CSV in ``files`` (read from ``Data/``; every column but the
    last is treated as a feature, the last as the target) the function
    prints, with timings:

    * feature weights from the serial (``bs.binStatic``) and parallel
      (``p.binStatic``) static search, plus the combination of measures 0
      and 1 through ``ut.sumMixedCorrelation`` (the combined weights are the
      ones ranked afterwards);
    * the model score on all features;
    * the score after each cut: ``cuts.greatestDiffCut``,
      ``cuts.monotonicValidationCut`` with ``consecutives=5`` and with
      ``consecutives=X.shape[1]``;
    * the features kept and dropped by ``p.parallelRemoveRedundant`` and the
      score on the kept ones.

    Nothing is returned.
    """
    # Alternative dataset selections (swap in for the three lists below).
    #
    # Synthetic classification datasets:
    # files = ['data1000-f1.csv', 'data1000-f2.csv','data1000-f3.csv','data1000-f4.csv','data5000-f1.csv', 'data5000-f2.csv','data5000-f3.csv','data5000-f4.csv','data20000-f1.csv', 'data20000-f2.csv','data20000-f3.csv','data20000-f4.csv','data1000-f1-r500.csv','data5000-f1-r500.csv','data20000-f1-r500.csv']
    # buenos = [[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,8,9],[0,1,6,7],[0,1,3,2],[0,1,2,3,4,5,6,13,14],[0,1,2,3,4,5,6,13,14],[0,1,2,3,4,5,6,13,14]]
    # modelsType = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    #
    # Synthetic regression datasets:
    # files = ['regression/reg1000-f1.csv']
    # buenos = [[0,1,2,3,4,5]]
    # modelsType = [1]

    # Real datasets
    files = [
        'real/sonar_scale.csv', 'real/splice_scale.csv',
        'real/colon-cancer.csv', 'real/leu.csv', 'real/duke.csv',
        'real/BH20000.csv', 'real/madelon-test.csv'
    ]
    buenos = [['?'], ['?'], ['?'], ['?'], ['?'], ['?'], ['?']]
    # 0 selects ml.clasificationJudge, anything else ml.regresionJudge.
    modelsType = [0, 0, 0, 0, 0, 0, 0]

    def elapsed(since):
        """Format the time passed since *since*, e.g. '1.234 seconds.'."""
        return str(round(time.time() - since, 3)) + " seconds."

    for f, expected, modelType in zip(files, buenos, modelsType):
        filename = 'Data/' + f

        ########### Separate Data ###########
        print(filename, expected)
        data = read_csv(filename)
        # Positional indexing: DataFrame.ix no longer exists in pandas >= 1.0.
        X = np.array(data.iloc[:, 0:-1])
        y = np.array(data.iloc[:, -1])

        # Pick the evaluator from the dataset's model type, as mrmrTest
        # does, so the regression dataset list above is scored correctly.
        judge = ml.clasificationJudge if modelType == 0 else ml.regresionJudge

        ########### Search ###########
        # Static search
        startTime = time.time()
        weights = bs.binStatic(X, y, 2)
        print("Serial static " + elapsed(startTime))
        print("weights:", weights[0:20])

        startTime = time.time()
        weights = p.binStatic(X, y, 0, 2)
        print("Parallel static " + elapsed(startTime))
        print("weights:", weights[0:20])

        weights = ut.sumMixedCorrelation(
            [bs.binStatic(X, y, 0),
             bs.binStatic(X, y, 1)])
        print("Combined Static:", weights)

        # Dynamic search (disabled). The equivalent calls are
        # bs.binarySearchBins(X, y, 2, 0, 2) for the serial run,
        # p.binarySearchBins(X, y, 0, 2, 0, 2) for the parallel run, and
        # ut.sumMixedCorrelation([bs.binarySearchBins(X, y, 0, 0, 2),
        #                         bs.binarySearchBins(X, y, 1, 0, 2)])
        # for the combined weights.

        ########### Cuts ###########
        print("\nCuts:")
        rank = ut.getOrderRank(weights)
        print("rank:", rank[0:20])

        startTime = time.time()
        print("Full features Accurracy:",
              judge(X=X, y=y, testPerc=0.5, runs=3))
        print("Full classification time: " + elapsed(startTime))

        # Cut at the greatest difference between consecutive weights.
        startTime = time.time()
        cutpos1 = cuts.greatestDiffCut(weights)
        print(rank[0:cutpos1])
        print("\nCut greatestDiffCut time: " + elapsed(startTime))
        startTime = time.time()
        print("greatestDiffCut Accurracy:",
              judge(X=X[:, rank[0:cutpos1]], y=y, testPerc=0.5, runs=3),
              " #features:", cutpos1)
        print("Classification greatestDiffCut time: " + elapsed(startTime))

        # Validation cut with a window of 5 consecutive features.
        startTime = time.time()
        cutpos2 = cuts.monotonicValidationCut(X=X,
                                              y=y,
                                              rank=rank,
                                              modelType=modelType,
                                              consecutives=5,
                                              runs=3)
        print("\nCut MonotonicValidationCut time: " + elapsed(startTime))
        startTime = time.time()
        print("MonotonicValidation Accurracy:",
              judge(X=X[:, rank[0:cutpos2]], y=y, testPerc=0.5, runs=3),
              " #features:", cutpos2)
        print("Classification MonotonicValidationCut time: " +
              elapsed(startTime))

        # Same cut with the window widened to the full feature count.
        startTime = time.time()
        cutpos3 = cuts.monotonicValidationCut(X=X,
                                              y=y,
                                              rank=rank,
                                              modelType=modelType,
                                              consecutives=X.shape[1],
                                              runs=3)
        print("Cut FullValidationCut time: " + elapsed(startTime))
        startTime = time.time()
        print("FullValidationCut Accurracy:",
              judge(X=X[:, rank[0:cutpos3]], y=y, testPerc=0.5, runs=3),
              " #features:", cutpos3)
        print("Classification FullValidationCut time: " + elapsed(startTime))

        ########### Removing redundant ###########
        originalRank = list(rank)
        print("\nFinding redundant features:")
        # Serial variant (disabled): set(bs.removeRedundant(X, rank)).

        startTime = time.time()
        # Work on a copy so originalRank is still intact for the report.
        notRedundant = set(p.parallelRemoveRedundant(X, list(originalRank)))
        print("Parallel mode")
        print("Original Rank:", originalRank)
        print("Not redundant:", notRedundant)
        print("Redundant:", set(originalRank).difference(notRedundant))
        timeRedundant = elapsed(startTime)
        rank = list(notRedundant)
        print("Time finding redundant: " + timeRedundant)
        print("Final not redundant features Accurracy:",
              judge(X=X[:, rank], y=y, testPerc=0.5, runs=3))

        print("-------------------------------------\n")