                  loss='binary_crossentropy', metrics=['accuracy'])   #tail of the truncated Keras model builder
    return model

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        global curr_size   #module-level no-op, kept so the model builder above can read the current subset size
        curr_size = int(size)
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("A")
        y_train = d.LoadDataSetClasses("A")
        print X_train.shape
        print y_train.shape
        #quick hack, will fix properly later: the loader returns the labels transposed
        y_train = numpy.transpose(y_train)
        print y_train.shape
        #coerce every label to int
        targets = list(y_train)
        y_train = []
        for i in targets:
            y_train.append(int(i))
        #first run: load the feature indices previously selected for dataset A with this method
        indices = joblib.load('datasetA_pickles/selected_indices_' + method + '.joblib.pkl')
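        #Downstream these indices are used to slice the selected columns out of the
        #full matrix, as done elsewhere in this codebase; a minimal sketch:
        X_selected = numpy.array(X_train)[:, indices]   #keep only the chosen features
        print X_selected.shape                          #(n_samples, curr_size)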
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import LeaveOneOut

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("B_train")
        y_train = d.LoadDataSetClasses("B_train")
        print X_train.shape
        print y_train.shape
        #quick hack, will fix properly later: the loader returns the labels transposed
        y_train = numpy.transpose(y_train)
        print y_train.shape
        targets = list(y_train)
        y_train = []
        for i in targets:
            y_train.append(int(i))
        #load the indices selected for dataset B at this subset size and method
        indices = joblib.load('datasetB_pickles/datasetB' + size + '-' + method + '.joblib.pkl')
preprocessings = ['Standard', 'Imputer', 'Robust', 'Quantile']
#datasets = ["A", "B"]
classifiers = ["MLP", "SVM", "AdaBoost", "DT", "RandomForest", "ExtraTree"]
validationTechniques = ["10FoldCV", "LOOCV"]   #added: the loop below iterated an undefined name
dataset = "B"
f = open('mcc/mccResults' + dataset + '.txt', 'w')
f.write("dataset, size, method, classifier, validationTechnique, mcc, timeTaken, extra info")
for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            for preproc in preprocessings:
                for validation in validationTechniques:
                    d = DataSetLoader()
                    X_train = d.LoadDataSet("B_train")
                    y_train = d.LoadDataSetClasses("B_train")
                    X_test = d.LoadDataSet("B_test")
                    y_test = d.LoadDataSetClasses("B_test")
                    #quick hack, will fix properly later: the loader returns the labels transposed
                    y_train = numpy.transpose(y_train)
                    print y_train.shape
                    targets = list(y_train)
                    y_train = []
                    for i in targets:
                        y_train.append(int(i))
                    y_test = numpy.transpose(y_test)   #bug fix: original transposed y_train again here
                    print y_test.shape
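                    #The preprocessing names presumably map onto the matching sklearn
                    #transformers; a minimal sketch against the older sklearn API
                    #(0.19-0.21, where Imputer still lived in sklearn.preprocessing):
                    from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, Imputer
                    preprocMap = {'Standard': StandardScaler(),
                                  'Imputer': Imputer(strategy='mean'),
                                  'Robust': RobustScaler(),
                                  'Quantile': QuantileTransformer()}
                    #X_train = preprocMap[preproc].fit_transform(X_train)
                    #X_test = preprocMap[preproc].transform(X_test)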
methods = ['MRMR', 'JMI', 'JMIM']
sizes = ['10', '50', '100', '150', '200', '250']
classifiers = ["MLP", "SVM", "RandomForest", "AdaBoost", "DT", "ExtraTree"]
validationTechniques = ["10FoldCV", "LOOCV"]   #bug fix: a trailing comma made this a one-element tuple
preps = ["Standard", "Robust", "Quantile", "Imputer"]
n_iter_search = 20
#Iterating over each dataset
for dataset in datasets:
    f = open('mcc/mccResults' + dataset + '.txt', 'a')
    f.write("dataset, size, method, classifier, validationTechnique, mcc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing")
    print "Dataset =", dataset
    #initiating the DataSetLoader object
    d = DataSetLoader()
    #loading the data and corresponding labels for this dataset
    X_train_full = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_validate_full = d.LoadDataSet(dataset + "_test")
    y_validate = d.LoadDataSetClasses(dataset + "_test")
    print "Dimensions of training data and labels:", X_train_full.shape, y_train.shape
    print "Dimensions of validation data and labels:", X_validate_full.shape, y_validate.shape
    #quick hack, will fix properly later: the loader returns the labels transposed
    targets = list(numpy.transpose(y_train))
    y_train = []
    for i in targets:
        y_train.append(int(i))
    targets = list(numpy.transpose(y_validate))
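    #n_iter_search suggests a randomized hyper-parameter search; a minimal sketch of
    #how it is presumably consumed (the estimator and parameter distributions below
    #are illustrative assumptions, not taken from this script):
    from scipy.stats import randint
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV
    param_dist = {'n_estimators': randint(50, 300), 'max_depth': [3, 5, 10, None]}
    search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                n_iter=n_iter_search, cv=10, scoring='accuracy')
    #search.fit(X_train_full, y_train)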
results = ""; FTs = eval(Dataset + "_FTs"); FT_Accuracies = eval(Dataset + "_FT_Accuracies") start_time = time.time(); padding = 0; #load the dataset d = DataSetLoader(); #X_train_full = d.LoadDataSet(Dataset+"_train"); #y_train = d.LoadDataSetClasses(Dataset+"_train"); #targets=list(numpy.transpose(y_train)) #y_train=[] #for i in targets: # y_train.append(int(i)) X_validate_full = d.LoadDataSet(Dataset + "_test"); y_validate = d.LoadDataSetClasses(Dataset + "_test"); print ("Dimensions of validation data and labels:", X_validate_full.shape, y_validate.shape) targets = list(numpy.transpose(y_validate)) y_validate = [] if Dataset == "C": y_validate = numpy.array(targets) #y_validate[y_validate == 0] = -1 else: for i in targets: y_validate.append(int(i)) y_test = y_validate actuals = ','.join([str(elem) for elem in y_test]) actuals = actuals.replace("\n","").replace("[","").replace("]","").replace(" ",",") #to handle dataset C targets
from MachineSpecificSettings import Settings
import scipy.io
import numpy
import time
from sklearn.externals import joblib   #added: used below but never imported
from DataSetLoaderLib import DataSetLoader
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score   #added: used below but never imported

targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
G = d.LoadDataSet("A")
indices = joblib.load('selected_indicesv2.joblib.pkl')
result = numpy.array(G)[:, indices]   #keep only the previously selected features
clf = MLPClassifier(activation='logistic', solver='sgd')
start_time = time.time()
scores = cross_val_score(clf, result, targets, cv=10)
end_time = time.time() - start_time
print end_time
for i in scores:
    print i
print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
filename = 'MLP_k-fold.joblib.pkl'
joblib.dump(clf, filename, compress=9)
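#Note that cross_val_score fits clones of clf, so the pickle above holds an
#unfitted model. If a usable model is wanted, fit explicitly before dumping;
#a minimal sketch (the *_fitted filename is hypothetical):
clf.fit(result, targets)
joblib.dump(clf, 'MLP_k-fold_fitted.joblib.pkl', compress=9)
loaded = joblib.load('MLP_k-fold_fitted.joblib.pkl')
print loaded.predict(result[:5])   #sanity-check on the first few samples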
methods = ['MRMR']                    #,'JMI','JMIM'
sizes = ['10']                        #,'50','100','150','200','250'
validationTechniques = ["10FoldCV"]   #"LOOCV",
preps = ["Standard", "Robust", "Quantile", "Imputer"]
#Iterating over each dataset
print "Dataset, prepType, validationTechnique, method, size"
for dataset in datasets:
    # f = open('mcc/mccEnsembleResults.txt', 'w')
    # f.write("dataset, size, method, classifier, validationTechnique, mcc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing")
    print "Dataset =", dataset
    #initiating the DataSetLoader object
    d = DataSetLoader()
    #loading the data and corresponding labels for this dataset
    X_train = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_test = d.LoadDataSet(dataset + "_test")
    y_test = d.LoadDataSetClasses(dataset + "_test")
    print "Dimensions of validation data and labels:", X_test.shape, y_test.shape
    #quick hack, will fix properly later: the loader returns the labels transposed
    targets = list(numpy.transpose(y_train))
    y_train = []
    for i in targets:
        y_train.append(int(i))
    targets = list(numpy.transpose(y_test))
    y_test = []
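    #This script logs ensemble results; a minimal sketch of the presumed
    #soft-voting ensemble over the individual classifiers (the estimator
    #choices below are illustrative assumptions):
    from sklearn.ensemble import VotingClassifier, RandomForestClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.tree import DecisionTreeClassifier
    ensemble = VotingClassifier(estimators=[('mlp', MLPClassifier()),
                                            ('rf', RandomForestClassifier()),
                                            ('dt', DecisionTreeClassifier())],
                                voting='soft')
    #ensemble.fit(X_train, y_train); print ensemble.score(X_test, y_test)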
""" TEST PROGRAM """ from GlobalUtils import * import scipy from MachineSpecificSettings import Settings import scipy.io import numpy from DataSetLoaderLib import DataSetLoader import csv #variables =joblib.load('DatasetA_Validation.joblib.pkl') #targets =joblib.load('DatasetA_ValidationClasses.joblib.pkl') d = DataSetLoader(); variables = d.LoadDataSet("A"); targets = d.LoadDataSetClasses("A"); def test_range(start,end,G,targets,half): try: print "first" print start print end-half SelectSubSetmRMR(G[:,start:end-half],targets) except: return 1 try: print "second" print start+half print end SelectSubSetmRMR(G[:,start+half:end],targets)
            i += 1
            values.append(i + self.add_by)
            joblib.dump(values, 'selected_features/datasetC/selected_indices' + '_' + useMethod + '.joblib.pkl', compress=9)
        except:
            #note: these names shadow the type/traceback built-ins
            type, value, traceback = sys.exc_info()
            print 'Error Occurred %s: %s: %s' % (type, value, traceback)
        threadLock.release()
        print len(values)
        print "Exiting " + self.name
        return

threads = []
d = DataSetLoader()
G = d.LoadDataSet("C_train")
targets = d.LoadDataSetClasses("C_train")
#quick hack, will fix properly later
#y_train = numpy.transpose(targets)
y_train = numpy.asarray(targets)
print y_train.shape
targets = list(y_train)
y_train = []
for i in targets:
    print i
    y_train.append(int(i))
targets = y_train
#targets = numpy.asarray(targets)
#targets = column_or_1d(targets, warn=True)
print "Dataset loaded"
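#Presumed launch code for the selector threads whose run() tail appears above
#(the class name FeatureSelectorThread and its constructor are assumptions):
for useMethod in ['MRMR', 'JMI', 'JMIM']:
    t = FeatureSelectorThread(useMethod, G, targets)   #hypothetical class
    threads.append(t)
    t.start()
for t in threads:
    t.join()   #wait for every selector to finish before exiting
print "All selector threads done"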
def performTreeletClustering(DatasetName):
    saveFreq = 1000   #checkpoint every saveFreq iterations
    x = -1            #last completed iteration; -1 means a fresh start
    if not os.path.isfile("objs.pickle"):
        print "New Start"
        d = DataSetLoader()
        G = d.LoadDataSet(DatasetName)
        F = G
        M = []
        cacheTopXPerPart = d.CacheTopXPerPart(DatasetName)
        corrCalculator = PairwisePearsonCorrelationCalculator()
        print "calling corr calculator"
        corrMatrix = corrCalculator.CalculateSimilarity(G, d.GetPartSize(DatasetName), cacheTopXPerPart)
    else:
        print "continuing from where we left off"
        d = DataSetLoader()
        G, F, M, x, corrMatrix, cacheTopXPerPart = read()
        corrCalculator = PairwisePearsonCorrelationCalculator()
        #save(G, F, M, i, corrMatrix, cacheTopXPerPart)
    p = F[0, :].size   #bug fix: p was only set on the resume path, so a fresh start crashed the loop below
    #we have already done iteration x (fresh start: x = -1), so resume at the next one
    i = x + 1
    print corrMatrix[0]
    while i < p:
        recalc = False
        if checkCorr(corrMatrix, p) == 0:
            print "ERROR IN CORRMATRIX INDEX"
            return 0
        #recalculate p: F shrinks by one column per merge
        p = F[0, :].size
        print "Value of i is : " + str(i) + " out of " + str(p)
        theVectors = corrMatrix[0]   #always the maximum-correlation pair, i.e. the next element to process
        try:
            if corrMatrix[0][3] == '':
                recalc = False
            else:
                print corrMatrix[0]
                recalc = True
        except:
            pass
        Fa = F[:, theVectors[0]]
        Fb = F[:, theVectors[1]]
        print "calling generate metagene"
        m = generateNewMetaGene(Fa, Fb)
        print "calling scipy delete on F"
        F = scipy.delete(F, theVectors[1], 1)
        if not len(M):   #first metagene in this matrix
            M = m
        else:
            M = numpy.column_stack((m, M))   #include it in the metagene set as well
        corrMatrix.pop(0)
        corrMatrix = corrCalculator.UpdateSimilarity(corrMatrix, F, list(m), theVectors[0], theVectors[1])
        F[:, theVectors[0]] = m
        if len(corrMatrix) <= 0 or recalc == True:
            #everything after this point is potentially incorrect, so recalculate the whole matrix
            corrMatrix = corrCalculator.CalculateSimilarity(F, d.GetPartSize(DatasetName), cacheTopXPerPart)
        if i % saveFreq == 0:
            save(G, F, M, i, corrMatrix, cacheTopXPerPart)
        i += 1
    F = numpy.column_stack((G, M))   #define the expanded feature set F = G U M
    return F
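#A minimal usage sketch (an assumption -- the original driver is not shown):
#expand dataset A's feature matrix with treelet metagenes and persist it.
expanded = performTreeletClustering("A")
print "expanded feature matrix:", expanded.shape
#joblib.dump(expanded, 'datasetA_expanded.joblib.pkl', compress=9)   #hypothetical path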