def loadDataset(identifier):
    d = DataSetLoader()
    x = d.LoadDataSet(identifier)
    print 'X', x.shape
    y = d.LoadDataSetClasses(identifier)
    print 'Y', y.shape
    # flatten the (n, 1) class matrix into a plain list of ints
    y = numpy.transpose(y.astype(numpy.int64))
    print 'Y', y.shape
    y = [int(i) for i in y]
    print len(y)
    return x, y
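The transpose / int() loop in loadDataset only flattens an (n, 1) class matrix into a flat list of ints. A minimal equivalent sketch, assuming DataSetLoaderLib is importable as in the later examples; the helper name load_labels_flat is illustrative only:

import numpy
from DataSetLoaderLib import DataSetLoader

def load_labels_flat(identifier):
    # ravel() collapses an (n, 1) or (1, n) label matrix to shape (n,) in one step
    d = DataSetLoader()
    y = d.LoadDataSetClasses(identifier)
    return numpy.asarray(y).ravel().astype(int).tolist()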
Example #2
def loadDataset(identifier):
    d = DataSetLoader()
    x = d.LoadDataSet(identifier)
    print 'X', x.shape
    y = d.LoadDataSetClasses(identifier)
    print 'Y', y.shape
    #y = numpy.transpose(y.astype(numpy.int64))
    # column_or_1d flattens the (n, 1) class matrix to shape (n,)
    y = sklearn.utils.validation.column_or_1d(y, warn=True)
    print 'Y', y.shape
    y = [int(i) for i in y]
    print len(y)
    return x, y
Example #3
    # check selected features
    print(feat_selector.support_)

    # check ranking of features
    print(feat_selector.ranking_)
    print(len(feat_selector.ranking_))
    selected_indices = feat_selector.ranking_

    # call transform() on X to filter it down to selected features
    X_filtered = feat_selector.transform(X)
    return [X_filtered, selected_indices]
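feat_selector in the fragment above is whatever selector object was built earlier in this example (not shown); any scikit-learn-style selector that exposes support_, ranking_ and transform() fits the same pattern. A small self-contained sketch using RFE, purely as an assumed stand-in for the original selector:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

X_demo, y_demo = make_classification(n_samples=100, n_features=20, random_state=1)
feat_selector = RFE(RandomForestClassifier(n_estimators=50), n_features_to_select=5)
feat_selector.fit(X_demo, y_demo)
print(feat_selector.support_)    # boolean mask of the selected features
print(feat_selector.ranking_)    # 1 = selected, larger = eliminated earlier
X_demo_filtered = feat_selector.transform(X_demo)  # keep only the selected columns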



d = DataSetLoader()
x = d.LoadDataSet("B_train")
y = d.LoadDataSetClasses("B_train")
print y.shape
y = numpy.transpose(y)
print x.shape
print y.shape
target = []
y = list(y)
for i in y:
    target.append(int(i))
print len(y)
sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
Example #4
from MachineSpecificSettings import Settings
import scipy.io
import numpy
from DataSetLoaderLib import DataSetLoader
import csv
#Used for storing and loading the trained classifier
from sklearn.externals import joblib

print("")
print("")
print("")
print("")
#targets = numpy.array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1])
targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
variables = None
d = DataSetLoader()
variables = d.LoadDataSet("A")

#variables = G[:,0:100];
indices = joblib.load('selected_indices_MRMR.joblib.pkl')
variables = numpy.array(variables)[:, indices]
#print variables.shape
#print len(variables)
"""
convert an array to csv
http://stackoverflow.com/questions/16482895/convert-a-numpy-array-to-a-csv-string-and-a-csv-string-back-to-a-numpy-array
targetsString = ','.join(['%d' % num for num in targets[0]])
variablesString = ','.join(['%.5f' % num for num in variables[0]])
numpy.fromstring(targetsString, sep=',')
Example #5
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from MachineSpecificSettings import Settings
from DataSetLoaderLib import DataSetLoader
from sklearn.externals import joblib
from evolutionary_search import EvolutionaryAlgorithmSearchCV


y = np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
X_original = d.LoadDataSet("A")
paramgrid = {"kernel": ["rbf"],
             "C"     : np.logspace(-9, 9, num=25, base=10),
             "gamma" : np.logspace(-9, 9, num=25, base=10)}

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
targets = np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))

for method in methods:
    for size in sizes:
        random.seed(1)
        X = X_original
        indices = joblib.load(method + ' PICKLES/selected_indices_' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        indices = joblib.load(method + ' PICKLES/' + size + '-' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        f = open('genetic/' + method + '-' + size + '.txt', 'w')
        print size
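For context, the paramgrid above is the search space for the sklearn-deap EvolutionaryAlgorithmSearchCV imported at the top of this example. A hedged sketch of how it is typically wired up, continuing from the X and y built above; the population and mutation settings are illustrative values in the spirit of the library's README, not settings taken from this project:

search = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                       params=paramgrid,
                                       scoring="accuracy",
                                       cv=StratifiedKFold(y, n_folds=4),
                                       verbose=1,
                                       population_size=50,
                                       gene_mutation_prob=0.10,
                                       gene_crossover_prob=0.5,
                                       tournament_size=3,
                                       generations_number=10)
search.fit(X, y)
print search.best_params_, search.best_score_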
Example #6
import scipy.io
import numpy
from DataSetLoaderLib import DataSetLoader
import csv
#Used for storing and loading the trained classifier
from sklearn.externals import joblib

print("")
print("")
print("")
print("")

variables = None
targets = None

d = DataSetLoader()
variables = d.LoadDataSet("A")
targets = d.LoadDataSetClasses("A")
"""
convert an array to csv
http://stackoverflow.com/questions/16482895/convert-a-numpy-array-to-a-csv-string-and-a-csv-string-back-to-a-numpy-array
targetsString = ','.join(['%d' % num for num in targets[0]])
variablesString = ','.join(['%.5f' % num for num in variables[0]])
numpy.fromstring(targetsString, sep=',')

load a csv to an array
http://stackoverflow.com/questions/13381815/python-csv-text-file-to-arrayi-j
"""
selected_indices = []
[subset, selected_indices] = SelectSubSetmRMR(variables, targets)
Example #7
from DataSetLoaderLib import DataSetLoader
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier


sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("B_train")
        y_train = d.LoadDataSetClasses("B_train")
        print X_train.shape
        print y_train.shape
        #chaipee will fix it later on
        y_train = numpy.transpose(y_train)
        print y_train.shape
        targets = list(y_train)
        y_train = []
        for i in targets:
            #print i
            y_train.append(int(i))
        #print len(y_train)

        indices = joblib.load('datasetB_pickles/datasetB' + size + '-' + method + '.joblib.pkl')
Example #8
                    if i + self.add_by >= 545089:
                        i += 1
                    values.append(i + self.add_by)
                joblib.dump(values, 'selected_features/datasetC/selected_indices' + '_' + useMethod + '.joblib.pkl', compress=9)
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print('Error Occurred %s: %s: %s' % (exc_type, exc_value, exc_traceback))
        threadLock.release()
        print len(values)
        print "Exiting " + self.name
        return
		

threads = []

d = DataSetLoader()
G = d.LoadDataSet("C_train")
targets = d.LoadDataSetClasses("C_train")
#chaipee will fix it later on
#y_train=numpy.transpose(targets)
y_train = numpy.asarray(targets)
print y_train.shape
targets = list(y_train)
y_train = []
for i in targets:
    print i
    y_train.append(int(i))
targets = y_train
#targets = numpy.asarray(targets)
#targets = column_or_1d(targets, warn=True)
Example #9
#datasets = ["A","B"]
classifiers = ["AdaBoost", "DT", "MLP", "SVM", "RandomForest", "ExtraTree"]
dataset = "B"
f = open('mcc/B-Full-mccResults' + dataset + '.txt', 'w')
f.write(
    "dataset, size, method, classifier, validationTechnique, mc, timeTaken, extra info"
)

for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            for preproc in preprocessings:
                for validation in validationTechnique:
                    #print size
                    #print method
                    d = DataSetLoader()
                    X_train = d.LoadDataSet("B_train")
                    y_train = d.LoadDataSetClasses("B_train")
                    X_test = d.LoadDataSet("B_test")
                    y_test = d.LoadDataSetClasses("B_test")

                    #chaipee will fix it later on
                    y_train = numpy.transpose(y_train)
                    print y_train.shape
                    targets = list(y_train)
                    y_train = []
                    for i in targets:
                        #print i
                        y_train.append(int(i))

                    y_test = numpy.transpose(y_test)
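The mc field written to the results file refers to the Matthews correlation coefficient; a minimal sketch of computing it with scikit-learn once a classifier's predictions are available (the classifier wiring is outside this fragment, so the labels below are toy values):

from sklearn.metrics import matthews_corrcoef

y_true = [1, 0, 1, 1, 0, 1]   # toy ground-truth labels
y_pred = [1, 0, 0, 1, 0, 1]   # toy predictions
print matthews_corrcoef(y_true, y_pred)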
Example #10
                    i += 1
                values.append(i + self.add_by)
            joblib.dump(values,
                        'selected_indices' + '_' + useMethod + '.joblib.pkl',
                        compress=9)
        except:
            print "Error Occured"
        threadLock.release()
        print len(values)
        print "Exiting " + self.name
        return


threads = []

d = DataSetLoader()
G = d.LoadDataSet("B_train")
targets = d.LoadDataSetClasses("B_train")

print "Dataset loaded"

G = numpy.asarray(G)
targets = numpy.asarray(targets)
threadLock = threading.Lock()
print G.shape
vals = 649
original = 649
for i in range(0, 1547):
    print "vals= " + str(vals) + "\n"
    # Create new threads
    thread = myThread(i, "Thread-" + str(i), vals - original,
Example #11
#add headers
sizes = ['10','50','100','150','200','250']
methods = ['MRMR','JMI','JMIM']
validationTechnique = ['LOOCV',"10FoldCV"]
preprocessing = ['Standard','Imputer','Robust','Quantile']
#datasets = ["A","B"]
classifiers = ["MLP","SVM","AdaBoost","DT","RandomForest","ExtraTree"]
dataset = "A"
f = open('mcc/mccResults' + dataset + '.txt', 'w')
f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, extra info")

for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            for preproc in preprocessing:
                d = DataSetLoader()
                X_train = d.LoadDataSet(dataset)
                y_train = d.LoadDataSetClasses(dataset)
                #print X_train.shape
                #print y_train.shape
                #chaipee will fix it later on
                y_train = numpy.transpose(y_train)
                #print y_train.shape
                targets = list(y_train)
                y_train = []
                for i in targets:
                    #print i
                    y_train.append(int(i))
                #print len(y_train)

                #first run indices
Example #12
sizes = ['10','50','100','150','200','250']
methods = ['MRMR','JMI','JMIM']
#validationTechnique = ['LOOCV',"10FoldCV"] -- NOT USED???
#preprocessing = ['','NP']
#datasets = ["A","B"]
classifiers = ["MLP","SVM","AdaBoost","DT","RandomForest","ExtraTree"]

f = open('mcc/mccResultsC.txt', 'w')
f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken")

for classifierName in classifiers:
    for method in methods:
        for size in sizes:
            #print size
            #print method
            d = DataSetLoader()
            X_train = d.LoadDataSet("C_train")
            y_train = d.LoadDataSetClasses("C_train")
            X_test = d.LoadDataSet("C_test")
            y_test = d.LoadDataSetClasses("C_test")

            #chaipee will fix it later on
            y_train = numpy.transpose(y_train)
            print y_train.shape
            targets = list(y_train)
            y_train = []
            for i in targets:
                #print i
                y_train.append(int(i))

            y_test = numpy.transpose(y_test)
Example #13
#Different feature selection methods
datasets = ['B']  #,'A'
methods = ['MRMR']  #,'JMI','JMIM'
sizes = ['10']  #,'50','100','150','200','250'
validationTechniques = ["10FoldCV"]  #"LOOCV",
preps = ["Standard", "Robust", "Quantile", "Imputer"]

#Iterating over each method
print("Dataset, prepType, validationTechnique, method, size")
for dataset in datasets:
    #	f=open('mcc/mccEnsembleResults.txt','w');
    #	f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing");

    print "Dataset = ", dataset
    #instantiating the DataSetLoader object
    d = DataSetLoader()
    #loading the data and corresponding labels for the current dataset
    X_train = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_test = d.LoadDataSet(dataset + "_test")
    y_test = d.LoadDataSetClasses(dataset + "_test")

    print("Dimensions of validation data and labels:", X_test.shape,
          y_test.shape)

    #chaipee will fix it later on
    targets = list(numpy.transpose(y_train))
    y_train = []
    for i in targets:
        y_train.append(int(i))
Example #14
def main():
    """
        for each of the subsets s of indexes from 0-1004003 of length between 1 and 10
        #use biological info of known genes and mutual information and top down level of tree nodes [top down means ok interpretation and vague idea of root cause. bottom up means poor interpretation but pin pointed root cause identification]
        #check which subset is the best one by sorting them on desc order of error and then reliability
            create d as vertical projection of dataset using s indexes only
            for partition = 1 to length-2
                create trainingSet of size partition
                create testSet of size length-partition
                calculate Error Rate & Reliability using CV10
                calculate avg Error Rate and Avg Reliability 
        pick the best
    """
    print("\n\n\n\n\n")
    datasetLoader = DataSetLoader()
    setSize = 3
    CVSetting = 2
    classLabels = []
    enhancedGeneSet = []
    classLabels.extend(datasetLoader.GetClassLabels("A"))
    enhancedGeneSet.extend(datasetLoader.LoadEnhancedDataSet("A"))
    enhancedGeneSet = np.array(enhancedGeneSet)
    logInfo('Loaded the datasets')

    for s in range(1, 1 + setSize):
        for i in range(0, np.array(enhancedGeneSet).shape[1]):
            allCombinations = combinations(
                range(1, 1 + enhancedGeneSet.shape[1] - 1), s)
            #TODO: go from 1 to setSize and for the selected top X from amongst one level, make sure the next level subset contains them as prefix so we
            logInfo("allCombinations generated...")
            for aCombination in allCombinations:
                logDebug('aCombination')
                logDebug(aCombination)
                #on this combination, perform LOOCV (Leave one out cross validation)
                tempDataSet = enhancedGeneSet[:, aCombination[:]]

                logDebug('temp Data Set ')
                logDebug(tempDataSet.shape)

                logInfo('going to partition the tempDataSet')
                logDebug(tempDataSet.shape[0])

                for partition in range(CVSetting,
                                       1 + tempDataSet.shape[0]):  #Using CV1
                    logDebug("Partition")
                    logDebug(partition)

                    trainingLabels = classLabels[0:partition]
                    trainingSet = tempDataSet[0:partition, :]

                    logDebug('training set')
                    logDebug(trainingSet.shape)

                    testSet = tempDataSet[partition:tempDataSet.shape[0], :]
                    testLabels = classLabels[partition:tempDataSet.shape[0]]

                    logDebug('test set')
                    logDebug(testSet.shape)
                    print(trainingLabels)
                    print(trainingSet)
                    classifier = Train(trainingSet, trainingLabels)
                    errorRate, reliability, jScore = Evaluate(
                        classifier, tempDataSet, testSet, testLabels, 1)

                    print errorRate, ";", reliability, ";", jScore
                    return
Example #15
from sklearn.externals import joblib
import numpy
from MachineSpecificSettings import Settings
import scipy.io
from DataSetLoaderLib import DataSetLoader
from sklearn.metrics import accuracy_score
import time

y = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
x = DataSetLoader()
x = x.LoadDataSet("A")

train_p = 0
train_n = 0
test_p = 0
test_n = 0
total = 0
x_test = []
y_test = []
x_train = []
y_train = []
for i in range(0, len(y)):
    if y[i] == 1:
        if train_p < 26:
            x_train.append(x[i])
            y_train.append(y[i])
            train_p += 1
        # elif keeps the training and test subsets disjoint
        elif test_p < 28:
            x_test.append(x[i])
            y_test.append(y[i])
            test_p += 1
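The manual positive/negative bookkeeping in this example can usually be replaced with a stratified split; a brief sketch using scikit-learn's train_test_split on the x and y loaded above (the 50/50 test_size is only an illustration, not the 26/28 counts used here):

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, stratify=y, random_state=1)
print len(y_train), len(y_test)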
Example #16
datasets = ['C', 'B', 'A']
methods = ['MRMR', 'JMI', 'JMIM']
sizes = ['10', '50', '100', '150', '200', '250']
classifiers = ["RandomForest", "AdaBoost", "DT", "ExtraTree", "MLP", "SVM"]
validationTechniques = ["10FoldCV"]  #"LOOCV",
preps = ["Standard", "Robust", "Quantile", "Imputer"]

basePath = ''  #needed when we want to run it locally
#Iterating over each method
for dataset in datasets:
    f = open('mcc/mccResults' + dataset + '.txt', 'a')
    f.write('\n{date:%Y-%m-%d_%H:%M:%S}'.format(date=datetime.datetime.now()))
    #f.write("dataset, size, method, classifier, validationTechnique, mc, timeTaken, cv.max, cv.mean, cv.min, cv.std, preprocessing");
    print "Dataset = ", dataset
    #instantiating the DataSetLoader object
    d = DataSetLoader()
    #loading the data and corresponding labels for the current dataset
    X_train_full = d.LoadDataSet(dataset + "_train")
    y_train = d.LoadDataSetClasses(dataset + "_train")
    X_validate_full = d.LoadDataSet(dataset + "_test")
    y_validate = d.LoadDataSetClasses(dataset + "_test")

    print("Dimensions of training data and labels:", X_train_full.shape,
          y_train.shape)
    print("Dimensions of validation data and labels:", X_validate_full.shape,
          y_validate.shape)

    #READY with Dataset, going to perform the main loop now

    for method in methods:
        #Iterating over each size
Example #17
from DataSetLoaderLib import DataSetLoader
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import LeaveOneOut

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
for method in methods:
    for size in sizes:
        print size
        print method
        import time
        d = DataSetLoader()
        X_train = d.LoadDataSet("A")
        y_train = d.LoadDataSetClasses("A")
        print X_train.shape
        print y_train.shape
        #chaipee will fix it later on
        y_train = numpy.transpose(y_train)
        print y_train.shape
        targets = list(y_train)
        y_train = []
        for i in targets:
            #print i
            y_train.append(int(i))
        #print len(y_train)
        #first run indices
        indices = joblib.load('datasetA_pickles/selected_indices_' + method +
Example #18
Store the ensemble outputs to basePath + "\Infiltration_ensembles\Dataset.lig.csv"
'''
Datasets = ["C"]  #"B","A",
#Dataset ="B" #for testing purposes
for Dataset in Datasets:
    if (eval("len(" + Dataset + "_LIG_Accuracies) != len(" + Dataset +
             "_LIGs)")):
        print Dataset + "_LIG mismatches the accuracies list"
    actuals = ""
    results = ""
    LIGs = eval(Dataset + "_LIGs")
    LIG_Accuracies = eval(Dataset + "_LIG_Accuracies")
    start_time = time.time()
    padding = 0
    #load the dataset
    d = DataSetLoader()
    #X_train_full = d.LoadDataSet(Dataset+"_train");
    #y_train = d.LoadDataSetClasses(Dataset+"_train");
    #targets=list(numpy.transpose(y_train))
    #y_train=[]
    #for i in targets:
    #    y_train.append(int(i))

    X_validate_full = d.LoadDataSet(Dataset + "_test")
    y_validate = d.LoadDataSetClasses(Dataset + "_test")
    print("Dimensions of validation data and labels:", X_validate_full.shape,
          y_validate.shape)
    targets = list(numpy.transpose(y_validate))
    y_validate = []
    if Dataset == "C":
        y_validate = numpy.array(targets)
Example #19
#Used for storing and loading the trained classifier
from sklearn.externals import joblib
import numpy
from MachineSpecificSettings import Settings
import scipy.io
from DataSetLoaderLib import DataSetLoader
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
#cross_val_score is used below for the 10-fold evaluation
from sklearn.model_selection import cross_val_score

print("")
print("")
print("")
print("")

targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
G = d.LoadDataSet("A")
indices = joblib.load('selected_indicesv2.joblib.pkl')
result = numpy.array(G)[:, indices]
clf = ExtraTreesClassifier()
import time
start_time = time.time()
scores = cross_val_score(clf, result, targets, cv=10)
end_time = time.time() - start_time
print end_time
for i in scores:
    print i
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

filename = 'ExtraTreesClassifier_k-fold.joblib.pkl'
joblib.dump(clf, filename, compress=9)
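One caveat worth noting: cross_val_score fits internal clones of clf, so the clf dumped above is still unfitted. A hedged sketch of fitting it explicitly before persisting, then reloading it with the same joblib used in this example; the accuracy here is measured on the data used for fitting, so it is only a smoke test:

clf.fit(result, targets)
joblib.dump(clf, filename, compress=9)

loaded_clf = joblib.load(filename)
print metrics.accuracy_score(targets, loaded_clf.predict(result))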
Example #20
def performTreeletClustering(DatasetName):
    saveFreq = 1000
    #temp value for i
    x = -1
    if (not (os.path.isfile("objs.pickle"))):
        print "New Start"
        d = DataSetLoader()
        G = d.LoadDataSet(DatasetName)
        F = G
        M = []
        cacheTopXPerPart = d.CacheTopXPerPart(DatasetName)
        corrCalculator = PairwisePearsonCorrelationCalculator()
        print "calling corr calculator"
        corrMatrix = corrCalculator.CalculateSimilarity(
            G, d.GetPartSize(DatasetName), cacheTopXPerPart)

    else:
        print "continuing from where we left off"
        d = DataSetLoader()
        G, F, M, x, corrMatrix, cacheTopXPerPart = read()
        corrCalculator = PairwisePearsonCorrelationCalculator()
    p = F[0, :].size
    #because we have already done the previous iteration and loaded that one
    #save(G,F,M,i,corrMatrix,cacheTopXPerPart)
    i = x + 1
    print corrMatrix[0]
    while i < p:
        #for i in range (x+1, p):
        recalc = False
        if checkCorr(corrMatrix, p) == 0:
            print "ERROR IN CORRMATRIX INDEX"
            return 0
        #calculating value of p
        p = F[0, :].size
        print "Value of i is : " + str(i) + " out of " + str(p)
        theVectors = corrMatrix[0]
        #this is always the max corr so the element we want to process
        try:
            if (corrMatrix[0][3] == ''):
                recalc = False
            else:
                print corrMatrix[0]
                recalc = True
        except:
            pass
        Fa = F[:, theVectors[0]]
        Fb = F[:, theVectors[1]]
        print "calling generate metagene"
        m = generateNewMetaGene(Fa, Fb)
        print "calling scipy delete on F"
        F = scipy.delete(F, theVectors[1], 1)
        if not len(M):  #if this is the first meta gene in this matrix
            M = m
        else:
            M = numpy.column_stack((m, M))  #include in the meta genes set as well
        corrMatrix.pop(0)
        corrMatrix = corrCalculator.UpdateSimilarity(corrMatrix, F, list(m),
                                                     theVectors[0],
                                                     theVectors[1])
        F[:, theVectors[0]] = m
        if len(corrMatrix) <= 0 or recalc:  #everything after this is potentially incorrect, so recalculate the matrix
            corrMatrix = corrCalculator.CalculateSimilarity(
                F, d.GetPartSize(DatasetName), cacheTopXPerPart)
        if i % saveFreq == 0:
            save(G, F, M, i, corrMatrix, cacheTopXPerPart)
        i += 1
    F = numpy.column_stack((G, M))  #scipy.append(G, M, 1) #define a new expanded feature set F = G U M
    return F
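A minimal usage sketch of the function above, assuming the DataSetLoaderLib environment from the earlier examples and that dataset "A" is available; the returned matrix concatenates the original genes G with the generated metagenes M:

expanded = performTreeletClustering("A")
print "expanded feature set shape:", expanded.shape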