def extratreeclassifier(input_file,Output):
    lvltrace.lvltrace("LVLEntering extratreeclassifier")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"_Extremely_Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees"
    save = Output + "Extremely_Randomized_Trees_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting extratreeclassifier")
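# ---------------------------------------------------------------------------
# plot_confusion_matrix() is called by every classifier in this module but is
# defined elsewhere in the package. The sketch below is NOT the original
# helper -- it is a minimal stand-in consistent with the call sites
# (true labels, predictions, figure title, output path); the layout and
# colormap are assumptions.
# ---------------------------------------------------------------------------
def _plot_confusion_matrix_sketch(y_true, y_pred, title, save):
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred)  # rows: true classes, cols: predicted
    fig, ax = plt.subplots()
    im = ax.matshow(cm)
    fig.colorbar(im)
    ax.set_title(title)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.savefig(save)
    plt.close(fig)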
def randomforest(input_file,Output):
    lvltrace.lvltrace("LVLEntering randomforest")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Random Forest"
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Random Forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting randomforest")
def stochasticGD(input_file,Output):
    lvltrace.lvltrace("LVLEntering stochasticGD")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent"
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting stochasticGD")
def SVC_linear(input_file,Output):
    lvltrace.lvltrace("LVLEntering SVC_linear")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = svm.SVC(kernel='linear')
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classification (with linear kernel)"
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting SVC_linear")
def pca(input_file,Output):
    lvltrace.lvltrace("LVLEntering pca unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    # instantiate the model
    model = PCA(n_components=2)
    # fit the model: notice we don't pass the labels!
    model.fit(X)
    # transform the data to two dimensions
    X_PCA = model.transform(X)
    print "#########################################################################################################\n"
    print "PCA"
    print "shape of result:", X_PCA.shape
    print model.explained_variance_ratio_
    print "#########################################################################################################\n"
    results = Output+"pca.txt"
    file = open(results, "w")
    file.write("PCA\n")
    file.write("shape of result: %i,%i\n"%(X_PCA.shape[0],X_PCA.shape[1]))
    file.write("Explained variance ratio: %f,%f\n"%(model.explained_variance_ratio_[0],model.explained_variance_ratio_[1]))
    file.close()
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y)
    fig.colorbar(im)
    save = Output + "pca.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLExiting pca unsupervised")
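# A common companion check (not in the original code) before fixing
# n_components=2 is to look at the cumulative explained variance of a full
# PCA; a hedged sketch:
def _pca_cumulative_variance_sketch(X):
    from sklearn.decomposition import PCA
    full = PCA().fit(X)                               # keep all components
    return full.explained_variance_ratio_.cumsum()    # cumulative variance curve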
def features_by_class():
    lvltrace.lvltrace("LVLEntering features_by_class")
    if not os.path.exists(inputs.features_output):
        os.makedirs(inputs.features_output)
    for root, dirs, files in os.walk(inputs.morphology):
        for i in dirs:
            #LVLprint("processing directory " + str(i))
            neuron_dir = root+'/'+i
            neuron_file_out = inputs.features_output+'/'+i+'.csv'
            out_file = open(neuron_file_out, "w")
            writer = csv.writer(out_file, lineterminator='\t')
            features_name = tools.random_file(neuron_dir)
            lines = tools.file_lines(features_name)
            out_file.write("mtype\tneuron_name\t")
            for line in xrange(lines):
                out_file.write("%s\t"%tools.read_csv_tab(features_name,1,line))
            out_file.write("\n")
            for file in os.listdir(neuron_dir):
                neuron_file_in = root+'/'+i+'/'+file
                # if the feature file extracted by lmeasure is empty, skip it
                if file.endswith(".csv") and os.path.getsize(neuron_file_in) > 0:
                    lines = tools.file_lines(neuron_file_in)
                    out_file.write("%s\t"%i)
                    out_file.write("%s\t"%file)
                    for line in xrange(lines):
                        out_file.write("%s\t"%tools.read_csv_tab(neuron_file_in,2,line))
                    out_file.write("\n")
            out_file.close()
    lvltrace.lvltrace("LVLExiting features_by_class")
def nearest_centroid(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering nearest_centroid split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting nearest_centroid split_test")
def SVC_linear(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering SVC_linear split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classification (with linear kernel)"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC linear %f"%test_size
    save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting SVC_linear split_test")
def merger_labelled(Preprocessed_file, file_name, Corrected_Features):
    lvltrace.lvltrace("LVLEntering merger_labelled in data_preproc")
    # Merge all labelled feature files into one
    file_random = Corrected_Features + '/' + file_name
    ncol = tools.file_col_coma(file_random)
    data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    n_samples, n_features = X.shape
    fout = open(Preprocessed_file, "a")
    fout.write("Class,")
    for n in xrange(1, n_features + 1):
        fout.write("feature_%i," % n)
    fout.write("\n")
    # first file:
    first_file = Corrected_Features + file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f = open(Corrected_Features + i)
                    f.next()  # skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLExiting merger_labelled in data_preproc")
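# The merged file produced by merger_labelled() is a plain comma-separated
# table whose first column is the integer class label, followed by one column
# per feature. A two-row sketch (values invented purely for illustration;
# trailing commas match the per-field writes above):
#
#   Class,feature_1,feature_2,...,feature_43,
#   1,0.132000,0.874213,...,0.004170,
#   2,0.098311,0.701998,...,0.013450,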
def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features):
    lvltrace.lvltrace("LVLEntering data_preprocessing_descriptive in preproc_descriptive")
    tools.separate_coma(Extracted_Features,Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Coma_Features+i
                output_i = Corrected_Features+i
                lines = tools.file_lines(input_i)
                ncol = tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer = csv.writer(file, lineterminator='\t')
                    data = np.genfromtxt(input_i, delimiter=',')
                    X = data[1:, 2:]                # numeric feature values
                    raw = np.genfromtxt(input_i, delimiter=',', dtype=None)
                    y = raw[:, 0]                   # neuron types (class)
                    z = raw[:, 1]                   # neuron names
                    w = raw[0, :]                   # feature names (header row)
                    # Replace missing 'nan' values by the column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Y = imp.transform(X)
                    for line in xrange(Y.shape[0]+1):
                        for colonne in xrange(Y.shape[1]+2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                elif colonne == 1:
                                    file.write("%s\t"%z[line])
                                else:
                                    file.write("%s\t"%w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                elif colonne == 1:
                                    file.write("%s\t"%z[line])
                                else:
                                    file.write("%f\t"%Y[line-1,colonne-2])
                        file.write("\n")
                    file.close()
                else:
                    print "Only one morphology !!!"
    lvltrace.lvltrace("LVLExiting data_preprocessing_descriptive in preproc_descriptive")
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering extratreeclassifier split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting extratreeclassifier split_test")
def descriptive_analysis(Extracted_Features,descriptive_analysis_output,cores,Corrected_Features_descriptive):
    lvltrace.lvltrace("LVLEntering descriptive_analysis")
    Input_population_analysis = Corrected_Features_descriptive
    folder = descriptive_analysis_output
    # Create a pool of processes depending on the number of CPUs available,
    # with each input file loaded using the appropriate function
    cpuCount = multiprocessing.cpu_count()
    #if cpuCount > 12: cpuCount = 12  # Limit processor count to 12 when executing on a server
    if cores == None:
        cores = cpuCount
    if cores > 1:
        s = 's'
    else:
        s = ''
    if cores == False:
        # Sequential loading, used for debugging
        for root, dirs, files in os.walk(Input_population_analysis):
            for i in files:
                if not i.startswith('.'):
                    descriptive_single(Extracted_Features,descriptive_analysis_output,i,Input_population_analysis)
    else:
        # Multi-process loading
        pool = Pool(processes=cores)
        data = []
        for root, dirs, files in os.walk(Input_population_analysis):
            for i in files:
                if not i.startswith('.'):
                    morphology_csv = Input_population_analysis+i
                    features = np.genfromtxt(morphology_csv, delimiter='\t', dtype=None)
                    w = features[0, :]
                    data.append((Extracted_Features,descriptive_analysis_output,i,morphology_csv,w))
        pool.map(descriptive_multi_cores,data)
    lvltrace.lvltrace("LVLExiting descriptive_analysis")
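# descriptive_multi_cores() -- the Pool worker used above -- is defined
# elsewhere in the package. Because Pool.map passes each work item as a single
# argument, the worker has to unpack the tuple built in descriptive_analysis()
# itself; a minimal sketch of that convention (processing body elided):
def _descriptive_worker_sketch(args):
    Extracted_Features, output_dir, file_name, morphology_csv, header = args
    # ... per-file descriptive statistics would be computed here ...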
def gaussianNB(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering gaussianNB split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the training data
    clf.fit(X_train, y_train)
    # Use the model to predict the held-out labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f"%test_size
    save = Output + "Gaussian_NB_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLExiting gaussianNB split_test")
def predict_class_glm(input_file, Output):
    lvltrace.lvltrace("LVLEntering predict_class_glm in feature_selection")
    df = pd.read_csv(input_file)
    # Regress the class label on all 43 features
    features = ['feature_%i' % n for n in xrange(1, 44)]
    df = df[['Class'] + features].dropna()
    logit = glm(formula='Class ~ ' + '+'.join(features), data=df).fit()
    print logit.summary()
    # Also capture the summary in a text file
    save = Output + "glm.txt"
    old_stdout = sys.stdout
    log_file = open(save, "w")
    sys.stdout = log_file
    print logit.summary()
    sys.stdout = old_stdout
    log_file.close()
    lvltrace.lvltrace("LVLExiting predict_class_glm in feature_selection")
def neuron_similarity_matrix_labelled(Preprocessed_file, similarity, Corrected_Features):
    lvltrace.lvltrace("LVLEntering neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")
    writer = csv.writer(file, lineterminator=',')
    file.write("mtype,")
    # header row: the (integer) class label of each file
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input = Corrected_Features + v
                ncol = tools.file_col_coma(input) - 1
                data = np.loadtxt(input, delimiter=',', usecols=range(ncol))  # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int)  # Labels (class)
                file.write("%i," % np.mean(y))
    file.write("\n")
    # one row per file: mean absolute difference of feature means vs. every other file
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                label1 = np.mean(y1)
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:, col])  # mean of each feature
                file.write("%i," % label1)
                for root2, dirs2, files2 in os.walk(Corrected_Features):
                    for i in files2:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(input_2, delimiter=',', usecols=range(ncol2))  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]
                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(X2[:, col])  # mean of each feature
                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] - mtype1[col - 1])
                            score = np.mean(f)
                            file.write("%f," % score)
                file.write("\n")
    file.close()
    lvltrace.lvltrace("LVLExiting neuron_similarity_matrix_labelled data_preproc")
def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntering kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # plot the results along with the cluster labels
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLExiting kmeans unsupervised")
def forest_of_trees(input_file, Output):
    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntering forest_of_trees in feature_selection")
    # Load the labelled data
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # Write the feature ranking
    results = Output + "forest_of_tree.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" % (f + 1, indices[f] + 1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
    pl.xticks(range(n_features), indices + 1)
    pl.axis('tight')
    save = Output + "forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLExiting forest_of_trees in feature_selection")
def features_variability(descriptive_analysis_output, descriptive_variability, cores):
    lvltrace.lvltrace("LVLEntering features_variability")
    path_csv_analysis = descriptive_analysis_output
    Score = descriptive_variability + "Global_Features_Variability.csv"
    file = open(Score, "w")
    writer = csv.writer(file, lineterminator=',')
    # Create a pool of processes depending on the number of CPUs available,
    # with each input file loaded using the appropriate function
    cpuCount = multiprocessing.cpu_count()
    #if cpuCount > 12: cpuCount = 12  # Limit processor count to 12 when executing on a server
    if cores == None:
        cores = cpuCount
    if cores > 1:
        s = 's'
    else:
        s = ''
    if cores == False:
        # Sequential loading, used for debugging
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    scoring = features_variability_single(path_csv_analysis, i)
                    file.write("%s," % os.path.splitext(i)[0])
                    file.write("%f\n" % scoring)
        file.close()
    else:
        # Multi-process loading
        pool = Pool(processes=cores)
        data = []
        b = [0 for x in xrange(len(os.listdir(path_csv_analysis)))]
        count = 0
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    b[count] = i
                    count = count + 1
                    data.append((path_csv_analysis, i))
        scoring_multi = pool.map(features_variability_multi_cores, data)
        for k in xrange(count):
            file.write("%s," % b[k])
            file.write("%f\n" % scoring_multi[k])
        file.close()
    variability_plotting(descriptive_variability, Score)
    lvltrace.lvltrace("LVLExiting features_variability")
def gmm(input_file,Output):
    lvltrace.lvltrace("LVLEntering gmm unsupervised")
    print "#########################################################################################################\n"
    print "GMM"
    print "#########################################################################################################\n"
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    # Fit a mixture of Gaussians with EM using five components
    gmm = mixture.GMM(n_components=5, covariance_type='spherical', init_params='wmc')
    gmm.fit(X)
    # Fit a Dirichlet process mixture of Gaussians using five components
    dpgmm = mixture.DPGMM(n_components=5, covariance_type='spherical', init_params='wmc')
    dpgmm.fit(X)
    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'b','g','r','c','m','y','k','b','g','r','c','m','y','k',
                                  'b','g','r','c','m','y','k','b','g','r','c','m','y','k'])
    for i, (clf, title) in enumerate([(gmm, 'GMM'), (dpgmm, 'Dirichlet Process GMM')]):
        splot = pl.subplot(2, 1, 1 + i)
        Y_ = clf.predict(X)
        for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant components
            if not np.any(Y_ == i):
                continue
            pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xticks(())
        pl.yticks(())
        pl.title(title)
    save = Output + "gmm.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLExiting gmm unsupervised")
def multinomialNB(input_file,Output):
    lvltrace.lvltrace("LVLEntering multinomialNB")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        # Instantiate the estimator
        clf = MultinomialNB()
        # Fit the estimator to the data
        clf.fit(X, y)
        # Use the model to predict the labels
        y_pred = clf.predict(X)
        print "#########################################################################################################\n"
        print "Multinomial Naive Bayes estimator accuracy"
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"Multinomial_NB_metrics.txt"
        file = open(results, "w")
        file.write("Multinomial Naive Bayes estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "Multinomial Naive Bayes"
        save = Output + "Multinomial_NB_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred, title, save)
    except ValueError:
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In the configuration.py file, normalization='normalize' -- input values must be greater than 0\n")
            file.close()
    lvltrace.lvltrace("LVLExiting multinomialNB")
def lda(input_file,Output):
    lvltrace.lvltrace("LVLEntering lda")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda = LDA(n_components=2)
    lda = LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy"
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    # plot the projected data along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im)
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLExiting lda")
def lda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering lda split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda = LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    # plot the projected training data along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im)
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLExiting lda split_test")
def merger(Preprocessed_file, file_name, Corrected_Features):
    # Merge all feature files into one
    lvltrace.lvltrace("LVLEntering merger data_preproc")
    fout = open(Preprocessed_file, "a")
    # first file:
    first_file = Corrected_Features + file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f = open(Corrected_Features + i)
                    #f.next()  # skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLExiting merger data_preproc")
def qda(input_file,Output):
    lvltrace.lvltrace("LVLEntering qda")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        qda = QDA()
        qda.fit(X,y)
        y_pred = qda.predict(X)
        print "#########################################################################################################\n"
        print "Quadratic Discriminant Analysis Accuracy"
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"QDA_metrics.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA"
        save = Output + "QDA_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred, title, save)
    except AttributeError:
        if configuration.normalization == 'normalize':
            results = Output+"QDA_metrics.txt"
            file = open(results, "w")
            file.write("In the configuration.py file, normalization='normalize' -- input values must be greater than 0\n")
            file.close()
    lvltrace.lvltrace("LVLExiting qda")
def qda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering qda split_test")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        clf = QDA()
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print "Quadratic Discriminant Analysis Accuracy"
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        results = Output+"QDA_metrics_test.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred, title, save)
    except AttributeError:
        if configuration.normalization == 'normalize':
            results = Output+"QDA_metrics_test.txt"
            file = open(results, "w")
            file.write("In the configuration.py file, normalization='normalize' -- input values must be greater than 0\n")
            file.close()
    lvltrace.lvltrace("LVLExiting qda split_test")
def Radius_Neighbors(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntering radius_kneighbors split_test")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        clf = RadiusNeighborsClassifier(radius=0.001, weights='uniform', algorithm='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print "Radius Neighbors accuracy"
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        print "\n"
        results = Output+"Radius_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("Radius Neighbors estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "Radius Neighbors %f"%test_size
        save = Output + "Radius_Neighbors_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred, title, save)
    except ValueError:
        results = Output+"Radius_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("No neighbors found for some test samples: try a larger radius, give outliers a label, or remove them from the dataset.")
        file.close()
    lvltrace.lvltrace("LVLExiting radius_kneighbors split_test")
def features_extraction():
    lvltrace.lvltrace("LVLEntering features_extraction")
    data = []
    for root, dirs, files in os.walk(inputs.morphology):
        for filename in files:
            if not filename.startswith('.'):
                if not os.path.isfile(filename) and filename.endswith('.swc'):
                    # neuron_path and neuron_features are the inputs of lmeasure;
                    # the "-s<path>.csv" string tells lmeasure where to save the
                    # extracted features, next to the input morphology
                    neuron_path = root+"/"+filename
                    neuron_features = "-s"+os.path.splitext(os.path.splitext(neuron_path)[0])[0]+".csv"
                    data.append((neuron_features,neuron_path))
    # Run lmeasure on all morphologies in parallel
    cpuCount = multiprocessing.cpu_count()
    pool = Pool(processes=cpuCount)
    pool.map(lmeasure,data)
    lvltrace.lvltrace("LVLExiting features_extraction")
def KMeans_PCA(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntering KMeans_PCA unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    reduced_data = PCA(n_components=2).fit_transform(X)
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=50)
    k_means.fit(reduced_data)
    labels = k_means.labels_
    print "#########################################################################################################\n"
    print "K-MEANS on PCA-reduced data"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y, labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"kmeans_PCA_metrics.txt"
    file = open(results, "w")
    file.write("K-Means clustering on the PCA-reduced data\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y, labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in the mesh, using the last trained model.
    Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired, aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = k_means.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
    pl.title('K-means clustering on the PCA-reduced data\n'
             'Number of clusters: %i'%n_clusters)
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    save = Output + "kmeans_PCA.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLExiting KMeans_PCA unsupervised")
def univariate(input_file, Output, percentile):
    lvltrace.lvltrace("LVLEntering univariate in feature_selection")
    # load the labelled data
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape
    pl.figure(1)
    pl.clf()
    X_indices = np.arange(X.shape[-1])
    # Univariate feature selection with F-test for feature scoring;
    # keep the `percentile` most significant features
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    pl.bar(X_indices - .45, scores, width=.2,
           label=r'Univariate score ($-Log(p_{value})$)', color='g')
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= svm_weights.max()
    pl.bar((X_indices + 1) - .25, svm_weights, width=.2, label='SVM weight', color='r')
    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()
    pl.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
           width=.2, label='SVM weights after selection', color='b')
    pl.title("Feature selection")
    pl.xlabel('Feature number')
    pl.yticks(())
    pl.axis('tight')
    pl.legend(loc='upper right')
    save = Output + "univariate.png"
    pl.savefig(save)
    # Write the ranking of the selected features
    results = Output + "univariate.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    for i in xrange(len(X_indices[selector.get_support()])):
        file.write("%f,%f\n" % ((X_indices[selector.get_support()][i] + 1), svm_weights_selected[i]))
    file.close()
    lvltrace.lvltrace("LVLExiting univariate in feature_selection")
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features, Norm, ontology):
    # Turn the tab-separated csv files into comma-separated ones and map the
    # categorical variables (class and neuron names) to integers
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    class_number = 1
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Extracted_Features + i
                output_i = Coma_Features + i
                file = open(output_i, "w")
                lines = tools.file_lines(input_i) + 1
                ncol = tools.file_col(input_i) - 1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            if col != 1:  # column 1 holds the neuron name: skip it in the header
                                file.write("%s," % tools.read_csv_tab(input_i, col, line))
                        elif col == 0:
                            # replace the class name by an integer
                            file.write("%i," % class_number)
                        elif col == 1:
                            # log the neuron name in the ontology file instead
                            onto.write("%i,%s,%i,%s\n" % (Iteration, i, class_number,
                                                          tools.read_csv_tab(input_i, col, line)))
                            Iteration += 1
                        else:
                            file.write("%s," % tools.read_csv_tab(input_i, col, line))
                    file.write("\n")
                file.close()
                class_number += 1
                if lines > 3:
                    input_file = Coma_Features + i
                    # ncol-1 because the neuron-name column was dropped
                    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1), skiprows=1)
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int)  # labels (class)
                    # Replace missing values ('nan') by the column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    Y = imp.fit_transform(X)
                    # Data standardization
                    if Norm == 'normalize':
                        Z = preprocessing.normalize(Y, axis=0, norm='l2')
                    elif Norm == 'binarize':
                        # binarize for Bernoulli models
                        Z = preprocessing.Binarizer().fit(Y).transform(Y)
                    elif Norm == 'standardize':
                        # rescale the data to [0, 1]
                        Z = preprocessing.MinMaxScaler().fit_transform(Y)
                    else:
                        Z = preprocessing.scale(Y)  # zero mean, unit variance
                    # Write the corrected and standardized data
                    output_file = Corrected_Features + i
                    file = open(output_file, "w")
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                file.write("%s," % y[line_1])
                            else:
                                file.write("%f," % Z[line_1, col_1])
                        file.write("\n")
                    file.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
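# Illustrative sketch (not part of the original pipeline): the four
# standardization modes dispatched on `Norm` above, side by side on a toy
# matrix, using the same sklearn.preprocessing calls.
def _demo_standardization_modes():
    import numpy as np
    from sklearn import preprocessing
    Y = np.array([[1.0, -2.0],
                  [3.0, 0.5]])
    print(preprocessing.normalize(Y, axis=0, norm='l2'))  # 'normalize'
    print(preprocessing.Binarizer().fit(Y).transform(Y))  # 'binarize'
    print(preprocessing.MinMaxScaler().fit_transform(Y))  # 'standardize'
    print(preprocessing.scale(Y))                         # default scaling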
def affinitypropagation(input_file, type, pref, Output):
    lvltrace.lvltrace("LVLEntree dans affinitypropagation unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    labels_true = data[:, 0]
    # Build the similarity matrix
    if type == 'spearmanr':
        X = scipy.stats.stats.spearmanr(X, axis=1)[0]
    elif type == 'euclidean':
        X = -euclidean_distances(X, squared=True)
    else:
        print "something wrong"
    # The preference controls the number of exemplars ('min' tends to give
    # fewer clusters than 'median')
    if pref == 'median':
        preference = np.median(X)
    elif pref == 'mean':
        preference = np.mean(X)
    elif pref == 'min':
        preference = np.min(X)
    else:
        print "something wrong"
    print "#########################################################################################################\n"
    print "Affinity Propagation"
    print preference
    n_samples, n_features = X.shape
    cluster_centers_indices, labels = affinity_propagation(X, preference=preference)
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "affinity_propagation.txt"
    file = open(results, "w")
    file.write("Affinity Propagation\n")
    file.write("Homogeneity Score: %f\n" % metrics.homogeneity_score(labels_true, labels))
    file.write("Completeness Score: %f\n" % metrics.completeness_score(labels_true, labels))
    file.write("V-Measure: %f\n" % metrics.v_measure_score(labels_true, labels))
    file.write("The adjusted Rand index: %f\n" % metrics.adjusted_rand_score(labels_true, labels))
    file.write("Adjusted Mutual Information: %f\n" % metrics.adjusted_mutual_info_score(labels_true, labels))
    file.write("Silhouette Score: %f\n" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(labels_true)):
        file.write("%f,%f,%i\n" % (labels_true[n], labels[n], (n + 1)))
    file.close()
    # Plot the result
    import pylab as pl
    from itertools import cycle
    pl.close('all')
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "affinity_propagation.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans affinitypropagation unsupervised")
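# Illustrative sketch (not part of the original pipeline): affinity
# propagation on a negated squared-distance similarity matrix with a median
# preference, mirroring the 'euclidean' / 'median' branch above, on two
# synthetic blobs.
def _demo_affinity_propagation():
    import numpy as np
    from sklearn.cluster import affinity_propagation
    from sklearn.metrics.pairwise import euclidean_distances
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(10, 2), rng.randn(10, 2) + 5])
    S = -euclidean_distances(X, squared=True)
    centers, labels = affinity_propagation(S, preference=np.median(S))
    print(len(centers))  # estimated number of clusters (2 expected here)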
def dbscan(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans dbscan unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    labels_true = data[:, 0]
    sample_size, n_features = X.shape
    # Compute DBSCAN
    db = DBSCAN().fit(X)
    core_samples = db.core_sample_indices_
    labels = db.labels_
    print "#########################################################################################################\n"
    print "DBSCAN"
    #print labels_true
    #print labels
    # Number of clusters in labels, ignoring noise (labelled -1) if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "dbscan.txt"
    file = open(results, "w")
    file.write("DBSCAN\n")
    file.write("Homogeneity Score: %f\n" % metrics.homogeneity_score(labels_true, labels))
    file.write("Completeness Score: %f\n" % metrics.completeness_score(labels_true, labels))
    file.write("V-Measure: %f\n" % metrics.v_measure_score(labels_true, labels))
    file.write("The adjusted Rand index: %f\n" % metrics.adjusted_rand_score(labels_true, labels))
    file.write("Adjusted Mutual Information: %f\n" % metrics.adjusted_mutual_info_score(labels_true, labels))
    file.write("Silhouette Score: %f\n" % metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(labels_true)):
        file.write("%f,%f,%i\n" % (labels_true[n], labels[n], (n + 1)))
    file.close()
    # Plot the result; black is used for noise
    import pylab as pl
    unique_labels = set(labels)
    colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            col = 'k'
        class_members = [index[0] for index in np.argwhere(labels == k)]
        for index in class_members:
            x = X[index]
            if index in core_samples and k != -1:
                markersize = 14
            else:
                markersize = 6
            pl.plot(x[0], x[1], 'o', markerfacecolor=col, markeredgecolor='k',
                    markersize=markersize)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "dbscan.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans dbscan unsupervised")
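# Illustrative sketch (not part of the original pipeline): DBSCAN with its
# default eps=0.5 and min_samples=5 (the values the call above relies on),
# on two tight synthetic blobs; any noise points come back labelled -1.
def _demo_dbscan():
    import numpy as np
    from sklearn.cluster import DBSCAN
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(20, 2) * 0.2,
                   rng.randn(20, 2) * 0.2 + 3])
    labels = DBSCAN(eps=0.5, min_samples=5).fit(X).labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(n_clusters)  # 2 expected here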
def meanshift(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans meanshift unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape
    # Compute clustering with MeanShift; the bandwidth is estimated from the
    # data and passed to the estimator
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=sample_size)
    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print "#########################################################################################################\n"
    print "Mean Shift"
    print("number of estimated clusters : %d" % n_clusters_)
    print('homogeneity_score: %f' % metrics.homogeneity_score(y, labels))
    print('completeness_score: %f' % metrics.completeness_score(y, labels))
    print('v_measure_score: %f' % metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f' % metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f' % metrics.adjusted_mutual_info_score(y, labels))
    try:
        print('silhouette_score: %f' % metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except ValueError:
        print "ValueError: Number of labels is 1 but should be more than 2 and less than n_samples - 1"
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "mean_shift_metrics.txt"
    file = open(results, "w")
    file.write("Mean Shift\n")
    file.write("Homogeneity Score: %f\n" % metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n" % metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n" % metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n" % metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n" % metrics.adjusted_mutual_info_score(y, labels))
    try:
        file.write("Silhouette Score: %f\n" % metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except ValueError:
        file.write("ValueError: Number of labels is 1 but should be more than 2 and less than n_samples - 1\n")
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], labels[n], (n + 1)))
    file.close()
    # Plot the result: points coloured by cluster, centres marked with crosses
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters_):
        cluster_center = cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], color='b', marker='x', markersize=6)
    fig.colorbar(im)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "mean_shift.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans meanshift unsupervised")
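# Illustrative sketch (not part of the original pipeline): the bandwidth
# returned by estimate_bandwidth is meant to feed the MeanShift constructor,
# which is what meanshift above now does; shown here on two synthetic blobs.
def _demo_meanshift_bandwidth():
    import numpy as np
    from sklearn.cluster import MeanShift, estimate_bandwidth
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 6])
    bw = estimate_bandwidth(X, quantile=0.2, n_samples=len(X))
    labels = MeanShift(bandwidth=bw).fit(X).labels_
    print(len(np.unique(labels)))  # 2 expected here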