Code Example #1
def extratreeclassifier(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier")
    ncol=tools.file_col_coma(input_file)
    # Load the first ncol-1 columns (the trailing column is skipped);
    # column 0 holds the class label, the remaining columns the features.
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print " Extremely Randomized Trees "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"_Extremely_Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees"
    save = Output + "Extremely_Randomized_Trees_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier")
Code Example #2
def randomforest(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
Code Example #3
def stochasticGD(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
Code Example #4
def SVC_linear(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classifcation (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
Code Example #5
def pca(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans pca unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    # instantiate the model
    model = PCA(n_components=2)
    # fit the model: notice we don't pass the labels!
    model.fit(X)
    # transform the data to two dimensions
    X_PCA = model.transform(X)
    print "#########################################################################################################\n"
    print "PCA"
    print "shape of result:", X_PCA.shape
    print model.explained_variance_ratio_
    print "#########################################################################################################\n"
    
    results = Output+"pca.txt"
    file = open(results, "w")
    file.write("PCA\n")
    file.write("shape of result: %f,%f\n"%(X_PCA.shape[0],X_PCA.shape[1]))
    file.write("Explained variance ratio: %f,%f\n"%(model.explained_variance_ratio_[0],model.explained_variance_ratio_[1]))
    file.close()
    
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_PCA[:, 0], X_PCA[:, 1], c=y)
    fig.colorbar(im);
    save = Output + "pca.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans pca unsupervised")
Code Example #6
def features_by_class():
    lvltrace.lvltrace("LVLEntree dans features_by_class")
    if not os.path.exists(inputs.features_output):
        os.makedirs(inputs.features_output)
    for root, dirs, files in os.walk(inputs.morphology):
        for i in dirs:
            #LVlprint ("on traite le repertoire "+ str(i))
            neuron_dir=root+'/'+i
            neuron_file_out=inputs.features_output+'/'+i+'.csv'
            features_by_class = open(neuron_file_out, "w")
            writer=csv.writer(features_by_class, lineterminator='\t')
            features_name=tools.random_file(neuron_dir)
            lines=tools.file_lines(features_name)
            features_by_class.write("mtype\tneuron_name\t")
            for line in xrange(lines):
                features_by_class.write("%s\t"%tools.read_csv_tab(features_name,1,line))
            features_by_class.write("\n")

            for file in os.listdir(neuron_dir):
                neuron_file_in=root+'/'+i+'/'+file
                # if the extracted feature file from lmeasure is empty, then skip
                if file.endswith(".csv") and os.path.getsize(neuron_file_in) > 0:
                    lines=tools.file_lines(neuron_file_in)
                    features_by_class.write("%s\t"%i)
                    features_by_class.write("%s\t"%file)
                    for line in xrange(lines):
                        features_by_class.write("%s\t"%tools.read_csv_tab(neuron_file_in,2,line))
                    features_by_class.write("\n")
    lvltrace.lvltrace("LVLSortie dans features_by_class")
Code Example #7
def nearest_centroid(input_file,Output,test_size):
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD split_test")
Code Example #8
def SVC_linear(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans SVC_linear split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classifcation (with RBF linear) "
    print "y_test, y_pred, iteration"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC linear %f"%test_size
    save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
Code Example #9
def merger_labelled(Preprocessed_file, file_name, Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans merger_labelled dans data_preproc")
    # Merge all features files into one
    file_random = Corrected_Features + '/' + file_name
    ncol = tools.file_col_coma(file_random)
    data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    n_samples, n_features = X.shape
    fout = open(Preprocessed_file, "a")
    fout.write("Class,")
    for n in xrange(1, n_features + 1):
        fout.write("feature_%i," % n)
    fout.write("\n")
    # first file:
    first_file = Corrected_Features + file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f = open(Corrected_Features + i)
                    f.next()  #Skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLSortie dans merger_labelled dans data_preproc")
Code Example #10
def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive")
    tools.separate_coma(Extracted_Features,Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i=Coma_Features+i
                output_i=Corrected_Features+i
                lines=tools.file_lines(input_i)
                ncol=tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer=csv.writer(file, lineterminator='\t')
                    
                    data = np.genfromtxt(input_i,delimiter=',')
                    X = data[1:, 2:]
                    neuron_type = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    y = neuron_type[:, 0] # (class)

                    neuron_name = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    z = neuron_name[:, 1] # Neuron names
                    
                    features = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    w = features[0, :] # features names
                    
                    #Replace missing values 'nan' by column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    # Output with the 'NaN' values replaced by column means
                    Y=imp.transform(X)
                    #print i
                    #print Y.shape, y.shape,z.shape
                    #print Y.shape[1]
                    
                    ####################
                    for line in xrange(Y.shape[0]+1):
                        for colonne in xrange(Y.shape[1]+2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%s\t"%w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%f\t"%Y[line-1,colonne-2])
                        file.write("\n")
                    #########################
                    file.close()
                else:
                    print "Only one morphology !!!"
    lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
Code Example #11
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
Code Example #12
def merger_labelled(Preprocessed_file,file_name,Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans merger_labelled dans data_preproc")
    # Merge all features files into one
    file_random=Corrected_Features+'/'+file_name
    ncol=tools.file_col_coma(file_random)
    data = np.loadtxt(file_random, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    n_samples, n_features = X.shape
    fout=open(Preprocessed_file,"a")
    fout.write("Class,")
    for n in xrange(1,n_features+1):
        fout.write("feature_%i,"%n)
    fout.write("\n")
    # first file:
    first_file=Corrected_Features+file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f=open(Corrected_Features+i)
                    f.next() #Skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLSortie dans merger_labelled dans data_preproc")
Code Example #13
def descriptive_analysis(Extracted_Features,descriptive_analysis_output,cores,Corrected_Features_descriptive):
    lvltrace.lvltrace("LVLEntree dans descriptive_analysis")
    Input_population_analysis=Corrected_Features_descriptive
    folder = descriptive_analysis_output
    # Create a pool of processes depending on number of CPUs available,
    # with each file in input_files to be loaded using the appropriate function
    cpuCount = multiprocessing.cpu_count()
    #if cpuCount > 12: cpuCount = 12 # Limit processor count to 12 when executing on server
    if cores == None:
        cores = cpuCount
    if cores > 1: s='s'
    else: s=''
    if cores == False: # Sequential loading used for debugging
        for root, dirs, files in os.walk(Input_population_analysis):
            for i in files:
                if not i.startswith('.'):
                    descriptive_single(Extracted_Features,descriptive_analysis_output,i,Input_population_analysis)

    else: # Multi-process loading
        pool = Pool(processes=cores)
        data = []
        for root, dirs, files in os.walk(Input_population_analysis):
            for i in files:
                if not i.startswith('.'):
                    morphology_csv=Input_population_analysis+i
                    features = np.genfromtxt(morphology_csv,delimiter='\t',dtype=None)
                    w = features[0, :]
                    data.append((Extracted_Features,descriptive_analysis_output,i,morphology_csv,w))
        pool.map(descriptive_multi_cores,data)
    lvltrace.lvltrace("LVLSortie dans descriptive_analysis")
Code Example #14
def gaussianNB(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f"%test_size
    save = Output + "Gaussian_NB_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
Code Example #15
def predict_class_glm(input_file, Output):
    lvltrace.lvltrace(
        "LVLEntree dans predict_class_glm dans feature_selection")
    csv_path = input_file  # avoid shadowing the csv module
    df = pd.read_csv(csv_path)
    #print df
    df = df[[
        'Class', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
        'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
        'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
        'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
        'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
        'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
        'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
        'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39',
        'feature_40', 'feature_41', 'feature_42', 'feature_43'
    ]].dropna()
    df.head()
    logit = glm(
        formula=
        'Class  ~  feature_1+feature_2+feature_3+feature_4+feature_5+feature_6+feature_7+feature_8+feature_9+feature_10+feature_11+feature_12+feature_13+feature_14+feature_15+feature_16+feature_17+feature_18+feature_19+feature_20+feature_21+feature_22+feature_23+feature_24+feature_25+feature_26+feature_27+feature_28+feature_29+feature_30+feature_31+feature_32+feature_33+feature_34+feature_35+feature_36+feature_37+feature_38+feature_39+feature_40+feature_41+feature_42+feature_43',
        data=df).fit()
    print logit.summary()
    save = Output + "glm.txt"
    old_stdout = sys.stdout
    log_file = open(save, "w")
    sys.stdout = log_file
    print logit.summary()
    sys.stdout = old_stdout
    log_file.close()
    lvltrace.lvltrace(
        "LVLSortie dans predict_class_glm dans feature_selection")
Code Example #16
def neuron_similarity_matrix_labelled(Preprocessed_file, similarity,
                                      Corrected_Features):
    lvltrace.lvltrace(
        "LVLEntree dans neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")
    writer = csv.writer(file, lineterminator=',')
    file.write("mtype,")
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input = Corrected_Features + v
                ncol = tools.file_col_coma(input) - 1
                data = np.loadtxt(input, delimiter=',', usecols=range(
                    ncol))  # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int)  # Labels (class)
                file.write("%i," % np.mean(y))
        file.write("\n")
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1 = Corrected_Features + j
                ncol1 = tools.file_col_coma(input_1) - 1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(
                    ncol1))  # ncol-1 because we skip the neuron names
                X1 = data[:, :]
                y1 = data[:, 0].astype(np.int)  # Labels (class)
                mtype1 = [0 for x in xrange(ncol1 - 1)]
                f = [0 for x in xrange(ncol1 - 1)]
                label1 = np.mean(y1)
                for col in xrange(1, ncol1):
                    mtype1[col - 1] = np.mean(X1[:,
                                                 col])  #mean of each feature
                file.write("%i," % label1)
                for root, dirs, files in os.walk(Corrected_Features):
                    for i in files:
                        if not i.startswith('.'):
                            input_2 = Corrected_Features + i
                            ncol2 = tools.file_col_coma(input_2) - 1
                            data = np.loadtxt(
                                input_2, delimiter=',', usecols=range(ncol2)
                            )  # ncol-1 because we skip the neuron names
                            X2 = data[:, :]
                            y2 = data[:, 0].astype(np.int)  # Labels (class)
                            mtype2 = [0 for x in xrange(ncol2 - 1)]

                            for col in xrange(1, ncol2):
                                mtype2[col - 1] = np.mean(
                                    X2[:, col])  #mean of each feature

                            for col in xrange(1, ncol2):
                                f[col - 1] = np.abs(mtype2[col - 1] -
                                                    mtype1[col - 1])
                            similarity = np.mean(f)
                            file.write("%f," % similarity)
                file.write("\n")
    file.close()
    lvltrace.lvltrace(
        "LVLSortie dans neuron_similarity_matrix_labelled data_preproc")
Code Example #17
def kmeans(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    k_means=cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    reduced_data = k_means.transform(X)
    values = k_means.cluster_centers_.squeeze()
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    print "#########################################################################################################\n"
    #print y
    #print labels
    print "K-MEANS\n"
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print('\n')
    print "#########################################################################################################\n"
    results = Output+"kmeans_scores.txt"
    file = open(results, "w")
    file.write("K-Means Scores\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Cluster numbers, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f, %f, %i\n"%(y[n],labels[n],(n+1)))
    file.close()
    import pylab as pl
    from itertools import cycle
    # plot the results along with the labels
    k_means_cluster_centers = k_means.cluster_centers_
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters):
        my_members = labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(cluster_center[0], cluster_center[1], 'w', color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i"%n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Code Example #18
def forest_of_trees(input_file, Output):

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntree dans forest_of_trees dans feature_selection")

    # Build a classification task using 3 informative features
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    #print X
    #print y
    sample_size, n_features = X.shape

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    results = Output + "forest_of_tree.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")

    # Print the feature ranking
    #print("Feature ranking:")

    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" %
                   (f + 1, indices[f] + 1, importances[indices[f]]))
        #print("%d. feature %d (%f)" % (f + 1, indices[f]+1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features),
           importances[indices],
           color="r",
           yerr=std[indices],
           align="center")
    pl.xticks(range(n_features), indices + 1)
    pl.axis('tight')
    #pl.xlim([-1, 73])
    save = Output + "forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans forest_of_trees dans feature_selection")
Code Example #19
def forest_of_trees(input_file,Output):

    import numpy as np

    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    lvltrace.lvltrace("LVLEntree dans forest_of_trees dans feature_selection")

    # Build a classification task using 3 informative features
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    #print X
    #print y
    sample_size, n_features = X.shape

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)

    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    results = Output+"forest_of_tree.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    
    # Print the feature ranking
    #print("Feature ranking:")

    for f in range(n_features):
        file.write("%d. feature %d (%f)\n" % (f + 1, indices[f]+1, importances[indices[f]]))
        #print("%d. feature %d (%f)" % (f + 1, indices[f]+1, importances[indices[f]]))
    file.close()
    # Plot the feature importances of the forest
    import pylab as pl
    pl.figure()
    pl.title("Feature importances: Forest of trees applied to Layers + Types")
    pl.bar(range(n_features), importances[indices],
           color="r", yerr=std[indices], align="center")
    pl.xticks(range(n_features), indices+1)
    pl.axis('tight')
    #pl.xlim([-1, 73])
    save=Output+"forest_of_tree.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans forest_of_trees dans feature_selection")
Code Example #20
def features_variability(descriptive_analysis_output, descriptive_variability,
                         cores):
    lvltrace.lvltrace("LVLEntree dans features_variability ")
    path_csv_analysis = descriptive_analysis_output
    #print path_csv_analysis
    Score = descriptive_variability + "Global_Features_Variability.csv"
    file = open(Score, "w")
    writer = csv.writer(file, lineterminator=',')

    # Create a pool of processes depending on number of CPUs available,
    # with each file in input_files to be loaded using the appropriate function
    cpuCount = multiprocessing.cpu_count()
    #if cpuCount > 12: cpuCount = 12 # Limit processor count to 12 when executing on server
    if cores == None:
        cores = cpuCount
    if cores > 1: s = 's'
    else: s = ''
    if cores == False:  # Sequential loading used for debugging
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    scoring = features_variability_single(path_csv_analysis, i)
                    file.write("%s," % os.path.splitext(i)[0])
                    file.write("%f\n" % scoring)
        file.close()
    else:  # Multi-process loading
        pool = Pool(processes=cores)
        data = []
        b = [0 for x in xrange(len(os.listdir(path_csv_analysis)))]
        iter = 0
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    csv_file = path_csv_analysis + i
                    #print csv_file
                    #print i,iter
                    #b[iter]=os.path.splitext(i)[0]
                    b[iter] = i
                    iter = iter + 1
                    #data.append((csv_file,i,file))
                    #print csv_file
                    data.append((path_csv_analysis, i))
        scoring_multi = pool.map(features_variability_multi_cores, data)
        for k in xrange(iter):
            file.write("%s," % b[k])
            file.write("%f\n" % scoring_multi[k])
        file.close()
    variability_plotting(descriptive_variability, Score)
    lvltrace.lvltrace("LVLSortie dans features_variability ")
Code Example #21
def gmm(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans gmm unsupervised")
    print "#########################################################################################################\n"
    print "GMM"
    print "#########################################################################################################\n"
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    
    # Fit a mixture of gaussians with EM using five components
    gmm = mixture.GMM(n_components=5, covariance_type='spherical', init_params = 'wmc')
    gmm.fit(X)

    # Fit a dirichlet process mixture of gaussians using five components
    dpgmm = mixture.DPGMM(n_components=5, covariance_type='spherical',init_params = 'wmc')
    dpgmm.fit(X)

    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm', 'b','g','r','c','m','y','k','b','g','r','c','m','y','k','b','g','r','c','m','y','k','b','g','r','c','m','y','k'])

    for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                      (dpgmm, 'Dirichlet Process GMM')]):
        splot = pl.subplot(2, 1, 1 + i)
        Y_ = clf.predict(X)
        for i, (mean, covar, color) in enumerate(zip(
                                                     clf.means_, clf._get_covars(), color_iter)):
            v, w = linalg.eigh(covar)
            u = w[0] / linalg.norm(w[0])
            # as the DP will not use every component it has access to
            # unless it needs it, we shouldn't plot the redundant
            # components.
            if not np.any(Y_ == i):
                continue
            pl.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
            
            # Plot an ellipse to show the Gaussian component
            angle = np.arctan(u[1] / u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
            ell.set_clip_box(splot.bbox)
            ell.set_alpha(0.5)
            splot.add_artist(ell)
        pl.xticks(())
        pl.yticks(())
        pl.title(title)
    save = Output + "gmm.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans gmm unsupervised")
Code Example #22
def neuron_similarity_matrix_labelled(Preprocessed_file,similarity,Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans neuron_similarity_matrix_labelled data_preproc")
    file = open(similarity, "w")
    writer=csv.writer(file, lineterminator=',')
    file.write("mtype,")
    for root, dirs, files in os.walk(Corrected_Features):
        for v in files:
            if not v.startswith('.'):
                input=Corrected_Features+v
                ncol=tools.file_col_coma(input)-1
                data = np.loadtxt(input, delimiter=',', usecols=range(ncol)) # ncol-1 because we skip the neuron names
                y = data[:, 0].astype(np.int) # Labels (class)
                file.write("%i,"%np.mean(y))
        file.write("\n")
    for root, dirs, files in os.walk(Corrected_Features):
        for j in files:
            if not j.startswith('.'):
                input_1=Corrected_Features+j
                ncol1=tools.file_col_coma(input_1)-1
                data = np.loadtxt(input_1, delimiter=',', usecols=range(ncol1)) # ncol-1 because we skip the neuron names
                X1 = data[:,:]
                y1 = data[:, 0].astype(np.int) # Labels (class)
                mtype1=[0 for x in xrange(ncol1-1)]
                f=[0 for x in xrange(ncol1-1)]
                label1=np.mean(y1)
                for col in xrange(1,ncol1):
                    mtype1[col-1]=np.mean(X1[:,col]) #mean of each feature
                file.write("%i,"%label1)
                for root, dirs, files in os.walk(Corrected_Features):
                    for i in files:
                        if not i.startswith('.'):
                            input_2=Corrected_Features+i
                            ncol2=tools.file_col_coma(input_2)-1
                            data = np.loadtxt(input_2, delimiter=',', usecols=range(ncol2)) # ncol-1 because we skip the neuron names
                            X2 = data[:,:]
                            y2 = data[:, 0].astype(np.int) # Labels (class)
                            mtype2=[0 for x in xrange(ncol2-1)]
                            
                            for col in xrange(1,ncol2):
                                mtype2[col-1]=np.mean(X2[:,col]) #mean of each feature
                            
                            for col in xrange(1,ncol2):
                                f[col-1]=np.abs(mtype2[col-1]-mtype1[col-1])
                            similarity=np.mean(f)
                            file.write("%f,"%similarity)
                file.write("\n")
    file.close()
    lvltrace.lvltrace("LVLSortie dans neuron_similarity_matrix_labelled data_preproc")
Code Example #23
def features_variability(descriptive_analysis_output,descriptive_variability,cores):
    lvltrace.lvltrace("LVLEntree dans features_variability ")
    path_csv_analysis=descriptive_analysis_output
    #print path_csv_analysis
    Score=descriptive_variability+"Global_Features_Variability.csv"
    file = open(Score, "w")
    writer=csv.writer(file, lineterminator=',')

    # Create a pool of processes depending on number of CPUs available,
    # with each file in input_files to be loaded using the appropriate function
    cpuCount = multiprocessing.cpu_count()
    #if cpuCount > 12: cpuCount = 12 # Limit processor count to 12 when executing on server
    if cores == None:
        cores = cpuCount
    if cores > 1: s='s'
    else: s=''
    if cores == False: # Sequential loading used for debugging
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    scoring=features_variability_single(path_csv_analysis,i)
                    file.write("%s,"%os.path.splitext(i)[0])
                    file.write("%f\n"%scoring)
        file.close()
    else: # Multi-process loading
        pool = Pool(processes=cores)
        data = []
        b=[0 for x in xrange(len(os.listdir(path_csv_analysis)))]
        iter=0
        for root, dirs, files in os.walk(path_csv_analysis):
            for i in files:
                if not i.startswith('.') and i.endswith('.csv'):
                    csv_file=path_csv_analysis+i
                    #print csv_file
                    #print i,iter
                    #b[iter]=os.path.splitext(i)[0]
                    b[iter]=i
                    iter=iter+1
                    #data.append((csv_file,i,file))
                    #print csv_file
                    data.append((path_csv_analysis,i))
        scoring_multi=pool.map(features_variability_multi_cores,data)
        for k in xrange(iter):
            file.write("%s,"%b[k])
            file.write("%f\n"%scoring_multi[k])
        file.close()
    variability_plotting(descriptive_variability,Score)
    lvltrace.lvltrace("LVLSortie dans features_variability ")
Code Example #24
def predict_class_glm(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans predict_class_glm dans feature_selection")
    csv_path=input_file # avoid shadowing the csv module
    df = pd.read_csv(csv_path)
    #print df
    df = df[['Class','feature_1','feature_2','feature_3','feature_4','feature_5','feature_6','feature_7','feature_8','feature_9','feature_10','feature_11','feature_12','feature_13','feature_14','feature_15','feature_16','feature_17','feature_18','feature_19','feature_20','feature_21','feature_22','feature_23','feature_24','feature_25','feature_26','feature_27','feature_28','feature_29','feature_30','feature_31','feature_32','feature_33','feature_34','feature_35','feature_36','feature_37','feature_38','feature_39','feature_40','feature_41','feature_42','feature_43']].dropna()
    df.head()
    logit = glm(formula='Class  ~  feature_1+feature_2+feature_3+feature_4+feature_5+feature_6+feature_7+feature_8+feature_9+feature_10+feature_11+feature_12+feature_13+feature_14+feature_15+feature_16+feature_17+feature_18+feature_19+feature_20+feature_21+feature_22+feature_23+feature_24+feature_25+feature_26+feature_27+feature_28+feature_29+feature_30+feature_31+feature_32+feature_33+feature_34+feature_35+feature_36+feature_37+feature_38+feature_39+feature_40+feature_41+feature_42+feature_43', data=df).fit()
    print logit.summary()
    save = Output + "glm.txt"
    old_stdout = sys.stdout
    log_file = open(save,"w")
    sys.stdout = log_file
    print logit.summary()
    sys.stdout = old_stdout
    log_file.close()
    lvltrace.lvltrace("LVLSortie dans predict_class_glm dans feature_selection")
Code Example #25
def multinomialNB(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans multinomialNB")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        # Instantiate the estimator
        clf = MultinomialNB()
        # Fit the estimator to the data
        clf.fit(X, y)
        # Use the model to predict the last several labels
        y_pred = clf.predict(X)
        print "#########################################################################################################\n"
        print "Multinomial Naive Bayes estimator accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"Multinomial_NB_metrics.txt"
        file = open(results, "w")
        file.write("Multinomial Naive Bayes estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "Multinomial Naive Bayes"
        save = Output + "Multinomial_NB_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred,title,save)
    except (ValueError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization=normalize -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans multinomialNB")
Code Example #26
def lda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda=LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
Code Example #27
def lda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda=LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    #LVLprint "\n"
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
Code Example #28
def merger(Preprocessed_file, file_name, Corrected_Features):
    # Merge all features files into one
    lvltrace.lvltrace("LVLEntree dans merger data_preproc")
    fout = open(Preprocessed_file, "a")
    # first file:
    first_file = Corrected_Features + file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f = open(Corrected_Features + i)
                    #f.next() #Skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLSortie dans merger data_preproc")
Code Example #29
def merger(Preprocessed_file,file_name,Corrected_Features):
    # Merge all features files into one
    lvltrace.lvltrace("LVLEntree dans merger data_preproc")
    fout=open(Preprocessed_file,"a")
    # first file:
    first_file=Corrected_Features+file_name
    for line in open(first_file):
        fout.write(line)
    # now the rest:
    for root, dirs, files in os.walk(Corrected_Features):
        for i in files:
            if i != file_name:
                if not i.startswith('.'):
                    f=open(Corrected_Features+i)
                    #f.next() #Skip the header
                    for line in f:
                        fout.write(line)
                    f.close()
    fout.close()
    lvltrace.lvltrace("LVLSortie dans merger data_preproc")
Code Example #30
def qda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans qda")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        qda=QDA()
        qda.fit(X,y)
        y_pred = qda.predict(X)
        print "#########################################################################################################\n"
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"QDA_metrics.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA"
        save = Output + "QDA_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda")
Code Example #31
def qda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans qda split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        qda=QDA()
        qda.fit(X_train,y_train)
        y_pred = qda.predict(X_test)
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        #LVLprint "\n"
        results = Output+"QDA_metrics_test.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except AttributeError:
        if configuration.normalization == 'normalize':
            results = Output+"QDA_metrics_test.txt"
            file = open(results, "w")
            file.write("In configuration.py, normalization='normalize' -- input values must be greater than 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda split_test")
def Radius_Neighbors(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans radius_kneighbors split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        clf = RadiusNeighborsClassifier(radius=0.001, weights='uniform', algorithm='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print "Radius Neighbors accuracy "
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        print "\n"
        results = Output+"Raidus_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("Radius Neighbors estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "Radius Neighbors %f"%test_size
        save = Output + "Radius_Neighbors_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except ValueError:
        results = Output+"Radius_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("No neighbors found for some test samples: try a larger radius in configuration.py, give outliers a label, or consider removing them from the dataset.\n")
        file.close()
    lvltrace.lvltrace("LVLSortie dans radius_kneighbors split_test")
Code example #33
def features_extraction():
    lvltrace.lvltrace("LVLEntree dans features_extraction")
    data =[]
    for root, dirs, files in os.walk(inputs.morphology):
        for filename in files:
            if not filename.startswith('.'):
                if not os.path.isfile(filename) and filename.endswith('.swc'):
                    # neuron_path and neuron_features are the inputs of lmeasure
                    neuron_path=root+"/"+filename
                    neuron_features="-s"+os.path.splitext(os.path.splitext(neuron_path)[0])[0]+".csv"
                    # neuron_csv is the name of the extracted features output file from lmeasure
                    #neuron_csv=os.path.splitext(os.path.splitext(neuron_path)[0])[0]+".csv"
                    #print ("LVL neuron csv is "+ neuron_csv)
                    # Use the lmeasure tool to extract features and save them into the input folders
                    data.append((neuron_features,neuron_path))

    cpuCount = multiprocessing.cpu_count()
    pool = Pool(processes=cpuCount)
    pool.map(lmeasure,data)
    lvltrace.lvltrace("LVLSortie dans features_extraction")
Code example #34
def KMeans_PCA(input_file, n_clusters, Output):
    lvltrace.lvltrace("LVLEntree dans KMeans_PCA unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    reduced_data = PCA(n_components=2).fit_transform(X)
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=50)
    k_means.fit(reduced_data)
    labels = k_means.labels_
    print "#########################################################################################################\n"
    print "K-MEANS on PCA-reduced data"
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"kmeans_PCA_metrics.txt"
    file = open(results, "w")
    file.write("K-Means clustering on the PCA-reduced data\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max]
    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() , reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min() , reduced_data[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired,
              aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = k_means.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1],
               marker='x', s=169, linewidths=3,
               color='w', zorder=10)
    pl.title('K-means clustering on the PCA-reduced data\n'
             'Number of clusters: %i'%n_clusters)
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    save = Output + "kmeans_PCA.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans KMeans_PCA unsupervised")
def data_preprocessing_descriptive(Extracted_Features, Coma_Features,
                                   Corrected_Features):
    lvltrace.lvltrace(
        "LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive"
    )
    tools.separate_coma(Extracted_Features, Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Coma_Features + i
                output_i = Corrected_Features + i
                lines = tools.file_lines(input_i)
                ncol = tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer = csv.writer(file, lineterminator='\t')

                    data = np.genfromtxt(input_i, delimiter=',')
                    X = data[1:, 2:]
                    neuron_type = np.genfromtxt(input_i,
                                                delimiter=',',
                                                dtype=None)
                    y = neuron_type[:, 0]  # (class)

                    neuron_name = np.genfromtxt(input_i,
                                                delimiter=',',
                                                dtype=None)
                    z = neuron_name[:, 1]  # Neuron names

                    features = np.genfromtxt(input_i,
                                             delimiter=',',
                                             dtype=None)
                    w = features[0, :]  # features names

                    # Replace missing values 'NaN' by the column mean
                    imp = Imputer(missing_values='NaN',
                                  strategy='mean',
                                  axis=0)
                    imp.fit(X)
                    # Output replacement "Nan" values
                    Y = imp.transform(X)
                    #print i
                    #print Y.shape, y.shape,z.shape
                    #print Y.shape[1]

                    ####################
                    for line in xrange(Y.shape[0] + 1):
                        for colonne in xrange(Y.shape[1] + 2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t" % y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t" % z[line])
                                    else:
                                        file.write("%s\t" % w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t" % y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t" % z[line])
                                    else:
                                        file.write("%f\t" %
                                                   Y[line - 1, colonne - 2])
                        file.write("\n")
                    #########################
                else:
                    print "Only one morphology !!!"
                file.close()
    lvltrace.lvltrace(
        "LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive"
    )
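
A tiny demonstration of the mean imputation used above (old Imputer API from sklearn.preprocessing; recent releases use sklearn.impute.SimpleImputer instead):

import numpy as np
from sklearn.preprocessing import Imputer

X = np.array([[1.0, np.nan], [3.0, 4.0]])
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
print imp.fit_transform(X)  # the NaN is replaced by the column mean, 4.0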
Code example #36
def univariate(input_file, Output, percentile):
    ###############################################################################
    # import some data to play with
    lvltrace.lvltrace("LVLEntree dans univariate dans feature_selection")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size, n_features = X.shape

    ###############################################################################
    pl.figure(1)
    pl.clf()

    X_indices = np.arange(X.shape[-1])
    #print X_indices

    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    pl.bar(X_indices - .45,
           scores,
           width=.2,
           label=r'Univariate score ($-Log(p_{value})$)',
           color='g')

    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)

    svm_weights = (clf.coef_**2).sum(axis=0)
    svm_weights /= svm_weights.max()

    pl.bar((X_indices + 1) - .25,
           svm_weights,
           width=.2,
           label='SVM weight',
           color='r')

    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)

    svm_weights_selected = (clf_selected.coef_**2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()

    pl.bar(X_indices[selector.get_support()] - .05,
           svm_weights_selected,
           width=.2,
           label='SVM weights after selection',
           color='b')

    pl.title("Feature selection")
    pl.xlabel('Feature number')
    pl.yticks(())
    pl.axis('tight')
    pl.legend(loc='upper right')
    save = Output + "univariate.png"
    pl.savefig(save)
    # Print the feature ranking
    results = Output + "univariate.txt"
    file = open(results, "w")
    file.write("Feature Ranking\n")
    #print len(X_indices[selector.get_support()])
    for i in xrange(len(X_indices[selector.get_support()])):
        #print i
        #print (X_indices[selector.get_support()][i]+1)
        #print svm_weights_selected[i]
        file.write("%f,%f\n" % ((X_indices[selector.get_support()][i] + 1),
                                svm_weights_selected[i]))
    file.close()

    #print("Feature ranking:")
    #print (X_indices[selector.get_support()] +1)
    #print svm_weights_selected
    lvltrace.lvltrace("LVLSortie dans univariate dans feature_selection")
def preprocessing_module(Extracted_Features,Coma_Features,Corrected_Features, Norm,ontology):
    # Convert tab-separated csv files into comma-separated ones and replace categorical class names with integers
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    writer=csv.writer(onto, lineterminator=',')
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration=1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                #print i
                input_i=Extracted_Features+i
                output_i=Coma_Features+i
                file = open(output_i, "w")
                writer=csv.writer(file, lineterminator=',')
                lines=tools.file_lines(input_i)+1
                ncol=tools.file_col(input_i)-1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            if col != 1: # skip the neuron-name column in the header
                                file.write("%s,"%tools.read_csv_tab(input_i,col,line))
                        elif col == 0: # replace class names by an integer
                            file.write("%i,"%class_number)
                        elif col == 1: # log the neuron name in the ontology file instead
                            onto.write("%i,%s,%i,%s\n"%(Iteration,i,class_number,tools.read_csv_tab(input_i,col,line)))
                            Iteration=Iteration+1
                        else:
                            file.write("%s,"%tools.read_csv_tab(input_i,col,line))
                    file.write("\n")
                file.close()
                class_number = class_number + 1
                if lines > 3 :
                    input_file=Coma_Features+i
                    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1),skiprows=1) # ncol-1 because we skip the class names
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int) # Labels (class)
                    # Replace missing values 'NaN' by the column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    # Output replacement "Nan" values
                    Y=imp.transform(X)
                    # Data standardization
                    if Norm == 'normalize':
                        Z=preprocessing.normalize(Y, axis=0, norm='l2') # L2 normalization
                    elif Norm == 'binarize':
                        binarizer=preprocessing.Binarizer().fit(Y) # Binarize for Bernoulli
                        Z = binarizer.transform(Y)
                    elif Norm == 'standardize':
                        min_max_scaler = preprocessing.MinMaxScaler() # Rescale the data to [0,1]
                        Z=min_max_scaler.fit_transform(Y)
                    else:
                        Z=preprocessing.scale(Y) # Scale to zero mean, unit variance

                    #Create new files with corrected and standardized data
                    output_file=Corrected_Features+i
                    file = open(output_file, "w")
                    writer=csv.writer(file, lineterminator=',')
                    for line_1 in xrange(lines-1):
                        for col_1 in xrange(ncol-1):
                            if col_1==0:
                                file.write("%s,"%y[line_1])
                            else:
                                file.write("%f,"%Z[line_1,col_1])
                        file.write("\n")
                    file.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
Code example #39
def affinitypropagation(input_file,type,pref,Output):
    lvltrace.lvltrace("LVLEntree dans affinitypropagation unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))

    X = data[:,1:]
    #print (" ici X vaut ")
    #print X
    #print (" fin de print X")
    labels_true = data[:,0]
    # Similarity: Spearman rank correlation or negative squared euclidean distances
    if type == 'spearmanr':
        X = scipy.stats.stats.spearmanr(X,axis=1)[0]
    elif type == 'euclidean':
        X = -euclidean_distances(X, squared=True)
    else:
        print "Unknown similarity type:", type
    # Preference: typically tuned between the minimum and the median of the similarities
    if pref == 'median':
        preference = np.median(X)
    elif pref == 'mean':
        preference = np.mean(X)
    elif pref == 'min':
        preference = np.min(X)
    else:
        print "Unknown preference choice:", pref
    print "#########################################################################################################\n"
    print "Affinity Propagation"
    print preference
    n_samples, n_features = X.shape
    cluster_centers_indices, labels = affinity_propagation(X, preference=preference)
    #print cluster_centers_indices
    n_clusters_ = len(cluster_centers_indices)
    #print labels_true
    #print labels
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"affinity_propagation.txt"
    file = open(results, "w")
    file.write("Affinity Propagation\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(labels_true, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(labels_true, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(labels_true, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(labels_true, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(labels_true,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(labels_true)):
        file.write("%f,%f,%i\n"%(labels_true[n],labels[n],(n+1)))
    file.close()
    
    # Plot result
    import pylab as pl
    from itertools import cycle
    pl.close('all')
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbg')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            pl.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "affinity_propagation.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans affinitypropagation unsupervised")
Code example #40
def dbscan(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans dbscan unsupervised")
    # Generate sample data
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    labels_true = data[:,0]
    #X = StandardScaler().fit_transform(Y)
    # Compute DBSCAN
    db = DBSCAN().fit(X)
    core_samples = db.core_sample_indices_
    labels = db.labels_
    print "#########################################################################################################\n"
    print "DBSCAN"
    print labels_true
    print labels
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels))
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"dbscan.txt"
    file = open(results, "w")
    file.write("DBSCAN\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()
    
    # Plot result
    import pylab as pl
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
            markersize = 6
        class_members = [index[0] for index in np.argwhere(labels == k)]
        cluster_core_samples = [index for index in core_samples
                                if labels[index] == k]
        for index in class_members:
            x = X[index]
            if index in core_samples and k != -1:
                markersize = 14
            else:
                markersize = 6
            pl.plot(x[0], x[1], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=markersize)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "dbscan.png"
    pl.savefig(save)
    lvltrace.lvltrace("LVLSortie dans dbscan unsupervised")
Code example #42
def meanshift(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans meanshift unsupervised")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    sample_size, n_features = X.shape
    # Compute clustering with MeanShift
    # The bandwidth can be estimated automatically with estimate_bandwidth
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=sample_size)
    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print "#########################################################################################################\n"
    print "Mean Shift"
    print("number of estimated clusters : %d" % n_clusters_)
    #print labels
    #print y
    print('homogeneity_score: %f'%metrics.homogeneity_score(y, labels))
    print('completeness_score: %f'%metrics.completeness_score(y, labels))
    print('v_measure_score: %f'%metrics.v_measure_score(y, labels))
    print('adjusted_rand_score: %f'%metrics.adjusted_rand_score(y, labels))
    print('adjusted_mutual_info_score: %f'%metrics.adjusted_mutual_info_score(y,  labels))
    try:
        print('silhouette_score: %f'%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except ValueError:
        print "ValueError: number of labels is 1 but should be at least 2 and at most n_samples - 1"
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"mean_shift_metrics.txt"
    file = open(results, "w")
    file.write("Mean Shift\n")
    file.write("Homogeneity Score: %f\n"%metrics.homogeneity_score(y, labels))
    file.write("Completeness Score: %f\n"%metrics.completeness_score(y, labels))
    file.write("V-Measure: %f\n"%metrics.v_measure_score(y, labels))
    file.write("The adjusted Rand index: %f\n"%metrics.adjusted_rand_score(y, labels))
    file.write("Adjusted Mutual Information: %f\n"%metrics.adjusted_mutual_info_score(y,  labels))
    try:
        file.write("Silhouette Score: %f\n"%metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size))
    except ValueError:
        file.write("ValueError: number of labels is 1 but should be at least 2 and at most n_samples - 1")
    file.write("\n")
    file.write("True Value, Clusters, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],labels[n],(n+1)))
    file.close()

    # Plot result
    import pylab as pl
    from itertools import cycle
    fig, ax = plt.subplots()
    im=ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for k in xrange(n_clusters_):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        #print cluster_center[0], cluster_center[1]
        ax.plot(cluster_center[0], cluster_center[1], marker='x', color='b',
                markersize=6)
    fig.colorbar(im)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    save = Output + "mean_shift.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLSortie dans meanshift unsupervised")