def features_by_class():
    """Aggregate per-neuron feature CSVs into one tab-separated file per class.

    Walks ``inputs.morphology``; for each sub-directory (one class) writes
    ``<class>.csv`` under ``inputs.features_output`` with a header row taken
    from a randomly picked feature file of that class, then one row per
    non-empty ``.csv`` neuron file: class name, file name, feature values.
    """
    lvltrace.lvltrace("LVLEntree dans features_by_class")
    if not os.path.exists(inputs.features_output):
        os.makedirs(inputs.features_output)
    for root, dirs, files in os.walk(inputs.morphology):
        for class_dir in dirs:
            neuron_dir = root + '/' + class_dir
            neuron_file_out = inputs.features_output + '/' + class_dir + '.csv'
            # BUG FIX: the original bound this handle to the function's own
            # name (shadowing it) and never closed it; renamed and closed.
            out_file = open(neuron_file_out, "w")
            # Header row: feature names are read from column 1 of an arbitrary
            # file of the class — assumes all files share the same feature list.
            features_name = tools.random_file(neuron_dir)
            lines = tools.file_lines(features_name)
            out_file.write("mtype\tneuron_name\t")
            for line in xrange(lines):
                out_file.write("%s\t" % tools.read_csv_tab(features_name, 1, line))
            out_file.write("\n")

            for file_name in os.listdir(neuron_dir):
                neuron_file_in = root + '/' + class_dir + '/' + file_name
                # If the extracted feature file from lmeasure is empty, skip it.
                if file_name.endswith(".csv") and os.path.getsize(neuron_file_in) > 0:
                    lines = tools.file_lines(neuron_file_in)
                    out_file.write("%s\t" % class_dir)
                    out_file.write("%s\t" % file_name)
                    for line in xrange(lines):
                        out_file.write("%s\t" % tools.read_csv_tab(neuron_file_in, 2, line))
                    out_file.write("\n")
            out_file.close()
    lvltrace.lvltrace("LVLSortie dans features_by_class")
# Beispiel #2  (stray snippet-separator text from source concatenation; commented out
# 0            so it cannot raise NameError at import time)
    def part_and_queue(self):
        """Split ``self.file_name`` into ``self.block_num`` line ranges and
        enqueue each block's lines on the global queue ``q``.

        First computes inclusive ``(start, end)`` line-index pairs, then reads
        the file once sequentially, putting each block's newline-stripped
        lines into ``q`` as a list of strings.
        """
        pos_list = []
        file_size = tools.file_lines(self.file_name)
        # NOTE: Python 2 integer division — each block gets
        # floor(lines / blocks) lines; the last block absorbs the remainder.
        block_size = file_size / self.block_num
        start_pos = 0
        global q

        for i in range(self.block_num):
            if i == self.block_num - 1:
                # Last block: extend to the final line regardless of remainder.
                pos_list.append((start_pos, file_size - 1))
                break
            end_pos = start_pos + block_size - 1
            if end_pos >= file_size:
                end_pos = file_size - 1
            if start_pos >= file_size:
                # More blocks than lines: stop early.
                break
            pos_list.append((start_pos, end_pos))
            start_pos = end_pos + 1

        # 'with' guarantees the handle is closed even if q.put raises
        # (the original open/close pair leaked the handle on error).
        with open(self.file_name, 'r') as fd:
            for start, end in pos_list:
                temp_text = []
                while start <= end:
                    temp_text.append(fd.readline().strip('\n'))
                    start += 1
                q.put(temp_text)
def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive")
    tools.separate_coma(Extracted_Features,Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i=Coma_Features+i
                output_i=Corrected_Features+i
                lines=tools.file_lines(input_i)
                ncol=tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer=csv.writer(file, lineterminator='\t')
                    
                    data = np.genfromtxt(input_i,delimiter=',')
                    X = data[1:, 2:]
                    neuron_type = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    y = neuron_type[:, 0] # (class)

                    neuron_name = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    z = neuron_name[:, 1] # Neuron names
                    
                    features = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    w = features[0, :] # features names
                    
                    #Replace missing values 'nan' by column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
                    # Output replacement "Nan" values
                    Y=imp.transform(X)
                    #print i
                    #print Y.shape, y.shape,z.shape
                    #print Y.shape[1]
                    
                    ####################
                    for line in xrange(Y.shape[0]+1):
                        for colonne in xrange(Y.shape[1]+2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%s\t"%w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%f\t"%Y[line-1,colonne-2])
                        file.write("\n")
                    #########################
                else:
                    print "Only one morphology !!!"
                file.close()
    lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
def separate_coma(input,output):
    """Rewrite every tab-separated CSV under ``input`` as a comma-separated
    copy of the same name under ``output``.

    Hidden files (leading '.') are skipped.  Each cell is re-read through
    ``tools.read_csv_tab`` and written followed by a trailing comma.
    """
    for root, dirs, files in os.walk(input):
        for name in files:
            if not name.startswith('.'):
                input_i = input + name
                output_i = output + name
                lines = tools.file_lines(input_i) + 1  # +1: include the header row
                ncol = tools.file_col(input_i)
                # 'with' closes the handle even if read_csv_tab raises; the
                # original also reused the filename loop variable 'i' as the
                # row index, clobbering it — renamed to avoid the shadowing.
                with open(output_i, "w") as out_file:
                    for row in xrange(lines):
                        for col in xrange(ncol):
                            out_file.write("%s," % tools.read_csv_tab(input_i, col, row))
                        out_file.write("\n")
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features,
                         Norm, ontology):
    """Convert tab-separated feature files to comma-separated ones, impute
    missing values, standardize, and record the class/neuron ontology.

    For every non-hidden file under ``Extracted_Features``:
      1. write a comma-separated copy to ``Coma_Features`` in which the class
         name (column 0) is replaced by an integer code and the neuron name
         (column 1) is diverted into the ``ontology`` CSV;
      2. if the file has more than 3 lines, replace 'nan' values by the
         column mean, apply the standardization selected by ``Norm``
         ('normalize', 'binarize', 'standardize', or default scaling) and
         write the result to ``Corrected_Features``.
    """
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Extracted_Features + i
                output_i = Coma_Features + i
                out_file = open(output_i, "w")
                lines = tools.file_lines(input_i) + 1
                ncol = tools.file_col(input_i) - 1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            # Header row: copy every cell except the
                            # neuron-name column (col 1).
                            if col != 1:
                                out_file.write("%s," % tools.read_csv_tab(input_i, col, line))
                        elif col == 0:
                            # Replace the class name by its integer code.
                            out_file.write("%i," % class_number)
                        elif col == 1:
                            # Neuron name goes to the ontology file instead.
                            onto.write("%i,%s,%i,%s\n" %
                                       (Iteration, i, class_number,
                                        tools.read_csv_tab(input_i, col, line)))
                            Iteration += 1
                        else:
                            out_file.write("%s," % tools.read_csv_tab(input_i, col, line))
                    out_file.write("\n")
                out_file.close()
                class_number += 1
                if lines > 3:
                    input_file = Coma_Features + i
                    data = np.loadtxt(input_file,
                                      delimiter=',',
                                      usecols=range(ncol - 1),
                                      skiprows=1)  # ncol-1: the neuron-name column was dropped
                    # NOTE(review): data only has ncol-1 columns, so this
                    # slice keeps them all — including the class code in
                    # column 0 as a "feature"; confirm that is intended.
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int)  # integer class labels
                    # Replace missing values ('nan') by the column mean.
                    # (The original built a second, discarded Imputer —
                    # dead code, removed.)
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Y = imp.transform(X)
                    # Data standardization, selected by Norm.
                    if Norm == 'normalize':
                        Z = preprocessing.normalize(Y, axis=0, norm='l2')
                    elif Norm == 'binarize':
                        # Binarize for Bernoulli models.
                        Z = preprocessing.Binarizer().fit(Y).transform(Y)
                    elif Norm == 'standardize':
                        # MinMaxScaler maps the data to [0, 1].
                        Z = preprocessing.MinMaxScaler().fit_transform(Y)
                    else:
                        Z = preprocessing.scale(Y)  # default: zero mean, unit variance

                    # Write the corrected, standardized data.
                    output_file = Corrected_Features + i
                    out_file = open(output_file, "w")
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                out_file.write("%s," % y[line_1])
                            else:
                                out_file.write("%f," % Z[line_1, col_1])
                        out_file.write("\n")
                    out_file.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
def data_preprocessing_descriptive(Extracted_Features, Coma_Features,
                                   Corrected_Features):
    lvltrace.lvltrace(
        "LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive"
    )
    tools.separate_coma(Extracted_Features, Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Coma_Features + i
                output_i = Corrected_Features + i
                lines = tools.file_lines(input_i)
                ncol = tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer = csv.writer(file, lineterminator='\t')

                    data = np.genfromtxt(input_i, delimiter=',')
                    X = data[1:, 2:]
                    neuron_type = np.genfromtxt(input_i,
                                                delimiter=',',
                                                dtype=None)
                    y = neuron_type[:, 0]  # (class)

                    neuron_name = np.genfromtxt(input_i,
                                                delimiter=',',
                                                dtype=None)
                    z = neuron_name[:, 1]  # Neuron names

                    features = np.genfromtxt(input_i,
                                             delimiter=',',
                                             dtype=None)
                    w = features[0, :]  # features names

                    #Replace missing values 'nan' by column mean
                    imp = Imputer(missing_values='NaN',
                                  strategy='mean',
                                  axis=0)
                    imp.fit(X)
                    Imputer(axis=0,
                            copy=True,
                            missing_values='NaN',
                            strategy='mean',
                            verbose=0)
                    # Output replacement "Nan" values
                    Y = imp.transform(X)
                    #print i
                    #print Y.shape, y.shape,z.shape
                    #print Y.shape[1]

                    ####################
                    for line in xrange(Y.shape[0] + 1):
                        for colonne in xrange(Y.shape[1] + 2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t" % y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t" % z[line])
                                    else:
                                        file.write("%s\t" % w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t" % y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t" % z[line])
                                    else:
                                        file.write("%f\t" %
                                                   Y[line - 1, colonne - 2])
                        file.write("\n")
                    #########################
                else:
                    print "Only one morphology !!!"
                file.close()
    lvltrace.lvltrace(
        "LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive"
    )
def descriptive_multi_cores(data):
    """Compute per-feature descriptive statistics for one class file.

    ``data`` is indexable: data[1] = output directory, data[2] = class file
    name, data[3] = input CSV path, data[4] = feature names by column.
    Writes one CSV row per feature column (N, mean, std dev, variance, max,
    min, coefficient of variation, interquartile range, Shapiro normality
    p-value, 95% confidence-interval bounds) and, when features look normally
    distributed, an mpld3 HTML plot of the fitted normal curves.

    Indentation normalized to 4 spaces (the original mixed tabs and spaces).
    """
    fig, ax = plt.subplots()
    lines = tools.file_lines(data[3])
    ncol = tools.file_col(data[3]) - 1
    outputs_files = data[1] + data[2]
    class_name = os.path.splitext(data[2])[0]
    header = ("Variables,Class,N,Mean,Std_Dev,Variance,Max,Min,Coeff_Var,"
              "Interquartile,Normality_distrib,Confident_intervals_left,"
              "Confident_intervals_right\n")

    if lines == 1:
        # Single observation: std/variance are 0 and no distribution tests
        # are possible; Coeff_Var is reported as the literal 100.
        out = open(outputs_files, "w")
        out.write(header)
        for col in xrange(ncol - 2):
            value = float(tools.read_float_tab(data[3], col + 2, 1))
            out.write("%s," % data[4][col + 2])
            out.write("%s," % class_name)
            for stat in (1.0, value, 0.0, 0.0, value, value):
                out.write("%f," % stat)
            out.write("100,")
            out.write("0,")
            out.write("0,")
            out.write("0,")
            out.write("0,\n")
        out.close()
    else:
        out = open(outputs_files, "w")
        out.write(header)
        a = [0 for _ in xrange(lines)]
        for col in xrange(ncol - 2):
            for j in xrange(lines):
                a[j] = tools.read_float_tab(data[3], col + 2, j)
            # Hoisted out of the read loop — the original recomputed this
            # on every row for no effect.
            n = float(len(a))
            mean = float(np.mean(a))
            std = float(np.std(a, dtype=float))
            var = float(np.var(a, dtype=float))
            hi = float(max(a))
            lo = float(min(a))
            if np.around(std, decimals=12) != 0.0:
                coeff_var = float(abs((np.std(a) / np.mean(a)) * 100))
                if n < 3:
                    # Too few points for quartiles / the Shapiro test.
                    out.write("%s," % data[4][col + 2])
                    out.write("%s," % class_name)
                    for stat in (n, mean, std, var, hi, lo, coeff_var):
                        out.write("%f," % stat)
                    out.write("0,")
                    out.write("0,")
                    out.write("0,")
                    out.write("0,\n")
                else:
                    X = sorted(a)
                    # BUG FIX: scipy.stats.scoreatpercentile expects the
                    # percentile in [0, 100]; the original passed .75/.25,
                    # which made the IQR effectively always ~0.
                    upperQuartile = stats.scoreatpercentile(X, 75)
                    lowerQuartile = stats.scoreatpercentile(X, 25)
                    iqr = float(upperQuartile - lowerQuartile)
                    normality = stats.shapiro(a)
                    p_value = float(normality[1])
                    # 95% confidence interval for the mean (normal approx.).
                    half_width = 1.96 * (np.std(a) / math.sqrt(len(a)))
                    ci_left = np.mean(a) - half_width
                    ci_right = np.mean(a) + half_width
                    out.write("%s," % data[4][col + 2])
                    out.write("%s," % class_name)
                    for stat in (n, mean, std, var, hi, lo, coeff_var, iqr,
                                 p_value, ci_left):
                        out.write("%f," % stat)
                    out.write("%f,\n" % ci_right)
                    if p_value >= 0.05:
                        # Feature looks normal: overlay its fitted Gaussian.
                        norm_distrib = np.linspace(-150, 150, 100)
                        ax.set_title('Normality of %s class features' % class_name)
                        ax.plot(norm_distrib,
                                mlab.normpdf(norm_distrib, np.mean(a),
                                             math.sqrt(np.var(a))),
                                label=data[4][col + 2], ms=10, alpha=0.3)
                        ax.legend(loc=2, ncol=1, bbox_to_anchor=(0, 0, 1, 0.7),
                                  fancybox=True, shadow=False, fontsize=5)
                        ax.grid(color='lightgray', alpha=0.5)
                        ax.patch.set_facecolor('white')
            else:
                # Constant feature: coefficient of variation and normality
                # p-value are reported as 0.
                X = sorted(a)
                upperQuartile = stats.scoreatpercentile(X, 75)
                lowerQuartile = stats.scoreatpercentile(X, 25)
                iqr = float(upperQuartile - lowerQuartile)
                half_width = 1.96 * (np.std(a) / math.sqrt(len(a)))
                ci_left = np.mean(a) - half_width
                ci_right = np.mean(a) + half_width
                out.write("%s," % data[4][col + 2])
                out.write("%s," % class_name)
                for stat in (n, mean, std, var, hi, lo, 0.0, iqr, 0.0, ci_left):
                    out.write("%f," % stat)
                out.write("%f,\n" % ci_right)
        out.close()
        display_d3(fig)
        html = mpld3.fig_to_d3(fig)
        html_normality = data[1] + data[2] + ".html"
        normality_display = open(html_normality, "w")
        normality_display.write("%s" % html)
        normality_display.close()
        plt.close()
def preprocessing_module(Extracted_Features,Coma_Features,Corrected_Features, Norm,ontology):
    """Convert tab-separated feature files to comma-separated ones, impute
    missing values, standardize, and record the class/neuron ontology.

    For every non-hidden file under ``Extracted_Features``:
      1. write a comma-separated copy to ``Coma_Features`` in which the class
         name (column 0) is replaced by an integer code and the neuron name
         (column 1) is diverted into the ``ontology`` CSV;
      2. if the file has more than 3 lines, replace 'nan' values by the
         column mean, apply the standardization selected by ``Norm``
         ('normalize', 'binarize', 'standardize', or default scaling) and
         write the result to ``Corrected_Features``.
    """
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration=1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                input_i=Extracted_Features+i
                output_i=Coma_Features+i
                out_file = open(output_i, "w")
                lines=tools.file_lines(input_i)+1
                ncol=tools.file_col(input_i)-1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            # Header row: copy everything except the
                            # neuron-name column.
                            if col != 1:
                                out_file.write("%s,"%tools.read_csv_tab(input_i,col,line))
                        elif col == 0:
                            # Replace the class name by its integer code.
                            out_file.write("%i,"%class_number)
                        elif col == 1:
                            # Neuron name goes into the ontology file instead.
                            onto.write("%i,%s,%i,%s\n"%(Iteration,i,class_number,tools.read_csv_tab(input_i,col,line)))
                            Iteration=Iteration+1
                        else:
                            out_file.write("%s,"%tools.read_csv_tab(input_i,col,line))
                    out_file.write("\n")
                out_file.close()
                class_number = class_number + 1
                if lines > 3 :
                    input_file=Coma_Features+i
                    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1),skiprows=1) # ncol-1 because the neuron-name column was dropped
                    # NOTE(review): data only has ncol-1 columns, so this
                    # slice keeps them all — including the class code in
                    # column 0 as a "feature"; confirm that is intended.
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int) # integer class labels
                    # Replace missing values ('nan') by the column mean.
                    # (The original also built a second, discarded Imputer —
                    # dead code, removed.)
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Y=imp.transform(X)
                    # Data standardization, selected by Norm.
                    if Norm == 'normalize':
                        Z=preprocessing.normalize(Y, axis=0, norm='l2')
                    elif Norm == 'binarize':
                        # Binarize for Bernoulli models.
                        Z = preprocessing.Binarizer().fit(Y).transform(Y)
                    elif Norm == 'standardize':
                        # MinMaxScaler maps the data to [0, 1].
                        Z = preprocessing.MinMaxScaler().fit_transform(Y)
                    else:
                        Z=preprocessing.scale(Y) # default: zero mean, unit variance

                    # Write the corrected, standardized data.
                    output_file=Corrected_Features+i
                    out_file = open(output_file, "w")
                    for line_1 in xrange(lines-1):
                        for col_1 in xrange(ncol-1):
                            if col_1==0:
                                out_file.write("%s,"%y[line_1])
                            else:
                                out_file.write("%f,"%Z[line_1,col_1])
                        out_file.write("\n")
                    out_file.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")