def collect_distances(input_dir, output_neuron_distance_csv):
    """Collect neuron-distance results from ``*.r.log`` files into one CSV.

    Walks ``input_dir`` recursively. For every file whose name ends with
    ``.r.log`` one row is recorded, holding the name of the folder that
    contains the log, the two swc files and the distance values parsed by
    ``bn.read_neuron_dist_log``, and the algorithm name that
    ``rp.matchFileToAlgorithmName`` derives from the log path without its
    last six characters. Rows whose average distance equals -1 are dropped
    before writing.

    Args:
        input_dir: root directory to search for ``*.r.log`` files.
        output_neuron_distance_csv: path of the CSV file to write (no index
            column). A header-only file is written when nothing is found.

    Returns:
        None.
    """
    columns = ['image_file_name', 'swc_file', 'gold_swc_file', 'algorithm',
               'neuron_distance_12', 'neuron_distance_21',
               'neuron_distance_ave', 'neuron_distance_diff',
               'neuron_distance_perc']

    # Rows are gathered in a plain list and turned into a frame once;
    # growing a DataFrame with .loc[i] re-allocates it for every row.
    rows = []
    for dirpath, _dirnames, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.endswith(".r.log"):
                continue
            log_f = os.path.join(dirpath, filename)
            # Containing folder name, independent of the path separator.
            image = os.path.basename(os.path.dirname(log_f))
            # log_f[0:-6] strips the trailing ".r.log".
            algorithm = rp.matchFileToAlgorithmName(log_f[0:-6])
            nd = bn.read_neuron_dist_log(log_f)
            rows.append([image, nd['input_file1'], nd['input_file2'],
                         algorithm, nd['dist_12'], nd['dist_21'], nd['ave'],
                         nd['diff'], nd['perc']])

    df_neuron_distance = pd.DataFrame(rows, columns=columns)
    # An average of -1 marks a log without a usable result.
    df_neuron_distance = df_neuron_distance[
        df_neuron_distance['neuron_distance_ave'] != -1]
    df_neuron_distance.to_csv(output_neuron_distance_csv, index=False)
    return
def collect_distances(input_dir, output_neuron_distance_csv):
    """Collect neuron-distance results from ``*.r.log`` files into one CSV.

    Walks ``input_dir`` recursively. For every file whose name ends with
    ``.r.log`` one row is recorded, holding the name of the folder that
    contains the log, the two swc files and the distance values parsed by
    ``bn.read_neuron_dist_log``, and the algorithm name that
    ``rp.matchFileToAlgorithmName`` derives from the log path without its
    last six characters. Rows whose average distance equals -1 are dropped
    before writing.

    Args:
        input_dir: root directory to search for ``*.r.log`` files.
        output_neuron_distance_csv: path of the CSV file to write (no index
            column). A header-only file is written when nothing is found.

    Returns:
        None.
    """
    columns = ['image_file_name', 'swc_file', 'gold_swc_file', 'algorithm',
               'neuron_distance_12', 'neuron_distance_21',
               'neuron_distance_ave', 'neuron_distance_diff',
               'neuron_distance_perc']

    # Rows are gathered in a plain list and turned into a frame once;
    # growing a DataFrame with .loc[i] re-allocates it for every row.
    rows = []
    for dirpath, _dirnames, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.endswith(".r.log"):
                continue
            log_f = os.path.join(dirpath, filename)
            # Containing folder name, independent of the path separator.
            image = os.path.basename(os.path.dirname(log_f))
            # log_f[0:-6] strips the trailing ".r.log".
            algorithm = rp.matchFileToAlgorithmName(log_f[0:-6])
            nd = bn.read_neuron_dist_log(log_f)
            rows.append([image, nd['input_file1'], nd['input_file2'],
                         algorithm, nd['dist_12'], nd['dist_21'], nd['ave'],
                         nd['diff'], nd['perc']])

    df_neuron_distance = pd.DataFrame(rows, columns=columns)
    # An average of -1 marks a log without a usable result.
    df_neuron_distance = df_neuron_distance[
        df_neuron_distance['neuron_distance_ave'] != -1]
    df_neuron_distance.to_csv(output_neuron_distance_csv, index=False)
    return
_AVERAGE_DISTANCE_COLUMNS = [
    'image_id', 'algorithm', 'swc_file_name', 'average_sum_distance',
    'average_structure_difference', 'average_max_distance'
]


def _remove_empty_files(root_dir):
    """Delete every zero-byte regular file below root_dir (best effort)."""
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            try:
                if os.path.getsize(path) == 0:
                    os.remove(path)
            except OSError as err:
                # Cleanup is a convenience; never abort the run for it.
                print("could not remove empty file %s: %s" % (path, err))


def _collect_average_distances(input_data_dir, imageIDs,
                               distance_file_postfix, all_average_csv):
    """Write one row per reconstruction of every image to all_average_csv."""
    try:
        string_types = basestring  # Python 2: matches str and unicode
    except NameError:
        string_types = str

    rows = []
    for image_id in imageIDs:
        csv_file = input_data_dir + '/' + image_id + '_' + distance_file_postfix
        if not os.path.exists(csv_file):
            # for gold 163, consensus results are stored in individual
            # image folders
            csv_file = input_data_dir + '/' + image_id + '/' + distance_file_postfix
            if not os.path.exists(csv_file):
                print("missing " + csv_file)
                continue
        print("read " + csv_file)
        df_ff = calculate_average_all_pair_distance(csv_file,
                                                    hasConsensus=True)
        if df_ff.empty:
            print("empty df_ff:" + csv_file)
            continue
        for i in range(df_ff.shape[0]):
            record = df_ff.iloc[i]
            swc_file_name = record['swc_file_name']
            if not isinstance(swc_file_name, string_types):
                # A missing file name is read back as NaN (a float).
                print("nan input swc_file_name")
                print(swc_file_name)
                continue
            alg = rp.matchFileToAlgorithmName(swc_file_name.split('/')[-1])
            rows.append([
                image_id, alg, swc_file_name,
                record['average_sum_distance'],
                record['average_structure_difference'],
                record['average_max_distance']
            ])

    df_all = pd.DataFrame(rows, columns=_AVERAGE_DISTANCE_COLUMNS)
    df_all.to_csv(all_average_csv, index=False)
    print("Done collecting median distances")
    print("Output:" + all_average_csv)


def _plot_algorithm_comparison(all_average_csv, output_dir, DISPLAY):
    """Save a bar plot of average_sum_distance per algorithm."""
    metric = 'average_sum_distance'
    df_all = pd.read_csv(all_average_csv)
    all_algorithms = np.unique(df_all.algorithm)
    plt.figure()
    sb.set_context("talk", font_scale=0.7)
    dfg = df_all.groupby('algorithm')
    # n per algorithm = number of rows with a non-negative metric value.
    sample_size_per_algorithm = np.array(
        [(dfg.get_group(alg)[metric] >= 0).sum() for alg in all_algorithms],
        dtype=float)
    # Largest sample first.
    order = sample_size_per_algorithm.argsort()[::-1]
    algorithms_ordered = all_algorithms[order]
    sample_size_per_algorithm = sample_size_per_algorithm[order]
    a = sb.barplot(y='algorithm', x=metric, data=df_all,
                   order=algorithms_ordered)
    algorithm_names = [
        rp.map_better_algorithm_name(x) for x in algorithms_ordered
    ]
    a.set_yticklabels([
        '%s ($n$=%d )' % (name, size)
        for name, size in zip(algorithm_names, sample_size_per_algorithm)
    ])
    plt.subplots_adjust(left=0.4, bottom=0.1, top=0.9)
    plt.savefig(output_dir + '/compare_distance_plot.png', format='png')
    if DISPLAY:
        plt.show()
    plt.close()
    print("Done plotting algorithm comparison.")


def _extract_median_and_consensus(all_average_csv, output_csv):
    """Write a consensus row and a median row for each image to output_csv."""
    df_all = pd.read_csv(all_average_csv)
    print(df_all.shape)
    sort_keys = ['average_sum_distance', 'average_structure_difference',
                 'average_max_distance']
    rows = []
    # sort=False keeps the images in order of first appearance.
    for image_id, df_group in df_all.groupby('image_id', sort=False):
        # drop nans
        df_image = df_group.dropna(axis=0, how="any")
        if len(df_image) < 1:
            print("no valid recons found for :" + str(image_id))
            continue
        consensus_pos = None
        for pos, fn in enumerate(df_image['swc_file_name']):
            if 'consensus' in fn:
                consensus_pos = pos
                break
        if consensus_pos is None:
            print("no consensus found, image id:" + str(image_id))
            continue
        consensus = df_image.iloc[consensus_pos]
        candidates = df_image.drop(df_image.index[[consensus_pos]], axis=0)
        if candidates.empty:
            # Without any other reconstruction there is no median; skip the
            # image so consensus and median rows always come in pairs.
            print("no non-consensus recons found for :" + str(image_id))
            continue
        # The median reconstruction is the one with the smallest distances.
        median = candidates.sort_values(by=sort_keys, ascending=True).iloc[0]
        for label, record in (('consensus', consensus), ('median', median)):
            rows.append([
                image_id, label, record['swc_file_name'],
                record['average_sum_distance'],
                record['average_structure_difference'],
                record['average_max_distance']
            ])

    df_median_and_consensus = pd.DataFrame(
        rows, columns=_AVERAGE_DISTANCE_COLUMNS)
    df_median_and_consensus.to_csv(output_csv)
    print("Done extracting median distances")
    print("Output median and consensus distances for each image:" + output_csv)


def _compare_median_and_consensus(median_and_consensus_csv, output_dir,
                                  DISPLAY):
    """Report median-minus-consensus differences and plot both series."""
    df_both = pd.read_csv(median_and_consensus_csv)
    df_consensus = df_both[df_both['algorithm'] == 'consensus'].reset_index()
    df_median = df_both[df_both['algorithm'] == 'median'].reset_index()
    if df_median.empty or df_consensus.empty:
        print("no median/consensus pairs found in:" + median_and_consensus_csv)
        return

    # sort by average distance; the k-th consensus row belongs to the k-th
    # median row, so the consensus rows are reordered by the same positions.
    df_median = df_median.sort_values(by=['average_sum_distance'])
    df_median['order'] = range(0, len(df_median))
    df_consensus = df_consensus.iloc[df_median.index].copy()
    df_consensus['order'] = range(0, len(df_median))

    difference = (df_median['average_sum_distance'] -
                  df_consensus['average_sum_distance'])
    df_diff = pd.DataFrame(
        {'image_id': df_median['image_id'].values,
         'difference': difference.values},
        index=df_median.index, columns=['image_id', 'difference'])

    print("median - consensus:")
    diff_values = difference.values
    for message, value in (
            ("max value: %f, image : %s", difference.max()),
            ("min value: %f, image id: %s", difference.min()),
            ("median value: %f, image : %s", difference.median())):
        # Position of the image whose difference is closest to the statistic.
        pos = np.nanargmin(np.abs(diff_values - value))
        print(message % (value, df_median.iloc[pos]['image_id']))

    df_ff_big = df_diff[df_diff['difference'] > 0]
    print("consensus is closer to each reconstructions than the median reconstructions in %.2f percent of the %d total images" % (
        100 * float(len(df_ff_big)) / len(df_diff), len(df_diff)))
    df_ff_small = df_diff[df_diff['difference'] < 0]
    df_ff_small.to_csv(output_dir + '/investigate.csv')
    print("investigate the following cases list in:" + output_dir + '/investigate.csv')

    # make sure the image_ids are matching
    for consensus_id, median_id in zip(df_consensus['image_id'],
                                       df_median['image_id']):
        if consensus_id != median_id:
            print("error matching")
            print(consensus_id)
            print(median_id)
            print("")
            # Same exception exit() raises, but with a failing status and
            # without depending on the site module.
            raise SystemExit(1)

    df_order = pd.concat([df_consensus, df_median])
    for plot_type in ['ts']:
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_sum_distance', plot_type,
                                      DISPLAY)
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_max_distance', plot_type,
                                      DISPLAY)
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_structure_difference',
                                      plot_type, DISPLAY)


def pipe(input_data_dir, output_dir, imageIDs,
         distance_file_postfix='median_distances.csv',
         COLLECT_FROM_DISTANCE_MATRIX=1, EXTRACT_MEDIAN_CONSENSUS=1,
         DISPLAY=0):
    """Collect averaged distances per image and compare median vs consensus.

    Stages, in order:
      1. (if COLLECT_FROM_DISTANCE_MATRIX) delete zero-byte files below
         ``input_data_dir``, create ``output_dir`` and write
         ``all_averaged_distances.csv`` from each image's distance file
         (``<image_id>_<postfix>`` or ``<image_id>/<postfix>``).
      2. Save ``compare_distance_plot.png`` (bar plot per algorithm).
      3. (if EXTRACT_MEDIAN_CONSENSUS) write
         ``extracted_median_consensus.csv`` with one consensus and one
         median row per image.
      4. Print median-minus-consensus statistics, write ``investigate.csv``
         and call ``plot_compare_median_consensus`` for the three metrics.

    Stages 2 and 4 read the CSV files from disk, so a skipped stage 1 or 3
    requires the corresponding file from an earlier run.

    Args:
        input_data_dir: directory holding the per-image distance files.
        output_dir: directory receiving all CSV and PNG outputs.
        imageIDs: iterable of image id strings.
        distance_file_postfix: file-name suffix of the distance files.
        COLLECT_FROM_DISTANCE_MATRIX: truthy to run stage 1.
        EXTRACT_MEDIAN_CONSENSUS: truthy to run stage 3.
        DISPLAY: truthy to show the plots interactively.

    Raises:
        SystemExit: when consensus and median rows do not refer to the same
            images after reordering.
    """
    BeginTime = time.time()
    all_average_csv = output_dir + '/all_averaged_distances.csv'
    output_median_and_consensus_csv_file = (
        output_dir + '/extracted_median_consensus.csv')

    if COLLECT_FROM_DISTANCE_MATRIX:
        # remove empty files
        _remove_empty_files(input_data_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        _collect_average_distances(input_data_dir, imageIDs,
                                   distance_file_postfix, all_average_csv)
    else:
        print("Distance csv files had been collected. Skip collecting.")

    _plot_algorithm_comparison(all_average_csv, output_dir, DISPLAY)

    if EXTRACT_MEDIAN_CONSENSUS:
        _extract_median_and_consensus(all_average_csv,
                                      output_median_and_consensus_csv_file)
    else:
        print("Median and consensus swc info had been collected. Skip this step.")

    _compare_median_and_consensus(output_median_and_consensus_csv_file,
                                  output_dir, DISPLAY)

    endtime = time.time()
    dur = (endtime - BeginTime) / 60
    print("All takes %d min" % (dur))
# Fill the per-algorithm template of the current image with the totals stored
# in "<data_DIR>/<image_id>_median_distances.csv".
# Uses names bound by the surrounding code: df_image_filled_template,
# all_algorithms, image_id, data_DIR and rp.
try:
    _string_types = basestring  # Python 2: matches str and unicode
except NameError:
    _string_types = str

# One template row per algorithm; values stay NaN until a match is found.
df_image_filled_template['algorithm'] = all_algorithms
df_image_filled_template['image_id'] = image_id
df_image_filled_template['swc_file_name'] = np.nan
df_image_filled_template['total_average_distance'] = np.nan
df_image_filled_template['total_structure_difference'] = np.nan
df_image_filled_template['total_max_distance'] = np.nan

csv_file = data_DIR + '/' + image_id + '_median_distances.csv'
if os.path.exists(csv_file):
    df_f = pd.read_csv(csv_file)
    for i in range(df_f.shape[0]):
        record = df_f.iloc[i]
        swc_file_name = record['swc_file_name']
        # isinstance (not type(...) == str) so unicode names and str
        # subclasses are accepted; a missing name is read back as NaN.
        if not isinstance(swc_file_name, _string_types):
            print("nan input swc_file_name")
            print(swc_file_name)
            continue
        alg = rp.matchFileToAlgorithmName(swc_file_name.split('/')[-1])
        df_a = df_image_filled_template[
            df_image_filled_template.algorithm == alg]
        if df_a.shape[0] > 0:
            # Index label of the first template row for this algorithm.
            row_label = df_a.index[0]
            df_image_filled_template.loc[row_label, 'swc_file_name'] = swc_file_name
            df_image_filled_template.loc[row_label, 'total_average_distance'] = record['total_average_distance']
            df_image_filled_template.loc[row_label, 'total_structure_difference'] = record['total_structure_difference']
            df_image_filled_template.loc[row_label, 'total_max_distance'] = record['total_max_distance']
        else:
            print(alg)
            print("no match!")
# Fills df_merge with [image, label, value] rows and writes it to merged_csv.
# For each image: one row per reconstruction in that image's df_nd_g group
# (label = its algorithm, value = its METRIC column); a 'consensus' row taken
# from REPORTING_METRIC of the first matching df_consensus_wd row; and a
# 'median' row holding METRIC of the reconstruction whose algorithm matches
# the image's swc file in dfg_median. Images without a dfg_median entry get no
# consensus/median rows (the `continue`).
# NOTE(review): this fragment has been flattened onto one line, so everything
# after the first `#` is a comment as far as the interpreter is concerned.
# With the indentation lost it cannot be told from here whether the 'median'
# row is written only when a consensus match exists, or whether to_csv runs
# inside the loop -- confirm against the original file before restoring it.
# NOTE(review): df_median.iloc[0] raises IndexError when no reconstruction of
# the image uses median_alg -- verify that this cannot happen.
i=0 for image in images:#images_have_consensus_results: print image df_nd_image = df_nd_g.get_group(image) num_rows = df_nd_image.shape[0] #print num_rows for j in range(num_rows): df_merge.loc[i] = [image, df_nd_image.iloc[j]['algorithm'], df_nd_image.iloc[j][METRIC] ] i= i+1 df_con_matching = df_consensus_wd[df_consensus_wd['image_file_name']==image] median_swc_file = dfg_median[dfg_median['image_file_name'] ==image] if len(median_swc_file) ==0: continue median_alg = rp.matchFileToAlgorithmName( median_swc_file.iloc[0]['swc_file_name']) df_median = df_nd_image[df_nd_image['algorithm']== median_alg] median_nd = df_median.iloc[0][METRIC] if df_con_matching.shape[0] >0 : consensus_wd_t = df_con_matching.iloc[0][REPORTING_METRIC] #print consensus_wd_t df_merge.loc[i] = [image,'consensus',consensus_wd_t] i= i+1 df_merge.loc[i] = [image,'median',median_nd] i= i+1 df_merge.to_csv(merged_csv, index=False)
# Copies per-algorithm averages for one image from
# "<data_DIR>/<subfolder>/<image_id>_median_distances.csv" into
# df_image_filled_template: the file is read, passed through
# calculate_average_all_pair_distance(..., hasConsensus=True), and each
# resulting row is matched to a template row by the algorithm name that
# rp.matchFileToAlgorithmName derives from the swc file's base name.
# NOTE(review): the bare `continue` after `if df_f.empty:` implies an
# enclosing loop whose header is not part of this fragment.
# NOTE(review): `df_image_filled_template.iloc[id]['col'] = value` is chained
# indexing: `.iloc[id]` yields a row object first and the assignment lands on
# that temporary, so the template is presumably never updated. In addition
# `id` comes from df_a.index (a label) but is used positionally with .iloc.
# `df_image_filled_template.loc[id, 'col'] = value` is probably what was
# intended -- confirm and fix when the fragment is restored.
# NOTE(review): here calculate_average_all_pair_distance receives a DataFrame,
# while pipe() passes it a csv path -- check which form the helper expects.
#df_image_filled_template['swc_file_name'] = np.nan #df_image_filled_template['average_distance'] = np.nan #df_image_filled_template['average_structure_difference'] = np.nan #df_image_filled_template['average_max_distance'] = np.nan csv_file = data_DIR + '/'+subfolder+'/'+image_id+'_median_distances.csv' if os.path.exists(csv_file): df_f = pd.read_csv(csv_file) if df_f.empty: continue df_ff = calculate_average_all_pair_distance(df_f, hasConsensus = True) for i in range(df_ff.shape[0]): if type(df_ff.iloc[i].swc_file_name ) == str: alg = rp.matchFileToAlgorithmName((df_ff.iloc[i].swc_file_name).split('/')[-1]) else: print "nan input swc_file_name" print df_ff.iloc[i].swc_file_name continue df_a=df_image_filled_template[df_image_filled_template.algorithm == alg] if df_a.shape[0] >0: id = df_a.index[0] df_image_filled_template.iloc[id]['swc_file_name'] =df_ff.iloc[i]['swc_file_name'] df_image_filled_template.iloc[id]['average_distance'] =df_ff.iloc[i]['average_distance'] df_image_filled_template.iloc[id]['average_structure_difference'] = df_ff.iloc[i]['average_structure_difference'] df_image_filled_template.iloc[id]['average_max_distance'] = df_ff.iloc[i]['average_max_distance'] else: print alg print "no match!"
_AVERAGE_DISTANCE_COLUMNS = [
    'image_id', 'algorithm', 'swc_file_name', 'average_sum_distance',
    'average_structure_difference', 'average_max_distance'
]


def _remove_empty_files(root_dir):
    """Delete every zero-byte regular file below root_dir (best effort)."""
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            try:
                if os.path.getsize(path) == 0:
                    os.remove(path)
            except OSError as err:
                # Cleanup is a convenience; never abort the run for it.
                print("could not remove empty file %s: %s" % (path, err))


def _collect_average_distances(input_data_dir, imageIDs,
                               distance_file_postfix, all_average_csv):
    """Write one row per reconstruction of every image to all_average_csv."""
    try:
        string_types = basestring  # Python 2: matches str and unicode
    except NameError:
        string_types = str

    rows = []
    for image_id in imageIDs:
        csv_file = input_data_dir + '/' + image_id + '_' + distance_file_postfix
        if not os.path.exists(csv_file):
            # for gold 163, consensus results are stored in individual
            # image folders
            csv_file = input_data_dir + '/' + image_id + '/' + distance_file_postfix
            if not os.path.exists(csv_file):
                print("missing " + csv_file)
                continue
        print("read " + csv_file)
        df_ff = calculate_average_all_pair_distance(csv_file,
                                                    hasConsensus=True)
        if df_ff.empty:
            print("empty df_ff:" + csv_file)
            continue
        for i in range(df_ff.shape[0]):
            record = df_ff.iloc[i]
            swc_file_name = record['swc_file_name']
            if not isinstance(swc_file_name, string_types):
                # A missing file name is read back as NaN (a float).
                print("nan input swc_file_name")
                print(swc_file_name)
                continue
            alg = rp.matchFileToAlgorithmName(swc_file_name.split('/')[-1])
            rows.append([
                image_id, alg, swc_file_name,
                record['average_sum_distance'],
                record['average_structure_difference'],
                record['average_max_distance']
            ])

    df_all = pd.DataFrame(rows, columns=_AVERAGE_DISTANCE_COLUMNS)
    df_all.to_csv(all_average_csv, index=False)
    print("Done collecting median distances")
    print("Output:" + all_average_csv)


def _plot_algorithm_comparison(all_average_csv, output_dir, DISPLAY):
    """Save a bar plot of average_sum_distance per algorithm."""
    metric = 'average_sum_distance'
    df_all = pd.read_csv(all_average_csv)
    all_algorithms = np.unique(df_all.algorithm)
    plt.figure()
    sb.set_context("talk", font_scale=0.7)
    dfg = df_all.groupby('algorithm')
    # n per algorithm = number of rows with a non-negative metric value.
    sample_size_per_algorithm = np.array(
        [(dfg.get_group(alg)[metric] >= 0).sum() for alg in all_algorithms],
        dtype=float)
    # Largest sample first.
    order = sample_size_per_algorithm.argsort()[::-1]
    algorithms_ordered = all_algorithms[order]
    sample_size_per_algorithm = sample_size_per_algorithm[order]
    a = sb.barplot(y='algorithm', x=metric, data=df_all,
                   order=algorithms_ordered)
    algorithm_names = [
        rp.map_better_algorithm_name(x) for x in algorithms_ordered
    ]
    a.set_yticklabels([
        '%s ($n$=%d )' % (name, size)
        for name, size in zip(algorithm_names, sample_size_per_algorithm)
    ])
    plt.subplots_adjust(left=0.4, bottom=0.1, top=0.9)
    plt.savefig(output_dir + '/compare_distance_plot.png', format='png')
    if DISPLAY:
        plt.show()
    plt.close()
    print("Done plotting algorithm comparison.")


def _extract_median_and_consensus(all_average_csv, output_csv):
    """Write a consensus row and a median row for each image to output_csv."""
    df_all = pd.read_csv(all_average_csv)
    print(df_all.shape)
    sort_keys = ['average_sum_distance', 'average_structure_difference',
                 'average_max_distance']
    rows = []
    # sort=False keeps the images in order of first appearance.
    for image_id, df_group in df_all.groupby('image_id', sort=False):
        # drop nans
        df_image = df_group.dropna(axis=0, how="any")
        if len(df_image) < 1:
            print("no valid recons found for :" + str(image_id))
            continue
        consensus_pos = None
        for pos, fn in enumerate(df_image['swc_file_name']):
            if 'consensus' in fn:
                consensus_pos = pos
                break
        if consensus_pos is None:
            print("no consensus found, image id:" + str(image_id))
            continue
        consensus = df_image.iloc[consensus_pos]
        candidates = df_image.drop(df_image.index[[consensus_pos]], axis=0)
        if candidates.empty:
            # Without any other reconstruction there is no median; skip the
            # image so consensus and median rows always come in pairs.
            print("no non-consensus recons found for :" + str(image_id))
            continue
        # The median reconstruction is the one with the smallest distances.
        median = candidates.sort_values(by=sort_keys, ascending=True).iloc[0]
        for label, record in (('consensus', consensus), ('median', median)):
            rows.append([
                image_id, label, record['swc_file_name'],
                record['average_sum_distance'],
                record['average_structure_difference'],
                record['average_max_distance']
            ])

    df_median_and_consensus = pd.DataFrame(
        rows, columns=_AVERAGE_DISTANCE_COLUMNS)
    df_median_and_consensus.to_csv(output_csv)
    print("Done extracting median distances")
    print("Output median and consensus distances for each image:" + output_csv)


def _compare_median_and_consensus(median_and_consensus_csv, output_dir,
                                  DISPLAY):
    """Report median-minus-consensus differences and plot both series."""
    df_both = pd.read_csv(median_and_consensus_csv)
    df_consensus = df_both[df_both['algorithm'] == 'consensus'].reset_index()
    df_median = df_both[df_both['algorithm'] == 'median'].reset_index()
    if df_median.empty or df_consensus.empty:
        print("no median/consensus pairs found in:" + median_and_consensus_csv)
        return

    # sort by average distance; the k-th consensus row belongs to the k-th
    # median row, so the consensus rows are reordered by the same positions.
    df_median = df_median.sort_values(by=['average_sum_distance'])
    df_median['order'] = range(0, len(df_median))
    df_consensus = df_consensus.iloc[df_median.index].copy()
    df_consensus['order'] = range(0, len(df_median))

    difference = (df_median['average_sum_distance'] -
                  df_consensus['average_sum_distance'])
    df_diff = pd.DataFrame(
        {'image_id': df_median['image_id'].values,
         'difference': difference.values},
        index=df_median.index, columns=['image_id', 'difference'])

    print("median - consensus:")
    diff_values = difference.values
    for message, value in (
            ("max value: %f, image : %s", difference.max()),
            ("min value: %f, image id: %s", difference.min()),
            ("median value: %f, image : %s", difference.median())):
        # Position of the image whose difference is closest to the statistic.
        pos = np.nanargmin(np.abs(diff_values - value))
        print(message % (value, df_median.iloc[pos]['image_id']))

    df_ff_big = df_diff[df_diff['difference'] > 0]
    print("consensus is closer to each reconstructions than the median reconstructions in %.2f percent of the %d total images" % (
        100 * float(len(df_ff_big)) / len(df_diff), len(df_diff)))
    df_ff_small = df_diff[df_diff['difference'] < 0]
    df_ff_small.to_csv(output_dir + '/investigate.csv')
    print("investigate the following cases list in:" + output_dir + '/investigate.csv')

    # make sure the image_ids are matching
    for consensus_id, median_id in zip(df_consensus['image_id'],
                                       df_median['image_id']):
        if consensus_id != median_id:
            print("error matching")
            print(consensus_id)
            print(median_id)
            print("")
            # Same exception exit() raises, but with a failing status and
            # without depending on the site module.
            raise SystemExit(1)

    df_order = pd.concat([df_consensus, df_median])
    for plot_type in ['ts']:
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_sum_distance', plot_type,
                                      DISPLAY)
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_max_distance', plot_type,
                                      DISPLAY)
        plot_compare_median_consensus(output_dir, df_order,
                                      'average_structure_difference',
                                      plot_type, DISPLAY)


def pipe(input_data_dir, output_dir, imageIDs,
         distance_file_postfix='median_distances.csv',
         COLLECT_FROM_DISTANCE_MATRIX=1, EXTRACT_MEDIAN_CONSENSUS=1,
         DISPLAY=0):
    """Collect averaged distances per image and compare median vs consensus.

    Stages, in order:
      1. (if COLLECT_FROM_DISTANCE_MATRIX) delete zero-byte files below
         ``input_data_dir``, create ``output_dir`` and write
         ``all_averaged_distances.csv`` from each image's distance file
         (``<image_id>_<postfix>`` or ``<image_id>/<postfix>``).
      2. Save ``compare_distance_plot.png`` (bar plot per algorithm).
      3. (if EXTRACT_MEDIAN_CONSENSUS) write
         ``extracted_median_consensus.csv`` with one consensus and one
         median row per image.
      4. Print median-minus-consensus statistics, write ``investigate.csv``
         and call ``plot_compare_median_consensus`` for the three metrics.

    Stages 2 and 4 read the CSV files from disk, so a skipped stage 1 or 3
    requires the corresponding file from an earlier run.

    Args:
        input_data_dir: directory holding the per-image distance files.
        output_dir: directory receiving all CSV and PNG outputs.
        imageIDs: iterable of image id strings.
        distance_file_postfix: file-name suffix of the distance files.
        COLLECT_FROM_DISTANCE_MATRIX: truthy to run stage 1.
        EXTRACT_MEDIAN_CONSENSUS: truthy to run stage 3.
        DISPLAY: truthy to show the plots interactively.

    Raises:
        SystemExit: when consensus and median rows do not refer to the same
            images after reordering.
    """
    BeginTime = time.time()
    all_average_csv = output_dir + '/all_averaged_distances.csv'
    output_median_and_consensus_csv_file = (
        output_dir + '/extracted_median_consensus.csv')

    if COLLECT_FROM_DISTANCE_MATRIX:
        # remove empty files
        _remove_empty_files(input_data_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        _collect_average_distances(input_data_dir, imageIDs,
                                   distance_file_postfix, all_average_csv)
    else:
        print("Distance csv files had been collected. Skip collecting.")

    _plot_algorithm_comparison(all_average_csv, output_dir, DISPLAY)

    if EXTRACT_MEDIAN_CONSENSUS:
        _extract_median_and_consensus(all_average_csv,
                                      output_median_and_consensus_csv_file)
    else:
        print("Median and consensus swc info had been collected. Skip this step.")

    _compare_median_and_consensus(output_median_and_consensus_csv_file,
                                  output_dir, DISPLAY)

    endtime = time.time()
    dur = (endtime - BeginTime) / 60
    print("All takes %d min" % (dur))