def replacePearsonPvalueWithZscore():
    """Rewrite the global tissue_comparison_scores in place, replacing each
    (rho, p-value, sample) tuple with [rho, z, sample] where z standardizes
    rho against the mean/SD of all rho values across the whole dataset."""
    rho_lists_by_sample = {}
    ### Seed one list per sample, using only the first tissue's entries
    for tissue in tissue_comparison_scores:
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            rho_lists_by_sample[sample] = []
        break
    ### Collect every rho observed for each sample across all tissues
    for tissue in tissue_comparison_scores:
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            rho_lists_by_sample[sample].append(rho)
    ### Per-sample mean/SD (computed but not used for the z-score below,
    ### which is taken relative to the global background instead)
    sample_stats = {}
    all_rhos = []
    for sample in rho_lists_by_sample:
        sample_rhos = rho_lists_by_sample[sample]
        all_rhos += sample_rhos
        sample_stats[sample] = statistics.avg(sample_rhos), statistics.stdev(sample_rhos)
    background_mean = statistics.avg(all_rhos)
    background_sd = statistics.stdev(all_rhos)
    ### Replace the p-value slot with the globally standardized rho
    for tissue in tissue_comparison_scores:
        rescored = []
        for (rho, pval, sample) in tissue_comparison_scores[tissue]:
            z = (rho - background_mean) / background_sd ### Instead of doing this for the sample background, do it relative to all analyzed samples
            rescored.append([rho, z, sample])
        tissue_comparison_scores[tissue] = rescored
def reorderArraysOnly(filtered_exp_db,filetype,counts): 
    """Reorder each probeset's expression (or dabg p-value) values into group order,
    write per-comparison and full-dataset export lines, and record probesets that
    pass the dabg/expression thresholds in the module-level summary dictionaries.

    filtered_exp_db: {probeset: [values in original column order]}
    filetype: 'expression' or 'dabg' - selects which summary/export is produced
    counts: 'yes'/'no' - when 'no' and data is non-log, values are log transformed

    Relies on module globals: expr_group_list, comp_group_list, comparision_export_db,
    fulldataset_export_object, exp_analysis_type, exp_data_format, dabg_p_threshold,
    expression_threshold, dabg_summary, expression_summary.
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    ###expr_group_list gives the final level order sorted, followed by the original index order as a tuple
    for probeset in filtered_exp_db:
        grouped_ordered_array_list = {}; group_list = []
        for x in expr_group_list:
            y = x[1]; group = x[2] ### this is the new first index
            ### for example y = 5, therefore the filtered_exp_db[probeset][5] entry is now the first
            try:
                try: new_item = filtered_exp_db[probeset][y]
                except TypeError: print y,x,expr_group_list; kill
            except IndexError: print probeset,y,x,expr_group_list,'\n',filtered_exp_db[probeset];kill
            ###Used for comparision analysis
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
        ### For the exon-level expression data, export the group pair data for all pairwise comparisons to different comp files
        ###*******Include a database with the raw values saved for permuteAltAnalyze*******
        for info in comp_group_list:
            group1 = int(info[0]); group2 = int(info[1]); comp = str(info[0]),str(info[1])
            g1_data = grouped_ordered_array_list[group1]
            g2_data = grouped_ordered_array_list[group2]
            #print probeset, group1, group2, g1_data, g2_data, info;kill
            data = comparision_export_db[comp]
            values = [probeset]+g2_data+g1_data; values = string.join(values,'\t')+'\n' ###groups are reversed since so are the labels
            #raw_data_comps[probeset,comp] = temp_raw
            data.write(values)
        ### Export all values grouped from the array
        for group in grouped_ordered_array_list: group_list.append(group)
        group_list.sort(); combined_value_list=[]; avg_values=[]
        for group in group_list:
            g_data = grouped_ordered_array_list[group]
            if exp_analysis_type == 'expression':
                try: avg_gdata = statistics.avg(g_data); avg_values.append(avg_gdata)
                except Exception:
                    print g_data
                    print avg_values
                    kill
            combined_value_list+=g_data
        if exp_data_format == 'non-log' and counts == 'no':
            try: combined_value_list = logTransform(combined_value_list)
            except Exception:
                print probeset, combined_value_list,comp_group_list,expr_group_list
                print filtered_exp_db[probeset]; kill
        if filetype == 'expression':
            ### Export the expression values for all samples grouped (if meeting the above thresholds)
            values = string.join([probeset]+combined_value_list,'\t')+'\n'
            fulldataset_export_object.write(values) ### Don't need this for dabg data
        if exp_analysis_type == 'expression': avg_values.sort() ### Sort to get the lowest dabg and largest average expression
        if filetype == 'dabg':
            ### avg_values[0] is the smallest group mean after the sort above
            if avg_values[0]<=dabg_p_threshold: dabg_summary[probeset]=[] ### store probeset if the minimum p<user-threshold
        else:
            #if 'ENSMUSG00000018263:' in probeset: print probeset,[avg_values[-1],expression_threshold]
            ### avg_values[-1] is the largest group mean
            if avg_values[-1]>=expression_threshold: expression_summary[probeset]=[] ### store probeset if the minimum p<user-threshold
def plotFeatureBoxPlots(qc_db,dataset_name,feature_type): pylab.figure() pylab.xlabel('Biological Sample Names') pylab.ylabel('Read Counts - Log2') pylab.title('Expression BoxPlots for %ss - %s' % (feature_type,dataset_name)) #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35) pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35) #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend #pylab.axis(axes) boxplots=[] samples=[] sample_sorted_list=[] for sample_name in qc_db: try: qc = qc_db[sample_name][feature_type] except Exception: print 'No junction data found for at least one sample:',sample_name; forceExit sample_sorted_list.append([statistics.avg(qc),statistics.stdev(qc),sample_name]) sample_sorted_list.sort() sample_sorted_list.reverse() filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name,feature_type) export_obj = export.ExportFile(root_dir + filename[:-4]+'.txt') export_obj.write('SampleID\tAverage Expression\n') firstEntry=True for (mean,stdev,sample_name) in sample_sorted_list: ls=[]; x_ls=[]; y_ls=[] qc = qc_db[sample_name][feature_type] boxplots.append(qc) samples.append(sample_name) export_obj.write(sample_name+'\t'+str(mean)+'\n') if firstEntry: threshold=mean-2*stdev firstEntry=False else: if mean<threshold: print sample_name,'expression is considered very low (2 standard deviations away from the max).' 
pylab.boxplot(boxplots, notch=0, whis=1.5, positions=None, widths=None, patch_artist=False) #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False) xtickNames = pylab.setp(pylab.gca(), xticklabels=samples) pylab.setp(xtickNames, rotation=90, fontsize=10) export_obj.close() #print 'Exporting:',filename pylab.savefig(root_dir + filename) filename = filename[:-3]+'png' pylab.savefig(root_dir + filename) #,dpi=200 graphic_link.append(['QC - BoxPlot-'+feature_type+' Expression',root_dir+filename]) try: import gc pylab.figure.clf() pylab.close() gc.collect() except Exception: pass
def studAvg():
    """Prompt for a student name and print that student's average grade,
    computed by the module-level avg() over studentdict; if the name is not
    a key of studentdict, report the miss and dump the whole dictionary."""
    name = input('Student Name:')
    try:
        grade_average = avg(studentdict[name])
    except KeyError:
        print('Student', name, 'not found')
        print(studentdict)
    else:
        print(name, 'has an average grade of', grade_average)
def combine_profiles(profile_list):
    """Average matching keys position-by-position across a list of profile
    dictionaries ({key: [values]}) and return {key: [averaged values]}.

    Keys and per-key lengths are taken from the FIRST dictionary only; a
    dictionary missing a key simply contributes nothing at each position.
    """
    entry_lengths = {}
    for db in profile_list:
        for key in db:
            entry_lengths[key] = len(db[key])
        break  ### sizes/keys come from the first profile only
    combined = {}
    for key in entry_lengths:
        averaged = []
        for position in range(entry_lengths[key]):
            observed = [db[key][position] for db in profile_list if key in db]
            averaged.append(statistics.avg(observed))
        combined[key] = averaged
    return combined
def parse_input_data(filename,data_type):
    """Parse an expression/p-value/export input file and either (a) record a
    per-group pass/fail flag (k) for each probeset in the module-level
    expression_status_db / pvalue_status_db, or (b) for data_type == 'export',
    re-write filtered probeset lines to a new AltExpression export file.

    filename: tab-delimited file; row 1 holds 'group:sample' column headers.
    data_type: 'expression', 'p-value' or 'export'.
    Returns output_file (the export filename; [] when not exporting).

    Relies on module globals: array_type, species, root_dir, altanalzye_input,
    original_exp_threshold, filter_method, p, exp_data_format,
    normalization_method and the various *_threshold values, plus
    array_group_db/array_group_name_db built from the header row.
    """
    fn=filepath(filename); first_line = 1; array_group_name_db = {}; z=0; array_group_db = {}; output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename) ### e.g., expression or counts
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line); t = string.split(data,'\t'); probeset = t[0]; z+=1
        if first_line == 1:
            first_line = 0 #makes this value null for the next loop of actual array data
            ###Below ocucrs if the data is raw opposed to precomputed
            if data_type == 'export':
                ### Build the AltExpression output path according to the platform
                if array_type == 'exon': folder = 'ExonArray'+'/'+species + '/'
                elif array_type == 'gene': folder = 'GeneArray'+'/'+species + '/'
                elif array_type == 'junction': folder = 'JunctionArray'+'/'+species + '/'
                elif array_type == 'RNASeq': folder = 'RNASeq'+'/'+species + '/'
                else: folder = array_type + '/'
                parent_path = root_dir+'AltExpression/'+folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) +'_'+ filter_method+'.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(int(100*p)) +'_'+ filter_method+'.txt'
                output_file_dir = parent_path+output_file
                print "...Exporting",output_file_dir
                export_data = export.createExportFile(output_file_dir,root_dir+'AltExpression/'+folder)
                fn=filepath(output_file_dir); export_data = open(fn,'w'); export_data.write(line)
            if ':' in t[1]:
                ### Header entries look like 'group:sample'; map each group to its column indexes
                array_group_list = []; x=0 ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group,array_name = string.split(entry,':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name] ### below only occurs with a new group addition
                        array_group_list.append(array_group) #use this to generate comparisons in the below linked function
                    x += 1
                #print '##### array_group_list',array_group_list
        elif len(probeset)>0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array={}; array_index_list = [] ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try: exp_val = float(t[array_index+1])
                    except IndexError: print t, z,'\n',array_index,'\n',group, probeset;kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)
                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                ### k=1 means the group mean passes the relevant RPKM threshold
                                if ':I' in probeset: k=1 ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat>=gene_rpkm_threshold: k=1
                                    else: k=0
                                elif avg_stat>=exon_rpkm_threshold: k=1
                                elif '-' in probeset: k=1 ### Don't consider RPKM for junctions, just counts
                                else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else:
                                ### Otherwise, we are looking at count data
                                if '-' in probeset: ### junction meeting minimum read-count number
                                    if avg_stat>=junction_exp_threshold: k=1 ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else: k=0
                                elif ':' not in probeset:
                                    if avg_stat>=gene_exp_threshold: k=1
                                    else: k=0
                                else: ### exon or intron meeting minimum read-count number
                                    if avg_stat>=exon_exp_threshold: k=1
                                    else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat>=nonlog_exp_threshold: k=1
                            else: k=0
                    elif avg_stat>=log_expression_threshold: k=1
                    else: k=0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression': ### Treat as dabp p-value
                        try: pvalue_status_db[probeset].append(k)
                        except KeyError: pvalue_status_db[probeset] = [k]
                    else:
                        try: expression_status_db[probeset].append(k)
                        except KeyError: expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat<=p: k=1
                    else: k=0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try: pvalue_status_db[probeset].append(k)
                    except KeyError: pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from logstatus to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]; exp_values_log2=[]
                for exp_val in exp_values:
                    exp_values_log2.append(str(math.log(float(exp_val),2))) ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset]+exp_values_log2,'\t')+'\n'
            ### Only write probesets present in export_db (the filtered set)
            try: null = export_db[probeset]; export_data.write(line)
            except KeyError: null = [] ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
def performGroupNormalization(filename,export_dir,platform):
    """Re-write an expression file as per-group mean-centered log folds.

    For each gene row, samples are re-ordered by group (from the companion
    batch/groups file) and each value is replaced by its difference from the
    mean of its own group; the result is written to export_dir.

    filename: input expression file path.
    export_dir: output path; 'exp.' is swapped for 'batch.' to locate groups.
    platform: e.g. 'RNASeq' - forces log2 conversion of non-log data.
    """
    expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = string.replace(export_dir,'exp.','batch.')
    fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)
    for line in open(fn,'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0]=='#' and row_number==0: row_number = 0 ### skip leading comment lines
        elif row_number==0:
            ### Header row: translate each group's sample names into column indexes
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x), group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list+=group_samples
                except Exception:
                    ### NOTE(review): this compares sample_list against t[1:], which is the
                    ### same list, so 'missing' is always empty - likely meant to check
                    ### group_samples against the header; verify before relying on it
                    missing=[]
                    for x in sample_list:
                        if x not in t[1:]: missing.append(x)
                    print 'missing:',missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'; forceExit
            title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order)
            export_data.write(title)
            row_number=1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts
                try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            else:
                try: all_values = map(float,t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []
            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples
                except Exception: print len(values), sample_index_list;kill
                try: avg = statistics.avg(values)
                except Exception:
                    ### Fallback: drop non-numeric entries and retry; default to the
                    ### first value or 0 when averaging still fails
                    values2=[]
                    for v in values:
                        try: values2.append(float(v))
                        except Exception: pass
                    values = values2
                    try: avg = statistics.avg(values)
                    except Exception:
                        if len(values)>0: avg = values[0]
                        else: avg = 0
                try: log_folds = map(lambda x: (x-avg), values)
                except Exception:
                    ### Mixed numeric/blank rows: subtract where possible, blank otherwise
                    log_folds=[]
                    for x in values:
                        try: log_folds.append(x-avg)
                        except Exception: log_folds.append('')
                gene_log_folds+=log_folds
            gene_log_folds = map(lambda x: str(x),gene_log_folds)
            export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n')
    export_data.close()
def importTableEntries(filename,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType='plot'):
    """Import exon/junction-level expression (or splicing-score) values for the
    genes in ensembl_exon_db and either plot per-gene exon expression
    (analysisType='plot') or export per-gene matrices and cluster them as
    heatmaps (analysisType containing 'heatmap').

    filename: exp./counts./AltResults input file.
    filter_db: feature IDs to retain; gene_db: {gene: symbol} for labeling.
    root_dir: output directory for plots/heatmap text files.
    transpose/display/showIntrons: plotting options ('yes' enables introns).

    Relies on module globals: platform, colors (declared global here), pylab,
    numpy, statistics, and several helper functions (assignGroupColors,
    reformatAltHeaders, plotExonExpression, ...).
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db={} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list={} ### ordered from first to last exon region
    uid_gene_db={} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes={}
    ### Select which feature IDs (exons vs. exons+introns) to consider per gene
    for gene in ensembl_exon_db:
        uid_list[gene]=[]
        for (index,ed,id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
                uid_gene_db[id]=gene
    if '_vs_' in filename: ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir+'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0]+'ExpressionInput/groups.'+findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir,'exp.','')
    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db={}
    stdev_gene_matrix_db={}
    row_header_gene={}
    ids={}
    x=0
    if 'heatmap' in analysisType:
        average_samples = False ### heatmaps show individual samples, not group means
    if '/' in filename: dataset_name = string.split(filename,'/')[-1][:-4]
    else: dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():
        data = line.strip()
        t = string.split(data,'\t')
        if data[0]=='#': x=0
        elif x==0:
            ### Header row: resolve sample columns and (optionally) group indexes
            if platform == 'RNASeq': removeExtension=True
            else: removeExtension=False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:],'',removeExtension=removeExtension)
            x=1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults=True
                    groups_dir = string.split(filename, 'AltResults')[0]+'ExpressionInput/groups.'+findFilename(filename)
                    if verifyFile(groups_dir)==False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename,'exp.','groups.')
                    else:
                        groups_dir = string.replace(filename,'counts.','groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir,'stats.','groups.')
                groups_dir = string.replace(groups_dir,'-steady-state.txt','.txt') ### groups is for the non-steady-state file
                try: group_index_db=collections.OrderedDict()
                except Exception:
                    import ordereddict ### pre-2.7 fallback
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed=False
                    try: sample_index = new_column_header.index(item); proceed=True
                    except Exception:
                        ### Retry the lookup with common file extensions stripped
                        try:
                            item = string.replace(item,'.bed','')
                            item = string.replace(item,'.CEL','') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item,'.cel','')
                            item = string.replace(item,'.txt','')
                            item = string.replace(item,'.TXT','')
                            item = string.replace(item,'.TAB','')
                            item = string.replace(item,'.tab','')
                            sample_index = new_column_header.index(item)
                            proceed=True
                        except Exception: pass
                        #print [item]
                        #print column_header
                        #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try: group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start-1:]
            row_number=1
        else:
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start-1]
                if ';' in uid:
                    uid = string.split(uid,';')[0]
                ids[uid]=None
                ens_geneID = string.split(uid,':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    try:
                        if len(biotypes)==1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float,t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene]=[values]
                        else:
                            if platform == 'RNASeq' and altresults==False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x),2), t[start:])
                            else:
                                values = map(float,t[start:])
                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x-mean, values)
                            ### Collapse samples to per-group mean and standard error
                            avg_ls=[]; std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try: st_err = statistics.stdev(group_values)/math.sqrt(len(group_values))
                                except Exception: ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene]=[avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene]=[std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x+=1
    global colors
    original_column_header = list(column_header)
    if len(uid_list)==0:
        print 'No genes found in the exon expression database'; forceNoExonExpError
    successfully_output_genes=0
    display_count=0 ### Only display a certain number of genes
    for last_gene in uid_list: pass ### grabs an arbitrary "last" key to decide when to display
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list=[]
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        ### Re-order rows to match the annotated exon-region order for this gene
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception: print uid, i,len(matrix);sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass
        if len(new_matrix)>0:
            matrix = new_matrix
        if len(new_header)>0:
            row_header = new_header
        if 'heatmap' in analysisType:
            ### Export this gene's matrix to a text file and cluster it
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header,'\t')+'\n')
            ki=0
            if len(annotation_list)>0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = None
            else: ### Just junctions analyzed here... no sorted junctions yet
                ki=0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = 'average'
            export_obj.close()
            import clustering
            column_metric = 'euclidean'; column_method = 'hopach'
            color_gradient = 'red_black_sky'; transpose = False; graphic_links=[]
            if ki>100: transpose = True ### too many rows to label - rotate the heatmap
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis = False, contrast = 2.5)
            successfully_output_genes+=1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time()-start_time,1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### coverts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)
            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list)>10:
                    #if display_count==5: display=False
                    display=False
                if display_count==0: ### store a consistent color palete to use
                    colors=[]
                    """ k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1.*i/len(row_header))) # color will now be an RGBA tuple
                plotExonExpression(fig,matrix,stdev_matrix,row_header,column_header,dataset_name,annotation_list,gene_symbol,root_dir,display=display)
                successfully_output_genes+=1
                display_count+=1
            except Exception:
                ### NOTE(review): the sys.exit() makes the following print unreachable
                print traceback.format_exc();sys.exit()
                print gene_symbol, 'failed'
        try: pylab.close()
        except Exception: pass
    if successfully_output_genes>0:
        #try: print 'Gene graphs exported to ExonPlots...'
        #except Exception: pass
        pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'; forceNoExonExpError
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception: pass
def reorder(data,data_headers,array_order,comp_group_list,probeset_db,include_raw_data,array_type,norm,fl,logvalues=True,blanksPresent=False):
    """Reorder each row's values into group order and compute per-comparison
    statistics (group means, folds, p-values, adjusted p-values).

    data: {row_id: [values in original column order]}; data_headers: column names.
    array_order: tuples whose [1]=original index, [2]=group number, [3]=group name.
    comp_group_list: pairwise group comparisons, e.g. [(1,2),(3,4)].
    probeset_db: optional {row_id: [gene,...]} annotation; include_raw_data: 'yes'/'no'.
    array_type/norm/fl: platform, normalization ('RPKM' handled specially) and a
    fileline object supplying statistical options/thresholds.
    logvalues: False when input is non-log; blanksPresent allows empty cells.

    Returns (expbuilder_value_db, array_fold_headers, summary_filtering_stats,
    raw_data_comp_headers) - the reordered values+stats per row, matching column
    headers, per-row ANOVA/max-fold stats, and per-comparison raw-data headers.
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}; group_name_db = {}; summary_filtering_stats = {}; pval_summary_db= {}
    replicates = 'yes'
    stat_result_names = ['avg-','log_fold-','fold-','rawp-','adjp-']
    group_summary_result_names = ['avg-']
    ### Define expression variables
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = math.log(fl.GeneExpThreshold(),2)
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception: gene_rpkm_threshold = 0
    try: FDR_statistic = fl.FDRStatistic()
    except Exception: FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog=True
    if blanksPresent: calculateAsNonLog=False
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = '' #not needed if not altsplice data
        data_headers2 = {} #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1] #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError: print row_id,data[row_id],len(data[row_id]),y,len(array_order),array_order;kill
                if logvalues==False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2,new_item)
            except TypeError: new_item = '' #this is for a spacer added in the above function
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group]= [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2] #baseline expression
            if blanksPresent: ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try: avg1 = statistics.avg(data_list1)
            except Exception: avg1 = ''
            try: avg2 = statistics.avg(data_list2)
            except Exception: avg2=''
            try:
                ### Non-log data: ratio fold; log data: difference then convert
                if (logvalues == False and array_type != 'RNASeq') or (logvalues==False and calculateAsNonLog):
                    fold = avg1/avg2
                    log_fold = math.log(fold,2)
                    if fold<1: fold = -1.0/fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold)
            except Exception: log_fold=''; fold=''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1,data_list2,probability_statistic)
            except Exception: p = 1; sg = 1; N1=0; N2=0
            comp = group1,group2
            if array_type == 'RNASeq': ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try: avg1 = math.pow(2,avg1)-adj; avg2 = math.pow(2,avg2)-adj
                    except Exception: avg1=''; avg2=''
                ### Mark comparisons where neither group passes the expression threshold
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                #if row_id=='ENSG00000085514':
                #if fold=='Insufficient Expression':
                #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold!=0 and calculateAsNonLog: ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1<gene_rpkm_threshold and a2<gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold,fold,p)
                stat_results[comp] = groups_name,gs,group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1,data_list2) ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1,data_list2) ### Assuming unequal variance
            except Exception:
                null=[]; replicates = 'no' ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name,[avg1]
            group_summary_results[group2] = group2_name,[avg2]
        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []; avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent: ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list)>0:
                grouped_exp_data.append(data_list)
                try: avg = statistics.avg(data_list); avg_exp_data.append(avg)
                except Exception:
                    avg = ''
                    #print row_id, group, data_list;kill
        try: avg_exp_data.sort(); max_fold = avg_exp_data[-1]-avg_exp_data[0]
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold,0,ftestp)
        summary_filtering_stats[row_id] = gs
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry,stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group,grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort() #now the list is sorted by group number
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes': ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2,value)-adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1] #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:
                    #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]; gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id])-1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id])-2)
                    pval_summary_db[(row_id,comp)] = gs
    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []; data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group,data_headers2[group] #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        ### data_headers2 only exists if the loop above ran at least once
        print data_headers,'\n',array_order,'\n',comp_group_list,'\n'; kill_program
    for entry in data_headers3:
        x = 0 #indicates the times through a loop
        y = 0 #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes': ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1 #increment the loop index
        for info in stat_result_list:
            if info[0][0] == group_number:
                #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)
    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0]);group2 = int(comp[1])
        comp = str(comp[0]),str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers: temp_raw.append(g2_name+':'+header)
        for header in g1_headers: temp_raw.append(g1_name+':'+header)
        raw_data_comp_headers[comp] = temp_raw
    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round=0 ### NOTE(review): shadows the builtin round(); used only as a first-pass flag here
    for info in comp_group_list:
        compid = int(info[0]),int(info[1]); pval_db={}
        for (rowid,comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid,comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db,probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n ... using unmoderated unpaired test instead!'
                null=[] ### Occurs when not enough replicates
            round+=1
        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy; import qvalue; pvals = []; keys = []
            for key in pval_db:
                pvals.append(pval_db[key].Pval()); keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)):
                pval_db[keys[i]].SetAdjP(pvals[i])
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP() ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval() ### Replace the non-moderated with a moderated p-value
    pval_summary_db=[]
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
def nonLogAvg(data_list):
    """Convert log2 expression values back to non-log space (2**x - 1)
    and return their average (via the project statistics module)."""
    unlogged_values = [math.pow(2, log_value) - 1 for log_value in data_list]
    return statistics.avg(unlogged_values)
def statisticallyFilterFile(input_file,output_file,threshold): if 'exp.' in input_file: counts_file = string.replace(input_file,'exp.','geneCount.') else: counts_file = input_file[:-4]+'-geneCount.txt' sample_expressed_genes={} header=True junction_max=[] count_sum_array=[] for line in open(input_file,'rU').xreadlines(): data = cleanUpLine(line) if '.csv' in input_file: t = string.split(data,',') else: t = string.split(data,'\t') if header: samples = t[1:] header=False count_sum_array=[0]*len(samples) else: try: values = map(float,t[1:]) except Exception: if 'NA' in t[1:]: tn = [0 if x=='NA' else x for x in t[1:]] ### Replace NAs values = map(float,tn) else: tn = [0 if x=='' else x for x in t[1:]] ### Replace NAs values = map(float,tn) binarized_values = [] for v in values: if v>threshold: binarized_values.append(1) else: binarized_values.append(0) count_sum_array = [sum(value) for value in zip(*[count_sum_array,binarized_values])] index=0 distribution=[] count_sum_array_db={} samples_to_retain =[] samples_to_exclude = [] for sample in samples: count_sum_array_db[sample] = count_sum_array[index] distribution.append(count_sum_array[index]) index+=1 import statistics distribution.sort() avg = int(statistics.avg(distribution)) stdev = int(statistics.stdev(distribution)) min_exp = int(min(distribution)) cutoff = avg - (stdev*2) dev = 2 print 'The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (threshold,avg,stdev,min_exp) if cutoff<0: if (stdev-avg)>0: cutoff = avg - (stdev/2); dev = 0.5 else: cutoff = avg - stdev; dev = 1 if min_exp>cutoff: cutoff = avg - stdev; dev = 1 import export eo = export.ExportFile(counts_file) eo.write('Sample\tGenes Expressed(threshold:'+str(threshold)+')\n') for sample in samples: ### keep the original order if count_sum_array_db[sample]>cutoff: samples_to_retain.append(sample) else: samples_to_exclude.append(sample) eo.write(sample+'\t'+str(count_sum_array_db[sample])+'\n') eo.close() print 
len(samples_to_exclude), 'samples removed (# exp. genes, < %d SD away) (%s)' % (dev,string.join(samples_to_exclude,', ')) print 'Exporting the filtered expression file to:' print output_file filterFile(input_file,output_file,samples_to_retain)
def generateConstitutiveExpression(exp_dbase,constitutive_gene_db,probeset_gene_db,pre_filtered_db,array_names,filename):
    """Generate Steady-State expression values for each gene for analysis in the main module of this package.

    Three passes select, per gene, the probesets used for the steady-state
    (gene-level) estimate; the per-array averages of those probesets are then
    written to a '-steady-state.txt' export.

    exp_dbase            -- probeset -> list of expression values (one per array)
    constitutive_gene_db -- gene -> constitutive probeset list
    probeset_gene_db     -- gene -> all (exon-aligning) probesets for the gene
    pre_filtered_db      -- probesets with detected expression
    array_names          -- ordered sample/array names for the export header
    filename             -- basis for the steady-state export path

    NOTE(review): relies on module-level globals (average_all_probesets,
    avg_all_probes_for_steady_state, excludeLowExpressionExons, array_type,
    species, normalize_feature_exp, UserOptions, full_dataset_export_dir) and
    MUTATES the global average_all_probesets in place.
    """
    steady_state_db={}; k=0; l=0
    remove_nonexpressed_genes = 'no' ### By default set to 'no'
    ###1st Pass: Identify probesets for steady-state calculation
    for gene in probeset_gene_db:
        if avg_all_probes_for_steady_state == 'yes':
            average_all_probesets[gene] = probeset_gene_db[gene] ### These are all exon aligning (not intron) probesets
        else:
            if gene not in constitutive_gene_db:
                average_all_probesets[gene] = probeset_gene_db[gene]
            else:
                constitutive_probeset_list = constitutive_gene_db[gene]
                constitutive_filtered=[]
                ###Added this extra code to eliminate constitutive probesets not in exp_dbase (gene level filters are more efficient when dealing with this many probesets)
                # NOTE(review): despite the comment above, the membership test below is
                # against probeset_gene_db[gene], not exp_dbase — confirm intent
                for probeset in constitutive_probeset_list:
                    if probeset in probeset_gene_db[gene]: constitutive_filtered.append(probeset)
                if len(constitutive_filtered)>0:
                    average_all_probesets[gene] = constitutive_filtered
                else:
                    average_all_probesets[gene] = probeset_gene_db[gene]
    ###2nd Pass: Remove probesets that have no detected expression (keep all if none are expressed)
    if excludeLowExpressionExons:
        non_expressed_genes={} ### keep track of these for internal QC
        for gene in average_all_probesets:
            gene_probe_list=[]; x = 0
            for probeset in average_all_probesets[gene]:
                if probeset in pre_filtered_db:
                    gene_probe_list.append(probeset); x += 1
            ###If no constitutive and there are probes with detected expression: replace entry
            if x >0:
                average_all_probesets[gene] = gene_probe_list
            elif remove_nonexpressed_genes == 'yes':
                non_expressed_genes[gene]=[]
    if remove_nonexpressed_genes == 'yes':
        for gene in non_expressed_genes:
            del average_all_probesets[gene]
    ###3rd Pass: Make sure the probesets are present in the input set (this is not typical unless a user is loading a pre-filtered probeset expression dataset)
    for gene in average_all_probesets:
        v=0
        for probeset in average_all_probesets[gene]:
            try: null = exp_dbase[probeset]; v+=1
            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
        if v==0: ###Therefore, no probesets were found that were previously predicted to be best constitutive
            try: average_all_probesets[gene] = probeset_gene_db[gene] ###expand the average_all_probesets to include any exon linked to the gene
            except KeyError: print gene, probeset, len(probeset_gene_db), len(average_all_probesets);kill
    # Grab the array count from any one probeset's value list (all should match)
    for probeset in exp_dbase:
        array_count = len(exp_dbase[probeset]); break
    try: null = array_count
    except Exception:
        # exp_dbase was empty; `forceError` is an undefined name — presumably an
        # intentional hard-stop via NameError (TODO confirm project convention)
        print 'WARNING...CRITICAL ERROR. Make sure the correct array type is selected and that all input expression files are indeed present (array_count ERROR).'; forceError
    ###Calculate avg expression for each array for each probeset (using constitutive values)
    gene_count_db={}
    for gene in average_all_probesets:
        x = 0 ###For each array, average all probeset expression values
        gene_sum=0
        probeset_list = average_all_probesets[gene]#; k+= len(average_all_probesets[gene])
        if array_type != 'RNASeq': ### Just retain the list of probesets for RNA-seq
            while x < array_count:
                exp_list=[] ### average all exp values for constituitive probesets for each array
                for probeset in probeset_list:
                    try:
                        exp_val = exp_dbase[probeset][x]
                        exp_list.append(exp_val)
                    except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                try:
                    # NOTE(review): this retry loop repeats the identical lookup, so
                    # exp_list stays empty and statistics.avg([]) raises
                    # ZeroDivisionError (caught below) — looks redundant; confirm
                    if len(exp_list)==0:
                        for probeset in probeset_list:
                            try:
                                exp_val = exp_dbase[probeset][x]
                                exp_list.append(exp_val)
                            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                    avg_const_exp=statistics.avg(exp_list) ### Add only one avg-expression value for each array, this loop
                    try: steady_state_db[gene].append(avg_const_exp)
                    except KeyError: steady_state_db[gene] = [avg_const_exp]
                except ZeroDivisionError: null=[] ### Occurs when processing a truncated dataset (for testing usually) - no values for the gene should be included
                x += 1
    l = len(probeset_gene_db) - len(steady_state_db)
    steady_state_export = filename[0:-4]+'-steady-state.txt'
    steady_state_export = string.replace(steady_state_export,'counts.','exp.')
    fn=filepath(steady_state_export); data = open(fn,'w'); title = 'Gene_ID'
    if array_type == 'RNASeq':
        import RNASeq
        steady_state_db, pre_filtered_db = RNASeq.calculateGeneLevelStatistics(steady_state_export,species,average_all_probesets,normalize_feature_exp,array_names,UserOptions,excludeLowExp=excludeLowExpressionExons)
        ### This "pre_filtered_db" replaces the above since the RNASeq module performs the exon and junction-level filtering, not ExonArray (RPKM and count based)
        ### Use pre_filtered_db to exclude non-expressed features for multi-group alternative exon analysis
        removeNonExpressedProbesets(pre_filtered_db,full_dataset_export_dir)
        reload(RNASeq)
    # Write the tab-delimited steady-state matrix: header row, then one row per gene
    for array in array_names:
        title = title +'\t'+ array
    data.write(title+'\n')
    for gene in steady_state_db:
        ss_vals = gene
        for exp_val in steady_state_db[gene]:
            ss_vals = ss_vals +'\t'+ str(exp_val)
        data.write(ss_vals+'\n')
    data.close()
    # Drop large structures to free memory before returning
    exp_dbase={}; steady_state_db={}; pre_filtered_db ={}
    #print k, "probesets were not found in the expression file, that could be used for the constitutive expression calculation"
    #print l, "genes were also not included that did not have such expression data"
    print "Steady-state data exported to",steady_state_export
# As you import something, you can assign it a custom name using `as` print('Let\'s alias something as we import it!') print('importing e...') from math import e print('importing e as wahoo...') from math import e as wahoo print('T/F: e and wahoo are equal:', e == wahoo) print() # To summarize, the following three approaches all achieve the exact same thing print('approach 1') import statistics avg = statistics.mean print('The average of [1,2,3] is:', avg([1, 2, 3])) # or... # print('The average of [1,2,3] is:', statistics.mean([1, 2, 3])) print() print('approach 2') from statistics import mean avg = mean print('The average of [1,2,3] is:', avg([1, 2, 3])) # or... # print('The average of [1,2,3] is:', mean([1, 2, 3])) print() print('approach 3') from statistics import mean as avg print('The average of [1,2,3] is:', avg([1, 2, 3]))
def reorder(data, data_headers, array_order, comp_group_list, probeset_db, include_raw_data, array_type, norm, fl, logvalues=True):
    """Reorder each row of expression values by experimental group and compute
    per-comparison statistics (avg, log-fold, fold, raw p, adjusted p) plus a
    per-row max-fold/ANOVA summary.

    Returns (expbuilder_value_db, array_fold_headers, summary_filtering_stats,
    raw_data_comp_headers):
      expbuilder_value_db     -- row_id -> [optional raw values] + per-group avgs
                                 + per-comparison stats (adj-p filled in at the end)
      array_fold_headers      -- column headers matching expbuilder_value_db
      summary_filtering_stats -- row_id -> GroupStats(max_fold, 0, ANOVA p)
      raw_data_comp_headers   -- (group1,group2) string tuple -> raw-export headers

    NOTE(review): bare names `kill` / `kill_program` / `forceError` are undefined —
    presumably intentional hard-stops via NameError (confirm project convention).
    """
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'
    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']
    ### Define expression variables (fall back to defaults if fl lacks the accessor)
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = fl.GeneExpThreshold()
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception: gene_rpkm_threshold = 0
    calculateAsNonLog = True
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError: new_item = ''  #this is for a spacer added in the above function
            # Accumulate values and headers per group
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group] = [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2]  #baseline expression
            avg1 = statistics.avg(data_list1)
            try: avg2 = statistics.avg(data_list2)
            except ValueError:
                print data_list2, row_id
                forceError
            # Fold/log-fold: ratio form for non-log inputs, difference form for log2 inputs
            if (logvalues == False and array_type != 'RNASeq') or (logvalues == False and calculateAsNonLog):
                fold = avg1 / avg2
                log_fold = math.log(fold, 2)
                if fold < 1: fold = -1.0 / fold
            else:
                log_fold = avg1 - avg2
                fold = statistics.log_fold_conversion(log_fold)
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2, probability_statistic)
            except Exception:
                # Statistic failed (e.g. too few values) - default to p=1
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':  ### Also non-log but treated differently
                if norm == 'RPKM': adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    avg1 = math.pow(2, avg1) - adj
                    avg2 = math.pow(2, avg2) - adj
                # Suppress folds when both group averages are below the expression floor
                if norm == 'RPKM':
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
            #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]
        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                print row_id, group, data_list
                kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]  # largest pairwise group difference
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort()  #now the list is sorted by group number
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1]  #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:
                    #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs
    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        # data_headers2 is only bound inside the row loop, so an empty `data`
        # raises UnboundLocalError here (caught below as a hard-stop)
        for group in data_headers2:
            data_tuple = group, data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program
    for entry in data_headers3:
        x = 0  #indicates the times through a loop
        y = 0  #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index
        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)
    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw
    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round = 0  # counts moderation attempts so the failure warning prints only once
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmpath or out-of-range values\n ... using unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            round += 1
        statistics.adjustPermuteStats(pval_db)
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP()  ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval()  ### Replace the non-moderated with a moderated p-value
    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
def nonLogAvg(data_list):
    """Return the average of the values after undoing the log2 transform (2**x - 1)."""
    non_log_values = []
    for log_value in data_list:
        non_log_values.append(math.pow(2, log_value) - 1)
    return statistics.avg(non_log_values)