def replacePearsonPvalueWithZscore():
    all_sample_data = {}
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample] = []  ### populate this dictionary and create sub-dictionaries
        break
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample].append(r)

    sample_stats = {}
    all_dataset_rho_values = []
    ### Get average and standard deviation for all sample rho's
    for sample in all_sample_data:
        all_dataset_rho_values += all_sample_data[sample]
        avg = statistics.avg(all_sample_data[sample])
        stdev = statistics.stdev(all_sample_data[sample])
        sample_stats[sample] = avg, stdev
    global_rho_avg = statistics.avg(all_dataset_rho_values)
    global_rho_stdev = statistics.stdev(all_dataset_rho_values)

    ### Replace the p-value for each rho
    for tissue in tissue_comparison_scores:
        scores = []
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            #u,s = sample_stats[sample]
            #z = (r-u)/s
            z = (r - global_rho_avg) / global_rho_stdev  ### Instead of doing this for the sample background, do it relative to all analyzed samples
            scores.append([r, z, sample])
        tissue_comparison_scores[tissue] = scores
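### Illustrative, self-contained sketch (not part of the pipeline) of the same rho-to-z conversion
### performed by replacePearsonPvalueWithZscore(), written against a toy dictionary so it can be
### run in isolation; the data and helper name are hypothetical.
def _zscore_conversion_example():
    import math
    toy_scores = {'heart': [(0.91, 0.01, 's1'), (0.42, 0.20, 's2')],
                  'liver': [(0.15, 0.60, 's1'), (0.88, 0.02, 's2')]}
    all_rhos = [r for scores in toy_scores.values() for (r, p, s) in scores]
    mean = sum(all_rhos) / float(len(all_rhos))
    stdev = math.sqrt(sum((r - mean) ** 2 for r in all_rhos) / (len(all_rhos) - 1))
    for tissue in toy_scores:
        toy_scores[tissue] = [[r, (r - mean) / stdev, s] for (r, p, s) in toy_scores[tissue]]
    return toy_scores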
def combine_profiles(profile_list):
    profile_group_sizes = {}
    for db in profile_list:
        for key in db:
            profile_group_sizes[key] = len(db[key])
        break

    new_profile_db = {}
    for key in profile_group_sizes:
        x = profile_group_sizes[key]  ### number of elements in list for key
        new_val_list = []
        i = 0
        while i < x:
            temp_val_list = []
            for db in profile_list:
                if key in db:
                    val = db[key][i]
                    temp_val_list.append(val)
            i += 1
            val_avg = statistics.avg(temp_val_list)
            new_val_list.append(val_avg)
        new_profile_db[key] = new_val_list
    return new_profile_db
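### Minimal usage sketch for combine_profiles(): two hypothetical profile dictionaries with the
### same keys are averaged position-by-position (assumes statistics.avg() is an arithmetic mean).
#
# profile_a = {'ENSG1': [1.0, 2.0], 'ENSG2': [4.0, 4.0]}
# profile_b = {'ENSG1': [3.0, 2.0], 'ENSG2': [0.0, 2.0]}
# combine_profiles([profile_a, profile_b])
# => {'ENSG1': [2.0, 2.0], 'ENSG2': [2.0, 3.0]}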
def statisticallyFilterFile(input_file, output_file, threshold):
    if 'exp.' in input_file:
        counts_file = string.replace(input_file, 'exp.', 'geneCount.')
    else:
        counts_file = input_file[:-4] + '-geneCount.txt'
    sample_expressed_genes = {}
    header = True
    junction_max = []
    count_sum_array = []
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data, ',')
        else:
            t = string.split(data, '\t')
        if header:
            header_len = len(t)
            full_header = t
            samples = t[1:]
            header = False
            count_sum_array = [0] * len(samples)
        else:
            if len(t) == (header_len + 1):
                ### Correct header with a missing UID column
                samples = full_header
                count_sum_array = [0] * len(samples)
                print 'fixing bad header'
            try:
                values = map(float, t[1:])
            except Exception:
                if 'NA' in t[1:]:
                    tn = [0 if x == 'NA' else x for x in t[1:]]  ### Replace NAs
                    values = map(float, tn)
                else:
                    tn = [0 if x == '' else x for x in t[1:]]  ### Replace blanks
                    values = map(float, tn)
            binarized_values = []
            for v in values:
                if v > threshold:
                    binarized_values.append(1)
                else:
                    binarized_values.append(0)
            count_sum_array = [sum(value) for value in zip(*[count_sum_array, binarized_values])]

    index = 0
    distribution = []
    count_sum_array_db = {}
    samples_to_retain = []
    samples_to_exclude = []
    for sample in samples:
        count_sum_array_db[sample] = count_sum_array[index]
        distribution.append(count_sum_array[index])
        index += 1

    from stats_scripts import statistics
    distribution.sort()
    avg = int(statistics.avg(distribution))
    stdev = int(statistics.stdev(distribution))
    min_exp = int(min(distribution))
    cutoff = avg - (stdev * 2)
    dev = 2
    print 'The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (threshold, avg, stdev, min_exp)
    if cutoff < 0:
        if (stdev - avg) > 0:
            cutoff = avg - (stdev / 2)
            dev = 0.5
            print cutoff, 'genes expressed selected as a default cutoff to include cells (0.5-stdev away)'
        else:
            cutoff = avg - stdev
            dev = 1
            print cutoff, 'genes expressed selected as a default cutoff to include cells (1-stdev away)'
    if min_exp > cutoff:
        cutoff = avg - stdev
        dev = 1
    print 'Using a default cutoff of >=500 genes per cell expressed/cell'
    cutoff = 499

    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:' + str(threshold) + ')\n')
    for sample in samples:  ### keep the original order
        if count_sum_array_db[sample] > cutoff:
            samples_to_retain.append(sample)
        else:
            samples_to_exclude.append(sample)
        eo.write(sample + '\t' + str(count_sum_array_db[sample]) + '\n')
    if len(samples_to_retain) < 4:  ### Don't remove any if too few samples
        samples_to_retain += samples_to_exclude
    else:
        print len(samples_to_exclude), 'samples removed (< 500 genes expressed)'  # (%s)' % (dev,string.join(samples_to_exclude,', '))
    eo.close()
    print 'Exporting the filtered expression file to:'
    print output_file
    filterFile(input_file, output_file, samples_to_retain)
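### Hedged sketch of the per-sample "genes expressed" tally used by statisticallyFilterFile():
### each value is binarized against the threshold and the binary rows are summed column-wise.
### The helper name and data are illustrative only.
def _genes_expressed_per_sample_example(rows, threshold):
    ### rows: list of per-gene value lists (one value per sample), a toy stand-in for the file contents
    count_sum_array = [0] * len(rows[0])
    for values in rows:
        binarized = [1 if v > threshold else 0 for v in values]
        count_sum_array = [a + b for (a, b) in zip(count_sum_array, binarized)]
    return count_sum_array  ### e.g. [2, 1] for rows=[[5, 0], [3, 2]] and threshold=1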
def filterFile(input_file, output_file, filter_names, force=False, calculateCentroids=False,
               comparisons=[], log2=False, convertPSIUID=False, partialMatch=False):
    if calculateCentroids:
        filter_names, group_index_db = filter_names
    export_object = open(output_file, 'w')
    firstLine = True
    row_count = 0
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            values = string.split(data, ',')
        else:
            values = string.split(data, '\t')
        row_count += 1
        if firstLine:
            uid_index = 0
            if data[0] != '#':
                if force == True:
                    values2 = []
                    for x in values:
                        if ':' in x:
                            x = string.split(x, ':')[1]
                            values2.append(x)
                        else:
                            values2.append(x)
                    filter_names2 = []
                    for f in filter_names:
                        if f in values:
                            filter_names2.append(f)
                    if len(filter_names2) < 2:
                        filter_names2 = []
                        for f in filter_names:
                            if f in values2:
                                filter_names2.append(f)
                        filter_names = filter_names2
                    else:
                        filter_names = filter_names2
                if force == 'include':
                    values = ['UID'] + filter_names
                try:
                    sample_index_list = map(lambda x: values.index(x), filter_names)
                except:
                    if ':' in line:  ### If ":" in header name
                        values2 = []
                        for x in values:
                            if ':' in x:
                                x = string.split(x, ':')[1]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    elif '.' in line:
                        values2 = []
                        for x in values:
                            if '.' in x:
                                x = string.split(x, '.')[0]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    elif '.$' in line:
                        filter_names2 = []
                        for f in filter_names:  ### if the name in the filter is a string within the input data-file
                            for f1 in values:
                                if f in f1:
                                    filter_names2.append(f1)  ### change to the reference name
                                    break
                        print len(filter_names2), len(values), len(filter_names)
                        kill
                        filter_names = filter_names2
                        #filter_names = map(lambda x: string.split(x,'.')[0], filter_names)
                        #values = map(lambda x: string.split(x,'.')[0], values)
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    elif partialMatch:
                        filter_names_updated = []
                        for x in filter_names:
                            if x not in values:
                                for y in values:
                                    if x in y:
                                        filter_names_updated.append(y)
                        filter_names = filter_names_updated
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    else:
                        temp_count = 1
                        for x in filter_names:
                            if x not in values:
                                temp_count += 1
                                if temp_count == 500:
                                    print 'too many to print'
                                elif temp_count > 500:
                                    pass
                                else:
                                    print x,
                        print temp_count, 'are missing'
                        kill
            firstLine = False
            header = values
            if 'PSI_EventAnnotation' in input_file:
                uid_index = values.index('UID')
            if log2:
                try:
                    values = map(lambda x: math.log(x + 1, 2), values)
                except:
                    pass
            if calculateCentroids:
                if len(comparisons) > 0:
                    ### Use the numerator group name for each comparison column
                    export_object.write(string.join(['UID'] + map(lambda x: x[0] + '-fold', comparisons), '\t') + '\n')
                else:
                    clusters = map(str, group_index_db)
                    export_object.write(string.join([values[uid_index]] + clusters, '\t') + '\n')
                continue  ### skip the below code
        if force == 'include':
            if row_count > 1:
                values += ['0']
        try:
            filtered_values = map(lambda x: values[x], sample_index_list)  ### simple and fast way to reorganize the samples
        except Exception:
            """
            print traceback.format_exc()
            print len(values), len(sample_index_list)
            print input_file, len(filter_names)
            for i in filter_names:
                if i not in header:
                    print i, 'not found'
            sys.exit()
            """
            ### For PSI files with missing values at the end of each line, often
            if len(header) != len(values):
                diff = len(header) - len(values)
                values += diff * ['']
            filtered_values = map(lambda x: values[x], sample_index_list)  ### simple and fast way to reorganize the samples
        #print values[0]; print sample_index_list; print values; print len(values); print len(prior_values);kill
        prior_values = values
        ######################## Begin Centroid Calculation ########################
        if calculateCentroids:
            mean_matrix = []
            means = {}
            for cluster in group_index_db:
                ### group_index_db[cluster] contains all of the indices for samples in a named group (cluster is the cluster name, not a number)
                raw_values = map(lambda x: filtered_values[x], group_index_db[cluster])
                raw_values2 = []
                for vx in raw_values:
                    if vx != '' and vx != 'NA':
                        raw_values2.append(float(vx))
                if len(raw_values2) > 2:
                    mean = statistics.avg(raw_values2)
                else:
                    mean = ""
                #mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value
                means[cluster] = mean
                mean_matrix.append(str(mean))
            filtered_values = mean_matrix
            if len(comparisons) > 0:
                fold_matrix = []
                for (group2, group1) in comparisons:
                    try:
                        fold = means[group2] - means[group1]
                    except:
                        fold = 0  ### Indicates a missing value - exclude
                    fold_matrix.append(str(fold))
                filtered_values = fold_matrix
        ######################## End Centroid Calculation ########################
        new_uid = values[uid_index]
        if convertPSIUID:
            new_uid = string.replace(new_uid, ':', '__')
            if '|' in new_uid:
                new_uid = string.split(new_uid, '|')[0]
            new_uids = string.split(new_uid, '__')
            if len(new_uids) > 2:
                if 'ENS' in new_uids[1]:
                    new_uid = string.join([new_uids[0]] + new_uids[2:], ' ')
        export_object.write(string.join([new_uid] + filtered_values, '\t') + '\n')
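### Illustrative sketch of the centroid/fold path in filterFile() when calculateCentroids is set:
### group means are computed over the reordered sample values and each comparison is reported as a
### difference of means (inputs assumed to already be log2). All names and values are hypothetical.
def _centroid_fold_example():
    filtered_values = ['2.0', '4.0', '6.0', '8.0']
    group_index_db = {'clusterA': [0, 1], 'clusterB': [2, 3]}
    means = {}
    for cluster in group_index_db:
        vals = [float(filtered_values[i]) for i in group_index_db[cluster]]
        means[cluster] = sum(vals) / len(vals)
    ### one comparison with the numerator group listed first: clusterB vs. clusterA
    return means['clusterB'] - means['clusterA']  ### 7.0 - 3.0 = 4.0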
def plotFeatureBoxPlots(qc_db, dataset_name, feature_type):
    pylab.figure()
    pylab.xlabel('Biological Sample Names')
    pylab.ylabel('Read Counts - Log2')
    pylab.title('Expression BoxPlots for %ss - %s' % (feature_type, dataset_name))
    #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35)
    pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35)
    #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend
    #pylab.axis(axes)

    boxplots = []
    samples = []
    sample_sorted_list = []
    for sample_name in qc_db:
        try:
            qc = qc_db[sample_name][feature_type]
        except Exception:
            print 'No junction data found for at least one sample:', sample_name
            forceExit
        sample_sorted_list.append([statistics.avg(qc), statistics.stdev(qc), sample_name])
    sample_sorted_list.sort()
    sample_sorted_list.reverse()

    filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name, feature_type)
    export_obj = export.ExportFile(root_dir + filename[:-4] + '.txt')
    export_obj.write('SampleID\tAverage Expression\n')

    firstEntry = True
    for (mean, stdev, sample_name) in sample_sorted_list:
        ls = []
        x_ls = []
        y_ls = []
        qc = qc_db[sample_name][feature_type]
        boxplots.append(qc)
        samples.append(sample_name)
        export_obj.write(sample_name + '\t' + str(mean) + '\n')
        if firstEntry:
            threshold = mean - 2 * stdev
            firstEntry = False
        else:
            if mean < threshold:
                print sample_name, 'expression is considered very low (2 standard deviations away from the max).'

    pylab.boxplot(boxplots, notch=0, whis=1.5, positions=None, widths=None, patch_artist=False)
    #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False)
    xtickNames = pylab.setp(pylab.gca(), xticklabels=samples)
    pylab.setp(xtickNames, rotation=90, fontsize=10)
    export_obj.close()

    #print 'Exporting:',filename
    pylab.savefig(root_dir + filename)
    filename = filename[:-3] + 'png'
    pylab.savefig(root_dir + filename)  #,dpi=200
    graphic_link.append(['QC - BoxPlot-' + feature_type + ' Expression', root_dir + filename])
    try:
        import gc
        pylab.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
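### Sketch of the low-expression flag used in plotFeatureBoxPlots(): samples are sorted by mean
### expression and any sample whose mean falls more than two standard deviations below the
### top-ranked sample is reported. Toy numbers only.
#
# sample_sorted_list = [(10.0, 1.0, 'high'), (9.5, 1.2, 'ok'), (7.0, 0.8, 'low')]
# threshold = 10.0 - 2 * 1.0                                               # 8.0, from the first (highest-mean) sample
# flagged = [s for (m, sd, s) in sample_sorted_list[1:] if m < threshold]  # ['low']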
def importTableEntries(filename, filter_db, ensembl_exon_db, gene_db, root_dir, transpose, display,
                       showIntrons, analysisType='plot'):
    import collections
    average_samples = True
    if showIntrons == 'yes':
        include_introns = True
    else:
        include_introns = False
    uid_db = {}  ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {}  ### ordered from first to last exon region
    uid_gene_db = {}  ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception:
        biotypes = {}
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else:
                ### Include introns for splicing index view
                if include_introns == True:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
                uid_gene_db[id] = gene

    if '_vs_' in filename:
        ### If only two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {}
    stdev_gene_matrix_db = {}
    row_header_gene = {}
    ids = {}
    x = 0

    if 'heatmap' in analysisType:
        average_samples = False
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]

    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        if data[0] == '#':
            x = 0
        elif x == 0:
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(filename, 'AltResults')[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.', 'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.', 'groups.')
                    new_column_header = column_header
                    start = 1  ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(groups_dir, '-steady-state.txt', '.txt')  ### groups is for the non-steady-state file
                try:
                    group_index_db = collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(item, '.CEL', '')  ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try:
                            group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [sample_index]  ### dictionary of group to input file sample indexes
                            except Exception:
                                pass  ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db)  ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list)  ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename:
                    start = 3
                else:
                    start = 1  ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            if ' ' not in t and '' not in t:  ### Occurs for rows with missing data
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try:
                            row_header_gene[gene].append(uid)
                        except Exception:
                            row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float, t[start:])
                            try:
                                matrix_gene_db[gene].append(values)
                            except Exception:
                                matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2), t[start:])
                            else:
                                values = map(float, t[start:])
                            if 'AltResults' in filename:
                                ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name])  ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try:
                                    st_err = statistics.stdev(group_values) / math.sqrt(len(group_values))
                                except Exception:
                                    st_err = 0  ### Occurs if no replicates in the dataset
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try:
                                matrix_gene_db[gene].append(avg_ls)
                            except Exception:
                                matrix_gene_db[gene] = [avg_ls]
                            try:
                                stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception:
                                stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x += 1

    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0  ### Only display a certain number of genes
    for last_gene in uid_list:
        pass
    for gene in uid_list:
        fig = pylab.figure()  ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try:
            matrix = matrix_gene_db[gene]
        except Exception:
            #print gene_symbol, 'not in alternative expression database'
            continue  ### go to the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try:
            stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception:
            pass
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid)  ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try:
                    new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try:
                    new_stdev.append(stdev_matrix[i])
                except Exception:
                    pass
            except Exception:
                pass
        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            from visualization_scripts import clustering
            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            transpose = False
            graphic_links = []
            if ki > 100:
                transpose = True
            if gene == last_gene:
                display = True
            else:
                display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric,
                                                     column_method, column_metric, color_gradient, transpose,
                                                     display=display, Normalize=True, compressAxis=False, contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix))  ### converts these to arrays
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)
            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palette to use
                    colors = []
                    """ k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow')  #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header)))  # color will now be an RGBA tuple
                plotExonExpression(fig, matrix, stdev_matrix, row_header, column_header, dataset_name,
                                   annotation_list, gene_symbol, root_dir, display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                sys.exit()
                print gene_symbol, 'failed'
        try:
            pylab.close()
        except Exception:
            pass

    if successfully_output_genes > 0:
        #try: print 'Gene graphs exported to ExonPlots...'
        #except Exception: pass
        pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
        forceNoExonExpError
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
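### Hedged sketch of the per-group summarization inside importTableEntries(): for each group the
### mean and the standard error (stdev / sqrt(n)) of the member samples are collected, mirroring
### the avg_ls/std_ls lists above. Data and names are illustrative.
def _group_mean_stderr_example():
    import math
    values = [2.0, 4.0, 6.0, 8.0]
    group_index_db = {'grpA': [0, 1], 'grpB': [2, 3]}
    avg_ls = []
    std_ls = []
    for group_name in group_index_db:
        group_values = [values[i] for i in group_index_db[group_name]]
        mean = sum(group_values) / float(len(group_values))
        var = sum((v - mean) ** 2 for v in group_values) / (len(group_values) - 1)
        avg_ls.append(mean)
        std_ls.append(math.sqrt(var) / math.sqrt(len(group_values)))
    return avg_ls, std_ls  ### e.g. means [3.0, 7.0] with standard errors [1.0, 1.0] (order follows dict iteration)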
def parse_input_data(filename, data_type):
    fn = filepath(filename)
    first_line = 1
    array_group_name_db = {}
    z = 0
    array_group_db = {}
    output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename)  ### e.g., expression or counts

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        probeset = t[0]
        z += 1
        if first_line == 1:
            first_line = 0  #makes this value null for the next loop of actual array data
            ###Below occurs if the data is raw as opposed to precomputed
            if data_type == 'export':
                if array_type == 'exon':
                    folder = 'ExonArray' + '/' + species + '/'
                elif array_type == 'gene':
                    folder = 'GeneArray' + '/' + species + '/'
                elif array_type == 'junction':
                    folder = 'JunctionArray' + '/' + species + '/'
                elif array_type == 'RNASeq':
                    folder = 'RNASeq' + '/' + species + '/'
                else:
                    folder = array_type + '/'
                parent_path = root_dir + 'AltExpression/' + folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) + '_' + filter_method + '.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(int(100 * p)) + '_' + filter_method + '.txt'
                output_file_dir = parent_path + output_file
                print "...Exporting", output_file_dir
                export_data = export.createExportFile(output_file_dir, root_dir + 'AltExpression/' + folder)
                fn = filepath(output_file_dir)
                export_data = open(fn, 'w')
                export_data.write(line)
            if ':' in t[1]:
                array_group_list = []
                x = 0  ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group, array_name = string.split(entry, ':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name]
                        ### below only occurs with a new group addition
                        array_group_list.append(array_group)  #use this to generate comparisons in the below linked function
                    x += 1
                #print '##### array_group_list',array_group_list
        elif len(probeset) > 0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array = {}
            array_index_list = []  ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try:
                        exp_val = float(t[array_index + 1])
                    except IndexError:
                        print t, z, '\n', array_index, '\n', group, probeset
                        kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)
                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                if ':I' in probeset:
                                    k = 1  ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat >= gene_rpkm_threshold:
                                        k = 1
                                    else:
                                        k = 0
                                elif avg_stat >= exon_rpkm_threshold:
                                    k = 1
                                elif '-' in probeset:
                                    k = 1  ### Don't consider RPKM for junctions, just counts
                                else:
                                    k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else:
                                ### Otherwise, we are looking at count data
                                if '-' in probeset:
                                    ### junction meeting minimum read-count number
                                    if avg_stat >= junction_exp_threshold:
                                        k = 1  ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else:
                                        k = 0
                                elif ':' not in probeset:
                                    if avg_stat >= gene_exp_threshold:
                                        k = 1
                                    else:
                                        k = 0
                                else:
                                    ### exon or intron meeting minimum read-count number
                                    if avg_stat >= exon_exp_threshold:
                                        k = 1
                                    else:
                                        k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat >= nonlog_exp_threshold:
                                k = 1
                            else:
                                k = 0
                    elif avg_stat >= log_expression_threshold:
                        k = 1
                    else:
                        k = 0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                        ### Treat as dabp p-value
                        try:
                            pvalue_status_db[probeset].append(k)
                        except KeyError:
                            pvalue_status_db[probeset] = [k]
                    else:
                        try:
                            expression_status_db[probeset].append(k)
                        except KeyError:
                            expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat <= p:
                        k = 1
                    else:
                        k = 0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try:
                        pvalue_status_db[probeset].append(k)
                    except KeyError:
                        pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from logstatus to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]
                exp_values_log2 = []
                for exp_val in exp_values:
                    exp_values_log2.append(str(math.log(float(exp_val), 2)))  ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset] + exp_values_log2, '\t') + '\n'
            try:
                null = export_db[probeset]
                export_data.write(line)
            except KeyError:
                null = []  ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export':
        export_data.close()
    return output_file
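### Minimal sketch of the per-group flagging in parse_input_data(): a feature passes (k=1) when its
### group average meets the relevant threshold, and one flag is stored per group so a later step can
### require the criterion in at least one group. The threshold value below is hypothetical, and the
### module itself appends via try/except KeyError (setdefault shown here only for brevity).
#
# avg_stat = 1.7                    # group mean (e.g. RPKM or counts)
# gene_rpkm_threshold = 1.0
# k = 1 if avg_stat >= gene_rpkm_threshold else 0     # k = 1
# expression_status_db.setdefault('ENSG00000000003:E1.1', []).append(k)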
def performGroupNormalization(filename, export_dir, platform):
    expressionDataFormat, increment, convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = string.replace(export_dir, 'exp.', 'batch.')
    fn = unique.filepath(filename)
    row_number = 0
    exp_db = {}
    relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)

    for line in open(fn, 'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#' and row_number == 0:
            row_number = 0
        elif row_number == 0:
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x), group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list += group_samples
                except Exception:
                    missing = []
                    for x in group_samples:
                        if x not in t[1:]:
                            missing.append(x)
                    print 'missing:', missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'
                    forceExit
            title = string.join([t[0]] + new_sample_list, '\t') + '\n'  ### output the new sample order (group file order)
            export_data.write(title)
            row_number = 1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts
                try:
                    all_values = map(lambda x: math.log(float(x) + increment, 2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:], increment)
            else:
                try:
                    all_values = map(float, t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:], increment)
            row_number += 1  ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []
            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try:
                    values = map(lambda x: all_values[x], sample_index_list)  ### simple and fast way to reorganize the samples
                except Exception:
                    print len(values), sample_index_list
                    kill
                try:
                    avg = statistics.avg(values)
                except Exception:
                    values2 = []
                    for v in values:
                        try:
                            values2.append(float(v))
                        except Exception:
                            pass
                    values = values2
                    try:
                        avg = statistics.avg(values)
                    except Exception:
                        if len(values) > 0:
                            avg = values[0]
                        else:
                            avg = 0
                try:
                    log_folds = map(lambda x: (x - avg), values)
                except Exception:
                    log_folds = []
                    for x in values:
                        try:
                            log_folds.append(x - avg)
                        except Exception:
                            log_folds.append('')
                gene_log_folds += log_folds
            gene_log_folds = map(lambda x: str(x), gene_log_folds)
            export_data.write(string.join([gene] + gene_log_folds, '\t') + '\n')
    export_data.close()
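### Sketch of the within-group centering performed by performGroupNormalization(): each sample's
### (log2) value is expressed relative to the mean of its own group, removing group/batch offsets
### row by row. Toy data; the group layout is hypothetical.
#
# all_values = [5.0, 7.0, 3.0, 5.0]
# group_db = {'batch1': [0, 1], 'batch2': [2, 3]}    # sample indexes per group
# batch1 mean = 6.0 -> folds [-1.0, 1.0]; batch2 mean = 4.0 -> folds [-1.0, 1.0]
# gene_log_folds = [-1.0, 1.0, -1.0, 1.0]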
def nonLogAvg(data_list):
    return statistics.avg(map(lambda x: math.pow(2, x) - 1, data_list))
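### Example: nonLogAvg([1.0, 3.0]) averages the non-log values 2**1 - 1 = 1 and 2**3 - 1 = 7 and
### returns 4.0 (assuming statistics.avg() is an arithmetic mean).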
def reorder(data, data_headers, array_order, comp_group_list, probeset_db, include_raw_data,
            array_type, norm, fl, logvalues=True, blanksPresent=False):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'
    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']

    ### Define expression variables
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = math.log(fl.GeneExpThreshold(), 2)
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception: gene_rpkm_threshold = 0
    try: FDR_statistic = fl.FDRStatistic()
    except Exception: FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog = True
    if blanksPresent:
        calculateAsNonLog = False

    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try:
            gene = probeset_db[row_id][0]
        except TypeError:
            gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try:
                    new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError:
                new_item = ''  #this is for a spacer added in the above function
            try:
                grouped_ordered_array_list[group].append(new_item)
            except KeyError:
                grouped_ordered_array_list[group] = [new_item]
            try:
                data_headers2[group].append(data_headers[y])
            except KeyError:
                data_headers2[group] = [data_headers[y]]

        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2]  #baseline expression
            if blanksPresent:  ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try:
                avg1 = statistics.avg(data_list1)
            except Exception:
                avg1 = ''
            try:
                avg2 = statistics.avg(data_list2)
            except Exception:
                avg2 = ''
            try:
                if (logvalues == False and array_type != 'RNASeq') or (logvalues == False and calculateAsNonLog):
                    fold = avg1 / avg2
                    log_fold = math.log(fold, 2)
                    if fold < 1:
                        fold = -1.0 / fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold)
            except Exception:
                log_fold = ''
                fold = ''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2, probability_statistic)
            except Exception:
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':
                ### Also non-log but treated differently
                if 'RPKM' == norm:
                    adj = 0
                else:
                    adj = 1
                if calculateAsNonLog == False:
                    try:
                        avg1 = math.pow(2, avg1) - adj
                        avg2 = math.pow(2, avg2) - adj
                    except Exception:
                        avg1 = ''
                        avg2 = ''
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                #if row_id=='ENSG00000085514':
                #if fold=='Insufficient Expression':
                #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold != 0 and calculateAsNonLog:
                ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1 < gene_rpkm_threshold and a2 < gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]

        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent:  ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list) > 0:
                grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                avg = ''
                #print row_id, group, data_list;kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]
        except Exception:
            max_fold = 'NA'
        try:
            ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception:
            ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs

        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()

        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort()  #now the list is sorted by group number

        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM':
                            adj = 0
                        else:
                            adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1]  #the group name is listed as the first entry
                for value in group_summary_data:
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group, data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL']
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program

    for entry in data_headers3:
        x = 0  #indicates the times through a loop
        y = 0  #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index
        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)

    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round = 0
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try:
                statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmath or out-of-range values\n ... using unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            round += 1
        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy
            from stats_scripts import qvalue
            pvals = []
            keys = []
            for key in pval_db:
                pvals.append(pval_db[key].Pval())
                keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)):
                pval_db[keys[i]].SetAdjP(pvals[i])
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP()  ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval()  ### Replace the non-moderated with a moderated p-value

    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
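### Hedged sketch of the fold calculations in reorder(): for log2 input the log-fold is a simple
### difference of group means and the linear fold is derived from it, with down-regulation reported
### as -1/fold (mirroring what statistics.log_fold_conversion appears to do). Values are illustrative.
def _fold_example(avg1, avg2):
    import math
    log_fold = avg1 - avg2                  ### log2 ratio of experimental vs. baseline group means
    fold = math.pow(2, log_fold)            ### back to the linear scale
    if fold < 1:
        fold = -1.0 / fold                  ### report down-regulation as a negative fold
    return log_fold, fold
# _fold_example(5.0, 7.0) -> (-2.0, -4.0)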
def filterFile(input_file, output_file, filter_names, force=False, calculateCentroids=False, comparisons=[]):
    if calculateCentroids:
        filter_names, group_index_db = filter_names
    export_object = open(output_file, 'w')
    firstLine = True
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            values = string.split(data, ',')
        else:
            values = string.split(data, '\t')
        if firstLine:
            uid_index = 0
            if data[0] != '#':
                if force:
                    filter_names2 = []
                    for f in filter_names:
                        if f in values:
                            filter_names2.append(f)
                    filter_names = filter_names2
                try:
                    sample_index_list = map(lambda x: values.index(x), filter_names)
                except:
                    if ':' in line:  ### If ":" in header name
                        values2 = []
                        for x in values:
                            if ':' in x:
                                x = string.split(x, ':')[1]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    elif '.$' in line:
                        filter_names2 = []
                        for f in filter_names:  ### if the name in the filter is a string within the input data-file
                            for f1 in values:
                                if f in f1:
                                    filter_names2.append(f1)  ### change to the reference name
                                    break
                        print len(filter_names2), len(values), len(filter_names); kill
                        filter_names = filter_names2
                        #filter_names = map(lambda x: string.split(x,'.')[0], filter_names)
                        #values = map(lambda x: string.split(x,'.')[0], values)
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    else:
                        temp_count = 1
                        for x in filter_names:
                            if x not in values:
                                temp_count += 1
                                if temp_count > 500:
                                    print 'too many to print'; kill
                                print x,
                        print 'are missing'; kill
            firstLine = False
            header = values
            if 'PSI_EventAnnotation' in input_file:
                uid_index = values.index('UID')
            if calculateCentroids:
                if len(comparisons) > 0:
                    ### Use the numerator group name for each comparison column
                    export_object.write(string.join(['UID'] + map(lambda x: x[0] + '-fold', comparisons), '\t') + '\n')
                else:
                    clusters = map(str, group_index_db)
                    export_object.write(string.join([values[uid_index]] + clusters, '\t') + '\n')
                continue  ### skip the below code
        try:
            filtered_values = map(lambda x: values[x], sample_index_list)  ### simple and fast way to reorganize the samples
        except Exception:
            print traceback.format_exc()
            print len(values), len(sample_index_list)
            print input_file, len(filter_names)
            for i in filter_names:
                if i not in header:
                    print i, 'not found'
                    sys.exit()
            sys.exit()
            ### For PSI files with missing values at the end of each line, often
            if len(header) != len(values):
                diff = len(header) - len(values)
                values += diff * ['']
            filtered_values = map(lambda x: values[x], sample_index_list)  ### simple and fast way to reorganize the samples
        #print values[0]; print sample_index_list; print values; print len(values); print len(prior_values);kill
        prior_values = values
        ######################## Begin Centroid Calculation ########################
        if calculateCentroids:
            mean_matrix = []
            means = {}
            for cluster in group_index_db:
                ### group_index_db[cluster] contains all of the indices for samples in a named group (cluster is the cluster name, not a number)
                try:
                    mean = statistics.avg(map(lambda x: float(filtered_values[x]), group_index_db[cluster]))
                except:
                    continue
                #mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value
                means[cluster] = mean
                mean_matrix.append(str(mean))
            filtered_values = mean_matrix
            if len(comparisons) > 0:
                fold_matrix = []
                for (group2, group1) in comparisons:
                    fold = means[group2] - means[group1]
                    fold_matrix.append(str(fold))
                filtered_values = fold_matrix
        ######################## End Centroid Calculation ########################
        export_object.write(string.join([values[uid_index]] + filtered_values, '\t') + '\n')
    export_object.close()
    print 'Filtered columns printed to:', output_file
    return output_file
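### Hypothetical usage of this simpler filterFile() variant: keep only the named sample columns of a
### tab-delimited expression file (paths and sample names below are placeholders).
#
# keep = ['Sample_A', 'Sample_B', 'Sample_C']
# filterFile('ExpressionInput/exp.project.txt', 'ExpressionInput/exp.project-filtered.txt', keep)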