def replacePearsonPvalueWithZscore():
    """Replace the p-value stored next to each Pearson rho with a z-score.

    Works on the module-level ``tissue_comparison_scores`` dictionary, whose
    values are iterables of ``(rho, p, sample)`` entries. Each value is
    replaced by a list of ``[rho, z, sample]`` entries where
    ``z = (rho - mean) / stdev``. The mean and standard deviation come from
    ``statistics.avg`` / ``statistics.stdev`` applied to the rho values of
    every sample in every tissue (one shared background, not one per sample).

    Returns:
        None. ``tissue_comparison_scores`` is updated as a side effect.
    """
    ### Group the rho values by sample. setdefault() creates the list the
    ### first time a sample is seen, so a sample that is missing from the
    ### first tissue no longer raises a KeyError.
    all_sample_data = {}
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            all_sample_data.setdefault(sample, []).append(r)

    ### Pool every sample's rho values into a single background distribution.
    ### Per-sample statistics were previously computed here but never used.
    all_dataset_rho_values = []
    for sample in all_sample_data:
        all_dataset_rho_values += all_sample_data[sample]

    global_rho_avg = statistics.avg(all_dataset_rho_values)
    global_rho_stdev = statistics.stdev(all_dataset_rho_values)

    ### Replace the p-value for each rho with the z-score relative to all
    ### analyzed samples (instead of the sample-specific background)
    for tissue in tissue_comparison_scores:
        tissue_comparison_scores[tissue] = [
            [r, (r - global_rho_avg) / global_rho_stdev, sample]
            for (r, p, sample) in tissue_comparison_scores[tissue]
        ]
def statisticallyFilterFile(input_file, output_file, threshold):
    """Remove samples that express too few genes and export the remainder.

    The input is a delimited table (comma-separated when '.csv' occurs in
    ``input_file``, tab-separated otherwise) whose first row is the header
    and whose first column holds the row identifier. For every sample
    column, the rows with a value strictly greater than ``threshold`` are
    counted. The counts are written to a companion 'geneCount' file, and the
    samples with more than 499 counted rows are handed to ``filterFile`` to
    produce ``output_file``. If fewer than four samples pass, every sample
    is kept.

    Parameters:
        input_file: path of the expression table to read.
        output_file: path passed to ``filterFile`` for the filtered table.
        threshold: a value must exceed this number to count as expressed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if 'exp.' in input_file:
        counts_file = input_file.replace('exp.', 'geneCount.')
    else:
        counts_file = input_file[:-4] + '-geneCount.txt'
    delimiter = ',' if '.csv' in input_file else '\t'

    header = True
    count_sum_array = []
    ### The 'with' block closes the input handle, which used to be leaked
    with open(input_file, 'rU') as source:
        for line in source:
            t = cleanUpLine(line).split(delimiter)
            if header:
                header_len = len(t)
                full_header = t
                samples = t[1:]
                header = False
                count_sum_array = [0] * len(samples)
                continue

            if len(t) == (header_len + 1) and samples is not full_header:
                ### Correct a header with a missing UID column. This must
                ### happen only once: repeating it for every row reset the
                ### running counts to zero each time.
                samples = full_header
                count_sum_array = [0] * len(samples)
                print('fixing bad header')

            raw_values = t[1:]
            try:
                values = [float(v) for v in raw_values]
            except ValueError:
                ### Treat missing entries ('NA' or empty) as zero
                values = [0.0 if v in ('NA', '') else float(v)
                          for v in raw_values]

            binarized_values = [1 if v > threshold else 0 for v in values]
            count_sum_array = [
                total + flag
                for total, flag in zip(count_sum_array, binarized_values)
            ]

    distribution = []
    count_sum_array_db = {}
    for index, sample in enumerate(samples):
        count_sum_array_db[sample] = count_sum_array[index]
        distribution.append(count_sum_array[index])

    from stats_scripts import statistics
    distribution.sort()
    avg = int(statistics.avg(distribution))
    stdev = int(statistics.stdev(distribution))
    min_exp = int(min(distribution))
    print('The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (
        threshold, avg, stdev, min_exp))

    ### The distribution-based cutoff is only reported; the fixed cutoff
    ### assigned below is the one used for filtering.
    if avg - (stdev * 2) < 0:
        if (stdev - avg) > 0:
            ### '//' keeps the integer division Python 2 performed on ints
            print('%s genes expressed selected as a default cutoff to include cells (0.5-stdev away)' % (
                avg - (stdev // 2)))
        else:
            print('%s genes expressed selected as a default cutoff to include cells (1-stdev away)' % (
                avg - stdev))

    print('Using a default cutoff of >=500 genes per cell expressed/cell')
    cutoff = 499

    samples_to_retain = []
    samples_to_exclude = []
    import export
    eo = export.ExportFile(counts_file)
    try:
        eo.write('Sample\tGenes Expressed(threshold:' + str(threshold) + ')\n')
        for sample in samples:  ### keep the original order
            if count_sum_array_db[sample] > cutoff:
                samples_to_retain.append(sample)
            else:
                samples_to_exclude.append(sample)
            eo.write(sample + '\t' + str(count_sum_array_db[sample]) + '\n')
    finally:
        eo.close()

    if len(samples_to_retain) < 4:  ### Don't remove any if too few samples
        samples_to_retain += samples_to_exclude
    else:
        print('%s samples removed (< 500 genes expressed)' % len(samples_to_exclude))

    print('Exporting the filtered expression file to:')
    print(output_file)
    filterFile(input_file, output_file, samples_to_retain)
def plotFeatureBoxPlots(qc_db, dataset_name, feature_type):
    """Draw one box plot per sample for a feature type and save it to disk.

    Samples are ordered from the highest to the lowest mean value. The
    figure is saved as 'QC-<dataset_name>-BoxPlot-<feature_type>' in PDF and
    PNG form under the module-level ``root_dir``, the per-sample means are
    exported to a matching '.txt' file, and the PNG path is appended to the
    module-level ``graphic_link`` list.

    Parameters:
        qc_db: mapping of sample name to a mapping of feature type to the
            sequence of values to plot.
        dataset_name: label used in the plot title and in the file names.
        feature_type: key selecting which values of each sample are plotted.

    Returns:
        None.
    """
    ### Keep the figure handle so it can be released at the end
    fig = pylab.figure()
    pylab.xlabel('Biological Sample Names')
    pylab.ylabel('Read Counts - Log2')
    pylab.title('Expression BoxPlots for %ss - %s' % (feature_type, dataset_name))
    pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35)

    sample_sorted_list = []
    for sample_name in qc_db:
        try:
            qc = qc_db[sample_name][feature_type]
        except Exception:
            print('No junction data found for at least one sample: %s' % (sample_name,))
            ### Bare name kept on purpose: evaluating it raises NameError and
            ### aborts the plot exactly as before (presumably handled by the
            ### caller - confirm).
            forceExit
        sample_sorted_list.append(
            [statistics.avg(qc), statistics.stdev(qc), sample_name])
    ### Highest mean first
    sample_sorted_list.sort(reverse=True)

    filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name, feature_type)
    export_obj = export.ExportFile(root_dir + filename[:-4] + '.txt')
    export_obj.write('SampleID\tAverage Expression\n')

    boxplots = []
    samples = []
    threshold = None  ### set from the first (highest-mean) sample
    for (mean, stdev, sample_name) in sample_sorted_list:
        boxplots.append(qc_db[sample_name][feature_type])
        samples.append(sample_name)
        export_obj.write(sample_name + '\t' + str(mean) + '\n')
        if threshold is None:
            threshold = mean - 2 * stdev
        elif mean < threshold:
            print('%s expression is considered very low (2 standard deviations away from the max).' % (sample_name,))

    pylab.boxplot(boxplots, notch=0, whis=1.5, positions=None, widths=None,
                  patch_artist=False)
    xtickNames = pylab.setp(pylab.gca(), xticklabels=samples)
    pylab.setp(xtickNames, rotation=90, fontsize=10)
    export_obj.close()

    pylab.savefig(root_dir + filename)
    filename = filename[:-3] + 'png'
    pylab.savefig(root_dir + filename)
    graphic_link.append(
        ['QC - BoxPlot-' + feature_type + ' Expression', root_dir + filename])

    ### Best-effort release of the figure. The previous call,
    ### pylab.figure.clf(), looked up 'clf' on the figure() function and
    ### always raised, so the figure was never closed.
    try:
        import gc
        fig.clf()
        pylab.close(fig)
        gc.collect()
    except Exception:
        pass
def importTableEntries(filename, filter_db, ensembl_exon_db, gene_db, root_dir, transpose, display, showIntrons, analysisType='plot'):
    """Import rows of an expression/splicing table and export one graphic per gene.

    Rows of the tab-delimited file ``filename`` are collected per gene. In the
    default mode the mean and standard error of every row are computed for
    each sample group and passed to ``plotExonExpression``. When
    ``analysisType`` contains 'heatmap', the per-sample values are written to
    '<gene symbol>-heatmap.txt' under ``root_dir`` and clustered with
    ``clustering.runHCexplicit``.

    Parameters:
        filename: path of the input table. Substrings of the path ('exp.',
            'counts.', 'AltResults', '_vs_') select the column layout and the
            location of the matching groups file.
        filter_db: collection of row IDs to keep (tested with ``in``).
        ensembl_exon_db: maps a gene to an iterable of ``(index, ed, id)``
            entries; ``ed`` must provide ``ExonID()``.
        gene_db: maps a gene ID to the symbol used in output names.
        root_dir: prefix for exported files, also forwarded to
            ``plotExonExpression``.
        transpose: when True the matrices are transposed before plotting;
            reassigned inside the heatmap branch.
        display: forwarded to the plotting calls; forced to False when more
            than 10 genes are present and reassigned per gene in heatmap mode.
        showIntrons: the string 'yes' also keeps IDs whose ExonID lacks 'E'.
        analysisType: 'plot' by default; any value containing 'heatmap'
            selects the heatmap export.

    Returns:
        None. Reads the module-level name ``platform`` and rebinds the
        module-level ``colors`` list.

    NOTE(review): the nesting below was recovered from a whitespace-collapsed
    copy of this function - confirm it against the upstream file.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes':
        include_introns = True
    else:
        include_introns = False
    uid_db = {}  ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {}  ### ordered from first to last exon region
    uid_gene_db = {}  ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception:
        ### Continue without biotype information if UI is unavailable or fails
        biotypes = {}
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            ### Both branches apply the same rule: keep the ID when introns
            ### are requested or when its ExonID contains 'E'
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else:  ### Include introns for splicing index view
                if include_introns == True:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id] = gene
    if '_vs_' in filename:
        ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        ### NOTE(review): alt_groups_dir is only bound in this branch but is read further down whenever verifyFile() fails
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')
    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {}  ### gene -> list of value rows
    stdev_gene_matrix_db = {}  ### gene -> list of standard-error rows
    row_header_gene = {}  ### gene -> row IDs, parallel to matrix_gene_db
    ids = {}
    x = 0  ### 0 until the header row has been parsed
    if 'heatmap' in analysisType:
        ### Heatmaps keep the individual sample values
        average_samples = False
    ### Name of the file without its directory and 4-character extension
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    ### NOTE(review): the file handle is never closed explicitly and data[0] raises IndexError on a blank line
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        if data[0] == '#':
            ### Comment line: the next non-comment line is treated as the header
            x = 0
        elif x == 0:
            ### Header row
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(
                t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                ### Work out where the groups file lives and which column holds the first numeric value
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(filename, 'AltResults')[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.', 'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.', 'groups.')
                    new_column_header = column_header
                    start = 1  ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(groups_dir, '-steady-state.txt', '.txt')  ### groups is for the non-steady-state file
                try:
                    group_index_db = collections.OrderedDict()
                except Exception:
                    ### collections has no OrderedDict here: use the stand-alone ordereddict module
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(
                    groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        ### Retry the lookup with common file extensions stripped from the sample name
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(item, '.CEL', '')  ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            ### Sample not present in this file: leave it out
                            pass
                    if proceed:
                        try:
                            group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [sample_index]  ### dictionary of group to input file sample indexes
                            except Exception:
                                pass  ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db)  ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list)  ### group name of each sample, in sample_list order
                column_header = groups
            else:
                if 'AltResults' in filename:
                    start = 3
                else:
                    start = 1  ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            ### Data row
            if ' ' not in t and '' not in t:  ### Skip rows with missing data (a blank or single-space field)
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try:
                            row_header_gene[gene].append(uid)
                        except Exception:
                            row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float, t[start:])
                            try:
                                matrix_gene_db[gene].append(values)
                            except Exception:
                                matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2), t[start:])
                            else:
                                values = map(float, t[start:])
                            if 'AltResults' in filename:
                                ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name])  ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try:
                                    ### Standard error of the mean for the group
                                    st_err = statistics.stdev(group_values) / math.sqrt(len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try:
                                matrix_gene_db[gene].append(avg_ls)
                            except Exception:
                                matrix_gene_db[gene] = [avg_ls]
                            try:
                                stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception:
                                stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        ### Any failure while parsing a row silently drops that row
                        #print traceback.format_exc()
                        pass
            x += 1
    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        ### Bare name: presumably undefined on purpose so that evaluating it raises NameError and aborts - confirm
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0  ### Only display a certain number of genes
    ### Leaves last_gene bound to the final key yielded by uid_list
    for last_gene in uid_list:
        pass
    for gene in uid_list:
        fig = pylab.figure()  ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try:
            matrix = matrix_gene_db[gene]
        except Exception:
            #print gene_symbol, 'not in alternative expression database'
            ### NOTE(review): the figure created above is not closed on this path
            continue  ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try:
            stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception:
            pass
        ### Re-order the imported rows to follow uid_list[gene]
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid)  ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try:
                    new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try:
                    new_stdev.append(stdev_matrix[i])
                except Exception:
                    pass
            except Exception:
                ### ID was not imported for this gene: skip it
                pass
        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            from visualization_scripts import clustering
            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            transpose = False
            graphic_links = []
            ### Transpose the heatmap when more than 100 rows were written
            if ki > 100:
                transpose = True
            ### Only the last gene's heatmap is displayed
            if gene == last_gene:
                display = True
            else:
                display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis=False, contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix))  ### coverts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)
            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palete to use
                    colors = []
                    """ k=0 while k < len(row_header): colors.append(tuple(rand(3))) k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow')  #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header)))  # color will now be an RGBA tuple
                plotExonExpression(fig, matrix, stdev_matrix, row_header, column_header, dataset_name, annotation_list, gene_symbol, root_dir, display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                sys.exit()
                ### NOTE(review): unreachable - sys.exit() above raises SystemExit first
                print gene_symbol, 'failed'
        try:
            pylab.close()
        except Exception:
            pass
    if successfully_output_genes > 0:
        #try: print 'Gene graphs exported to ExonPlots...'
        #except Exception: pass
        pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
        ### Bare name: presumably raises NameError on purpose to signal the failure to the caller - confirm
        forceNoExonExpError
    ### Best-effort cleanup of the last figure created in the loop above
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass