def sashmi_plot_list(bamdir,eventsToVisualizeFilename,PSIFilename,events=None): import gene_associations gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol')) import OBO_import; symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) if events==None: splicing_events,expandedSearch = importSplicingEventsToVisualize(eventsToVisualizeFilename) else: ### Replace any ":" from the input events #for i in range(len(events)): events[i] = string.replace(events[i],':','__') expandedSearch = True for i in range(len(events)): gene = string.split(events[i],'__')[0] if gene in gene_to_symbol: symbol = gene_to_symbol[gene][0] elif 'ENS' not in gene or 'G0000' in gene: if gene in symbol_to_gene: ensID = symbol_to_gene[gene][0] symbol = gene events[i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup splicing_events = events ### optionally get from supplied variable if len(splicing_events)==0: print eventsToVisualizeFilename forceNoCompatibleEventsInFile print 'Exporting plots', ### Determine Groups for Coloring groups_file = 'None' dir_list = unique.read_directory(root_dir+'/ExpressionInput') for file in dir_list: if 'groups.' 
in file: groups_file = root_dir+'/ExpressionInput/'+file if groups_file != None: try: import ExpressionBuilder sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file) groups=[] for sample in sample_group_db: if sample_group_db[sample] not in groups: groups.append(sample_group_db[sample]) ### create an ordered list of unique group except Exception: groups = ['None'] #print traceback.format_exc() pass processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, splicing_events, sample_group_db, groups, False) mopup_events = getMopUpEvents(splicing_events, processed_events) ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide #print len(splicing_events),len(processed_events),len(mopup_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot(steady_state_exp_file,bamdir,mopup_events,sample_group_db,groups,expandedSearch) if len(processed_events)>0: mopup_events = getMopUpEvents(mopup_events, processed_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, mopup_events, sample_group_db, groups, True) return gene_to_symbol
def filterRows(input_file,output_file,filterDB=None,logData=False): orderlst={} counter=[] export_object = open(output_file,'w') firstLine = True Flag=0; species="Hs" import OBO_import; import ExpressionBuilder gene_to_symbol_db = ExpressionBuilder.importGeneAnnotations(species) symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol_db) for line in open(input_file,'rU').xreadlines(): flag1 = 0 data = cleanUpLine(line) values = string.split(data,'\t') if firstLine: firstLine = False if Flag==0: export_object.write(line) else: try: symbolID = gene_to_symbol_db[values[0]][0] except Exception: symbolID = values[0] if symbolID in filterDB: counter=[index for index, value in enumerate(filterDB) if value == symbolID] for it in range(0,len(counter)): orderlst[counter[it]]=line try: for i in range(0,len(orderlst)): export_object.write(orderlst[i]) except Exception: print i,filterDB[i] export_object.close() print 'Filtered rows printed to:',output_file
def matrixImport(filename):
    """Import a tab-delimited matrix and split each row's values by sample group.

    The first line is the header. When it contains ':' every sample column must be
    named 'group:sample'. Otherwise the sample-to-group assignments are read from a
    'groups.' file in the ExpressionInput directory next to 'AltResults'.

    Returns (matrix, original_data):
        matrix -- row ID -> list with one list of values per group (groups in order
            of first appearance); numeric strings become floats, the literal '0'
            becomes '' (missing value) and non-numeric strings are dropped
        original_data -- 'header' and every row ID -> the original line (newline
            terminated)

    Fix: in the groups-file branch the column indexes started at 0 although they are
    applied to the full row (column 0 is the row ID), so each sample picked up the
    value of its left neighbour. Indexes now start at 1, as in the ':' branch.
    Samples missing from the groups file are skipped instead of raising KeyError.
    """
    matrix={}
    original_data={}
    group_db={} ### group name -> column indexes into the full row
    groups=[] ### group names in order of first appearance
    headerRow=True
    try: matrix_file = open(filename,'rU')
    except ValueError:
        ### Newer Python 3 releases reject the 'U' flag (universal newlines are the default there)
        matrix_file = open(filename,'r')
    try:
        for line in matrix_file:
            original_line = line
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                headerRow = False
                original_data['header'] = original_line
                if ':' in data:
                    index=1
                    for label in values[1:]:
                        (g,s) = label.split(':')
                        group_db.setdefault(g,[]).append(index)
                        index+=1
                        if g not in groups: groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0]+'ExpressionInput'
                    sample_group_db={}
                    ### NOTE(review): if several 'groups.' files exist the last one listed is used - confirm
                    for file in unique.read_directory(search_dir):
                        if 'groups.' in file:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+file)
                    index=1 ### column 0 of every row holds the row ID
                    for s in values[1:]:
                        if s in sample_group_db:
                            g = sample_group_db[s]
                            group_db.setdefault(g,[]).append(index)
                            if g not in groups: groups.append(g)
                        index+=1
            else:
                key = values[0]
                grouped_floats=[]
                for g in groups:
                    gvalues_list=[]
                    for i in group_db[g]:
                        if values[i] != '0':
                            try: gvalues_list.append(float(values[i]))
                            except ValueError: pass ### non-numeric entries are dropped
                        else:
                            gvalues_list.append('') ### These are missing values
                    grouped_floats.append(gvalues_list)
                matrix[key] = grouped_floats
                if '\n' not in original_line: original_line+='\n'
                original_data[key] = original_line
    finally:
        matrix_file.close()
    return matrix,original_data
def importGroups(fn):
    """Import a three column (sample, group number, group name) tab-delimited file.

    Returns a dictionary of group name -> list of sample names in file order. The
    dictionary preserves group order when an OrderedDict implementation is available
    (collections or the ordereddict backport); otherwise a plain dict is used.

    Blank lines are skipped. A non-blank line that does not have exactly three
    columns raises ValueError, as before. The file is now always closed.
    """
    try: group_db=collections.OrderedDict()
    except Exception:
        try:
            import ordereddict
            group_db=ordereddict.OrderedDict()
        except Exception: group_db={}
    with open(fn,'rU') as groups_file:
        for line in groups_file:
            data = ExpressionBuilder.cleanUpLine(line)
            if not data: continue ### e.g. a trailing empty line
            sample_filename,group_number,group_name = string.split(data,'\t')
            group_db.setdefault(group_name,[]).append(sample_filename)
    return group_db
def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations( species, 'RNASeq', 'key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations( species) export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, 'RNASeq') print 'Annotations for', len(gene_location_db), 'genes imported' sorted_list = [] protein_coding = 0 for gene in gene_location_db: chr, strand, start, end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding += 1 else: biotype = 'NA' if len(chr) < 7: sorted_list.append( [chr, strand, int(start), int(end), gene, biotype]) #else: print chr;sys.exit() print len(sorted_list), 'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr, strand, start, end, gene, biotype = values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene, symbol, chr, strand, str(start), str(end), biotype] export_data.write(string.join(values, '\t') + '\n') export_data.close() print species, 'chromosome locations exported to:\n', export_path
def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species) export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq') print 'Annotations for',len(gene_location_db),'genes imported' sorted_list=[]; protein_coding=0 for gene in gene_location_db: chr,strand,start,end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding+=1 else: biotype = 'NA' if len(chr)<7: sorted_list.append([chr,strand,int(start),int(end),gene,biotype]) #else: print chr;sys.exit() print len(sorted_list),'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr,strand,start,end,gene,biotype=values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene,symbol,chr,strand,str(start),str(end),biotype] export_data.write(string.join(values,'\t')+'\n') export_data.close() print species, 'chromosome locations exported to:\n',export_path
def importTableEntries(filename, filter_db, ensembl_exon_db, gene_db, root_dir, transpose, display, showIntrons, analysisType='plot'):
    """Import exon/junction level values from a tab-delimited table and export one
    expression plot (or heatmap) per gene.

    filename -- table to read; 'exp.', 'counts.', 'stats.', 'AltResults' and '_vs_'
        in the path select how the matching groups file is located and where the
        numeric columns start
    filter_db -- rows are kept when their ID is in this collection
    ensembl_exon_db -- gene -> iterable of (index, exon data object, ID)
    gene_db -- gene -> symbol; the symbol is used in messages and output file names
    root_dir -- prefix of the exported '<symbol>-heatmap.txt' files, also passed to
        plotExonExpression
    transpose -- when True the matrices are transposed before plotting (overwritten
        in heatmap mode)
    display -- passed on to the plotting calls (overwritten in several places)
    showIntrons -- 'yes' to keep every entry rather than only those with 'E' in ExonID()
    analysisType -- a value containing 'heatmap' disables group averaging and
        clusters each gene with clustering.runHCexplicit; otherwise group means and
        standard errors are plotted with plotExonExpression

    Reads the module-level name platform and assigns the global colors.
    NOTE(review): the bare names forceNoExonExpError are not defined here; evaluating
    them raises NameError, which looks like the intended way of aborting - confirm.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db = {} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {} ### ordered from first to last exon region
    uid_gene_db = {} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes = {}
    ### Index the annotated features of every gene that should be plotted
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id] = gene

    if '_vs_' in filename: ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {} ### gene -> list of value rows (group averages unless heatmap mode)
    stdev_gene_matrix_db = {} ### gene -> list of standard error rows
    row_header_gene = {} ### gene -> row IDs in file order
    ids = {}
    x = 0 ### 0 until the header row has been parsed

    if 'heatmap' in analysisType:
        average_samples = False

    ### The last 4 characters (presumably the file extension) are dropped from the name
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        if data[0] == '#': x = 0 ### comment line: the next non-comment line is treated as the header
        elif x == 0:
            ### Header row
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(filename, 'AltResults')[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        ### alt_groups_dir is only assigned above when '_vs_' is in filename
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.', 'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.', 'groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(groups_dir, '-steady-state.txt', '.txt') ### groups is for the non-steady-state file

                try: group_index_db = collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        ### Retry with common file extensions removed from the sample name
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(item, '.CEL', '') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            ### Data row
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    ### Any failure while processing the row silently skips the row
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float, t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2), t[start:])
                            else:
                                values = map(float, t[start:])

                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                ### Standard error of the mean: stdev / sqrt(n)
                                try: st_err = statistics.stdev(group_values) / math.sqrt(len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene] = [avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x += 1

    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0 ### Only display a certain number of genes

    for last_gene in uid_list: pass ### remember the final gene of the iteration order
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        ### Re-order the imported rows to follow the annotated feature order
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass

        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            ### Export the per-gene matrix and cluster it
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            import clustering

            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            transpose = False
            graphic_links = []
            if ki > 100: transpose = True
            ### Only the final gene's heatmap is displayed
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis=False, contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### converts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palette to use
                    colors = []
                    """ k=0 while k < len(row_header): colors.append(tuple(rand(3))) k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header))) # color will now be an RGBA tuple

                plotExonExpression(fig, matrix, stdev_matrix, row_header, column_header, dataset_name, annotation_list, gene_symbol, root_dir, display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                sys.exit()
                print gene_symbol, 'failed' ### not reached: sys.exit() above raises first
        try: pylab.close()
        except Exception: pass
    if successfully_output_genes > 0:
        try: print 'Gene graphs exported to ExonPlots...'
        except Exception: pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
        forceNoExonExpError
    ### Best-effort release of the last figure
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
def remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False): global PSIFilename global outputdir global root_dir global steady_state_exp_file global species species = Species try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir + "/ExpressionInput" files = unique.read_directory(search_dir) for file in files: if "counts." in file and "steady-state.txt" not in file: countinp = search_dir + "/" + file PSIFilename = root_dir + "/AltResults/AlternativeOutput/" + species + "_RNASeq_top_alt_junctions-PSI.txt" import ExpressionBuilder dir_list = unique.read_directory(root_dir + "/ExpressionInput") for file in dir_list: if "exp." in file and "steady-state" not in file: exp_file = root_dir + "/ExpressionInput/" + file elif "exp." in file and "steady-state" in file: steady_state_exp_file = root_dir + "/ExpressionInput/" + file global sample_group_db sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file) # outputdir=findParentDir(PSIFilename)+"sashimiplots" outputdir = root_dir + "/ExonPlots" outputdir = root_dir + "/SashimiPlots" try: os.mkdir(unique.filepath(outputdir)) except Exception: pass if show: s = open(outputdir + "/show.txt", "w") s.write("TRUE") s.close() else: s = open(outputdir + "/show.txt", "w") s.write("FALSE") s.close() geneSymbol_db = Sashimiplottting(bamdir, countinp, PSIFilename, eventsToVisualizeFilename, events=events) for filename in os.listdir(outputdir): if ".pdf" in filename or ".png" in filename: fn = string.replace(filename, ".pdf", "") fn = string.replace(fn, ".png", "") newname = string.split(fn, "__") if newname[0] in geneSymbol_db: new_filename = str(filename) if "__" in filename: new_filename = string.split(filename, "__")[1] elif "\\" in filename: new_filename = string.split(filename, "\\")[1] elif "/" in filename: new_filename = string.split(filename, "/")[1] nnname = geneSymbol_db[newname[0]][0] + "-SashimiPlot_" + new_filename try: 
os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: if "already exists" in traceback.format_exc(): ### File already exists, delete the new one try: os.remove(os.path.join(outputdir, nnname)) except Exception: pass ### Now right the new one try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: pass pass else: continue print ""
def performGroupNormalization(filename,export_dir,platform): expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename) groups_dir = string.replace(export_dir,'exp.','batch.') fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False group_db = importGroups(groups_dir) export_data = export.ExportFile(export_dir) for line in open(fn,'rU').xreadlines(): data = ExpressionBuilder.cleanUpLine(line) t = string.split(data,'\t') if data[0]=='#' and row_number==0: row_number = 0 elif row_number==0: sample_list = t[1:] new_sample_list = [] for group in group_db: group_samples = group_db[group] try: sample_index_list = map(lambda x: sample_list.index(x), group_samples) group_db[group] = sample_index_list new_sample_list+=group_samples except Exception: missing=[] for x in sample_list: if x not in t[1:]: missing.append(x) print 'missing:',missing print t print sample_list print filename, groups_dir print 'Unknown Error!!! 
Skipping cluster input file build (check column and row formats for conflicts)'; forceExit title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order) export_data.write(title) row_number=1 else: gene = t[0] if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'): ### Convert to log2 RPKM values - or counts try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:]) except Exception: all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment) else: try: all_values = map(float,t[1:]) except Exception: all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment) row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs gene_log_folds = [] for group in group_db: sample_index_list = group_db[group] ### Calculate log-fold values relative to the mean of all sample expression values try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples except Exception: print len(values), sample_index_list;kill try: avg = statistics.avg(values) except Exception: values2=[] for v in values: try: values2.append(float(v)) except Exception: pass values = values2 try: avg = statistics.avg(values) except Exception: if len(values)>0: avg = values[0] else: avg = 0 try: log_folds = map(lambda x: (x-avg), values) except Exception: log_folds=[] for x in values: try: log_folds.append(x-avg) except Exception: log_folds.append('') gene_log_folds+=log_folds gene_log_folds = map(lambda x: str(x),gene_log_folds) export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n') export_data.close()
def importTableEntries(filename,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType='plot'):
    """Import exon/junction level values from a tab-delimited table and export one
    expression plot (or heatmap) per gene.

    filename -- table to read; 'exp.', 'counts.', 'stats.', 'AltResults' and '_vs_'
        in the path select how the matching groups file is located and where the
        numeric columns start
    filter_db -- rows are kept when their ID is in this collection
    ensembl_exon_db -- gene -> iterable of (index, exon data object, ID)
    gene_db -- gene -> symbol; the symbol is used in messages and output file names
    root_dir -- prefix of the exported '<symbol>-heatmap.txt' files, also passed to
        plotExonExpression
    transpose -- when True the matrices are transposed before plotting (overwritten
        in heatmap mode)
    display -- passed on to the plotting calls (overwritten in several places)
    showIntrons -- 'yes' to keep every entry rather than only those with 'E' in ExonID()
    analysisType -- a value containing 'heatmap' disables group averaging and
        clusters each gene with clustering.runHCexplicit; otherwise group means and
        standard errors are plotted with plotExonExpression

    Reads the module-level name platform and assigns the global colors.
    NOTE(review): the bare names forceNoExonExpError are not defined here; evaluating
    them raises NameError, which looks like the intended way of aborting - confirm.
    """
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db={} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list={} ### ordered from first to last exon region
    uid_gene_db={} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes={}
    ### Index the annotated features of every gene that should be plotted
    for gene in ensembl_exon_db:
        uid_list[gene]=[]
        for (index,ed,id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id]=gene

    if '_vs_' in filename: ### If one two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir+'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0]+'ExpressionInput/groups.'+findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir,'exp.','')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db={} ### gene -> list of value rows (group averages unless heatmap mode)
    stdev_gene_matrix_db={} ### gene -> list of standard error rows
    row_header_gene={} ### gene -> row IDs in file order
    ids={}
    x=0 ### 0 until the header row has been parsed

    if 'heatmap' in analysisType:
        average_samples = False

    ### The last 4 characters (presumably the file extension) are dropped from the name
    if '/' in filename:
        dataset_name = string.split(filename,'/')[-1][:-4]
    else:
        dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():
        data = line.strip()
        t = string.split(data,'\t')
        if data[0]=='#': x=0 ### comment line: the next non-comment line is treated as the header
        elif x==0:
            ### Header row
            if platform == 'RNASeq':
                removeExtension=True
            else:
                removeExtension=False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:],'',removeExtension=removeExtension)
            x=1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults=True
                    groups_dir = string.split(filename, 'AltResults')[0]+'ExpressionInput/groups.'+findFilename(filename)
                    if verifyFile(groups_dir)==False:
                        ### alt_groups_dir is only assigned above when '_vs_' is in filename
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename,'exp.','groups.')
                    else:
                        groups_dir = string.replace(filename,'counts.','groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir,'stats.','groups.')
                groups_dir = string.replace(groups_dir,'-steady-state.txt','.txt') ### groups is for the non-steady-state file

                try: group_index_db=collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed=False
                    try: sample_index = new_column_header.index(item); proceed=True
                    except Exception:
                        ### Retry with common file extensions removed from the sample name
                        try:
                            item = string.replace(item,'.bed','')
                            item = string.replace(item,'.CEL','') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item,'.cel','')
                            item = string.replace(item,'.txt','')
                            item = string.replace(item,'.TXT','')
                            item = string.replace(item,'.TAB','')
                            item = string.replace(item,'.tab','')
                            sample_index = new_column_header.index(item)
                            proceed=True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try: group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start-1:]
            row_number=1
        else:
            ### Data row
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start-1]
                if ';' in uid:
                    uid = string.split(uid,';')[0]
                ids[uid]=None
                ens_geneID = string.split(uid,':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    ### Any failure while processing the row silently skips the row
                    try:
                        if len(biotypes)==1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float,t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene]=[values]
                        else:
                            if platform == 'RNASeq' and altresults==False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x),2), t[start:])
                            else:
                                values = map(float,t[start:])

                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x-mean, values)
                            avg_ls=[]; std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                ### Standard error of the mean: stdev / sqrt(n)
                                try: st_err = statistics.stdev(group_values)/math.sqrt(len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene]=[avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene]=[std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x+=1

    global colors
    original_column_header = list(column_header)
    if len(uid_list)==0:
        print 'No genes found in the exon expression database'; forceNoExonExpError
    successfully_output_genes=0
    display_count=0 ### Only display a certain number of genes

    for last_gene in uid_list: pass ### remember the final gene of the iteration order
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list=[]
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]
        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        ### Re-order the imported rows to follow the annotated feature order
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception: print uid, i,len(matrix);sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass

        if len(new_matrix)>0:
            matrix = new_matrix
        if len(new_header)>0:
            row_header = new_header
        if 'heatmap' in analysisType:
            ### Export the per-gene matrix and cluster it
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header,'\t')+'\n')
            ki=0
            if len(annotation_list)>0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki=0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = 'average'
            export_obj.close()
            import clustering

            column_metric = 'euclidean'; column_method = 'hopach'
            color_gradient = 'red_black_sky'; transpose = False; graphic_links=[]
            if ki>100: transpose = True
            ### Only the final gene's heatmap is displayed
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis = False, contrast = 2.5)
            successfully_output_genes+=1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time()-start_time,1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### converts these to tuples
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list)>10:
                    #if display_count==5: display=False
                    display=False
                if display_count==0:
                    ### store a consistent color palette to use
                    colors=[]
                    """ k=0 while k < len(row_header): colors.append(tuple(rand(3))) k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1.*i/len(row_header))) # color will now be an RGBA tuple

                plotExonExpression(fig,matrix,stdev_matrix,row_header,column_header,dataset_name,annotation_list,gene_symbol,root_dir,display=display)
                successfully_output_genes+=1
                display_count+=1
            except Exception:
                print traceback.format_exc();sys.exit()
                print gene_symbol, 'failed' ### not reached: sys.exit() above raises first
        try: pylab.close()
        except Exception: pass
    if successfully_output_genes>0:
        #try: print 'Gene graphs exported to ExonPlots...'
        #except Exception: pass
        pass
    else:
        print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'; forceNoExonExpError
    ### Best-effort release of the last figure
    try:
        import gc
        fig.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
def matrixImport(filename):
    """Import a tab-delimited matrix and bucket each row's numeric values by sample group.

    The first line of the file is the header. When the header contains ':',
    every sample column is expected to be named ``group:sample``. Otherwise the
    sample-to-group assignments are loaded with
    ExpressionBuilder.simplerGroupImport from the ``groups.*.txt`` file(s) found
    in the ``ExpressionInput`` directory (the path text of *filename* preceding
    'AltResults' plus 'ExpressionInput'); if several files match, the last one
    read is used.

    Values equal to the string '0', values that cannot be converted to float
    and columns missing from a short row are skipped rather than stored.

    Returns a tuple ``(matrix, compared_groups, original_data)``:
        matrix          -- row ID -> list (one entry per group, in header
                           order) of lists of floats
        compared_groups -- row ID -> groups with more than one retained value
        original_data   -- 'header' and every row ID -> the raw input line
                           (data lines are newline-terminated)
    """
    matrix = {}
    compared_groups = {}  ### track which values correspond to which groups for pairwise group comparisons
    original_data = {}
    group_db = {}
    groups = []
    headerRow = True
    try:
        input_file = open(filename, 'rU')
    except ValueError:
        ### Interpreters that no longer accept the 'U' flag already use universal newlines in text mode
        input_file = open(filename, 'r')
    try:
        for line in input_file:
            original_line = line
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                group_db = {}
                groups = []
                if ':' in data:
                    ### Indexes are relative to the sample columns (UID column removed), matching
                    ### how the data rows are indexed below - hence counting starts at 0
                    for index, sample_header in enumerate(values[1:]):
                        g, s = sample_header.split(':')
                        group_db.setdefault(g, []).append(index)
                        if g not in groups:
                            groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0] + 'ExpressionInput'
                    files = unique.read_directory(search_dir)
                    for group_filename in files:
                        if 'groups.' in group_filename and '.txt' in group_filename:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir + '/' + group_filename)
                    for index, s in enumerate(values[1:]):
                        if s in sample_group_db:
                            g = sample_group_db[s]
                            group_db.setdefault(g, []).append(index)
                            if g not in groups:
                                groups.append(g)
                headerRow = False
                original_data['header'] = original_line
            else:
                key = values[0]
                values = values[1:]
                grouped_floats = []
                associated_groups = []
                for g in groups:
                    gvalues_list = []
                    for i in group_db[g]:
                        try:
                            value = values[i]
                        except IndexError:
                            continue  ### row is shorter than the header - treat as a missing value
                        if value == '0':
                            continue  ### zeros are treated as missing values
                        try:
                            gvalues_list.append(float(value))
                        except ValueError:
                            pass  ### non-numeric entries are treated as missing values
                    grouped_floats.append(gvalues_list)
                    if len(gvalues_list) > 1:
                        associated_groups.append(g)
                matrix[key] = grouped_floats
                compared_groups[key] = associated_groups
                if '\n' not in original_line:
                    original_line += '\n'
                original_data[key] = original_line
    finally:
        input_file.close()
    return matrix, compared_groups, original_data
def getAnnotations(fl,Array_type,p_threshold,e_threshold,data_source,manufacturer,constitutive_source,Species,avg_all_for_ss,filter_by_DABG,perform_alt_analysis,expression_data_format):
    """Load feature annotations, export the grouped/pairwise expression data and filter it.

    Most arguments are published as module-level globals (species, array_type,
    filter_by_dabg, dabg_p_threshold, expression_threshold, exp_data_format,
    UserOptions, root_dir, ...) before the export/filter functions of this
    module are called.

    fl -- settings object; ExpFile(), StatsFile() and RootDir() are read
          unconditionally, while FeatureNormalization(),
          excludeLowExpressionExons() and useJunctionsForGeneExpression() are
          optional (defaults 'NA', True and False when the call fails).
    e_threshold -- converted to log2 when expression_data_format == 'log'
          (0 if that conversion fails), otherwise used as a plain float.
    data_source -- not referenced in this function body.

    Returns the tuple (probeset_gene_db, annotate_db, comparison_filename_list).
    """
    global species; species = Species; global average_all_probesets; average_all_probesets={}
    global avg_all_probes_for_steady_state; avg_all_probes_for_steady_state = avg_all_for_ss; global filter_by_dabg; filter_by_dabg = filter_by_DABG
    global dabg_p_threshold; dabg_p_threshold = float(p_threshold); global root_dir; global biotypes; global normalize_feature_exp
    global expression_threshold; global exp_data_format; exp_data_format = expression_data_format; global UserOptions; UserOptions = fl
    global full_dataset_export_dir; global excludeLowExpressionExons

    ### Disabled code kept as an inert string literal
    """ try: exon_exp_threshold = fl.ExonExpThreshold() except Exception: exon_exp_threshold = 0 try: exon_rpkm_threshold = fl.ExonRPKMThreshold() except Exception: exon_rpkm_threshold = 0 try: gene_rpkm_threshold = fl.RPKMThreshold() except Exception: gene_rpkm_threshold = 0 try: gene_exp_threshold = fl.GeneExpThreshold() except Exception: gene_exp_threshold = 0 """

    ### The input expression data can be log or non-log. If non-log, transform to log in FilterDABG prior to the alternative exon analysis - v.1.16
    if expression_data_format == 'log':
        try: expression_threshold = math.log(float(e_threshold),2)
        except Exception: expression_threshold = 0 ### Applies to RNASeq datasets
    else:
        expression_threshold = float(e_threshold)

    process_from_scratch = 'no' ### internal variable used while testing
    global dabg_summary; global expression_summary; dabg_summary={};expression_summary={}
    global fulldataset_export_object; global array_type; array_type = Array_type
    global exp_analysis_type; exp_analysis_type = 'expression'
    global stats_input_dir
    expr_input_dir = fl.ExpFile(); stats_input_dir = fl.StatsFile(); root_dir = fl.RootDir()
    ### Optional settings: fall back to a default when the settings object does not provide them
    try: normalize_feature_exp = fl.FeatureNormalization()
    except Exception: normalize_feature_exp = 'NA'
    try: excludeLowExpressionExons = fl.excludeLowExpressionExons()
    except Exception: excludeLowExpressionExons = True
    try:
        useJunctionsForGeneExpression = fl.useJunctionsForGeneExpression()
        if useJunctionsForGeneExpression:
            print 'Using known junction only to estimate gene expression!!!'
    except Exception: useJunctionsForGeneExpression = False

    source_biotype = 'mRNA'
    if array_type == 'gene': source_biotype = 'gene'
    elif array_type == 'junction': source_biotype = 'junction'
    ### Get annotations using Affymetrix as a trusted source or via links to Ensembl
    ### NOTE(review): when neither branch below runs, probeset_db/annotate_db/constitutive_gene_db are never
    ### assigned and the later references would fail - confirm callers only pass supported combinations
    if array_type == 'AltMouse':
        probeset_db,constitutive_gene_db = ExpressionBuilder.importAltMerge('full'); annotate_db={}
        source_biotype = 'AltMouse'
    elif manufacturer == 'Affymetrix' or array_type == 'RNASeq':
        if array_type == 'RNASeq':
            source_biotype = array_type, root_dir ### a (array_type, root_dir) tuple is passed for RNASeq
        probeset_db,annotate_db,constitutive_gene_db,splicing_analysis_db = ExonArrayEnsemblRules.getAnnotations(process_from_scratch,constitutive_source,source_biotype,species)

    ### Get all file locations and get array headers
    #print len(splicing_analysis_db),"genes included in the splicing annotation database (constitutive only containing)"
    stats_file_status = verifyFile(stats_input_dir)
    array_linker_db,array_names = importExonProbesetData(expr_input_dir,{},'arraynames')
    input_dir_split = string.split(expr_input_dir,'/')
    ### The export path is built for ExonArray and the platform folder is then swapped for other array types
    full_dataset_export_dir = root_dir+'AltExpression/FullDatasets/ExonArray/'+species+'/'+string.replace(input_dir_split[-1],'exp.','')
    if array_type == 'gene': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','GeneArray')
    if array_type == 'junction': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','JunctionArray')
    if array_type == 'AltMouse': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','AltMouse')
    if array_type == 'RNASeq': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','RNASeq')
    try: fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
    except Exception:
        print 'AltAnalyze is having trouble creating the directory:\n',full_dataset_export_dir
        ### force_exception is a bare name; presumably undefined so evaluating it aborts with a NameError - confirm
        print 'Report this issue to the AltAnalyze help desk or create this directory manually (Error Code X1).'; force_exception

    ### Organize arrays according to groups and export all probeset data and any pairwise comparisons
    data_type = 'expression'
    if array_type == 'RNASeq':
        expr_input_dir = string.replace(expr_input_dir,'exp.','counts.') ### Filter based on the counts file and then replace values with the normalized as the last step
    comparison_filename_list,biotypes = exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)

    ### When junctions are used for gene expression and junction data is present, drop the exon biotype
    if useJunctionsForGeneExpression:
        if 'junction' in biotypes:
            if 'exon' in biotypes: del biotypes['exon']
    if filter_by_dabg == 'yes' and stats_file_status == 'found':
        data_type = 'dabg'
        exportGroupedComparisonProbesetData(stats_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)

    ### Filter expression data based on DABG and annotation filtered probesets (will work without DABG filtering as well) - won't work for RNA-Seq (execute function later)
    filtered_exon_db = removeNonExpressedProbesets(probeset_db,full_dataset_export_dir)

    filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db,'expression',array_names,perform_alt_analysis)
    constitutive_gene_db={}; probeset_gene_db = makeGeneLevelAnnotations(probeset_db)

    if array_type == 'RNASeq':
        ### The export object is re-created before each of the two passes below
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        data_type = 'expression' ### Repeat with counts and then with exp. to add gene-level estimates to both
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        expr_input_dir = string.replace(expr_input_dir,'counts.','exp.')
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)

    ### Best-effort clean-up: splicing_analysis_db is not assigned on the AltMouse path, so failures are ignored
    try: clearObjectsFromMemory(average_all_probesets); clearObjectsFromMemory(expression_summary); clearObjectsFromMemory(splicing_analysis_db)
    except Exception: null=[]
    filtered_exon_db=[]; probeset_db={}; average_all_probesets={}; expression_summary={}; splicing_analysis_db={}
    #filtered_exp_db,group_count,ranked_array_headers = filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db)
    #filtered_gene_db = permformFtests(filtered_exp_db,group_count,probeset_db)

    ### Disabled memory-debugging code kept as an inert string literal
    """ pre_filtered_db=[] print 'global vars' returnLargeGlobalVars() print 'local vars' all = [var for var in locals() if (var[:2], var[-2:]) != ("__", "__")] for var in all: try: if len(locals()[var])>500: print var, len(locals()[var]) except Exception: null=[] """
    return probeset_gene_db, annotate_db, comparison_filename_list
def exportGroupedComparisonProbesetData(filename,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis):
    """Organize the raw expression data into sorted groups and export it for all
    conditions and pairwise comparisons, recording which probesets have groups
    that meet the user defined dabg and expression thresholds.

    filename  -- input file; the groups and comps file names are derived from it
                 by replacing the 'residuals.', 'exp.'/'counts.' or 'stats.' prefix
                 text according to data_type
    data_type -- 'residuals', 'expression' or 'dabg' ('expression' is switched to
                 'counts' internally when 'counts.' occurs in filename)

    Sets the module globals expr_group_list, comp_group_list, expr_group_db and
    comparision_export_db, and reads the globals array_type, root_dir, species,
    normalize_feature_exp and fulldataset_export_object.

    Returns the tuple (comparison_filename_list, biotypes), where biotypes is the
    value returned by importExonProbesetData for filename.
    """
    #comparison_filename_list=[]
    #if perform_alt_analysis != 'expression': ### User Option (removed in version 2.0 since the option prevented proper filtering)
    comparison_filename_list=[]
    probeset_dbase={}; exp_dbase={}; constitutive_gene_db={}; probeset_gene_db={} ### reset databases to conserve memory
    global expr_group_list; global comp_group_list; global expr_group_db

    ### Derive the group and comparison file locations from the input file name
    if data_type == 'residuals':
        expr_group_dir = string.replace(filename,'residuals.','groups.')
        comp_group_dir = string.replace(filename,'residuals.','comps.')
    elif data_type == 'expression':
        expr_group_dir = string.replace(filename,'exp.','groups.')
        comp_group_dir = string.replace(filename,'exp.','comps.')
        if 'counts.' in filename:
            expr_group_dir = string.replace(expr_group_dir,'counts.','groups.')
            comp_group_dir = string.replace(comp_group_dir,'counts.','comps.')
            data_type = 'counts'
    elif data_type == 'dabg':
        expr_group_dir = string.replace(filename,'stats.','groups.')
        comp_group_dir = string.replace(filename,'stats.','comps.')

    comp_group_list, comp_group_list2 = ExpressionBuilder.importComparisonGroups(comp_group_dir)
    expr_group_list,expr_group_db = ExpressionBuilder.importArrayGroups(expr_group_dir,array_linker_db)

    print "Reorganizing expression data into comparison groups for export to down-stream splicing analysis software"
    ### Do this only for the header data
    group_count,raw_data_comp_headers = reorder_arrays.reorderArrayHeaders(array_names,expr_group_list,comp_group_list,array_linker_db)

    ### Export the header info and store the export write data for reorder_arrays
    global comparision_export_db; comparision_export_db={}; array_type_name = 'Exon'
    if array_type == 'junction': array_type_name = 'Junction'
    elif array_type == 'RNASeq': array_type_name = 'RNASeq'
    if data_type != 'residuals': AltAnalzye_input_dir = root_dir+"AltExpression/pre-filtered/"+data_type+'/'
    else: AltAnalzye_input_dir = root_dir+"AltExpression/FIRMA/residuals/"+array_type+'/'+species+'/' ### These files do not need to be filtered until AltAnalyze.py

    for comparison in comp_group_list2: ### loop through the list of comparisons
        group1 = comparison[0]; group2 = comparison[1]
        group1_name = expr_group_db[group1]; group2_name = expr_group_db[group2]
        comparison_filename = species+'_'+array_type_name+'_'+ group1_name + '_vs_' + group2_name + '.txt'
        new_file = AltAnalzye_input_dir + comparison_filename; comparison_filename_list.append(comparison_filename)
        data = export.createExportFile(new_file,AltAnalzye_input_dir[:-1])
        ### NOTE: this rebinds the array_names parameter to the headers of the current comparison
        try: array_names = raw_data_comp_headers[comparison]
        ### kill is a bare name; presumably undefined so evaluating it aborts with a NameError - confirm
        except KeyError: print raw_data_comp_headers;kill
        title = ['UID']+array_names; title = string.join(title,'\t')+'\n'; data.write(title)
        comparision_export_db[comparison] = data ### store the export file write data so we can write after organizing
    #print filename, normalize_feature_exp
    biotypes = importExonProbesetData(filename,probeset_db,'reorderFilterAndExportAll')

    if normalize_feature_exp == 'RPKM': ### Add the gene-level RPKM data (this is in addition to the counts. file)
        exp_gene_db={}
        ### Key the gene-level database on the first element of every probeset_db entry
        for i in probeset_db: exp_gene_db[probeset_db[i][0]]=[]
        filename = string.replace(filename,'.txt','-steady-state.txt')
        #print filename, normalize_feature_exp, 'here'
        importExonProbesetData(filename,exp_gene_db,'reorderFilterAndExportAll')

    ### Close every per-comparison export file opened above
    for comparison in comparision_export_db:
        data = comparision_export_db[comparison]; data.close()
    print "Pairwise comparisons for AltAnalyze exported..."
    ### Best-effort close of the full dataset export object; any failure is ignored
    try: fulldataset_export_object.close()
    except Exception: null=[]
    return comparison_filename_list, biotypes
eo.close() if __name__ == '__main__': ################ Comand-line arguments ################ import getopt CLIP_dir = None species = 'Hs' """ Usage: bedtools intersect -wb -a /Clip_merged_reproducible_ENCODE/K562/AARS-human.bed -b /annotations/combined/hg19_annotations-full.bed > /test.bed """ if len( sys.argv[1:] ) <= 1: ### Indicates that there are insufficient number of command-line arguments print 'WARNING!!!! Too commands supplied.' else: options, remainder = getopt.getopt(sys.argv[1:], '', ['species=', 'clip=']) #print sys.argv[1:] for opt, arg in options: if opt == '--species': species = arg elif opt == '--clip': CLIP_dir = arg import ExpressionBuilder coding_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species) dataset_peaks = eCLIPimport(CLIP_dir)
def sashmi_plot_list(bamdir, eventsToVisualizeFilename, PSIFilename, events=None): try: import gene_associations gene_to_symbol = gene_associations.getGeneToUid( species, ('hide', 'Ensembl-Symbol')) from import_scripts import OBO_import symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene = {} if events == None: splicing_events, expandedSearch = importSplicingEventsToVisualize( eventsToVisualizeFilename) else: ### Replace any ":" from the input events #for i in range(len(events)): events[i] = string.replace(events[i],':','__') expandedSearch = True for i in range(len(events)): gene = string.split(events[i], '__')[0] if gene in gene_to_symbol: symbol = gene_to_symbol[gene][0] elif 'ENS' not in gene or 'G0000' in gene: if gene in symbol_to_gene: ensID = symbol_to_gene[gene][0] symbol = gene events[ i] = ensID ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup splicing_events = events ### optionally get from supplied variable if len(splicing_events) == 0: print eventsToVisualizeFilename forceNoCompatibleEventsInFile print 'Exporting plots', ### Determine Groups for Coloring groups_file = 'None' dir_list = unique.read_directory(root_dir + '/ExpressionInput') for file in dir_list: if 'groups.' 
in file: groups_file = root_dir + '/ExpressionInput/' + file if groups_file != None: try: import ExpressionBuilder sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file) groups = [] for sample in sample_group_db: if sample_group_db[sample] not in groups: groups.append(sample_group_db[sample] ) ### create an ordered list of unique group except Exception: groups = ['None'] #print traceback.format_exc() pass processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, splicing_events, sample_group_db, groups, False) mopup_events = getMopUpEvents(splicing_events, processed_events) ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide #print len(splicing_events),len(processed_events),len(mopup_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups, expandedSearch) if len(processed_events) > 0: mopup_events = getMopUpEvents(mopup_events, processed_events) processed_events = formatAndSubmitSplicingEventsToSashimiPlot( PSIFilename, bamdir, mopup_events, sample_group_db, groups, True) return gene_to_symbol
def remoteSashimiPlot(Species,fl,bamdir,eventsToVisualizeFilename,events=None,show=False): global PSIFilename global outputdir global root_dir global steady_state_exp_file global species species = Species try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir+'/ExpressionInput' files = unique.read_directory(search_dir) for file in files: if 'counts.' in file and 'steady-state.txt' not in file: countinp = search_dir+'/'+file PSIFilename = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt' import ExpressionBuilder dir_list = unique.read_directory(root_dir+'/ExpressionInput') for file in dir_list: if 'exp.' in file and 'steady-state' not in file: exp_file = root_dir+'/ExpressionInput/'+file elif 'exp.' in file and 'steady-state' in file: steady_state_exp_file = root_dir+'/ExpressionInput/'+file global sample_group_db sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file) #outputdir=findParentDir(PSIFilename)+"sashimiplots" outputdir = root_dir+'/ExonPlots' outputdir = root_dir+'/SashimiPlots' try: os.mkdir(unique.filepath(outputdir)) except Exception: pass if show: s = open(outputdir+'/show.txt','w') s.write('TRUE'); s.close() else: s = open(outputdir+'/show.txt','w') s.write('FALSE'); s.close() geneSymbol_db=Sashimiplottting(bamdir,countinp,PSIFilename,eventsToVisualizeFilename,events=events) for filename in os.listdir(outputdir): if '.pdf' in filename or '.png' in filename: fn = string.replace(filename,'.pdf','') fn = string.replace(fn,'.png','') newname=string.split(fn,'__') if newname[0] in geneSymbol_db: new_filename = str(filename) if '__' in filename: new_filename = string.split(filename,'__')[1] elif '\\' in filename: new_filename = string.split(filename,'\\')[1] elif '/' in filename: new_filename = string.split(filename,'/')[1] nnname=geneSymbol_db[newname[0]][0]+'-SashimiPlot_'+new_filename try: os.rename(os.path.join(outputdir, filename), 
os.path.join(outputdir,nnname)) except Exception: if 'already exists' in traceback.format_exc(): ### File already exists, delete the new one try: os.remove(os.path.join(outputdir,nnname)) except Exception: pass ### Now right the new one try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir,nnname)) except Exception: pass pass else: continue print ''
def remoteSashimiPlot(Species, fl, bamdir, eventsToVisualizeFilename, events=None, show=False): global PSIFilename global outputdir global root_dir global steady_state_exp_file global species species = Species try: countinp = fl.CountsFile() root_dir = fl.RootDir() except Exception: root_dir = fl search_dir = root_dir + '/ExpressionInput' files = unique.read_directory(search_dir) for file in files: if 'counts.' in file and 'steady-state.txt' not in file: countinp = search_dir + '/' + file PSIFilename = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt' import ExpressionBuilder dir_list = unique.read_directory(root_dir + '/ExpressionInput') for file in dir_list: if 'exp.' in file and 'steady-state' not in file: exp_file = root_dir + '/ExpressionInput/' + file elif 'exp.' in file and 'steady-state' in file: steady_state_exp_file = root_dir + '/ExpressionInput/' + file global sample_group_db sample_group_db = ExpressionBuilder.simplerGroupImport(exp_file) #outputdir=findParentDir(PSIFilename)+"sashimiplots" outputdir = root_dir + '/ExonPlots' outputdir = root_dir + '/SashimiPlots' try: os.mkdir(unique.filepath(outputdir)) except Exception: pass if show: s = open(outputdir + '/show.txt', 'w') s.write('TRUE') s.close() else: s = open(outputdir + '/show.txt', 'w') s.write('FALSE') s.close() geneSymbol_db = Sashimiplottting(bamdir, countinp, PSIFilename, eventsToVisualizeFilename, events=events) for filename in os.listdir(outputdir): if '.pdf' in filename or '.png' in filename: fn = string.replace(filename, '.pdf', '') fn = string.replace(fn, '.png', '') newname = string.split(fn, '__') if newname[0] in geneSymbol_db: new_filename = str(filename) if '__' in filename: new_filename = string.split(filename, '__')[1] elif '\\' in filename: new_filename = string.split(filename, '\\')[1] elif '/' in filename: new_filename = string.split(filename, '/')[1] nnname = geneSymbol_db[ newname[0]][0] + '-SashimiPlot_' + new_filename try: 
os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: if 'already exists' in traceback.format_exc(): ### File already exists, delete the new one try: os.remove(os.path.join(outputdir, nnname)) except Exception: pass ### Now right the new one try: os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname)) except Exception: pass pass else: continue print ''
def matrixImport(filename):
    """Import a tab-delimited matrix and bucket each row's numeric values by sample group.

    The first line of the file is the header. When the header contains ':',
    every sample column is expected to be named ``group:sample``. Otherwise the
    sample-to-group assignments are loaded with
    ExpressionBuilder.simplerGroupImport from the ``groups.*.txt`` file(s) found
    in the ``ExpressionInput`` directory (the path text of *filename* preceding
    'AltResults' plus 'ExpressionInput'); if several files match, the last one
    read is used.

    Values equal to the string '0', values that cannot be converted to float
    and columns missing from a short row are skipped rather than stored.

    Returns a tuple ``(matrix, compared_groups, original_data)``:
        matrix          -- row ID -> list (one entry per group, in header
                           order) of lists of floats
        compared_groups -- row ID -> groups with more than one retained value
        original_data   -- 'header' and every row ID -> the raw input line
                           (data lines are newline-terminated)
    """
    matrix = {}
    compared_groups = {}  ### track which values correspond to which groups for pairwise group comparisons
    original_data = {}
    group_db = {}
    groups = []
    headerRow = True
    try:
        input_file = open(filename, 'rU')
    except ValueError:
        ### Interpreters that no longer accept the 'U' flag already use universal newlines in text mode
        input_file = open(filename, 'r')
    try:
        for line in input_file:
            original_line = line
            data = line.rstrip()
            values = data.split('\t')
            if headerRow:
                group_db = {}
                groups = []
                if ':' in data:
                    ### Indexes are relative to the sample columns (UID column removed), matching
                    ### how the data rows are indexed below - hence counting starts at 0
                    for index, sample_header in enumerate(values[1:]):
                        g, s = sample_header.split(':')
                        group_db.setdefault(g, []).append(index)
                        if g not in groups:
                            groups.append(g)
                else:
                    import ExpressionBuilder
                    search_dir = filename.split('AltResults')[0] + 'ExpressionInput'
                    files = unique.read_directory(search_dir)
                    for group_filename in files:
                        if 'groups.' in group_filename and '.txt' in group_filename:
                            sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir + '/' + group_filename)
                    for index, s in enumerate(values[1:]):
                        if s in sample_group_db:
                            g = sample_group_db[s]
                            group_db.setdefault(g, []).append(index)
                            if g not in groups:
                                groups.append(g)
                headerRow = False
                original_data['header'] = original_line
            else:
                key = values[0]
                values = values[1:]
                grouped_floats = []
                associated_groups = []
                for g in groups:
                    gvalues_list = []
                    for i in group_db[g]:
                        try:
                            value = values[i]
                        except IndexError:
                            continue  ### row is shorter than the header - treat as a missing value
                        if value == '0':
                            continue  ### zeros are treated as missing values
                        try:
                            gvalues_list.append(float(value))
                        except ValueError:
                            pass  ### non-numeric entries are treated as missing values
                    grouped_floats.append(gvalues_list)
                    if len(gvalues_list) > 1:
                        associated_groups.append(g)
                matrix[key] = grouped_floats
                compared_groups[key] = associated_groups
                if '\n' not in original_line:
                    original_line += '\n'
                original_data[key] = original_line
    finally:
        input_file.close()
    return matrix, compared_groups, original_data