def parse_idx(idx_summary, bam_regex):
    '''
    builds a table with the number of mapped reads per sample:
    rows = samples, columns = 'sample' (name) and 'mapped' (summed count)
    idx_summary: tab-separated summary file of idxstats runs on bam files (first line = header)
    bam_regex: regular expression searched in the bam file name (column 4), group 1 -> sample name
    returns the resulting Table, rows sorted by sample name
    '''
    idx_rows = tf.readTable(idx_summary, sep='\t', header=True)

    # sample name -> sum of the counts in column 2 over all rows of that sample
    mapped_per_sample = defaultdict(int)
    for row_pos in range(idx_rows.rowNum()):
        hit = re.search(bam_regex, idx_rows.get(row_pos, 4))
        if not hit:
            # file name does not belong to any sample -> row is ignored
            continue
        mapped_per_sample[hit.group(1)] += int(idx_rows.get(row_pos, 2))

    # copy the collected sums into the result table
    result = Table()
    result.addColumn(str, 'sample', None)
    result.addColumn(int, 'mapped', 0)
    for sample_name in sorted(mapped_per_sample):
        result.addRow([sample_name, mapped_per_sample[sample_name]])
    return result
def parse_chrom_file(chrom_file, idx_summary, bam_regex):
    '''
    builds a table with mapped read counts per chromosome group:
    rows = samples, columns = 'sample' followed by one count column per group
    chrom_file: tab-separated table with chromosome name (col 0) and organism/group name (col 1),
                header line starts with '#'
    idx_summary: tab-separated summary file of idxstats runs on bam files (first line = header)
    bam_regex: regular expression searched in the bam file name (column 4), group 1 -> sample name
    returns the resulting Table, rows sorted by sample name, group columns sorted by group name
    '''
    # chromosome name -> group name
    chrom_table = tf.readTable(chrom_file, sep='\t', header=True, headerstart='#')
    group_of_chrom = {
        chrom_table.get(row_pos, 0): chrom_table.get(row_pos, 1)
        for row_pos in range(chrom_table.rowNum())
    }

    # sample name -> (group name -> summed counts of column 2)
    per_sample = defaultdict(lambda: defaultdict(int))
    idx_rows = tf.readTable(idx_summary, sep='\t', header=True)
    for row_pos in range(idx_rows.rowNum()):
        hit = re.search(bam_regex, idx_rows.get(row_pos, 4))
        chrom = idx_rows.get(row_pos, 0)
        # only rows of known samples and of chromosomes with an assigned group are counted
        if not hit or chrom not in group_of_chrom:
            continue
        per_sample[hit.group(1)][group_of_chrom[chrom]] += int(idx_rows.get(row_pos, 2))

    # one count column per distinct group
    group_names = sorted(set(group_of_chrom.values()))
    result = Table()
    result.addColumn(str, 'sample', None)
    for group in group_names:
        result.addColumn(int, group, 0)

    # groups without any read for a sample show up as 0
    for sample_name in sorted(per_sample):
        group_counts = per_sample[sample_name]
        result.addRow([sample_name] + [group_counts[group] for group in group_names])
    return result
def prep_go_enrichment_significant(infile, outfile, alpha=None, resultColumn='ID', fc_direction='both', generateBackground=False):
    '''
    generates one or more input files for gene enrichment analysis given the results of edgeR from DETest module
    'infile': results from edgeR (results for all genes or only significant genes), tab-separated with header,
              must contain the columns 'log2FC', 'adj.PValue' and all columns named in 'resultColumn'
    'outfile': prefix for the input files for GO enrichment (one gene per line) created by this function,
              files are named <outfile>_<fc_direction>_<resultColumn>.txt
    'resultColumn': one column name or a list of column names -> for each column name, a separate file is generated
    'alpha': significance level, if set to None all genes of 'infile' are used (e.g. input= edgeR.significant.csv)
             if a value is set for alpha, all genes with adj.PValue < alpha are selected
    'fc_direction': a single value or a list of values that can take 3 values:
            'both' (no filtering based on fold change),
            'up': take only genes with positive fold change
            'down': take only genes with negative fold change
            -> for each fc_direction, a separate file is generated
    'generateBackground': if True, additionally writes <outfile>_background_<resultColumn>.txt with the
            genes of the complete 'infile' (before filtering by alpha), requires alpha to be set
    raises ValueError if a background is requested without alpha or if an unknown fc_direction is given;
            both checks run before any file is read or written
    '''
    # generate background -> whole edgeR output as input, requires to select significant genes
    if generateBackground is True and alpha is None:
        raise ValueError('Generate Background is only possible if a cutoff is set for the pvalue (alpha not None)!')

    # accept single values as well as lists
    if not isinstance(resultColumn, list):
        resultColumn = [resultColumn]
    if not isinstance(fc_direction, list):
        fc_direction = [fc_direction]

    # reject unknown modes up front, otherwise a typo in a later list entry would
    # only be noticed after the files of the earlier entries were already written
    for fcdir in fc_direction:
        if fcdir not in ('up', 'down', 'both'):
            raise ValueError('Unknown fc_direction mode: '+str(fcdir))

    # read in the results from edgeR (either edgeR.significant.csv or edgeR.all.csv)
    tab = tf.readTable(infile, sep='\t', header=True, colsToRead=['log2FC', 'adj.PValue']+resultColumn)

    # background for enrichment analysis = all genes of the input, written before any filtering
    if generateBackground is True:
        for wcol in resultColumn:
            tf.writeTable(tab, outfile+'_background_'+wcol+'.txt', sep='\t', header=False, colsToWrite=[wcol])

    # select significant genes if required
    if alpha is not None:
        tab = tf.selectRows(tab, lambda t, r: float(t.get(r, 'adj.PValue')) < alpha)

    for fcdir in fc_direction:
        # select genes with the required fold change direction
        if fcdir == 'up':
            res = tf.selectRows(tab, lambda t, r: float(t.get(r, 'log2FC')) > 0)
        elif fcdir == 'down':
            res = tf.selectRows(tab, lambda t, r: float(t.get(r, 'log2FC')) < 0)
        else:
            # 'both' (validated above): no filtering based on fold change
            res = tab
        # write one output file for each desired gene identifier column
        for wcol in resultColumn:
            tf.writeTable(res, outfile+'_'+fcdir+'_'+wcol+'.txt', sep='\t', header=False, colsToWrite=[wcol])
def gsea_set_scatter(geneset_file, edgeRcomplete_table, outfolder, genesetname=None, testpairs=None, alpha=0.01):
    '''
    creates fold change scatter plots restricted to the genes of selected gene sets
    'geneset_file': file downloaded from msigdb with the gene sets used for GSEA analysis,
                    one set per line, tab-separated: set name (col 0), one skipped column (col 1), gene names (col 2..)
    'edgeRcomplete_table': table with log2 fold changes of all edgeR runs (table object or path to a
                    tab-separated file with header), must contain the column 'name'
    'outfolder': folder for storing the scatter plots, one file <set name>.svg per gene set
    'genesetname': if None, all gene sets from 'geneset_file' are analyzed
                   otherwise genesetname is the name of the set to analyze or a list of set names
                   (names are compared for equality, a single name is never treated as a substring pattern)
    'testpairs': if None, an overview of the fold change columns of 'edgeRcomplete_table' is plotted
                 otherwise testpairs lists all fold change pairs (DE test names) to plot
    'alpha': significance level handed over to the plotting functions
    returns list with the return values of the plotting functions (one per plotted gene set),
            order corresponds to the order given in 'geneset_file'
    blank lines of 'geneset_file' are skipped
    '''
    # import only used for this function (but not required watchdog module)
    import utils.fc_scatter as fcs

    if isinstance(edgeRcomplete_table, str):
        edgeRcomplete_table = tf.readTable(edgeRcomplete_table, header=True, sep='\t')

    # a single set name is wrapped into a list: testing 'name in <str>' would be a substring
    # test and would also select sets whose name is only a part of the requested name
    if isinstance(genesetname, str):
        selected_sets = [genesetname]
    else:
        selected_sets = genesetname

    # list of generated figures
    fig_list = []
    with open(geneset_file, 'rt') as gsreader:
        for line in gsreader:
            line = line.strip('\n')
            if not line:
                # blank line -> no gene set to plot
                continue
            content = line.split('\t')
            cursetname = content[0]
            if selected_sets is not None and cursetname not in selected_sets:
                continue

            # gene names are brought into the form 'Abcd' before they are compared to column 'name'
            targets = set(x.lower().capitalize() for x in content[2:])
            print('Plotting '+cursetname)
            print('gene set: '+str(len(targets)))

            # reduce fold changes to gene set
            plot_table = tf.selectRows(edgeRcomplete_table, lambda t, r: t.get(r, 'name') in targets)

            # create scatter plot
            plotfile = os.path.join(outfolder, cursetname+'.svg')
            if testpairs is not None:
                f = fcs.multiple_fc_scatter_plots(plot_table, testpairs, plotfile, colormode='all', alpha=alpha, adjustP=False, title=cursetname, add_dot_counts=True)
            else:
                f = fcs.fc_scatter_overview(plot_table, plotfile, samples=None, allpairs=False, colormode='all', alpha=alpha, add_dot_counts=True)
            fig_list.append(f)
    return fig_list
def create_gsea_from_tsv(inFile, outFile, hasHeader, gene_pos, rank_pos):
    '''
    converts the results of a gene expression analysis into an input file for GSEA preranked
    'inFile': tab-separated table with fold changes for all genes
    'outFile': file (*.rnk) that receives the two columns gene name and ranking value, written without header
    'hasHeader': boolean that indicates if the first line of 'inFile' is a table header
    'gene_pos': 0-based position of the column with the gene names
    'rank_pos': 0-based position of the column with the fold change or other values to rank the genes
    '''
    def to_upper(gene_name):
        # upper case gene names -> match GSEA database
        return gene_name.upper()

    def as_number(rank_value):
        # ranking values are compared numerically
        return float(rank_value)

    # keep only gene name (column 0 of the new table) and ranking value (column 1)
    ranked = tf.readTable(inFile, sep='\t', header=hasHeader, colsToRead=[gene_pos, rank_pos])
    ranked.modifyColumn(0, to_upper)
    # NOTE(review): the [False] flag presumably selects descending order -- confirm against Table.sortRows
    ranked.sortRows([1], [as_number], [False])
    tf.writeTable(ranked, outFile, sep='\t', header=False)
def parse_fastqc(fastqc_summary, raw_regex, trim_regex):
    '''
    builds a table with read counts before and after trimming:
    rows = samples, columns = 'sample', 'raw' and 'trimmed'
    fastqc_summary: tab-separated summary file of fastqc (first line = header),
                    col 0 = name of the statistic, col 1 = value, col 2 = fastq file name
    raw_regex: regular expression to match raw fastq files, group 1 -> sample name
    trim_regex: regular expression to match trimmed fastq files, group 1 -> sample name
    returns the resulting Table, rows sorted by sample name
    '''
    RAW, TRIMMED = 0, 1

    qc_rows = tf.readTable(fastqc_summary, sep='\t', header=True)

    # sample name -> [raw count, trimmed count]
    totals = defaultdict(lambda: [0, 0])
    for pos in range(qc_rows.rowNum()):
        # only the 'Total Sequences' statistic carries the read count
        if qc_rows.get(pos, 0) != 'Total Sequences':
            continue
        source_file = qc_rows.get(pos, 2)
        n_reads = int(qc_rows.get(pos, 1))
        raw_hit = re.search(raw_regex, source_file)
        trim_hit = re.search(trim_regex, source_file)
        # a file name matching both expressions is counted as raw
        if raw_hit:
            totals[raw_hit.group(1)][RAW] += n_reads
        elif trim_hit:
            totals[trim_hit.group(1)][TRIMMED] += n_reads

    # copy the collected counts into the result table
    result = Table()
    result.addColumn(str, 'sample', None)
    result.addColumn(int, 'raw', 0)
    result.addColumn(int, 'trimmed', 0)
    for sample_name in sorted(totals):
        raw_total, trimmed_total = totals[sample_name]
        result.addRow([sample_name, raw_total, trimmed_total])
    return result
def plot_gsea_summary(summary_table, plotout, title, overview=False, nes_range=(-2,2), offset_left=4, offset_bottom=1.5, plot_top=None):
    '''
    visualization of nes and fdr values for a set of GSEA runs
    'summary_table': table produced by 'generate_gsea_overview_table' giving gene sets with their nes and fdr
                     in different runs of GSEA (tab-separated, header, col 0 = gene set name,
                     columns 'NES_<run>' and 'FDR_<run>' for each run)
    'plotout': file for saving the plot
    'title': name of the plot
    'overview': flag for visualizing many gene sets, if set to True it does not label the gene sets
                and does not write pvalues into the cells
    'nes_range': range for colorcoding the normalized enrichment scores
                 -> nes_range[0] = blue, nes_range[1] = red, white around the center of the range
    'offset_left': width of left part of the figure for the gene set names (given in inches),
                   default: 4 inches, only used if overview=False (otherwise replaced by 0.2)
    'offset_bottom': height of the bottom part of the figure for the run names (given in inches)
    'plot_top': if not None, only the first 'plot_top' rows of the summary table are plotted
    plot structure: rows = gene sets, columns = GSEA runs, color of a cell = NES of the gene set in the GSEA run,
                    cells with FDR < 0.05 are marked with a star
    '''
    # read table with plotting data
    tab = tf.readTable(summary_table, sep='\t', header=True)
    if plot_top is not None:
        tab = tf.selectRows(tab, lambda _, r: r < plot_top)
    n_rows = tab.rowNum()

    # find column positions with NES scores
    nescols = [c for c in range(tab.colNum()) if tab.getColumnName(c).startswith('NES')]
    n_runs = len(nescols)
    # names of GSEA runs = suffixes of NES columns
    names = ['_'.join(tab.getColumnName(c).split('_')[1:]) for c in nescols]
    # FDR column belonging to each NES column: only the leading 'NES_' is exchanged,
    # a run name that contains 'NES_' itself must stay untouched
    fdrcols = [re.sub('NES_', 'FDR_', tab.getColumnName(c), count=1) for c in nescols]
    # names of gene sets
    gene_sets = tab.getColumn(0)

    # extract plotting data from the table: normalized enrichment scores and adjusted pvalues
    plotarray = np.zeros((n_rows, n_runs))
    pvalarray = np.zeros((n_rows, n_runs))
    for rowInd in range(n_rows):
        # plotarray[0] -> row plotted at the bottom, plotarray[n_rows-1] -> row plotted at the top
        plot_row = n_rows - 1 - rowInd
        for arraypos, colInd in enumerate(nescols):
            plotarray[plot_row][arraypos] = float(tab.get(rowInd, colInd))
            pvalarray[plot_row][arraypos] = float(tab.get(rowInd, fdrcols[arraypos]))

    # set up figure size (all values in inches)
    # space for the columns (fixed width per column)
    map_width = n_runs * 1
    # space for figure title
    offset_top = 0.5
    # space for colorbar at the right
    offset_right = 1.2
    if overview is False:
        # annotated gene sets -> height depends on the number of rows
        height = n_rows * 0.25
    else:
        # gene sets not annotated -> fixed height, only small margin at left hand side
        height = 10
        offset_left = 0.2
    height = height + offset_bottom + offset_top
    width = map_width + offset_left + offset_right

    # define figure and plotting area
    f = plt.figure(figsize=(width, height))
    # position of plot area (without axis labels and color legend): left, bottom, width, height as fraction of figure size
    f.add_axes([offset_left/width, offset_bottom/height, map_width/width, (height-offset_top-offset_bottom)/height])

    # add column names: one tick per GSEA run, centered in its column
    plt.xticks(np.arange(0.5, n_runs, 1), names, fontsize=10, rotation=90)
    plt.xlabel('Samples', fontsize=12)
    # add row names: gene sets in inverted order -> first element of the table is plotted at the top
    if overview is False:
        plt.yticks(np.arange(n_rows-0.5, 0, -1), gene_sets, fontsize=10)
    else:
        # booleans are required here, the string 'off' does not hide the ticks in current matplotlib
        plt.tick_params(axis='y', left=False, labelleft=False, which='both')

    # set up color map: transition blue (down reg) -> white -> red (up reg)
    colors = [(0, (5/255, 113/255, 176/255)), (0.375, (1, 1, 1)), (0.625, (1, 1, 1)), (1, (202/255, 0/255, 32/255))]
    cm = LinearSegmentedColormap.from_list('my_list', colors, N=200)
    # color plot of NES for gene sets vs. GSEA runs
    mesh = plt.pcolor(plotarray, cmap=cm, vmin=nes_range[0], vmax=nes_range[1], edgecolors='black')

    # add pvalue information: row of plot = y-coordinate, column of plot = x-coordinate
    ax = plt.gca()
    for x in range(n_runs):
        for y in range(n_rows):
            pval = float(pvalarray[y][x])
            if overview is False:
                ax.text(x+0.5, y+0.5, '{:.2f}'.format(pval), color='black', horizontalalignment='center', verticalalignment='center', fontsize=10)
            # mark significant cells with a star
            if pval < 0.05:
                ax.plot(x+0.8, y+0.5, marker='*', color='gold', markersize=8)

    # add legend for NES color coding
    barax = f.add_axes([(width-offset_right+0.2)/width, offset_bottom/height, 0.3/width, (height-offset_top-offset_bottom)/height])
    b = plt.colorbar(mesh, cax=barax)
    b.set_label('Normalized Enrichment Score', fontsize=12)

    # add legend for p values (bottom left corner, below the gene set names)
    if overview is False:
        lax = f.add_axes([0, 0, offset_left/width, offset_bottom/height])
        lax.set_axis_off()
        lax.plot(0.05, 0.1, marker='*', color='gold', markersize=8, transform=lax.transAxes)
        lax.text(0.07, 0.1, 'Significant at Level 0.05', transform=lax.transAxes, horizontalalignment='left', verticalalignment='center', fontsize=10)
        rect = mpatches.Rectangle(xy=(0.05, 0.2), width=0.2, height=0.12, linewidth=1, edgecolor='black', facecolor='none', transform=lax.transAxes)
        lax.add_patch(rect)
        lax.text(0.27, 0.2, 'per Sample and Set', horizontalalignment='left', verticalalignment='bottom', fontsize=10, transform=lax.transAxes)
        lax.text(0.15, 0.26, 'Pvalue', transform=lax.transAxes, horizontalalignment='center', verticalalignment='center', fontsize=10)
        lax.text(0.05, 0.4, 'GSEA Adjusted Pvalue', transform=lax.transAxes, horizontalalignment='left', verticalalignment='bottom', fontsize=12)

    # set title (figure coordinates, centered above the color plot)
    plt.text(s=title, fontsize=14, x=(offset_left+map_width/2)/width, y=(height-offset_top+0.2)/height, transform=f.transFigure, horizontalalignment='center')
    plt.savefig(plotout)
def generate_gsea_overview_table(basefolder, outfile, test_names=None, writeExcel=False, sort_by_score=False, col_sort_func=lambda x:x):
    '''
    summarizes results of GSEA for several samples
    takes all gsea results available in 'basefolder' (a single folder or a list of folders);
        a result is an entry named <testname>.GseaPreranked.<suffix>, all other entries are ignored
    selects all gene sets with fdr <0.05 in at least one run of gsea
    creates a summary table 'outfile' with the enrichment scores and fdrs for all gene sets in all runs
        (row = gene sets with fdr<0.05 for at least one sample, column = NES or FDR per gsea run)
    test_names: if not None, only results whose testname is contained in test_names are used
    writeExcel = True: additionally creates the summary table in xls format
        (same path as 'outfile' with the file extension replaced by '.xls')
    sort_by_score: flag to control the sorting order of gene sets
        True <-> sort by number of samples with significant FDR in descending order and
                 in case of ties by sum of absolute enrichment scores in descending order
        False <-> sort lexicographically by gene set names
    col_sort_func: key function applied to the testnames to determine the order of the columns
    '''
    # handle list of folders with gsea results from differently labeled tests
    if isinstance(basefolder, str):
        basefolder = [basefolder]

    # map: testname (compared conditions) -> table with merged NES and FDR value
    table_map = {}
    # read in all result tables given in the subdirectories of basefolder
    for bfolder in basefolder:
        for entry in os.listdir(bfolder):
            content = entry.split('.')
            # entries with fewer than three dot-separated parts cannot be gsea results
            # (the third part is needed for the report file names) -> skip instead of failing
            if len(content) < 3 or content[1] != 'GseaPreranked':
                continue
            testname = content[0]
            if test_names is not None and testname not in test_names:
                continue

            tablist = []
            for direction in ['neg', 'pos']:
                tabpath = os.path.join(bfolder, entry, 'gsea_report_for_na_'+direction+'_'+content[2]+'.xls')
                tab = tf.readTable(tabpath, sep='\t', header=True, colsToRead=['NAME', 'NES', 'FDR q-val'])
                tab.changeColumnName('NES', 'NES_'+direction)
                tab.changeColumnName('FDR q-val', 'FDR_'+direction)
                tablist.append(tab)

            # join results of negative and positive enrichment -> one column with NES values and one column with pvalue
            final_nes_colname = 'NES_'+testname
            final_fdr_colname = 'FDR_'+testname
            merge_negpos = tf.joinTables(tablist[0], tablist[1], joinCols=[(0,0)], joinType='fullouter')
            merge_negpos.addColumn(str, columnName=final_nes_colname, defaultValue='0')
            merge_negpos.modifyColumn(final_nes_colname, _merge_nes_scores, wholeRow=True)
            merge_negpos.addColumn(str, columnName=final_fdr_colname, defaultValue='1')
            merge_negpos.modifyColumn(final_fdr_colname, _merge_fdr, wholeRow=True)
            merge_negpos = tf.selectColumns(merge_negpos, colList=['NAME', final_nes_colname, final_fdr_colname])
            table_map[testname] = merge_negpos

    # get significant gene sets: q-value < 0.05 in at least one run
    sign_gene_sets = set()
    for testname, tab in table_map.items():
        for r in range(0, tab.rowNum()):
            if float(tab.get(r, 'FDR_'+testname)) < 0.05:
                sign_gene_sets.add(tab.get(r, 'NAME'))

    # set up result table with significant gene sets
    res = tc.Table()
    res.addColumn(str, columnName='Gene Set')
    for gs in sorted(sign_gene_sets):
        res.addRow([gs])

    # add NES and FDR for each comparison: join tables
    for run in sorted(table_map.keys(), key=col_sort_func):
        res = tf.joinTables(res, table_map[run], joinCols=[(0,0)], joinType='leftouter')

    # replace None by pvalue 1 or enrichment score 0 -> facilitate plotting of the data
    # (odd column positions are treated as NES columns, even positions as FDR columns)
    for c in range(1, res.colNum()):
        if c % 2 == 0:
            res.modifyColumn(c, modifying_function=lambda v: '1' if v == 'None' else v)
        else:
            res.modifyColumn(c, modifying_function=lambda v: '0' if v == 'None' else v)

    # sort sets by number of significant samples and sum of absolute normalized enrichment score
    if sort_by_score is True:
        res.addColumn(float, 'abs_sum_NES', 0)
        res.modifyColumn('abs_sum_NES', _sum_abs_nes, wholeRow=True)
        res.addColumn(int, 'count_FDR<0.05')
        res.modifyColumn('count_FDR<0.05', _count_sign_fdrs, wholeRow=True)
        res.sortRows(['count_FDR<0.05', 'abs_sum_NES'], [lambda x: x, lambda x: x], [False, False])

    # write to file
    tf.writeTable(res, outfile, sep='\t', header=True)
    if writeExcel is True:
        # splitext only strips a real file extension: an 'outfile' without extension keeps its
        # full name and dots in directory names are left alone
        excelout = os.path.splitext(outfile)[0]+'.xls'
        tf.writeExcelTable(res, excelout, header=True)