def _readCuffdiffSampleLookup(indir):
    '''return a mapping of cuffdiff sample names to pipeline sample names.

    The mapping is read from ``read_groups.info.gz`` in `indir`. The
    first line is skipped as a header. For every other line the key is
    ``<field 2>_<field 3>`` of the tab-separated fields and the value
    is the first field passed through ``IOTools.snip`` with ``-``
    replaced by ``_``.
    '''
    sample_lookup = {}
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    try:
        inf.readline()
        for line in inf:
            fields = line.split("\t")
            our_sample_name = re.sub("-", "_", IOTools.snip(fields[0]))
            cuffdiff_sample_name = "%s_%s" % (fields[1], fields[2])
            sample_lookup[cuffdiff_sample_name] = our_sample_name
    finally:
        inf.close()
    return sample_lookup


def _collectCuffdiffSampleFpkms(lines):
    '''pivot read-group tracking records into per-gene values.

    `lines` is an iterable of text lines; the first one is skipped as
    a header. Returns a tuple ``(samples, genes)`` where `samples` is
    the sorted list of ``<condition>_<replicate>`` identifiers and
    `genes` maps each gene identifier to a dictionary of
    ``sample_id -> value``. The value is the FPKM field (column 7) if
    the status field (column 9) is ``OK`` and the status otherwise.

    Raises ValueError if a sample appears twice for the same gene.
    '''
    samples = set()
    genes = {}
    records = iter(lines)
    next(records, None)  # header line
    for line in records:
        fields = line.split()
        gene_id = fields[0]
        sample_id = fields[1] + "_" + fields[2]
        fpkm, status = fields[6], fields[8]

        samples.add(sample_id)
        values = genes.setdefault(gene_id, {})
        if sample_id in values:
            raise ValueError(
                'sample_id %s appears twice in file for gene_id %s'
                % (sample_id, gene_id))
        # The status check is applied to every record. Previously the
        # first record seen for a gene stored the FPKM even if its
        # status was not "OK".
        values[sample_id] = fpkm if status == "OK" else status
    return sorted(samples), genes


def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff
    differential expression analysis. Parsing is performed by the
    parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels:

    * ``<prefix>_<level>_diff`` from the ``*_exp.diff.gz`` files,
    * ``<prefix>_<level>_levels`` from the ``*.fpkm_tracking.gz`` files,
    * ``<prefix>_<level>sample_fpkms`` with one column per sample from
      the ``*.read_group_tracking.gz`` files,
    * a convenience table of tracks loaded under the default table
      name that ``P.load`` derives for `outfile`.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    If the directory ``<infile>.dir`` does not exist, `outfile` is
    touched and nothing is loaded.

    NOTE(review): a second ``loadCuffdiff`` with a different signature
    appears later in this file; if both live in the same module the
    later one replaces this one.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    Raises
    ------
    ValueError
        If a sample appears twice for the same gene in one of the
        read group tracking files.
    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    try:
        # ignore promoters and splicing - no fold change column, but
        # sqrt(JS)
        for fn, level in (("cds_exp.diff.gz", "cds"),
                          ("gene_exp.diff.gz", "gene"),
                          ("isoform_exp.diff.gz", "isoform"),
                          # ("promoters.diff.gz", "promotor"),
                          # ("splicing.diff.gz", "splice"),
                          ("tss_group_exp.diff.gz", "tss")):

            tablename = prefix + "_" + level + "_diff"
            results = parseCuffdiff(os.path.join(indir, fn),
                                    min_fpkm=min_fpkm)
            Expression.writeExpressionResults(tmpname, results)
            P.load(tmpname, outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   "--add-index=treatment_name "
                   "--add-index=control_name "
                   "--add-index=test_id")
    finally:
        # the scratch file is only needed while the diff tables load
        if os.path.exists(tmpname):
            os.unlink(tmpname)

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        # NOTE(review): the control_name/test_id indices look copied
        # from the *_diff load above - confirm that these columns exist
        # in the fpkm_tracking files.
        P.load(os.path.join(indir, fn), outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values
    # into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    sample_lookup = _readCuffdiffSampleLookup(indir)

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        # NOTE(review): there is no underscore between the level and
        # "sample_fpkms"; kept as is because existing tables may be
        # addressed by this name.
        tablename = prefix + "_" + level + "sample_fpkms"

        inf = IOTools.openFile(os.path.join(indir, fn))
        try:
            samples, genes = _collectCuffdiffSampleFpkms(inf)
        finally:
            inf.close()

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty
        if not samples:
            continue

        # the temporary file is created only once there is something
        # to write, so skipping an empty input leaves nothing behind
        tmpf = P.getTempFilename(".")
        try:
            outf = IOTools.openFile(tmpf, "w")
            try:
                outf.write("gene_id\t" + "\t".join(
                    [sample_lookup[s] for s in samples]) + "\n")
                for gene in genes:
                    values = genes[gene]
                    # one column per sample, in header order. The
                    # previous while-loop tested an unrelated variable
                    # and did not write all columns.
                    outf.write("\t".join(
                        [gene] + [values[s] for s in samples]) + "\n")
            finally:
                outf.close()

            P.load(tmpf, outfile,
                   tablename=tablename,
                   options="--allow-empty-file "
                   " --add-index=gene_id")
        finally:
            os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database. Any existing
    view/table called `tablename` is dropped first. After creation
    the number of rows in `tablename` is checked and `outfile` is
    touched.

    NOTE(review): a second definition of ``createView`` with the same
    signature appears later in this file; if both live in the same
    module the later one replaces this one.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    Raises
    ------
    ValueError
        If `tables` is empty, if the tables differ in the number of
        distinct values in their join field, or if the created
        view/table is empty.
    '''
    if not tables:
        raise ValueError("no tables given - will not create view")

    Database.executewait(
        dbhandle,
        "DROP %s IF EXISTS %s" % (view_type, tablename))

    # per table: number of distinct join keys and the lower-cased
    # names of all columns apart from the join field
    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower() for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            # NOTE(review): the nesting of this block was reconstructed;
            # duplicates are prefixed and removed from the plain column
            # list only when ignore_duplicates is False - confirm.
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    # NOTE(review): the key column is selected as t0.track even if the
    # join field of the first table has a different name.
    statement = "CREATE %s %s AS SELECT t0.track, %s FROM %s" % (
        view_type, tablename, all_columns, from_statement)
    # with a single table there is no join condition to add
    if where_statement:
        statement += " WHERE %s" % where_statement

    Database.executewait(dbhandle, statement)

    # count the rows of the object just created; this used to query a
    # hard-coded table name and so only worked for one value of
    # `tablename`
    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %s" % tablename).fetchone()[0]

    if nrows == 0:
        raise ValueError(
            "empty view mapping, check statement = %s" % statement)
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created %s with %i rows" % (tablename, nrows))
    touchFile(outfile)
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database. Any existing
    view/table called `tablename` is dropped first. After creation
    the number of rows in `tablename` is checked and `outfile` is
    touched.

    NOTE(review): an earlier definition of ``createView`` with the
    same signature appears in this file; if both live in the same
    module this later one is the one in effect.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    Raises
    ------
    ValueError
        If `tables` is empty, if the tables differ in the number of
        distinct values in their join field, or if the created
        view/table is empty.
    '''
    if not tables:
        raise ValueError("no tables given - will not create view")

    Database.executewait(
        dbhandle,
        "DROP %s IF EXISTS %s" % (view_type, tablename))

    # per table: number of distinct join keys and the lower-cased
    # names of all columns apart from the join field
    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append([
            x.lower() for x in Database.getColumnNames(dbhandle, table)
            if x != track
        ])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError("number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join([
        "t0.%s = t%i.%s" % (f, x + 1, y[1])
        for x, y in enumerate(tables[1:])
    ])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            # NOTE(review): the nesting of this block was reconstructed;
            # duplicates are prefixed and removed from the plain column
            # list only when ignore_duplicates is False - confirm.
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    # NOTE(review): the key column is selected as t0.track even if the
    # join field of the first table has a different name.
    statement = "CREATE %s %s AS SELECT t0.track, %s FROM %s" % (
        view_type, tablename, all_columns, from_statement)
    # with a single table there is no join condition to add
    if where_statement:
        statement += " WHERE %s" % where_statement

    Database.executewait(dbhandle, statement)

    # count the rows of the object just created; this used to query a
    # hard-coded table name and so only worked for one value of
    # `tablename`
    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %s" % tablename).fetchone()[0]

    if nrows == 0:
        raise ValueError("empty view mapping, check statement = %s" %
                         statement)
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created %s with %i rows" % (tablename, nrows))
    touchFile(outfile)
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from a cuffdiff differential expression analysis.

    The cuffdiff output in the directory ``<infile>.dir`` is parsed by
    parseCuffdiff and loaded into the database through ``csv2db.py``.
    If that directory does not exist, `outfile` is touched and nothing
    is loaded.

    Note: converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    The cummeRbund based code is commented out, so no R library is
    used by this function at present.

    NOTE(review): an earlier ``loadCuffdiff`` taking an additional
    leading ``dbhandle`` argument appears in this file; if both live
    in the same module this later definition is the one in effect.

    Arguments
    ---------
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename; also used to derive the table prefix and the
        name of the log file (``<outfile>.log``).
    min_fpkm : float
        Minimum fpkm, passed on to parseCuffdiff.

    Raises
    ------
    ValueError
        If a sample appears twice for the same gene in one of the
        read group tracking files.
    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    # NOTE(review): P.run() is called without arguments; it presumably
    # picks up `statement` and the values interpolated into it from
    # the local variables of this function. The names used in the
    # statements (tmpname, tablename, outfile, indir, fn, tmpf) must
    # therefore stay local to this function - confirm before
    # refactoring.

    tmpname = P.getTempFilename(".")

    try:
        # ignore promoters and splicing - no fold change column, but
        # sqrt(JS)
        for fn, level in (("cds_exp.diff.gz", "cds"),
                          ("gene_exp.diff.gz", "gene"),
                          ("isoform_exp.diff.gz", "isoform"),
                          # ("promoters.diff.gz", "promotor"),
                          # ("splicing.diff.gz", "splice"),
                          ("tss_group_exp.diff.gz", "tss")):

            tablename = prefix + "_" + level + "_diff"

            infile = os.path.join(indir, fn)
            results = parseCuffdiff(infile, min_fpkm=min_fpkm)

            Expression.writeExpressionResults(tmpname, results)

            statement = ("cat %(tmpname)s |"
                         " python %(scriptsdir)s/csv2db.py"
                         " %(csv2db_options)s"
                         " --allow-empty-file"
                         " --add-index=treatment_name"
                         " --add-index=control_name"
                         " --add-index=test_id"
                         " --table=%(tablename)s"
                         " >> %(outfile)s.log")
            P.run()
    finally:
        # the scratch file is only needed while the diff tables load
        if os.path.exists(tmpname):
            os.unlink(tmpname)

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        statement = ("zcat %(indir)s/%(fn)s |"
                     " python %(scriptsdir)s/csv2db.py"
                     " %(csv2db_options)s"
                     " --allow-empty-file"
                     " --add-index=tracking_id"
                     " --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values
    # into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    sample_lookup = {}
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    try:
        inf.readline()
        for line in inf:
            line = line.split("\t")
            our_sample_name = IOTools.snip(line[0])
            our_sample_name = re.sub("-", "_", our_sample_name)
            cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
            sample_lookup[cuffdiff_sample_name] = our_sample_name
    finally:
        inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        # NOTE(review): there is no underscore between the level and
        # "sample_fpkms"; kept as is because existing tables may be
        # addressed by this name.
        tablename = prefix + "_" + level + "sample_fpkms"

        samples = set()
        genes = {}

        inf = IOTools.openFile(os.path.join(indir, fn))
        try:
            inf.readline()  # header line
            for line in inf:
                line = line.split()
                gene_id = line[0]
                condition = line[1]
                replicate = line[2]
                fpkm = line[6]
                status = line[8]

                sample_id = condition + "_" + replicate
                samples.add(sample_id)

                # IMS: The following block is not part of any check on
                # `samples`; it has to run for every record. Please
                # make sure it stays at this level.
                values = genes.setdefault(gene_id, {})
                if sample_id in values:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                # The status check is applied to every record.
                # Previously the first record seen for a gene stored
                # the FPKM even if its status was not "OK".
                if status != "OK":
                    values[sample_id] = status
                else:
                    values[sample_id] = fpkm
        finally:
            inf.close()

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty
        if not samples:
            continue

        samples = sorted(samples)

        # the temporary file is created only once there is something
        # to write, so skipping an empty input leaves nothing behind
        tmpf = P.getTempFilename(".")
        try:
            outf = IOTools.openFile(tmpf, "w")
            try:
                headers = "gene_id\t" + "\t".join(
                    [sample_lookup[s] for s in samples])
                outf.write(headers + "\n")
                for gene in genes:
                    # one tab-separated column per sample, in header
                    # order, terminated by a single newline
                    outf.write("\t".join(
                        [gene] + [genes[gene][s] for s in samples]) + "\n")
            finally:
                outf.close()

            statement = ("cat %(tmpf)s |"
                         " python %(scriptsdir)s/csv2db.py "
                         " %(csv2db_options)s"
                         " --allow-empty-file"
                         " --add-index=gene_id"
                         " --table=%(tablename)s"
                         " >> %(outfile)s.log")
            P.run()
        finally:
            os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    # the connection is only needed for this lookup, so it is opened
    # here and closed as soon as the column names have been read
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        tracks = Database.getColumnNames(dbhandle, tablename)
        tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]
    finally:
        dbhandle.close()

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)