def filterByCoverage(infiles, outfile):
    '''Write contigs whose average coverage exceeds the filter to outfile.

    infiles[0] is the contig FASTA file.  Each remaining entry is the path
    of a coverage load file; its database table name is derived from the
    last two path components.  Only tables whose name matches the contig
    file (contig file minus ".fa" equals load file minus ".coverage.load")
    are queried.  The threshold is PARAMS["coverage_filter"].

    outfile receives the retained contigs in FASTA format, with the title
    truncated at the first space.
    '''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]

    contigs = set()
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    try:
        cc = dbh.cursor()
        for infile in infiles[1:]:
            dirsplit = infile.split("/")
            infile = os.path.join(
                PARAMS["results_resultsdir"],
                dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
            tablename = P.toTable(os.path.basename(infile))
            if P.snip(contig_file, ".fa") != P.snip(
                    os.path.basename(infile), ".coverage.load"):
                continue
            # The original outer SELECT read "contig_id ave" (missing
            # comma), which merely aliased contig_id; only contig_id is
            # needed, so the alias is dropped.
            statement = (
                "SELECT contig_id FROM "
                "(SELECT contig_id, AVG(coverage) as ave FROM %s "
                "GROUP BY contig_id) WHERE ave > %i" % (tablename, fcoverage))
            contigs.update(row[0] for row in cc.execute(statement))
    finally:
        # always release the database handle, even if a query fails
        dbh.close()

    with open(outfile, "w") as outf, iotools.openFile(contig_file) as inf:
        for fasta in FastaIterator.iterate(inf):
            identifier = fasta.title.split(" ")[0]
            if identifier in contigs:
                outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.

    Each file in infiles is stripped of comment ("#") and blank lines.  The
    first remaining line of the first usable file becomes the header
    (prefixed with a "track" column); every following line of every file is
    prefixed with the track name (file basename minus ".alignstats").
    Missing or empty files are skipped with a warning.
    '''
    # Join data for all tracks into single file
    tmpfilename = "tophat/tophat.dir/picard_align_stats.tsv"
    first = True
    with open(tmpfilename, "w") as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".alignstats")
            if not os.path.exists(f):
                E.warn("File %s missing" % f)
                continue
            with open(f, "r") as inf:
                lines = [
                    x for x in inf if not x.startswith("#") and x.strip()
                ]
            if not lines:
                # previously an IndexError on lines[0]
                E.warn("File %s contains no data" % f)
                continue
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))

    # Load into database
    # NOTE(review): P.run() takes no arguments; presumably it reads
    # `statement` and the %(name)s values from this function's locals, so
    # those local names must be kept - confirm against P.run.
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s'''
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.

    For every file in infiles (expected to end in ".dedup.bam") the
    companion "<name minus .bam>.dupstats" file is read.  After dropping
    comment and blank lines, the first line is the header and the second
    line the data row; only these two lines are used.  Missing or
    truncated stats files are skipped with a warning.
    '''
    tablename = P.toTable(outfile)
    tmpfilename = 'dupstats.txt'
    first = True
    with open(tmpfilename, 'w') as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".dedup.bam")
            statfile = P.snip(f, ".bam") + ".dupstats"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            with open(statfile, "r") as inf:
                lines = [
                    x for x in inf if not x.startswith("#") and x.strip()
                ]
            if len(lines) < 2:
                # previously an IndexError on lines[0] / lines[1]
                E.warn("File %s contains no data row" % statfile)
                continue
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            outf.write("%s\t%s" % (track, lines[1]))

    # NOTE(review): P.run() presumably interpolates `statement` from this
    # function's locals (tmpfilename, tablename, outfile) - keep the names.
    statement = '''cat %(tmpfilename)s | cgat csv2db --add-index=track --table=%(tablename)s > %(outfile)s '''
    P.run()
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.

    Each file in infiles (expected to end in ".dedup.stats") is stripped of
    comment and blank lines; the first remaining line is taken as header
    and the second as the data row.  Rows are prefixed with the track name
    and collected in tophat/tophat.dir/dupstats.txt before loading.

    Raises IndexError if a stats file has fewer than two usable lines.
    '''
    # Join data for all tracks into single file
    tmpfilename = 'tophat/tophat.dir/dupstats.txt'
    first = True
    with open(tmpfilename, 'w') as outf:
        for statfile in infiles:
            track = P.snip(os.path.basename(statfile), ".dedup.stats")
            # close every input promptly instead of relying on the GC
            with open(statfile, "r") as inf:
                lines = [
                    x for x in inf if not x.startswith("#") and x.strip()
                ]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            outf.write("%s\t%s" % (track, lines[1]))

    # Load into database
    # NOTE(review): P.run() presumably interpolates `statement` from this
    # function's locals (tmpfilename, tablename, outfile) - keep the names.
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s '''
    P.run()
def createViewMapping(infile, outfile):
    '''Create the per-track alignment statistics "view" in the database.

    The object is named after outfile and is rebuilt from scratch on every
    call.  As written it is a plain copy of ``bam_stats``.

    infile is not used.
    '''
    # Views can not span multiple databases, so a real table is created.
    names = {"view_type": "TABLE", "tablename": P.toTable(outfile)}

    dbhandle = connect()
    drop_sql = "DROP %(view_type)s IF EXISTS %(tablename)s"
    create_sql = ''' CREATE %(view_type)s %(tablename)s AS SELECT * FROM bam_stats AS b '''
    for sql in (drop_sql, create_sql):
        Database.executewait(dbhandle, sql % names)
def mergeAndLoad(infiles, outfile, suffix):
    '''Load categorical (two-column) tables into a database table.

    The first two columns of every input are combined, transposed so that
    each input becomes one row, and loaded into the table named after
    outfile.  Column headers are the input names with suffix removed.
    Inputs are decompressed with zcat when suffix ends in ".gz".
    '''
    if suffix.endswith(".gz"):
        source = "<( zcat %s | cut -f 1,2 )"
    else:
        source = "<( cat %s | cut -f 1,2 )"

    # NOTE(review): P.run() presumably reads header/filenames/tablename/
    # outfile/statement from these locals - keep the names.
    filenames = " ".join(source % name for name in infiles)
    header = ",".join(P.tablequote(P.snip(name, suffix)) for name in infiles)
    tablename = P.toTable(outfile)

    statement = (
        'cgat combine_tables --header-names=%(header)s --missing-value=0 '
        '--ignore-empty %(filenames)s '
        '| perl -p -e "s/bin/track/" '
        '| cgat table2table --transpose '
        '| cgat csv2db --add-index=track --table=%(tablename)s > %(outfile)s ')
    P.run()
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.

    Each file in infiles (expected to end in ".gcstats") is stripped of
    comment and blank lines; the first remaining line is the header and the
    second the data row.  Rows are prefixed with the track name, collected
    in a temporary file and loaded into the table named after outfile.
    Missing or truncated files are skipped with a warning.  The temporary
    file is removed even if loading fails.
    '''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    tmpfilename = outf.name
    try:
        try:
            first = True
            for f in infiles:
                track = P.snip(os.path.basename(f), ".gcstats")
                if not os.path.exists(f):
                    E.warn("File %s missing" % f)
                    continue
                with open(f, "r") as inf:
                    lines = [
                        x for x in inf
                        if not x.startswith("#") and x.strip()
                    ]
                if len(lines) < 2:
                    # previously an IndexError on lines[0] / lines[1]
                    E.warn("File %s contains no data row" % f)
                    continue
                if first:
                    outf.write("%s\t%s" % ("track", lines[0]))
                    first = False
                outf.write("%s\t%s" % (track, lines[1]))
        finally:
            outf.close()

        # NOTE(review): P.run() presumably interpolates `statement` from
        # this function's locals - keep tmpfilename/tablename/outfile.
        statement = '''cat %(tmpfilename)s | cgat csv2db %(csv2db_options)s --add-index=track --table=%(tablename)s > %(outfile)s '''
        P.run()
    finally:
        os.unlink(tmpfilename)
def loadCodingPotential(infile, outfile):
    '''Load the gzipped coding-potential table and derive ``is_coding``.

    The table is named after outfile and indexed on gene_id.  After
    loading, an integer column ``is_coding`` is added and set to the result
    of ``result == 'coding'`` for every row.
    '''
    # NOTE(review): P.run() presumably reads statement/table/infile/outfile
    # from these locals - keep the names.
    table = P.toTable(outfile)
    statement = (
        " gunzip < %(infile)s "
        "| cgat csv2db %(csv2db_options)s --allow-empty-file "
        "--add-index=gene_id --map=gene_id:str "
        "--table=%(table)s > %(outfile)s")
    P.run()

    # set the is_coding flag
    updates = (
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''',
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''',
    )
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    for sql in updates:
        Database.executewait(dbhandle, sql % {"table": table})
    dbhandle.commit()
def buildGeneOntology(infile, outfile):
    '''Write a GO-ontology-like file for use with GO.py.

    For each of the columns "cpg" and "tata" of the table derived from
    infile, two consecutive pseudo GO terms are emitted: one listing the
    genes for which the column is true ("with_<column>") and one listing
    those for which it equals 0 ("without_<column>").  Term ids count up
    from GO:0000001.
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")

    # (query template, output line template) for the two terms per column
    term_templates = (
        ("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s",
         "promotor\t%s\tGO:%07i\twith_%s\tNA\n"),
        ("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0",
         "promotor\t%s\tGO:%07i\twithout_%s\tNA\n"),
    )

    dbh = connect()
    cc = dbh.cursor()

    outf = iotools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    term_id = 1
    for column in columns:
        for query, line in term_templates:
            cc.execute(query % {"table": table, "c": column})
            outf.write("".join(
                [line % (row[0], term_id, column) for row in cc]))
            term_id += 1

    outf.close()
def loadAlignmentStats(infiles, outfile):
    '''Merge alignment stats into single tables.

    For every file f in infiles (expected to end in ".bam.stats") the
    header and the first data row of ``f.alignment_summary_metrics`` are
    merged, one row per track, and loaded into the table named after
    outfile.  Missing or truncated summary files are skipped with a
    warning.

    In addition the per-file ``quality_by_cycle_metrics`` and
    ``quality_distribution_metrics`` tables are combined and loaded into
    ``<tablename>_<suffix>``; the load log is appended to outfile.

    The temporary merge file is removed even if a load step fails.
    '''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    tmpfilename = outf.name
    try:
        try:
            first = True
            for f in infiles:
                track = P.snip(f, ".bam.stats")
                fn = f + ".alignment_summary_metrics"
                if not os.path.exists(fn):
                    E.warn("file %s missing" % fn)
                    continue
                with open(fn, "r") as inf:
                    lines = [
                        x for x in inf
                        if not x.startswith("#") and x.strip()
                    ]
                if len(lines) < 2:
                    # previously an IndexError on lines[0] / lines[1]
                    E.warn("file %s contains no data row" % fn)
                    continue
                if first:
                    outf.write("%s\t%s" % ("track", lines[0]))
                    first = False
                outf.write("%s\t%s" % (track, lines[1]))
        finally:
            outf.close()

        # NOTE(review): P.run() presumably interpolates `statement` from
        # this function's locals - the names used in %(...)s must be kept.
        statement = '''cat %(tmpfilename)s | cgat csv2db --add-index=track --table=%(tablename)s > %(outfile)s '''
        P.run()

        for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                               ("quality_distribution_metrics", "quality")):
            # some files might be missing - bugs in Picard
            xfiles = [
                x for x in infiles if os.path.exists("%s.%s" % (x, suffix))
            ]
            header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
            filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])
            tname = "%s_%s" % (tablename, suffix)
            statement = """cgat combine_tables --missing-value=0 %(filenames)s | cgat csv2db --header-names=%(column)s,%(header)s --replace-header --add-index=track --table=%(tname)s >> %(outfile)s """
            P.run()
    finally:
        os.unlink(tmpfilename)
def buildDMRStats(infiles, outfile):
    '''Compute differential methylation stats.

    The table names derived from infiles and the method name (outfile
    minus "_stats.tsv") are passed to PipelineMedip.buildDMRStats together
    with a fresh database handle.
    '''
    tablenames = []
    for filename in infiles:
        tablenames.append(P.toTable(filename))

    PipelineMedip.buildDMRStats(
        tablenames,
        P.snip(outfile, "_stats.tsv"),
        outfile,
        dbhandle=connect())
def loadCoveredCpGs(infile, outfile):
    '''Load infile into the database table named after outfile.

    The csv2db log is written to outfile.
    '''
    dbh = connect()
    tablename = P.toTable(outfile)

    # The statement is interpolated here; P.run() is then called without
    # arguments and presumably picks `statement` up from these locals.
    statement = (
        "cat %(infile)s "
        "| cgat csv2db --table %(tablename)s --retry --ignore-empty "
        "> %(outfile)s") % {
            "infile": infile,
            "tablename": tablename,
            "outfile": outfile,
        }
    P.run()
def calculateFalsePositiveRate(infiles, outfile):
    '''Compute taxonomy false and true positive rates per level and cutoff.

    infiles[0] is the load file of the true taxonomy table; the matching
    estimate table is the entry of infiles[1:] whose basename, with the
    first len("metaphlan_") characters removed, equals the basename of
    infiles[0].

    For each taxonomic level and each cutoff (0 and 1) one tab-separated
    line is written to outfile:
    level, false positive rate, true positive rate, track, cutoff.
    The true table is filtered with relab > cutoff / 100, the estimate
    table with rel_abundance > cutoff.

    A rate whose denominator is zero (no estimated or no true taxa) is
    reported as NaN instead of raising ZeroDivisionError.

    Raises IndexError if no matching estimate file is found.
    '''
    levels = ["phylum", "class", "order", "family", "genus", "species"]
    true_basename = os.path.basename(infiles[0])
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    estimates = [
        inf for inf in infiles[1:]
        if os.path.basename(inf)[len("metaphlan_"):] == true_basename
    ]
    tablename_estimate = P.toTable(os.path.basename(estimates[0]))
    track = P.snip(true_basename, ".taxonomy.relab.load")

    # connect to database
    dbh = sqlite3.connect(PARAMS["database_name"])
    try:
        cc = dbh.cursor()
        with open(outfile, "w") as outf:
            for level in levels:
                for cutoff in [0, 1]:
                    true_set = {
                        row[0] for row in cc.execute(
                            """SELECT taxa FROM %s WHERE level == '%s' AND relab > %f"""
                            % (tablename_true, level, float(cutoff) / 100))
                    }
                    estimate_set = {
                        row[0] for row in cc.execute(
                            """SELECT taxon FROM %s WHERE taxon_level == '%s' AND rel_abundance > %f"""
                            % (tablename_estimate, level, float(cutoff)))
                    }

                    tp = true_set & estimate_set
                    fp = estimate_set - true_set

                    # rates are undefined when the denominator is empty
                    if estimate_set:
                        fp_rate = float(len(fp)) / len(estimate_set)
                    else:
                        fp_rate = float("nan")
                    if true_set:
                        tp_rate = float(len(tp)) / len(true_set)
                    else:
                        tp_rate = float("nan")

                    outf.write("%s\t%f\t%f\t%s\t%s\n" %
                               (level, fp_rate, tp_rate, track, str(cutoff)))
    finally:
        dbh.close()
def loadMergeCoverage(infile, outfile):
    '''Load infile into the database table named after outfile.

    The csv2db log is written to outfile.
    '''
    dbh = connect()
    tablename = P.toTable(outfile)

    # NOTE(review): job_options / job_threads are not referenced below;
    # presumably P.run() reads them from these locals as job settings -
    # confirm before renaming or removing.
    job_threads = 2
    job_options = "-l mem_free=23G"

    statement = (
        "cat %(infile)s "
        "| cgat csv2db --table %(tablename)s --retry --ignore-empty "
        "> %(outfile)s") % {
            "infile": infile,
            "tablename": tablename,
            "outfile": outfile,
        }
    P.run()
def importPresence(infile, outfile):
    '''Import presence/absence data.

    infile is loaded with csv2db into the table named after outfile,
    indexed on ``probeset``; the load log is written to outfile.
    '''
    tablename = P.toTable(outfile)
    # The statement is kept on logical lines via implicit concatenation.
    # The previous literal contained "\ " sequences, which Python does not
    # recognise as an escape and therefore passed to the shell as escaped
    # spaces glued to the following option.
    # NOTE(review): P.run() presumably interpolates `statement` from these
    # locals and PARAMS (csv2db_options) - keep the local names.
    statement = (
        " cgat csv2db %(csv2db_options)s "
        "--add-index=probeset "
        "--table=%(tablename)s "
        "< %(infile)s > %(outfile)s ")
    P.run()
def _writeTrackTable(sources, outname):
    '''Concatenate gzipped tables into outname, adding a track column.

    sources is a sequence of (track, filename) pairs.  The first line of
    the first usable file becomes the header (prefixed with "track"); all
    other lines are prefixed with the track name.  Missing or empty files
    are skipped with a warning.
    '''
    first = True
    with open(outname, 'w') as outf:
        for track, filename in sources:
            if not os.path.exists(filename):
                E.warn("File %s missing" % filename)
                continue
            # text mode: in binary mode the lines are bytes and would be
            # formatted as "b'...'" under Python 3
            with gzip.open(filename, "rt") as inf:
                lines = inf.readlines()
            if not lines:
                E.warn("File %s is empty" % filename)
                continue
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))


def mergeEffects(infiles, outfile):
    '''Load transcript effects into single table.

    The files in infiles (expected to end in ".effects.gz") are merged
    into effects.txt with a leading track column and loaded into the table
    derived from outfile.  For each of the suffixes cds, intron, splicing,
    translation and genes the companion files ``<infile>.<suffix>.gz`` are
    merged into ``effects.<suffix>.txt`` and loaded into
    ``<tablename>_<suffix>``.
    '''
    tablename = P.toTable(outfile)
    tracks = [(P.snip(os.path.basename(f), ".effects.gz"), f)
              for f in infiles]

    # the merged file was previously written as effects.txt but loaded
    # from the non-existent "effect.txt"
    merged = 'effects.txt'
    _writeTrackTable(tracks, merged)
    P.load(merged, outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):
        merged = 'effects.' + suffix + '.txt'
        _writeTrackTable(
            [(track, f + "." + suffix + ".gz") for track, f in tracks],
            merged)
        P.load(merged,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
def importExpressionLevels(infiles, outfile):
    '''Import an expression level table.

    infiles is a pair (infile_data, infile_table); only infile_table is
    loaded, into the table named after outfile and indexed on
    ``probeset``.  The load log is written to outfile.
    '''
    # unpacking also checks that exactly two input files were supplied
    infile_data, infile_table = infiles
    tablename = P.toTable(outfile)
    # The previous literal contained "\ " sequences, which Python does not
    # recognise as an escape and therefore passed to the shell as escaped
    # spaces glued to the following option.
    # NOTE(review): P.run() presumably interpolates `statement` from these
    # locals and PARAMS (csv2db_options) - keep the local names.
    statement = (
        " cgat csv2db %(csv2db_options)s "
        "--add-index=probeset "
        "--table=%(tablename)s "
        "< %(infile_table)s > %(outfile)s ")
    P.run()
def combineM3Dsummaries(infiles, outfiles):
    '''Combine the M3D between-group summary tables and load the result.

    outfiles is a pair: the first receives the combined table built from
    M3D_plots.dir/*between_summary.tsv, the second the csv2db log of
    loading that table into the database (table named after the second
    file).  infiles is not used; inputs are found via the glob.
    '''
    outfile1, outfile2 = outfiles
    print(outfile1, outfile2)

    tablename = P.toTable(outfile2)
    fields = {
        "outfile1": outfile1,
        "outfile2": outfile2,
        "tablename": tablename,
    }
    # interpolated here; P.run() is then called without arguments and
    # presumably picks `statement` up from these locals
    statement = (
        " cgat combine_tables -v0 -a file "
        "--glob=M3D_plots.dir/*between_summary.tsv > %(outfile1)s; "
        "cat %(outfile1)s "
        "| cgat csv2db --table %(tablename)s --retry --ignore-empty "
        "> %(outfile2)s") % fields
    print(statement)
    P.run()
def loadPolyphen(infile, outfile):
    '''Load gzipped polyphen results into the table named after outfile.

    Before loading, the stream is rewritten with perl: "o_acc" becomes
    "protein_id", runs of spaces are removed and a leading "#" is
    stripped.  Indices are added on snp_id and protein_id.
    '''
    load_options = " ".join((
        "--add-index=snp_id",
        "--add-index=protein_id",
        "--map=effect:str",
    ))
    # NOTE(review): P.run() presumably reads statement/load_statement/
    # infile/outfile from these locals - keep the names.
    load_statement = P.build_load_statement(
        P.toTable(outfile), options=load_options)

    statement = (
        ' gunzip < %(infile)s '
        '| perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;" '
        '| %(load_statement)s > %(outfile)s ')
    P.run()
def loadGeneSummary(infile, outfile):
    '''Summarize binding information per gene.

    The table named after outfile is dropped and re-created as the
    per-gene sums of ``tata`` and ``cpg`` from promotorinfo_transcripts,
    joined to annotations.transcript_info on transcript_id.  outfile is
    touched on completion.  infile is not used.
    '''
    names = {"table": P.toTable(outfile)}
    rebuild = (
        """DROP TABLE IF EXISTS %(table)s """,
        """CREATE TABLE %(table)s AS SELECT gene_id, SUM( tata ) AS tata, SUM( cpg ) AS cpg FROM promotorinfo_transcripts AS p, annotations.transcript_info as i WHERE i.transcript_id = p.transcript_id GROUP BY gene_id""",
    )

    cc = connect().cursor()
    for sql in rebuild:
        cc.execute(sql % names)
    cc.close()

    P.touch(outfile)
def loadReadCorrespondence(infiles, outfile):
    '''Combine the read correspondence tables and load them.

    All files in infiles are merged with combine_tables and loaded into
    the table named after outfile; the load log goes to outfile.
    '''
    tablename = P.toTable(outfile)
    # `infiles` is rebound to the space-separated list because the
    # statement refers to it by that name.
    infiles = " ".join(infiles)
    # NOTE(review): to_cluster is not referenced below; presumably P.run()
    # reads it from these locals - confirm before removing.
    to_cluster = USECLUSTER

    statement = (
        " cgat combine_tables %(infiles)s "
        "| cgat csv2db --table=%(tablename)s "
        "> %(outfile)s ")
    P.run()
def loadGTF(infile, outfile):
    '''Load a gzipped GTF file into the table named after outfile.

    The file is converted with gtf2tsv and loaded with indices on gene_id
    and transcript_id, both mapped to strings.
    '''
    # NOTE(review): to_cluster is not referenced below; presumably P.run()
    # reads it (and table/infile/outfile) from these locals.
    to_cluster = USECLUSTER
    table = P.toTable(outfile)

    statement = (
        "gunzip < %(infile)s "
        "| cgat gtf2tsv "
        "|cgat csv2db %(csv2db_options)s "
        "--add-index=gene_id --map=gene_id:str "
        "--add-index=transcript_id --map=transcript_id:str "
        "--table=%(table)s > %(outfile)s ")
    P.run()
def loadPolyphen(infile, outfile):
    '''Load gzipped polyphen results into the table named after outfile.

    "o_acc" is renamed to "protein_id", runs of spaces are removed and
    only columns 1-55 are kept, so the trailing comment column is ignored.
    '''
    # NOTE(review): P.run() presumably reads statement/table/infile/outfile
    # from these locals and scriptsdir/csv2db_options from PARAMS.
    table = P.toTable(outfile)

    statement = (
        'gunzip < %(infile)s '
        '| perl -p -e "s/o_acc/protein_id/; s/ +//g" '
        '| cut -f 1-55 '
        '|python %(scriptsdir)s/csv2db.py %(csv2db_options)s '
        '--add-index=snp_id --add-index=protein_id '
        '--table=%(table)s --map=effect:str > %(outfile)s ')
    P.run()
def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    For every transcript in annotations.transcript_info a row is written
    with one 0/1 flag per source table ("tata", "cpg") telling whether the
    transcript_id occurs in that table.  The rows are collected in a
    temporary file and loaded via P.load into the table for outfile; the
    temporary file is removed even if loading fails.  infile is not used.
    '''
    dbh = connect()
    table = P.toTable(outfile)
    cc = dbh.cursor()

    # sqlite can not do full outer join
    cc.execute("DROP TABLE IF EXISTS %s" % table)

    transcripts = [
        row[0] for row in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
        ).fetchall()
    ]

    # The loop variable used to be called `table`, shadowing the output
    # table name above.
    sources = ("tata", "cpg")
    vals = []
    for source in sources:
        rows = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %s" % source).fetchall()
        vals.append({row[0] for row in rows})

    tmpf = P.getTempFile()
    try:
        try:
            tmpf.write("transcript_id\t%s\n" % "\t".join(sources))
            for transcript_id in transcripts:
                flags = [str(int(transcript_id in v)) for v in vals]
                tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(flags)))
        finally:
            tmpf.close()
        P.load(tmpf.name, outfile)
    finally:
        os.unlink(tmpf.name)
def loadBAMStats(infiles, outfile):
    '''Import bam statistics into SQLite.

    The first two columns of every file in infiles are combined and
    transposed into one row per track (file basename minus ".readstats")
    and loaded into the table named after outfile.  The companion
    ``<infile>.nm`` and ``<infile>.nh`` tables are combined and loaded into
    ``<tablename>_nm`` and ``<tablename>_nh``; their logs are appended to
    outfile.
    '''
    # NOTE(review): P.run() presumably reads the names used in %(...)s
    # (and scriptsdir) from these locals - keep the names.
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    header = ",".join(
        P.snip(os.path.basename(name), ".readstats") for name in infiles)
    filenames = " ".join("<( cut -f 1,2 < %s)" % name for name in infiles)

    E.info("loading bam stats - summary")
    statement = (
        'cgat combine_tables --header-names=%(header)s --missing-value=0 '
        '--ignore-empty %(filenames)s '
        '| perl -p -e "s/bin/track/" '
        '| perl -p -e "s/unique/unique_alignments/" '
        '| cgat table2table --transpose '
        '| cgat csv2db --allow-empty-file --add-index=track '
        '--table=%(tablename)s > %(outfile)s')
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        tname = "%s_%s" % (tablename, suffix)
        filenames = " ".join("%s.%s" % (name, suffix) for name in infiles)
        statement = (
            'cgat combine_tables --header-names=%(header)s --skip-titles '
            '--missing-value=0 --ignore-empty %(filenames)s '
            '| perl -p -e "s/bin/%(suffix)s/" '
            '| cgat csv2db --table=%(tname)s --allow-empty-file '
            '>> %(outfile)s ')
        P.run()
def loadMEDIPS(infile, outfile):
    '''Load the MEDIPS covered-positions saturation summary.

    The last three lines of ``<infile>_saturation_coveredpos.csv`` are
    stripped of double quotes, converted from "," / ";" separated to tab
    separated, transposed and loaded into ``<prefix>_coveredpos`` with the
    columns coverage, ncovered and pcovered.  <prefix> is the table name
    of outfile with "_prep" removed.  The log is appended to outfile.
    '''
    # NOTE(review): P.run() presumably reads statement/table/infile/outfile
    # from these locals - keep the names.
    table = P.toTable(outfile).replace("_prep", "") + "_coveredpos"

    statement = (
        " cat %(infile)s_saturation_coveredpos.csv "
        "| tail -n 3 "
        """| perl -p -e 's/\\"//g; s/[,;]/\\t/g; ' """
        "| cgat table2table --transpose "
        "| cgat csv2db %(csv2db_options)s --table=%(table)s "
        "--replace-header --header-names=coverage,ncovered,pcovered "
        ">> %(outfile)s ")
    P.run()
def mergeAnnotations(infiles, outfile):
    '''Load variant annotations into single database table.

    The files in infiles (expected to end in ".annotations.gz") are merged
    into anno.txt.  The first line of the first usable file becomes the
    header (prefixed with "track"); all other lines are prefixed with the
    track name.  Missing or empty files are skipped with a warning.  The
    merged file is then loaded via P.load.
    '''
    # the merged file used to be written as anno.txt but loaded from the
    # non-existent "anno.text"
    merged = 'anno.txt'
    first = True
    with open(merged, 'w') as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".annotations.gz")
            if not os.path.exists(f):
                E.warn("File %s missing" % f)
                continue
            # text mode: in binary mode the lines are bytes and would be
            # formatted as "b'...'" under Python 3
            with gzip.open(f, "rt") as inf:
                lines = inf.readlines()
            if not lines:
                E.warn("File %s is empty" % f)
                continue
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))

    P.load(merged, outfile)
def loadBAMStats(infiles, outfile):
    '''Import bam statistics.

    The first two columns of every file in infiles are combined and
    transposed into one row per track (quoted file name minus
    ".readstats") and loaded into the table named after outfile.  The
    companion ``<infile>.nm`` and ``<infile>.nh`` tables are combined and
    loaded into ``<tablename>_nm`` and ``<tablename>_nh``; their logs are
    appended to outfile.
    '''
    # NOTE(review): P.run() presumably reads the names used in %(...)s
    # from these locals - keep the names.
    tablename = P.toTable(outfile)
    header = ",".join(
        P.tablequote(P.snip(name, ".readstats")) for name in infiles)
    filenames = " ".join("<( cut -f 1,2 < %s)" % name for name in infiles)

    E.info("loading bam stats - summary")
    statement = (
        'cgat combine_tables --header-names=%(header)s --missing-value=0 '
        '--ignore-empty %(filenames)s '
        '| perl -p -e "s/bin/track/" '
        '| perl -p -e "s/unique/unique_alignments/" '
        '| cgat table2table --transpose '
        '| cgat csv2db --add-index=track --table=%(tablename)s '
        '> %(outfile)s ')
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        tname = "%s_%s" % (tablename, suffix)
        filenames = " ".join("%s.%s" % (name, suffix) for name in infiles)
        statement = (
            'cgat combine_tables --header-names=%(header)s --skip-titles '
            '--missing-value=0 --ignore-empty %(filenames)s '
            '| perl -p -e "s/bin/%(suffix)s/" '
            '| cgat csv2db --table=%(tname)s >> %(outfile)s ')
        P.run()
def exportMacsIntervalsAsBed(infile, outfile, foldchange):
    '''Export MACS intervals with fold >= foldchange as a BED file.

    The track is the table name of infile's basename with the trailing
    "_macs" removed; intervals are read from ``<track>_macs_intervals``
    ordered by contig and start.  Each output line holds contig, start,
    end, interval_id and fold (formatted with %d, i.e. as an integer).

    The database connection and the output file are closed even if the
    query or the export fails.
    '''
    track = P.toTable(os.path.basename(infile))
    assert track.endswith("_macs")
    track = track[:-len("_macs")]

    statement = (
        "SELECT contig, start, end, interval_id, fold "
        "FROM %(track)s_macs_intervals where fold >= %(foldchange)s "
        "ORDER by contig, start" % {"track": track,
                                    "foldchange": foldchange})

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        with open(outfile, "w") as outs:
            for contig, start, end, interval_id, fold in cc:
                outs.write("%s\t%i\t%i\t%s\t%d\n" %
                           (contig, start, end, str(interval_id), fold))
        cc.close()
    finally:
        dbhandle.close()
def importFromSeries(infiles, outfile):
    '''Import expression levels from a GEO series.

    infiles is a pair (infile_data, infile_map).  infile_map is read with
    iotools.readMap and maps the column names of the series matrix to new
    names; "ID_REF" defaults to "probeset".  From the gzipped infile_data,
    lines starting with "!" and blank lines are dropped, double quotes are
    removed and the header line (starting with "ID_REF") is renamed
    through the map.  The cleaned table is loaded into the table named
    after outfile, indexed on the mapped ID_REF column.

    The temporary file is removed even if loading fails.
    '''
    tablename = P.toTable(outfile)
    infile_data, infile_map = infiles

    with open(infile_map, "r") as mapfile:
        map_header = iotools.readMap(mapfile)
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    tmpf = P.getTempFile()
    tmpname = tmpf.name
    try:
        try:
            # text mode: with bytes, line.startswith("!") raises TypeError
            # under Python 3
            with gzip.open(infile_data, "rt") as inf:
                for line in inf:
                    if line.startswith("!") or not line.strip():
                        continue
                    line = line.replace('"', "")
                    if line.startswith("ID_REF"):
                        line = "\t".join(
                            [map_header[x]
                             for x in line[:-1].split("\t")]) + "\n"
                    tmpf.write(line)
        finally:
            tmpf.close()

        header = map_header["ID_REF"]
        # The previous literal contained "\ " sequences, which Python does
        # not recognise as an escape and therefore passed to the shell as
        # escaped spaces glued to the following option.
        # NOTE(review): P.run() presumably interpolates `statement` from
        # these locals - keep header/tablename/tmpname/outfile.
        statement = (
            " cgat csv2db %(csv2db_options)s "
            "--add-index=%(header)s "
            "--table=%(tablename)s "
            "< %(tmpname)s > %(outfile)s ")
        P.run()
    finally:
        os.unlink(tmpname)