def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
    | python %(scriptsdir)s/csv2db.py
          --add-index=track
          --table=%(tablename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
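# ----------------------------------------------------------------------
# A minimal sketch of how a merge-and-load task such as the one above is
# typically wired up with ruffus, the workflow engine used by these
# pipelines. The glob pattern, task name and target filename below are
# hypothetical; only the (infiles, outfile) calling convention is taken
# from the function itself.
from ruffus import merge

@merge("*.dedup.alignstats", "picard_align_stats.load")
def task_loadPicardAlignStats(infiles, outfile):
    # ruffus collects every file matching the glob into ``infiles`` and
    # passes the single target as ``outfile``. The ".load" suffix is the
    # convention from which P.toTable() derives the table name
    # (here: picard_align_stats).
    loadPicardAlignStats(infiles, outfile)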
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''
    dbh = connect()
    cc = dbh.cursor()
    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:
        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s
                    WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms
        | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''
    dbh = connect()
    table = P.toTable(outfile)
    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables
    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" %
            locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (
            transcript_id,
            "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()
    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC bias stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
    | cgat csv2db %(csv2db_options)s
          --add-index=track
          --table=%(tablename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''given a set of interval names that are present in a fasta file,
    output a table of CpG content for the matching sequences.
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s
    | cgat fasta2table
          -s na -s cpg -s length
          --log=%(outfile)s.log
    > %(outfile)s'''
    P.run()
    os.unlink(inf)
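# Expected inputs for calculateSequenceComposition, inferred from the code
# above (the file contents shown are hypothetical). ``interval_names`` is a
# one-column list of sequence identifiers, optionally with a header line:
#
#     interval_id
#     chr1:1000-2000
#     chr2:5000-5600
#
# ``sequence_file`` is a fasta file whose record titles start with those
# identifiers; matching records are extracted to a temporary fasta and
# piped through ``cgat fasta2table`` to compute composition statistics.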
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.getTempFile(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
def buildBenchmarkInput(infile, outfile):
    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    '''
    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
            '.', strand, '.',
            CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
            FROM %(table)s
            WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
          --method=sanitize
          --sanitize-method=genome
          --skip-missing
          --genome-file=%(genome)s
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
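# For reference, each row selected above already forms a 9-column GFF line,
# e.g. (values hypothetical):
#
#   chr1  repeat  exon  3000001  3000156  .  +  .  class "LINE"; family "L1";
#
# genoStart is shifted by +1 because UCSC coordinates are 0-based half-open,
# while GFF coordinates are 1-based inclusive.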
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
    | python %(scriptsdir)s/csv2db.py
          --add-index=track
          --table=%(tablename)s
    > %(outfile)s
    '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]
        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
        --missing-value=0
        %(filenames)s
        | python %(scriptsdir)s/csv2db.py
              --header-names=%(column)s,%(header)s
              --replace-header
              --add-index=track
              --table=%(tname)s
        >> %(outfile)s
        """
        P.run()

    os.unlink(tmpfilename)
def buildProbe2GeneMap(infile, outfile, PARAMS, platform="affy"):
    '''build file mapping probe id to gene id.'''
    if platform == "affy":
        array = PARAMS.get("affy_array")
        dataset = PARAMS.get("affy_dataset")
        R('''library("biomaRt")''')
        R('''library("affy")''')
        R('''dat <- ReadAffy()''')
        E.info("getting probes")
        R('''probes <- featureNames(dat)''')
        E.info("getting mart")
        R('''mart <- useMart("ensembl", dataset = "%s")''' % dataset)
        # matches to hgnc symbol - this might not be appropriate for
        # mouse data...
        E.info("mapping probes to gene")
        R('''probe2gene <- getBM(attributes = c("%s", "external_gene_name"),
                                 filters = "%s",
                                 values = probes,
                                 mart = mart)''' % (array, array))
        R('''colnames(probe2gene) <- c("probe", "gene")''')
        R('''probe2gene$gene <- toupper(probe2gene$gene)''')

        # remove probes that have no gene assignment (i.e. returned ""
        # from biomaRt) and those with multiple gene assignments
        # (cross-hybridisation)
        temp = P.getTempFile(".")
        E.info("writing temp file")
        R('''write.table(probe2gene, file = "%s",
                         sep = "\t", row.names = F)''' % temp.name)
        temp.close()

        E.info("filtering probes")
        inf = open(temp.name)
        header = inf.readline()
        outf = open(outfile, "w")
        outf.write(header)
        counts = collections.defaultdict(int)
        probe2gene = {}
        for line in inf.readlines():
            data = line[:-1].split("\t")
            probe, gene = data[0], data[1]
            if gene.strip('"') == '':
                continue
            probe2gene[probe] = gene
            counts[probe] += 1
        # keep only probes with a single gene assignment
        for probe, count in counts.items():
            if count == 1:
                outf.write("%s\t%s\n" % (probe, probe2gene[probe]))
        outf.close()
        os.unlink(temp.name)
    else:
        R('''
        library(limma)
        # read in data - maintain detection p-values for bg correction
        dat <- read.ilmn(files = "%s", other.columns = "Detection")
        probe2gene <- data.frame("probe" = rownames(dat),
                                 "gene" = dat$genes$TargetID)
        write.table(probe2gene, file = "%s", row.names = F, sep = "\t")
        ''' % (infile, outfile))
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idxstats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
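# ----------------------------------------------------------------------
# A self-contained sketch of the reshape performed inside loadIdxstats,
# using a toy idxstats table instead of real files (all values below are
# hypothetical). Note that DataFrame.append(), used above, was removed in
# pandas 2.x; pd.concat() in this sketch is the equivalent.
import pandas as pd

def _demo_idxstats_reshape():
    # one row per contig, plus the '*' row for unplaced reads
    df = pd.DataFrame({'region': ['chr1', 'chr2', '*'],
                       'length': [1000, 800, 0],
                       'mapped': [50, 30, 0],
                       'unmapped': [5, 3, 12]})
    # append the summary rows and the track label, as the loader does
    extra = pd.DataFrame(
        [['total_mapped_reads', df.mapped.sum()],
         ['total_reads', df.mapped.sum() + df.unmapped.sum()],
         ['track', 'sample1']],
        columns=['region', 'mapped'])
    df = pd.concat([df, extra], ignore_index=True)
    df.set_index('region', inplace=True)
    # transpose to one row per track, one column per contig
    wide = df[['mapped']].T.set_index('track')
    return wide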
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")
    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]
    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)
        for match in motifs.matches:
            distance = abs(
                match.start + match.width1 - (match.end - match.width2))
            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0
            arrangement += "%i" % distance
            strand = match.strand[0]
            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end,
                           strand, arrangement))
    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")
    os.unlink(tmpfile.name)
def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.

    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" + suffix``.
    '''
    if not tablename:
        tablename = "%s" % (suffix)

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        matrix_file = str(infile) + \
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")
        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")
    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")
    os.unlink(outf.name)
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    territories_file = gtffile
    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description "
                "FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name

    statement = '''sort -k1,1 -k2,2n < %(tmpfname)s
    | uniq
    | gzip
    > %(outfile_bed)s'''
    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.items():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
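# Each line written above is a 6-column bed entry, e.g. (hypothetical
# values):
#
#   chr12   1000    50000   go:GO:0006915   1   +
#
# i.e. the gene territory labelled with the prefixed annotation term, a
# constant score of 1, and the gene's strand; the ``sort | uniq`` step
# then collapses duplicate entries from genes sharing a territory.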
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard; the marked records are
    discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap.
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = \
        "-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC" % (PICARD_MEMORY)

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" + suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()
        for line in lines:
            # second column holds the count; strip the trailing newline
            count = line.split("\t")[1].strip()
            outf.write("%s\t%s\n" % (track, count))
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")
    os.unlink(outf.name)
def load_last_exon_chunks(infile, outfile):
    '''Load gene_id and chunk_id (derived from exon_id) for last exons
    into the database.'''
    from CGAT import GTF
    with P.getTempFile(shared=True) as tmpfile:
        tmpfile.write("gene_id\tchunk_id\n")
        for exon in GTF.iterator(IOTools.openFile(infile)):
            tmpfile.write("\t".join(
                [exon.gene_id,
                 re.sub(";", "", exon["exon_id"])]) + "\n")
        tmpfn = tmpfile.name

    # index the columns that are actually in the table
    P.load(tmpfn, outfile, options="-i gene_id -i chunk_id")
    os.unlink(tmpfn)
def buildCDSFasta(infile, outfile):
    '''load ENSEMBL cdna FASTA file

    *infile* is an ENSEMBL cdna file.
    '''
    dbname = outfile[:-len(".fasta")]
    # infile_peptides, infile_cdnas = infiles

    statement = '''gunzip < %(infile)s
    | python %(scriptsdir)s/gff2fasta.py
          --is-gtf
          --genome=%(genome_dir)s/%(genome)s
    | python %(scriptsdir)s/index_fasta.py %(dbname)s --force-output -
    > %(dbname)s.log
    '''
    P.run()
    return

    # NOTE: the code below is unreachable (see the ``return`` above); it
    # refers to ``infile_peptides`` and ``infile_cdnas``, which would come
    # from the commented-out unpacking at the top of the function.
    tmpfile = P.getTempFile(".")

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    tmpfile.write("protein_id\ttranscript_id\n")
    tmpfile.write("\n".join(
        ["%s\t%s" % x for x in cc.execute(
            "SELECT DISTINCT protein_id, transcript_id "
            "FROM transcript_info")]))
    tmpfile.write("\n")
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/peptides2cds.py
           --peptides-fasta-file=%(infile_peptides)s
           --cdnas=%(infile_cdnas)s
           --map=%(tmpfilename)s
           --output-format=fasta
           --log=%(outfile)s.log
    | python %(scriptsdir)s/index_fasta.py %(dbname)s --force-output -
    > %(dbname)s.log
    '''
    P.run()
    os.unlink(tmpfilename)
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("motif\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile, "--allow-empty-file")
    os.unlink(outf.name)
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --method=sort --sort-order=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()
    os.unlink(tmpf)
def loadMatchResults(infile, outfile):
    '''load the results of the match analysis into sqlite database.'''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(map(str, [
            details.seq_id,
            details.matrix_id,
            details.position,
            details.strand,
            details.core_score,
            details.matrix_score,
            details.sequence])) + "\n")
    temp.close()
    P.load(temp.name, outfile, options="--add-index=seq_id")
    os.unlink(temp.name)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("track\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def importFromIterator(
        outfile,
        tablename,
        iterator,
        columns=None,
        indices=None):
    '''import data in *iterator* into *tablename* via temporary file.'''
    tmpfile = P.getTempFile(".")

    if columns:
        keys, values = zip(*columns.items())
        tmpfile.write("\t".join(values) + "\n")

    for row in iterator:
        if not columns:
            # derive header from the keys of the first row
            keys = list(row.keys())
            values = keys
            columns = keys
            tmpfile.write("\t".join(values) + "\n")
        tmpfile.write("\t".join(str(row[x]) for x in keys) + "\n")

    tmpfile.close()

    if indices:
        indices = " ".join("--add-index=%s" % x for x in indices)
    else:
        indices = ""

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
    --table=%(tablename)s
    %(indices)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
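# A hypothetical usage sketch for importFromIterator: the table name,
# column mapping and data below are made up, but the calling convention
# follows the signature above (rows are dicts keyed by column name).
def _demo_importFromIterator():
    rows = [{"gene_id": "ENSG000001", "length": 1200},
            {"gene_id": "ENSG000002", "length": 800}]
    importFromIterator(
        "gene_lengths.load",          # outfile: the pipeline logfile
        "gene_lengths",               # table name in the sqlite database
        iter(rows),
        # map dict keys to column names in the output table
        columns={"gene_id": "gene_id", "length": "length"},
        indices=("gene_id",))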
def loadTomTom(infile, outfile):
    '''load tomtom results'''
    tablename = P.toTable(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.getTempFile(".")

    # parse the text file
    for line in IOTools.openFile(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(
                ("target_name", "query_id", "target_id",
                 "optimal_offset", "pvalue", "evalue", "qvalue",
                 "Overlap", "query_consensus",
                 "target_consensus", "orientation")) + "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def vennOverlap(dbh, outfiles):
    '''use limma to venn diagram the overlap between two conditions.'''
    cc = dbh.cursor()
    contrasts = set()
    for data in cc.execute(
            """SELECT contrast
            FROM differentially_expressed""").fetchall():
        contrasts.add(data[0])

    for cont1, cont2 in itertools.combinations(contrasts, 2):
        # collect per-probe status for the two contrasts. The original
        # list-based bookkeeping could get the columns out of step; a
        # probe-keyed dict keeps them aligned.
        result = collections.defaultdict(dict)
        temp = P.getTempFile(".")
        for data in cc.execute(
                """SELECT contrast, probe_id, status
                FROM differentially_expressed""").fetchall():
            contrast, probe, status = data[0], data[1], data[2]
            if contrast not in (cont1, cont2):
                continue
            # collapse status 2 onto 1 (both count as differential)
            if status == 2:
                status = 1
            result[probe][contrast] = status

        temp.write("\t".join(["probe",
                              P.snip(cont1, "_result"),
                              P.snip(cont2, "_result")]) + "\n")
        for probe, statuses in result.items():
            temp.write("\t".join(map(str, [probe,
                                           statuses.get(cont1, 0),
                                           statuses.get(cont2, 0)])) + "\n")
        temp.close()

        inf = temp.name
        outf = os.path.join("differential_expression.dir",
                            "%s_vs_%s.venn.pdf" % (cont1, cont2))
        R('''
        library("limma")
        dat <- read.csv("%s", header = T, stringsAsFactors = F,
                        sep = "\t", row.names = 1)
        venn.data <- vennCounts(dat)
        pdf("%s")
        vennDiagram(venn.data,
                    circle.col = c(rgb(1,0,0,0.5), rgb(0,1,0,0.5)),
                    lwd = 2)
        dev.off()
        ''' % (inf, outf))
        os.unlink(inf)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("method\ttrack\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def loadMissedReadCounts(infiles, outfile):
    '''load summary table of numbers of missed reads.'''

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()
    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")
    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (
            track,
            mapped_genome,
            _getlines(junctions),
            _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
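# loadMissedReadCounts relies on sorting to pair its inputs: after
# sorted(), the files are expected to alternate per track, e.g.
# (hypothetical names)
#
#   sampleA.missed_junctions.gz, sampleA.missed_transcriptome.gz,
#   sampleB.missed_junctions.gz, sampleB.missed_transcriptome.gz, ...
#
# so that infiles[x] / infiles[x + 1] are the junction and transcriptome
# files for the same track.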
def genericImportAnnotator(infiles, outfile, table, workspace,
                           slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''
    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    cgat annotator2tsv \
    --method=fdr-table \
    --fdr-method=%(fdr_method)s \
    --log=%(outfile)s.log \
    --regex-identifier="(.*)%(suffix)s" \
    %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + \
                re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = '''
    cgat csv2db %(csv2db_options)s \
    --table=%(table)s < %(tmpfilename2)s > %(outfile)s'''
    P.run(**dict(list(locals().items()) + list(PARAMS.items())))

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(
            map(str, (track, npeaks, width, masking, fn))) + "\n")
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications.'''
    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")
    os.unlink(temp.name)
def readChunk(lines, chunk):
    # use a real file, as the MAST parser can not deal with a list of
    # lines. ``chunks`` (the offsets of chunk boundaries within
    # ``lines``) comes from the enclosing scope.
    tmpfile2 = P.getTempFile(".")
    try:
        motif, part = re.match(
            ":: motif = (\S+) - (\S+) ::",
            lines[chunks[chunk]]).groups()
    except AttributeError:
        raise ValueError(
            "parsing error in line '%s'" % lines[chunks[chunk]])

    E.info("reading %s - %s" % (motif, part))

    tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
    tmpfile2.close()

    mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
    os.unlink(tmpfile2.name)

    return motif, part, mast
def getNumReadsFromBAMFile(infile):
    '''count number of reads in bam file.'''
    # by-passes a problem with pysam, which was reading in stdout as the
    # first elements in list data
    tmpf = P.getTempFile(".")
    tmpfile_name = tmpf.name
    statement = '''samtools idxstats %(infile)s > %(tmpfile_name)s'''

    P.run()

    read_info = IOTools.openFile(tmpfile_name).readlines()
    os.unlink(tmpfile_name)

    try:
        data = sum(map(int, [x.split("\t")[2]
                             for x in read_info if not x.startswith("#")]))
    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, read_info))
    return data
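# For reference, ``samtools idxstats`` emits one line per reference
# sequence plus a final ``*`` line for unplaced reads:
#
#   <contig>  <length>  <# mapped read segments>  <# unmapped read segments>
#
# so summing column 3 (index 2) above counts mapped read segments across
# all contigs.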
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''
    tablename = P.toTable(outfile)

    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = IOTools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    inf = gzip.open(infile_data, "r")

    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"
        tmpf.write(line)

    tmpf.close()
    tmpname = tmpf.name
    header = map_header["ID_REF"]

    statement = '''
    cgat csv2db %(csv2db_options)s \
    --add-index=%(header)s \
    --table=%(tablename)s \
    < %(tmpname)s > %(outfile)s
    '''
    P.run()
    os.unlink(tmpname)
def loadStrandSpecificity(infiles, outfile, suffix="strand", tablename=None):
    '''merge per-track strand-specificity summaries into one table and
    load it into the database. Each input file is expected to contain
    the library-type counts (MSR, ISR, OSR, ISF, MSF, OSF, SF, SR) for
    one track.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        name = P.snip(os.path.basename(infile), ".strand")
        table = pd.read_csv(infile, sep="\t")
        table["track"] = name

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(
                table_join,
                on=["MSR", "ISR", "OSR", "ISF", "MSF", "OSF",
                    "SF", "SR", "track"],
                how="outer")
    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")
    os.unlink(outf.name)