def buildFinalLncRNAGeneSet(filteredLncRNAGeneSet,
                            cpc_table,
                            outfile,
                            filter_cpc=None):
    '''filters lncRNA set based on the coding potential as output
    from the CPC.
    '''
    if filter_cpc:
        # get the transcripts that are designated as coding
        coding_set = set()
        dbh = sqlite3.connect("csvdb")
        cc = dbh.cursor()
        for transcript_id in cc.execute(
                "SELECT transcript_id FROM %s WHERE CP_score > 1"
                % cpc_table):
            coding_set.add(transcript_id[0])

        remove = set()
        outf_coding = gzip.open("gtfs/cpc_removed.gtf.gz", "w")
        for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
            if gtf.transcript_id in coding_set:
                remove.add(gtf.gene_id)
                outf_coding.write("%s\n" % gtf)
        outf_coding.close()
    else:
        # create empty set
        remove = set()

    # get temporary file for built lncrna
    temp = P.getTempFile(dir=".")
    # get temporary file for known lncrna
    temp2 = P.getTempFile(dir=".")
    for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
        if gtf.gene_id in remove:
            continue
        if gtf.transcript_id.find("TCONS") != -1:
            # output known and built transcripts separately
            temp.write("%s\n" % gtf)
        else:
            temp2.write("%s\n" % gtf)
    temp.close()
    temp2.close()

    filename = temp.name
    filename2 = temp2.name

    statement = '''cat %(filename)s |
                   python %(scriptsdir)s/gtf2gtf.py
                   --sort=gene |
                   python %(scriptsdir)s/gtf2gtf.py
                   --renumber-genes=NONCO%%i
                   --log=%(outfile)s.log |
                   python %(scriptsdir)s/gtf2gtf.py
                   --sort=gene
                   --log=%(outfile)s.log
                   > temp.gtf'''
    P.run()

    # recombine all transcripts with new ids
    statement = ('''cat %(filename2)s temp.gtf |
                    python %(scriptsdir)s/gtf2gtf.py
                    --sort=contig+gene
                    --log=%(outfile)s.log |
                    gzip > %(outfile)s''')
    P.run()
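# A minimal, self-contained sketch of the gene-level filtering idea used in
# buildFinalLncRNAGeneSet above: a transcript flagged as coding removes its
# *whole* gene from the set. All names and data here are illustrative only.

def filter_genes_by_coding_transcripts(transcripts, coding_transcript_ids):
    '''transcripts: iterable of (transcript_id, gene_id) tuples.
    Returns the gene_ids that have no transcript flagged as coding.'''
    remove = set(gene_id for transcript_id, gene_id in transcripts
                 if transcript_id in coding_transcript_ids)
    return set(gene_id for _, gene_id in transcripts) - remove

# example: g2 is dropped entirely because one of its transcripts is coding
print(filter_genes_by_coding_transcripts(
    [("t1", "g1"), ("t2", "g2"), ("t3", "g2")], set(["t2"])))
# expected: set(['g1'])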
def buildIntervalsFasta(infile, outfile):
    '''build a fasta file from intervals. Alternatively, if a gtf file
    is supplied, this function uses the extensions specified in the
    .ini file to take intervals upstream / downstream of each tss.
    '''
    # define upstream and downstream extensions
    upstream = PARAMS["intervals_extension_upstream"]
    downstream = PARAMS["intervals_extension_downstream"]
    assert len(str(upstream)), ("extension_upstream cannot be of %s type. "
                                "If no extension is to be used specify 0"
                                % type(upstream))
    assert len(str(downstream)), ("downstream extension cannot be of %s type. "
                                  "If no extension is to be used specify 0"
                                  % type(downstream))

    # if input is gtf then convert to bed
    # with intervals defined by .ini file
    temp = P.getTempFile("/ifs/scratch")
    if infile.endswith(".gtf.gz"):
        # the resulting temporary file will not be zipped
        concatenate = "cat"
        for gene in GTF.merged_gene_iterator(
                GTF.iterator(IOTools.openFile(infile))):
            if gene.strand == "+":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                           % (gene.contig,
                              str(gene.start - upstream),
                              str(gene.start + downstream),
                              gene.gene_id, ".", gene.strand))
            elif gene.strand == "-":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                           % (gene.contig,
                              str(gene.end - downstream),
                              str(gene.end + upstream),
                              gene.gene_id, ".", gene.strand))
        temp.close()
        inf = temp.name
    else:
        inf = infile
        concatenate = "zcat"

    to_cluster = True

    # define statement
    # option to specify strand in config file.
    statement = ("%(concatenate)s %(inf)s |"
                 " python %(scriptsdir)s/bed2fasta.py"
                 " --genome=%(genomedir)s/%(genome)s")
    if PARAMS["intervals_stranded"]:
        statement += (" --use-strand --log=%(outfile)s.log > %(outfile)s")
    else:
        statement += (" --log=%(outfile)s.log > %(outfile)s")

    P.run()

    if infile.endswith(".gtf.gz"):
        os.remove(inf)
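# A minimal sketch of the TSS-window arithmetic used in buildIntervalsFasta
# above (illustrative, not part of the pipeline): for a forward-strand gene
# the window is taken around `start`, for a reverse-strand gene around `end`,
# with upstream and downstream swapped so "upstream" is always 5' of the TSS.

def tss_window(start, end, strand, upstream, downstream):
    if strand == "+":
        return start - upstream, start + downstream
    elif strand == "-":
        return end - downstream, end + upstream
    raise ValueError("unknown strand: %s" % strand)

print(tss_window(1000, 2000, "+", 500, 100))   # (500, 1100)
print(tss_window(1000, 2000, "-", 500, 100))   # (1900, 2500)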
def buildAnnotatorDistanceAnnotations(annotations="expression"):
    '''build an annotations file for annotator_distance.'''

    tmpfile = P.getTempFile(".")
    tmpfilename = tmpfile.name

    if annotations == "expression":
        dbhandle = sqlite3.connect(PARAMS["database"])
        cc = dbhandle.cursor()
        statement = """
        SELECT gene_id,
        CASE WHEN %(annodist_master_expression_select)s
             THEN 'responsive' ELSE 'nonresponsive' END
        FROM probeset2transcript AS e,
             %(annodist_master_expression)s AS d
        WHERE d.cluster_id = e.cluster_id
        """ % dict(locals().items() + PARAMS.items())
        data = cc.execute(statement).fetchall()

        tmpfile.write("gene_id\tlabel\n")
        for gene_id, label in data:
            tmpfile.write("%s\t%s\n" % (gene_id, label))
        tmpfile.close()

    return tmpfilename
def loadMatchResults(infile, outfile):
    '''load the results of the match analysis into sqlite database.'''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(map(str, [
            details.seq_id,
            details.matrix_id,
            details.position,
            details.strand,
            details.core_score,
            details.matrix_score,
            details.sequence])) + "\n")
    temp.close()

    to_cluster = True
    job_options = "-l mem_free=64G"

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = ("python %(scriptsdir)s/csv2db.py"
                 " -t %(tablename)s"
                 " --log=%(outfile)s.log"
                 " --index=seq_id"
                 " %(csv2db_options)s"
                 " < %(inf)s > %(outfile)s")
    P.run()
    os.unlink(temp.name)
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''
    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in
              cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:
        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s
                    WHERE motif = '%(motif)s' AND start IS NOT NULL"""
                    % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''given a set of interval names that are present in a
    fasta file, return CpG content file.
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py
                   -s cpg -s length
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into
    SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       --index=track
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
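# A minimal sketch of the merge pattern shared by loadPicardAlignStats and
# the other Picard loaders below: the header row is kept from the first
# file only, and every data row is prefixed with its track name. Input is
# simulated with in-memory lines; the real functions read Picard metric
# files from disk, skipping comment lines.

def merge_with_track(files):
    '''files: iterable of (track, lines) pairs, comments already removed.'''
    merged = []
    for i, (track, lines) in enumerate(files):
        if i == 0:
            merged.append("track\t" + lines[0])      # header written once
        for line in lines[1:]:
            merged.append("%s\t%s" % (track, line))  # one row per track
    return merged

print(merge_with_track([("a", ["METRIC\n", "1\n"]),
                        ("b", ["METRIC\n", "2\n"])]))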
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfilename = tmpfile.name

    # note: the original statement was missing the pipe between
    # extract_fasta.pl and fasta2variants.py and misspelled scriptsdir.
    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard insert size stats into single table and load into
    SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       %(csv2db_options)s
                       --index=track
                       --table=%(tablename)s
                   > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)
def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications.'''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False

    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 " -t %(tablename)s"
                 " --log=%(outfile)s.log"
                 " --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
def loadMatchResults(infile, outfile):
    '''load the results of the match analysis into sqlite database.'''
    temp = P.getTempFile()
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(map(str, [
            details.seq_id,
            details.matrix_id,
            details.position,
            details.strand,
            details.core_score,
            details.matrix_score,
            details.sequence])) + "\n")
    # close the file to flush buffered writes before loading it
    temp.close()

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   --index=seq_id
                   %(csv2db_options)s
                   < %(inf)s > %(outfile)s'''
    P.run()
    os.remove(inf)
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()
    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables
    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s"
            % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
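# A minimal sketch of the 0/1 membership table built in loadTranscriptSummary
# above: one row per transcript, one column per feature set. Data here is
# illustrative only.

def membership_table(ids, named_sets):
    names = [name for name, _ in named_sets]
    rows = ["transcript_id\t" + "\t".join(names)]
    for i in ids:
        rows.append(i + "\t" + "\t".join(
            str(int(i in s)) for _, s in named_sets))
    return rows

for row in membership_table(["tx1", "tx2"],
                            [("tata", set(["tx1"])),
                             ("cpg", set(["tx2"]))]):
    print(row)
# transcript_id  tata  cpg
# tx1            1     0
# tx2            0     1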
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes,
                          genome):
    """import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    """
    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    import MySQLdb
    dbhandle = MySQLdb.Connect(host=PARAMS["ucsc_host"],
                               user=PARAMS["ucsc_user"])

    cc = dbhandle.cursor()
    cc.execute("USE %s " % ucsc_database)

    cc = dbhandle.cursor()
    cc.execute("SHOW TABLES LIKE '%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(".")

    for table in tables:
        E.info("loading repeats from %s" % table)
        cc = dbhandle.cursor()
        cc.execute("""SELECT genoName, 'repeat', 'exon',
                      genoStart+1, genoEnd, '.', strand, '.',
                      CONCAT('class \\"', repClass,
                             '\\"; family \\"', repFamily, '\\";')
                      FROM %(table)s
                      WHERE repClass in ('%(repclasses)s') """ % locals())
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    to_cluster = USECLUSTER
    statement = """cat %(tmpfilename)s
                   | %(scriptsdir)s/gff_sort pos
                   | python %(scriptsdir)s/gff2gff.py
                       --sanitize=genome
                       --skip-missing
                       --genome-file=%(genome)s
                       --log=%(outfile)s.log
                   | gzip > %(outfile)s """
    P.run()
    os.unlink(tmpfilename)
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       --index=track
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                       --missing=0
                       %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header=%(column)s,%(header)s
                           --replace-header
                           --index=track
                           --table=%(tname)s
                       >> %(outfile)s """
        P.run()

    os.unlink(tmpfilename)
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    tablename = outfile[:-len(".load")]
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               "bioprospector")
    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:
            distance = abs(match.start + match.width1 -
                           (match.end - match.width2))
            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0
            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end,
                           strand, arrangement))
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
           --allow-empty
           -b sqlite
           --index=id
           --index=motif
           --index=id,motif
           --table=%(tablename)s
    < %(tmpfilename)s > %(outfile)s
    '''
    P.run()
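# A minimal sketch of the motif-pair classification in loadBioProspector
# above: opposing strands ("+-"/"-+") are labeled ER, identical strands
# ("++"/"--") DR, and anything else SM with the distance forced to zero.
# Illustrative only.

def classify_arrangement(strand_pair, distance):
    if strand_pair in ("+-", "-+"):
        return "ER%i" % distance
    elif strand_pair in ("++", "--"):
        return "DR%i" % distance
    return "SM0"

print(classify_arrangement("+-", 5))   # ER5
print(classify_arrangement("--", 3))   # DR3
print(classify_arrangement("+", 7))    # SM0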
def computeOverlapGO(infile, outfile):
    '''compute overlap between coding markers and windows.

    Only markers of certain GO categories are counted. This is done by
    setting the gene_id and transcript_id of a marker to those of the
    ENSEMBL gene that it overlaps with. The list is filtered first to
    keep only those ids with valid GO associations.
    '''
    to_cluster = False

    filter_goid = set(IOTools.readList(open(PARAMS["filename_gofilter"])))
    filter_genes = set()

    E.info("number of goids: %i" % len(filter_goid))

    for l in open(PARAMS["filename_go"]):
        f, id, goid, desc, evd = l[:-1].split("\t")[:5]
        if goid in filter_goid:
            filter_genes.add(id)

    tmpfile1 = P.getTempFile(dir=".")
    for line in open("ensembl.diff.genes_ovl"):
        a, b = line[:-1].split("\t")
        if b not in filter_genes:
            continue
        tmpfile1.write(line)

    E.info("number of genes taken: %i" % len(filter_genes))

    tmpfile1.close()
    tmpfilename1 = tmpfile1.name

    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
                   --rename=gene
                   --apply=%(tmpfilename1)s
                   < %(infile)s > %(tmpfilename)s
    '''
    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py
                   --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed)
                   --decorator=counts
                   --filename-data=%(tmpfilename)s
                   --skip-empty
                   --is-gtf
                   --log=%(outfile)s.log
                   < %(genome)s.fasta > %(outfile)s'''
    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments"
                % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name

    statement = '''sort -k1,1 -k2,2n < %(tmpfname)s | uniq
                   | gzip > %(outfile_bed)s'''
    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.iteritems():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
def sharedIntervals(infile, outfile):
    '''identify shared intervals between datasets.'''
    to_cluster = True
    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name
    statement = '''cat %(infile)s > %(outfile)s;'''
    P.run()
    in_track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    for track in TRACKS:
        if str(track) != in_track:
            statement = '''intersectBed
                           -a %(outfile)s
                           -b intervals/%(track)s.merged.cleaned.bed
                           -u > %(tmpfilename)s;
                           mv %(tmpfilename)s %(outfile)s; '''
            P.run()
def uniqueIntervals(infile, outfile):
    '''identify unique intervals for each dataset.'''
    to_cluster = True
    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name
    statement = '''cat %(infile)s > %(outfile)s;'''
    P.run()
    in_track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    for track in TRACKS:
        if str(track) != in_track:
            statement = '''intersectBed
                           -a %(outfile)s
                           -b intervals/%(track)s.merged.cleaned.bed
                           -v > %(tmpfilename)s;
                           mv %(tmpfilename)s %(outfile)s '''
            P.run()
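# uniqueIntervals and sharedIntervals above differ only in the intersectBed
# flag: -v keeps intervals with *no* overlap in the other track, -u keeps
# intervals with at least one overlap, applied track by track. A minimal
# set-based sketch of the same logic (exact-coordinate matching only;
# intersectBed also handles partial overlaps):

def unique_and_shared(track, others):
    in_any_other = set().union(*others) if others else set()
    unique = track - in_any_other        # like chaining -v per track
    shared = set(track)
    for other in others:
        shared &= other                  # like chaining -u per track
    return unique, shared

a = set([("chr1", 10, 20), ("chr1", 50, 60)])
b = set([("chr1", 10, 20)])
print(unique_and_shared(a, [b]))
# (set([('chr1', 50, 60)]), set([('chr1', 10, 20)]))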
def buildCDSFasta(infile, outfile):
    '''load ENSEMBL cdna FASTA file

    *infile* is an ENSEMBL cdna file.
    '''
    dbname = outfile[:-len(".fasta")]
    # infile_peptides, infile_cdnas = infiles

    statement = '''gunzip < %(infile)s
                   | python %(scriptsdir)s/gff2fasta.py
                       --is-gtf
                       --genome=%(genome_dir)s/%(genome)s
                   | python %(scriptsdir)s/index_fasta.py
                       %(dbname)s --force -
                   > %(dbname)s.log '''
    P.run()
    return

    # note: the code below is unreachable due to the return above; it
    # relies on infile_peptides/infile_cdnas (see commented unpacking).
    tmpfile = P.getTempFile(".")

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    tmpfile.write("protein_id\ttranscript_id\n")
    tmpfile.write("\n".join(
        ["%s\t%s" % x for x in cc.execute(
            "SELECT DISTINCT protein_id, transcript_id "
            "FROM transcript_info")]))
    tmpfile.write("\n")
    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/peptides2cds.py
           --peptides=%(infile_peptides)s
           --cdnas=%(infile_cdnas)s
           --map=%(tmpfilename)s
           --output-format=fasta
           --log=%(outfile)s.log
    | python %(scriptsdir)s/index_fasta.py %(dbname)s --force -
    > %(dbname)s.log
    '''
    P.run()
    os.unlink(tmpfilename)
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''boxplot the relative abundance of false positive species
    compared to true positives.
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(os.path.basename(infiles[0]).replace("metaphlan_", ""),
                   ".load")
    tablename_true = [P.toTable(x) for x in infiles[1:]
                      if P.snip(os.path.basename(x), ".load") == track][0]

    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()

    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")

    estimate = {}
    true = set()
    for data in cc.execute("""SELECT taxon, rel_abundance FROM %s
                              WHERE taxon_level == 'species'"""
                           % tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s
                              WHERE level == 'species'"""
                           % tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"

    R('''dat <- read.csv("%s", header = T,
                         stringsAsFactors = F, sep = "\t")''' % inf)
    R('''library(ggplot2)''')
    # note: the ggplot2 argument is yintercept; the original read
    # "yintersect", which ggplot2 does not recognise.
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) +
         geom_boxplot(colour = "%s") +
         geom_hline(yintercept = 0, linetype = "dashed")''' % col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
def loadReadCounts(infiles, outfile):
    '''load read counts into database.'''
    outf = P.getTempFile()
    outf.write("track\ttotal_reads\n")
    for infile in infiles:
        track = P.snip(infile, ".nreads")
        lines = IOTools.openFile(infile).readlines()
        nreads = int(lines[0][:-1].split("\t")[1])
        outf.write("%s\t%i\n" % (track, nreads))
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def buildIntervalsFasta(infile, outfile):
    '''build a fasta file from intervals. Alternatively, if a gtf file
    is supplied, this function uses the extensions specified in the
    .ini file to take intervals upstream / downstream of each tss.
    '''
    # define upstream and downstream extensions
    upstream = PARAMS["intervals_extension_upstream"]
    downstream = PARAMS["intervals_extension_downstream"]
    assert len(str(upstream)), ("extension_upstream cannot be of %s type. "
                                "If no extension is to be used specify 0"
                                % type(upstream))
    assert len(str(downstream)), ("downstream extension cannot be of %s type. "
                                  "If no extension is to be used specify 0"
                                  % type(downstream))

    # if input is gtf then convert to bed
    # with intervals defined by .ini file
    temp = P.getTempFile()
    if infile.endswith(".gtf.gz"):
        # the resulting temporary file will not be zipped
        concatenate = "cat"
        for gene in GTF.merged_gene_iterator(
                GTF.iterator(IOTools.openFile(infile))):
            if gene.strand == "+":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                           % (gene.contig,
                              str(gene.start - upstream),
                              str(gene.start + downstream),
                              gene.gene_id, ".", gene.strand))
            elif gene.strand == "-":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n"
                           % (gene.contig,
                              str(gene.end - downstream),
                              str(gene.end + upstream),
                              gene.gene_id, ".", gene.strand))
        temp.close()
        inf = temp.name
    else:
        inf = infile
        concatenate = "zcat"

    # define statement
    # doesn't use strand information
    statement = '''%(concatenate)s %(inf)s
                   | python %(scriptsdir)s/bed2fasta.py
                       --genome=%(genomedir)s/%(genome)s
                       --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()

    if infile.endswith(".gtf.gz"):
        os.remove(inf)
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("motif\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile, "--allow-empty")
    os.unlink(outf.name)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("track\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def importAssignments(infiles, outfile):

    genes = collections.defaultdict(set)
    tags = []
    for infile in infiles:
        tag = re.sub(".*HVC", "", infile[:-len(".csv")])
        reader = csv.DictReader(open(infile, "rU"))
        for row in reader:
            try:
                gene_id = row["ESTIMA_ID"]
            except KeyError:
                print "Parsing error in line %s" % str(row)
                raise

            if gene_id == "":
                continue

            # remove ".A", ".B" and ".M" suffixes
            if gene_id[-2] == ".":
                gene_id = gene_id[:-2]
            genes[gene_id].add(tag)
        tags.append(tag)

    outf = P.getTempFile()
    outf.write("gene_id\t%s\n" % "\t".join(tags))
    for gene, present in genes.iteritems():
        x = []
        for t in tags:
            if t in present:
                x.append("1")
            else:
                x.append("0")
        outf.write("%s\t%s\n" % (gene, "\t".join(x)))
    outf.close()

    tablename = outfile[:-len(".import")]
    tmpfilename = outf.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
           --index=gene_id
           --table=%(tablename)s
    < %(tmpfilename)s > %(outfile)s
    '''
    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
def getNumReadsFromBAMFile(infile):
    '''count number of reads in bam file.'''
    # by-passes a problem with pysam, which was reading in stdout as the
    # first element in list data
    tmpf = P.getTempFile(".")
    tmpfile_name = tmpf.name
    statement = '''samtools idxstats %(infile)s > %(tmpfile_name)s'''

    P.run()

    read_info = IOTools.openFile(tmpfile_name).readlines()
    os.unlink(tmpfile_name)

    try:
        data = sum(map(int, [x.split("\t")[2]
                             for x in read_info if not x.startswith("#")]))
    except IndexError, msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s"
            % (msg, read_info))
    # return the total count of mapped reads (missing in the original)
    return data
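# A minimal sketch of the idxstats parsing in getNumReadsFromBAMFile above:
# `samtools idxstats` emits one line per contig in the form
# "name<TAB>length<TAB>mapped<TAB>unmapped", so the mapped-read total is
# the sum of the third column. Sample output is inlined instead of running
# samtools.

def count_mapped_reads(idxstats_lines):
    return sum(int(line.split("\t")[2])
               for line in idxstats_lines
               if line.strip() and not line.startswith("#"))

example = ["chr1\t248956422\t1200\t10\n",
           "chr2\t242193529\t800\t5\n",
           "*\t0\t0\t42\n"]
print(count_mapped_reads(example))   # 2000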
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 " --sort=gene"
                 " --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()
    os.unlink(tmpf)
def importFromIterator(outfile, tablename, iterator,
                       columns=None, indices=None):
    '''import data in *iterator* into *tablename* via temporary file.'''

    tmpfile = P.getTempFile()

    if columns:
        keys, values = zip(*columns.items())
        tmpfile.write("\t".join(values) + "\n")

    for row in iterator:
        if not columns:
            # deduce keys from the first row; note that this branch
            # expects rows indexable as row[0] with a keys() method
            keys = row[0].keys()
            values = keys
            columns = keys
            tmpfile.write("\t".join(values) + "\n")

        tmpfile.write("\t".join(str(row[x]) for x in keys) + "\n")

    tmpfile.close()

    if indices:
        indices = " ".join("--index=%s" % x for x in indices)
    else:
        indices = ""

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
           --table=%(tablename)s
           %(indices)s
    < %(tmpfilename)s > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfilename)
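# A minimal sketch of the serialization step inside importFromIterator:
# `columns` maps row keys to output column names, so each row must be
# indexable by those keys (e.g. a dict). Data and names are illustrative.

def rows_to_tsv(iterator, columns):
    keys, names = zip(*columns.items())
    lines = ["\t".join(names)]
    for row in iterator:
        lines.append("\t".join(str(row[k]) for k in keys))
    return lines

for line in rows_to_tsv([{"id": "g1", "len": 100},
                         {"id": "g2", "len": 250}],
                        {"id": "gene_id", "len": "length"}):
    print(line)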
def readChunk(lines, chunk):
    # use real file, as MAST parser can not deal with a
    # list of lines
    # note: `chunks` (the list of chunk start offsets) is expected to be
    # defined in the enclosing scope.
    tmpfile2 = P.getTempFile(".")
    try:
        motif, part = re.match(
            ":: motif = (\S+) - (\S+) ::",
            lines[chunks[chunk]]).groups()
    except AttributeError:
        raise P.PipelineError(
            "parsing error in line '%s'" % lines[chunks[chunk]])

    E.info("reading %s - %s" % (motif, part))

    tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
    tmpfile2.close()

    mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
    os.unlink(tmpfile2.name)

    return motif, part, mast
def loadReadCounts(infiles, outfile):
    '''load read counts into database.'''
    to_cluster = False
    outf = P.getTempFile()
    outf.write("track\ttotal_reads\n")
    for infile in infiles:
        track = P.snip(infile, ".nreads")
        lines = IOTools.openFile(infile).readlines()
        nreads = int(lines[0][:-1].split("\t")[1])
        outf.write("%s\t%i\n" % (track, nreads))
    outf.close()
    inname = outf.name

    tablename = P.toTable(outfile)
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(inname)s > %(outfile)s'''
    P.run()
    os.unlink(outf.name)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.getTempFile(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")
    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width,
                                       masking, fn))) + "\n")
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)
def loadTomTom(infile, outfile):
    '''load tomtom results.'''

    tablename = P.toTable(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.getTempFile(".")

    # parse the text file
    for line in IOTools.openFile(infile):
        if line.startswith("#Query"):
            tmpfile.write(
                "target_name\tquery_id\ttarget_id\t"
                "optimal_offset\tpvalue\tevalue\tqvalue\tOverlap\t"
                "query_consensus\ttarget_consensus\torientation\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
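# A minimal sketch of the name-to-alt lookup loadTomTom builds from
# tomtom.xml, with an inlined XML fragment instead of a file. The real
# layout of tomtom.xml may differ between MEME versions; this mirrors only
# the structure the loader above expects.

import xml.etree.ElementTree as ET

xml_text = """<tomtom><targets>
  <motif name="M00001" alt="SP1"/>
  <motif name="M00002" alt="AP1"/>
</targets></tomtom>"""

root = ET.fromstring(xml_text)
name2alt = dict((m.get("name"), m.get("alt"))
                for m in root.find("targets").iter("motif"))
print(name2alt)   # {'M00001': 'SP1', 'M00002': 'AP1'}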
def loadMissedReadCounts(infiles, outfile):
    """load summary table of numbers of missed reads."""

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")
    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" %
                      (track, mapped_genome,
                       _getlines(junctions), _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
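# A minimal sketch of the pairing convention in loadMissedReadCounts above:
# after sorting, files are consumed two at a time as (junctions,
# transcriptome) for the same track. File names are illustrative.

infiles = sorted(["b.missed_junctions.gz", "b.missed_transcriptome.gz",
                  "a.missed_junctions.gz", "a.missed_transcriptome.gz"])
pairs = [(infiles[x], infiles[x + 1]) for x in range(0, len(infiles), 2)]
print(pairs)
# [('a.missed_junctions.gz', 'a.missed_transcriptome.gz'),
#  ('b.missed_junctions.gz', 'b.missed_transcriptome.gz')]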
def genericImportAnnotator(infiles, outfile, table, workspace, slice,
                           subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''
    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    python %(scriptsdir)s/annotator.py
           --method=fdr-table
           --fdr-method=%(fdr_method)s
           --log=%(outfile)s.log
           --regex-id="(.*)%(suffix)s"
    %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
           --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)