def loadDistances(infile, outfile):
    '''Load the distance annotations in *infile* via ``P.load``.

    The option string handed to ``P.load`` asks for indices on ``gene_id``
    and ``closest_id`` and maps both columns to ``str``.
    '''
    # The previously computed table name (outfile minus ".load") was never
    # used, so it is no longer derived here.
    P.load(
        infile,
        outfile,
        "--index=gene_id --map=gene_id:str "
        "--index=closest_id --map=closest_id:str")
def loadFilteredContigLengths(infile, outfile):
    '''Load filtered contig lengths.

    The load target is ``<dir>_<file>.load`` where ``<dir>`` is the input's
    directory name with ``.dir`` snipped and ``<file>`` is the input's base
    name with ``.tsv`` snipped.  *outfile* itself is only touched afterwards.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    file_part = P.snip(os.path.basename(infile), ".tsv")
    load_target = dir_part + "_" + file_part + ".load"

    P.load(infile, load_target)
    P.touch(outfile)
def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    One row is written per transcript_id found in
    ``annotations.transcript_info``, with a 0/1 column for each of the
    ``tata`` and ``cpg`` tables telling whether that transcript_id occurs in
    the table.  The rows go to a temporary file which is then handed to
    ``P.load`` together with *outfile*.  *infile* is not read.
    '''
    dbh = connect()
    cc = dbh.cursor()

    # Drop the target table first.  The name is formatted explicitly so the
    # loop variable below can no longer shadow it.
    target_table = P.toTable(outfile)
    cc.execute("""DROP TABLE IF EXISTS %s""" % target_table)

    transcripts = [row[0] for row in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
    ).fetchall()]

    # sqlite can not do full outer join, so membership is resolved in Python
    sources = ("tata", "cpg")
    members = []
    for source in sources:
        rows = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %s" % source).fetchall()
        members.append(set(row[0] for row in rows))

    tmpf = P.getTempFile()
    try:
        try:
            tmpf.write("transcript_id\t%s\n" % "\t".join(sources))
            for transcript_id in transcripts:
                flags = [str(int(transcript_id in found)) for found in members]
                tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(flags)))
        finally:
            tmpf.close()
        P.load(tmpf.name, outfile)
    finally:
        # remove the temporary file even when writing or loading fails
        os.unlink(tmpf.name)
def loadContigSummary(infile, outfile):
    '''Load contig summary stats for each assembler.

    The load target is the input's directory name (``.dir`` snipped), an
    underscore, the input's full base name and ``.load``.  *outfile* is only
    touched once the load has been issued.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    load_target = dir_part + "_" + os.path.basename(infile) + ".load"

    P.load(infile, load_target)
    P.touch(outfile)
def loadGeneTables(infile, outfile):
    '''Load genes from the metagenemark analysis.

    Inputs with and without "gff" in their name used to go through two
    branches that issued the very same ``P.load`` call, so the test on the
    file name has been dropped.
    '''
    P.load(infile, outfile)
def loadExonValidation(infiles, outfile):
    """Merge alignment stats into single tables.

    All *infiles* are passed to ``mergeAndLoad`` with the
    ``.exon.validation.tsv.gz`` suffix.  In addition, for every input the
    companion file ``<infile>.overrun.gz`` is loaded to
    ``<track>_overrun.load``, where ``<track>`` is the input name with the
    suffix snipped.
    """
    suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        track = P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", "%s_overrun.load" % track)
def loadContigLengths(infile, outfile):
    '''Load contig lengths.

    The load target is built from the input's directory name (``.dir``
    snipped) and base name (``.tsv`` snipped), joined by an underscore and
    suffixed with ``.load``.  *outfile* is touched afterwards.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    file_part = P.snip(os.path.basename(infile), ".tsv")
    load_target = dir_part + "_" + file_part + ".load"

    P.load(infile, load_target)
    P.touch(outfile)
def loadContigSummary(infile, outfile):
    '''Load contig summary stats for each assembler.

    Loads *infile* to ``<dir>_<basename>.load``, with ``<dir>`` being the
    input's directory name minus ``.dir``, then touches *outfile*.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    base_part = os.path.basename(infile)
    load_target = dir_part + "_" + base_part + ".load"

    P.load(infile, load_target)
    P.touch(outfile)
def loadExonValidation(infiles, outfile):
    '''Merge alignment stats into single tables.

    ``mergeAndLoad`` receives all *infiles* together with the
    ``.exon.validation.tsv.gz`` suffix.  Each input's ``.overrun.gz``
    companion file is then loaded separately to ``<track>_overrun.load``,
    ``<track>`` being the input name without that suffix.
    '''
    suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        overrun_target = "%s_overrun.load" % P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", overrun_target)
def loadContigGCContent(infile, outfile):
    '''Load contig GC content.

    The load target is ``<dir>_<file>.load`` (directory name without
    ``.dir``, base name without ``.tsv``) and ``--index=id`` is passed to
    ``P.load``.  *outfile* is touched afterwards.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    file_part = P.snip(os.path.basename(infile), ".tsv")
    load_target = dir_part + "_" + file_part + ".load"

    P.load(infile, load_target, "--index=id")
    P.touch(outfile)
def loadContigLengths(infile, outfile):
    '''Load contig lengths.

    The load target is ``<dir>_<file>.load`` (directory name without
    ``.dir``, base name without ``.tsv``) and ``--index=scaffold_name`` is
    passed to ``P.load``.  *outfile* is touched afterwards.
    '''
    dir_part = P.snip(os.path.dirname(infile), ".dir")
    file_part = P.snip(os.path.basename(infile), ".tsv")
    load_target = dir_part + "_" + file_part + ".load"

    P.load(infile, load_target, "--index=scaffold_name")
    P.touch(outfile)
def loadHypergeometricAnalysis(infile, outfile):
    '''Load GO results.

    Steps, in order:

    1. *infile* is loaded into ``hypergeometric_<track>_summary``, where
       ``<track>`` is ``P.toTable(outfile)``.
    2. For each of the sections ``results``, ``parameters`` and
       ``withgenes`` a shell statement combining the per-section files under
       ``hypergeometric.dir/<track>.tsv.dir/`` is run through ``P.run()``.
    3. For each ontology found in the summary table, the ``l2fold``,
       ``l10pvalue`` and ``l10qvalue`` matrices under ``<infile>.dir`` are
       loaded.  An ontology is skipped with a warning when its ``l2fold``
       file is missing; the other two files are not checked.
    '''
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # NOTE(review): genelists is not used below; the query is kept so the
    # database is accessed exactly as before.
    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        # NOTE(review): P.run() takes no arguments here, so it presumably
        # reads ``statement`` and the %(name)s values from this frame (and
        # scriptsdir / csv2db_options from configuration) - confirm.  The
        # local names are deliberately left unchanged for that reason.
        # The regex part is a raw literal so that ``\S`` is no longer an
        # invalid escape sequence; the resulting text is unchanged.
        statement = (
            ' python %(scriptsdir)s/combine_tables.py'
            ' --cat=track'
            r' --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"'
            ' hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s'
            ' | python %(scriptsdir)s/csv2db.py %(csv2db_options)s'
            ' --table=%(tablename)s'
            ' >> %(outfile)s')
        P.run()

    measures = ("l2fold", "l10pvalue", "l10qvalue")

    for ontology in ontologies:
        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l2fold" % ontology)
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for measure in measures:
            fn = os.path.join(
                infile + ".dir",
                "all_alldesc.%s.%s" % (ontology, measure))
            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, measure),
                   options='--allow-empty')
def loadEdgeR(infile, outfile):
    '''Load EdgeR per-chunk summary stats.

    Every file matching ``<infile>*_summary.tsv`` is loaded with
    ``collapse=0`` and ``transpose="sample"``.  The target name is derived
    from the part of the file name that follows *infile* plus one separator
    character, with ``_summary.tsv`` snipped.  *outfile* is touched at the
    end.
    '''
    # The prefix formerly derived from outfile was overwritten before any
    # use, so that dead assignment is gone.
    for fn in glob.glob(infile + "*_summary.tsv"):
        prefix = P.snip(fn[len(infile) + 1:], "_summary.tsv")
        # NOTE(review): the target is named ".deseq_summary.load" although
        # this is the EdgeR loader; left untouched - confirm it is intended.
        P.load(fn,
               prefix + ".deseq_summary.load",
               collapse=0,
               transpose="sample")

    P.touch(outfile)
def loadCufflinks(infile, outfile):
    '''Load expression level measurements.

    Two files derived from *infile* are loaded:

    * ``<infile>.genes_tracking.gz`` to ``<track>_genefpkm.load``
    * ``<infile>.fpkm_tracking.gz`` to ``<track>_fpkm.load``

    where ``<track>`` is *outfile* with ``.load`` snipped.  *outfile* is
    touched at the end.
    '''
    # computed once; it used to be recomputed identically before each load
    track = P.snip(outfile, ".load")

    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options="--index=gene_id "
           "--ignore-column=tracking_id "
           "--ignore-column=class_code "
           "--ignore-column=nearest_ref_id")

    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options="--index=tracking_id "
           "--ignore-column=nearest_ref_id "
           "--rename-column=tracking_id:transcript_id")

    P.touch(outfile)
def loadReadCounts(infiles, outfile):
    '''Load read counts into database.

    For each file in *infiles* the track is the file name with ``.nreads``
    snipped, and the read count is the second tab-separated field of the
    file's first line.  A ``track``/``total_reads`` table is written to a
    temporary file and handed to ``P.load``.

    An ``IndexError`` is raised when a first line has fewer than two fields,
    and a ``ValueError`` when the second field is not an integer.
    '''
    from contextlib import closing

    outf = P.getTempFile()
    try:
        try:
            outf.write("track\ttotal_reads\n")
            for infile in infiles:
                track = P.snip(infile, ".nreads")
                # only the first line is needed; close the handle right away
                with closing(IOTools.openFile(infile)) as inf:
                    first = inf.readline()
                # strip just the newline: slicing off the last character
                # would drop a digit if the line is not newline-terminated
                nreads = int(first.rstrip("\n").split("\t")[1])
                outf.write("%s\t%i\n" % (track, nreads))
        finally:
            outf.close()
        P.load(outf.name, outfile)
    finally:
        # do not leave the temporary file behind on failure
        os.unlink(outf.name)
def loadReadCounts(infiles, outfile):
    '''Load read counts into database.

    A two-column table (``track``, ``total_reads``) is assembled in a
    temporary file: the track is each input name without ``.nreads`` and the
    count is the second tab-separated field on the input's first line.  The
    table is then passed to ``P.load`` with *outfile*.

    Raises ``IndexError`` if a first line has no second field and
    ``ValueError`` if that field is not an integer.
    '''
    from contextlib import closing

    outf = P.getTempFile()
    try:
        try:
            outf.write("track\ttotal_reads\n")
            for infile in infiles:
                track = P.snip(infile, ".nreads")
                # read a single line and release the handle immediately
                with closing(IOTools.openFile(infile)) as handle:
                    header = handle.readline()
                # rstrip instead of [:-1] so a missing trailing newline does
                # not cost the last digit of the count
                fields = header.rstrip("\n").split("\t")
                outf.write("%s\t%i\n" % (track, int(fields[1])))
        finally:
            outf.close()
        P.load(outf.name, outfile)
    finally:
        # temporary file is removed whether or not the load succeeded
        os.unlink(outf.name)
def loadMotifInformation(infiles, outfile):
    '''Load information about motifs into database.

    Writes a one-column table (header ``motif``) holding the name of every
    non-empty input with ``.motif`` snipped, then loads it with
    ``--allow-empty``.  The temporary file is created via
    ``P.getTempFile(".")`` and removed after loading.
    '''
    table = P.getTempFile(".")
    table.write("motif\n")

    for infile in infiles:
        if not IOTools.isEmpty(infile):
            table.write("%s\n" % P.snip(infile, ".motif"))

    table.close()

    P.load(table.name, outfile, "--allow-empty")
    os.unlink(table.name)
def loadMemeSummary(infiles, outfile):
    '''Load information about motifs into database.

    Writes a one-column table (header ``track``) holding the name of every
    non-empty input with ``.meme`` snipped, then loads it.  The temporary
    file is created via ``P.getTempFile(".")`` and removed after loading.
    '''
    table = P.getTempFile(".")
    table.write("track\n")

    for infile in infiles:
        if not IOTools.isEmpty(infile):
            table.write("%s\n" % P.snip(infile, ".meme"))

    table.close()

    P.load(table.name, outfile)
    os.unlink(table.name)
def loadCufflinks(infile, outfile):
    '''Load expression level measurements.

    ``<infile>.genes_tracking.gz`` is loaded to ``<track>_genefpkm.load``
    and ``<infile>.fpkm_tracking.gz`` to ``<track>_fpkm.load``, ``<track>``
    being *outfile* with ``.load`` snipped.  *outfile* is touched once both
    loads have been issued.
    '''
    # a single snip serves both loads; the second identical call was redundant
    track = P.snip(outfile, ".load")

    gene_options = ("--index=gene_id "
                    "--ignore-column=tracking_id "
                    "--ignore-column=class_code "
                    "--ignore-column=nearest_ref_id")
    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options=gene_options)

    transcript_options = ("--index=tracking_id "
                          "--ignore-column=nearest_ref_id "
                          "--rename-column=tracking_id:transcript_id")
    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options=transcript_options)

    P.touch(outfile)
def loadMemeChipSummary(infiles, outfile):
    '''Load information about motifs into database.

    For every non-empty input the base name, with ``.memechip`` snipped, is
    split on "." into exactly four parts (track, npeaks, width, masking);
    any other number of parts raises ``ValueError``.  These four values plus
    the snipped name itself (column ``path``) make up one row of the table
    that is loaded.
    '''
    table = P.getTempFile(".")
    table.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue

        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        row = [str(value) for value in (track, npeaks, width, masking, fn)]
        table.write("\t".join(row) + "\n")

    table.close()

    P.load(table.name, outfile)
    os.unlink(table.name)
def loadTomTom(infile, outfile):
    '''Load tomtom results.

    ``tomtom.xml`` is looked up under ``<exportdir>/tomtom/<infile>``.  When
    it is missing, a warning is issued, *outfile* is touched and nothing is
    loaded.  Otherwise the ``name`` -> ``alt`` mapping of the ``motif``
    elements below ``targets`` is read from the XML, and each data line of
    *infile* is written out prefixed with the ``alt`` value looked up through
    its second column.  The ``#Query`` header line is replaced by a fixed
    header.

    Raises ``KeyError`` when a line's second column has no entry in the XML.
    '''
    from contextlib import closing

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    # Element.iter() is used because getiterator() was removed in Python 3.9
    name2alt = {}
    for motif in motifs.iter("motif"):
        name2alt[motif.get("name")] = motif.get("alt")

    tmpfile = P.getTempFile(".")
    try:
        try:
            # parse the text file; the handle is closed when done
            with closing(IOTools.openFile(infile)) as inf:
                for line in inf:
                    if line.startswith("#Query"):
                        tmpfile.write(
                            "target_name\tquery_id\ttarget_id\t"
                            "optimal_offset\tpvalue\tevalue\tqvalue\t"
                            "Overlap\tquery_consensus\ttarget_consensus\t"
                            "orientation\n")
                        continue
                    data = line[:-1].split("\t")
                    target_name = name2alt[data[1]]
                    # line still carries its own newline
                    tmpfile.write("%s\t%s" % (target_name, line))
        finally:
            tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        # remove the temporary file even if parsing or loading failed
        os.unlink(tmpfile.name)
def loadDESeq(infile, outfile):
    '''Load DESeq per-chunk summary stats.

    If ``<infile>_size_factors.tsv`` exists it is loaded to
    ``<outfile minus .load>_deseq_size_factors.load``.  Every file matching
    ``<infile>*_summary.tsv`` is then loaded to
    ``<chunk>.deseq_summary.load``, ``<chunk>`` being the part of the file
    name after *infile* plus one separator character, with ``_summary.tsv``
    snipped.  *outfile* is touched at the end.
    '''
    base = P.snip(outfile, ".load")

    size_factors = infile + "_size_factors.tsv"
    if os.path.exists(size_factors):
        P.load(size_factors,
               base + "_deseq_size_factors.load",
               collapse=True,
               transpose="sample")

    skip = len(infile) + 1
    for summary in glob.glob(infile + "*_summary.tsv"):
        chunk = P.snip(summary[skip:], "_summary.tsv")
        P.load(summary,
               chunk + ".deseq_summary.load",
               collapse=0,
               transpose="sample")

    P.touch(outfile)
def loadMissedReadCounts(infiles, outfile):
    """Load summary table of numbers of missed reads.

    *infiles* is sorted and consumed in consecutive pairs: the first file of
    each pair is taken as the ``.missed_junctions.gz`` file and the second as
    the transcriptome file.  The track is the junctions file name with that
    suffix snipped, and ``<track>.mapped_reads.gz`` supplies the
    ``mapped_genome`` column.  Every count is the file's number of lines
    minus one.

    Raises ``IndexError`` when *infiles* holds an odd number of entries.
    """
    from contextlib import closing

    def _getlines(inf):
        # count lines lazily and close the handle instead of leaking it
        with closing(IOTools.openFile(inf)) as handle:
            return sum(1 for _ in handle) - 1

    tmpfile = P.getTempFile()
    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track,
                                            mapped_genome,
                                            _getlines(junctions),
                                            _getlines(transcriptome)))
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def loadTomTom(infile, outfile):
    '''Load tomtom results.

    The XML report is expected at ``<exportdir>/tomtom/<infile>/tomtom.xml``.
    If it does not exist, a warning is logged, *outfile* is touched and the
    function returns without loading.  Otherwise each ``motif`` element
    under ``targets`` contributes a ``name`` -> ``alt`` entry, and every data
    line of *infile* is copied to a temporary table with the ``alt`` value
    for its second column prepended.  The ``#Query`` line is replaced by a
    fixed header line.

    Raises ``KeyError`` if a second-column value is absent from the XML.
    '''
    from contextlib import closing

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    # iter() instead of getiterator(), which no longer exists in Python 3.9+
    name2alt = dict(
        (motif.get("name"), motif.get("alt"))
        for motif in motifs.iter("motif"))

    header = (
        "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\t"
        "evalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\t"
        "orientation\n")

    tmpfile = P.getTempFile(".")
    try:
        try:
            # parse the text file, making sure the input handle is closed
            with closing(IOTools.openFile(infile)) as inf:
                for line in inf:
                    if line.startswith("#Query"):
                        tmpfile.write(header)
                        continue
                    target_id = line[:-1].split("\t")[1]
                    # the original line keeps its trailing newline
                    tmpfile.write("%s\t%s" % (name2alt[target_id], line))
        finally:
            tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        # clean up the temporary table on success and on failure
        os.unlink(tmpfile.name)
def loadMissedReadCounts(infiles, outfile):
    '''Load summary table of numbers of missed reads.

    After sorting, *infiles* is read two entries at a time as
    (junctions file, transcriptome file).  The track name is the junctions
    file with ``.missed_junctions.gz`` snipped, and the ``mapped_genome``
    column comes from ``<track>.mapped_reads.gz``.  All three counts are
    line counts minus one.

    Raises ``IndexError`` if the number of input files is odd.
    '''
    from contextlib import closing

    def _getlines(inf):
        # stream the file rather than holding all lines, and always close it
        with closing(IOTools.openFile(inf)) as handle:
            nlines = 0
            for _ in handle:
                nlines += 1
        return nlines - 1

    tmpfile = P.getTempFile()
    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track,
                                            mapped_genome,
                                            _getlines(junctions),
                                            _getlines(transcriptome)))
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def loadIdbaStats(infile, outfile):
    '''Load the idba stats.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadGeneLevelReadCounts(infile, outfile):
    '''Load *infile* via ``P.load``, passing ``--index=gene_id`` as options.'''
    load_options = "--index=gene_id"
    P.load(infile, outfile, options=load_options)
def loadCufflinksFPKM(infile, outfile):
    '''Load fpkm data into table.

    ``P.load`` is given an option string requesting indices on ``gene_id``
    and ``transcript_id``.
    '''
    load_options = "--index=gene_id --index=transcript_id"
    P.load(infile, outfile, load_options)
def loadDESeqStats(infile, outfile):
    '''Load *infile* via ``P.load`` with no extra options.'''
    P.load(infile, outfile)
def loadBlastOnAminoAcidSequences(infile, outfile):
    '''Load blastp results.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadChimericityScores(infile, outfile):
    '''Load the chimericity scores.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadFilteredContigStats(infile, outfile):
    '''Load the filtered contig stats.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadTATABox(infile, outfile):
    '''Load TATA box information.

    The file actually loaded is ``<infile>.table.gz``; ``P.load`` is given
    ``--index=transcript_id``.
    '''
    table_file = infile + ".table.gz"
    P.load(table_file, outfile, "--index=transcript_id")
def loadDummyTask(infile, outfile):
    '''Load results of word counting into database.

    ``P.load`` is given ``--index=word``.
    '''
    load_options = "--index=word"
    P.load(infile, outfile, load_options)
def loadAlignmentStats(infile, outfile):
    '''Load bam2stats results.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadMetavelvetRawStats(infile, outfile):
    '''Load the assembly stats for meta-velvet.

    The file loaded is not *infile* itself but its companion: *infile* with
    ``.contigs.fa`` snipped and ``.stats.txt`` appended.
    '''
    stem = P.snip(infile, ".contigs.fa")
    stats_file = stem + ".stats.txt"
    P.load(stats_file, outfile)
def loadMetavelvetStats(infile, outfile):
    '''Load the metavelvet stats.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadCpGIslands(infile, outfile):
    '''Load CpG Islands information.

    ``P.load`` is given ``--index=transcript_id``.
    '''
    load_options = "--index=transcript_id"
    P.load(infile, outfile, load_options)
def loadMACSsoloSummary(infile, outfile):
    '''Load macs summary.

    ``P.load`` is given ``--index=track``.
    '''
    load_options = "--index=track"
    P.load(infile, outfile, load_options)
def loadExpectedAndObservedGenomeCoverage(infile, outfile):
    '''Load the combined table for observed and expected genome coverage.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadFeatureCounts(infile, outfile):
    '''Load individual feature counts into database.

    ``P.load`` is given ``--index=gene_id``.
    '''
    load_options = "--index=gene_id"
    P.load(infile, outfile, load_options)
def loadFastqcSummary(infile, outfile):
    '''Load *infile* via ``P.load``, passing ``--index=track`` as options.'''
    load_options = "--index=track"
    P.load(infile, outfile, options=load_options)
def loadEssentialGeneAssignments(infile, outfile):
    '''Load assignments of essential genes.

    ``P.load`` is given ``--index=contig``.
    '''
    load_options = "--index=contig"
    P.load(infile, outfile, load_options)
def loadTranscriptLevelReadCounts(infile, outfile):
    '''Load *infile* via ``P.load``, passing ``--index=transcript_id``.'''
    load_options = "--index=transcript_id"
    P.load(infile, outfile, options=load_options)
def loadTagCountSummary(infile, outfile):
    '''Load windows summary.

    Two loads are issued:

    * *infile* with *outfile* as given;
    * the correlation table ``<infile minus .tsv>_correlation.tsv`` with
      ``<outfile minus _stats.load>_correlation.load``, passing
      ``--first-column=track``.
    '''
    P.load(infile, outfile)

    correlation_in = P.snip(infile, ".tsv") + "_correlation.tsv"
    correlation_out = P.snip(outfile, "_stats.load") + "_correlation.load"
    P.load(correlation_in,
           correlation_out,
           options="--first-column=track")
def loadOverrun(infile, outfile):
    '''Load annotations.

    ``P.load`` is given an option string requesting an index on ``gene_id``
    and a ``str`` mapping for that column.
    '''
    load_options = "--index=gene_id --map=gene_id:str"
    P.load(infile, outfile, load_options)
def loadEdgeRStats(infile, outfile):
    '''Load *infile* via ``P.load`` with no extra options.'''
    P.load(infile, outfile)
def loadFilteringSummary(infile, outfile):
    '''Load filtering summary.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadCuffdiffStats(infile, outfile):
    '''Import cuffdiff results.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)
def loadRepeats(infile, outfile):
    '''Load repeat overlap.

    ``P.load`` is given an option string requesting an index on ``gene_id``
    and a ``str`` mapping for that column.
    '''
    load_options = "--index=gene_id --map=gene_id:str"
    P.load(infile, outfile, load_options)
def loadDummyTask(infile, outfile):
    '''Load results of word counting into database.

    The load is issued with ``--index=word``.
    '''
    word_index = "--index=word"
    P.load(infile, outfile, word_index)
def loadContigStats(infile, outfile):
    '''Load the contig stats.

    Thin wrapper: *infile* and *outfile* go straight to ``P.load`` without
    any extra options.
    '''
    P.load(infile, outfile)