예제 #1
0
def loadDistances(infile, outfile):
    '''Load the distance table in `infile` via ``P.load``.

    The option string requests an index on ``gene_id`` and on
    ``closest_id`` and maps both columns to ``str``.

    infile: path of the table to load.
    outfile: load target, passed through to ``P.load`` unchanged.
    '''
    P.load(
        infile, outfile,
        "--index=gene_id --map=gene_id:str --index=closest_id --map=closest_id:str"
    )
예제 #2
0
def loadFilteredContigLengths(infile, outfile):
    '''
    Load the filtered contig lengths stored in `infile`.

    The load target is built from the directory of `infile` (without
    ".dir") and its basename (without ".tsv"), joined by "_" and
    followed by ".load".  `outfile` is only touched afterwards.
    '''
    directory = P.snip(os.path.dirname(infile), ".dir")
    stem = P.snip(os.path.basename(infile), ".tsv")
    target = "_".join((directory, stem)) + ".load"

    P.load(infile, target)
    P.touch(outfile)
예제 #3
0
def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    For every transcript id in ``annotations.transcript_info`` one row is
    written with a 0/1 column per feature table ("tata", "cpg") telling
    whether that id occurs in the table.  The rows go to a temporary
    file which is then loaded via ``P.load``.

    infile: not read by this function.
    outfile: load target; ``P.toTable(outfile)`` names the table that is
        dropped before the summary is rebuilt.
    '''
    dbh = connect()
    cc = dbh.cursor()

    # sqlite can not do full outer join, so membership is computed here
    summary_table = P.toTable(outfile)
    cc.execute("DROP TABLE IF EXISTS %s" % summary_table)

    transcripts = [row[0] for row in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    feature_tables = ("tata", "cpg")

    # one set of transcript ids per feature table, in column order
    members = []
    for feature_table in feature_tables:
        rows = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %s" % feature_table).fetchall()
        members.append({row[0] for row in rows})

    tmpf = P.getTempFile()
    try:
        try:
            tmpf.write("transcript_id\t%s\n" % "\t".join(feature_tables))
            for transcript_id in transcripts:
                flags = "\t".join(
                    str(int(transcript_id in ids)) for ids in members)
                tmpf.write("%s\t%s\n" % (transcript_id, flags))
        finally:
            tmpf.close()

        P.load(tmpf.name, outfile)
    finally:
        # remove the temporary file even when writing or loading fails
        os.unlink(tmpf.name)
def loadContigSummary(infile, outfile):
    '''
    Load the contig summary stats of one assembler.

    The load target is "<directory of infile without .dir>_<basename of
    infile>.load"; `outfile` is touched once the load has been issued.
    '''
    directory = P.snip(os.path.dirname(infile), ".dir")
    filename = os.path.basename(infile)
    target = "_".join((directory, filename)) + ".load"

    P.load(infile, target)
    P.touch(outfile)
예제 #5
0
def loadGeneTables(infile, outfile):
    '''
    Load genes from the metagenemark analysis.

    GFF and non-GFF inputs are loaded in exactly the same way, so
    `infile` is handed to ``P.load`` unconditionally.
    '''
    P.load(infile, outfile)
def loadExonValidation(infiles, outfile):
    """Merge alignment stats into single tables.

    All `infiles` are merged and loaded through ``mergeAndLoad``.  In
    addition, for each input the companion file ``<infile>.overrun.gz``
    is loaded into ``<track>_overrun.load``, where the track is the
    input name without the ".exon.validation.tsv.gz" suffix.
    """
    suffix = ".exon.validation.tsv.gz"
    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        track = P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", "%s_overrun.load" % track)
예제 #7
0
def loadContigLengths(infile, outfile):
    '''
    Load the contig lengths stored in `infile`.

    The load target combines the directory of `infile` (".dir" removed)
    and its basename (".tsv" removed) as "<dir>_<name>.load".
    `outfile` is touched after the load call.
    '''
    parent = P.snip(os.path.dirname(infile), ".dir")
    name = P.snip(os.path.basename(infile), ".tsv")
    outname = parent + "_" + name + ".load"

    P.load(infile, outname)
    P.touch(outfile)
예제 #8
0
def loadContigSummary(infile, outfile):
    '''
    Load contig summary stats for each assembler.

    `infile` is loaded under "<directory without .dir>_<basename>.load";
    `outfile` itself is only touched.
    '''
    parent = P.snip(os.path.dirname(infile), ".dir")
    outname = parent + "_" + os.path.basename(infile) + ".load"

    P.load(infile, outname)
    P.touch(outfile)
def loadExonValidation(infiles, outfile):
    '''Merge alignment stats into single tables.

    ``mergeAndLoad`` handles the merged table for all `infiles`.  Each
    input additionally has its ``<infile>.overrun.gz`` companion loaded
    into ``<track>_overrun.load``; the track is the input name with the
    ".exon.validation.tsv.gz" suffix removed.
    '''
    suffix = ".exon.validation.tsv.gz"
    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        overrun_target = "%s_overrun.load" % P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", overrun_target)
예제 #10
0
def loadContigGCContent(infile, outfile):
    '''
    Load the contig GC content stored in `infile`.

    The load target is "<directory without .dir>_<basename without
    .tsv>.load" and an index on ``id`` is requested.  `outfile` is
    touched afterwards.
    '''
    directory = P.snip(os.path.dirname(infile), ".dir")
    stem = P.snip(os.path.basename(infile), ".tsv")
    target = "_".join((directory, stem)) + ".load"

    P.load(infile, target, "--index=id")
    P.touch(outfile)
예제 #11
0
def loadContigLengths(infile, outfile):
    '''
    Load the contig lengths stored in `infile`.

    The load target is "<directory without .dir>_<basename without
    .tsv>.load" and an index on ``scaffold_name`` is requested.
    `outfile` is touched afterwards.
    '''
    directory = P.snip(os.path.dirname(infile), ".dir")
    stem = P.snip(os.path.basename(infile), ".tsv")
    target = "_".join((directory, stem)) + ".load"

    P.load(infile, target, "--index=scaffold_name")
    P.touch(outfile)
예제 #12
0
def loadHypergeometricAnalysis(infile, outfile):
    '''Load GO results.

    Three steps:

    1. `infile` is loaded as table ``hypergeometric_<track>_summary``.
    2. For each runGO.py output section ("results", "parameters",
       "withgenes") the per-section files are combined and loaded as
       ``hypergeometric_<track>_<section>``.
    3. For each ontology listed in the summary table, the l2fold,
       l10pvalue and l10qvalue matrices found in ``<infile>.dir`` are
       loaded.  An ontology without an l2fold file is skipped with a
       warning.

    infile: summary table produced by the GO analysis.
    outfile: load target; ``P.toTable(outfile)`` supplies the track name.
    '''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement` and the %(...)s values from this function's local
    # names - keep `statement`, `tablename`, `track` and `section` as is.
    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        # raw string: (\S+) is a regex and must keep its backslash without
        # relying on an invalid string escape
        statement = r'''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    matrix_dir = infile + ".dir"

    for ontology in ontologies:

        # only the l2fold file decides whether the ontology is skipped
        fn = os.path.join(matrix_dir, "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        for measure in ("l2fold", "l10pvalue", "l10qvalue"):
            fn = os.path.join(
                matrix_dir, "all_alldesc.%s.%s" % (ontology, measure))
            P.load(fn,
                   outfile,
                   tablename='hypergeometric_%s_%s_%s' % (
                       track, ontology, measure),
                   options='--allow-empty')
예제 #13
0
def loadEdgeR(infile, outfile):
    '''Load EdgeR per-chunk summary stats.

    Every file matching ``<infile>*_summary.tsv`` is loaded into
    ``<chunk>.deseq_summary.load`` with ``collapse=0`` and
    ``transpose="sample"``.  The chunk name is the part of the file name
    after `infile` plus one more character, without "_summary.tsv".
    `outfile` is touched at the end.
    '''
    # NOTE(review): this value is never read - the loop below rebinds the
    # name.  The call is kept in case P.snip validates the suffix - confirm.
    prefix = P.snip(outfile, ".load")

    # skip the `infile` prefix and the single character following it
    offset = len(infile) + 1

    for summary_file in glob.glob(infile + "*_summary.tsv"):
        prefix = P.snip(summary_file[offset:], "_summary.tsv")
        # NOTE(review): target says "deseq" although this is the EdgeR
        # loader - confirm this is intended
        target = prefix + ".deseq_summary.load"
        P.load(summary_file, target, collapse=0, transpose="sample")

    P.touch(outfile)
def loadCufflinks(infile, outfile):
    '''Load expression level measurements.

    Two companion files of `infile` are loaded:

    * ``<infile>.genes_tracking.gz`` into ``<track>_genefpkm.load``
    * ``<infile>.fpkm_tracking.gz`` into ``<track>_fpkm.load``

    where ``<track>`` is `outfile` without ".load".  `outfile` itself is
    touched at the end.
    '''
    track = P.snip(outfile, ".load")

    # (input suffix, target suffix, load options)
    jobs = (
        (".genes_tracking.gz",
         "_genefpkm.load",
         "--index=gene_id --ignore-column=tracking_id --ignore-column=class_code --ignore-column=nearest_ref_id"),
        (".fpkm_tracking.gz",
         "_fpkm.load",
         "--index=tracking_id --ignore-column=nearest_ref_id --rename-column=tracking_id:transcript_id"),
    )

    for in_suffix, out_suffix, load_options in jobs:
        P.load(infile + in_suffix,
               outfile=track + out_suffix,
               options=load_options)

    P.touch(outfile)
예제 #15
0
def loadReadCounts(infiles, outfile):
    '''Load read counts into database.

    For each ``<track>.nreads`` file the read count is taken from the
    second tab-separated field of the first line.  Track and count are
    written to a temporary table that is loaded via ``P.load``.

    infiles: iterable of ``.nreads`` file names.
    outfile: load target passed to ``P.load``.

    Raises IndexError if the first line has no second field and
    ValueError if that field is not an integer.
    '''
    outf = P.getTempFile()
    try:
        try:
            outf.write("track\ttotal_reads\n")
            for infile in infiles:
                track = P.snip(infile, ".nreads")

                # only the first line is needed; close the handle promptly
                inf = IOTools.openFile(infile)
                try:
                    first_line = inf.readline()
                finally:
                    inf.close()

                # strip the line terminator only, so a file without a
                # trailing newline does not lose the last digit
                nreads = int(first_line.rstrip("\r\n").split("\t")[1])
                outf.write("%s\t%i\n" % (track, nreads))
        finally:
            outf.close()

        P.load(outf.name, outfile)
    finally:
        # remove the temporary file even if parsing or loading failed
        os.unlink(outf.name)
def loadReadCounts(infiles, outfile):
    '''Load read counts into database.

    The count of each ``<track>.nreads`` input is the second
    tab-separated field of its first line.  A two-column table
    (track, total_reads) is assembled in a temporary file and loaded
    via ``P.load``.

    infiles: iterable of ``.nreads`` file names.
    outfile: load target passed to ``P.load``.

    Raises IndexError if the first line has no second field and
    ValueError if that field is not an integer.
    '''
    outf = P.getTempFile()
    try:
        try:
            outf.write("track\ttotal_reads\n")
            for infile in infiles:
                track = P.snip(infile, ".nreads")

                # read just the first line and release the handle
                handle = IOTools.openFile(infile)
                try:
                    header = handle.readline()
                finally:
                    handle.close()

                # remove only the line terminator: slicing off the last
                # character would truncate the count when the file does
                # not end with a newline
                fields = header.rstrip("\r\n").split("\t")
                outf.write("%s\t%i\n" % (track, int(fields[1])))
        finally:
            outf.close()

        P.load(outf.name, outfile)
    finally:
        # clean up the temporary file on success and on failure
        os.unlink(outf.name)
예제 #17
0
def loadMotifInformation(infiles, outfile):
    '''Load information about motifs into database.

    Writes a one-column table ("motif") holding the name of every
    non-empty input file without its ".motif" suffix, then loads it
    with "--allow-empty".
    '''
    outf = P.getTempFile(".")
    outf.write("motif\n")

    # inputs are tested lazily, one at a time, in their original order
    non_empty = (fn for fn in infiles if not IOTools.isEmpty(fn))
    for fn in non_empty:
        outf.write("%s\n" % P.snip(fn, ".motif"))

    outf.close()

    P.load(outf.name, outfile, "--allow-empty")

    os.unlink(outf.name)
예제 #18
0
def loadMemeSummary(infiles, outfile):
    '''Load information about motifs into database.

    Writes a one-column table ("track") with the name of each non-empty
    input file, ".meme" suffix removed, and loads it via ``P.load``.
    '''
    outf = P.getTempFile(".")
    outf.write("track\n")

    for infile in infiles:
        if not IOTools.isEmpty(infile):
            track = P.snip(infile, ".meme")
            outf.write("%s\n" % track)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
예제 #19
0
def loadCufflinks(infile, outfile):
    '''Load expression level measurements.

    ``<infile>.genes_tracking.gz`` goes to ``<track>_genefpkm.load`` and
    ``<infile>.fpkm_tracking.gz`` to ``<track>_fpkm.load``, with
    ``<track>`` being `outfile` without ".load".  `outfile` is touched
    once both loads have been issued.
    '''
    track = P.snip(outfile, ".load")

    gene_options = ("--index=gene_id "
                    "--ignore-column=tracking_id "
                    "--ignore-column=class_code "
                    "--ignore-column=nearest_ref_id")
    transcript_options = ("--index=tracking_id "
                          "--ignore-column=nearest_ref_id "
                          "--rename-column=tracking_id:transcript_id")

    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options=gene_options)

    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options=transcript_options)

    P.touch(outfile)
예제 #20
0
def loadMemeChipSummary(infiles, outfile):
    '''Load information about motifs into database.

    The basename of each non-empty input (".memechip" removed) must
    consist of exactly four dot-separated fields: track, npeaks, width
    and masking.  These are written together with the full name ("path"
    column) to a temporary table that is loaded via ``P.load``.
    '''
    outf = P.getTempFile(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if not IOTools.isEmpty(infile):
            fn = P.snip(os.path.basename(infile), ".memechip")

            # unpacking enforces the four-field naming scheme
            track, npeaks, width, masking = fn.split(".")
            row = [str(value)
                   for value in (track, npeaks, width, masking, fn)]
            outf.write("\t".join(row) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
예제 #21
0
def loadTomTom(infile, outfile):
    '''Load tomtom results.

    The tab-separated tomtom text output in `infile` is copied to a
    temporary file with one extra leading column, ``target_name``.  That
    column holds the ``alt`` attribute of the target motif, looked up by
    the second field of each line in
    ``<exportdir>/tomtom/<infile>/tomtom.xml``.  The temporary file is
    then loaded via ``P.load``.

    If the XML file does not exist, a warning is emitted, `outfile` is
    touched and nothing is loaded.

    Raises KeyError if a line refers to a target that is not listed in
    the XML file.
    '''
    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    # Element.getiterator() was removed in Python 3.9; iter() replaces it
    name2alt = {motif.get("name"): motif.get("alt")
                for motif in motifs.iter("motif")}

    header = "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n"

    tmpfile = P.getTempFile(".")
    try:
        try:
            # parse the text file
            inf = IOTools.openFile(infile)
            try:
                for line in inf:
                    if line.startswith("#Query"):
                        tmpfile.write(header)
                        continue
                    data = line.rstrip("\n").split("\t")
                    target_name = name2alt[data[1]]
                    # `line` still carries its own line terminator
                    tmpfile.write("%s\t%s" % (target_name, line))
            finally:
                inf.close()
        finally:
            tmpfile.close()

        P.load(tmpfile.name, outfile)
    finally:
        # remove the temporary file even when parsing or loading fails
        os.unlink(tmpfile.name)
예제 #22
0
def loadDESeq(infile, outfile):
    '''Load DESeq per-chunk summary stats.

    If ``<infile>_size_factors.tsv`` exists it is loaded into
    ``<outfile without .load>_deseq_size_factors.load``.  Every file
    matching ``<infile>*_summary.tsv`` is then loaded into
    ``<chunk>.deseq_summary.load``.  All loads use
    ``transpose="sample"``.  `outfile` is touched at the end.
    '''
    prefix = P.snip(outfile, ".load")

    size_factors = infile + "_size_factors.tsv"
    if os.path.exists(size_factors):
        P.load(size_factors,
               prefix + "_deseq_size_factors.load",
               collapse=True,
               transpose="sample")

    # chunk names start one character after the `infile` prefix
    offset = len(infile) + 1

    for summary_file in glob.glob(infile + "*_summary.tsv"):
        chunk = P.snip(summary_file[offset:], "_summary.tsv")
        P.load(summary_file,
               chunk + ".deseq_summary.load",
               collapse=0,
               transpose="sample")

    P.touch(outfile)
def loadMissedReadCounts(infiles, outfile):
    """Load summary table of numbers of missed reads.

    `infiles` is sorted and consumed in consecutive pairs: the first
    file of each pair is treated as the ``.missed_junctions.gz`` file and
    the second as the transcriptome file (this relies on the sort order
    - verify against the caller).  For each pair one row is written with
    the track name and the line counts (minus one) of
    ``<track>.mapped_reads.gz``, the junctions file and the
    transcriptome file.

    Raises IndexError if `infiles` holds an odd number of files.
    """

    def _getlines(inf):
        # line count minus one, streamed so the file is never held in
        # memory as a whole and the handle is always closed
        handle = IOTools.openFile(inf)
        try:
            return sum(1 for _ in handle) - 1
        finally:
            handle.close()

    infiles = sorted(infiles)

    tmpfile = P.getTempFile()
    try:
        try:
            tmpfile.write("track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

            for x in range(0, len(infiles), 2):
                junctions, transcriptome = infiles[x], infiles[x + 1]
                track = P.snip(junctions, ".missed_junctions.gz")
                mapped_genome = _getlines(track + ".mapped_reads.gz")
                tmpfile.write("%s\t%i\t%i\t%i\n" % (
                    track, mapped_genome,
                    _getlines(junctions), _getlines(transcriptome)))
        finally:
            tmpfile.close()

        P.load(tmpfile.name, outfile)
    finally:
        # remove the temporary file even when counting or loading fails
        os.unlink(tmpfile.name)
예제 #24
0
def loadTomTom(infile, outfile):
    '''Load tomtom results.

    Each data line of the tab-separated tomtom output `infile` is
    prefixed with a ``target_name`` column.  Its value is the ``alt``
    attribute of the target motif in
    ``<exportdir>/tomtom/<infile>/tomtom.xml``, looked up by the second
    field of the line.  The result is written to a temporary file and
    loaded via ``P.load``.

    Without the XML file a warning is emitted, `outfile` is touched and
    the function returns without loading anything.

    Raises KeyError if a line names a target missing from the XML file.
    '''
    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    # iter() replaces getiterator(), which was removed in Python 3.9
    for motif in motifs.iter("motif"):
        name2alt[motif.get("name")] = motif.get("alt")

    header = "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n"

    tmpfile = P.getTempFile(".")
    try:
        try:
            # parse the text file
            inf = IOTools.openFile(infile)
            try:
                for line in inf:
                    if line.startswith("#Query"):
                        tmpfile.write(header)
                        continue
                    data = line.rstrip("\n").split("\t")
                    # `line` keeps its terminator, so none is added here
                    tmpfile.write("%s\t%s" % (name2alt[data[1]], line))
            finally:
                inf.close()
        finally:
            tmpfile.close()

        P.load(tmpfile.name, outfile)
    finally:
        # the temporary file is removed on success and on failure
        os.unlink(tmpfile.name)
def loadMissedReadCounts(infiles, outfile):
    '''Load summary table of numbers of missed reads.

    The sorted `infiles` are read two at a time: first the
    ``.missed_junctions.gz`` file, then the transcriptome file (this
    depends on the sort order - verify against the caller).  Each pair
    yields one row with the track name and the line counts (minus one)
    of ``<track>.mapped_reads.gz``, the junctions file and the
    transcriptome file.

    Raises IndexError if `infiles` holds an odd number of files.
    '''
    def _getlines(inf):
        # count lines without loading the file; always close the handle
        handle = IOTools.openFile(inf)
        try:
            nlines = 0
            for _ in handle:
                nlines += 1
        finally:
            handle.close()
        return nlines - 1

    infiles = sorted(infiles)

    tmpfile = P.getTempFile()
    try:
        try:
            tmpfile.write(
                "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

            for x in range(0, len(infiles), 2):
                junctions, transcriptome = infiles[x], infiles[x + 1]
                track = P.snip(junctions, ".missed_junctions.gz")
                mapped_genome = _getlines(track + ".mapped_reads.gz")
                tmpfile.write("%s\t%i\t%i\t%i\n" %
                              (track, mapped_genome, _getlines(junctions),
                               _getlines(transcriptome)))
        finally:
            tmpfile.close()

        P.load(tmpfile.name, outfile)
    finally:
        # do not leave the temporary file behind when something fails
        os.unlink(tmpfile.name)
예제 #26
0
def loadIdbaStats(infile, outfile):
    '''
    Load the idba stats.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #27
0
def loadGeneLevelReadCounts(infile, outfile):
    '''Load `infile` via ``P.load``, requesting an index on ``gene_id``.

    Going by the function name the table holds gene-level read counts.
    '''
    index_option = "--index=gene_id"
    P.load(infile, outfile, options=index_option)
예제 #28
0
def loadCufflinksFPKM(infile, outfile):
    '''Load FPKM data into a table.

    Indices on ``gene_id`` and ``transcript_id`` are requested through
    the option string given to ``P.load``.
    '''
    index_options = "--index=gene_id --index=transcript_id"
    P.load(infile, outfile, index_options)
예제 #29
0
def loadDESeqStats(infile, outfile):
    '''Hand `infile` and `outfile` to ``P.load`` without extra options.

    Going by the function name the table holds DESeq statistics.
    '''
    P.load(infile, outfile)
예제 #30
0
def loadBlastOnAminoAcidSequences(infile, outfile):
    '''
    Load blastp results.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #31
0
def loadChimericityScores(infile, outfile):
    '''
    Load the chimericity scores.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #32
0
def loadFilteredContigStats(infile, outfile):
    '''
    Load the filtered contig stats.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #33
0
def loadTATABox(infile, outfile):
    '''Load TATA box information.

    The table is read from ``<infile>.table.gz`` and an index on
    ``transcript_id`` is requested.
    '''
    table_file = infile + ".table.gz"
    index_option = "--index=transcript_id"
    P.load(table_file, outfile, index_option)
예제 #34
0
def loadDummyTask(infile, outfile):
    '''Load the results of word counting into the database.

    An index on the ``word`` column is requested.
    '''
    index_option = "--index=word"
    P.load(infile, outfile, index_option)
예제 #35
0
def loadAlignmentStats(infile, outfile):
    '''
    Load bam2stats results.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #36
0
def loadMetavelvetRawStats(infile, outfile):
    '''
    Load the assembly stats for meta-velvet.

    The stats are not read from `infile` itself but from its sibling
    file: the ".contigs.fa" suffix is replaced by ".stats.txt".
    '''
    P.load(P.snip(infile, ".contigs.fa") + ".stats.txt", outfile)
예제 #37
0
def loadMetavelvetStats(infile, outfile):
    '''
    Load the metavelvet stats.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #38
0
def loadCpGIslands(infile, outfile):
    '''Load CpG islands information.

    An index on the ``transcript_id`` column is requested.
    '''
    index_option = "--index=transcript_id"
    P.load(infile, outfile, index_option)
예제 #39
0
def loadMACSsoloSummary(infile, outfile):
    '''Load the macs summary.

    An index on the ``track`` column is requested.
    '''
    index_option = "--index=track"
    P.load(infile, outfile, index_option)
예제 #40
0
def loadExpectedAndObservedGenomeCoverage(infile, outfile):
    '''
    Load the combined table for observed and expected genome coverage.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #41
0
def loadFeatureCounts(infile, outfile):
    '''Load individual feature counts into the database.

    An index on the ``gene_id`` column is requested.
    '''
    index_option = "--index=gene_id"
    P.load(infile, outfile, index_option)
예제 #42
0
def loadFastqcSummary(infile, outfile):
    '''Load `infile` via ``P.load``, requesting an index on ``track``.

    Going by the function name the table is a FastQC summary.
    '''
    index_option = "--index=track"
    P.load(infile, outfile, options=index_option)
예제 #43
0
def loadEssentialGeneAssignments(infile, outfile):
    '''
    Load the assignments of essential genes.

    An index on the ``contig`` column is requested.
    '''
    index_option = "--index=contig"
    P.load(infile, outfile, index_option)
예제 #44
0
def loadTranscriptLevelReadCounts(infile, outfile):
    '''Load `infile` via ``P.load`` with an index on ``transcript_id``.

    Going by the function name the table holds transcript-level read
    counts.
    '''
    index_option = "--index=transcript_id"
    P.load(infile, outfile, options=index_option)
예제 #45
0
def loadMACSsoloSummary(infile, outfile):
    '''Load the macs summary, indexed on the ``track`` column.'''
    track_index = "--index=track"
    P.load(infile, outfile, track_index)
예제 #46
0
def loadTagCountSummary(infile, outfile):
    '''Load the windows summary and its correlation table.

    After `infile` itself, the companion file ``<infile without
    .tsv>_correlation.tsv`` is loaded into ``<outfile without
    _stats.load>_correlation.load`` with "--first-column=track".
    '''
    P.load(infile, outfile)

    correlation_in = P.snip(infile, ".tsv") + "_correlation.tsv"
    correlation_out = P.snip(outfile, "_stats.load") + "_correlation.load"
    P.load(correlation_in,
           correlation_out,
           options="--first-column=track")
예제 #47
0
def loadOverrun(infile, outfile):
    '''Load the overrun annotations.

    ``gene_id`` is indexed and mapped to ``str`` through the option
    string given to ``P.load``.
    '''
    load_options = "--index=gene_id --map=gene_id:str"
    P.load(infile, outfile, load_options)
예제 #48
0
def loadEdgeRStats(infile, outfile):
    '''Hand `infile` and `outfile` to ``P.load`` without extra options.

    Going by the function name the table holds EdgeR statistics.
    '''
    P.load(infile, outfile)
예제 #49
0
def loadFilteringSummary(infile, outfile):
    '''Load the filtering summary.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #50
0
def loadCuffdiffStats(infile, outfile):
    '''Import cuffdiff results.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)
예제 #51
0
def loadRepeats(infile, outfile):
    '''Load the repeat overlap table.

    ``gene_id`` is indexed and mapped to ``str`` through the option
    string given to ``P.load``.
    '''
    load_options = "--index=gene_id --map=gene_id:str"
    P.load(infile, outfile, load_options)
def loadDummyTask(infile, outfile):
    '''Load the results of word counting, indexed on ``word``.'''
    word_index = "--index=word"
    P.load(infile, outfile, word_index)
예제 #53
0
def loadContigStats(infile, outfile):
    '''
    Load the contig stats.

    `infile` and `outfile` are handed to ``P.load`` unchanged, without
    extra load options.
    '''
    P.load(infile, outfile)