Python Pipeline.load 예제들, CGAT.Pipeline.load Python 예제들

예제 #1

0

파일 보기

def loadDistances(infile, outfile):
    """Load annotations.

    Hands ``infile`` to ``P.load`` with an option string naming the
    ``gene_id`` and ``closest_id`` columns (``--index`` and ``--map ...:str``
    for each).

    The previously computed but never used local ``table`` was removed.

    :param infile: table file passed to ``P.load``.
    :param outfile: load target passed to ``P.load``.
    """
    options = " ".join((
        "--index=gene_id",
        "--map=gene_id:str",
        "--index=closest_id",
        "--map=closest_id:str",
    ))
    P.load(infile, outfile, options)

예제 #2

0

파일 보기

파일: pipeline_metagenomebenchmark.py 프로젝트: yangjl/cgat

def loadFilteredContigLengths(infile, outfile):
    """Load contig lengths.

    The target handed to ``P.load`` is assembled from the directory name of
    ``infile`` (passed through ``P.snip(..., ".dir")``) and its base name
    (passed through ``P.snip(..., ".tsv")``), joined by ``_`` and suffixed
    with ``.load``.  ``outfile`` is only touched afterwards.
    """
    directory_part = P.snip(os.path.dirname(infile), ".dir")
    file_part = P.snip(os.path.basename(infile), ".tsv")
    load_target = directory_part + "_" + file_part + ".load"

    P.load(infile, load_target)
    P.touch(outfile)

예제 #3

0

파일 보기

def loadTranscriptSummary(infile, outfile):
    """Summarize binding information per transcript.

    Writes one row per transcript id found in ``annotations.transcript_info``
    with a 0/1 column for each of the ``tata`` and ``cpg`` tables, telling
    whether the transcript id occurs in that table, and passes the resulting
    temporary file to ``P.load``.

    :param infile: not read by this function.
    :param outfile: load target; also used to derive the table that is
        dropped up front via ``P.toTable``.

    Changes against the previous version: the loop variable no longer
    shadows the target table name, the SQL is formatted explicitly instead
    of via ``locals()``, and the temporary file is closed and removed even
    when a step fails.
    """
    dbh = connect()
    tablename = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("DROP TABLE IF EXISTS %s" % (tablename,))

    transcripts = [row[0] for row in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
    ).fetchall()]

    feature_tables = ("tata", "cpg")

    # one set of transcript ids per feature table, in column order
    members = []
    for feature_table in feature_tables:
        rows = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %s" % (feature_table,)
        ).fetchall()
        members.append({row[0] for row in rows})

    tmpf = P.getTempFile()
    try:
        tmpf.write("transcript_id\t%s\n" % "\t".join(feature_tables))

        for transcript_id in transcripts:
            flags = [str(int(transcript_id in ids)) for ids in members]
            tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(flags)))

        # the file must be flushed to disk before it is loaded
        tmpf.close()
        P.load(tmpf.name, outfile)
    finally:
        tmpf.close()
        os.unlink(tmpf.name)

예제 #4

0

파일 보기

파일: pipeline_genomeassembly.py 프로젝트: BioinformaticsArchive/cgat

def loadContigSummary(infile, outfile):
    """Load contig summary stats for each assembler.

    The load target is ``<dir>_<basename>.load`` where ``<dir>`` is the
    directory of ``infile`` passed through ``P.snip(..., ".dir")``.
    ``outfile`` is touched once the load call has returned.
    """
    directory_part = P.snip(os.path.dirname(infile), ".dir")
    load_target = directory_part + "_" + os.path.basename(infile) + ".load"

    P.load(infile, load_target)
    P.touch(outfile)

예제 #5

0

파일 보기

def loadGeneTables(infile, outfile):
    """Load genes from the metagenemark analysis.

    The previous version tested whether ``"gff"`` occurs in ``infile`` but
    issued the very same ``P.load`` call in both branches, so the test had
    no effect and was dropped.

    NOTE(review): if gff input was meant to be loaded with different
    options, that distinction was never implemented -- confirm.

    :param infile: table file passed to ``P.load``.
    :param outfile: load target passed to ``P.load``.
    """
    P.load(infile, outfile)

예제 #6

0

파일 보기

파일: pipeline_benchmark_rnaseqmappers.py 프로젝트: nishantthakur/cgat

def loadExonValidation(infiles, outfile):
    """Merge alignment stats into single tables.

    All ``infiles`` are handed to ``mergeAndLoad``; afterwards, for every
    input file, the companion ``<infile>.overrun.gz`` is passed to
    ``P.load`` with the target ``<track>_overrun.load``.

    The redundant chained assignment ``suffix = suffix = ...`` was reduced
    to a single assignment.

    :param infiles: iterable of file names ending in
        ``.exon.validation.tsv.gz``.
    :param outfile: target passed to ``mergeAndLoad``.
    """
    suffix = ".exon.validation.tsv.gz"
    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        track = P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", "%s_overrun.load" % track)

예제 #7

0

파일 보기

def loadContigLengths(infile, outfile):
    """Load contig lengths.

    ``P.load`` receives a target named ``<dir>_<file>.load``; ``<dir>`` and
    ``<file>`` are the directory and base name of ``infile`` after
    ``P.snip`` with ``.dir`` and ``.tsv`` respectively.  ``outfile`` is
    touched at the end.
    """
    dir_stem = P.snip(os.path.dirname(infile), ".dir")
    file_stem = P.snip(os.path.basename(infile), ".tsv")
    outname = dir_stem + "_" + file_stem + ".load"

    P.load(infile, outname)
    P.touch(outfile)

예제 #8

0

파일 보기

def loadContigSummary(infile, outfile):
    """Load contig summary stats for each assembler.

    Builds the load target from the directory of ``infile`` (after
    ``P.snip(..., ".dir")``), an underscore, the unmodified base name of
    ``infile`` and the suffix ``.load``; then touches ``outfile``.
    """
    assembler_dir = P.snip(os.path.dirname(infile), ".dir")
    summary_name = os.path.basename(infile)
    target = assembler_dir + "_" + summary_name + ".load"

    P.load(infile, target)
    P.touch(outfile)

예제 #9

0

파일 보기

파일: pipeline_benchmark_rnaseqmappers.py 프로젝트: lesheng/cgat

def loadExonValidation(infiles, outfile):
    """Merge alignment stats into single tables.

    ``mergeAndLoad`` is called once for all ``infiles``.  Then each
    ``<infile>.overrun.gz`` is passed to ``P.load`` individually, using
    ``<track>_overrun.load`` as target, where ``<track>`` is the input name
    with the validation suffix snipped off.

    Fix: the duplicated ``suffix = suffix = ...`` assignment is now a plain
    single assignment.

    :param infiles: iterable of ``*.exon.validation.tsv.gz`` file names.
    :param outfile: target passed to ``mergeAndLoad``.
    """
    suffix = ".exon.validation.tsv.gz"
    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        overrun_target = "%s_overrun.load" % P.snip(infile, suffix)
        P.load(infile + ".overrun.gz", overrun_target)

예제 #10

0

파일 보기

def loadContigGCContent(infile, outfile):
    """Load contig GC content.

    The target passed to ``P.load`` is ``<dir>_<file>.load`` (directory and
    base name of ``infile`` after ``P.snip`` with ``.dir`` / ``.tsv``); the
    option string ``--index=id`` is passed along.  ``outfile`` is touched
    afterwards.
    """
    dir_stem = P.snip(os.path.dirname(infile), ".dir")
    file_stem = P.snip(os.path.basename(infile), ".tsv")
    target = dir_stem + "_" + file_stem + ".load"

    P.load(infile, target, options="--index=id")
    P.touch(outfile)

예제 #11

0

파일 보기

def loadContigLengths(infile, outfile):
    """Load contig lengths.

    Passes ``infile`` to ``P.load`` with the option string
    ``--index=scaffold_name`` and a target named ``<dir>_<file>.load``
    derived from the path of ``infile``; finally touches ``outfile``.
    """
    parent = P.snip(os.path.dirname(infile), ".dir")
    stem = P.snip(os.path.basename(infile), ".tsv")
    target = parent + "_" + stem + ".load"

    P.load(infile, target, options="--index=scaffold_name")
    P.touch(outfile)

예제 #12

0

파일 보기

파일: pipeline_genesets.py 프로젝트: Charlie-George/cgat

def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.

    Steps, in order:

    1. ``infile`` is loaded into the table ``hypergeometric_<track>_summary``.
    2. The distinct ``ontology`` and ``genelist`` values are read back from
       that table.
    3. For each of the sections ``results``, ``parameters`` and
       ``withgenes`` a shell statement is built and ``P.run()`` is called.
    4. For each ontology the ``l2fold``, ``l10pvalue`` and ``l10qvalue``
       files below ``<infile>.dir`` are loaded; an ontology is skipped with
       a warning when its ``l2fold`` file does not exist.

    Fix: the regular expression inside the shell statement used the invalid
    string escape ``\\S``; the backslash is now escaped explicitly.  The text
    of the resulting statement is unchanged.

    :param infile: summary table; ``<infile>.dir`` is searched for the
        per-ontology files.
    :param outfile: load target, also used to derive ``track``.
    '''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    # NOTE(review): ``genelists`` is not referenced again in this function;
    # the query is kept because it still runs against the summary table.
    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        # NOTE(review): P.run() takes no arguments here, so it presumably
        # picks up ``statement`` and the %(...)s values from this frame --
        # do not rename these locals without confirming.
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        # only the l2fold file is checked; a missing one skips the ontology
        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')

예제 #13

0

파일 보기

파일: pipeline_windows.py 프로젝트: pombredanne/cgat

def loadEdgeR(infile, outfile):
    """Load EdgeR per-chunk summary stats.

    Every file matching ``<infile>*_summary.tsv`` is passed to ``P.load``
    with ``collapse=0`` and ``transpose="sample"``.  The load target is
    named after the part of the file name that follows ``infile`` plus one
    separator character, with ``_summary.tsv`` snipped off.  ``outfile`` is
    touched at the end.

    The initial ``prefix = P.snip(outfile, ".load")`` was removed: its value
    was overwritten inside the loop before ever being read.

    NOTE(review): the targets are suffixed ``.deseq_summary.load`` although
    this is the EdgeR loader -- confirm this naming is intended.

    :param infile: common prefix of the summary files.
    :param outfile: sentinel file touched after all loads.
    """
    for fn in glob.glob(infile + "*_summary.tsv"):
        prefix = P.snip(fn[len(infile) + 1:], "_summary.tsv")

        P.load(fn,
               prefix + ".deseq_summary.load",
               collapse=0,
               transpose="sample")

    P.touch(outfile)

예제 #14

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: BioinformaticsArchive/cgat

def loadCufflinks(infile, outfile):
    """Load expression level measurements.

    Two companion files of ``infile`` are passed to ``P.load``:

    * ``<infile>.genes_tracking.gz`` with target ``<track>_genefpkm.load``
    * ``<infile>.fpkm_tracking.gz`` with target ``<track>_fpkm.load``

    where ``<track>`` is ``outfile`` with ``.load`` snipped off.  ``outfile``
    itself is touched at the end.

    ``track`` used to be computed twice from the same unchanged ``outfile``;
    it is now computed once.

    :param infile: common prefix of the two tracking files.
    :param outfile: sentinel file; also determines the target names.
    """
    track = P.snip(outfile, ".load")

    gene_options = " ".join((
        "--index=gene_id",
        "--ignore-column=tracking_id",
        "--ignore-column=class_code",
        "--ignore-column=nearest_ref_id",
    ))
    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options=gene_options)

    transcript_options = " ".join((
        "--index=tracking_id",
        "--ignore-column=nearest_ref_id",
        "--rename-column=tracking_id:transcript_id",
    ))
    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options=transcript_options)

    P.touch(outfile)

예제 #15

0

파일 보기

def loadReadCounts(infiles, outfile):
    """Load read counts into database.

    For every ``*.nreads`` input the second tab-separated field of its first
    line is taken as the read count.  A two-column table
    (``track``, ``total_reads``) is written to a temporary file and handed
    to ``P.load``.

    Changes against the previous version:

    * the count is no longer cut with ``[:-1]``, which dropped the last
      digit when the first line had no trailing newline;
    * only the first line is read and the input handle is closed;
    * the temporary file is closed and removed even if a step fails.

    :param infiles: iterable of ``*.nreads`` file names.
    :param outfile: load target passed to ``P.load``.
    :raises IndexError: if the first line has fewer than two fields.
    :raises ValueError: if the second field is not an integer.
    """
    outf = P.getTempFile()
    try:
        outf.write("track\ttotal_reads\n")
        for infile in infiles:
            track = P.snip(infile, ".nreads")

            inf = IOTools.openFile(infile)
            try:
                first_line = inf.readline()
            finally:
                inf.close()

            nreads = int(first_line.rstrip("\n").split("\t")[1])
            outf.write("%s\t%i\n" % (track, nreads))

        # flush to disk before the file is loaded by name
        outf.close()
        P.load(outf.name, outfile)
    finally:
        outf.close()
        os.unlink(outf.name)

예제 #16

0

파일 보기

파일: pipeline_genomeassembly.py 프로젝트: BioinformaticsArchive/cgat

def loadReadCounts(infiles, outfile):
    """Load read counts into database.

    Collects, per input file, the track name (file name without ``.nreads``)
    and the integer in the second tab-separated column of the first line,
    writes them as a ``track`` / ``total_reads`` table to a temporary file
    and passes that file to ``P.load``.

    Fixes:

    * ``lines[0][:-1]`` chopped a digit off the count when the line did not
      end in a newline; the newline is now stripped explicitly;
    * the input file is closed after its first line has been read instead
      of being read completely and left open;
    * the temporary file is cleaned up on failure as well as on success.

    :param infiles: iterable of ``*.nreads`` file names.
    :param outfile: load target passed to ``P.load``.
    :raises IndexError: if the first line has fewer than two fields.
    :raises ValueError: if the second field is not an integer.
    """
    outf = P.getTempFile()
    try:
        outf.write("track\ttotal_reads\n")

        for infile in infiles:
            track = P.snip(infile, ".nreads")

            handle = IOTools.openFile(infile)
            try:
                header = handle.readline()
            finally:
                handle.close()

            fields = header.rstrip("\n").split("\t")
            outf.write("%s\t%i\n" % (track, int(fields[1])))

        # the table has to be on disk before P.load is given its name
        outf.close()
        P.load(outf.name, outfile)
    finally:
        outf.close()
        os.unlink(outf.name)

예제 #17

0

파일 보기

def loadMotifInformation(infiles, outfile):
    """Load information about motifs into database.

    Writes a single-column table (header ``motif``) with one row per input
    file for which ``IOTools.isEmpty`` is false; each row is the file name
    with ``.motif`` snipped off.  The table is passed to ``P.load`` with
    ``--allow-empty``.

    The temporary file (created in the current directory) is now closed and
    removed even when writing or loading fails.

    :param infiles: iterable of ``*.motif`` file names.
    :param outfile: load target passed to ``P.load``.
    """
    outf = P.getTempFile(".")
    try:
        outf.write("motif\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            motif = P.snip(infile, ".motif")
            outf.write("%s\n" % motif)

        # flush to disk before the file is loaded by name
        outf.close()
        P.load(outf.name, outfile, "--allow-empty")
    finally:
        outf.close()
        os.unlink(outf.name)

예제 #18

0

파일 보기

def loadMemeSummary(infiles, outfile):
    """Load information about motifs into database.

    Writes a single-column table (header ``track``) listing every input file
    for which ``IOTools.isEmpty`` is false, with ``.meme`` snipped off the
    name, and passes the table to ``P.load``.

    The temporary file (created in the current directory) is now closed and
    removed even when writing or loading fails.

    :param infiles: iterable of ``*.meme`` file names.
    :param outfile: load target passed to ``P.load``.
    """
    outf = P.getTempFile(".")
    try:
        outf.write("track\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            outf.write("%s\n" % P.snip(infile, ".meme"))

        # flush to disk before the file is loaded by name
        outf.close()
        P.load(outf.name, outfile)
    finally:
        outf.close()
        os.unlink(outf.name)

예제 #19

0

파일 보기

def loadCufflinks(infile, outfile):
    """Load expression level measurements.

    Passes ``<infile>.genes_tracking.gz`` and ``<infile>.fpkm_tracking.gz``
    to ``P.load`` with the targets ``<track>_genefpkm.load`` and
    ``<track>_fpkm.load``; ``<track>`` is ``outfile`` with ``.load`` snipped
    off.  ``outfile`` is touched once both calls have returned.

    ``track`` was previously derived twice from the unchanged ``outfile``;
    the second, redundant computation was removed.

    :param infile: common prefix of the two tracking files.
    :param outfile: sentinel file; also determines the target names.
    """
    track = P.snip(outfile, ".load")

    P.load(infile + ".genes_tracking.gz",
           outfile=track + "_genefpkm.load",
           options="--index=gene_id "
           "--ignore-column=tracking_id "
           "--ignore-column=class_code "
           "--ignore-column=nearest_ref_id")

    P.load(infile + ".fpkm_tracking.gz",
           outfile=track + "_fpkm.load",
           options="--index=tracking_id "
           "--ignore-column=nearest_ref_id "
           "--rename-column=tracking_id:transcript_id")

    P.touch(outfile)

예제 #20

0

파일 보기

def loadMemeChipSummary(infiles, outfile):
    """Load information about motifs into database.

    For each input file for which ``IOTools.isEmpty`` is false, the base
    name without ``.memechip`` is split on ``.`` into exactly four parts
    (``track``, ``npeaks``, ``width``, ``masking``).  These, together with
    the un-split name as ``path``, form one row of the table passed to
    ``P.load``.

    Changes: the temporary file (created in the current directory) is
    closed and removed even on failure, and the no-op ``map(str, ...)`` over
    values that are already strings was dropped.

    :param infiles: iterable of ``*.memechip`` file names.
    :param outfile: load target passed to ``P.load``.
    :raises ValueError: if a base name does not split into four parts.
    """
    outf = P.getTempFile(".")
    try:
        outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            fn = P.snip(os.path.basename(infile), ".memechip")

            track, npeaks, width, masking = fn.split(".")
            outf.write("\t".join((track, npeaks, width, masking, fn)) + "\n")

        # flush to disk before the file is loaded by name
        outf.close()
        P.load(outf.name, outfile)
    finally:
        outf.close()
        os.unlink(outf.name)

예제 #21

0

파일 보기

파일: pipeline_motifs.py 프로젝트: santayana/cgat

def loadTomTom(infile, outfile):
    """Load tomtom results.

    Looks for ``tomtom.xml`` below ``<exportdir>/tomtom/<infile>``.  When it
    is missing, a warning is emitted, ``outfile`` is touched and nothing is
    loaded.  Otherwise the ``name`` -> ``alt`` mapping of the ``motif``
    elements below ``targets`` is read from the XML, every data line of
    ``infile`` is prefixed with the ``alt`` name of its second column, and
    the result is passed to ``P.load``.

    Changes against the previous version:

    * ``Element.getiterator`` (removed in Python 3.9) replaced by
      ``Element.iter``;
    * the input handle is closed and the temporary file is closed and
      removed even on failure;
    * the line is stripped of its newline explicitly instead of cutting the
      last character unconditionally;
    * the unused local ``tablename`` was removed.

    :param infile: tomtom text output; also the name of the result
        directory below ``<exportdir>/tomtom``.
    :param outfile: load target passed to ``P.load``.
    :raises KeyError: if a target id in ``infile`` is not in the XML file.
    """
    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.iter("motif"):
        name2alt[motif.get("name")] = motif.get("alt")

    tmpfile = P.getTempFile(".")
    try:
        # parse the text file
        inf = IOTools.openFile(infile)
        try:
            for line in inf:
                if line.startswith("#Query"):
                    tmpfile.write(
                        "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n"
                    )
                    continue
                data = line.rstrip("\n").split("\t")
                target_name = name2alt[data[1]]
                # ``line`` still carries its own newline
                tmpfile.write("%s\t%s" % (target_name, line))
        finally:
            inf.close()

        # flush to disk before the file is loaded by name
        tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)

예제 #22

0

파일 보기

파일: pipeline_windows.py 프로젝트: pombredanne/cgat

def loadDESeq(infile, outfile):
    """Load DESeq per-chunk summary stats.

    If ``<infile>_size_factors.tsv`` exists it is passed to ``P.load`` with
    ``collapse=True`` and the target ``<prefix>_deseq_size_factors.load``
    (``<prefix>`` being ``outfile`` without ``.load``).  Every file matching
    ``<infile>*_summary.tsv`` is then passed to ``P.load`` with
    ``collapse=0``.  Both kinds of call use ``transpose="sample"``.
    ``outfile`` is touched at the end.
    """
    summary_suffix = "_summary.tsv"

    prefix = P.snip(outfile, ".load")

    size_factors = infile + "_size_factors.tsv"
    if os.path.exists(size_factors):
        P.load(size_factors,
               prefix + "_deseq_size_factors.load",
               collapse=True,
               transpose="sample")

    # characters to skip: the common prefix plus one separator character
    skip = len(infile) + 1
    for summary_file in glob.glob(infile + "*" + summary_suffix):
        chunk = P.snip(summary_file[skip:], summary_suffix)
        P.load(summary_file,
               chunk + ".deseq_summary.load",
               collapse=0,
               transpose="sample")

    P.touch(outfile)

예제 #23

0

파일 보기

파일: pipeline_benchmark_rnaseqmappers.py 프로젝트: nishantthakur/cgat

def loadMissedReadCounts(infiles, outfile):
    """Load summary table of numbers of missed reads.

    ``infiles`` is sorted and consumed in consecutive pairs
    (junctions file, transcriptome file).  For each pair one row is
    written: the track (junctions file name without
    ``.missed_junctions.gz``) and the line counts minus one of
    ``<track>.mapped_reads.gz``, the junctions file and the transcriptome
    file.  The table is passed to ``P.load``.

    Changes: line counting streams the file and closes it instead of
    reading all lines into memory and leaving the handle open; the
    temporary file is closed and removed even on failure.

    :param infiles: even-length collection of file names.
    :param outfile: load target passed to ``P.load``.
    :raises IndexError: if ``infiles`` has an odd number of entries.
    """

    def _getlines(inf):
        # number of lines minus one
        handle = IOTools.openFile(inf)
        try:
            return sum(1 for _ in handle) - 1
        finally:
            handle.close()

    infiles = sorted(infiles)

    tmpfile = P.getTempFile()
    try:
        tmpfile.write(
            "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

        for x in range(0, len(infiles), 2):
            junctions, transcriptome = infiles[x], infiles[x + 1]
            track = P.snip(junctions, ".missed_junctions.gz")
            mapped_genome = _getlines(track + ".mapped_reads.gz")
            tmpfile.write("%s\t%i\t%i\t%i\n" % (
                track, mapped_genome,
                _getlines(junctions), _getlines(transcriptome)))

        # flush to disk before the file is loaded by name
        tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)

예제 #24

0

파일 보기

파일: pipeline_motifs.py 프로젝트: jmadzo/cgat

def loadTomTom(infile, outfile):
    """Load tomtom results.

    Expects ``tomtom.xml`` in ``<exportdir>/tomtom/<infile>``; without it a
    warning is logged, ``outfile`` is touched and the function returns.
    With it, each data line of ``infile`` gets the ``alt`` attribute of the
    matching ``motif`` element (looked up by the line's second column)
    prepended, the ``#Query`` header line is replaced by a fixed column
    header, and the rewritten table is passed to ``P.load``.

    Fixes:

    * ``getiterator`` no longer exists on ``Element`` since Python 3.9;
      ``iter`` is used instead;
    * the newline is stripped explicitly rather than by dropping the last
      character of every line;
    * the input handle and the temporary file are released on every path;
    * the unused local ``tablename`` was removed.

    :param infile: tomtom text output; also names the result directory.
    :param outfile: load target passed to ``P.load``.
    :raises KeyError: if a target id in ``infile`` is not in the XML file.
    """
    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.iter("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.getTempFile(".")
    try:
        source = IOTools.openFile(infile)
        try:
            # parse the text file
            for line in source:
                if line.startswith("#Query"):
                    tmpfile.write(
                        "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n")
                    continue
                columns = line.rstrip("\n").split("\t")
                # ``line`` is written back unchanged, newline included
                tmpfile.write("%s\t%s" % (name2alt[columns[1]], line))
        finally:
            source.close()

        # the table has to be on disk before P.load is given its name
        tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)

예제 #25

0

파일 보기

파일: pipeline_benchmark_rnaseqmappers.py 프로젝트: yangjl/cgat

def loadMissedReadCounts(infiles, outfile):
    """Load summary table of numbers of missed reads.

    After sorting, ``infiles`` is read two entries at a time as
    (junctions, transcriptome).  Each pair yields one table row holding the
    track name and three counts -- lines minus one of
    ``<track>.mapped_reads.gz``, of the junctions file and of the
    transcriptome file.  The table is handed to ``P.load``.

    Fixes: ``_getlines`` used to read every line into a list and never
    closed the file; it now counts while iterating and closes the handle.
    The temporary file is also cleaned up when a step raises.

    :param infiles: even-length collection of file names.
    :param outfile: load target passed to ``P.load``.
    :raises IndexError: if ``infiles`` has an odd number of entries.
    """
    def _getlines(inf):
        # number of lines minus one
        handle = IOTools.openFile(inf)
        try:
            nlines = 0
            for _ in handle:
                nlines += 1
        finally:
            handle.close()
        return nlines - 1

    infiles = sorted(infiles)

    tmpfile = P.getTempFile()
    try:
        tmpfile.write(
            "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

        for x in range(0, len(infiles), 2):
            junctions, transcriptome = infiles[x], infiles[x + 1]
            track = P.snip(junctions, ".missed_junctions.gz")
            mapped_genome = _getlines(track + ".mapped_reads.gz")
            tmpfile.write("%s\t%i\t%i\t%i\n" %
                          (track, mapped_genome, _getlines(junctions),
                           _getlines(transcriptome)))

        # the table has to be on disk before P.load is given its name
        tmpfile.close()
        P.load(tmpfile.name, outfile)
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)

예제 #26

0

파일 보기

def loadIdbaStats(infile, outfile):
    """Load the idba stats.

    Thin wrapper: forwards ``infile`` and ``outfile`` to ``P.load`` without
    extra options.
    """
    P.load(infile, outfile=outfile)

예제 #27

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadGeneLevelReadCounts(infile, outfile):
    """Pass ``infile`` to ``P.load`` with the option ``--index=gene_id``."""
    index_option = "--index=gene_id"
    P.load(infile, outfile, options=index_option)

예제 #28

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadCufflinksFPKM(infile, outfile):
    """Load fpkm data into table.

    Forwards to ``P.load`` with the option string
    ``--index=gene_id --index=transcript_id``.
    """
    index_options = " ".join(("--index=gene_id", "--index=transcript_id"))
    P.load(infile, outfile, options=index_options)

예제 #29

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadDESeqStats(infile, outfile):
    """Forward ``infile`` and ``outfile`` to ``P.load`` without options."""
    P.load(infile, outfile=outfile)

예제 #30

0

파일 보기

def loadBlastOnAminoAcidSequences(infile, outfile):
    """Load blastp results.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #31

0

파일 보기

파일: pipeline_metagenomebenchmark.py 프로젝트: yangjl/cgat

def loadChimericityScores(infile, outfile):
    """Load the chimericity scores.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #32

0

파일 보기

파일: pipeline_metagenomebenchmark.py 프로젝트: yangjl/cgat

def loadFilteredContigStats(infile, outfile):
    """Load the filtered contig stats.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #33

0

파일 보기

def loadTATABox(infile, outfile):
    """Load TATA box information.

    The file actually passed to ``P.load`` is ``<infile>.table.gz``; the
    option ``--index=transcript_id`` is passed along.
    """
    table_file = infile + ".table.gz"
    P.load(table_file, outfile, options="--index=transcript_id")

예제 #34

0

파일 보기

def loadDummyTask(infile, outfile):
    """Load results of word counting into database.

    Forwards to ``P.load`` with the option ``--index=word``.
    """
    P.load(infile, outfile, options="--index=word")

예제 #35

0

파일 보기

def loadAlignmentStats(infile, outfile):
    """Load bam2stats results.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #36

0

파일 보기

def loadMetavelvetRawStats(infile, outfile):
    """Load the assembly stats for meta-velvet.

    The file passed to ``P.load`` is not ``infile`` itself but the name
    obtained by replacing its ``.contigs.fa`` suffix with ``.stats.txt``.
    """
    stats_file = P.snip(infile, ".contigs.fa") + ".stats.txt"
    P.load(stats_file, outfile=outfile)

예제 #37

0

파일 보기

def loadMetavelvetStats(infile, outfile):
    """Load the metavelvet stats.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #38

0

파일 보기

def loadCpGIslands(infile, outfile):
    """Load CpG Islands information.

    Forwards to ``P.load`` with the option ``--index=transcript_id``.
    """
    index_option = "--index=transcript_id"
    P.load(infile, outfile, options=index_option)

예제 #39

0

파일 보기

파일: pipeline_fastqToBigWig.py 프로젝트: Charlie-George/cgat

def loadMACSsoloSummary(infile, outfile):
    """Load macs summary.

    Forwards to ``P.load`` with the option ``--index=track``.
    """
    P.load(infile, outfile, options="--index=track")

예제 #40

0

파일 보기

파일: pipeline_metagenomebenchmark.py 프로젝트: yangjl/cgat

def loadExpectedAndObservedGenomeCoverage(infile, outfile):
    """Load the combined table for observed and expected genome coverage.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #41

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadFeatureCounts(infile, outfile):
    """Load individual feature counts into database.

    Forwards to ``P.load`` with the option ``--index=gene_id``.
    """
    P.load(infile, outfile, options="--index=gene_id")

예제 #42

0

파일 보기

파일: pipeline_readqc.py 프로젝트: lesheng/cgat

def loadFastqcSummary(infile, outfile):
    """Pass ``infile`` to ``P.load`` with the option ``--index=track``."""
    index_option = "--index=track"
    P.load(infile, outfile, options=index_option)

예제 #43

0

파일 보기

def loadEssentialGeneAssignments(infile, outfile):
    """Load assignments of essential genes.

    Forwards to ``P.load`` with the option ``--index=contig``.
    """
    P.load(infile, outfile, options="--index=contig")

예제 #44

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadTranscriptLevelReadCounts(infile, outfile):
    """Pass ``infile`` to ``P.load`` with ``--index=transcript_id``."""
    index_option = "--index=transcript_id"
    P.load(infile, outfile, options=index_option)

예제 #45

0

파일 보기

def loadMACSsoloSummary(infile, outfile):
    """Load macs summary.

    Thin wrapper: calls ``P.load`` with the option string ``--index=track``.
    """
    track_index = "--index=track"
    P.load(infile, outfile, options=track_index)

예제 #46

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadTagCountSummary(infile, outfile):
    """Load windows summary.

    Two ``P.load`` calls are made: first ``infile`` -> ``outfile``, then the
    companion ``<infile without .tsv>_correlation.tsv`` ->
    ``<outfile without _stats.load>_correlation.load`` with the option
    ``--first-column=track``.
    """
    P.load(infile, outfile)

    correlation_table = P.snip(infile, ".tsv") + "_correlation.tsv"
    correlation_target = P.snip(outfile, "_stats.load") + "_correlation.load"
    P.load(correlation_table,
           correlation_target,
           options="--first-column=track")

예제 #47

0

파일 보기

def loadOverrun(infile, outfile):
    """Load annotations.

    Forwards to ``P.load`` with the option string
    ``--index=gene_id --map=gene_id:str``.
    """
    gene_id_options = " ".join(("--index=gene_id", "--map=gene_id:str"))
    P.load(infile, outfile, options=gene_id_options)

예제 #48

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadEdgeRStats(infile, outfile):
    """Forward ``infile`` and ``outfile`` to ``P.load`` without options."""
    P.load(infile, outfile=outfile)

예제 #49

0

파일 보기

def loadFilteringSummary(infile, outfile):
    """Load filtering summary.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #50

0

파일 보기

파일: pipeline_rnaseqdiffexpression.py 프로젝트: jmadzo/cgat

def loadCuffdiffStats(infile, outfile):
    """Import cuffdiff results.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)

예제 #51

0

파일 보기

def loadRepeats(infile, outfile):
    """Load repeat overlap.

    Forwards to ``P.load`` with the option string
    ``--index=gene_id --map=gene_id:str``.
    """
    P.load(infile, outfile, options="--index=gene_id --map=gene_id:str")

예제 #52

0

파일 보기

파일: pipeline_template.py 프로젝트: BioinformaticsArchive/cgat

def loadDummyTask(infile, outfile):
    """Load results of word counting into database.

    Thin wrapper: calls ``P.load`` with the option string ``--index=word``.
    """
    word_index = "--index=word"
    P.load(infile, outfile, options=word_index)

예제 #53

0

파일 보기

def loadContigStats(infile, outfile):
    """Load the contig stats.

    Thin wrapper around ``P.load``; no options are passed.
    """
    P.load(infile, outfile=outfile)