def calculateFalsePositiveRate(infiles, outfile):
    '''compute true and false positive rates for taxonomic
    assignments at each taxonomic level.
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):] ==
         os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            # the true table stores relative abundance as a proportion,
            # the estimate table as a percentage, hence the /100 below
            for taxa in cc.execute("""SELECT taxa FROM %s
                                      WHERE level == '%s'
                                      AND relab > %f""" %
                                   (tablename_true, level,
                                    float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s
                                      WHERE taxon_level == '%s'
                                      AND rel_abundance > %f""" %
                                   (tablename_estimate, level,
                                    float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)
            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
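# A minimal, self-contained sketch (hypothetical data, not pipeline code) of
# the rate computation above: the true positive rate is the size of the
# intersection of the estimate with the truth set relative to the truth set,
# and the false positive rate is the fraction of estimated taxa absent from
# the truth set.
def _demo_rates():
    true_set = set(["a", "b", "c", "d"])
    estimate_set = set(["b", "c", "e"])
    tp = true_set.intersection(estimate_set)       # {"b", "c"}
    fp = estimate_set.difference(true_set)         # {"e"}
    tp_rate = float(len(tp)) / len(true_set)       # 2 / 4 = 0.5
    fp_rate = float(len(fp)) / len(estimate_set)   # 1 / 3 = 0.33...
    return tp_rate, fp_rate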
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''boxplot the relative abundance of false positive species
    compared to true positives.
    '''
    tablename_estimate = P.toTable(infiles[0])
    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [P.toTable(x) for x in infiles[1:]
                      if P.snip(os.path.basename(x), ".load") == track][0]
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")
    estimate = {}
    true = set()
    for data in cc.execute("""SELECT taxon, rel_abundance
                              FROM %s
                              WHERE taxon_level == 'species'""" %
                           tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s
                              WHERE level == 'species'""" %
                           tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"
    else:
        # fall back to a default colour to avoid a NameError
        # for unrecognised tracks
        col = "black"

    R('''dat <- read.csv("%s", header = T,
         stringsAsFactors = F, sep = "\t")''' % inf)
    R('''library(ggplot2)''')
    # the ggplot2 argument is "yintercept", not "yintersect"
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) +
         geom_boxplot(colour = "%s") +
         geom_hline(yintercept = 0, linetype = "dashed")''' % col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
def loadGeneInformation(infile, outfile, only_proteincoding=False):
    '''load gene information gleaned from the attributes
    in the gene set gtf file.

    *infile* is an ENSEMBL gtf file.
    '''
    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = ''' awk '$2 == "protein_coding"' '''
    else:
        filter_cmd = "cat"

    statement = '''
    gunzip < %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene+transcript
    | python %(scriptsdir)s/gtf2tsv.py --full --only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py
      --remove exon_id transcript_id transcript_name protein_id exon_number
    | %(scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
      --index=gene_id
      --index=gene_name
      --map=gene_name:str
      --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''load the table of exon counts and transcript lengths.'''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''loads the summary of reads contributing to transcripts.'''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''load the counts for the multi and single exon lincRNA.'''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s > %(outfile)s'''
    P.run()
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''
    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10pvalue" % ontology)
        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10qvalue" % ontology)
        P.load(fn, outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    # use a distinct loop variable so that the target table
    # name in *table* is not overwritten
    for subtable in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(subtable)s" %
            locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v))
                                            for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)

    os.unlink(tmpf.name)
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" % locals())
        outf.write(
            "".join(["promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c)
                     for x in cc]))
        i += 1
        cc.execute(
            "SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
            locals())
        outf.write(
            "".join(["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
                     for x in cc]))
        i += 1

    outf.close()
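# A tiny sketch of the synthetic GO identifiers written above: "%07i"
# zero-pads the running counter so the generated IDs mimic the shape of
# real GO accessions.
def _demo_go_id(i):
    return "GO:%07i" % i
# _demo_go_id(1) -> "GO:0000001"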
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''
    tablename = P.toTable(outfile)

    # can not create views across multiple database, so use table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    # note: despite the docstring, only bam_stats is currently
    # included in the statement below
    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise, as sketched below.
    '''
    header = ",".join([P.quote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                   --headers=%(header)s
                   --missing=0
                   --ignore-empty
                   %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | python %(scriptsdir)s/table2table.py --transpose
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s
                """
    P.run()
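# A sketch (with invented data, assuming the combine/transpose semantics of
# combine_tables.py and table2table.py) of what the shell pipeline above
# produces: each two-column input becomes one column of the merged table,
# and after transposition each input file ends up as one row keyed by track.
def _demo_merge_and_transpose():
    tables = {"trackA": {"bin1": 3, "bin2": 0},
              "trackB": {"bin1": 1, "bin2": 7}}
    bins = sorted(set(b for d in tables.values() for b in d))
    rows = [["track"] + bins]
    for track in sorted(tables):
        # --missing=0 fills bins absent from an input with 0
        rows.append([track] + [tables[track].get(b, 0) for b in bins])
    return ["\t".join(map(str, r)) for r in rows]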
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def loadLowerStringencyDeNovos(infile, outfile):
    '''Load lower stringency de novos into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadGeneListStats(infiles, outfile):
    '''Merge gene list stats into single table and load into SQLite.'''
    tablename = P.toTable(outfile)
    outf = open("genelist_stats.txt", "w")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".genelist.stats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def filterByCoverage(infiles, outfile):
    '''filter contigs by average coverage.'''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            # note the comma between contig_id and ave in the outer SELECT
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
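# A dependency-free sketch of the final filtering step above: keep only
# FASTA records whose identifier (first word of the header) is in the
# selected set. FastaIterator plays the role of this hypothetical parser
# in the pipeline.
def _demo_filter_fasta(lines, keep):
    out, writing = [], False
    for line in lines:
        if line.startswith(">"):
            identifier = line[1:].split(" ")[0].strip()
            writing = identifier in keep
        if writing:
            out.append(line)
    return out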
def loadCodingPotential(infile, outfile):
    '''load annotations'''
    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --allow-empty
          --index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' %
        locals())
    dbhandle.commit()
def loadProteinStats(infile, outfile):
    '''load protein statistics to database.

    The *infile* is an ENSEMBL peptide file.
    '''
    to_cluster = True

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/fasta2table.py
          --log=%(outfile)s
          --type=aa
          --section=length
          --section=hid
          --section=aa
          --regex-identifier="(\S+)"
    | sed "s/^id/protein_id/"
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --index=protein_id
          --map=protein_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadTranscriptStats(infile, outfile):
    '''load gene statistics to database.

    The *infile* is the *outfile* from :meth:`buildTranscripts`
    '''
    to_cluster = True

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | python %(scriptsdir)s/gtf2table.py
          --log=%(outfile)s.log
          --genome=%(genome_dir)s/%(genome)s
          --reporter=transcripts
          --counter=position
          --counter=length
          --counter=composition-na
    | python %(scriptsdir)s/csv2db.py
          %(csv2db_options)s
          --index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadRecs(infile, outfile):
    '''Load homozygous recessive disease candidates into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadTranscriptInformation(infile, outfile, only_proteincoding=False):
    '''load the transcript set.

    *infile* is an ENSEMBL gtf file.
    '''
    to_cluster = True

    table = P.toTable(outfile)

    if only_proteincoding:
        filter_cmd = ''' awk '$2 == "protein_coding"' '''
    else:
        filter_cmd = "cat"

    statement = '''gunzip < %(infile)s
    | %(filter_cmd)s
    | awk '$3 == "CDS"'
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gtf2tsv.py --full --only-attributes -v 0
    | python %(toolsdir)s/csv_cut.py --remove exon_id exon_number
    | %(scriptsdir)s/hsort 1 | uniq
    | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
          --index=transcript_id
          --index=gene_id
          --index=protein_id
          --index=gene_name
          --map=transcript_name:str
          --map=gene_name:str
          --table=%(table)s
    > %(outfile)s'''
    P.run()
def loadCoverageStats(infiles, outfile):
    '''Import coverage statistics into SQLite'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    outf = open('coverage.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   --ignore-empty
                   --retry
                   > %(outfile)s'''
    P.run()
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in cc.execute(
        "SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name
        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def loadCompoundHets(infile, outfile):
    '''Load compound heterozygous variants into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty --allow-empty
                   > %(outfile)s''' % locals()
    P.run()
def buildContigSummary(infiles, outfile):
    '''merge the contig summary statistics.'''
    stats = collections.defaultdict(list)
    for filepath in infiles:
        dirname = os.path.dirname(filepath)
        stats[dirname].append(os.path.basename(filepath))

    N = PARAMS["scaffold_n"]

    # connect to database
    dbh = connect()
    cc = dbh.cursor()
    for dirname in stats.keys():
        outfname = os.path.join(dirname, "contig.summary.tsv")
        outf = open(outfname, "w")
        outf.write("track\tnscaffolds\tscaffold_length\tN%i\t"
                   "mean_length\tmedian_length\tmax_length\n" % N)
        for infile in stats[dirname]:
            track = P.snip(
                infile.split(dirname.split(".dir")[0])[1][1:],
                ".summary.load")
            table = P.toTable(infile)
            # note: the query always reads the N50 column, regardless of
            # the value of scaffold_n used in the header
            data = cc.execute("""SELECT nscaffolds, scaffold_length, N50,
                                 mean_length, median_length, max_length
                                 FROM %s""" % table).fetchone()
            outf.write("\t".join([track] + [str(x) for x in data]) + "\n")
        outf.close()
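# The summary above reports an N50 column. As a point of reference, this is
# a standalone sketch of the standard N50 computation: the length of the
# contig at which the cumulative length, summing from longest to shortest,
# first reaches half of the total assembly length.
def _demo_n50(lengths):
    total = sum(lengths)
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running * 2 >= total:
            return length
# _demo_n50([10, 5, 3, 2]) -> 10 (10 covers half of the total of 20)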
def loadDeNovos(infile, outfile):
    '''load de novo variants into the database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   sed 's/#CHROM/CHROM/g;s/EFF\[\*\]/EFF/g;s/GEN\[0\]/GEN/g' |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats"):
    '''extract a histogram from a picard output file and load
    it into database.'''

    tablename = P.toTable(outfile)
    tname = "%s_%s" % (tablename, suffix)

    tname = P.snip(tname, "_metrics") + "_histogram"

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tname)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    statement = """python %(scriptsdir)s/combine_tables.py
                   --regex-start="## HISTOGRAM"
                   --missing=0
                   --take=2
                   %(filenames)s
                   | python %(scriptsdir)s/csv2db.py
                   --header=%(column)s,%(header)s
                   --replace-header
                   --index=track
                   --table=%(tname)s
                   >> %(outfile)s
                """
    P.run()
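# A sketch of how loadPicardHistogram derives its target table name; the
# P.snip behaviour (removing a known suffix) is assumed and re-implemented
# here with a plain endswith check for illustration.
def _demo_histogram_table_name(tablename, suffix):
    tname = "%s_%s" % (tablename, suffix)
    # strip a trailing "_metrics" before appending "_histogram"
    if tname.endswith("_metrics"):
        tname = tname[:-len("_metrics")]
    return tname + "_histogram"
# _demo_histogram_table_name("picard_stats", "insert_size_metrics")
# -> "picard_stats_insert_size_histogram"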
def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadEdgeRResults(infile, outfile):
    '''load EdgeR results into database.'''
    tableName = P.toTable(outfile)
    # use the %(infile)s/%(outfile)s placeholders rather than the
    # literal words "infile" and "outfile"
    statement = '''python %(scriptsdir)s/csv2db.py
                   --table=%(tableName)s
                   --index=id
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def loadSnpeffAnnotation(infile, outfile):
    '''Load snpeff annotations into database'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                   --index=track
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                       --missing=0
                       %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                       --header=%(column)s,%(header)s
                       --replace-header
                       --index=track
                       --table=%(tname)s
                       >> %(outfile)s
                    """
        P.run()

    os.unlink(tmpfilename)
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # load the file that was written above ("effects.txt", not "effect.txt")
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       %(csv2db_options)s
                       --index=transcript_id
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        statement = '''cat %(tmpfilename)s
                       | python %(scriptsdir)s/csv2db.py
                           %(csv2db_options)s
                           --allow-empty
                           --index=transcript_id
                           --table=%(tablename)s_%(suffix)s
                           --ignore-column=seq_na
                           --ignore-column=seq_aa
                       >> %(outfile)s'''
        P.run()
def loadNCG(infile, outfile):
    '''Load NCG into database'''
    dbh = connect()
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadMutectFilteringSummary(infile, outfile):
    '''Load mutect extended output into database'''
    dbh = connect()
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   --table %(tablename)s --retry --ignore-empty
                   > %(outfile)s''' % locals()
    P.run()
def loadROI2Gene(infile, outfile):
    '''Import genes mapping to regions of interest bed file into SQLite.'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --ignore-empty
                   --retry
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadSamples(infile, outfile):
    '''Import sample information into SQLite.'''
    scriptsdir = PARAMS["general_scriptsdir"]
    tablename = P.toTable(outfile)
    statement = '''cat %(infile)s
                   | python %(scriptsdir)s/csv2db.py
                   %(csv2db_options)s
                   --ignore-empty
                   --retry
                   --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def loadCoverageStats(infile, outfile):
    '''load coverage stats.'''
    tablename = P.toTable(
        P.snip(os.path.dirname(infile), ".dir") + "_%s" %
        os.path.basename(outfile))
    statement = '''zcat %(infile)s |
                   python %(scriptsdir)s/csv2db.py
                   -t %(tablename)s
                   --index=contig
                   --log=%(outfile)s.log
                   > %(outfile)s'''
    P.run()