Example #1
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''

    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c) for x in cc
        ]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
            for x in cc
        ]))
        i += 1

    outf.close()
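For reference, the file written above is a five-column, tab-separated table in the style of a GO assignment file. A minimal sketch of its contents (space-aligned here for readability), with hypothetical gene identifiers:

go_type    gene_id            go_id       description   evidence
promotor   ENSG00000000001    GO:0000001  with_cpg      NA
promotor   ENSG00000000003    GO:0000002  without_cpg   NA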
Example #2
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)
    # cannot create views across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
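After string interpolation via locals(), the two statements sent to the database are plain SQL; with a placeholder table name:

DROP TABLE IF EXISTS <tablename>;

CREATE TABLE <tablename> AS
SELECT *
FROM bam_stats AS b;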
Example #3
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
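The merge loop above (keep the header of the first readable file, then prefix every data row with a track name derived from the file name) recurs in several of the examples below. A minimal, self-contained sketch of the pattern, with a hypothetical helper name and file names:

import os

def merge_with_track_column(infiles, outfile, suffix):
    '''merge per-track stats files, prepending a "track" column.'''
    first = True
    with open(outfile, "w") as outf:
        for f in infiles:
            if not os.path.exists(f):
                continue
            # track name = file name minus the trailing suffix
            track = os.path.basename(f)[:-len(suffix)]
            lines = [x for x in open(f)
                     if not x.startswith("#") and x.strip()]
            if not lines:
                continue
            if first:
                # keep the header line once, prefixed with "track"
                outf.write("track\t%s" % lines[0])
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))

# hypothetical usage:
# merge_with_track_column(["a.dedup.alignstats", "b.dedup.alignstats"],
#                         "merged.tsv", ".dedup.alignstats")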
Example #4
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts
    '''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
Example #5
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
Example #6
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [
            x for x in open(statfile, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()
Example #7
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_count"
    statement = '''cgat csv2db -t %(tablename)s --log=%(outfile)s.log < %(infile)s > %(outfile)s'''
    P.run()
Example #8
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    '''
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | cgat table2table --transpose
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
            """
    P.run()
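A hedged usage sketch: each input is a two-column table whose track name is recovered by stripping suffix from the file name (the file names here are hypothetical):

# hypothetical call: three gzipped two-column count tables, loaded
# into the table derived from the target "counts.load"
mergeAndLoad(["liver.counts.tsv.gz",
              "brain.counts.tsv.gz",
              "heart.counts.tsv.gz"],
             "counts.load",
             suffix=".counts.tsv.gz")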
Example #9
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('tophat/tophat.dir/dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.stats")
        statfile = f
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                    | python %(scriptsdir)s/csv2db.py
                          --add-index=track
                          --table=%(tablename)s 
                    > %(outfile)s '''
    P.run()
Example #10
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open("tophat/tophat.dir/picard_align_stats.tsv", "w")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
Example #11
def filterByCoverage(infiles, outfile):
    '''filter contigs in a fasta file by average per-contig coverage.'''

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Example #12
def buildDMRStats(infiles, outfile):
    '''compute differential methylation stats.'''
    tablenames = [P.toTable(x) for x in infiles]
    method = P.snip(outfile, "_stats.tsv")
    PipelineMedip.buildDMRStats(tablenames,
                                method,
                                outfile,
                                dbhandle=connect())
Example #13
def loadCoveredCpGs(infile, outfile):
    dbh = connect()
    tablename = P.toTable(outfile)

    statement = '''cat %(infile)s |
                cgat csv2db
                --table %(tablename)s --retry --ignore-empty
                 > %(outfile)s''' % locals()
    P.run()
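The shell pipe through cgat csv2db used here is the same load that later examples express through the P.load wrapper. A hedged sketch of the equivalent call, assuming P.load accepts csv2db flags through its options argument as in example #20:

# equivalent load via the P.load helper (assumed to pass the same
# csv2db flags through its options argument)
P.load(infile, outfile, options="--retry --ignore-empty")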
Example #14
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --missing-value=0
                   %(filenames)s
                | cgat csv2db
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Example #15
def calculateFalsePositiveRate(infiles, outfile):
    '''
    compute taxonomy false positive and true positive rates
    at several relative abundance cutoffs
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(
        os.path.basename([
            inf for inf in infiles[1:] if os.path.basename(inf)
            [len("metaphlan_"):] == os.path.basename(infiles[0])
        ][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute(
                    """SELECT taxa FROM %s WHERE level == '%s' AND relab > %f"""
                    % (tablename_true, level, float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute(
                    """SELECT taxon FROM %s WHERE taxon_level == '%s' AND rel_abundance > %f"""
                    % (tablename_estimate, level, float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
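The rates written out above reduce to set arithmetic: tp_rate is the fraction of true taxa recovered by the estimate, and fp_rate is the fraction of estimated taxa absent from the truth. A toy check with made-up taxa:

true_set = {"Bacteroides", "Prevotella", "Escherichia"}
estimate_set = {"Bacteroides", "Prevotella", "Clostridium"}

tp = true_set.intersection(estimate_set)  # {"Bacteroides", "Prevotella"}
fp = estimate_set.difference(true_set)    # {"Clostridium"}

tp_rate = float(len(tp)) / len(true_set)      # 2/3 ~ 0.67
fp_rate = float(len(fp)) / len(estimate_set)  # 1/3 ~ 0.33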
Example #16
def loadMergeCoverage(infile, outfile):
    '''load merged coverage data into the database.'''

    dbh = connect()
    tablename = P.toTable(outfile)
    job_options = "-l mem_free=23G"
    job_threads = 2

    statement = '''cat %(infile)s |
                cgat csv2db
                --table %(tablename)s --retry --ignore-empty
                 > %(outfile)s''' % locals()
    P.run()
Example #17
def importPresence(infile, outfile):
    '''import presence/absence data.'''

    tablename = P.toTable(outfile)

    statement = '''
    cgat csv2db %(csv2db_options)s
              --add-index=probeset
              --table=%(tablename)s
    < %(infile)s > %(outfile)s
    '''

    P.run()
Example #18
def importExpressionLevels(infiles, outfile):
    '''import expression level data.'''

    infile_data, infile_table = infiles

    tablename = P.toTable(outfile)

    statement = '''
    cgat csv2db %(csv2db_options)s
              --add-index=probeset
              --table=%(tablename)s
    < %(infile_table)s > %(outfile)s
    '''

    P.run()
Example #19
def combineM3Dsummaries(infiles, outfiles):
    ''' combine M3D summary tables'''
    outfile1, outfile2 = outfiles
    print(outfile1, outfile2)

    tablename = P.toTable(outfile2)

    statement = ''' cgat combine_tables -v0  -a file
                    --glob=M3D_plots.dir/*between_summary.tsv > %(outfile1)s;
                    cat %(outfile1)s |
                    cgat csv2db
                    --table %(tablename)s --retry --ignore-empty
                    > %(outfile2)s''' % locals()
    print(statement)
    P.run()
Example #20
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "rt").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effect.txt", outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "rt").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
Example #21
def loadPolyphen(infile, outfile):
    '''load polyphen results.'''

    load_statement = P.build_load_statement(P.toTable(outfile),
                                            options="--add-index=snp_id "
                                            "--add-index=protein_id "
                                            "--map=effect:str")

    statement = '''
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run()
Example #22
def loadReadCorrespondence(infiles, outfile):
    '''load read correspondence data into database.'''

    to_cluster = USECLUSTER

    infiles = " ".join(infiles)

    tablename = P.toTable(outfile)

    statement = '''
    cgat combine_tables
         %(infiles)s
                | cgat csv2db
                      --table=%(tablename)s
                > %(outfile)s
    '''

    P.run()
Example #23
def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(table)s """ % locals())
    cc.execute("""CREATE TABLE %(table)s AS
            SELECT gene_id, SUM( tata ) AS tata, SUM( cpg ) AS cpg 
                   FROM promotorinfo_transcripts AS p,
                        annotations.transcript_info as i
                   WHERE i.transcript_id = p.transcript_id
                   GROUP BY gene_id""" % locals())
    cc.close()

    P.touch(outfile)
Example #24
def loadGTF(infile, outfile):
    '''load gtf files.'''

    table = P.toTable(outfile)

    to_cluster = USECLUSTER

    statement = '''gunzip
    < %(infile)s
    | cgat gtf2tsv
    | cgat csv2db %(csv2db_options)s
    --add-index=gene_id
    --map=gene_id:str
    --add-index=transcript_id
    --map=transcript_id:str
    --table=%(table)s
    > %(outfile)s
    '''
    P.run()
Example #25
def mergeAnnotations(infiles, outfile):
    '''load variant annotations into single database table'''

    tablename = P.toTable(outfile)
    outf = open('anno.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".annotations.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "rt").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load('anno.txt', outfile)
Example #26
def loadPolyphen(infile, outfile):
    '''load polyphen results.

    The comment column is ignored.
    '''

    table = P.toTable(outfile)

    statement = '''gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g"
    | cut -f 1-55
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --add-index=snp_id 
              --add-index=protein_id
              --table=%(table)s 
              --map=effect:str
    > %(outfile)s
    '''
    P.run()
Example #27
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [
        x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").
        fetchall()
    ]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([
            x[0] for x in
            cc.execute("SELECT DISTINCT(transcript_id) FROM %(table)s" %
                       locals()).fetchall()
        ])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
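The temporary file handed to P.load is a simple 0/1 indicator matrix, one row per transcript; a sketch with hypothetical transcript identifiers:

transcript_id      tata  cpg
ENST00000000001    1     0
ENST00000000002    0     1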
Example #28
def loadBAMStats(infiles, outfile):
    '''Import bam statistics into SQLite'''

    scriptsdir = PARAMS["general_scriptsdir"]
    header = ",".join(
        [P.snip(os.path.basename(x), ".readstats") for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | perl -p -e "s/unique/unique_alignments/"
                | cgat table2table --transpose
                | cgat csv2db
                      --allow-empty-file
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s"""
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --header-names=%(header)s
                      --skip-titles
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/%(suffix)s/"
                | cgat csv2db
                      --table=%(tname)s 
                      --allow-empty-file
                >> %(outfile)s """
        P.run()
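For intuition about the combine-and-transpose pipeline: combine_tables joins the per-track files on their first column and names the value columns after the tracks, and the transpose step then turns tracks into rows. A sketch with two hypothetical readstats files:

# fileA.readstats: total 100, unique 90
# fileB.readstats: total 200, unique 150
#
# after combine_tables --header-names=fileA,fileB and the perl
# substitutions (bin -> track, unique -> unique_alignments):
#
#   track               fileA  fileB
#   total               100    200
#   unique_alignments   90     150
#
# after table2table --transpose:
#
#   track   total  unique_alignments
#   fileA   100    90
#   fileB   200    150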
Example #29
def loadMEDIPS(infile, outfile):
    '''load medips results'''

    table_prefix = re.sub("_prep", "", P.toTable(outfile))

    table = table_prefix + "_coveredpos"

    statement = """
    cat %(infile)s_saturation_coveredpos.csv
    | tail -n 3 
    | perl -p -e 's/\\"//g; s/[,;]/\\t/g; '
    | cgat table2table --transpose
    | cgat csv2db 
           %(csv2db_options)s
           --table=%(table)s
           --replace-header
           --header-names=coverage,ncovered,pcovered
    >> %(outfile)s
    """

    P.run()
Example #30
def loadBAMStats(infiles, outfile):
    '''import bam statistics.'''

    header = ",".join([P.tablequote(P.snip(x, ".readstats")) for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | perl -p -e "s/unique/unique_alignments/"
                | cgat table2table --transpose
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
            """
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --header-names=%(header)s
                      --skip-titles
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/%(suffix)s/"
                | cgat csv2db
                      --table=%(tname)s
                >> %(outfile)s
                """

        P.run()