def buildFinalLncRNAGeneSet(filteredLncRNAGeneSet, cpc_table, outfile,
                            filter_cpc=None):
    '''
    filter the lncRNA gene set based on the coding potential scores
    output by the CPC (Coding Potential Calculator)
    '''
    
    if filter_cpc:
           
        # get the transcripts that are designated as coding
        coding_set = set()
        dbh = sqlite3.connect("csvdb")
        cc = dbh.cursor()
        for transcript_id in cc.execute("SELECT transcript_id from %s WHERE CP_score > 1" % cpc_table):
            coding_set.add(transcript_id[0])

        remove = set()
        outf_coding = gzip.open("gtfs/cpc_removed.gtf.gz", "w")
        for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
            if gtf.transcript_id in coding_set:
                remove.add(gtf.gene_id)
                outf_coding.write("%s\n" % gtf)
        outf_coding.close()
    else:
        # create empty set
        remove = set()
    
    # get temporary file for built lncrna
    temp = P.getTempFile(dir=".")

    # get temporary file for known lncrna
    temp2 = P.getTempFile(dir=".")
        
    for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
        if gtf.gene_id in remove: continue
        if gtf.transcript_id.find("TCONS") != -1:
            # output known and buil transcripts separately
            temp.write("%s\n" % gtf)
        else:
            temp2.write("%s\n" % gtf)
    temp.close()
    temp2.close()

    filename = temp.name
    filename2 = temp2.name
    statement = '''cat %(filename)s | python %(scriptsdir)s/gtf2gtf.py --sort=gene |
                     python %(scriptsdir)s/gtf2gtf.py --renumber-genes=NONCO%%i
                    --log=%(outfile)s.log | python %(scriptsdir)s/gtf2gtf.py
                    --sort=gene --log=%(outfile)s.log > temp.gtf'''
    P.run()
    # recombine all transcripts with new ids
    statement = ('''cat %(filename2)s temp.gtf | python %(scriptsdir)s/gtf2gtf.py
                 --sort=contig+gene --log=%(outfile)s.log | gzip > %(outfile)s''')
    P.run()

    # clean up the temporary files
    for fn in (filename, filename2, "temp.gtf"):
        os.unlink(fn)
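# Illustrative sketch (not part of the pipeline): the two-pass CPC filter
# above removes a whole *gene* if any of its transcripts scores as coding
# (CP_score > 1). Toy data; call manually to verify.
def _demo_cpc_gene_filter():
    transcripts = [("g1", "t1", 2.5), ("g1", "t2", 0.1), ("g2", "t3", 0.4)]
    coding = set(t for g, t, score in transcripts if score > 1)
    remove = set(g for g, t, score in transcripts if t in coding)
    kept = [(g, t) for g, t, score in transcripts if g not in remove]
    assert kept == [("g2", "t3")]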
def buildIntervalsFasta(infile, outfile):
    '''
    build a fasta file from a bed file of intervals. Alternatively,
    if a gtf file is supplied, intervals are taken upstream /
    downstream of each TSS using the extensions specified in the
    .ini file.
    '''

    # define upstream and downstream extensions
    upstream = PARAMS["intervals_extension_upstream"]
    downstream = PARAMS["intervals_extension_downstream"]

    assert len(str(upstream)), ("extension_upstream cannot be of %s type."
                                "If no extension is to be used specify 0" %
                                type(upstream))
    assert len(str(downstream)), ("downstream extension cannot be of %s type."
                                  "If no extension is to be used specify 0" %
                                  type(downstream))

    # if input is gtf then convert to bed
    # with intervals defined by .ini file
    temp = P.getTempFile("/ifs/scratch")
    if infile.endswith(".gtf.gz"):
        # the resulting temporary file will not be zipped
        concatenate = "cat"
        for gene in GTF.merged_gene_iterator(
                GTF.iterator(IOTools.openFile(infile))):
            if gene.strand == "+":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                           (gene.contig, str(gene.start - upstream),
                            str(gene.start + downstream), gene.gene_id, ".",
                            gene.strand))
            elif gene.strand == "-":
                temp.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\n" %
                    (gene.contig, str(gene.end - downstream),
                     str(gene.end + upstream), gene.gene_id, ".", gene.strand))
        temp.close()
        inf = temp.name
    else:
        inf = infile
        concatenate = "zcat"

    to_cluster = True

    # define statement
    # option to specify strand in config file.
    statement = ("%(concatenate)s %(inf)s |"
                 " python %(scriptsdir)s/bed2fasta.py"
                 "  --genome=%(genomedir)s/%(genome)s")

    if PARAMS["intervals_stranded"]:
        statement += ("  --use-strand --log=%(outfile)s.log > %(outfile)s")
    else:
        statement += ("  --log=%(outfile)s.log > %(outfile)s")

    P.run()

    if infile.endswith(".gtf.gz"):
        os.remove(inf)
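# Illustrative sketch of the strand-aware TSS window arithmetic used above
# (self-checking demo with invented coordinates and extensions).
def _demo_tss_window():
    def window(start, end, strand, upstream, downstream):
        if strand == "+":
            # TSS at `start`: upstream extends left, downstream right
            return start - upstream, start + downstream
        # TSS at `end` on the reverse strand: upstream lies to the right
        return end - downstream, end + upstream

    assert window(1000, 5000, "+", 200, 50) == (800, 1050)
    assert window(1000, 5000, "-", 200, 50) == (4950, 5200)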
def buildAnnotatorDistanceAnnotations(annotations="expression"):
    '''build an annotations file for annotator_distance.'''

    tmpfile = P.getTempFile(".")
    tmpfilename = tmpfile.name

    if annotations == "expression":
        dbhandle = sqlite3.connect(PARAMS["database"])
        cc = dbhandle.cursor()

        statement = """
        SELECT gene_id,
        CASE WHEN %(annodist_master_expression_select)s THEN 'responsive' ELSE 'nonresponsive' END
        FROM probeset2transcript AS e,
        %(annodist_master_expression)s AS d 
        WHERE d.cluster_id = e.cluster_id
        """ % dict(locals().items() + PARAMS.items())

        data = cc.execute(statement).fetchall()
        tmpfile.write("gene_id\tlabel\n")
        for gene_id, label in data:
            tmpfile.write("%s\t%s\n" % (gene_id, label))
        tmpfile.close()

    return tmpfilename
def loadMatchResults(infile, outfile):
    '''
    load the results of the match analysis into sqlite database
    '''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(
            map(str, [
                details.seq_id, details.matrix_id, details.position,
                details.strand, details.core_score, details.matrix_score,
                details.sequence
            ])) + "\n")
    temp.close()

    to_cluster = True
    job_options = "-l mem_free=64G"

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --index=seq_id"
                 "  %(csv2db_options)s"
                 " < %(inf)s > %(outfile)s")
    P.run()
    os.unlink(temp.name)
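# Hedged sketch of the record interface assumed from
# PipelineTFM.match_iterator: any object with these seven attributes
# serializes to one row of the table above (the namedtuple and its values
# are illustrative, not the real parser output).
def _demo_match_row():
    import collections
    MatchResult = collections.namedtuple(
        "MatchResult",
        "seq_id matrix_id position strand core_score matrix_score sequence")
    row = MatchResult("chr1:100-200", "V$SP1_01", 42, "+", 0.95, 0.88, "GGGCGG")
    assert "\t".join(map(str, row)) == \
        "chr1:100-200\tV$SP1_01\t42\t+\t0.95\t0.88\tGGGCGG"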
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0] for x in cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                       FROM %(table)s
                       WHERE motif = '%(motif)s' AND start IS NOT NULL""" % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        # note: one bed file is written per motif; the `outfile` argument
        # is rebound here and effectively ignored
        outfile = os.path.join(PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, output a table of CpG content and length
    for the matching sequences
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py
                   -s cpg -s length
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
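# Hedged sketch of a per-sequence CpG statistic like the one reported by
# fasta2table.py with `-s cpg` (the observed/expected formula below is an
# assumption, not necessarily what the script computes).
def _demo_cpg_obs_exp():
    def obs_exp(seq):
        seq = seq.upper()
        n_c, n_g, n_cpg = seq.count("C"), seq.count("G"), seq.count("CG")
        if n_c * n_g == 0:
            return 0.0
        # obs/exp = #CpG * length / (#C * #G)
        return float(n_cpg) * len(seq) / (n_c * n_g)

    assert obs_exp("CGCGCG") == 2.0
    assert obs_exp("AATT") == 0.0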
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
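# Self-contained sketch of the merge pattern above: keep the header from the
# first file only and prepend a `track` column to every data row (toy input).
def _demo_merge_tracks():
    inputs = [("a", ["metric\tvalue\n", "reads\t10\n"]),
              ("b", ["metric\tvalue\n", "reads\t20\n"])]
    out, first = [], True
    for track, lines in inputs:
        if first:
            out.append("track\t" + lines[0])
        first = False
        for line in lines[1:]:
            out.append("%s\t%s" % (track, line))
    assert out == ["track\tmetric\tvalue\n", "a\treads\t10\n", "b\treads\t20\n"]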
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta 
    python %(scripstdir)s/fasta2variants.py --is-cds  
    | python %(scriptsdir)s/substitute_tokens.py 
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard GC stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                      %(csv2db_options)s
                      --index=track
                      --table=%(tablename)s 
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id, 
            transcript[0].gene_id, 
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
def loadMatchResults(infile, outfile):
    '''
    load the results of the match analysis into 
    sqlite database
    '''
    temp = P.getTempFile()
    temp.write(
        "seq_id\tmatrix_id\tposition\tstrand\tcore_score\tmatrix_score\tsequence\n"
    )
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(
            map(str, [
                details.seq_id, details.matrix_id, details.position,
                details.strand, details.core_score, details.matrix_score,
                details.sequence
            ])) + "\n")
    temp.close()

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   --index=seq_id
                   %(csv2db_options)s
                   < %(inf)s > %(outfile)s'''
    P.run()
    os.remove(inf)
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute( """DROP TABLE IF EXISTS %(table)s""" % locals() )

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
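# Toy version of the presence/absence matrix written above: one row per
# transcript, one 0/1 column per annotation table (tata, cpg).
def _demo_presence_matrix():
    transcripts = ["t1", "t2", "t3"]
    vals = [set(["t1", "t3"]),  # e.g. transcripts with a TATA box
            set(["t1"])]        # e.g. transcripts overlapping a CpG island
    rows = ["%s\t%s" % (t, "\t".join([str(int(t in v)) for v in vals]))
            for t in transcripts]
    assert rows == ["t1\t1\t1", "t2\t0\t0", "t3\t1\t0"]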
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    """import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    """

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.

    import MySQLdb

    dbhandle = MySQLdb.Connect(host=PARAMS["ucsc_host"], user=PARAMS["ucsc_user"])

    cc = dbhandle.cursor()
    cc.execute("USE %s " % ucsc_database)

    cc = dbhandle.cursor()
    cc.execute("SHOW TABLES LIKE '%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(".")

    for table in tables:
        E.info("loading repeats from %s" % table)
        cc = dbhandle.cursor()
        cc.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.', strand, '.', 
                      CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
               FROM %(table)s
               WHERE repClass in ('%(repclasses)s') """
            % locals()
        )
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    to_cluster = USECLUSTER

    statement = """cat %(tmpfilename)s
        | %(scriptsdir)s/gff_sort pos 
        | python %(scriptsdir)s/gff2gff.py 
            --sanitize=genome 
            --skip-missing 
            --genome-file=%(genome)s
            --log=%(outfile)s.log 
        | gzip
        > %(outfile)s
    """
    P.run()

    os.unlink(tmpfilename)
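# Sketch of the gff attribute string the escaped SQL CONCAT above produces
# for each repeat row (repClass/repFamily values invented for illustration).
def _demo_repeat_attributes():
    rep_class, rep_family = "LINE", "L1"
    attributes = 'class "%s"; family "%s";' % (rep_class, rep_family)
    assert attributes == 'class "LINE"; family "L1";'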
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first: outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header=%(column)s,%(header)s
                      --replace-header
                      --index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting 
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''

    to_cluster = True

    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    # IMS: connect is not in this module. dbh needs to be passed from caller
    #dbh = connect()
    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write("\t".join(
                map(str, (contig, start, end, "%s:%s" %
                          (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name
    statement = '''sort -k1,1 -k2,2n  < %(tmpfname)s | uniq | gzip > %(outfile_bed)s'''

    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.iteritems():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    tablename = outfile[:-len(".load")]
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches], outname)

        for match in motifs.matches:

            distance = abs(match.start + match.width1 -
                           (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end, strand, arrangement))
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
        --allow-empty \
        -b sqlite \
        --index=id \
        --index=motif \
        --index=id,motif \
        --table=%(tablename)s \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
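# Hedged sketch of the motif-pair arrangement call above: opposite strands
# are labelled ER, same strand DR, anything else SM with distance 0
# (ER/DR presumably everted/direct repeat; the naming is assumed).
def _demo_arrangement():
    def arrangement(strand_pair, distance):
        if strand_pair in ("+-", "-+"):
            return "ER%i" % distance
        elif strand_pair in ("++", "--"):
            return "DR%i" % distance
        return "SM%i" % 0

    assert arrangement("+-", 5) == "ER5"
    assert arrangement("++", 3) == "DR3"
    assert arrangement("+", 7) == "SM0"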
def computeOverlapGO(infile, outfile):
    '''compute overlap between coding markers and windows.
    Only markers of certain GO categories are counted.

    This is done by setting the gene_id and transcript_id of each marker
    to those of the ENSEMBL gene that it overlaps. This list is filtered
    first to keep only those ids with valid GO associations.
    '''

    to_cluster = False

    filter_goid = set(IOTools.readList(open(PARAMS["filename_gofilter"])))
    filter_genes = set()

    E.info("number of goids: %i" % len(filter_goid))

    for l in open(PARAMS["filename_go"]):
        f, id, goid, desc, evd = l[:-1].split("\t")[:5]
        if goid in filter_goid:
            filter_genes.add(id)

    tmpfile1 = P.getTempFile(dir=".")

    for line in open("ensembl.diff.genes_ovl"):

        a, b = line[:-1].split("\t")
        if b not in filter_genes: continue
        tmpfile1.write(line)

    E.info("number of genes taken: %i" % len(filter_genes))

    tmpfile1.close()
    tmpfilename1 = tmpfile1.name

    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=%(tmpfilename1)s \
    < %(infile)s > %(tmpfilename)s
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
def sharedIntervals(infile, outfile):
    '''identify shared intervals between datasets'''
    to_cluster = True
    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name
    statement = '''cat %(infile)s > %(outfile)s;'''
    P.run()
    in_track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    for track in TRACKS:
        if str(track) != in_track:
            statement = '''intersectBed -a %(outfile)s -b intervals/%(track)s.merged.cleaned.bed -u > %(tmpfilename)s; mv %(tmpfilename)s %(outfile)s; '''
            P.run()
def uniqueIntervals(infile, outfile):
    '''identify unique intervals for each dataset'''
    to_cluster = True
    tmpfile = P.getTempFile()
    tmpfilename = tmpfile.name
    statement = '''cat %(infile)s > %(outfile)s;'''
    P.run()
    in_track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    for track in TRACKS:
        if str(track) != in_track:
            statement = '''intersectBed -a %(outfile)s -b intervals/%(track)s.merged.cleaned.bed -v > %(tmpfilename)s; mv %(tmpfilename)s %(outfile)s'''
            P.run()
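# Toy illustration of the bedtools flags used above, via exact-interval sets
# (bedtools matches by overlap; set arithmetic stands in for the idea):
# `intersectBed -u` keeps entries of a present in b, `-v` keeps those absent.
def _demo_unique_shared():
    a = set([("chr1", 100, 200), ("chr1", 300, 400)])
    b = set([("chr1", 300, 400)])
    assert a - b == set([("chr1", 100, 200)])   # unique  (-v)
    assert a & b == set([("chr1", 300, 400)])   # shared  (-u)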
def buildCDSFasta(infile, outfile):
    '''load ENSEMBL cdna FASTA file

    *infile* is an ENSEMBL cdna file.
    '''

    dbname = outfile[:-len(".fasta")]
    # infile_peptides, infile_cdnas = infiles

    statement = '''gunzip < %(infile)s
    | python %(scriptsdir)s/gff2fasta.py
        --is-gtf
        --genome=%(genome_dir)s/%(genome)s
    | python %(scriptsdir)s/index_fasta.py
    %(dbname)s --force -
    > %(dbname)s.log
    '''
    P.run()
    return

    # NOTE: the code below is unreachable (dead code kept for reference);
    # it would build the CDS fasta from peptide and cDNA files instead.
    tmpfile = P.getTempFile(".")

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    tmpfile.write("protein_id\ttranscript_id\n")
    tmpfile.write("\n".join(
        ["%s\t%s" % x for x in
         cc.execute(
             "SELECT DISTINCT protein_id, transcript_id "
             "FROM transcript_info")]))
    tmpfile.write("\n")

    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/peptides2cds.py
           --peptides=%(infile_peptides)s
           --cdnas=%(infile_cdnas)s
           --map=%(tmpfilename)s
           --output-format=fasta
           --log=%(outfile)s.log
    | python %(scriptsdir)s/index_fasta.py
    %(dbname)s --force -
    > %(dbname)s.log
    '''

    P.run()
    os.unlink(tmpfilename)
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''
    boxplot the relative abundance of false positive
    species compared to true positives
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [
        P.toTable(x) for x in infiles[1:]
        if P.snip(os.path.basename(x), ".load") == track
    ][0]
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")
    estimate = {}
    true = set()
    for data in cc.execute(
            """SELECT taxon, rel_abundance FROM %s WHERE taxon_level == 'species'"""
            % tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s WHERE level == 'species'""" %
                           tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"

    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % inf)
    R('''library(ggplot2)''')
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) + geom_boxplot(colour = "%s") + geom_hline(yintercept=0, linetype="dashed")'''
      % col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
def loadReadCounts(infiles, outfile):
    '''load read counts into database.'''

    outf = P.getTempFile()
    outf.write("track\ttotal_reads\n")
    for infile in infiles:
        track = P.snip(infile, ".nreads")
        lines = IOTools.openFile(infile).readlines()
        nreads = int(lines[0][:-1].split("\t")[1])
        outf.write("%s\t%i\n" % (track, nreads))
    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
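# The .nreads files read above are assumed to hold a single tab-separated
# line such as "nreads<TAB>12345"; the parse below mirrors
# lines[0][:-1].split("\t")[1].
def _demo_parse_nreads():
    line = "nreads\t12345\n"
    assert int(line[:-1].split("\t")[1]) == 12345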
def buildIntervalsFasta(infile, outfile):
    '''
    build a fasta file from a bed file of intervals. Alternatively,
    if a gtf file is supplied, intervals are taken upstream /
    downstream of each TSS using the extensions specified in the
    .ini file.
    '''

    # define upstream and downstream extensions
    upstream = PARAMS["intervals_extension_upstream"]
    downstream = PARAMS["intervals_extension_downstream"]

    assert len(str(upstream)), """extension_upstream cannot be of %s type. 
                        If no extension is to be used specify 0""" % type(upstream)
    assert len(str(downstream)), """downstream extension cannot be of %s type. 
                        If no extension is to be used specify 0""" % type(downstream)

    # if input is gtf then convert to bed
    # with intervals defined by .ini file
    temp = P.getTempFile()
    if infile.endswith(".gtf.gz"):
        # the resulting temporary file will not be zipped
        concatenate = "cat"
        for gene in GTF.merged_gene_iterator(
                GTF.iterator(IOTools.openFile(infile))):
            if gene.strand == "+":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                           (gene.contig, str(gene.start - upstream),
                            str(gene.start + downstream), gene.gene_id, ".",
                            gene.strand))
            elif gene.strand == "-":
                temp.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
                           (gene.contig, str(gene.end - downstream),
                            str(gene.end + upstream), gene.gene_id, ".",
                            gene.strand))
        temp.close()
        inf = temp.name
    else:
        inf = infile
        concatenate = "zcat"

    # define statement
    # doesn't use strand information
    statement = '''%(concatenate)s %(inf)s | python %(scriptsdir)s/bed2fasta.py
                   --genome=%(genomedir)s/%(genome)s 
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
    
    if infile.endswith(".gtf.gz"):
        os.remove(inf)
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty")

    os.unlink(outf.name)
def loadMemeSummary(infiles, outfile):
    '''load summary information about meme motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
def importAssignments(infiles, outfile):
    '''import presence/absence assignments of genes per tag.'''

    genes = collections.defaultdict(set)
    tags = []
    for infile in infiles:
        tag = re.sub(".*HVC", "", infile[:-len(".csv")])
        reader = csv.DictReader(open(infile, "rU"))
        for row in reader:
            try:
                gene_id = row["ESTIMA_ID"]
            except KeyError:
                print "Parsing error in line %s" % str(row)
                raise
            if gene_id == "":
                continue
            # remove ".A", ".B" and ".M" suffixes
            if gene_id[-2] == ".":
                gene_id = gene_id[:-2]

            genes[gene_id].add(tag)

        tags.append(tag)

    outf = P.getTempFile()
    outf.write("gene_id\t%s\n" % "\t".join(tags))
    for gene, present in genes.iteritems():
        x = []
        for t in tags:
            if t in present:
                x.append("1")
            else:
                x.append("0")
        outf.write("%s\t%s\n" % (gene, "\t".join(x)))

    outf.close()

    tablename = outfile[:-len(".import")]
    tmpfilename = outf.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --index=gene_id \
              --table=%(tablename)s \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
def getNumReadsFromBAMFile(infile):
    '''count number of reads in bam file.'''
    # by-passes a problem with pysam, which was reading in stdout as the
    # first elements in list data
    tmpf = P.getTempFile(".")
    tmpfile_name = tmpf.name
    statement = '''samtools idxstats %(infile)s > %(tmpfile_name)s'''

    P.run()

    read_info = IOTools.openFile(tmpfile_name).readlines()
    os.unlink(tmpfile_name)

    try:
        # sum the mapped-read counts (third column of idxstats output)
        data = sum(map(int, [x.split("\t")[2]
                             for x in read_info if not x.startswith("#")]))
    except IndexError, msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, read_info))

    return data
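# Minimal sketch of summing `samtools idxstats` output as done above: one
# line per contig with columns name, length, mapped, unmapped (toy lines).
def _demo_sum_idxstats():
    read_info = ["chr1\t248956422\t1500\t10\n",
                 "chr2\t242193529\t900\t5\n",
                 "*\t0\t0\t30\n"]
    mapped = sum(map(int, [x.split("\t")[2]
                           for x in read_info if not x.startswith("#")]))
    assert mapped == 2400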
def extractEnsemblLincRNA(infile, outfile):
    '''extract transcripts annotated as lincRNA from an Ensembl gtf
    and write them gene-sorted and gzipped.'''
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --sort=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
def importFromIterator(
        outfile,
        tablename,
        iterator,
        columns=None,
        indices=None):
    '''import data in *iterator* into *tablename* via temporary file.'''

    tmpfile = P.getTempFile()

    if columns:
        keys, values = zip(*columns.items())
        tmpfile.write("\t".join(values) + "\n")

    for row in iterator:
        if not columns:
            keys = row[0].keys()
            values = keys
            columns = keys
            tmpfile.write("\t".join(values) + "\n")

        tmpfile.write("\t".join(str(row[x]) for x in keys) + "\n")

    tmpfile.close()

    if indices:
        indices = " ".join("--index=%s" % x for x in indices)
    else:
        indices = ""

    tmpfilename = tmpfile.name

    statement = '''
       python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                     --table=%(tablename)s
                     %(indices)s
        < %(tmpfilename)s > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''
    boxplot the relative abundance of false positive
    species compared to true positives
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [P.toTable(x) for x in infiles[1:] if P.snip(
        os.path.basename(x), ".load") == track][0]
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")
    estimate = {}
    true = set()
    for data in cc.execute("""SELECT taxon, rel_abundance FROM %s WHERE taxon_level == 'species'""" % tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s WHERE level == 'species'""" % tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"

    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' %
      inf)
    R('''library(ggplot2)''')
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) + geom_boxplot(colour = "%s") + geom_hline(yintersect=0, linetype="dashed")''' %
      col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
Beispiel #46
0
def importFromIterator(
        outfile,
        tablename,
        iterator,
        columns=None,
        indices=None):
    '''import data in *iterator* into *tablename* via temporary file.

    '''

    tmpfile = P.getTempFile(".")

    if columns:
        keys, values = zip(*columns.items())
        tmpfile.write("\t".join(values) + "\n")

    for row in iterator:
        if not columns:
            keys = row[0].keys()
            values = keys
            columns = keys
            tmpfile.write("\t".join(values) + "\n")

        tmpfile.write("\t".join(str(row[x]) for x in keys) + "\n")

    tmpfile.close()

    if indices:
        indices = " ".join("--index=%s" % x for x in indices)
    else:
        indices = ""

    tmpfilename = tmpfile.name

    statement = '''
       python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
                     --table=%(tablename)s
                     %(indices)s
        < %(tmpfilename)s > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(
                ":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
def loadReadCounts(infiles, outfile):
    '''load read counts into database.'''

    to_cluster = False
    outf = P.getTempFile()
    outf.write("track\ttotal_reads\n")
    for infile in infiles:
        track = P.snip(infile, ".nreads")
        lines = IOTools.openFile(infile).readlines()
        nreads = int(lines[0][:-1].split("\t")[1])
        outf.write("%s\t%i\n" % (track, nreads))
    outf.close()
    inname = outf.name

    tablename = P.toTable(outfile)
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s --log=%(outfile)s.log
                  < %(inname)s > %(outfile)s'''
    P.run()
    os.unlink(outf.name)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.toTable(outfile)

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.getTempFile(".")

    # parse the text file
    for line in IOTools.openFile(infile):
        if line.startswith("#Query"):
            tmpfile.write(
                "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n"
            )
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
def loadMissedReadCounts(infiles, outfile):
    """load summary table of numbers of missed reads."""

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write("track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track, mapped_genome, _getlines(junctions), _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset,
                           fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    python %(scriptsdir)s/annotator.py \
        --method=fdr-table \
        --fdr-method=%(fdr_method)s \
        --log=%(outfile)s.log \
        --regex-id="(.*)%(suffix)s" \
        %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
            --table=%(table)s \
    < %(tmpfilename2)s > %(outfile)s'''

    P.run()
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)