Example #1
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

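    # collect the stats of every track into one temporary tab-separated file, keeping a single header line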
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
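Most of the examples on this page follow the same temporary-file lifecycle: write rows to a file obtained from P.getTempFile(), load it into the database, then unlink it. A minimal sketch of that pattern, assuming the same P and os modules the examples already import; load_rows and the rows iterable of (track, value) pairs are illustrative only:

def load_rows(rows, outfile):
    # sketch only: the P.getTempFile() -> load -> unlink lifecycle used throughout
    outf = P.getTempFile()
    outf.write("track\tvalue\n")  # single header row
    for track, value in rows:
        outf.write("%s\t%s\n" % (track, value))
    outf.close()  # flush the file before loading it
    P.load(outf.name, outfile, options="--add-index=track")
    os.unlink(outf.name)  # remove the temporary file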
Example #2
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfilename = tmpfile.name

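    # extract CDS sequences, enumerate variants and remap identifiers using the temporary id table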
    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py 
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Example #3
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, write a CpG content file for the matching sequences
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

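    # write the fasta records whose identifiers appear in the interval set to a temporary file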
    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Example #4
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from the UCSC database.

    The repeats are stored as a :term:`gff` formatted file.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(host=PARAMS["ucsc_host"],
                                          user=PARAMS["ucsc_user"],
                                          database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

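    # dump repeats from every rmsk table into one temporary file in gff format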
    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.',
            strand, '.',
            CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
            FROM %(table)s
            WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Example #5
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

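    # concatenate the per-track Picard summaries, writing the header line only once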
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --missing-value=0
                   %(filenames)s
                | cgat csv2db
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Example #6
def loadMatchResults(infile, outfile):
    '''
    load the results of the match analysis into sqlite database
    '''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(
            map(str, [
                details.seq_id, details.matrix_id, details.position,
                details.strand, details.core_score, details.matrix_score,
                details.sequence
            ])) + "\n")
    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")
    os.unlink(temp.name)
Example #7
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " cgat gtf2gtf"
                 "  --method=sort --sort-order=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
Example #8
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # SQLite cannot do a full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [
        x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").
        fetchall()
    ]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([
            x[0] for x in
            cc.execute("SELECT DISTINCT(transcript_id) FROM %(table)s" %
                       locals()).fetchall()
        ])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
Example #9
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
Example #10
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''
    tablename = P.toTable(outfile)

    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = IOTools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    inf = gzip.open(infile_data, "rt")

    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"

        tmpf.write(line)

    tmpf.close()
    tmpname = tmpf.name

    header = map_header["ID_REF"]
    statement = '''
    cgat csv2db %(csv2db_options)s
              --add-index=%(header)s
              --table=%(tablename)s
    < %(tmpname)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpname)
Example #11
def loadMissedReadCounts(infiles, outfile):
    '''load summary table of numbers of missed reads.'''

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track,
                                            mapped_genome,
                                            _getlines(junctions),
                                            _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
Example #12
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    # Load into database
    P.load(outf.name,
           outfile,
           options="--add-index=track")
    os.unlink(outf.name)
Example #13
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results into *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_macs_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS 
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s;
                           mv %(filename_pdf)s %(folder)s/%(filename_pdf)s'''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

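    # open the bam file; reads are offset by half the peak shift when counting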
    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")
    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with IOTools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track
    statement = '''cgat csv2db %(csv2db_options)s 
                       --allow-empty-file
                       --add-index=interval_id 
                       --add-index=contig,start
                       --table=%(tablename)s 
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s 
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" 
        | cgat csv2db %(csv2db_options)s 
                  --map=fc:str 
                  --table=%(tablename)s 
        >> %(outfile)s
        '''
        P.run()
Example #14
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter", "PeakVal", "Position",
               "interval_id", "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist, length, peakcenter,
     peakval, position, start, interval_id, ncpgs, ngenes, npeaks,
     nprobes, npromoters) = (
         0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # open the supplied bam files once; reads are counted per interval below
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles] if bamfiles else []

    # count tags
    for bed in Bed.iterator(IOTools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end,
                    samfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region.  However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff.  do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1
                continue

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start +
                (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals")

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example #15
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results into *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step 
    removing all intervals in which there is a peak of
    more than readlength / 2 height in the control.

    .. note::

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

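    # tolerate missing or empty peak files - an empty summary and table are still written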
    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak higher than %i reads" %
                   control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                     peak.refined_start,
                                                                                     peak.refined_end,
                                                                                     controlfiles,
                                                                                     offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                 peak.refined_start,
                                                                                 peak.refined_end,
                                                                                 samfiles,
                                                                                 offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig, peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
Example #16
def buildJunctionsDB(infiles, outfile):
    '''build a database of all junctions.'''

    to_cluster = USECLUSTER
    outfile_junctions = outfile + ".junctions.bed.gz"
    min_anchor_length = 3
    read_length = 50
    infiles = (infiles, )

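    # pool junction coordinates from the per-sample junction files into one temporary file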
    tmpfile = P.getTempFile(".")

    for infile in infiles:
        if infile.endswith(".bam"):
            junctions_file = P.snip(infile, ".bam") + ".junctions.bed.gz"
            columns = (0, 1, 2, 5)
        else:
            junctions_file = infile
            columns = (0, 1, 2, 3)

        if not os.path.exists(junctions_file):
            E.warn("can't find junctions file '%s'" % junctions_file)
            continue

        inf = IOTools.openFile(junctions_file)
        for line in inf:
            if line.startswith("#"):
                continue
            if line.startswith("track"):
                continue
            data = line[:-1].split("\t")
            try:
                tmpfile.write("\t".join([data[x] for x in columns]) + "\n")
            except IndexError:
                raise IndexError("parsing error in line %s" % line)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    sort %(tmpfilename)s | gzip > %(outfile_junctions)s
    '''

    P.run()

    os.unlink(tmpfilename)

    E.info("building junctions database")
    statement = '''
    juncs_db %(min_anchor_length)i %(read_length)i
              <( zcat %(outfile_junctions)s )
              /dev/null /dev/null
              %(bowtie_genome_dir)s/%(genome)s.fa
              > %(outfile)s
              2> %(outfile)s.log
    '''
    P.run()

    E.info("indexing junctions database")

    prefix = P.snip(outfile, ".fa")

    # build raw index
    statement = '''
    bowtie-build -f %(outfile)s %(prefix)s >> %(outfile)s.log 2>&1
    '''

    P.run()

    # build color space index
    statement = '''
    bowtie-build -C -f %(outfile)s %(prefix)s_cs >> %(outfile)s.log 2>&1
    '''

    P.run()
Example #17
def plotRelativeAbundanceCorrelations(infiles, outfile):
    '''
    plot the correlation between the estimated relative abundances
    and the true relative abundances at each taxonomic level - done
    on the shared set
    '''

    # connect to database
    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()

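    # the first infile holds the true relative abundances, the remainder hold estimates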
    true_file = infiles[0]
    temp = P.getTempFile()
    temp.write("true\testimate\tlevel\n")
    for estimate_file in infiles[1:]:
        if os.path.basename(estimate_file)[
                len("metaphlan_"):] == os.path.basename(true_file):
            tablenames = [
                P.toTable(os.path.basename(true_file)),
                P.toTable(os.path.basename(estimate_file))
            ]
            # get data for each taxonomic level
            for taxa in [
                    "phylum", "class", "order", "family", "genus", "species"
            ]:
                statement = """SELECT a.relab, b.rel_abundance, a.level
                           FROM %s as a, %s as b
                           WHERE b.taxon_level == "%s"
                           AND a.taxa == b.taxon""" % (tablenames[0],
                                                       tablenames[1], taxa)
                for data in cc.execute(statement).fetchall():
                    true, estimate, level = data[0], data[1], data[2]

                    temp.write("%f\t%f\t%s\n" % (true, estimate, level))
    temp.close()

    inf = temp.name
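    # read the merged true/estimate table into R and plot estimates against truth per taxonomic level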
    R('''data <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % inf)
    R('''library(ggplot2)''')
    R('''data$estimate <- data$estimate/100''')
    R('''ggplot(data, aes(true, estimate, colour = level)) + geom_point() + geom_smooth(method = "lm")'''
      )
    R('''ggsave("%s")''' % outfile)

    out_cors = P.snip(outfile, ".pdf") + ".cors"
    R('''cors <- data.frame("level" = c("phylum", "class", "order", "family", "genus", "species"), "cor" = rep(0, 6))'''
      )
    R('''cors[1,2] <- cor(data$true[data$level == "phylum"], data$estimate[data$level == "phylum"])'''
      )
    R('''cors[2,2] <- cor(data$true[data$level == "class"], data$estimate[data$level == "class"])'''
      )
    R('''cors[3,2] <- cor(data$true[data$level == "order"], data$estimate[data$level == "order"])'''
      )
    R('''cors[4,2] <- cor(data$true[data$level == "family"], data$estimate[data$level == "family"])'''
      )
    R('''cors[5,2] <- cor(data$true[data$level == "genus"], data$estimate[data$level == "genus"])'''
      )
    R('''cors[6,2] <- cor(data$true[data$level == "species"], data$estimate[data$level == "species"])'''
      )
    R('''write.table(cors, file = "%s", row.names = F, sep = "\t")''' %
      out_cors)

    # do the same at the low end - not for species
    # (data$estimate has already been rescaled above)
    R('''ggplot(data[data$true < 0.75 & data$level != "species",], aes(true, estimate, colour = level)) + geom_point() + geom_smooth(method = "lm", se = F)'''
      )
    outf = P.snip(outfile, ".pdf") + ".lowest.pdf"
    R('''ggsave("%s")''' % outf)

    out_cors = P.snip(outfile, ".pdf") + ".lowest.cors"
    R('''cors <- data.frame("level" = c("phylum", "class", "order", "family", "genus", "species"), "cor" = rep(0, 6))'''
      )
    R('''cors[1,2] <- cor(data$true[data$level == "phylum" & data$true < 0.75], data$estimate[data$level == "phylum" & data$true < 0.75])'''
      )
    R('''cors[2,2] <- cor(data$true[data$level == "class" & data$true < 0.75], data$estimate[data$level == "class" & data$true < 0.75])'''
      )
    R('''cors[3,2] <- cor(data$true[data$level == "order" & data$true < 0.75], data$estimate[data$level == "order" & data$true < 0.75])'''
      )
    R('''cors[4,2] <- cor(data$true[data$level == "family" & data$true < 0.75], data$estimate[data$level == "family" & data$true < 0.75])'''
      )
    R('''cors[5,2] <- cor(data$true[data$level == "genus" & data$true < 0.75], data$estimate[data$level == "genus" & data$true < 0.75])'''
      )
    R('''write.table(cors, file = "%s", row.names = F, sep = "\t")''' %
      out_cors)

    os.unlink(inf)