Code example #1
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    '''
    header = ",".join([P.quote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/" 
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
            """
    P.run()
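
A minimal usage sketch, assuming a CGAT/ruffus pipeline context in which P.run() can pick up the statement local and %(scriptsdir)s is configured; the task name, file glob and the ".counts.tsv.gz" suffix are hypothetical:

import glob
from ruffus import merge

# hypothetical task: merge per-track count tables into one database table
@merge(glob.glob("*.counts.tsv.gz"), "counts.load")
def loadCounts(infiles, outfile):
    mergeAndLoad(infiles, outfile, suffix=".counts.tsv.gz")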
Code example #2
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''collect interval properties for computing correlations between sets.

    For each interval in *reference*, the value of *field* is looked up
    in every set and written as one column per track.
    '''

    dbhandle = sqlite3.connect(PARAMS["database"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed")
        tablename = "%s_intervals" % P.quote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=open(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                val = ""
            else:
                val = str(max([x[2] for x in intervals]))
            row.append(val)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
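
A minimal usage sketch; the BED file names are hypothetical, and *field* must name a column of each <track>_intervals table:

# hypothetical call: tabulate peakval for two interval sets against a
# reference BED file, one column per track
makeIntervalCorrelation(
    ["run1.bed", "run2.bed"],
    "correlation.tsv",
    field="peakval",
    reference="reference.bed")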
Code example #3
def loadBAMStats(infiles, outfile):
    '''import bam statistics.'''

    header = ",".join([P.quote(P.snip(x, ".readstats")) for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | perl -p -e "s/unique/unique_alignments/"
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
            """
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --skip-titles
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/%(suffix)s/"
                | python %(scriptsdir)s/csv2db.py
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()
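
A minimal usage sketch, assuming each BAM has a <track>.readstats summary plus companion <track>.readstats.nm and <track>.readstats.nh tables, which the loop over suffixes expects; the task name and glob are hypothetical:

import glob
from ruffus import merge

# hypothetical task: load read statistics for all BAM files at once
@merge(glob.glob("*.readstats"), "bam_stats.load")
def loadStats(infiles, outfile):
    loadBAMStats(infiles, outfile)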
Code example #4
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''

    to_cluster = USECLUSTER

    headers = ",".join([P.quote(P.snip(x, ".accepted.bam")) for x in infiles])
    infiles = " ".join(["%s.log" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = '''
    python %(scriptsdir)s/combine_tables.py 
         --header-names=%(headers)s
         %(infiles)s
    | python %(scriptsdir)s/table2table.py --transpose
    | perl -p -e "s/bin/track/"
    | python %(scriptsdir)s/csv2db.py
         --table=%(tablename)s 
    > %(outfile)s
    '''

    P.run()
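
A minimal usage sketch; it assumes each <track>.accepted.bam has a companion <track>.accepted.bam.log holding the validation table, since the function rewrites *infiles* to the ".log" names before loading:

# hypothetical call with two tracks
loadTranscriptomeValidation(
    ["sample1.accepted.bam", "sample2.accepted.bam"],
    "transcriptome_validation.load")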
Code example #5
def exportIntervalsAsBed(infile, outfile):
    '''export macs peaks as bed files.
    '''

    dbhandle = sqlite3.connect(PARAMS["database"])

    if outfile.endswith(".gz"):
        compress = True
        track = P.snip(outfile, ".bed.gz")
    else:
        compress = False
        track = P.snip(outfile, ".bed")

    tablename = "%s_intervals" % P.quote(track)

    cc = dbhandle.cursor()
    statement = "SELECT contig, start, end, interval_id, peakval FROM %s ORDER by contig, start" % tablename
    cc.execute(statement)

    outs = IOTools.openFile("%s.bed" % track, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()

    if compress:
        E.info("compressing and indexing %s" % outfile)
        to_cluster = True
        statement = 'bgzip -f %(track)s.bed; tabix -f -p bed %(outfile)s'
        P.run()
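
A minimal usage sketch; note that *infile* is not used by the body - the track name is derived from *outfile*. It assumes a sample1_intervals table exists in PARAMS["database"]; the names are hypothetical:

# hypothetical call: writes sample1.bed, then bgzips and
# tabix-indexes it into sample1.bed.gz
exportIntervalsAsBed("sample1.macs", "sample1.bed.gz")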
Code example #6
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, the sequences are shuffled before being
    written out.

    Shuffling is performed on the unmasked sequences; masking is
    applied afterwards.

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    so as not to create jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by *order*).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    cc = dbhandle.cursor()

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "unknown order parameter '%s', check your ini file" % order)

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter 
                       FROM %(tablename)s 
                       ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    E.info("writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (
        track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter in data])
            data = new_data
        else:
            raise ValueError("unknown value for shift: %s" % shift)

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth, interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            E.info("writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" %
                   (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        nseq += 1
        if maxsize and current_size >= maxsize:
            E.info("writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" %
                   (track, maxsize, nseq, len(data) - nseq))
            break

    data = new_data

    if shuffled:
        # shuffling is done on the unmasked sequences - otherwise N's
        # from masking would be interspersed with real sequence,
        # unfairly confounding motif finding. Masking is applied to
        # the shuffled sequences below.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = ["".join(x) for x in sequences]

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    for m in masker:
        if m not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, m)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        name = "%s_%s %s:%i-%i" % (track, str(interval_id), contig, start, end)
        outs.write(">%s\n%s\n" % (name, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
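
A minimal usage sketch; the track and file names are hypothetical, and it assumes a sample1_intervals table with peakval and peakcenter columns in the pipeline database:

# hypothetical call: top 10% of intervals (at least 100), written as
# 100 bp windows around each peak centre and dust-masked
dbhandle = sqlite3.connect(PARAMS["database"])
nseq = writeSequencesForIntervals(
    "sample1", "sample1.fasta", dbhandle,
    halfwidth=50, proportion=0.1, min_sequences=100,
    masker=["dustmasker"])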