Exemple #1
0
def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):
    '''Assign each probe to the single digest fragment it overlaps.

    Every interval in `probe_bed` is used to query `digest_bed` for
    overlapping fragments.  When exactly one fragment overlaps, a bed
    line with the fragment's coordinates and the probe's name is
    written to `outfile` and a probe/fragment name pair is written to
    `lookup_out`.  Probes overlapping zero or several fragments are
    skipped with a warning.

    Arguments
    ---------
    probe_bed : string
        Filename of the bed file containing the probes.
    digest_bed : string
        Filename of the fragment bed file; opened with
        pysam.TabixFile.
    outfile : string
        Filename of the bed output (one line per accepted probe).
    lookup_out : string
        Filename of the tab-separated probe-to-fragment table.
    '''

    digest_fragments = pysam.TabixFile(digest_bed)
    # a single Bed object is re-used for every output line
    bed = Bed.Bed()
    try:
        with IOTools.openFile(outfile, "w") as outf, \
             IOTools.openFile(lookup_out, "w") as lookup, \
             IOTools.openFile(probe_bed) as probes:

            lookup.write("probe\tfragment\n")
            for probe in Bed.iterator(probes):

                frag = list(digest_fragments.fetch(probe.contig,
                                                   probe.start,
                                                   probe.end,
                                                   parser=pysam.asBed()))
                if len(frag) != 1:
                    E.warn("%i fragments found for probe %s, skipping" %
                           (len(frag), probe.name))
                    continue

                frag = frag[0]
                bed.start = frag.start
                bed.end = frag.end
                bed.contig = frag.contig
                bed["name"] = probe.name
                bed["score"] = "."
                bed["strand"] = "+"

                lookup.write("%s\t%s\n" % (probe.name, frag.name))
                outf.write(str(bed) + "\n")
    finally:
        # release the tabix handle even if reading or writing failed
        digest_fragments.close()
def bedsFromList(data):
    '''Generate one bed entry per item of `data`.

    Each item must be indexable: item[0] is used as the contig,
    item[1] and item[2] are converted to int and used as start and
    end, and everything from position 3 onwards is stored as the
    entry's extra fields.

    Raises ValueError if an item has fewer than three fields or if
    the start/end fields cannot be converted to int.
    '''

    for record in data:
        entry = Bed.Bed()
        try:
            contig = record[0]
            start = int(record[1])
            end = int(record[2])
            entry.contig = contig
            entry.start = start
            entry.end = end
        except IndexError:
            raise ValueError("Insufficient fields to generate bed entry")
        except ValueError:
            raise ValueError("Fields 2 and 3 must be integer")

        # whatever follows the coordinates is carried over untouched
        entry.fields = record[3:]
        yield entry
Exemple #3
0
def sites2fragments(infile, genomefile, outfile):
    '''Convert bedfile of digestion sites into bedfile of fragments.

    Fragments are the stretches between consecutive sites on a contig:
    from position 0 to the first site, between each pair of sites, and
    from the last site to the contig length taken from `genomefile`.
    Fragments are named with a running integer starting at 1.  The
    output is tabix-indexed once written.

    Arguments
    ---------
    infile : string
        Filename of the bed file of digestion sites.  Sites of one
        contig are expected on consecutive lines, since a change of
        contig closes the previous contig.
    genomefile : string
        Filename of a whitespace-separated table whose first two
        columns are contig name and contig length.
    outfile : string
        Filename of the fragment bed file to write and index.
    '''

    with IOTools.openFile(genomefile) as genome:
        # NOTE(review): the last character of the length column is
        # dropped before conversion.  split() already removes the
        # trailing newline, so this looks like it truncates the final
        # digit -- confirm against the expected genomefile format.
        contig_lengths = {
            fields[0]: int(fields[1][:-1])
            for fields in (line.split() for line in genome)
        }

    last_end = 0
    last_contig = None
    name = 0

    # one Bed object is re-used for every output line
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."

    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(infile) as sites:

        def emit(contig, start, end, fragment_name):
            '''write a single fragment line to the output'''
            new_bed.contig = contig
            new_bed.start = start
            new_bed.end = end
            new_bed["name"] = str(fragment_name)
            outf.write(str(new_bed) + "\n")

        for bed in Bed.iterator(sites):

            if last_contig is not None and bed.contig != last_contig:
                # close the previous contig with ITS length, not the
                # length of the contig that is just starting
                name += 1
                emit(last_contig, last_end,
                     contig_lengths[last_contig], name)
                last_end = 0

            name += 1
            emit(bed.contig, last_end, bed.start, name)
            last_contig = bed.contig
            last_end = bed.end

        # trailing fragment of the final contig; nothing to close if
        # the input contained no sites at all
        if last_contig is not None:
            name += 1
            emit(last_contig, last_end, contig_lengths[last_contig], name)

    pysam.tabix_index(outfile, force=True, preset="bed")
Exemple #4
0
def transcript2bed12(transcript):
    '''Build a single bed12 entry from the GTF entries of a transcript.

    The entry spans from the smallest start to the largest end of all
    entries.  thickStart/thickEnd cover the CDS entries; without any
    CDS both are set to the first base of the transcript (its end on
    the minus strand, its start otherwise).  Blocks are taken from the
    "exon" ranges returned by GTF.asRanges, with block starts relative
    to the entry start.  Contig, strand and transcript_id are read
    from the first entry.
    '''

    start = min(gtf.start for gtf in transcript)
    end = max(gtf.end for gtf in transcript)

    coding = [gtf for gtf in transcript if gtf.feature == "CDS"]
    if coding:
        thickStart = min(gtf.start for gtf in coding)
        thickEnd = max(gtf.end for gtf in coding)
    elif transcript[0].strand == "-":
        # no CDS: collapse the thick region onto the first base
        thickStart = thickEnd = end
    else:
        thickStart = thickEnd = start

    exons = GTF.asRanges(transcript, "exon")

    block_starts = []
    block_sizes = []
    for exon_start, exon_end in exons:
        block_starts.append(exon_start - start)
        block_sizes.append(exon_end - exon_start)

    first = transcript[0]

    bed12 = Bed.Bed()
    bed12.contig = first.contig
    bed12.start = start
    bed12.end = end
    bed12["strand"] = first.strand
    bed12["name"] = first.transcript_id

    bed12["thickStart"] = thickStart
    bed12["thickEnd"] = thickEnd

    bed12["blockCount"] = len(exons)
    bed12["blockStarts"] = ",".join(str(offset) for offset in block_starts)
    bed12["blockSizes"] = ",".join(str(size) for size in block_sizes)

    return bed12
Exemple #5
0
def windows2bed12(windows, contig, strand, name, score):
    '''Merge a list of (start, end) intervals into one bed12 entry.

    The intervals are sorted; the entry runs from the start of the
    first to the end of the last interval and each interval becomes
    one block.  The thick region equals the whole entry and the
    colour is fixed to "255,0,0".

    Sanity checks (entry length, block sizes, block starts) are done
    with assert statements after the entry has been filled in.
    '''

    ordered = sorted(windows)

    entry = Bed.Bed()
    entry.start = int(ordered[0][0])
    entry.end = int(ordered[-1][1])
    entry.contig = contig

    # keep the numeric values around so the checks below do not have
    # to parse the comma-separated strings again
    sizes = [int(window[1] - window[0]) for window in ordered]
    offsets = [int(window[0] - ordered[0][0]) for window in ordered]

    entry.fields = [
        name,
        score,
        strand,
        int(entry.start),
        int(entry.end),
        "255,0,0",
        int(len(ordered)),
        ",".join([str(size) for size in sizes]),
        ",".join([str(offset) for offset in offsets]),
    ]

    assert entry.end - entry.start > 0, "Malformed Bed entry entry size less than zero"
    assert all([size > 0 for size in sizes]), \
        "Malformed Bed entry, at least one block size less than zero"
    assert all([entry.start + offset <= entry.end for offset in offsets]), \
        "Malformed Bed entry: block start after end of entry"

    return entry
def getExonLocations(filename):
    '''return a list of exon locations as Bed entries
    from a file containing one ensembl gene ID per line.

    The IDs are looked up in the table geneset_cds_gtf of the
    database given by PARAMS["annotations_database"]; one Bed entry
    (contig, start, end) is returned per matching row.  An empty ID
    file yields an empty list.
    '''
    with IOTools.openFile(filename, "r") as fh:
        ensembl_ids = [line.strip() for line in fh]

    n_ids = len(ensembl_ids)

    # Drop repeated IDs (keeping order) so that splitting the lookup
    # into batches cannot return the same row twice.
    seen = set()
    unique_ids = []
    for gene_id in ensembl_ids:
        if gene_id not in seen:
            seen.add(gene_id)
            unique_ids.append(gene_id)

    # Stay well below SQLite's limit on bound parameters per statement.
    batch_size = 500

    region_list = []
    dbhandle = sqlite3.connect(PARAMS["annotations_database"])
    try:
        cc = dbhandle.cursor()
        try:
            for offset in range(0, len(unique_ids), batch_size):
                batch = unique_ids[offset:offset + batch_size]
                # IDs are passed as bound parameters, never pasted
                # into the SQL text
                statement = (
                    "select contig,start,end from geneset_cds_gtf "
                    "where gene_id in (%s)" % ",".join("?" * len(batch)))
                cc.execute(statement, batch)

                for result in cc:
                    b = Bed.Bed()
                    b.contig, b.start, b.end = result
                    region_list.append(b)
        finally:
            cc.close()
    finally:
        dbhandle.close()

    E.info("Retrieved exon locations for %i genes. Got %i regions" %
           (n_ids, len(region_list)))

    return region_list
Exemple #7
0
def main(argv=sys.argv):
    '''Script entry point: convert GFF/GTF records to BED lines.

    Records are read from options.stdin with GTF.iterator and written
    to options.stdout.  With --track the whole input is loaded, sorted
    and grouped by the chosen attribute (feature or source) and a
    "track name=..." line is written before each group; otherwise
    records are converted one by one in input order.

    Arguments
    ---------
    argv : list
        Command line arguments, forwarded to the option parser.
        Falls back to sys.argv when None.
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input file is in gtf format [default=%default] ")

    parser.add_option(
        "--set-name", dest="name", type="choice",
        help="field from the GFF/GTF file to use as the "
        "name field in the BED file [%default]",
        choices=("gene_id", "transcript_id", "class", "family",
                 "feature", "source", "repName", "gene_biotype"))

    parser.add_option(
        "--track", dest="track", type="choice",
        choices=("feature", "source", None),
        help="use feature/source field to define BED tracks "
        "[default=%default]")

    parser.set_defaults(
        track=None,
        name="gene_id",
        is_gtf=False)

    # forward argv so that main() honours the arguments it is given
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(options.stdin)

    # a single Bed object is re-used for every output line
    bed = Bed.Bed()

    def convert(entries):
        '''write entries as BED lines and return how many were written'''
        nwritten = 0
        for gff in entries:
            bed.fromGTF(gff, name=options.name)
            options.stdout.write(str(bed) + "\n")
            nwritten += 1
        return nwritten

    if options.track:
        # option choices restrict track to "feature" or "source"
        def grouper(entry):
            return getattr(entry, options.track)

        # groupby only merges adjacent items, hence the sort
        all_input = sorted(iterator, key=grouper)

        for key, vals in itertools.groupby(all_input, grouper):
            options.stdout.write("track name=%s\n" % key)
            nconverted = convert(vals)
            ninput += nconverted
            noutput += nconverted

    else:
        nconverted = convert(iterator)
        ninput += nconverted
        noutput += nconverted

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.Stop()