Beispiel #1
0
def bed_store(bedfile, sorted=False):
    """
    Merge the features of `bedfile` and index the merged names both ways.

    The file is first run through mergeBed (called with `s=True, nms=True`
    and the caller's `sorted` flag forwarded unchanged). Every merged
    interval is identified by the string "seqid:start".

    Returns a tuple `(reads, reads_r)`:
      - reads: dict mapping each comma-separated name found in a merged
        interval to that interval's "seqid:start" key.
      - reads_r: defaultdict(list) mapping each "seqid:start" key to the
        list of names gathered under it.
    """
    # NOTE: the `sorted` parameter shadows the builtin inside this function;
    # it is part of the public signature and therefore kept as-is.
    merged_path = mergeBed(bedfile, s=True, nms=True, sorted=sorted)

    reads = {}
    reads_r = defaultdict(list)
    for feature in Bed(merged_path):
        target = "{0}:{1}".format(feature.seqid, feature.start)
        members = feature.accn.split(",")
        # Forward map: every member name points at the merged interval key.
        reads.update(dict.fromkeys(members, target))
        # Reverse map: the interval key accumulates all member names.
        reads_r[target].extend(members)
    return reads, reads_r
Beispiel #2
0
def bed_store(bedfile):
    """
    Merge the features of `bedfile` and build name <-> interval lookups.

    mergeBed is invoked with `s=True, nms=True, sorted=True`; the merged
    file is then parsed with Bed. Each merged interval is keyed by the
    string "seqid:start".

    Returns a tuple `(reads, reads_r)` where `reads` maps every
    comma-separated name of a merged interval to the interval key, and
    `reads_r` (a defaultdict(list)) maps the interval key back to the list
    of names.
    """
    merged_path = mergeBed(bedfile, s=True, nms=True, sorted=True)

    reads = {}
    reads_r = defaultdict(list)
    for feature in Bed(merged_path):
        target = "{0}:{1}".format(feature.seqid, feature.start)
        members = feature.accn.split(",")
        bucket = reads_r[target]
        for name in members:
            reads[name] = target
        bucket.extend(members)
    return reads, reads_r
Beispiel #3
0
def insertionpairs(args):
    """
    %prog insertionpairs endpoints.bed

    Pair up the candidate endpoints. A candidate exision point would contain
    both left-end (LE) and right-end (RE) within a given distance.

    -----------|   |------------
        -------|   |--------
      ---------|   |----------
            (RE)   (LE)
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(insertionpairs.__doc__)
    p.add_option(
        "--extend",
        type="int",
        default=10,
        help="Allow insertion sites to match up within distance",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile = args[0]
    # Endpoints within --extend of each other are merged into one record
    # whose name field is the comma-joined list of the original names.
    merged = Bed(mergeBed(bedfile, d=opts.extend, nms=True))
    fw = must_open(opts.outfile, "w")

    def neg_reads(endpoint):
        # Negated read count: the smallest key is the best-supported endpoint.
        return -endpoint.reads

    for b in merged:
        endpoints = [EndPoint(name) for name in b.accn.split(",")]
        right_ends = [e for e in endpoints if e.leftright == "RE"]
        left_ends = [e for e in endpoints if e.leftright == "LE"]
        # A usable site needs at least one endpoint of each kind.
        if not right_ends or not left_ends:
            continue

        # First endpoint with the highest read count on each side.
        best_re = min(right_ends, key=neg_reads)
        best_le = min(left_ends, key=neg_reads)
        pos_re = best_re.position
        pos_le = best_le.position

        # Span the two chosen positions, whichever order they come in.
        low, high = min(pos_le, pos_re), max(pos_le, pos_re)
        b.start = low - 1
        b.end = high
        b.accn = "{0}|{1}".format(best_re.label, best_le.label)
        b.score = pos_le - pos_re - 1
        print(b, file=fw)
Beispiel #4
0
def insertionpairs(args):
    """
    %prog insertionpairs endpoints.bed

    Pair up the candidate endpoints. A candidate exision point would contain
    both left-end (LE) and right-end (RE) within a given distance.

    -----------|   |------------
        -------|   |--------
      ---------|   |----------
            (RE)   (LE)
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(insertionpairs.__doc__)
    p.add_option("--extend", default=10, type="int",
                 help="Allow insertion sites to match up within distance")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    # Endpoints within --extend of each other are merged into one record
    # whose name field is the comma-joined list of the original names.
    mergedbedfile = mergeBed(bedfile, d=opts.extend, nms=True)
    bed = Bed(mergedbedfile)
    fw = must_open(opts.outfile, "w")
    # Sort key: most-supported endpoint (highest read count) first.
    support = lambda x: -x.reads
    for b in bed:
        names = b.accn.split(",")
        ends = [EndPoint(x) for x in names]
        REs = sorted([x for x in ends if x.leftright == "RE"], key=support)
        LEs = sorted([x for x in ends if x.leftright == "LE"], key=support)
        # A usable site needs at least one endpoint of each kind.
        if not (REs and LEs):
            continue
        mRE, mLE = REs[0], LEs[0]
        pRE, pLE = mRE.position, mLE.position
        # Span the two chosen positions, whichever order they come in.
        if pLE < pRE:
            b.start, b.end = pLE - 1, pRE
        else:
            b.start, b.end = pRE - 1, pLE
        b.accn = "{0}|{1}".format(mRE.label, mLE.label)
        b.score = pLE - pRE - 1
        # print() call instead of the `print >> fw` chevron form, which
        # raises TypeError once print is a function.
        print(b, file=fw)
Beispiel #5
0
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(fill.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    # Gaps closer than two flanks (+1) are merged so that the flanking
    # ranges written below cannot intersect each other.
    gapdist = 2 * Ext + 1
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    # Context manager guarantees the flank file is flushed and closed before
    # fastaFromBed reads it, and is not leaked if the loop raises.
    with open(extbed, "w") as fw:
        for b in bed:
            gapname = b.accn
            # Left flank: Ext bases ending just before the gap, clamped at 0.
            start, end = max(0, b.start - Ext - 1), b.start - 1
            print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "L")),
                  file=fw)
            # Right flank: Ext bases after the gap, clamped at sequence end.
            start, end = b.end, min(sizes[b.seqid], b.end + Ext)
            print("\t".join(str(x) for x in (b.seqid, start, end, gapname + "R")),
                  file=fw)

    fastaFromBed(extbed, badfasta, name=True)
Beispiel #6
0
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(fill.__doc__)
    p.add_option("--extend", default=2000, type="int",
                 help="Extend seq flanking the gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    # Gaps closer than two flanks (+1) are merged so that the flanking
    # ranges written below cannot intersect each other.
    gapdist = 2 * Ext + 1
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    # Context manager guarantees the flank file is flushed and closed before
    # fastaFromBed reads it, and is not leaked if the loop raises.
    with open(extbed, "w") as fw:
        for b in bed:
            gapname = b.accn
            # Left flank: Ext bases ending just before the gap, clamped at 0.
            start, end = max(0, b.start - Ext - 1), b.start - 1
            # print() calls replace the `print >> fw` chevron form, which
            # raises TypeError once print is a function.
            print("\t".join(str(x) for x in
                            (b.seqid, start, end, gapname + "L")), file=fw)
            # Right flank: Ext bases after the gap, clamped at sequence end.
            start, end = b.end, min(sizes[b.seqid], b.end + Ext)
            print("\t".join(str(x) for x in
                            (b.seqid, start, end, gapname + "R")), file=fw)

    fastaFromBed(extbed, badfasta, name=True)
Beispiel #7
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # newbed is accepted on the command line but not used in this function.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    # Only genes that have a coverage value take part in the grouping.
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are managed together so that every handle (including
    # paste.bed and extend.bed.ids, which used to be left open) is closed
    # before extra.bed is handed to mergeBed below.
    with open(extrabed, "w") as fw, \
            open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        # groupby only merges adjacent items; this assumes `obed` iterates
        # with records of one seqid contiguous -- TODO confirm.
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Consecutive runs of genes on the same side of the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len({gene_name(x.accn) for x in beds})
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): gi - 1 wraps to the last record when
                    # gi == 0 and gi + 1 can run past the end for the last
                    # record; left unchanged, confirm intended handling.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, _ = range_distance(leftr, cur)
                    distance_to_right, _ = range_distance(cur, rightr)
                    span, _ = range_distance(leftr, rightr)

                    if distance_to_left <= distance_to_right and distance_to_left > 0:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    # Only neighbours within maxsize of each other yield a
                    # patch record.
                    if 0 < span <= maxsize:
                        print(
                            "\t".join(
                                str(x)
                                for x in (seqid, leftb.start, rightb.end, gb.accn)
                            ),
                            file=fwp,
                        )

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print(
                        "L:{0} R:{1} [{2}]".format(
                            distance_to_left, distance_to_right, label
                        ),
                        file=fwe,
                    )
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                # Whole cassette: min/max of the member ranges, padded by
                # the flank on both sides.
                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print(
                    "\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)),
                    file=fw,
                )

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Beispiel #8
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # newbed is accepted on the command line but not used in this function.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    # Only genes that have a coverage value take part in the grouping.
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are managed together so that every handle (including
    # paste.bed and extend.bed.ids, which used to be left open) is closed
    # before extra.bed is handed to mergeBed below.
    with open(extrabed, "w") as fw, \
            open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        # groupby only merges adjacent items; this assumes `obed` iterates
        # with records of one seqid contiguous -- TODO confirm.
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Consecutive runs of genes on the same side of the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len({gene_name(x.accn) for x in beds})
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): gi - 1 wraps to the last record when
                    # gi == 0 and gi + 1 can run past the end for the last
                    # record; left unchanged, confirm intended handling.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, _ = range_distance(leftr, cur)
                    distance_to_right, _ = range_distance(cur, rightr)
                    span, _ = range_distance(leftr, rightr)

                    if distance_to_left <= distance_to_right and \
                       distance_to_left > 0:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    # print() calls replace the `print >> fh` chevron form,
                    # which raises TypeError once print is a function.
                    # Only neighbours within maxsize of each other yield a
                    # patch record.
                    if 0 < span <= maxsize:
                        print("\t".join(str(x) for x in
                                        (seqid, leftb.start, rightb.end, gb.accn)),
                              file=fwp)

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print("L:{0} R:{1} [{2}]".format(distance_to_left,
                                                     distance_to_right, label),
                          file=fwe)
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                # Whole cassette: min/max of the member ranges, padded by
                # the flank on both sides.
                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print("\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)),
                      file=fw)

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Beispiel #9
0
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity",
                 choices=("breakpoint", "variant"),
                 default="variant",
                 help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Sample prefix = basename up to the first '-'; it is prepended to every
    # feature name so the origin can be recovered after merging.
    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    # Context manager so novel.bed is closed once the scan is finished
    # (the handle was previously left open and later rebound).
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_accns = [x.split("-")[0] for x in accns]
            pfs_counts = Counter(pfs_accns)
            # Sites not seen in all three samples are written to novel.bed.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            if len(P1_accns) != 1:
                continue

            ri, ref = neworder[P1_accns[0]]
            # Features of the F1 records, compared against the single P1
            # reference feature.
            F1_features = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_features)
            bp_diff.extend(x.end - ref.end for x in F1_features)

    print("A total of {0} sites show consistent deletions across samples.".\
                    format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".\
                    format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print("{0:>9} variants: {1}".format(l, h), file=sys.stderr)
            plt.text(l,
                     h + 5,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        # Fold negative and positive offsets together for the histogram.
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # zip() returns an iterator on Python 3 and cannot be sliced
        # directly; materialize it before taking the first 21 bars.
        for l, h in list(zip(left, height))[:21]:
            plt.text(l,
                     h + 50,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")
        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
Beispiel #10
0
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Sample prefix = basename up to the first '-'; it is prepended to every
    # feature name so the origin can be recovered after merging.
    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    # Context manager so novel.bed is closed once the scan is finished
    # (the handle was previously left open and later rebound).
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_accns = [x.split("-")[0] for x in accns]
            pfs_counts = Counter(pfs_accns)
            # Sites not seen in all three samples are written to novel.bed.
            # print() replaces the `print >> fw` chevron form, which raises
            # TypeError once print is a function.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            if len(P1_accns) != 1:
                continue

            ri, ref = neworder[P1_accns[0]]
            # Features of the F1 records, compared against the single P1
            # reference feature.
            F1_features = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_features)
            bp_diff.extend(x.end - ref.end for x in F1_features)

    print("A total of {0} sites show consistent deletions across samples.".
          format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".
              format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print("{0:>9} variants: {1}".format(l, h), file=sys.stderr)
            plt.text(l, h + 5, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        # Fold negative and positive offsets together for the histogram.
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # zip() returns an iterator on Python 3 and cannot be sliced
        # directly; materialize it before taking the first 21 bars.
        for l, h in list(zip(left, height))[:21]:
            plt.text(l, h + 50, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")
        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)