Ejemplo n.º 1
0
    def write_agp(self, filename):
        """
        Write the scaffold layout to `filename` as AGP lines, then reindex it.

        For every (scaffold, lines) pair produced by `self.iter_scaffold()`,
        one `AGPLine.cline` is emitted per entry and one `AGPLine.gline`
        (built from that entry's `gaps`) is emitted between consecutive
        entries, so no gap line trails the last entry of a scaffold. The
        finished file is then passed to `reindex` with `--inplace`.

        Returns `filename`.
        """
        sizes = self.sz
        agp = []
        for scaffold, lines in self.iter_scaffold():
            last = len(lines) - 1
            for i, a in enumerate(lines):
                agp.append(AGPLine.cline(scaffold, a.tig, sizes, a.oo))
                # A gap line only separates two entries, so none follows the
                # final entry (an empty `lines` simply contributes nothing).
                if i < last:
                    agp.append(AGPLine.gline(scaffold, a.gaps))

        # `with` closes the handle even when a write fails; fw.write() behaves
        # the same on Python 2 and Python 3, unlike the `print >>` statement.
        with open(filename, "w") as fw:
            for a in agp:
                fw.write(str(a) + "\n")

        reindex([filename, "--inplace"])
        return filename
Ejemplo n.º 2
0
    def write_agp(self, filename):
        """
        Write the scaffold layout to `filename` as AGP lines, then reindex it.

        For every (scaffold, lines) pair produced by `self.iter_scaffold()`,
        one `AGPLine.cline` is emitted per entry and one `AGPLine.gline`
        (built from that entry's `gaps`) is emitted between consecutive
        entries, so no gap line trails the last entry of a scaffold. The
        finished file is then passed to `reindex` with `--inplace`.

        Returns `filename`.
        """
        sizes = self.sz
        agp = []
        for scaffold, lines in self.iter_scaffold():
            last = len(lines) - 1
            for i, a in enumerate(lines):
                agp.append(AGPLine.cline(scaffold, a.tig, sizes, a.oo))
                # A gap line only separates two entries, so none follows the
                # final entry (an empty `lines` simply contributes nothing).
                if i < last:
                    agp.append(AGPLine.gline(scaffold, a.gaps))

        # `with` closes the handle even when a write fails; fw.write() behaves
        # the same on Python 2 and Python 3, unlike the `print >>` statement.
        with open(filename, "w") as fw:
            for a in agp:
                fw.write(str(a) + "\n")

        reindex([filename, "--inplace"])
        return filename
Ejemplo n.º 3
0
    def write_AGP(self, filename, orientationguide={}, reindex=True):
        """
        Write one AGP component line per BAC, plus any flanking gap lines.

        For each component, we have two overlaps: North and South.

        =======
           ||||             South
           ====(=================)  Current BAC
           North             ||||
                             ===============

        For the case that says "Non-terminal", the overlap will not be
        considered. North-South would suggest a '+' orientation, South-North
        would suggest a '-' orientation. In most cases, unless the overlap
        involves phase1 BAC, the selected range will be shown as the brackets
        above - exclude North overlap, and include South overlap (aka the
        "left-greedy" rule).

        Args:
            filename: output path, opened through `must_open`.
            orientationguide: mapping of component id to orientation, used
                only when neither overlap determines the orientation ('+' is
                used for ids that are missing). It is only read, never
                modified, so the shared default dict is harmless.
            reindex: when true, run `jcvi.formats.agp.reindex` on the output
                and move the `.reindexed.agp` file over `filename`.

        Raises:
            ValueError: if a component id does not have exactly two
                consecutive records in `self.lines`.
        """
        fw = must_open(filename, "w")
        # try/finally so the handle is released even if a record is malformed
        try:
            for aid, bb in groupby(self.lines, key=lambda x: x.aid):
                # Exactly two records (North, then South) are expected per
                # component; both share `aid` because it is the grouping key.
                north, south = list(bb)

                aphase = north.aphase
                size = north.asize
                # Object name followed by three zero placeholders
                # (presumably the columns that reindex fills in - confirm).
                ar = [north.chr, 0, 0, 0]

                northline = southline = None
                northrange = southrange = None

                # Warn if adjacent components do not have valid overlaps
                if south.is_no_overlap:
                    sys.stderr.write(str(south) + "\n")

                # Most gaps, except telomeres occur twice, so only do the "North"
                if north.is_gap:
                    bar = ar + self.get_agp_gap(north.bid)
                    northline = "\t".join(str(x) for x in bar)
                elif north.isTerminal():
                    northrange = north.astart, north.astop

                if south.is_gap:
                    if south.bid == "telomere":
                        bar = ar + self.get_agp_gap(south.bid)
                        southline = "\t".join(str(x) for x in bar)
                elif south.isTerminal():
                    southrange = south.astart, south.astop
                else:
                    # Non-terminal South overlap: emit a "fragment" gap
                    bar = ar + self.get_agp_gap("fragment")
                    southline = "\t".join(str(x) for x in bar)

                # Determine the orientation and clear range for the current BAC
                clr = [1, size]
                orientation = sorientation = None
                if northrange:
                    start, stop = northrange
                    Lhang = start - 1
                    Rhang = size - stop

                    orientation = '+' if Lhang < Rhang else '-'
                    if north.bphase == 1 and north.bphase < aphase:
                        # Override left-greedy (also see below): the North
                        # overlap stays inside the clear range.
                        if Lhang < Rhang:  # North overlap at 5`
                            clr[0] = start
                        else:
                            clr[1] = stop
                    else:
                        # Left-greedy default: exclude the North overlap
                        if Lhang < Rhang:
                            clr[0] = stop + 1
                        else:
                            clr[1] = start - 1

                if southrange:
                    start, stop = southrange
                    Lhang = start - 1
                    Rhang = size - stop

                    sorientation = '+' if Lhang > Rhang else '-'
                    if aphase == 1 and aphase < south.bphase:
                        # Override left-greedy (also see above): the South
                        # overlap is removed from the clear range.
                        if Lhang < Rhang:  # South overlap at 5`
                            clr[0] = stop + 1
                        else:
                            clr[1] = start - 1
                    else:
                        # Left-greedy default: include the South overlap
                        if Lhang < Rhang:
                            clr[0] = start
                        else:
                            clr[1] = stop

                if orientation:
                    # A disagreement is only logged; the North-derived
                    # orientation wins. A plain comparison is used instead of
                    # assert so the message is still logged under `python -O`.
                    if sorientation and orientation != sorientation:
                        logging.debug(
                            "Orientation conflicts:\n{0}\n{1}".format(north, south))
                elif sorientation:
                    orientation = sorientation
                else:  # Both overlaps fail to define orientation
                    orientation = orientationguide.get(aid, "+")

                # AGP component types: "D" is draft, "F" is finished
                component_type = "D" if aphase in (1, 2) else "F"
                bar = ar + [component_type, aid, clr[0], clr[1], orientation]
                cline = "\t".join(str(x) for x in bar)

                if northline:
                    fw.write(northline + "\n")
                fw.write(cline + "\n")
                if southline:
                    fw.write(southline + "\n")
        finally:
            fw.close()

        if reindex:
            # Imported under an alias so the boolean `reindex` parameter is
            # not rebound to the function.
            from jcvi.formats.agp import reindex as reindex_agp
            reindex_agp([filename])
            # The reindexed copy is expected next to the input under this name
            newagpfile = filename.replace(".agp", ".reindexed.agp")
            shutil.move(newagpfile, filename)
Ejemplo n.º 4
0
    def write_AGP(self, filename, orientationguide={}):
        """
        Write one AGP component line per BAC, plus any flanking gap lines,
        then reindex the file in place.

        For each component, we have two overlaps: North and South.

        =======
           ||||             South
           ====(=================)  Current BAC
           North             ||||
                             ===============

        For the case that says "Non-terminal", the overlap will not be
        considered. North-South would suggest a '+' orientation, South-North
        would suggest a '-' orientation. In most cases, unless the overlap
        involves phase1 BAC, the selected range will be shown as the brackets
        above - exclude North overlap, and include South overlap (aka the
        "left-greedy" rule).

        Args:
            filename: output path, opened through `must_open`.
            orientationguide: mapping of component id to orientation, used
                only when neither overlap determines the orientation ('+' is
                used for ids that are missing). It is only read, never
                modified, so the shared default dict is harmless.

        Raises:
            ValueError: if a component id does not have exactly two
                consecutive records in `self.lines`.
        """
        fw = must_open(filename, "w")
        # try/finally so the handle is released even if a record is malformed
        try:
            for aid, bb in groupby(self.lines, key=lambda x: x.aid):
                # Exactly two records (North, then South) are expected per
                # component; both share `aid` because it is the grouping key.
                north, south = list(bb)

                aphase = north.aphase
                size = north.asize
                # Object name followed by three zero placeholders
                # (presumably the columns that reindex fills in - confirm).
                ar = [north.chr, 0, 0, 0]

                northline = southline = None
                northrange = southrange = None

                # Warn if adjacent components do not have valid overlaps
                if south.is_no_overlap:
                    sys.stderr.write(str(south) + "\n")

                # Most gaps, except telomeres occur twice, so only do the "North"
                # NOTE(review): `isTerminal` is read as an attribute here; if
                # it is a plain method rather than a property, this test is
                # always true - confirm against the class definition.
                if north.is_gap:
                    bar = ar + self.get_agp_gap(north.bid)
                    northline = "\t".join(str(x) for x in bar)
                elif north.isTerminal:
                    northrange = north.astart, north.astop

                if south.is_gap:
                    if south.bid == "telomere":
                        bar = ar + self.get_agp_gap(south.bid)
                        southline = "\t".join(str(x) for x in bar)
                elif south.isTerminal:
                    southrange = south.astart, south.astop
                else:
                    # Non-terminal South overlap: emit a "fragment" gap
                    bar = ar + self.get_agp_gap("fragment")
                    southline = "\t".join(str(x) for x in bar)

                # Determine the orientation and clear range for the current BAC
                clr = [1, size]
                orientation = sorientation = None
                if northrange:
                    start, stop = northrange
                    Lhang = start - 1
                    Rhang = size - stop

                    orientation = '+' if Lhang < Rhang else '-'
                    if north.bphase == 1 and north.bphase < aphase:
                        # Override left-greedy (also see below): the North
                        # overlap stays inside the clear range.
                        if Lhang < Rhang:  # North overlap at 5`
                            clr[0] = start
                        else:
                            clr[1] = stop
                    else:
                        # Left-greedy default: exclude the North overlap
                        if Lhang < Rhang:
                            clr[0] = stop + 1
                        else:
                            clr[1] = start - 1

                if southrange:
                    start, stop = southrange
                    Lhang = start - 1
                    Rhang = size - stop

                    sorientation = '+' if Lhang > Rhang else '-'
                    if aphase == 1 and aphase < south.bphase:
                        # Override left-greedy (also see above): the South
                        # overlap is removed from the clear range.
                        if Lhang < Rhang:  # South overlap at 5`
                            clr[0] = stop + 1
                        else:
                            clr[1] = start - 1
                    else:
                        # Left-greedy default: include the South overlap
                        if Lhang < Rhang:
                            clr[0] = start
                        else:
                            clr[1] = stop

                if orientation:
                    # A disagreement is only logged; the North-derived
                    # orientation wins. A plain comparison is used instead of
                    # assert so the message is still logged under `python -O`.
                    if sorientation and orientation != sorientation:
                        logging.debug(
                            "Orientation conflicts:\n{0}\n{1}".format(north, south))
                elif sorientation:
                    orientation = sorientation
                else:  # Both overlaps fail to define orientation
                    orientation = orientationguide.get(aid, "+")

                # AGP component types: "D" is draft, "F" is finished
                component_type = "D" if aphase in (1, 2) else "F"
                bar = ar + [component_type, aid, clr[0], clr[1], orientation]
                cline = "\t".join(str(x) for x in bar)

                if northline:
                    fw.write(northline + "\n")
                fw.write(cline + "\n")
                if southline:
                    fw.write(southline + "\n")
        finally:
            fw.close()

        reindex([filename, "--inplace"])
Ejemplo n.º 5
0
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    Reads `input.chr.agp` and `input.lifted.bed`, and writes the AGP with
    estimated gap sizes to `input.estimategaps.agp`.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int", help="Minimum gap size")
    p.add_option("--maxsize",
                 default=500000,
                 type="int",
                 help="Maximum gap size")
    p.add_option("--links",
                 default=10,
                 type="int",
                 help="Only use linkage groups with at least this many matchings")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    # All companion file names are derived from the input name minus its
    # last extension.
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")
    # try/finally so the output handle is closed even if estimation fails
    try:
        for ob, components in agp.iter_object():
            components = list(components)
            mlg_counts = Scaffold(ob, cc).mlg_counts
            gaps = [x for x in components if x.is_gap]
            gapsizes = [None] * len(gaps)  # master
            for mlg, count in mlg_counts.items():
                # Skip linkage groups with too few matchings
                if count < links:
                    continue
                g = GapEstimator(cc, agp, ob, mlg)
                g.compute_all_gaps(minsize=minsize, maxsize=maxsize,
                                   verbose=verbose)
                # Merge evidence from this mlg into master: take the first
                # estimate seen, then keep the smaller of any truthy ones.
                assert len(g.gapsizes) == len(gaps)
                for i, (gs, gg) in enumerate(zip(gapsizes, g.gapsizes)):
                    if gs is None:
                        gapsizes[i] = gg
                    elif gg:
                        gapsizes[i] = min(gs, gg)

            # Single-argument call form prints the same on Python 2 and 3
            print(gapsizes)
            # Modify AGP
            i = 0
            for x in components:
                if x.is_gap:
                    # Fall back to minsize when no estimate is available
                    x.gap_length = gapsizes[i] or minsize
                    # AGP convention: 'U' is a gap of unknown size (100 bp by
                    # convention), 'N' is a gap of specified size.
                    x.component_type = 'U' if x.gap_length == 100 else 'N'
                    i += 1
                fw.write(str(x) + "\n")
    finally:
        fw.close()

    reindex([outagpfile, "--inplace"])
Ejemplo n.º 6
0
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    Reads `input.chr.agp` and `input.lifted.bed`, and writes the AGP with
    estimated gap sizes to `input.estimategaps.agp`.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage groups with at least this many matchings")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    # All companion file names are derived from the input name minus its
    # last extension.
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")
    # try/finally so the output handle is closed even if estimation fails
    try:
        for ob, components in agp.iter_object():
            components = list(components)
            mlg_counts = Scaffold(ob, cc).mlg_counts
            gaps = [x for x in components if x.is_gap]
            gapsizes = [None] * len(gaps)   # master
            for mlg, count in mlg_counts.items():
                # Skip linkage groups with too few matchings
                if count < links:
                    continue
                g = GapEstimator(cc, agp, ob, mlg)
                g.compute_all_gaps(minsize=minsize, maxsize=maxsize,
                                   verbose=verbose)
                # Merge evidence from this mlg into master: take the first
                # estimate seen, then keep the smaller of any truthy ones.
                assert len(g.gapsizes) == len(gaps)
                for i, (gs, gg) in enumerate(zip(gapsizes, g.gapsizes)):
                    if gs is None:
                        gapsizes[i] = gg
                    elif gg:
                        gapsizes[i] = min(gs, gg)

            # Single-argument call form prints the same on Python 2 and 3
            print(gapsizes)
            # Modify AGP
            i = 0
            for x in components:
                if x.is_gap:
                    # Fall back to minsize when no estimate is available
                    x.gap_length = gapsizes[i] or minsize
                    # AGP convention: 'U' is a gap of unknown size (100 bp by
                    # convention), 'N' is a gap of specified size.
                    x.component_type = 'U' if x.gap_length == 100 else 'N'
                    i += 1
                fw.write(str(x) + "\n")
    finally:
        fw.close()

    reindex([outagpfile, "--inplace"])