def write_agp(self, obj, sizes, fw=sys.stdout, gapsize=100, gaptype="contig", evidence="map"):
    """Write this contig ordering to `fw` as AGP records for object `obj`.

    Every element of `self` contributes one (contig_name, strand) pair, in
    iteration order. The resulting list is handed to `order_to_agp` together
    with `sizes` and the gap settings (`gapsize`, `gaptype`, `evidence`).
    """
    contigorder = []
    for entry in self:
        contigorder.append((entry.contig_name, entry.strand))
    order_to_agp(
        obj, contigorder, sizes, fw,
        gapsize=gapsize, gaptype=gaptype, evidence=evidence
    )
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # implementation notes are kept in comments.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # One AGP object per *.ordering file, named after the file's basename up
    # to the first dot; remember which contigs were placed along the way.
    anchored = set()
    for ofile in orderingfiles:
        ordering = ContigOrdering(ofile)
        anchored.update(rec.contig_name for rec in ordering)
        group = op.basename(ofile).split('.')[0]
        ordering.write_agp(group, sizes, fwagp)

    # Contigs that appear in no ordering file become their own objects with
    # unknown ("?") orientation.
    singletons = contigs.difference(anchored)
    summary_msg = 'Anchored: {}, Singletons: {}'.format(
        len(anchored), len(singletons))
    logging.debug(summary_msg)
    for name in natsorted(singletons):
        order_to_agp(name, [(name, "?")], sizes, fwagp)
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: this docstring is reused verbatim as the command-line usage
    # message (see the OptionParser call below).
    optparser = OptionParser(agp.__doc__)
    optparser.set_outfile()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # Emit every ordering file as one AGP object while collecting the names
    # of the contigs it places.
    anchored = set()
    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        for record in co:
            anchored.add(record.contig_name)
        # Object name = file basename truncated at the first '.'
        objname = op.basename(ofile).split('.')[0]
        co.write_agp(objname, sizes, fwagp)

    # Whatever was never anchored is written as a stand-alone object with
    # "?" orientation.
    singletons = contigs - anchored
    n_anchored, n_singletons = len(anchored), len(singletons)
    logging.debug('Anchored: {}, Singletons: {}'.format(n_anchored, n_singletons))
    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def graph_to_agp(g, blastfile, subjectfasta, exclude=(), verbose=False):
    """Decompose graph `g` into paths and write them as an AGP file.

    The graph is first dumped to ``graph.txt``. Every path yielded by
    ``g.iter_paths()`` with more than one component becomes an AGP object
    named ``pmol_NNNN``; each remaining sequence in `subjectfasta` that is
    neither scaffolded nor listed in `exclude` is written as its own object
    in "+" orientation. Output goes to ``blastfile + ".agp"``.

    Args:
        g: graph object providing ``write``, ``iter_paths`` and ``path``.
        blastfile: path whose name, suffixed with ``.agp``, is the output.
        subjectfasta: FASTA file passed to ``Sizes`` for sequence lengths.
        exclude: container of sequence names to leave out of the singleton
            section (only used for membership tests).
        verbose: when true, print each retained path to stdout.
    """
    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug(
        "Graph decomposed to {0} paths with {1} components.".format(npaths, ntigs)
    )

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = nscaffolded = nexcluded = 0
    # The context manager guarantees the AGP file is closed even when
    # order_to_agp raises part-way through.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            # A truthy strand flag maps to "+", anything else to "-".
            ctgorder = [(str(ctg), ("+" if strand else "-")) for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            scaffold_name = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                nscaffolded += 1
                continue
            if ctg in exclude:
                nexcluded += 1
                continue
            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug(
        "scaffolded={} excluded={} singletons={}".format(
            nscaffolded, nexcluded, nsingletons
        )
    )
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """Write an AGP entry for every scaffold not used as a component in `agpfile`.

    Component ids are collected from `agpfile`; each name in the sizes
    mapping of `scaffolds` that is absent from that set is written, in
    sorted order, to `unplaced_agp` as a single-component object with "?"
    orientation.
    """
    placed = set()
    for record in AGP(agpfile):
        placed.add(record.component_id)

    sizes = Sizes(scaffolds).mapping
    fwagp = must_open(unplaced_agp, "w")
    for name in sorted(sizes.keys()):
        if name not in placed:
            order_to_agp(name, [(name, "?")], sizes, fwagp)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def graph_to_agp(g, blastfile, subjectfasta, exclude=(), verbose=False):
    """Decompose graph `g` into paths and write them as an AGP file.

    The graph is first dumped to ``graph.txt``. Every path yielded by
    ``g.iter_paths()`` with more than one component becomes an AGP object
    named ``pmol_NNNN``; each remaining sequence in `subjectfasta` that is
    neither scaffolded nor listed in `exclude` is written as its own object
    in "+" orientation. Output goes to ``blastfile + ".agp"``.

    Args:
        g: graph object providing ``write``, ``iter_paths`` and ``path``.
        blastfile: path whose name, suffixed with ``.agp``, is the output.
        subjectfasta: FASTA file passed to ``Sizes`` for sequence lengths.
        exclude: container of sequence names to leave out of the singleton
            section (only used for membership tests).
        verbose: when true, print each retained path to stdout.
    """
    from jcvi.formats.agp import order_to_agp

    logging.debug(str(g))
    g.write("graph.txt")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = nscaffolded = nexcluded = 0
    # `with` closes the AGP file even if order_to_agp raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            # A truthy strand flag maps to "+", anything else to "-".
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            scaffold_name = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                nscaffolded += 1
                continue
            if ctg in exclude:
                nexcluded += 1
                continue
            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("scaffolded={} excluded={} singletons={}".
                  format(nscaffolded, nexcluded, nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    #
    # Files written (pf = ctgfasta without its last extension):
    #   pf.phases    tab-separated bucket name and phase
    #   pf.new.agp   one AGP object per bucket
    #   final.fasta  built from pf.new.agp, then handed to tidy()
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    # Both outputs are managed by `with` so they are flushed and closed even
    # on error (the phase file was previously never closed).
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        scaffoldbuckets = defaultdict(list)

        bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
        bb = Bed(bedfile)
        for s, partialorder in bb.sub_beds():
            name = partialorder[0].accn
            # With --prefix, group by the name up to its last underscore;
            # otherwise group by the key yielded by sub_beds().
            bname = name.rsplit("_", 1)[0] if opts.prefix else s
            scaffoldbuckets[bname].append(
                [(b.accn, b.strand) for b in partialorder])

        # Now the buckets contain a mixture of singletons and partially
        # resolved scaffolds. Print the scaffolds first then remaining
        # singletons.
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            for scaf in sorted(scaffolds):
                ctgorder.extend(scaf)
                if len(scaf) == 1:
                    singletons.add(scaf[0][0])
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)
            # `nscaffolds` counts every entry, singletons included, so it is
            # never 0 here and the old `nscaffolds == 0` test made phase 3
            # unreachable. Decide the phase on multi-contig entries only.
            nmulti = sum(1 for scaf in scaffolds if len(scaf) > 1)
            # NOTE(review): phase numbers presumably follow GenBank HTG
            # phases (3 = single contig, 2 = one ordered scaffold,
            # 1 = anything else) -- confirm.
            if nsingletons == 1 and nmulti == 0:
                phase = 3
            elif nsingletons == 0 and nmulti == 1:
                phase = 2
            else:
                phase = 1

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # file.write works on both Python 2 and 3, unlike `print >>`.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
            help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
            help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
            help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # --clique links every pair of hits on a query; the default links
        # consecutive hits only.
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    logging.debug(str(g))

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # `with` closes the AGP file even if order_to_agp raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            # A truthy strand flag maps to "+", anything else to "-".
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            scaffold_name = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue
            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    #
    # Files written (pf = input.bed without its last extension):
    #   pf.chr.agp      ordered scaffolds in AGP format
    #   pf.tour         one header line and one tour line per solution
    #   pf.summary.txt  produced by summary()
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".
                      format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Translate the option name into the callable used for linkage.
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median,
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            # Accumulate marker counts per consensus, scaled by map weight.
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        # Partitions that contain no LG from the pivot map are skipped.
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        # Report the tour on stderr and record it in the tour file.
        # file.write works on both Python 2 and 3, unlike `print >> fw`.
        header = ">{0} ({1})".format(s.object, tag)
        tour = " ".join("".join(x) for x in s.tour)
        for fw in (sys.stderr, fwtour):
            fw.write(header + "\n")
            fw.write(tour + "\n")
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
        format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
        format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    #
    # Files written: `scaffold.log` (handed to solve_component) and
    # `<ctgfasta prefix>.agp`.
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"

    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"

    g = nx.MultiGraph()  # use this to get connected components

    # `with` closes the links file, which was previously left open.
    with open(linksfile) as fp:
        for row in fp:
            c = LinkLine(row)
            # Link distances below 50 are raised to 50.
            distance = max(c.distance, 50)
            g.add_edge(c.aseqid, c.bseqid,
                       orientation=c.orientation, distance=distance)

    def get_bname(sname, prefix=False):
        # Bucket by the name up to its last underscore, or lump everything
        # into a single "chr0" bucket when prefix grouping is off.
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    scaffoldbuckets = defaultdict(list)
    # NOTE(review): connected_component_subgraphs is not available in every
    # networkx release -- confirm against the version jcvi pins.
    with open(logfile, "w") as fwlog:
        for h in nx.connected_component_subgraphs(g):
            partialorder = solve_component(h, sizes, fwlog)
            name = partialorder[0][0]
            bname = get_bname(name, prefix=opts.prefix)
            scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in sorted(sizes.keys()):
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    with open(agpfile, "w") as fwagp:
        for bname, ctgs in sorted(ctgbuckets.items()):
            scaffolds = scaffoldbuckets[bname]
            scaffolded = set()
            for scafID, scaf in enumerate(scaffolds):
                ctgorder = []
                for node, start, end, orientation in scaf:
                    ctgorder.append((node, orientation))
                    scaffolded.add(node)
                order_to_agp(scafname.format(bname, scafID), ctgorder,
                             sizes, fwagp)

            singletons = sorted(ctgs - scaffolded)
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)

            msg = "{0}: Scaffolds={1} Singletons={2}".\
                format(bname, nscaffolds, nsingletons)
            # file.write works on both Python 2 and 3, unlike `print >>`.
            sys.stderr.write(msg + "\n")

            for singleton in singletons:
                order_to_agp(singleton, [(singleton, "+")], sizes, fwagp)

    logging.debug("AGP file written to `{0}`.".format(agpfile))
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    #
    # Files written (pf = input.bed without its last extension):
    #   pf.chr.agp      ordered scaffolds in AGP format
    #   pf.tour         one header line and one tour line per solution
    #   pf.summary.txt  produced by summary()
    oargs = args
    p = OptionParser(path.__doc__)
    # Hidden options; when given they override the positional arguments.
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile

    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".
                      format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Translate the option name into the callable used for linkage.
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            # Accumulate marker counts per consensus, scaled by map weight.
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        # Partitions that contain no LG from the pivot map are skipped.
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        # Report the tour on stderr and record it in the tour file.
        # file.write works on both Python 2 and 3, unlike `print >> fw`.
        header = ">{0} ({1})".format(s.object, tag)
        tour = " ".join("".join(x) for x in s.tour)
        for fw in (sys.stderr, fwtour):
            fw.write(header + "\n")
            fw.write(tour + "\n")
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
        format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
        format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes live in comments.
    #
    # Files written: `graph.txt` (graph dump) and `<blastfile>.agp`.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
            help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
            help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
            help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # --clique links every pair of hits on a query; the default links
        # consecutive hits only.
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            # Hits further apart than --maxdist do not produce an edge.
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    logging.debug(str(g))

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # `with` closes the AGP file even if order_to_agp raises.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            # A truthy strand flag maps to "+", anything else to "-".
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            scaffold_name = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue
            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))