def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it must stay user-facing. Developer
    # notes live in comments instead.
    #
    # args -- list of command-line tokens: options plus exactly two
    #         positionals (input bed file, scaffolds FASTA). When the
    #         positional count is wrong, help is printed and sys.exit() is
    #         called.
    #
    # Outputs, all named after the prefix `pf` of the input bed file:
    #   pf.chr.agp      -- written through AGP.print_header / order_to_agp
    #   pf.tour         -- a header line and a tour line per processed partition
    #   pf.summary.txt  -- passed as --outfile to summary()
    # build() and (unless --noplot) plotall() are invoked at the end.
    oargs = args  # kept to echo the original invocation in the AGP header
    p = OptionParser(path.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        # Interpreters older than 2.7 are restricted to a single CPU.
        logging.debug("Python version: {0}. CPUs set to 1.".format(
            sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Translate the option name into the callable that implements it.
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median,
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions: every linkage group starts on its own
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for seqid in allseqids:
            Scaffold(seqid, cc).add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            best_neighbor, _ = best_no_ambiguous(dict(neighbors), n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for seqid in allseqids:
        scaffold = Scaffold(seqid, cc)
        seqid = scaffold.seqid
        counts = {}
        for mlg, count in scaffold.mlg_counts.items():
            consensus = C[mlg]
            mw = weights[mlg.split("-")[0]]
            # Each map votes with its marker count scaled by the map weight.
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, _ = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    # Both output handles are opened before the ordering loop. The
    # try/finally blocks guarantee they are closed even when ordering or AGP
    # writing raises, so handles are not leaked and buffers get flushed.
    fwagp = must_open(agpfile, "w")
    try:
        fwtour = must_open(tourfile, "w")
        solutions = []
        try:
            for lgs, scaffolds in sorted(partitions.items()):
                if oseqid and oseqid not in lgs:
                    continue
                tag = "|".join(lgs)
                lgs_maps = set(x.split("-")[0] for x in lgs)
                if pivot not in lgs_maps:
                    logging.debug("Skipping {0} ...".format(tag))
                    continue
                logging.debug("Working on {0} ...".format(tag))
                s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                               function=function, linkage=linkage,
                               ngen=ngen, npop=npop, cpus=cpus)

                # write() emits the same bytes as the old `print >> fw`
                # statement and behaves the same on Python 2 and 3.
                header = ">{0} ({1})".format(s.object, tag)
                tour = " ".join("".join(x) for x in s.tour)
                for fw in (sys.stderr, fwtour):
                    fw.write(header + "\n")
                    fw.write(tour + "\n")
                solutions.append(s)
        finally:
            fwtour.close()

        # meta-data about the run parameters
        command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
            " ".join(oargs))
        comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
            version, get_today(), command)
        AGP.print_header(fwagp, comment=comment)

        for s in sorted(solutions, key=lambda x: x.object):
            order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                         gaptype="map")
    finally:
        fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser below), so it must stay user-facing. Developer
    # notes live in comments instead.
    #
    # args -- list of command-line tokens: options plus exactly two
    #         positionals (input bed file, scaffolds FASTA). When the
    #         positional count is wrong, help is printed and sys.exit() is
    #         called.
    #
    # Outputs, all named after the prefix `pf` of the effective bed file:
    #   pf.chr.agp      -- written through AGP.print_header / order_to_agp
    #   pf.tour         -- a header line and a tour line per processed partition
    #   pf.summary.txt  -- passed as --outfile to summary()
    # build() and (unless --noplot) plotall() are invoked at the end.
    oargs = args  # kept to echo the original invocation in the AGP header
    p = OptionParser(path.__doc__)
    # Hidden options (no help text shown): when given, they replace the
    # corresponding positional argument.
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    # The two positionals are required even when the hidden overrides are set.
    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile

    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        # Interpreters older than 2.7 are restricted to a single CPU.
        logging.debug("Python version: {0}. CPUs set to 1.".format(
            sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Translate the option name into the callable that implements it.
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median,
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions: every linkage group starts on its own
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for seqid in allseqids:
            Scaffold(seqid, cc).add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            best_neighbor, _ = best_no_ambiguous(dict(neighbors), n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for seqid in allseqids:
        scaffold = Scaffold(seqid, cc)
        seqid = scaffold.seqid
        counts = {}
        for mlg, count in scaffold.mlg_counts.items():
            consensus = C[mlg]
            mw = weights[mlg.split("-")[0]]
            # Each map votes with its marker count scaled by the map weight.
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, _ = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    # Both output handles are opened before the ordering loop. The
    # try/finally blocks guarantee they are closed even when ordering or AGP
    # writing raises, so handles are not leaked and buffers get flushed.
    fwagp = must_open(agpfile, "w")
    try:
        fwtour = must_open(tourfile, "w")
        solutions = []
        try:
            for lgs, scaffolds in sorted(partitions.items()):
                if oseqid and oseqid not in lgs:
                    continue
                tag = "|".join(lgs)
                lgs_maps = set(x.split("-")[0] for x in lgs)
                if pivot not in lgs_maps:
                    logging.debug("Skipping {0} ...".format(tag))
                    continue
                logging.debug("Working on {0} ...".format(tag))
                s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                               function=function, linkage=linkage,
                               ngen=ngen, npop=npop, cpus=cpus)

                # write() emits the same bytes as the old `print >> fw`
                # statement and behaves the same on Python 2 and 3.
                header = ">{0} ({1})".format(s.object, tag)
                tour = " ".join("".join(x) for x in s.tour)
                for fw in (sys.stderr, fwtour):
                    fw.write(header + "\n")
                    fw.write(tour + "\n")
                solutions.append(s)
        finally:
            fwtour.close()

        # meta-data about the run parameters
        command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
            " ".join(oargs))
        comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
            version, get_today(), command)
        AGP.print_header(fwagp, comment=comment)

        for s in sorted(solutions, key=lambda x: x.object):
            order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                         gaptype="map")
    finally:
        fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])