Example #1
0
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text, so
    # it is part of the program's output and is kept verbatim.
    #
    # `args` is the list of command-line tokens for this action. The function
    # writes `<prefix>.chr.agp` and `<prefix>.tour` (prefix = input BED name
    # without its last extension), then calls build(), summary() (with
    # `<prefix>.summary.txt` as --outfile) and, unless --noplot, plotall().
    oargs = args  # untouched copy of the arguments, echoed in the AGP header
    p = OptionParser(path.__doc__)
    p.add_option("-w",
                 "--weightsfile",
                 default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance",
                 default="rank",
                 choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage",
                 default="double",
                 choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize",
                 default=100,
                 type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen",
                 default=500,
                 type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop",
                 default=100,
                 type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links",
                 default=10,
                 type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot",
                 default=False,
                 action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    pf = inputbed.rsplit(".", 1)[0]
    # The BED file actually read is always `<prefix>.bed`, whatever extension
    # the first positional argument carried.
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        # Interpreters older than 2.7 are restricted to a single CPU.
        pyversion = sys.version.splitlines()[0].strip()
        logging.debug("Python version: {0}. CPUs set to 1.".format(pyversion))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Translate the option name into the callable handed to ScaffoldOO.
    linkage = {
        "single": min,
        "double": double_linkage,
        "complete": max,
        "average": np.mean,
        "median": np.median
    }[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for sid in allseqids:
            Scaffold(sid, cc).add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            best_neighbor, _ = best_no_ambiguous(dict(neighbors), n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for sid in allseqids:
        scaffold = Scaffold(sid, cc)
        seqid = scaffold.seqid
        # Marker counts per consensus cluster, scaled by the map's weight.
        counts = {}
        for mlg, count in scaffold.mlg_counts.items():
            consensus = C[mlg]
            mw = weights[mlg.split("-")[0]]
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, _ = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    # try/finally (rather than `with`) keeps both handles from leaking when a
    # later step raises, without assuming anything about what must_open returns
    # beyond the .close() the code already relied on.
    try:
        fwtour = must_open(tourfile, "w")
        try:
            solutions = []
            for lgs, scaffolds in sorted(partitions.items()):
                if oseqid and oseqid not in lgs:
                    continue
                tag = "|".join(lgs)
                lgs_maps = set(x.split("-")[0] for x in lgs)
                # Clusters with no LG from the pivot map are not processed.
                if pivot not in lgs_maps:
                    logging.debug("Skipping {0} ...".format(tag))
                    continue
                logging.debug("Working on {0} ...".format(tag))
                s = ScaffoldOO(lgs,
                               scaffolds,
                               cc,
                               pivot,
                               weights,
                               sizes,
                               function=function,
                               linkage=linkage,
                               ngen=ngen,
                               npop=npop,
                               cpus=cpus)

                # Echo each tour to stderr and record it in the tour file.
                # fw.write() emits the same text as the former `print >> fw`
                # statement but also works on Python 3.
                for fw in (sys.stderr, fwtour):
                    fw.write(">{0} ({1})\n".format(s.object, tag))
                    fw.write(" ".join("".join(x) for x in s.tour) + "\n")
                solutions.append(s)
        finally:
            fwtour.close()

        # meta-data about the run parameters
        command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
            " ".join(oargs))
        comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
            version, get_today(), command)
        AGP.print_header(fwagp, comment=comment)

        for s in sorted(solutions, key=lambda x: x.object):
            order_to_agp(s.object,
                         s.tour,
                         sizes,
                         fwagp,
                         gapsize=gapsize,
                         gaptype="map")
    finally:
        fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
Example #2
0
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is runtime output and must stay byte-for-byte as written.
    #
    # `args` holds the command-line tokens for this action. Outputs are
    # `<prefix>.chr.agp` and `<prefix>.tour`, where prefix is the input BED
    # name minus its last extension; build(), summary() and (unless --noplot)
    # plotall() are run afterwards.
    oargs = args  # original arguments, reported in the AGP header comment
    p = OptionParser(path.__doc__)
    # -b/-s are hidden from --help; when given they override the positional
    # file names, but two positional arguments are still required below.
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile
    pf = inputbed.rsplit(".", 1)[0]
    # Whatever extension was supplied, the file read is `<prefix>.bed`.
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        # Pre-2.7 interpreters are limited to one CPU.
        pyversion = sys.version.splitlines()[0].strip()
        logging.debug("Python version: {0}. CPUs set to 1.".format(pyversion))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    # Swap the option string for the callable passed on to ScaffoldOO.
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for sid in allseqids:
            Scaffold(sid, cc).add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            best_neighbor, _ = best_no_ambiguous(dict(neighbors), n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for sid in allseqids:
        scaffold = Scaffold(sid, cc)
        seqid = scaffold.seqid
        # Weighted marker totals, keyed by consensus cluster.
        counts = {}
        for mlg, count in scaffold.mlg_counts.items():
            consensus = C[mlg]
            mw = weights[mlg.split("-")[0]]
            counts[consensus] = counts.get(consensus, 0) + count * mw
        best_consensus, _ = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    # Nested try/finally guarantees both output handles are closed even if a
    # partition fails part-way; only .close() is assumed of must_open's result.
    try:
        fwtour = must_open(tourfile, "w")
        try:
            solutions = []
            for lgs, scaffolds in sorted(partitions.items()):
                if oseqid and oseqid not in lgs:
                    continue
                tag = "|".join(lgs)
                lgs_maps = set(x.split("-")[0] for x in lgs)
                # Skip clusters that contain no LG from the pivot map.
                if pivot not in lgs_maps:
                    logging.debug("Skipping {0} ...".format(tag))
                    continue
                logging.debug("Working on {0} ...".format(tag))
                s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                               function=function, linkage=linkage,
                               ngen=ngen, npop=npop, cpus=cpus)

                # Each tour goes to stderr and to the tour file. fw.write()
                # yields the same bytes as the old `print >> fw` statement and
                # is also valid on Python 3.
                for fw in (sys.stderr, fwtour):
                    fw.write(">{0} ({1})\n".format(s.object, tag))
                    fw.write(" ".join("".join(x) for x in s.tour) + "\n")
                solutions.append(s)
        finally:
            fwtour.close()

        # meta-data about the run parameters
        command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".format(
            " ".join(oargs))
        comment = "Generated by ALLMAPS v{0} ({1})\n{2}".format(
            version, get_today(), command)
        AGP.print_header(fwagp, comment=comment)

        for s in sorted(solutions, key=lambda x: x.object):
            order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                         gaptype="map")
    finally:
        fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])