def optimize_ordering(fwtour, clm, phase, cpus):
    """
    Optimize the ordering of contigs by Genetic Algorithm (GA).

    The GA is seeded with `clm.tour` and scored by `score_evaluate_M`, which
    receives `clm.active_sizes` and `clm.M`. Whenever the GA callback fires
    with a generation number divisible by 20, the current tour is written to
    `fwtour` through `print_tour`. The tour returned by `GA_run` is stored
    back on `clm.tour` and also returned.
    """
    from .chic import score_evaluate_M

    # Pull everything needed off the CLM object up front
    contigs = clm.active_contigs
    sizes = clm.active_sizes
    contact_matrix = clm.M
    seed_tour = clm.tour
    signs = clm.signs
    oo = clm.oo

    def report(tour, gen, phase, oo):
        # Label is GA<phase>-<gen>, optionally followed by the text of the
        # fitness up to its first comma (with any "(" removed)
        label = "GA{}-{}".format(phase, gen)
        fitness = tour.fitness if hasattr(tour, "fitness") else None
        if fitness:
            leading = "{0}".format(fitness).split(",")[0]
            label = label + "-" + leading.replace("(", "")
        if gen % 20 != 0:
            return tour
        print_tour(fwtour, tour, label, contigs, oo, signs=signs)
        return tour

    toolbox = GA_setup(seed_tour)
    toolbox.register(
        "evaluate", score_evaluate_M, tour_sizes=sizes, tour_M=contact_matrix
    )
    # The fitness returned alongside the best tour is not needed here
    best, _ = GA_run(
        toolbox,
        ngen=1000,
        npop=100,
        cpus=cpus,
        callback=partial(report, phase=phase, oo=oo),
    )
    clm.tour = best
    return best
def __init__(self, lgs, scaffolds, mapc, pivot, weights, sizes,
             function=(lambda x: x.rank), linkage=min,
             ngen=500, npop=100, cpus=8):
    """
    Order and orient `scaffolds`, storing the result on `self.tour`.

    An initial orientation and order come from `assign_orientation()` and
    `assign_order()`. The order is then refined by repeated GA rounds, each
    followed by `fix_orientation()`, until a round fails to improve on the
    best fitness seen so far.

    Args:
        lgs: linkage group names; each is split as "<mapname>-<lg>".
        scaffolds: scaffolds to place; GA tours index into this sequence.
        mapc: object providing the `lengths` and `bins` attributes.
        pivot: map name; the first entry of `lgs` whose map name equals it
            determines `self.object` ("chr<lg>").
        weights: passed through to `prepare_ec()`.
        sizes: stored on the instance.
        function, linkage: stored on the instance.
        ngen, npop, cpus: forwarded to `GA_run()`.

    Sets:
        self.tour: list of (scaffold, orientation) with orientation one of
            '+', '-', '?'.
        self.object: only set when some entry in `lgs` matches `pivot`.
    """
    self.lgs = lgs
    self.lengths = mapc.lengths
    self.bins = mapc.bins
    self.sizes = sizes
    self.scaffolds = scaffolds
    self.pivot = pivot
    self.weights = weights
    self.function = function
    self.linkage = linkage

    self.prepare_linkage_groups()  # populate all data

    signs = self.assign_orientation()
    assert len(signs) == len(scaffolds)
    scaffolds_oo = dict(zip(scaffolds, signs))

    tour = self.assign_order()
    tour = [(x, scaffolds_oo[x]) for x in tour]

    i = 0
    best_tour, best_fitness = None, None
    while True:  # Multiple EC rounds due to orientation fixes
        logging.debug("Start EC round {0}".format(i))
        scaffolds_oo = dict(tour)
        scfs, tour, ww = self.prepare_ec(scaffolds, tour, weights)
        toolbox = GA_setup(tour)
        toolbox.register("evaluate", colinear_evaluate_multi,
                         scfs=scfs, weights=ww)
        tour, fitness = GA_run(toolbox, ngen=ngen, npop=npop, cpus=cpus)
        # The GA tour holds indices into `scaffolds`; map them back and
        # re-attach the orientation each scaffold had going into this round
        tour = [scaffolds[x] for x in tour]
        tour = [(x, scaffolds_oo[x]) for x in tour]
        # Compare against None explicitly: a truthiness test would treat a
        # falsy fitness as "no previous round" and never hit this exit
        if best_fitness is not None and fitness <= best_fitness:
            logging.debug("No fitness improvement: {0}. Exit EC.".format(
                best_fitness))
            break
        best_tour, best_fitness = tour, fitness
        logging.debug("Current best fitness: {0}".format(best_fitness))
        tour = self.fix_orientation(tour)
        i += 1

    # Report the best round, not the last (non-improving) one
    tour = best_tour
    recode = {0: '?', 1: '+', -1: '-'}
    tour = [(x, recode[o]) for x, o in tour]
    self.tour = tour

    for mlg in self.lgs:
        mapname, lg = mlg.rsplit("-", 1)
        if mapname == pivot:
            self.object = "chr{0}".format(lg)
            break
def layout(args):
    """
    %prog layout query.subject.simple query.seqids subject.seqids

    Compute optimal seqids order in a second genome, based on seqids on one
    genome, given the pairwise blocks in .simple format.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is runtime text rather than free-form documentation.
    from jcvi.algorithms.ec import GA_setup, GA_run

    p = OptionParser(layout.__doc__)
    p.set_beds()
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    simplefile, qseqids, sseqids = args
    qbed, sbed, qorder, sorder, is_self = check_beds(simplefile, p, opts)

    # Both seqid arguments are comma-separated lists; index each by position
    qseqids = qseqids.strip().split(",")
    sseqids = sseqids.strip().split(",")
    qseqids_ii = dict((s, i) for i, s in enumerate(qseqids))
    sseqids_ii = dict((s, i) for i, s in enumerate(sseqids))

    # Sum block scores per (subject seqid index, query seqid) pair
    blocks = SimpleFile(simplefile).blocks
    scores = defaultdict(int)
    for a, _, c, _, score, _, _ in blocks:
        qseqid = qorder[a][1].seqid
        sseqid = sorder[c][1].seqid
        # Dict lookup instead of scanning the list for every block
        if sseqid not in sseqids_ii:
            continue
        scores[sseqids_ii[sseqid], qseqid] += score

    # Keep only pairs whose query seqid was requested, in sorted key order
    data = [
        (qseqids_ii[qseqid], score)
        for (_, qseqid), score in sorted(scores.items())
        if qseqid in qseqids_ii
    ]

    # Materialize the list so the initial tour is a real list on Python 3 too
    tour = list(range(len(qseqids)))
    toolbox = GA_setup(tour)
    toolbox.register("evaluate", colinear_evaluate_weights, data=data)
    tour, fitness = GA_run(toolbox, ngen=100, npop=100, cpus=opts.cpus)
    tour = [qseqids[x] for x in tour]

    # Single-argument print() behaves the same on Python 2 and 3
    print(",".join(tour))
def test_ec(input_array, expected):
    """
    Run the GA seeded with `input_array` against generated scaffold data and
    check that the resulting tour equals `expected` with a fitness of
    FitnessMax((200.0,)).
    """
    from jcvi.algorithms.ec import (
        GA_run,
        GA_setup,
        colinear_evaluate,
        creator,
        make_data,
    )

    n_points, n_scaffolds = 200, 20
    generated = make_data(n_points, n_scaffolds)

    toolbox = GA_setup(input_array)
    toolbox.register("evaluate", colinear_evaluate, scaffolds=generated)

    best, best_fitness = GA_run(toolbox, cpus=8)
    # Attach the returned fitness to the tour, as the original tuple
    # assignment did
    best.fitness = best_fitness
    print(best, best.fitness)

    assert list(best) == expected
    assert best.fitness == creator.FitnessMax((200.0,))
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is runtime text rather than free-form documentation.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []

    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    # Context manager so the handle is closed even if a row fails to parse
    with open(glm) as fp:
        for row in fp:
            if row.startswith("#"):
                continue
            x, y, z = row.split()
            if x == "X":  # presumably the column-header row -- confirm
                continue
            M[int(x), int(y)] = int(z)

    # Context manager so the tour file is closed even if the GA run raises
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Label is GA-<gen>, optionally followed by the text of the
            # fitness up to its first comma (with any "(" removed)
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            oo.extend(contig_ids[x.contig_name] for x in co)
            pf = op.basename(ofile).split(".")[0]
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M

            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate", score_evaluate_M,
                             tour_sizes=tour_sizes, tour_M=tour_M)
            tour, fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                   callback=callbacki)
            tour.fitness = fitness
            # Formatted explicitly so the output matches the Python 2
            # `print a, b` form on both Python 2 and 3
            print("{0} {1}".format(tour, tour.fitness))
            # NOTE: only the first ordering file is processed
            break