def bisect(args):
    """
    %prog bisect acc accession.fasta

    determine the version of the accession by querying entrez, based on a fasta file.
    This proceeds by a sequential search from xxxx.1 to the latest record.
    """
    # NOTE: the docstring above is handed to OptionParser and therefore doubles
    # as the command-line usage text; maintainer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly two positionals are required
    #       (accession without version, FASTA file to compare against).
    # Returns None. Exits via sys.exit() on a usage error.
    p = OptionParser(bisect.__doc__)
    p.set_email()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    acc, fastafile = args
    arec = get_first_rec(fastafile)

    valid = None
    # Try acc.1, acc.2, ... acc.99 in order until one matches the local record.
    for version in range(1, 100):
        term = "%s.%d" % (acc, version)
        try:
            query = list(batch_entrez([term], email=opts.email))
        except AssertionError:
            # The original code treats an AssertionError from batch_entrez as
            # "this version does not exist", i.e. the end of the search.
            logging.debug("no records found for %s. terminating." % term)
            return

        if not query:
            # An empty result used to crash with IndexError on query[0].
            logging.debug("no records found for %s. terminating." % term)
            return

        # entrez() in this file unpacks batch_entrez() results as
        # (id, size, term, handle). Taking the last two fields works for that
        # shape as well as for a 3-field (id, term, handle) result, instead of
        # failing with "too many values to unpack".
        record = query[0]
        term, handle = record[-2], record[-1]
        brec = next(SeqIO.parse(handle, "fasta"))

        match = print_first_difference(
            arec, brec, ignore_case=True, ignore_N=True, rc=True
        )
        if match:
            valid = term
            break

    if valid:
        printf()
        printf("[green]{} matches the sequence in `{}`".format(valid, fastafile))
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is handed to OptionParser and therefore doubles
    # as the command-line usage text; maintainer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly one positional is required,
    #       either a path to a file of search terms (one per line) or a term.
    # Returns the path of the (last) output file written, or None when the
    # single output file was declined by must_open().
    p = OptionParser(entrez.__doc__)

    # Which databases may be queried for each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error, like the other commands in this file.
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        # Close the terms file deterministically instead of leaking the handle.
        with open(filename) as terms_file:
            list_of_terms = [row.strip() for row in terms_file]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    # A search phrase with spaces makes a poor file name; use the prefix option.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # Without --outdir everything goes into one file, opened once up front.
    # With --outdir one file per term is opened inside the loop.
    fw = None
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
        ):
            if outdir:
                # urljoin() resolves the name relative to the *parent* of
                # outdir unless outdir ends with "/", which put the files
                # outside the requested directory. Join as a filesystem path.
                outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(
                    outfile, "w", checkexists=True, skipcheck=opts.skipcheck
                )
                if fw is None:
                    continue

            try:
                rec = handle.read()
                if rec_id in seen:
                    # Report the duplicated key, not the whole record body.
                    logging.error("Duplicate key ({0}) found".format(rec_id))
                    continue

                totalsize += size
                print(rec, file=fw)
                print(file=fw)

                seen.add(rec_id)
            finally:
                # Per-term files are finished after one record; close them now.
                if outdir:
                    fw.close()
    finally:
        # The single shared output file is closed once the download ends.
        if not outdir and fw is not None:
            fw.close()

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
stats.register("min", min) eaSimpleConverge(pop, toolbox, 0.7, 0.2, ngen, stats=stats, halloffame=hof, callback=callback) tour = hof[0] if cpus > 1: pool.terminate() return tour, tour.fitness if __name__ == "__main__": POINTS, SCF = 200, 20 scaffolds = make_data(POINTS, SCF) # Demo case: scramble of the list guess = list(range(SCF)) guess[5:15] = guess[5:15][::-1] guess[7:18] = guess[7:18][::-1] printf(guess) toolbox = GA_setup(guess) toolbox.register("evaluate", colinear_evaluate, scaffolds=scaffolds) tour, tour.fitness = GA_run(toolbox, cpus=8) printf(tour, tour.fitness)
def eaSimpleConverge(
    population,
    toolbox,
    cxpb,
    mutpb,
    ngen,
    stats=None,
    halloffame=None,
    callback=None,
    verbose=True,
):
    """Run a simple evolutionary loop until the best score stops improving.

    This algorithm reproduces the simplest evolutionary algorithm as presented
    in chapter 7 of [Back2000]_, modified to use "no change for ngen
    generations" as a simple rule for convergence.

    The interface is similar to eaSimple(). However, in eaSimple, ngen is the
    total number of iterations; in eaSimpleConverge, we terminate only when
    the best is NOT updated for ngen iterations.

    Args:
        population: list of individuals; replaced in place each generation.
        toolbox: object providing ``map``, ``evaluate`` and ``select``; it is
            also passed through to ``varAnd``.
        cxpb: crossover probability, forwarded to ``varAnd``.
        mutpb: mutation probability, forwarded to ``varAnd``.
        ngen: number of generations without improvement before stopping.
        stats: optional statistics object whose ``compile(population)``
            returns a mapping; its ``"max"`` entry is used as the current
            best when present.
        halloffame: optional hall of fame, updated with every generation.
        callback: optional callable invoked as ``callback(halloffame[0], gen)``
            each generation; it therefore requires ``halloffame`` to be given.
        verbose: when true, print progress every 20 generations.

    Returns:
        The final population (the same list object that was passed in).
    """
    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    if halloffame is not None:
        halloffame.update(population)

    # Begin the generational process
    gen = 1
    best = (0,)
    # Generation at which `best` last improved. Initialised up front: it used
    # to be assigned only on the first improvement, so a first generation that
    # did not beat (0,) crashed with UnboundLocalError.
    updated = 0
    while True:
        # Select the next generation individuals
        offspring = toolbox.select(population, len(population))

        # Vary the pool of individuals
        offspring = varAnd(offspring, toolbox, cxpb, mutpb)

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        if callback is not None:
            callback(halloffame[0], gen)

        # Replace the current population by the offspring
        population[:] = offspring

        # Compile the current generation statistics
        record = stats.compile(population) if stats else {}
        if "max" in record:
            current_best = record["max"]
        else:
            # No statistics supplied (the default) or no "max" registered:
            # derive the best score directly from the population so the
            # convergence test still works instead of raising KeyError.
            # NOTE(review): assumes stats' "max" is the max over
            # ind.fitness.values, matching the tuple-valued `best` — confirm.
            current_best = max(ind.fitness.values for ind in population)

        if gen % 20 == 0 and verbose:
            printf(
                "Current iteration {0}: max_score={1}".format(gen, current_best),
            )

        if current_best > best:
            best = current_best
            updated = gen

        gen += 1
        # Converged: no improvement for more than ngen generations.
        if gen - updated > ngen:
            break

    return population
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # NOTE: the docstring above is handed to OptionParser and therefore doubles
    # as the command-line usage text; maintainer notes are kept in comments.
    #
    # args: list of command-line tokens; exactly two positionals are required
    #       (the best.contains file and the fragment iid).
    # Returns None; draws one text row per overlapping fragment via printf().
    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the set of ids listed in best.contains next to the file itself.
    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        with open(bestcontains) as fp:
            exclude = {
                int(row.split()[0])
                for row in fp
                if row.strip() and not row.startswith("#")
            }
        # NOTE(review): dump/load are presumably pickle's; those require the
        # file to be opened in binary mode on Python 3 — confirm the import.
        with open(bestcontainscache, "wb") as fw:
            dump(exclude, fw)

    # The cache is a locally generated file, not external input.
    with open(bestcontainscache, "rb") as fp:
        exclude = load(fp)
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)

    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    next(fp)  # discard the first output line; the size is read from the second
    # Was `fp.next()`, which does not exist on Python 3 iterators.
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    ratio = xsize / canvas  # bases per canvas column

    for f in frags:
        fsize = -f.ahang + size + f.bhang
        # Column counts must be integers: they are used as string repeat
        # counts below, and a float there raises TypeError.
        a = int((f.ahang - xmin) / ratio)
        b = int(fsize / ratio)
        t = "-" * b
        # Put an arrow head on the end matching the fragment orientation.
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        # Zero hangs on both sides: highlight this row (the query fragment
        # appended above is built with zeros in its numeric fields).
        if f.ahang == 0 and f.bhang == 0:
            t = "[green]{}".format(t)
        c = canvas - a - b
        printf(
            "{}{}{}{} ({})".format(
                " " * a, t, " " * c, str(f.bid).rjust(10), f.erate_adj
            ),
        )