Exemple #1
0
def bisect(args):
    """
    %prog bisect acc accession.fasta

    determine the version of the accession by querying entrez, based on a fasta file.
    This proceeds by a sequential search from xxxx.1 to the latest record.
    """
    # NOTE: the docstring above doubles as the command-line usage text, so it
    # is kept in the "%prog ..." form expected by OptionParser.
    p = OptionParser(bisect.__doc__)
    p.set_email()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    acc, fastafile = args
    # Reference record that every candidate version is compared against.
    arec = get_first_rec(fastafile)

    valid = None
    # Probe acc.1, acc.2, ... in order; stop at the first version that matches
    # or as soon as a version cannot be retrieved.
    for version in range(1, 100):
        term = "%s.%d" % (acc, version)
        try:
            query = list(batch_entrez([term], email=opts.email))
        except AssertionError:
            query = []

        # An empty result is treated the same as a failed lookup, instead of
        # crashing with IndexError on query[0].
        if not query:
            logging.debug("no records found for %s. terminating." % term)
            return

        # batch_entrez is consumed as (id, size, term, handle) by entrez();
        # take the last two fields so the leading fields may vary in number.
        *_, term, handle = query[0]
        brec = next(SeqIO.parse(handle, "fasta"))

        match = print_first_difference(
            arec, brec, ignore_case=True, ignore_N=True, rc=True
        )
        if match:
            valid = term
            break

    if valid:
        printf()
        printf("[green]{} matches the sequence in `{}`".format(
            valid, fastafile))
Exemple #2
0
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(entrez.__doc__)

    # Which databases may be queried for each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error, like the other commands in this file.
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        # Close the term list as soon as it has been read.
        with open(filename) as fp:
            list_of_terms = [row.strip() for row in fp]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    # A free-text search term makes a poor file name; fall back to --outprefix.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    fw = None
    if outdir:
        # One output file per term is created inside the loop below.
        mkdir(outdir)
    else:
        # Single combined output file; must_open returns None when the user
        # declines to overwrite an existing file.
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for uid, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
        ):
            # Reject duplicates before touching any output file, so a repeat
            # never truncates or creates a file it will not write to.
            if uid in seen:
                logging.error("Duplicate key ({0}) found".format(uid))
                continue

            if outdir:
                # Filesystem path, so join with op.join rather than URL joining
                # (NOTE(review): assumes urljoin was urllib's, which drops a
                # base directory lacking a trailing slash -- confirm).
                outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(
                    outfile, "w", checkexists=True, skipcheck=opts.skipcheck
                )
                if fw is None:
                    continue

            rec = handle.read()
            print(rec, file=fw)
            print(file=fw)

            if outdir:
                # Per-term file is complete; release the handle right away.
                fw.close()
                fw = None

            totalsize += size
            seen.add(uid)
    finally:
        # Closes the combined file, or a per-term file left open by an error.
        if fw is not None:
            fw.close()

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
Exemple #3
0
    stats.register("min", min)

    eaSimpleConverge(pop,
                     toolbox,
                     0.7,
                     0.2,
                     ngen,
                     stats=stats,
                     halloffame=hof,
                     callback=callback)
    tour = hof[0]
    if cpus > 1:
        pool.terminate()
    return tour, tour.fitness


if __name__ == "__main__":
    # Sizes handed to make_data for the demo data set.
    POINTS, SCF = 200, 20
    scaffolds = make_data(POINTS, SCF)

    # Demo case: start from the identity order, then flip two overlapping
    # windows in turn so the starting guess is scrambled.
    guess = list(range(SCF))
    for lo, hi in ((5, 15), (7, 18)):
        guess[lo:hi] = guess[lo:hi][::-1]
    printf(guess)

    # Build the GA toolbox around the guess and score tours with
    # colinear_evaluate against the generated scaffolds.
    toolbox = GA_setup(guess)
    toolbox.register("evaluate", colinear_evaluate, scaffolds=scaffolds)

    tour, tour.fitness = GA_run(toolbox, cpus=8)
    printf(tour, tour.fitness)
Exemple #4
0
def eaSimpleConverge(
    population,
    toolbox,
    cxpb,
    mutpb,
    ngen,
    stats=None,
    halloffame=None,
    callback=None,
    verbose=True,
):
    """Run a simple evolutionary loop until the best score stops improving.

    This algorithm reproduces the simplest evolutionary algorithm as
    presented in chapter 7 of [Back2000]_.

    Modified to allow checking if there is no change for ngen, as a simple
    rule for convergence. Interface is similar to eaSimple(). However, in
    eaSimple, ngen is total number of iterations; in eaSimpleConverge, we
    terminate only when the best is NOT updated for ngen iterations.

    Args:
        population: mutable sequence of individuals; replaced in place with
            each new generation.
        toolbox: object providing ``map``, ``evaluate`` and ``select``; also
            passed through to ``varAnd``.
        cxpb: crossover probability, passed to ``varAnd``.
        mutpb: mutation probability, passed to ``varAnd``.
        ngen: number of consecutive generations without a new best score
            after which the loop stops.
        stats: optional object whose ``compile(population)`` returns a mapping;
            its ``"max"`` entry is used as the generation's best score. When
            absent (or lacking ``"max"``), the best score is the largest
            ``fitness.values`` in the population.
        halloffame: optional object updated with every generation.
        callback: optional callable invoked as ``callback(halloffame[0], gen)``
            each generation; only called when ``halloffame`` is provided.
        verbose: when true, report progress every 20 generations.

    Returns:
        The final population (the same object that was passed in).
    """

    def evaluate_invalid(individuals):
        # Score only the individuals whose fitness is not currently valid.
        pending = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(pending, toolbox.map(toolbox.evaluate, pending)):
            ind.fitness.values = fit

    evaluate_invalid(population)

    if halloffame is not None:
        halloffame.update(population)

    # Begin the generational process
    gen = 1
    best = (0,)
    # Generation at which `best` last improved. Starts at 0 so the convergence
    # test below is well defined even if no generation ever beats `best`.
    updated = 0
    while True:
        # Select the next generation individuals
        offspring = toolbox.select(population, len(population))

        # Vary the pool of individuals
        offspring = varAnd(offspring, toolbox, cxpb, mutpb)

        evaluate_invalid(offspring)

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)
            # The callback reports the current champion, which only exists
            # when a hall of fame is being maintained.
            if callback is not None:
                callback(halloffame[0], gen)

        # Replace the current population by the offspring
        population[:] = offspring

        # Best score of this generation: prefer the caller's statistics, fall
        # back to the raw fitness values so `stats=None` works.
        record = stats.compile(population) if stats else {}
        if "max" in record:
            current_best = record["max"]
        else:
            current_best = max(ind.fitness.values for ind in population)

        if gen % 20 == 0 and verbose:
            printf(
                "Current iteration {0}: max_score={1}".format(
                    gen, current_best), )

        if current_best > best:
            best = current_best
            updated = gen

        gen += 1
        # Converged: no improvement during the last `ngen` generations.
        if gen - updated > ngen:
            break

    return population
Exemple #5
0
def overlap(args):
    """
    %prog overlap best.contains iid

    Visualize overlaps for a given fragment. Must be run in 4-unitigger. All
    overlaps for iid were retrieved, excluding the ones matching best.contains.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(overlap.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    p.add_option("--canvas", default=100, type="int", help="Canvas size")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bestcontains, iid = args
    canvas = opts.canvas

    # Cache the set of ids listed in best.contains next to the input file so
    # it is only re-parsed when the input is newer than the cache.
    bestcontainscache = bestcontains + ".cache"
    if need_update(bestcontains, bestcontainscache):
        with open(bestcontains) as fp:
            exclude = {
                int(row.split()[0])
                for row in fp
                if row.strip() and row[0] != "#"
            }
        # NOTE(review): dump/load are presumably pickle's (a set is not JSON
        # serialisable), which requires binary file mode -- confirm.
        with open(bestcontainscache, "wb") as fw:
            dump(exclude, fw)

    with open(bestcontainscache, "rb") as fp:
        exclude = load(fp)
    logging.debug("A total of {0} reads to exclude".format(len(exclude)))

    cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid)
    cmd += " -E {0}".format(opts.maxerr)
    frags = []
    for row in popen(cmd):
        r = OverlapLine(row)
        if r.bid in exclude:
            continue
        frags.append(r)

    # Also include to query fragment
    frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid)))
    frags.sort(key=lambda x: x.ahang)

    # Determine size of the query fragment
    cmd = "gatekeeper -b {0} -e {0}".format(iid)
    cmd += " -tabular -dumpfragments ../asm.gkpStore"
    fp = popen(cmd)
    next(fp)  # discard the first output line; the size is read from the second
    size = int(next(fp).split()[-1])

    # Determine size of canvas
    xmin = min(x.ahang for x in frags)
    xmax = max(x.bhang for x in frags)
    xsize = -xmin + size + xmax
    # Coordinate units per canvas column.
    ratio = xsize / canvas

    for f in frags:
        fsize = -f.ahang + size + f.bhang
        # Column offsets must be integers to be usable as string repeat counts.
        a = int((f.ahang - xmin) / ratio)
        b = int(fsize / ratio)
        t = "-" * b
        # Arrow head marks the orientation of the fragment.
        if f.orientation == "N":
            t = t[:-1] + ">"
        else:
            t = "<" + t[1:]
        # Zero hangs on both sides identify the query fragment; highlight it.
        if f.ahang == 0 and f.bhang == 0:
            t = "[green]{}".format(t)
        c = canvas - a - b
        printf(
            "{}{}{}{} ({})".format(" " * a, t, " " * c,
                                   str(f.bid).rjust(10), f.erate_adj), )