def bisect(args):
    """
    %prog bisect acc accession.fasta

    determine the version of the accession by querying entrez, based on a fasta file.
    This proceeds by a sequential search from xxxx.1 to the latest record.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text, so
    # it stays user-facing; maintainer notes are kept in comments instead.
    #
    # args: command-line tokens; exactly two positionals are required
    #       (the unversioned accession and the FASTA file to compare against).
    # Returns None. A match is reported through printf(); if no version
    # matches, or Entrez has no record for a version, nothing is printed.
    p = OptionParser(bisect.__doc__)
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    acc, fastafile = args
    arec = get_first_rec(fastafile)

    valid = None
    # Try acc.1, acc.2, ... acc.99 in order and stop at the first match.
    for version in range(1, 100):
        term = "%s.%d" % (acc, version)
        try:
            query = list(batch_entrez([term], email=opts.email))
        except AssertionError:
            logging.debug("no records found for %s. terminating.", term)
            return

        # An empty result would otherwise surface as a bare IndexError below.
        if not query:
            logging.debug("no records found for %s. terminating.", term)
            return

        # Only the last two fields (term, handle) are needed. Slicing from the
        # end works whether batch_entrez yields (id, term, handle) or
        # (id, size, term, handle), the latter being what entrez() unpacks.
        term, handle = query[0][-2:]
        brec = next(SeqIO.parse(handle, "fasta"))

        match = print_first_difference(
            arec, brec, ignore_case=True, ignore_N=True, rc=True
        )
        if match:
            valid = term
            break

    if valid:
        printf()
        printf("[green]{} matches the sequence in `{}`".format(valid, fastafile))
def bisect(args):
    """
    %prog bisect acc accession.fasta

    determine the version of the accession by querying entrez, based on a fasta file.
    This proceeds by a sequential search from xxxx.1 to the latest record.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text, so
    # it stays user-facing; maintainer notes are kept in comments instead.
    #
    # args: command-line tokens; exactly two positionals are required
    #       (the unversioned accession and the FASTA file to compare against).
    # Returns None. A match is reported on stdout; if no version matches, or
    # Entrez has no record for a version, nothing is printed.
    p = OptionParser(bisect.__doc__)
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    acc, fastafile = args
    arec = get_first_rec(fastafile)

    valid = None
    # Try acc.1, acc.2, ... acc.99 in order and stop at the first match.
    for version in range(1, 100):
        term = "%s.%d" % (acc, version)
        try:
            query = list(batch_entrez([term], email=opts.email))
        except AssertionError:
            logging.debug("no records found for %s. terminating.", term)
            return

        # An empty result would otherwise surface as a bare IndexError below.
        if not query:
            logging.debug("no records found for %s. terminating.", term)
            return

        # Only the last two fields (term, handle) are needed. Slicing from the
        # end works whether batch_entrez yields (id, term, handle) or
        # (id, size, term, handle), the latter being what entrez() unpacks.
        term, handle = query[0][-2:]
        # next() replaces the Python 2-only iterator method `.next()`.
        brec = next(SeqIO.parse(handle, "fasta"))

        match = print_first_difference(
            arec, brec, ignore_case=True, ignore_N=True, rc=True
        )
        if match:
            valid = term
            break

    if valid:
        # print() calls replace the Python 2 print statements.
        print()
        print(green("%s matches the sequence in `%s`" % (valid, fastafile)))
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text, so
    # it stays user-facing; maintainer notes are kept in comments instead.
    #
    # args: command-line tokens; exactly one positional is required, either a
    #       path to a file of search terms (one per line) or a single term.
    # Returns the name of the last output file targeted, or None when the
    # single output file could not be opened (must_open returned None).
    # Raises AssertionError for a format/database mismatch or batchsize < 1.
    p = OptionParser(entrez.__doc__)

    # Databases that may be queried for each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Mirrors the usage-error idiom used elsewhere in this file.
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        # Close the term file promptly and drop blank lines, which would
        # otherwise be submitted as empty search terms.
        with open(filename) as fp:
            list_of_terms = [row.strip() for row in fp if row.strip()]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    # A free-text query makes a poor file name; fall back to --outprefix.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # Single-file mode: one handle shared by every record.
    # If noprompt, will not check file existence
    fw = None
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
        ):
            if outdir:
                # Per-term mode: release the previous term's file first.
                if fw is not None:
                    fw.close()
                outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(
                    outfile, "w", checkexists=True, skipcheck=opts.skipcheck
                )
                if fw is None:
                    continue

            rec = handle.read()
            if rec_id in seen:
                # Report the duplicated key, not the whole downloaded record.
                logging.error("Duplicate key ({0}) found".format(rec_id))
                continue

            totalsize += size
            print(rec, file=fw)
            print(file=fw)

            seen.add(rec_id)
    finally:
        # Flush and close whichever output file is still open so callers can
        # read the returned path immediately.
        if fw is not None:
            fw.close()

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text, so
    # it stays user-facing; maintainer notes are kept in comments instead.
    #
    # args: command-line tokens; exactly one positional is required, either a
    #       path to a file of search terms (one per line) or a single term.
    # Returns the name of the last output file targeted, or None when the
    # single output file could not be opened (must_open returned None).
    # Raises AssertionError for a format/database mismatch or batchsize < 1.
    p = OptionParser(entrez.__doc__)

    # Databases that may be queried for each download format.
    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format [default: %default]",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database [default: %default]",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return [default: %default]",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]",
    )
    p.add_option(
        "--outdir",
        default=None,
        help="output directory, with accession number as filename",
    )
    p.add_option(
        "--outprefix",
        default="out",
        help="output file name prefix [default: %default]",
    )
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Mirrors the usage-error idiom used elsewhere in this file.
        sys.exit(not p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        # Close the term file promptly and drop blank lines, which would
        # otherwise be submitted as empty search terms.
        with open(filename) as fp:
            list_of_terms = [row.strip() for row in fp if row.strip()]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    # A free-text query makes a poor file name; fall back to --outprefix.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # Single-file mode: one handle shared by every record.
    # If noprompt, will not check file existence
    fw = None
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
        ):
            if outdir:
                # Per-term mode: release the previous term's file first.
                if fw is not None:
                    fw.close()
                outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(
                    outfile, "w", checkexists=True, skipcheck=opts.skipcheck
                )
                if fw is None:
                    continue

            rec = handle.read()
            if rec_id in seen:
                # Report the duplicated key, not the whole downloaded record.
                logging.error("Duplicate key ({0}) found".format(rec_id))
                continue

            totalsize += size
            # print(..., file=...) replaces the Python 2 `print >> fw` form.
            print(rec, file=fw)
            print(file=fw)

            seen.add(rec_id)
    finally:
        # Flush and close whichever output file is still open so callers can
        # read the returned path immediately.
        if fw is not None:
            fw.close()

    if seen:
        print(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
            file=sys.stderr,
        )

    return outfile