def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
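# Illustrative trace (not part of the original code) of how the run-ID regex above maps
# an accession onto the sra-instant directory layout, assuming the project's urljoin
# simply concatenates its arguments with "/":
#   term       = "SRR1001901"
#   m.group(1) = "SRR"  -> prefix        m.group(2) = "100"  -> subprefix = "SRR100"
#   download_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
#                  "SRR/SRR100/SRR1001901/SRR1001901.sra"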
def request(self, urn: str, params: dict):
    """
    Send a request to CoinMarketCap

    Parameters
    ----------
    urn : `str`
        the endpoint, e.g. "cryptocurrency/info"
    params : `dict`
        the parameters for the request

    Raises
    ------
    requests.exceptions.HTTPError
        If status code is not 200
    """
    url = Request("GET", urljoin(self._url, urn), params=params).prepare().url

    # NOTE: race condition, but it should be harmless
    if self._session.cache.has_url(url):
        response = self._request_cache(url)
    else:
        response = self._request_throttle(url)

    if response.status_code == 200:
        # decode only successful responses
        res = loads(response.text)
        res["cached"] = response.from_cache
        return res
    response.raise_for_status()
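# Hedged usage sketch; the wrapper class and constructor below are assumptions, not part
# of the snippet above. On a 200 response the method returns the decoded JSON body with
# an extra "cached" flag; any other status raises requests.exceptions.HTTPError.
#
#   client = CoinMarketCap(api_key="...")                           # hypothetical wrapper
#   info = client.request("cryptocurrency/info", {"symbol": "BTC"})
#   info["cached"]   # True when the response was served by the requests-cache session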
def get_features(refseq, start, end, strand, featuretype, completely_within=True, level=1):
    """
    Return the features within a given set of chromosome coordinates
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "features", refseq)
    data = tools.do_request(url, None, start=start, end=end, type=featuretype)

    elems_to_delete = []
    for x, elem0 in enumerate(data['features']):
        # remove feature if not completely_within specified chromosome coordinates
        if completely_within and (elem0['start'] < start or elem0['end'] > end):
            elems_to_delete.append(x)
            continue
        # remove all subfeatures below 0th-level object
        if level == 0:
            data['features'][x]['subfeatures'] = []
        # remove all subfeatures below 1st-level object
        elif level == 1:
            for y, elem1 in enumerate(elem0['subfeatures']):
                data['features'][x]['subfeatures'][y]['subfeatures'] = []

    for i in sorted(elems_to_delete, reverse=True):
        del data['features'][i]

    return data
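# Behaviour sketch for the pruning above (dummy numbers, not THALEMINE data): with
# start=90, end=300 and completely_within=True, a feature spanning 50..400 is dropped
# entirely; with level=0 every feature's 'subfeatures' list is emptied; with level=1
# (the default) direct children are kept but their own 'subfeatures' are emptied.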
def get_global_stats(featuretype):
    """
    Return global stats for features of specific type
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "stats", "global")
    global_stats = tools.do_request(url, None, type=featuretype)
    return global_stats
def get_region_feature_densities(refseq, start, end, featuretype):
    """
    Return binned density stats for features within a given set of chromosome coordinates
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "stats",
                  "regionFeatureDensities", refseq)
    region_feature_densities = tools.do_request(url, None, start=start, end=end,
                                                type=featuretype)
    return region_feature_densities
def download_species_phytozome9(species, valid_species, base_url, assembly=False):
    assert species in valid_species, "{} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(base_url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))

    res = {}
    if assembly:
        res["asm"] = download(asm_url)
    res["gff"] = download(ann_url)
    res["cds"] = download(cds_url)
    return res
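# Illustrative walk-through of the "magic number" lookup above (the release number used
# here is an assumption for the example, not taken from Phytozome):
#   readme file : "Athaliana_167_readme.txt"  ->  magic = "167"
#   pf          : "Athaliana_167"
#   asm_url     : <base_url>/Athaliana/assembly/Athaliana_167.fa.gz
#   ann_url     : <base_url>/Athaliana/annotation/Athaliana_167_gene.gff3.gz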
def do(self):
    "run it, get a new url"
    scheme, netloc, path, params, query, fragment = Split(self.url).do()
    if isinstance(self.query, dict):
        query = query + "&" + urllib.urlencode(self.query) if query \
            else urllib.urlencode(self.query)
    path = urljoin(path, self.path).replace('\\', '/') if self.path else path
    return Splice(scheme=scheme, netloc=netloc, path=path, params=params,
                  query=query, fragment=fragment).geturl
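# Hedged sketch of the rebuild above, assuming Split/Splice wrap urlparse/urlunparse:
# starting from url = "http://example.com/a/b?x=1" with self.query = {"y": "2"} and
# self.path = "c", the extra parameters are appended to the existing query ("x=1&y=2"),
# the relative path is joined onto the current one (backslashes normalised to forward
# slashes), and the pieces are spliced back into a single URL string.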
def download_species_phytozome(species, valid_species, url, assembly=False):
    from os.path import join as urljoin

    assert species in valid_species, \
        "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))

    if assembly:
        download(asm_url)
    for u in (ann_url, cds_url):
        download(u)
def download_srr_term(term):
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
def rule(services, settings):
    if not settings.ONE_DOMAIN_MODE:
        return
    API_URL = urljoin(settings.API_URL, "api")
    STORE_AVAILABLE = services.get("store")
    ADMIN_AVAILABLE = services.get("admin")
    BACKEND_AVAILABLE = services.get("backend")
    # replace defaults
    if STORE_AVAILABLE:
        with modify_key(services, "store", "environment") as environment:
            environment["BITCART_STORE_API_URL"] = API_URL
        if ADMIN_AVAILABLE:
            with modify_key(services, "admin", "environment") as environment:
                environment["BITCART_ADMIN_ROOTPATH"] = environment[
                    "BITCART_ADMIN_ROOTPATH"].replace("/", "/admin")
                environment["BITCART_ADMIN_API_URL"] = API_URL
    elif ADMIN_AVAILABLE:
        with modify_key(services, "admin", "environment") as environment:
            environment["BITCART_ADMIN_API_URL"] = API_URL
    if BACKEND_AVAILABLE and (ADMIN_AVAILABLE or STORE_AVAILABLE):
        with modify_key(services, "backend", "environment") as environment:
            environment["BITCART_BACKEND_ROOTPATH"] = environment[
                "BITCART_BACKEND_ROOTPATH"].replace("-}", "-/api}")
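# Illustrative effect in one-domain mode (the environment values below are assumptions
# based on the rewrites above, not taken from the actual compose files):
#   store   BITCART_STORE_API_URL    -> "<settings.API_URL>/api"
#   admin   BITCART_ADMIN_ROOTPATH   -> "/" rewritten to "/admin" (only when the store is present)
#   backend BITCART_BACKEND_ROOTPATH -> a default of the form "...:-}" becomes "...:-/api}"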
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename  # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
        list_of_terms,
        retmax=opts.retmax,
        rettype=fmt,
        db=database,
        batchsize=batchsize,
        email=opts.email,
    ):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
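# Illustrative command lines for the entry point above (the accession and file name are
# examples only, not taken from the original documentation):
#   %prog entrez U49845                              # fetch one record as FASTA
#   %prog entrez ids.txt --format=acc --batchsize=100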
def search(args):
    """
    args contains a dict with one or more key:values

    transcript is AGI identifier and is mandatory
    material is the tissue or treatment and is restricted below to a limited list
    """
    """
    In the future, ADAMA will check a query, map_*, or generic request against a list
    of mandatory parameters specified for each service. For now, if we want to enforce
    that behavior we need to implement it ourselves.
    ADAMA will have a graceful, cross-language exception handling scheme in a future
    release. At present, we are hand-coding a return
    """
    if not args.viewkeys() & {'material1', 'material2', 'foldchange'}:
        return

    """
    Check that foldchange is a valid number
    """
    foldchange = args['foldchange']
    try:
        n = float(foldchange)
    except (ValueError, TypeError):
        return

    """
    Check materials to make sure they're in the (hard-coded) approved list
    """
    valid_materials = {
        'flower': 'Flo', 'iaa': 'IAA', 'leaf': 'Lea',
        'root': 'Roo', 'salicylic': 'Sal', 'nacl': 'NaC',
        'young': 'You', 't87': 'T87'}

    material1 = args['material1'].lower()
    if material1 not in valid_materials.keys():
        return
    tissue1 = valid_materials[material1]

    material2 = args['material2'].lower()
    if material2 not in valid_materials.keys():
        return
    tissue2 = valid_materials[material2]

    """
    Build the url from the base + the intended endpoint action
    Also encode the params (payload) into a dict
    """
    url = urljoin(jcvi_common.base_url(), 'ExpressionConditionComparison')
    payload = {'tissue1': tissue1, 'tissue2': tissue2, 'change': foldchange}

    """
    Make the request to the remote service
    """
    r = requests.get(url, params=payload)

    """
    Iterate through the results
    Foreach record from the remote service, build the response json
    Print this json to stdout followed by a record separator "---"
    ADAMA takes care of serializing these results
    """
    p = re.compile('AT[1-5MC]G[0-9]{5,5}\.[0-9]+', re.IGNORECASE)
    for result in r.json()['compare_table']:
        # check that transcript uses a valid transcript identifier
        transcript = result['elem_target_id']
        if not p.search(transcript):
            continue
        record = {
            'transcript': transcript,
            'class': 'transcript_property',
            'source_text_description': 'RT-PCR',
            'expression_comparison_record': {
                'material1_text_description': result['elem_tissue1'],
                'expression_value_material1': result['elem_tissue1_value'],
                'expression_value_material1_stdev': result['elem_tissue1_value2'],
                'material2_text_description': result['elem_tissue2'],
                'expression_value_material2': result['elem_tissue2_value'],
                'expression_value_material2_stdev': result['elem_tissue2_value2']
            }
        }
        print json.dumps(record, indent=2)
        print '---'
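# Illustrative matches for the AGI transcript pattern used above (examples only):
#   "AT1G01010.1"  -> accepted
#   "ATMG00010.2"  -> accepted (mitochondrial identifier)
#   "AT1G01010"    -> rejected; the ".<version>" suffix is required by the regex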
def __init__(self, request, endpoint):
    self.request = lambda x: request(urljoin(endpoint, "price-conversion"), args(**x))
def __init__(self, request, endpoint):
    self.request = lambda x: request(urljoin(endpoint, "map"), x)
def __init__(self, request, endpoint):
    self.request = lambda x, y: request(urljoin(endpoint, "quotes", x), args(**y))
def __init__(self, request, endpoint):
    self.request = lambda x: request(
        urljoin(endpoint, "market-pairs/latest"), args(**x))
def __init__(self, request, endpoint):
    self.request = lambda x: request(urljoin(endpoint, "info"), args(**x))
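# Hedged wiring sketch: each small __init__ above binds an endpoint path onto a shared
# request callable such as the CoinMarketCap request method shown earlier. The class
# name and endpoint string below are assumptions for illustration only.
#
#   info = Info(client.request, "cryptocurrency")   # hypothetical endpoint class
#   info.request({"symbol": "BTC"})                 # -> GET <base>/cryptocurrency/info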
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {"fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
                         "asn.1": ["genome", "nuccore", "nucgss", "protein"],
                         "gb"   : ["genome", "nuccore", "nucgss"],
                         "est"  : ["nucest"],
                         "gss"  : ["nucgss"],
                         "acc"  : ["nuccore"],
                         }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion", dest="noversion",
                 default=False, action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
                 help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
                 help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
                 help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir", default=None,
                 help="output directory, with accession number as filename")
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename  # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, \
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms, retmax=opts.retmax, \
                                               rettype=fmt, db=database, batchsize=batchsize, \
                                               email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, \
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print >> fw, rec
        print >> fw

        seen.add(id)

    if seen:
        print >> sys.stderr, "A total of {0} {1} records downloaded.".\
            format(totalsize, fmt.upper())

    return outfile
def urljoin(url, path):
    # Join a base URL and a relative path with exactly one separating slash
    return "{0}/{1}".format(url.rstrip("/"), path.lstrip("/"))
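# Hedged usage sketch for the helper above (example URLs are illustrative):
#   urljoin("http://example.com/api/", "/v1/items")  -> "http://example.com/api/v1/items"
#   urljoin("ftp://host/pub", "gtf/arabidopsis")     -> "ftp://host/pub/gtf/arabidopsis"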