def search_count(db, query):
    """Call TogoWS search count to see how many matches a search gives.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    Returns the number of matches as an integer.

    Raises ValueError if TogoWS returns no data, or returns something
    which cannot be parsed as an integer.

    A warning (not an error) is issued if the database is not in the list
    of search databases reported by TogoWS.

    You could then use the count to download a large set of search results
    in batches using the offset and limit options to Bio.TogoWS.search().
    In general however the Bio.TogoWS.search_iter() function is simpler
    to use.
    """
    global _search_db_names
    # The list of searchable databases is fetched once and then reused.
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query))
    handle = _open(url)
    try:
        data = handle.read()
    finally:
        # Always release the connection, even if the read fails part way.
        handle.close()
    if not data:
        raise ValueError("TogoWS returned no data from URL %s" % url)
    try:
        count = int(data.strip())
    except ValueError:
        # Re-raise with the URL and raw payload to make debugging possible.
        raise ValueError("Expected an integer from URL %s, got: %r" % (url, data))
    return count
def search_count(db, query):
    """Call TogoWS search count to see how many matches a search gives.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    Returns the number of matches as an integer.

    Raises ValueError if TogoWS returns no data, or returns something
    which cannot be parsed as an integer.

    A warning (not an error) is issued if the database is not in the list
    of search databases reported by TogoWS.

    You could then use the count to download a large set of search results
    in batches using the offset and limit options to Bio.TogoWS.search().
    In general however the Bio.TogoWS.search_iter() function is simpler
    to use.
    """
    global _search_db_names
    # The list of searchable databases is fetched once and then reused.
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query))
    handle = _open(url)
    try:
        data = handle.read()
    finally:
        # Always release the connection, even if the read fails part way.
        handle.close()
    # int() on an empty or non-numeric reply would raise a ValueError with
    # no hint of which request failed, so report the URL explicitly.
    if not data:
        raise ValueError("TogoWS returned no data from URL %s" % url)
    try:
        count = int(data.strip())
    except ValueError:
        raise ValueError("Expected an integer from URL %s, got: %r" % (url, data))
    return count
def search_count(db, query):
    """Call TogoWS search count to see how many matches a search gives.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    Returns the number of matches as an integer.

    Raises ValueError if TogoWS returns no data, or returns something
    which cannot be parsed as an integer.

    A warning (not an error) is issued if the database is not in the list
    of search databases reported by TogoWS.

    You could then use the count to download a large set of search results
    in batches using the offset and limit options to Bio.TogoWS.search().
    In general however the Bio.TogoWS.search_iter() function is simpler
    to use.
    """
    global _search_db_names
    # The list of searchable databases is fetched once and then reused.
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn(
            "TogoWS search does not officially support database '%s'. "
            "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query))
    handle = _open(url)
    try:
        data = handle.read()
    finally:
        # Always release the connection, even if the read fails part way.
        handle.close()
    # int() on an empty or non-numeric reply would raise a ValueError with
    # no hint of which request failed, so report the URL explicitly.
    if not data:
        raise ValueError("TogoWS returned no data from URL %s" % url)
    try:
        count = int(data.strip())
    except ValueError:
        raise ValueError("Expected an integer from URL %s, got: %r" % (url, data))
    return count
def entry(db, id, format=None, field=None):
    """Call TogoWS 'entry' to fetch a record.

    Arguments:
     - db - database (string), see list below.
     - id - identifier (string) or a list of identifiers (either as a list
       or tuple of strings, or a single string with comma separators).
     - format - return data file format (string), options depend on the
       database e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
     - field - specific field from within the database record (string)
       e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following::

        KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
              module, pathway
        DDBJ: ddbj, dad, pdb
        NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, onim,
              homologue, snp, mesh, pubmed
        EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    Returns the handle obtained by opening the constructed entry URL.

    Raises ValueError if the database is not in the list reported by
    TogoWS, or if the requested field or format is not listed for that
    database.

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.
    """
    # Only _entry_db_names is rebound here; the two dictionaries are merely
    # updated in place, but are listed to show which module caches are used.
    global _entry_db_names, _entry_db_fields, _entry_db_formats
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        # Per-database field lists are looked up once and then cached.
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if db == "pubmed" and field == "ti" and "title" in fields:
            # Backwards compatibility fix for TogoWS change Nov/Dec 2013
            field = "title"
            import warnings
            warnings.warn("TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead.")
        if field not in fields:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "field '%s' for database '%s'. Only: %s"
                             % (field, db, ", ".join(sorted(fields))))
    if format:
        # Per-database format lists are looked up once and then cached.
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'. Only: %s"
                             % (format, db, ", ".join(sorted(formats))))

    # Multiple identifiers are sent as a single comma separated string.
    if isinstance(id, (list, tuple)):
        id = ",".join(id)
    url = _BASE_URL + "/entry/%s/%s" % (db, _quote(id))
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
def search(db, query, offset=None, limit=None, format=None):
    """Call TogoWS search.

    This is a low level wrapper for the TogoWS search function, which
    can return results in several formats. In general, the search_iter
    function is more suitable for end users.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search/
     - query - search term (string)
     - offset, limit - optional integers specifying which result to start
       from (1 based) and the number of results to return. Either both
       or neither must be given.
     - format - return data file format (string), e.g. "json", "ttl" (RDF)
       By default plain text is returned, one result per line.

    Returns the handle obtained by opening the constructed search URL.

    Raises ValueError if only one of offset and limit is given, or if
    either cannot be converted to an integer of at least one.

    A warning (not an error) is issued if the database is not in the list
    of search databases reported by TogoWS.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).
    """
    global _search_db_names
    # The list of searchable databases is fetched once and then reused.
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s" % (db, _quote(query))
    if offset is not None and limit is not None:
        try:
            offset = int(offset)
        except ValueError:
            raise ValueError("Offset should be an integer (at least one), not %r" % offset)
        try:
            limit = int(limit)
        except ValueError:
            raise ValueError("Limit should be an integer (at least one), not %r" % limit)
        if offset <= 0:
            raise ValueError("Offset should be at least one, not %i" % offset)
        if limit <= 0:
            # Name the argument the caller actually passed (limit).
            raise ValueError("Limit should be at least one, not %i" % limit)
        url += "/%i,%i" % (offset, limit)
    elif offset is not None or limit is not None:
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
    if format:
        url += "." + format
    return _open(url)