def _open(url, post=None):
    """Build the URL and open a handle to it (PRIVATE).

    Opens a handle to TogoWS; an IOError will be raised if the server
    reports an error.

    In the absence of clear guidelines, this function enforces a limit
    of at most three queries per second to avoid abusing the TogoWS
    servers.
    """
    delay = 0.333333333  # one third of a second
    now = time.time()
    # Time still to wait before the next request is allowed.
    pause = _open.previous + delay - now
    if pause > 0:
        time.sleep(pause)
        _open.previous = now + pause
    else:
        _open.previous = now
    # print(url)
    handle = _urlopen(url, _as_bytes(post)) if post else _urlopen(url)
    # We now trust TogoWS to have set an HTTP error code; that suffices
    # for the current unit tests (previously the start of the returned
    # data was examined instead).
    return _binary_to_string_handle(handle)
def get_recent_changes(self):
    """Returns three lists of the newest weekly files (added,mod,obsolete).

    Reads the directories with changed entries from the PDB server and
    returns a tuple of three URL's to the files of new, modified and
    obsolete entries from the most recent list. The directory with the
    largest numerical name is used.
    Returns None if something goes wrong.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
    drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
    -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
    """
    url = self.pdb_server + '/pub/pdb/data/status/'
    with contextlib.closing(_urlopen(url)) as handle:
        # On Python 3, filter() returns an iterator which cannot be
        # indexed with [-1]; sort the digit-only directory names and
        # take the last, i.e. the largest YYYYMMDD (most recent).
        recent = sorted(filter(str.isdigit,
                               (x.split()[-1] for x in handle.readlines())))[-1]
    path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)
    # Retrieve the lists
    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
def get_sprot_raw(id):
    """Get a handle to a raw SwissProt entry at ExPASy.

    For an ID of XXX, fetches http://www.uniprot.org/uniprot/XXX.txt
    (as per the http://www.expasy.ch/expasy_urls.html documentation).
    """
    url = "http://www.uniprot.org/uniprot/%s.txt" % id
    return _urlopen(url)
def get_all_obsolete(self):
    """Returns a list of all obsolete entries ever in the PDB.

    Returns a list of all obsolete pdb codes that have ever been
    in the PDB.  Gets and parses the file from the PDB server in
    the format (the first pdb_code column is the one used):

    LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE    31-JUL-94 116L     216L
    ...
    OBSLTE    29-JAN-96 1HFT     2HFT
    OBSLTE    21-SEP-06 1HFV     2J5X
    OBSLTE    21-NOV-03 1HG6
    OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
    OBSLTE    08-NOV-96 1HID     2HID
    OBSLTE    01-APR-97 1HIU     2HIU
    OBSLTE    14-JAN-04 1HKE     1UUZ
    ...
    """
    url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
    with contextlib.closing(_urlopen(url)) as handle:
        # Collect the pdb codes from OBSLTE records only; the assert
        # guards against mis-reading the fixed-width data.
        obsolete = []
        for record in handle:
            if record.startswith("OBSLTE "):
                code = record.split()[2]
                assert len(code) == 4
                obsolete.append(code)
    return obsolete
def get_recent_changes(self):
    """Returns three lists of the newest weekly files (added,mod,obsolete).

    Reads the directories with changed entries from the PDB server and
    returns a tuple of three URL's to the files of new, modified and
    obsolete entries from the most recent list. The directory with the
    largest numerical name is used.
    Returns None if something goes wrong.

    Contents of the data/status dir (20031013 would be used);
    drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
    drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
    -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
    """
    url = self.pdb_server + '/pub/pdb/data/status/'
    with contextlib.closing(_urlopen(url)) as handle:
        # filter() yields an iterator on Python 3, so subscripting it
        # with [-1] would raise TypeError.  Sorting the digit-only
        # names and taking the last gives the newest YYYYMMDD dir.
        recent = sorted(filter(str.isdigit,
                               (x.split()[-1] for x in handle.readlines())
                               ))[-1]
    path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)
    # Retrieve the lists
    added = self.get_status_list(path + 'added.pdb')
    modified = self.get_status_list(path + 'modified.pdb')
    obsolete = self.get_status_list(path + 'obsolete.pdb')
    return [added, modified, obsolete]
def scan(seq="", mirror='http://www.expasy.org', output='xml', **keywords):
    """Execute a ScanProsite search.

    mirror:      The ScanProsite mirror to be used
                 (default: http://www.expasy.org).
    seq:         The query sequence, or UniProtKB (Swiss-Prot,
                 TrEMBL) accession
    output:      Format of the search results (default: xml)

    Further search parameters can be passed as keywords; see the
    documentation for programmatic access to ScanProsite at
    http://www.expasy.org/tools/scanprosite/ScanPrositeREST.html
    for a description of such parameters.

    This function returns a handle to the search results returned by
    ScanProsite. Search results in the XML format can be parsed into a
    Python object, by using the Bio.ExPASy.ScanProsite.read function.
    """
    parameters = {'seq': seq, 'output': output}
    # Keywords explicitly set to None are omitted from the query.
    parameters.update((key, value) for key, value in keywords.items()
                      if value is not None)
    command = _urlencode(parameters)
    url = "%s/cgi-bin/prosite/PSScan.cgi?%s" % (mirror, command)
    return _urlopen(url)
def startElementHandler(self, name, attrs):
    """Expat start-element callback: create the record object for this tag.

    Depending on which category the element name falls into (lists,
    dictionaries, structures, items, plain strings/integers/errors), a
    matching container object is created and pushed onto self.stack so
    that endElementHandler/content handlers can fill it in.
    """
    # preprocessing the xml schema
    if self.is_schema:
        if len(attrs) == 1:
            schema = list(attrs.values())[0]
            handle = self.open_xsd_file(os.path.basename(schema))
            # if there is no local xsd file grab the url and parse the file
            if not handle:
                handle = _urlopen(schema)
                text = handle.read()
                # Cache the downloaded schema locally for next time.
                self.save_xsd_file(os.path.basename(schema), text)
                handle.close()
                self.parse_xsd(ET.fromstring(text))
            else:
                self.parse_xsd(ET.fromstring(handle.read()))
                handle.close()
    self.content = ""
    if name in self.lists:
        object = ListElement()
    elif name in self.dictionaries:
        object = DictionaryElement()
    elif name in self.structures:
        object = StructureElement(self.structures[name])
    elif name in self.items:    # Only appears in ESummary
        # For ESummary <Item> elements the real tag name and type are
        # carried in the attributes, not in the element name itself.
        name = str(attrs["Name"])  # convert from Unicode
        del attrs["Name"]
        itemtype = str(attrs["Type"])  # convert from Unicode
        del attrs["Type"]
        if itemtype == "Structure":
            object = DictionaryElement()
        elif name in ("ArticleIds", "History"):
            object = StructureElement(["pubmed", "medline"])
        elif itemtype == "List":
            object = ListElement()
        else:
            object = StringElement()
        object.itemname = name
        object.itemtype = itemtype
    elif name in self.strings + self.errors + self.integers:
        # Leaf element: remember its attributes; the value is handled
        # later by the character-data / end-element handlers.
        self.attributes = attrs
        return
    else:
        # Element not found in DTD
        if self.validating:
            raise ValidationError(name)
        else:
            # this will not be stored in the record
            object = ""
    if object != "":
        object.tag = name
        if attrs:
            object.attributes = dict(attrs)
        if len(self.stack) != 0:
            # Attach the new object to its parent: lists support
            # append(), dictionary-like parents use key assignment.
            current = self.stack[-1]
            try:
                current.append(object)
            except AttributeError:
                current[name] = object
    self.stack.append(object)
def _open(cgi, params=None, post=None, ecitmatch=False):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    The argument post should be a boolean to explicitly control if an HTTP
    POST should be used rather than an HTTP GET based on the query length.
    By default (post=None), POST is used if the URL encoded parameters
    would be over 1000 characters long.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    params = _construct_params(params)
    options = _encode_options(ecitmatch, params)

    # By default, post is None. Set to a boolean to over-ride length choice:
    if post is None and len(options) > 1000:
        post = True
    cgi = _construct_cgi(cgi, post, options)

    # Note: the previous "except _HTTPError as exception: raise exception"
    # wrapper was a no-op (and discarded the traceback on Python 2), so
    # HTTP errors are now simply allowed to propagate to the caller.
    if post:
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
def get_prosite_raw(id, cgi="http://www.expasy.ch/cgi-bin/get-prosite-raw.pl"):
    """Get a handle to a raw PROSITE or PRODOC entry at ExPASy.

    For a non-existing key, ExPASy returns nothing.
    """
    url = "%s?%s" % (cgi, id)
    return _urlopen(url)
def get_prosite_raw(id, cgi='http://www.expasy.ch/cgi-bin/get-prosite-raw.pl'):
    """Get a handle to a raw PROSITE or PRODOC entry at ExPASy.

    For a non-existing key, ExPASy returns nothing.
    """
    query_url = "%s?%s" % (cgi, id)
    return _urlopen(query_url)
def get_all_entries(self):
    """Retrieves a big file containing all the
    PDB entries and some annotation to them.
    Returns a list of PDB codes in the index file.
    """
    print("retrieving index file. Takes about 5 MB.")
    url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
    with contextlib.closing(_urlopen(url)) as handle:
        # Skip the two header lines; each remaining line begins with
        # the four-character PDB code.
        all_entries = []
        for entry in handle.readlines()[2:]:
            if len(entry) > 4:
                all_entries.append(entry[:4])
    return all_entries
def get_prosite_entry(id, cgi="http://www.expasy.ch/cgi-bin/get-prosite-entry"):
    """Get a handle to a PROSITE entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line: 'There is currently no PROSITE entry for XXX.
    Please try again.'
    """
    url = "%s?%s" % (cgi, id)
    return _urlopen(url)
def get_all_entries(self):
    """Retrieves a big file containing all the
    PDB entries and some annotation to them.
    Returns a list of PDB codes in the index file.
    """
    print("retrieving index file. Takes about 5 MB.")
    url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
    with contextlib.closing(_urlopen(url)) as handle:
        # The first two lines are headers; the PDB code is the first
        # four characters of every data line.
        codes = []
        for row in handle.readlines()[2:]:
            if len(row) > 4:
                codes.append(row[:4])
    return codes
def get_prosite_entry(id, cgi='http://www.expasy.ch/cgi-bin/get-prosite-entry'):
    """Get a handle to a PROSITE entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line: 'There is currently no PROSITE entry for XXX.
    Please try again.'
    """
    query_url = "%s?%s" % (cgi, id)
    return _urlopen(query_url)
def get_prodoc_entry(id, cgi="http://www.expasy.ch/cgi-bin/get-prodoc-entry"):
    """Get a handle to a PRODOC entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line: 'There is no PROSITE documentation entry XXX.
    Please try again.'
    """
    # Open a handle to ExPASy.
    url = "%s?%s" % (cgi, id)
    return _urlopen(url)
def get_prodoc_entry(id, cgi='http://www.expasy.ch/cgi-bin/get-prodoc-entry'):
    """Get a handle to a PRODOC entry at ExPASy in HTML format.

    For a non-existing key XXX, ExPASy returns an HTML-formatted page
    containing this line: 'There is no PROSITE documentation entry XXX.
    Please try again.'
    """
    # Open a handle to ExPASy.
    query_url = "%s?%s" % (cgi, id)
    return _urlopen(query_url)
def get_status_list(self, url):
    """Retrieves a list of pdb codes in the weekly pdb status file
    from the given URL. Used by get_recent_files.

    Typical contents of the list files parsed by this method is now
    very simply one PDB name per line.
    """
    with contextlib.closing(_urlopen(url)) as handle:
        codes = []
        for raw_line in handle:
            code = raw_line.strip()
            # Guard against a malformed status file.
            assert len(code) == 4
            codes.append(code)
    return codes
def externalEntityRefHandler(self, context, base, systemId, publicId):
    """The purpose of this function is to load the DTD locally, instead
    of downloading it from the URL specified in the XML. Using the local
    DTD results in much faster parsing. If the DTD is not found locally,
    we try to download it. If new DTDs become available from NCBI,
    putting them in Bio/Entrez/DTDs will allow the parser to see them.
    """
    urlinfo = _urlparse(systemId)
    # Following attribute requires Python 2.5+
    # if urlinfo.scheme=='http':
    if urlinfo[0] in ['http', 'https', 'ftp']:
        # Then this is an absolute path to the DTD.
        url = systemId
    elif urlinfo[0] == '':
        # Then this is a relative path to the DTD.
        # Look at the parent URL to find the full path.
        try:
            source = self.dtd_urls[-1]
        except IndexError:
            # Assume the default URL for DTDs if the top parent
            # does not contain an absolute path
            source = "http://www.ncbi.nlm.nih.gov/dtd/"
        else:
            source = os.path.dirname(source)
        # urls always have a forward slash, don't use os.path.join
        url = source.rstrip("/") + "/" + systemId
    else:
        raise ValueError("Unexpected URL scheme %r" % (urlinfo[0]))
    # Track the URL so that any DTDs referenced relative to this one
    # can be resolved (see the relative-path branch above); popped
    # again once this entity has been fully parsed.
    self.dtd_urls.append(url)
    # First, try to load the local version of the DTD file
    location, filename = os.path.split(systemId)
    handle = self.open_dtd_file(filename)
    if not handle:
        # DTD is not available as a local file. Try accessing it through
        # the internet instead.
        try:
            handle = _urlopen(url)
        except IOError:
            raise RuntimeError("Failed to access %s at %s" % (filename, url))
        text = handle.read()
        handle.close()
        # Cache the downloaded DTD so later runs can use the fast
        # local path above.
        self.save_dtd_file(filename, text)
        handle = BytesIO(text)
    # Parse the DTD with a sub-parser so its element declarations are
    # recorded via elementDecl.
    parser = self.parser.ExternalEntityParserCreate(context)
    parser.ElementDeclHandler = self.elementDecl
    parser.ParseFile(handle)
    handle.close()
    self.dtd_urls.pop()
    # Return 1 to tell expat the external entity was handled.
    return 1
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi="http://www.expasy.ch/cgi-bin/sprot-search-de"):
    """Search SwissProt by name, description, gene name, species, or
    organelle.
    """
    variables = {"SEARCH": text}
    # Flag which databases to search.
    if swissprot:
        variables["S"] = "on"
    if trembl:
        variables["T"] = "on"
    return _urlopen("%s?%s" % (cgi, _urlencode(variables)))
def sprot_search_de(text, swissprot=1, trembl=None,
                    cgi='http://www.expasy.ch/cgi-bin/sprot-search-de'):
    """Search SwissProt by name, description, gene name, species, or
    organelle.
    """
    query = {'SEARCH': text}
    # Flag which databases to search.
    if swissprot:
        query['S'] = 'on'
    if trembl:
        query['T'] = 'on'
    full_url = "%s?%s" % (cgi, _urlencode(query))
    return _urlopen(full_url)
def sprot_search_ful(
    text, make_wild=None, swissprot=1, trembl=None,
    cgi="http://www.expasy.ch/cgi-bin/sprot-search-ful"
):
    """Search SwissProt by full text."""
    variables = {"SEARCH": text}
    # Optional wildcard expansion and database selection flags.
    if make_wild:
        variables["makeWild"] = "on"
    if swissprot:
        variables["S"] = "on"
    if trembl:
        variables["T"] = "on"
    return _urlopen("%s?%s" % (cgi, _urlencode(variables)))
def sprot_search_ful(text, make_wild=None, swissprot=1, trembl=None,
                     cgi='http://www.expasy.ch/cgi-bin/sprot-search-ful'):
    """Search SwissProt by full text."""
    query = {'SEARCH': text}
    # Optional wildcard expansion and database selection flags.
    if make_wild:
        query['makeWild'] = 'on'
    if swissprot:
        query['S'] = 'on'
    if trembl:
        query['T'] = 'on'
    full_url = "%s?%s" % (cgi, _urlencode(query))
    return _urlopen(full_url)
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    Returns a StringIO handle containing the (by default XML) results.
    """
    import time
    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM', pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE', ...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Drop parameters the caller left as None before URL-encoding.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))
    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                       message,
                       {"User-Agent": "BiopythonClient"})
    handle = _urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # rid is the request identifier used to poll for our results;
    # rtoe is NCBI's estimated time to completion (not used below).
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready. Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Exponential backoff: grow the delay by 50% each poll, capped
        # at 120 seconds.
        if delay + .5 * delay <= 120:
            delay += .5 * delay
        else:
            delay = 120

        request = _Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                           message,
                           {"User-Agent": "BiopythonClient"})
        handle = _urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)