def _is_redundant(nc_acc, accs):
    # Function checks if an "NC-or-NW" record is redundant, i.e. if the accession of
    #   its identical GenBank (non-RefSeq) copy is already present in `accs`.
    #
    # :param nc_acc: accession number of the NC-record;
    # :type nc_acc: str;
    # :param accs: accession numbers to search the GenBank copy among;
    # :type accs: tuple<str>;
    #
    # Returns a tuple of two elements:
    #   (<ACCESSION_OF_IDENTICAL_GENBANK_RECORD>, <True if it is in `accs`, else False>)
    # If a response cannot be parsed, the error is logged
    #   and `platf_depend_exit(1)` is called.

    ncbi_server = "www.ncbi.nlm.nih.gov"

    # The full GenBank page of the record is deliberately not requested here:
    #   nothing below reads it, so it would only be a wasted round trip to NCBI.

    try:
        # Find link to Identical GenBank Record

        # Firstly, get GI number of the NC sequence:
        get_gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        nc_gi_text = lingering_https_get_request(ncbi_server, get_gi_url,
                                                 "GI of {}".format(nc_acc),
                                                 nc_acc)
        # Drop line breaks so that `.` in the pattern can span the whole <pre> block
        nc_gi_text = nc_gi_text.replace('\n', '')
        nc_gi_re = re.search(r"\<pre\>([0-9]+).*\</pre\>", nc_gi_text)
        if nc_gi_re is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        nc_gi = nc_gi_re.group(1)

        # Retrieve identical GenBank sequence accession number.
        # NCBI redirects these requests and provides necessary location in headers.
        # So, we'll follow this link.
        identical_gb_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            nc_gi)
        redirect_text = _ling_https_getreq_handl_301(
            ncbi_server, identical_gb_link,
            "link to identical genbank sequence", nc_acc)

        # Get accession number from the response text
        pattern = r"\<pre\>(.*).*\</pre\>"
        ident_acc_re = re.search(pattern, redirect_text.replace('\n', ''))
        if ident_acc_re is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        # Strip the version: only the part before the first dot is kept
        ident_acc = ident_acc_re.group(1).partition('.')[0]

    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
    else:
        return ident_acc, ident_acc in accs
    # end try
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with `-g` option.
    # For every TaxID it requests NCBI Taxonomy Browser and parses
    #   the organism's name from the HTML response.
    # The collected names make up the `organisms` list returned to the caller.
    #
    # :param taxid_list: list of TaxIDs to verify;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)".
    # If a name cannot be parsed, or an OSError emerges while requesting,
    #   the error is reported and `platf_depend_exit` is called.

    organisms = []

    # Nothing to verify -- nothing to print
    if not taxid_list:
        return organisms
    # end if

    printlog_info("Verifying TaxIDs:")

    for taxid in taxid_list:
        printn(" {} - ".format(taxid))
        try:
            tax_resp = lingering_https_get_request(
                "www.ncbi.nlm.nih.gov",
                "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(taxid),
                "taxonomy")
            # If the pattern is not found, `re.search` returns None,
            #   and AttributeError is handled below
            name_match = re.search(r"Taxonomy browser \((.+?)\)", tax_resp)
            tax_name = name_match.group(1)
        except AttributeError:
            printlog_error("\aError: TaxID not found")
            printlog_error(
                "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
            )
            platf_depend_exit(1)
        except OSError as oserr:
            printlog_error("Something is wrong with connection:")
            printlog_error(str(oserr))
            platf_depend_exit(-2)
        else:
            # Finish the line started by `printn` above and remember the organism
            print(tax_name)
            log_info("{} - {}".format(taxid, tax_name))
            organisms.append("{} (taxid:{})".format(tax_name, taxid))
        # end try
    # end for

    print('-' * 30 + '\n')

    return organisms
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks existence of GenBank records that correspond to accessions
    #   specified with '-s' option. While checking, the function fills `acc_dict`
    #   in place: the title of each found record is stored under its accession.
    #
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>}, modified in place;
    # :type acc_dict: dict;
    #
    # Returns None. If no title can be found for an accession, the error
    #   is reported and `platf_depend_exit(1)` is called.

    check_connection("https://www.ncbi.nlm.nih.gov/")

    printlog_info_time("Verifying `-s` accessions...")

    server = "eutils.ncbi.nlm.nih.gov"
    total = len(accs_to_download)

    # The progress counter has no trailing line break, so it must be flushed
    #   explicitly, otherwise a line-buffered terminal will not show it.
    sys.stdout.write("0/{}".format(total))
    sys.stdout.flush()

    for i, acc in enumerate(accs_to_download):

        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        name = re.search(
            r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text)

        if name is None:
            # Finish the progress line before reporting the error
            print()
            printlog_error(
                "Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        else:
            name = name.group(1)
        # end if

        acc_dict[acc] = name
        sys.stdout.write("\r{}/{}".format(i + 1, total))
        sys.stdout.flush()
    # end for

    print()
    printlog_info_time("OK.")
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request
    #   and then retrieves the result.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: container whose element 0 is the current packet (id) number
    #   (only `pack_to_send[0]` is read here);
    # :param filename: basename of current FASTA file;
    # :type filename: str;
    #
    # Returns a tuple of two elements:
    #   (<XML_RESPONSE>, BlastError(0)) if the result has been retrieved;
    #   (None, BlastError(1)) if the job has expired or "Bad Gateway" is received;
    #   (None, BlastError(2)) if the job has failed or the server reports an error.

    packet_num = pack_to_send[0]

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, packet_num))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid, filename, packet_num))

    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        # Server might be wrong -- we will give it 3 extra seconds
        wait_time = rtoe + 3
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        sleep(wait_time)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(wait_time))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Width of the area reserved after the word "Waiting" for dots and "(requesting)"
    whtspc_len = 6 + len("(requesting)")

    while True:
        status_text = lingering_https_get_request(server, wait_url,
                                                  "BLAST response")

        # Server asks to wait
        if "Status=WAITING" in status_text:
            # Blank the reserved area and move the cursor back to its beginning
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # Add one more dot after each 10-second pause (six pauses in total)
            for dot_count in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * dot_count))
            # end for
            printn("(requesting)")
            continue
        # end if

        # Job failed
        if "Status=FAILED" in status_text:
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        # end if

        # Job expired
        if "Status=UNKNOWN" in status_text:
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # end if

        # None of the known statuses is found: execution should not reach here
        if "Status=READY" not in status_text:
            printlog_error_time(
                "Fatal error (-122). Please contact the developer.\a\n")
            platf_depend_exit(-122)
            continue
        # end if

        # Results are ready
        print()
        printlog_info("Result for query `{}` #{} is ready!".format(
            filename, packet_num))

        if "ThereAreHits=yes" in status_text:
            for dash_count in (15, 10, 5):
                print('-' * dash_count)
            # end for
            print("-\nRetrieving results...")

            # Retrieve human-readable text and put it into result directory
            retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
            txt_align_res = lingering_https_get_request(
                server, retrieve_text_url, "text version of BLAST response")

            # Output directory is the directory of the file
            #   the first handler of the root logger writes to
            root_logger = logging.getLoggerClass().root
            outdir_path = os.path.dirname(root_logger.handlers[0].baseFilename)

            # Count already existing plain text files in outdir:
            response_num = sum(
                1 for fname in os.listdir(outdir_path)
                if re.search(r"prober_blast_response_[0-9]+\.txt", fname)
                is not None)

            # Current txt response file will have number `response_num+1`
            txt_hpath = os.path.join(
                outdir_path,
                "prober_blast_response_{}.txt".format(response_num + 1))

            # Write text result for a human to read
            with open(txt_hpath, 'w') as txt_file:
                txt_file.write(txt_align_res)
            # end with
        elif "ThereAreHits=no" in status_text:
            printlog_info_time("There are no hits. It happens.\n")
        else:
            # Probably, job is failed if execution reaches here
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        # end if

        break
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    # end if

    if "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    # end if

    if "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    # end if

    if "[blastsrv4.REAL]" in xml_text:
        srv_err_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        if srv_err_match is None:
            srv_err_str = ""
        else:
            srv_err_str = ": {}".format(srv_err_match.group(1))
        # end if
        printlog_info_time("BLAST server error{}".format(srv_err_str))
        # Error code 2 indicates that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves taxonomy of a hit from NCBI.
    # Moreover, it appends this taxonomy to the TSV file `taxonomy_path`:
    #   <accession>\t<taxonomy_str>
    # and registers the accession in module-level `_tax_accs`.
    # If no appropriate ranks are found on the taxonomy page, definition of the
    #   record (`hit_def`) is saved instead of the taxonomy string.
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns None. If TaxID cannot be parsed from GenBank summary,
    #   the error is reported and `platf_depend_exit(115)` is called.

    ncbi_server = "www.ncbi.nlm.nih.gov"

    # Get TaxID of the organism from GenBank summary:
    gb_summary = lingering_https_get_request(ncbi_server,
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)

    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time(
            "Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock".format(
        taxid)
    taxonomy_text = lingering_https_get_request(ncbi_server, taxonomy_url,
                                                "taxonomy", hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # All ranks except for the last one in `ranks` are selected here.
    # Species name requires special handling, it will be added later
    #   (presumably the last item of `ranks` is "species" -- confirm against its definition).
    ranks_to_select = ranks[:-1]

    # Get all taxonomic names of the organism, convert ranks to lowercase
    #   (just in case) and leave only the selected ranks:
    taxonomy = [(rank.lower(), name)
                for rank, name in re.findall(tax_rank_pattern, taxonomy_text)
                if rank.lower() in ranks_to_select]

    # E.g., this record has no appropriate ranks: CP034535
    # Merely save its definition and stop
    if len(taxonomy) == 0:
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        # Without this return a second record for the same accession
        #   would be appended below
        return
    # end if

    # Check if species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if match_direct_species is not None:
        # If species name is specified like other ranks, merely add it to list.
        # Only the part after the first space is kept
        #   (presumably this drops the genus name -- confirm).
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse species name from title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)

        # Get words
        title = title.split(' ')

        # Viruses often have unpredictable names (example: MN908947),
        #   so for them all words after the first one are taken as species name.
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower(
            ) == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i, rank in enumerate(ranks):
        if len(taxonomy) < i + 1:
            # for this (missing in the end): AC150248
            taxonomy.append((rank, ""))
        elif taxonomy[i][0] != rank:
            # for this (missing in the middle): MN908947
            taxonomy.insert(i, (rank, ""))
        # end if
    # end for

    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
    # end with
def _get_related_replicons(acc, acc_dict):
    # Function finds replicons (other chromosomes or plasmids, sometimes even proviruses),
    #   which are related to the GenBank record "discovered" by barapost-prober.py.
    # The chain of Elink requests is: nuccore -> biosample -> assembly -> nuccore.
    #
    # :param acc: accession of a record "discovered" by barapost-prober.py;
    # :type acc: str;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>}; it is only read here;
    # :type acc_dict: dict;
    #
    # Returns list of tuples of the following structure:
    #   (<ACCESSION>, <RECORD_DEFINITION>)
    # The first item is always the record `acc` itself.
    # An empty list is returned if there is no BioSample or assembly link for the record.
    # If nuccore records cannot be found for the assembly, the error is
    #   reported and `platf_depend_exit(1)` is called.

    # We will save all accessions in order not to duplicate records in our database
    repl_list = [(acc, acc_dict[acc])]
    known_accs = {acc}

    # Elink utility returns links in DB_1, that are connected to given ID in DB_2
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    elink_url_templ = "/entrez/eutils/elink.fcgi?dbfrom={}&db={}&id={}"

    def _request_linkset(url, *request_descr):
        # Requests Elink, parses the XML and returns element "LinkSetDb"
        #   of the first child of the root (None if there is no such element).
        xml_text = lingering_https_get_request(eutils_server, url,
                                               *request_descr)
        root = ElementTree.fromstring(xml_text)
        # `Element.getchildren()` was removed in Python 3.9: iterate over the element itself
        first_child = next(iter(root), None)
        if first_child is None:
            return None
        # end if
        return first_child.find("LinkSetDb")
    # end def

    # = Find BioSample ID =

    linkset = _request_linkset(
        elink_url_templ.format("nuccore", "biosample", acc),
        "BioSample page", acc)

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_warning(
            "Cannot check replicons for `{}`: there is no BioSample page for this record."
            .format(acc))
        return list()
    # end if

    # Here we have BioSample ID
    biosmp_id = linkset.find("Link").find("Id").text

    # = Find assembly associated with this BioSample ID =

    # We will pass this BioSample ID through assembly in order not to
    #   allow requesting for over 7k transcripts, like for this fungus:
    #   https://www.ncbi.nlm.nih.gov/biosample/SAMN07457167
    # After this, only scaffolds (nearly 130 sequences) will be downloaded.

    linkset = _request_linkset(
        elink_url_templ.format("biosample", "assembly", biosmp_id),
        "Assembly link assotiated with BioSample ID {}".format(biosmp_id))

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_warning(
            "Cannot check replicons for `{}`: there is no assembly page for this record."
            .format(acc))
        return list()
    # end if

    # Here we have assembly ID
    ass_id = linkset.find("Link").find("Id").text

    # = Find GIs in nuccore associated with this Assembly ID =

    linkset = _request_linkset(
        elink_url_templ.format("assembly", "nuccore", ass_id),
        "Nucleotide links assotiated with assembly {}".format(ass_id))

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_error_time(
            "Cannot check replicons for `{}`: failed to find nuccore records for assembly {}."
            .format(acc, ass_id))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(1)
    # end if

    # We will entertain user -- show him/her this spinning thing (like conda does)
    #   indicating that the script is actually working.
    krutiolka = ('|', '/', '-', '\\')
    krut_i = 0
    sys.stdout.write("\r {}".format(krutiolka[3]))
    sys.stdout.flush()

    # Collect links: each element "Id" contains GI of a related record
    for elem in linkset.iter("Id"):

        # Get GI, title and accession:
        rel_gi = elem.text
        rel_def, rel_acc = _get_record_title(rel_gi)

        # Print this spinning thing
        sys.stdout.write("\r {}".format(krutiolka[krut_i]))
        sys.stdout.flush()
        krut_i = (krut_i + 1) % len(krutiolka)

        # If accession is new -- update list
        if rel_acc not in known_accs:
            known_accs.add(rel_acc)
            repl_list.append((rel_acc, rel_def))
        # end if
    # end for

    return repl_list
def _ling_https_getreq_handl_301(server, url, request_for=None, acc=None):
    # Name stands for "Lingering Https Get Request Handling 301".
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #   again and again if the request fails.
    # The first response must be a 301-redirection: the function takes the "Location"
    #   header, appends query string "?report=accnlist&log$=seqview&format=text" to it
    #   and retrieves the content from there with `lingering_https_get_request`.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns whatever `lingering_https_get_request` returns for the redirected location.
    # If the server does not redirect, the error is reported
    #   and `platf_depend_exit(1)` is called.

    error = True
    while error:
        # `conn` must be bound before `try`, otherwise `finally` would fail
        #   if creation of the connection itself raises
        conn = None
        try:
            conn = http.client.HTTPSConnection(server,
                                               timeout=10)  # create connection
            conn.request("GET", url)  # send the request
            response = conn.getresponse()  # get the response

            # Handle redirection
            if response.status != 301:
                raise _DoesNotRedirectError(
                    "NCBI does not redirect, although it must!")
            # end if

            # Link to identical GenBank record is in "Location" header:
            location = response.getheader("Location")
            if location is None:
                # 301 without "Location" header: there is nowhere to go
                raise _DoesNotRedirectError(
                    "NCBI does not redirect, although it must!")
            # end if
            redirect_url = location + "?report=accnlist&log$=seqview&format=text"

        except (OSError, http.client.RemoteDisconnected, socket.gaierror,
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if request_for is not None:
                comment_str += " requesting for {}".format(request_for)
                if acc is not None:
                    comment_str += " (accession: '{}')".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_warning("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_warning(str(err))
            printlog_warning(
                "the program will sleep for 30 seconds and try to connect again."
            )
            sleep(30)
        except _DoesNotRedirectError as err:
            printlog_error_time(str(err))
            printlog_error("Please, contact the developer.")
            platf_depend_exit(1)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            if conn is not None:
                conn.close()
            # end if
        # end try
    # end while

    # And here goes simple "lingering_https_get_request",
    #   which will retrieve content from redirected location
    return lingering_https_get_request(server, redirect_url, request_for, acc)
def _get_record_title(record_id):
    # Function retrieves title (aka definition) and accession
    #   of a GenBank record by given accession or GI number.
    #
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    #
    # Returns tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)
    # Accession is returned without version.
    # If title or accession cannot be found in the e-summary,
    #   the error is reported and `platf_depend_exit(1)` is called.

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi"  # utility name

    # Configure URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes this XML arrives without any child elements.
    # So, we just repeat the request until the root has at least one child.
    print_ok = False
    while True:
        # Send the request and get the response
        summary = lingering_https_get_request(
            eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different values of attribute "Name".
        # They are searched within the first child of root.
        # `Element.getchildren()` was removed in Python 3.9: iterate over the element itself
        docsum = next(iter(root), None)

        if docsum is not None:
            if print_ok:
                printlog_info("ok")
            # end if
            break
        # end if

        print()
        printlog_info_time(
            "Failed to retrieve data for record {}. Trying again...".format(
                record_id))
        print_ok = True  # print this "ok" only after successful attempt after fail
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        item_name = item.attrib.get("Name")
        if item_name == "Title":
            record_title = item.text
        elif item_name == "AccessionVersion" and item.text:
            # Remove version just in case.
            # If there is no version suffix, take the text as it is.
            versioned = re.search(r"(.*)\.[0-9]+", item.text)
            record_acc = item.text if versioned is None else versioned.group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        # Report the requested ID: accession may be unknown (None) at this point
        printlog_error_time(
            "Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc