Beispiel #1
0
def _is_redundant(nc_acc, accs):
    # Function checks if "NC-or-NW"-record is redundant, i.e. if accession of its
    #   identical (non-RefSeq) GenBank record is already present in `accs`.
    #
    # :param nc_acc: accession number of NC-record;
    # :type nc_acc: str;
    # :param accs: tuple of accession numbers;
    # :type accs: tuple<str>;
    #
    # Returns tuple of two elements:
    #   (<ACCESSION_OF_IDENTICAL_GENBANK_RECORD>, <True if this accession is in `accs`>).
    # If a response cannot be parsed, the error is reported
    #   and `platf_depend_exit(1)` is called.

    server = "www.ncbi.nlm.nih.gov"

    try:
        # Find link to Identical GenBank Record

        # Firstly, get GI number of NC sequence:
        get_gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        nc_gi_text = lingering_https_get_request(server, get_gi_url,
                                                 "GI of {}".format(nc_acc),
                                                 nc_acc)
        # Newlines are removed so that `.` in the pattern can span the whole response
        nc_gi_re = re.search(r"\<pre\>([0-9]+).*\</pre\>",
                             nc_gi_text.replace('\n', ''))
        if nc_gi_re is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        nc_gi = nc_gi_re.group(1)

        # Retrieve identical GenBank sequence accession number.
        # NCBI redirects these requests and provides necessary location in headers.
        # So, we'll follow this link.
        identical_gb_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            nc_gi)
        redirect_text = _ling_https_getreq_handl_301(
            server, identical_gb_link,
            "link to identical genbank sequence", nc_acc)

        # Get accession number from the response text
        pattern = r"\<pre\>(.*).*\</pre\>"
        ident_acc_re = re.search(pattern, redirect_text.replace('\n', ''))

        if ident_acc_re is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        # Drop everything after the first dot (the version suffix)
        ident_acc = ident_acc_re.group(1).partition('.')[0]

    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
    else:
        return ident_acc, ident_acc in accs
    # end try
Beispiel #2
0
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with `-g` option.
    # For each TaxID it requests NCBI Taxonomy Browser and parses
    #   the organism's name from the HTML response.
    # The resulting `organisms` list is meant to be included into BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified
    #     to be integers during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)"
    # Returns empty list if `taxid_list` is empty (nothing is requested or printed then).

    verified = list()

    # Nothing to verify
    if len(taxid_list) == 0:
        return verified
    # end if

    taxonomy_server = "www.ncbi.nlm.nih.gov"
    name_pattern = r"Taxonomy browser \((.+?)\)"

    printlog_info("Verifying TaxIDs:")

    for taxid in taxid_list:
        printn("   {} - ".format(taxid))
        try:
            page = lingering_https_get_request(
                taxonomy_server,
                "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(taxid),
                "taxonomy")
            # `.group` on None raises AttributeError -- it means the name was not found
            name_match = re.search(name_pattern, page)
            tax_name = name_match.group(1)
        except AttributeError:
            printlog_error("\aError: TaxID not found")
            printlog_error(
                "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
            )
            platf_depend_exit(1)
        except OSError as oserr:
            printlog_error("Something is wrong with connection:")
            printlog_error(str(oserr))
            platf_depend_exit(-2)
        else:
            # Finish the line started by `printn` above and remember the organism
            print(tax_name)
            log_info("{} - {}".format(taxid, tax_name))
            verified.append("{} (taxid:{})".format(tax_name, taxid))
        # end try
    # end for

    print('-' * 30 + '\n')
    return verified
Beispiel #3
0
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks existence of GenBank records that correspond to accessions
    #   specified with '-s' option. After checking, the function fills in `acc_dict`
    #   (in place) with the titles of these records.
    #
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;
    #
    # Returns None. If a record cannot be found, `platf_depend_exit(1)` is called.

    check_connection("https://www.ncbi.nlm.nih.gov/")

    server = "eutils.ncbi.nlm.nih.gov"
    title_pattern = r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>"
    total = len(accs_to_download)

    printlog_info_time("Verifying `-s` accessions...")
    # The counter is written without a newline, so flush it explicitly --
    #   otherwise it may stay in the buffer while the requests are running.
    sys.stdout.write("0/{}".format(total))
    sys.stdout.flush()

    for i, acc in enumerate(accs_to_download):

        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        name = re.search(title_pattern, text)

        if name is None:
            printlog_info(
                "Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        else:
            name = name.group(1)
        # end if

        acc_dict[acc] = name
        # '\r' returns the cursor to the line start, so the counter is updated in place
        sys.stdout.write("\r{}/{}".format(i + 1, total))
        sys.stdout.flush()
    # end for
    print()
    printlog_info_time("OK.")
Beispiel #4
0
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request
    #   and then retrieves the result.
    # If there are hits, it also saves a plain text version of the result
    #   to file `prober_blast_response_<N>.txt`.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet (id) number to send.
    #   NOTE(review): it is indexed as `pack_to_send[0]` below, so it is presumably
    #   a one-element container holding the number -- confirm against the caller;
    # :type pack_to_send: int;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns tuple of two elements:
    #   (<XML response ('str')>, BlastError(0)) on success;
    #   (None, BlastError(1)) or (None, BlastError(2)) if the packet should be resent.

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:

        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    # NOTE(review): status is requested from "/blast/Blast.cgi", whereas results below
    #   are retrieved from "/Blast.cgi" -- confirm that both paths are valid.
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Number of characters to blank out after "Waiting": 6 dots + "(requesting)"
    whtspc_len = 6 + len("(requesting)")

    # Poll the server until the status is anything other than WAITING
    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            # Overwrite leftovers of the previous progress line with spaces
            #   and move the cursor back ("\033[<n>D" moves the cursor n columns left)
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # append a dot every 10 seconds (6 dots, i.e. 60 seconds) before requesting again
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                # Print a decorative "staircase" of dashes (15, 10, 5, 1)
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # Output directory is taken as the directory of the file of the first
                #   root logger handler (assumes this handler is a file handler -- TODO confirm)
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution reaches here only if the response contains none
        #   of the statuses handled above. It should not happen.
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    # Check the retrieved text for known error markers before returning it
    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)

    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "[blastsrv4.REAL]" in xml_text:
        # Include the server's own error description in the message, if it can be parsed
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
Beispiel #5
0
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves taxonomy of a hit from NCBI.
    # Moreover, it saves this taxonomy in the TSV file `taxonomy_path`:
    #     <accession>\t<taxonomy_str>
    # If no appropriate taxonomic ranks are found for the record,
    #   its definition is saved instead of the taxonomy string:
    #     <accession>\t<hit_def>
    # The accession is also appended to `_tax_accs`.
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns None.

    server = "www.ncbi.nlm.nih.gov"

    # Get TaxID of the organism from GenBank summary:
    gb_summary = lingering_https_get_request(server,
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)

    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time(
            "Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock".format(
        taxid)
    taxonomy_text = lingering_https_get_request(server, taxonomy_url,
                                                "taxonomy", hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # We will leave only following taxonomic ranks: domain, phylum, class, order, family, genus.
    # Species name requires special handling, it will be added later.
    ranks_to_select = ranks[:-1]

    # Get all taxonomic names of the organism, convert ranks to lowercase (just in case)
    #   and remove redundant ranks:
    taxonomy = [(rank.lower(), name)
                for rank, name in re.findall(tax_rank_pattern, taxonomy_text)
                if rank.lower() in ranks_to_select]

    # E.g., this record has no appropriate ranks: CP034535
    # Merely save its definition and return. Without returning here,
    #   the record would be saved for the second time at the end of the function.
    if len(taxonomy) == 0:
        # Save taxonomy
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        return
    # end if

    # Check if species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if match_direct_species is not None:
        # If species name is specified like other ranks, merely add it to list
        #   (everything after the first space is taken, i.e. the first word is dropped):
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse species name from title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)

        # Get words
        title = title.split(' ')

        # We will take all these words as species name.
        # Viruses also often have unpredictable names.
        #   Example: MN908947
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower(
            ) == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i in range(len(ranks)):
        if len(taxonomy) < i + 1:  # for this (missing in the end): AC150248
            taxonomy.append((ranks[i], ""))
        elif taxonomy[i][0] != ranks[
                i]:  # for this (missing in the middle): MN908947
            taxonomy.insert(i, (ranks[i], ""))
        # end if
    # end for

    # It will be a bit faster
    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
    # end with
Beispiel #6
0
def _get_related_replicons(acc, acc_dict):
    # Function finds replicons (other chromosomes or plasmids, sometimes even proviruses),
    #   which are related to Genbank record "discovered" by barapost-prober.py.
    # It follows the chain of Elink requests:
    #   nuccore -> biosample -> assembly -> nuccore.
    #
    # :param acc: accession of a record "discovered" by barapost-prober.py;
    # :type acc: str;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;
    #
    # Returns list of tuples of a following structure:
    #     (<ACCESSION>, <RECORD_DEFINITION>)
    #   The first element of the list is the record `acc` itself.
    # Returns empty list if there is no BioSample or assembly link for the record.

    # We will save all titles in order not to duplicate records in our database
    repl_list = [(acc, acc_dict[acc])]
    # Accessions already stored in `repl_list` (for fast membership test)
    seen_accs = {acc}

    # Elink utility returns links in DB_1, that are connected to given ID in DB_2
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    elink = "elink.fcgi"

    # = Find BioSample ID =

    # Configure URL
    nuc2biosmp_url = "/entrez/eutils/{}?dbfrom=nuccore&db=biosample&id={}".format(
        elink, acc)

    # Get XML with our links
    text_link_to_bsmp = lingering_https_get_request(eutils_server,
                                                    nuc2biosmp_url,
                                                    "BioSample page", acc)

    # Parse this XML.
    # `Element.getchildren()` was removed in Python 3.9, therefore
    #   we iterate over the element itself to get its first child.
    root = ElementTree.fromstring(text_link_to_bsmp)
    linkset = next(iter(root)).find("LinkSetDb")

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_warning(
            "Cannot check replicons for `{}`: there is no BioSample page for this record."
            .format(acc))
        return list()
    # end if

    # Here we have BioSample ID
    biosmp_id = linkset.find("Link").find("Id").text

    # = Find assembly associated with this BioSample ID =

    # We will pass this BioSample ID through nuccore in order not to
    #   allow requesting for over 7k transcripts, like for this fungus:
    #   https://www.ncbi.nlm.nih.gov/biosample/SAMN07457167
    # After this, only scaffolds (nearly 130 sequences) will be downloaded.

    # Configure URL
    biosmp2ass_url = "/entrez/eutils/{}?dbfrom=biosample&db=assembly&id={}".format(
        elink, biosmp_id)

    # Get XML with our links
    text_link_to_ass = lingering_https_get_request(
        eutils_server, biosmp2ass_url,
        "Assembly link assotiated with BioSample ID {}".format(biosmp_id))

    # Parse this XML
    root = ElementTree.fromstring(text_link_to_ass)
    linkset = next(iter(root)).find("LinkSetDb")

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_warning(
            """Cannot check replicons for `{}`: there is no assembly page for this record."""
            .format(acc))
        return list()
    # end if

    # Here we have assembly ID
    ass_id = linkset.find("Link").find("Id").text

    # = Find GIs in nuccore associated with this Assembly ID =

    # Configure URL
    ass2nuc_url = "/entrez/eutils/{}?dbfrom=assembly&db=nuccore&id={}".format(
        elink, ass_id)

    # Get XML with our links
    text_link_to_nuc = lingering_https_get_request(
        eutils_server, ass2nuc_url,
        "Nucleotide links assotiated with assembly {}".format(ass_id))

    # Parse this XML
    root = ElementTree.fromstring(text_link_to_nuc)
    linkset = next(iter(root)).find("LinkSetDb")

    # XML should contain element "LinkSetDb"
    if linkset is None:
        printlog_error_time("""Cannot check replicons for `{}`:
  failed to find nuccore records for assembly {}.""".format(acc, ass_id))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(1)
    # end if

    # We will entertain user -- show him/her this spinning thing (like conda does)
    #   indicating that the script is actually working.
    krutiolka = ('|', '/', '-', '\\')
    krut_i = 0
    sys.stdout.write("\r {}".format(krutiolka[3]))
    sys.stdout.flush()

    # Collect links
    for elem in linkset.iter():

        if elem.tag == "Id":  # element "Id" contains our GI

            # Get GI, title and accession:
            rel_gi = elem.text
            rel_def, rel_acc = _get_record_title(rel_gi)

            # Print this spinning thing
            sys.stdout.write("\r {}".format(krutiolka[krut_i]))
            sys.stdout.flush()
            krut_i = (krut_i + 1) % len(krutiolka)

            # If accession is new -- update list
            if rel_acc not in seen_accs:
                seen_accs.add(rel_acc)
                repl_list.append((rel_acc, rel_def))
            # end if
        # end if
    # end for
    return repl_list
Beispiel #7
0
def _ling_https_getreq_handl_301(server, url, request_for=None, acc=None):
    # Name stands for "Lingering Https Get Request Handling 301".
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #     again and again if the request fails.
    # It handles 301-redirection in order to search for replicons related to "NC-records":
    #   the first request must be answered with status 301, and then
    #   the content of the location from header "Location" is retrieved.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns response obtained from the redirected location
    #   (the result of `lingering_https_get_request`).
    # If the server does not redirect, the error is reported
    #   and `platf_depend_exit(1)` is called.

    error = True
    while error:
        # `conn` must be defined before `try`: if creating the connection fails,
        #   `finally` should not fail on an unbound name and mask the real error.
        conn = None
        try:
            conn = http.client.HTTPSConnection(server,
                                               timeout=10)  # create connection
            conn.request("GET", url)  # send the request
            response = conn.getresponse()  # get the response

            # Handle redirection
            if response.status != 301:
                raise _DoesNotRedirectError(
                    "NCBI does not redirect, although it must!")
            # end if

            # Link to identical GenBank record is in "Location" header:
            location = response.getheader("Location")
            if location is None:
                raise _DoesNotRedirectError(
                    "NCBI redirects, but does not provide `Location` header!")
            # end if
            redirect_url = location + "?report=accnlist&log$=seqview&format=text"

        except (OSError, http.client.RemoteDisconnected, socket.gaierror,
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if request_for is not None:
                comment_str += " requesting for {}".format(request_for)
                if acc is not None:
                    comment_str += " (accession: '{}')".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_warning("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_warning(str(err))
            printlog_warning(
                "the program will sleep for 30 seconds and try to connect again."
            )
            sleep(30)
        except _DoesNotRedirectError as err:
            printlog_error_time(str(err))
            printlog_error("Please, contact the developer.")
            platf_depend_exit(1)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            if conn is not None:
                conn.close()
            # end if
        # end try
    # end while

    # And here goes simple "lingering_https_get_request",
    #   which will retrieve content from redirected location
    return lingering_https_get_request(server, redirect_url, request_for, acc)
Beispiel #8
0
def _get_record_title(record_id):
    # Function retrieves title (aka definition) and accession
    #   of a GenBank record by given accession or GI number.
    #
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    #
    # Returns tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)
    # If title or accession cannot be found in the e-summary,
    #   the error is reported and `platf_depend_exit(1)` is called.

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi"  # utility name

    # Configure URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes (I never figured out why) this XML arrives empty, and StopIteration emerges.
    # So, if we just repeat this request, everything is going to be ok.
    error = True
    print_ok = False
    while error:
        # Send the request and get the response
        summary = lingering_https_get_request(
            eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different values of attribute "Name".
        # They are children of element "DocSum", which is
        #   the first child of root.
        # `Element.getchildren()` was removed in Python 3.9, therefore
        #   we iterate over the element itself to get its first child.
        try:
            docsum = next(iter(root))
        except StopIteration:
            print()
            printlog_info_time(
                "Failed to retrieve data for record {}. Trying again...".
                format(record_id))
            print_ok = True  # print this "ok" only after successful attempt after fail
        else:
            if print_ok:
                printlog_info("ok")
            # end if
            error = False
        # end try
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        item_name = item.attrib.get("Name")
        if item_name == "Title":
            record_title = item.text
        elif item_name == "AccessionVersion" and item.text:
            # Remove version just in case.
            # If there is no version suffix, take the accession as it is.
            ver_match = re.search(r"(.*)\.[0-9]+", item.text)
            record_acc = item.text if ver_match is None else ver_match.group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        # Report the requested ID: `record_acc` may be None here
        printlog_error_time(
            "Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc