Example #1
def _format_sra(sra, text):
    """ (PRIVATE)
    End users can input either SRX or SRR, which will be submitted to NCBI to check the validity,
    and this function is for deciding what the kind of SRA the input is.
    """
    # NSuggest_CreateData("srr181288", new
    # Array(
    #     "SRX882917 ( taxid:13443; run:SRR1810085 SRR1810146 SRR1810833 SRR1812884)@srx882917 13443
    #         srr1810085 srr1810146 srr1810833 srr1812884",
    #     "SRX885412 ( taxid:1639; run:SRR1812880)@srx885412 1639     srr1812880",
    #     "SRX885413 ( taxid:28901; run:SRR1812881)@srx885413 28901     srr1812881",
    #     "SRX885414 ( taxid:28901; run:SRR1812882)@srx885414 28901     srr1812882",
    #     "SRX885415 ( taxid:28901; run:SRR1812883)@srx885415 28901     srr1812883",
    #     "SRX885416 (Leymus arenarius taxid:220462; run:SRR1812885)@srx885416 220462  leymus arenarius   srr1812885",
    #     "SRX885417 ( taxid:115547; run:SRR1812888)@srx885417 115547     srr1812888",
    #     "SRX885418 ( taxid:7160; run:SRR1812886)@srx885418 7160     srr1812886",
    #     "SRX885419 ( taxid:7160; run:SRR1812887)@srx885419 7160     srr1812887",
    #     "SRX885420 ( taxid:7160; run:SRR1812889)@srx885420 7160     srr1812889"),
    # 1);
    srx = _search_keyword(r'"({}) \( ?'.format(sra.upper()), text)
    srr = _search_keyword(
        r'run:[0-9A-Z ]*({})[0-9A-Z ]*\)'.format(sra.upper()), text)
    if srx is not None and srr is None:
        srr = _search_keyword(
            r'"{} \([\d\w\ \:]+; run:([0-9A-Z ]+)\)'.format(srx.upper()), text)
        if srr is None:
            raise errors.QueryError(
                "Invalid SRX accession number: {}. Aborted.".format(sra))
    elif srr is not None and srx is None:
        srx = _search_keyword(
            r'"(\w+\d+) \([\d\w\ \:]+; run:[0-9A-Z ]*({})[0-9A-Z ]*\)'.format(
                srr.upper()), text)
        if srx is None:
            raise errors.QueryError(
                "Invalid SRR accession number: {}. Aborted.".format(sra))
        else:
            # An SRX may relate to multiple SRR runs:
            srr = _search_keyword(
                r'"{} \([\d\w\ \:]+; run:([0-9A-Z ]+)\)'.format(srx.upper()),
                text)
    else:
        raise errors.QueryError(
            "Invalid SRA accession number: {}. Aborted.".format(sra))
    taxid = _search_keyword(r'"{} \(.+taxid:(\d+); run:{}'.format(srx, srr),
                            text)
    srr = srr.strip().split()
    return taxid, srx, srr
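
These examples lean on a private _search_keyword helper that is not shown anywhere in this listing. A minimal sketch, assuming it simply returns the first capture group of a regular expression (with an optional default), could look like this; the real implementation may differ:

import re


def _search_keyword(pattern, text, default=None):
    """(Sketch of an assumed helper.) Return the first captured group of
    pattern found in text, or default when there is no match."""
    match = re.search(pattern, text)
    return match.group(1) if match else default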
Example #2
def check_sra_validity(input_SRAs, proxy=None):
    """
    End users can input either SRX or SRR, which will be submitted to NCBI to check the validity,
    and this function is for deciding what the kind of SRA the input is.
    The proxy should be args[proxy].
    """
    if isinstance(input_SRAs, str):
        input_SRAs = [input_SRAs]
    output_SRAs = {}
    for s in input_SRAs:
        check_sra_params = {"dict": "srx_dict_sg", "q": s}
        try:
            check_sra_response = requests.get(
                "https://blast.ncbi.nlm.nih.gov/portal/utils/autocomp.fcgi",
                params=check_sra_params,
                headers=headers,
                timeout=60,
                proxies=proxy)
        except Exception as err:
            raise errors.QueryError(
                "Couldn't check the validity of SRA accession numbers "
                "probably because of network issues. {}.".format(err))
        else:
            if not check_sra_response.ok:
                raise errors.QueryError(
                    "Couldn't check the validity of SRA accession numbers "
                    "probably because of network issues. Status code: {}".
                    format(check_sra_response.status_code))
            else:
                # returned srr is a list
                taxid, srx, srr = _format_sra(
                    s, check_sra_response.content.decode('utf-8'))
                species = _taxid_to_species(taxid)
                srxes = deepcopy(output_SRAs.get(species, {}))
                srxes[srx] = deepcopy(srxes.get(srx, []) + srr)
                output_SRAs[species] = deepcopy(srxes)
                # output_SRAs = {species1: {srx1: [srr...], srx2: [srr...]}, species2: ...}
        time.sleep(5)
    return output_SRAs
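
A hedged usage sketch of the returned structure, reusing the SRX885416 entry from the autocomplete comment in Example #1 (the species string assumes _taxid_to_species resolves taxid 220462 to its scientific name):

# Illustrative only; the exact species strings depend on _taxid_to_species().
sras = check_sra_validity(["SRX885416"])
# sras == {"Leymus arenarius": {"SRX885416": ["SRR1812885"]}}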
Example #3
def qblast(
        program,
        srx,  # only accepts SRX accessions
        query,
        query_from=None,
        query_to=None,
        max_num_seq=500,
        expect=10.0,
        repeat_filter=None,  # filter out low complexity regions
        short_query=None,
        word_size=None,
        job_title=None,
        format_type="XML",
        browser="http://127.0.0.1:4444/wd/hub",
        proxies=(None, None),  # (webdriver_proxy, general_proxy)
        verbose=False,
):
    """BLAST search using the selenium module:
         Some useful parameters:

          - program        megaBlast, blastn, discoMegablast, or tblastn (capital sensitive)
          - sra            Which sra database to search against (srr or srx).
          - sequence       The sequence to search.
          - max_num_seq    The number of hits that NCBI returned.
          - expect         An expect value cutoff.  Default 10.0.
          - repeat_filter  "L" turns on filtering low complexity regions.  Default no filtering.
          - word_size      default: 28 for blastn, 6 for tblastn
          - format_type    "HTML", "Text", "ASN.1", or "XML".  Default "XML".
    """
    # - base url:
    # https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?PAGE_TYPE=BlastSearch&BLAST_SPEC=SRA&DB_GROUP=Exp&
    # 1) PROGRAM = ['blastn', 'tblastn', 'tblastx']
    # 2) BLAST_PROGRAMS = ['megaBlast', 'blastn', 'discoMegablast']
    # e.g.
    # PROGRAM=blastn&BLAST_PROGRAMS=megaBlast&NUM_ORG=1&EQ_MENU=SRX000001
    # PROGRAM=tblastn&NUM_ORG=2&EQ_MENU=SRX000001&EQ_MENU1=SRX000002
    # Step 1 - Submit queries using the selenium module:
    url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?PAGE_TYPE=BlastSearch&BLAST_SPEC=SRA&DB_GROUP=Exp"
    url += _add_eq_menus(srx)
    url += _add_program(program)

    chrome = _setup_chrome_webdriver(browser=browser, proxy=proxies[0])
    time.sleep(4)

    submit_params = [
        # ("QUERY", query),
        # ("QUERY_FROM", query_from),
        # ("QUERY_TO", query_to),
        ("MAX_NUM_SEQ", max_num_seq),
        ("EXPECT", expect),
        ("FILTER", repeat_filter),
        ("SHORT_QUERY_ADJUST", short_query),
        ("WORD_SIZE", word_size),
        ("JOB_TITLE", job_title)
    ]
    for p in submit_params:
        if p[1] is not None:
            url += "&{}={}".format(p[0], p[-1])

    chrome.get(url)
    time.sleep(4)
    chrome.find_element_by_name("QUERY").send_keys(query)
    if query_from is not None and query_to is not None:
        chrome.find_element_by_name("QUERY_FROM").send_keys(query_from)
        chrome.find_element_by_name("QUERY_TO").send_keys(query_to)
    time.sleep(4)
    chrome.find_element_by_class_name('blastbutton').click()
    wait_page = chrome.page_source
    try:
        rid, status, job_title, entrez_query, rtoe, max_num_seq = _parse_qblast_wait_page(
            wait_page)
    except errors.QueryError:
        # In my experience, the first submit may be blocked somehow, so try to submit again:
        time.sleep(4)
        chrome.find_element_by_class_name('blastbutton').click()
        wait_page = chrome.page_source
        rid, status, job_title, entrez_query, rtoe, max_num_seq = _parse_qblast_wait_page(
            wait_page)
    cookies = chrome.get_cookies()
    _previous = time.time()
    chrome.quit()

    # Step 2 - Poll results from NCBI:
    # Actually, all parameters for polling results can be obtained from the wait page.
    # --
    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # We could start with a 10s delay, but even short queries usually take
    # longer than that (at least ~70s including the delay), so start with a
    # 20s delay and thereafter poll once a minute.
    poll_params = [
        ("RID", rid),
        ("JOB_TITLE", job_title),
        ("ENTREZ_QUERY", entrez_query),
        ('MAX_NUM_SEQ', max_num_seq),
        ("CMD", "Get"),
    ]
    poll_params = [p for p in poll_params if p[1] is not None]

    delay = 20  # seconds
    session = requests.Session()
    for c in cookies:
        session.cookies.set(c['name'], c['value'])
    not_done_yet = True
    while not_done_yet:
        current = time.time()
        wait = _previous + delay - current
        if wait > 0:
            time.sleep(wait)
            _previous = current + wait
        else:
            _previous = current
        # After the first poll, do not poll this RID more often than
        # once a minute (see the NCBI usage guidelines above):
        if delay < 60:
            delay = 60

        try:
            poll_response = session.get(
                "https://blast.ncbi.nlm.nih.gov/Blast.cgi",
                params=poll_params,
                headers=headers,
                timeout=120,
                proxies=proxies[-1],
            )
        except Exception as err:
            utils.log(
                "WARNING: Couldn't poll results from NCBI. {}. "
                "But don't panic, we will retry and are almost there.".format(
                    err),
                verbose=verbose,
                attr="debug")
        else:
            if poll_response.ok:
                poll_rid, poll_status, _, _, _, _ = _parse_qblast_wait_page(
                    poll_response.content.decode("utf-8"))
                utils.log("RID: {}, Status: {}.".format(poll_rid, poll_status),
                          verbose, "debug")
                if poll_rid == rid:
                    if poll_status.lower() in ["waiting", "searching"]:
                        continue
                    elif poll_status.lower() == "failed":
                        err_msg = _search_keyword(
                            r'(<p class="error">.+?</p>)',
                            poll_response.content.decode("utf-8"), ">NA<")
                        err_msg = ''.join(
                            re.findall(r'>(.+?)<',
                                       err_msg))  # remove inside links <a></a>
                        raise errors.QueryError(
                            'Retrieving results failed. Error message from NCBI: "{}".'
                            .format(err_msg))
                    elif poll_status.lower() == "ready":
                        poll_params.append(("FORMAT_TYPE", format_type))
                        while not_done_yet:
                            try:
                                poll_response = session.get(
                                    "https://blast.ncbi.nlm.nih.gov/Blast.cgi",
                                    params=poll_params,
                                    headers=headers,
                                    timeout=120,
                                    proxies=proxies[-1])
                            except Exception as err:
                                raise errors.QueryError(
                                    "Although the query was submitted, "
                                    "but the results couldn't be retrieved. {}"
                                    .format(err))
                            else:
                                if poll_response.ok:
                                    poll_format = _search_keyword(
                                        r'<!DOCTYPE ([\w]+?) PUBLIC',
                                        poll_response.content.decode("utf-8"),
                                        "NA")
                                    if poll_format.lower() == "blastoutput":
                                        blastoutput = poll_response.content.decode(
                                            "utf-8")  # XML
                                        not_done_yet = False
                                        break
                                    else:
                                        utils.log(
                                            "WARNING: Although the results are ready, "
                                            "they can't be retrieved somehow. "
                                            "Don't panic, we will retry and are almost there.",
                                            verbose=verbose,
                                            attr="debug")
                                        continue
                                else:
                                    utils.log(
                                        "WARNING: Although the query was submitted, "
                                        "but the results couldn't be retrieved probably because of network issues. "
                                        "Status code: {}.".format(
                                            poll_response.status_code),
                                        verbose=verbose,
                                        attr="debug")
                    else:
                        utils.log(
                            "WARNING: Something wrong while retrieving results from NCBI. "
                            "RID: {}. Status: {}. "
                            "But don't panic, we will retry and are almost there."
                            .format(poll_rid, poll_status),
                            verbose=verbose,
                            attr="debug")
                else:
                    utils.log(
                        "WARNING: The submitted RID ({}) "
                        "is different from the polled one ({}). "
                        "But don't panic, we will try to retrieve results again."
                        .format(rid, poll_rid),
                        verbose=verbose,
                        attr="debug")
            else:
                utils.log(
                    "WARNING: Couldn't get results from NCBI. Status code: {}. "
                    "But don't panic, we will retry and are almost there.".
                    format(poll_response.status_code),
                    verbose=verbose,
                    attr="debug")
    return blastoutput
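
The _add_eq_menus and _add_program helpers used when building the submit URL are not shown. Going only by the URL examples in the comments near the top of qblast(), a sketch could look like the following; the real implementations may differ:

def _add_eq_menus(srx):
    """(Sketch of an assumed helper.) Build the NUM_ORG/EQ_MENU part of the
    submit URL from one SRX or a comma-separated list of SRXes, e.g.
    "SRX000001,SRX000002" -> "&NUM_ORG=2&EQ_MENU=SRX000001&EQ_MENU1=SRX000002"."""
    srxes = [s for s in str(srx).split(",") if s]
    url = "&NUM_ORG={}".format(len(srxes))
    for n, s in enumerate(srxes):
        url += "&EQ_MENU{}={}".format("" if n == 0 else n, s)
    return url


def _add_program(program):
    """(Sketch of an assumed helper.) Map the user-facing program name to the
    PROGRAM/BLAST_PROGRAMS URL parameters, e.g.
    "megaBlast" -> "&PROGRAM=blastn&BLAST_PROGRAMS=megaBlast"."""
    if program in ("megaBlast", "blastn", "discoMegablast"):
        return "&PROGRAM=blastn&BLAST_PROGRAMS={}".format(program)
    return "&PROGRAM={}".format(program)  # e.g. tblastn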
Example #4
def _parse_qblast_wait_page(wait_html):
    """ (PRIVATE)
    Extract the RID, status, job title, Entrez query, RTOE, and MAX_NUM_SEQ from the 'please wait' page.

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.
    """
    rid = _search_keyword(r'<input name="RID".+?value="([\d\w]+?)">',
                          wait_html)  # The most important to proceed
    status_code = _search_keyword(
        r'<input name="SEARCH_DB_STATUS".+?value="(\d+?)"', wait_html, "NA")
    status1 = _search_keyword(r'<td>Status</td><td>(\w+?)</td>', wait_html)
    status2 = _search_keyword(r'Status=(\w+)', wait_html)
    job_title = _search_keyword(r'<input name="JOB_TITLE".+?value="(.+?)"',
                                wait_html)
    entrez_query = _search_keyword(
        r'<input name="ENTREZ_QUERY".+?value="(.+?)"', wait_html)
    rtoe = _search_keyword(r'<input name="RID".+?value="(\d+?)">', wait_html)
    max_num_seq = _search_keyword(
        r'<input name="MAX_NUM_SEQ".+?value="(\d+?)"', wait_html, 500)

    if rid is None:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        # This used to occur inside a <div class="error msInf"> entry.
        #
        # Taken from an error webpage (July 23, 2020, format: HTML):
        # <!-- Do errors this way -->
        # <!--<ul class="msg error"><li class="error"><p class="error"></p></li></ul>-->
        # <ul id="upgMsg" class="msg error"><li id="lpgMsg" class="error">\
        # <p class="error">Non-interactive SRA BLAST searches not supported.</p>\
        # </li></ul>

        err_msg = _search_keyword(r'(<p class="error">.+?</p>)', wait_html)
        if err_msg is None:
            err_msg = _search_keyword(r"Message ID#\d+ Error: (.+?)$",
                                      wait_html)
            if err_msg is None:
                # We didn't recognise the error layout :(
                raise errors.QueryError(
                    "No RID found in the 'please wait' page, "
                    "there was probably an error in your request "
                    "but we could not extract a helpful error message.")
        else:
            err_msg = "".join(re.findall(r'>(.+?)<', err_msg))
        raise errors.QueryError("Error message from NCBI: {}".format(err_msg))
    else:
        # Polling Status can be very tricky:
        status_code_dict = {
            "31": "searching",
            "21": "waiting",
            "43": "ready",
            "63": "failed"
        }  # these are the only status codes encountered so far
        if status1 is not None:
            status = status1.lower()
        elif status2 is not None:
            status = status2.lower()
        else:
            status = status_code_dict.get(status_code, "unknown")
    return rid, status, job_title, entrez_query, rtoe, max_num_seq
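
For orientation, a fabricated wait-page fragment that the regexes above would match (not a real NCBI response), assuming _search_keyword behaves like the sketch after Example #1:

# Purely illustrative input; the field values are made up.
example_wait_html = (
    '<input name="RID" type="hidden" value="ABC123XYZ456">'
    '<input name="RTOE" type="hidden" value="18">'
    '<input name="JOB_TITLE" type="hidden" value="my_job">'
    '<td>Status</td><td>Searching</td>'
)
# _parse_qblast_wait_page(example_wait_html)
# -> ('ABC123XYZ456', 'searching', 'my_job', None, '18', 500)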
Example #5
def query(args, webdriver):
    download_list = {}
    name, seq_chunks = _format_seq(args)
    utils.log(seq_chunks, args['verbose'], 'debug')
    SRAs = {}.fromkeys(args['sra'].strip().split(',')).keys()
    formatted_SRAs = NCBIWWW_selenium.check_sra_validity(SRAs,
                                                         proxy=args["proxy"])
    # formatted_SRAs = {species1: {srx1: [srr...], srx2: [srr...]}, species2: ...}
    interval = 10
    for species, srxes in formatted_SRAs.items():
        for srx, runs in srxes.items():
            srr = ','.join(runs)
            current = 0
            for chunk in seq_chunks:
                current += 1
                utils.processing(current, len(seq_chunks),
                                 "{} - {} ({})".format(species, srx, srr),
                                 "percent")
                r = -1
                err = ''
                while r < int(args['retry']):
                    # Do not contact the server more often than once every 10 seconds:
                    if interval < 10:
                        time.sleep(11 - interval)
                    start_time = time.time()
                    if len(err) > 0:
                        # utils.log("Retrying...", shift="\n")
                        utils.log("Retrying...")
                    try:
                        result = NCBIWWW_selenium.qblast(
                            program=args["program"],
                            srx=srx,
                            query=chunk,
                            max_num_seq=(args["max_num_seq"] //
                                         (len(seq_chunks) * 20) + 1),
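                            # e.g. with args["max_num_seq"] == 500 and 5 chunks,
                            # each call requests 500 // (5 * 20) + 1 == 6 hits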
                            expect=args["evalue"],
                            # format_type='Tabular'
                            # Don't know why the number of returned hits can't be determined when the format is Tabular.
                            # So the XML format is required:
                            format_type='XML',
                            browser=webdriver,
                            proxies=(args["chrome_proxy"], args["proxy"]),
                            verbose=args["verbose"])
                        if args['verbose']:
                            with open(
                                    os.path.join(args['outdir'],
                                                 "{}.xml".format(srx)),
                                    'w') as outf:
                                outf.write(result)
                    except Exception as e:
                        err = str(e)
                        r += 1
                        utils.log("Error msg while querying: {}.".format(err),
                                  shift="\n")
                    else:
                        err = ''
                        break
                if len(err) > 0:
                    raise errors.QueryError(
                        "Couldn't get results from NCBI. Errors above must be investigated."
                    )
                else:
                    result = _parse_xml(result, args)
                    for sra in result.keys():
                        spots = deepcopy(download_list.get(sra, []))
                        spots += result[sra]
                        download_list[sra] = deepcopy(spots)
                    interval = time.time() - start_time
    download_list = _clear_up_list(download_list)
    return name, download_list