def setUp(self):
    """Build a canonical parsed-arguments dict and resolve tool paths.

    Side effects: calls utils.preflight (which may read/write the config
    files listed below) and snapshots $PATH and the working directory so
    tests that mutate them can restore the originals.
    """
    # Mirrors the dict produced by utils.parse_arguments for the command
    # line exercised in test_parse_args:
    # r2g -o OUTPUT -s SRXNNNNNN -q ATGC --cut 80,50 -p blastn --CPU 4 --retry
    self.args = {
        'CPU': 4,
        'browser': None,
        'chrome_proxy': None,
        'cleanup': False,
        'cut': '80,50',
        'docker': False,
        'evalue': 0.001,
        'firefox_proxy': None,
        'max_memory': '4G',
        'max_num_seq': 1000,
        'min_contig_length': 150,
        'outdir': 'OUTPUT',
        'program': 'blastn',
        'proxy': None,
        'query': 'ATGC',
        'retry': float('inf'),
        'sra': 'SRXNNNNNN',
        'stage': 'butterfly',
        'trim': False,
        'verbose': False
    }
    # Candidate config locations checked by preflight: the package-level
    # path.json first, then the per-user ~/.r2g.path.json.
    self.config_files = [
        os.path.abspath(os.path.join(r2g.__path__[0], "path.json")),
        os.path.abspath(os.path.join(os.path.expanduser('~'), ".r2g.path.json"))
    ]
    # Private copy of $PATH; test_check_apps overwrites os.environ['PATH'].
    self.path = deepcopy(os.environ['PATH'])
    self.app_json = utils.preflight(self.args)
    # self.app_json["chromedriver"] = os.environ.get("PRIVATE_WEBDRIVER", "http://127.0.0.1:4444/wd/hub")
    utils.log("app_json is {}".format(self.app_json))
    self.pwd = os.getcwd()
def test_format_seq_2(self):
    """_format_seq on a two-record FASTA file with cut "24,20".

    Both records ("A" and "B") share the same sequence body; the combined
    name must be "A_B" and the fragment list must match the stored fixture
    data/formatted_seq.json.
    """
    # Test 2:
    utils.log("Testing r2g.online.blast _format_seq 2.")
    query_file = tempfile.mkstemp(suffix=".fasta", prefix="r2g-test_tmp_", text=True)[-1]
    # Reuse the header-stripped sequence body for both records:
    body = self.query_fasta.strip().split('\n', 1)[1]
    fasta = ">{}\n{}\n>{}\n{}\n".format("A", body, "B", body)
    with open(query_file, 'w') as outf:
        outf.write(fasta)
    self.args['query'] = query_file
    self.args['cut'] = "24,20"
    name, seq = blast._format_seq(self.args)
    formatted_name = "A_B"
    with open(
            '{}/data/formatted_seq.json'.format(
                os.path.split(os.path.abspath(__file__))[0]), 'r') as inf:
        formatted_seq = json.loads(inf.read().strip())
    if name == formatted_name and seq == formatted_seq:
        assertion = True
    else:
        print(name)
        print(seq)
        assertion = False
    # BUG FIX: shutil.rmtree() only removes directories; called on this temp
    # FILE it failed, ignore_errors=True swallowed the failure, and the file
    # leaked. Use the same cleanup helper as the sibling tests:
    utils.delete_everything(query_file)
    self.assertTrue(assertion)
def test_check_sequences(self):
    """The preflight sequence check must reject an illegal FASTA character."""
    utils.log("Testing r2g.utils.utils _check_sequences")
    bad_fasta = tempfile.mkstemp(suffix=".fasta", prefix="r2g-test_tmp_", text=True)[-1]
    # "!" is not a valid residue character, so preflight should refuse it:
    with open(bad_fasta, 'w') as handle:
        handle.write(">some_gene\nATGC!\n")
    self.args['query'] = bad_fasta
    with self.assertRaises(errors.InputError):
        utils.preflight(self.args)
    utils.delete_everything(bad_fasta)
def test_fastq_dump_error(self):
    """fastq_dump must raise FetchError for non-numeric spot boundaries."""
    utils.log("Raising r2g.online.fetch fastq_dump error.")
    minimal_args = {'query': "ATGC", 'verbose': False, 'stage': 'butterfly'}
    apps = utils.preflight(minimal_args)
    # "X" and "J" are not valid spot numbers:
    with self.assertRaises(errors.FetchError):
        fetch.fastq_dump('SRR1812889', "X", "J", apps)
def test_query(self):
    """End-to-end blast.query against the private webdriver endpoint."""
    utils.log("Testing r2g.online.blast query.")
    ok = False
    try:
        gene, hits = blast.query(self.args, os.environ["PRIVATE_WEBDRIVER"])
    except Exception as err:
        utils.log("Error occurred while testing: {}".format(err))
    else:
        # Expect the fixture gene name and at least one SRR1812889 spot:
        ok = gene == 'some_gene' and len(hits.get('SRR1812889', [])) > 0
    self.assertTrue(ok)
def run(self):
    """Run Trinity synchronously; return (output_dir, log_path) on success.

    Raises errors.AssembleError on a non-zero exit code or any other
    failure. Note that the AssembleError raised for a non-zero exit code is
    itself caught by the broad `except Exception` below and re-wrapped, so
    callers see the combined message. Whatever output was captured is always
    flushed to self.log via the `finally` clause, even on failure.
    """
    utils.log("Trinity cmd: {}".format(' '.join(self.cmd)))
    utils.log("Trinity is running. Output dir: {}".format(self.output))
    utils.log("Trinity log file: {}".format(self.log))
    logs = ""
    try:
        # argv list with shell=False; stderr is merged into stdout so the
        # whole transcript lands in one buffer:
        p = subprocess.run(
            self.cmd,
            shell=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        logs = p.stdout.decode('utf-8')
        if p.returncode != 0:
            if self.args['verbose']:
                print(logs)
            raise errors.AssembleError("Trinity exited {}.".format(
                p.returncode))
        else:
            utils.log("Trinity done.")
            return self.output, self.log
    except Exception as err:
        if self.args['verbose'] and len(logs.strip()) > 0:
            print(logs)
        raise errors.AssembleError(
            "Errors raised when called Trinity. {}. "
            "Please check the Trinity log above.".format(err))
    finally:
        # Persist the captured transcript on both success and failure paths:
        if len(logs.strip()) > 0:
            with open(self.log, 'w') as outf:
                outf.write(logs)
def __call__(self, parser, namespace, values, option_string=None):
    """Run the bundled quick self-test (a full r2g dry run) and exit.

    Builds a command line from quick_test.json, runs it into a temporary
    output directory with a 10-minute timeout, reports the outcome, removes
    the temp dir, and exits with the subprocess's return code (2 on
    timeout, 3 on any other launch error).
    """
    args = utils.file2json(os.path.join(r2g.__path__[0], "quick_test.json"))
    output_dir = tempfile.mkdtemp(prefix="r2g-dryrun_tmp_")
    cmd = [
        self.r2g_script,
        "-q", args['query'],
        "-s", args['sra'],
        "-o", output_dir,
        "-c", args['cut'],
        "-p", args['program'],
        "--verbose",
    ]
    # Use a private webdriver endpoint when one is configured:
    try:
        webdriver_cmd = ["--browser", os.environ["PRIVATE_WEBDRIVER"]]
    except KeyError:
        webdriver_cmd = []
    cmd += webdriver_cmd
    err_msg = ""
    try:
        p = subprocess.run(cmd, shell=False, timeout=600)
    except subprocess.TimeoutExpired:
        # BUG FIX: the original wrote `except ... as err: err += "..."`,
        # which (a) rebound the string accumulator to the exception object,
        # making `+=` with a str a TypeError, and (b) left `err` undefined
        # after the except block (Python deletes the `as` target), so the
        # later `print(err)` raised NameError. Keep the message in a plain
        # string instead of reusing the exception name.
        err_msg = "\nThe quick test is supposed to finish in 10 minutes. Aborted."
        exit_code = 2
    except Exception as exc:
        err_msg = str(exc)
        exit_code = 3
    else:
        exit_code = p.returncode
    if exit_code != 0:
        print(err_msg)
        utils.log(
            "The quick test failed. Please check the error message above. "
            "Make sure the r2g was installed and configured correctly")
    else:
        utils.log(
            "The quick test done. Please feed me something real 😋")
    utils.delete_everything(output_dir)
    sys.exit(exit_code)
def test_parse_fastq_error(self):
    """_parse_fastq must reject malformed records and accept a valid one."""
    utils.log("Raising r2g.online.fetch _parse_fastq error.")
    # Five broken records (bad header, missing "@", short seq line, missing
    # "+" separator, wrong separator) followed by one valid record:
    fake_fastqs = [
        "@a\nATGC\n+\nAAAA\n",
        "a\nATGC\n+\nAAAA\n",
        "@a/1\nATG\n+\nAAAA\n",
        "@a/1\nATGC\n\nAAAA\n",
        "@a/1\nATGC\n?\nAAAA\n",
        "@a/1\nATGC\n+\nAAAA\n",
    ]

    def accepted(record):
        # True when _parse_fastq swallows the record without complaint.
        try:
            fetch._parse_fastq(record)
        except errors.FetchError:
            return False
        return True

    outcomes = [accepted(record) for record in fake_fastqs]
    self.assertEqual(outcomes, [False, False, False, False, False, True])
def _parse_xml(raw_results, args): download_list = {} r = -1 err = "" while r < int(args['retry']): try: results_tree = ET.fromstring(raw_results) except ET.ParseError as e: err = str(e) r += 1 print(raw_results) utils.log( "WARNING: couldn't get results from NCBI due to temporary errors. Retrying...", args['verbose'], 'debug') else: Iterations = results_tree.find('BlastOutput_iterations').findall( 'Iteration') for i in Iterations: hits = i.find('Iteration_hits') for hit in hits: hit = hit.find('Hit_accession').text.strip().split('.') try: sra = hit[0] spot = int(hit[1]) except (ValueError, IndexError): pass else: spots = deepcopy(download_list.get(sra, [])) spots.append(spot) download_list[sra] = deepcopy(spots) err = "" break if len(err) > 0: utils.log( "WARNING: couldn't get results for from NCBI due to temporary errors. " "The fragment was skipped.") if args['verbose']: with open( os.path.join(args['outdir'], "{}.xml".format(args['sra'])), 'w') as outf: outf.write(raw_results) return download_list
def test_format_seq_1(self):
    """_format_seq on a headerless 169-nt query: the name must default to
    "Undefined" and the sequence must be cut into overlapping fragments.
    """
    # Test 1 (total_length = 169, num_frag = 3):
    utils.log("Testing r2g.online.blast _format_seq 1.")
    # Drop the FASTA header line and pad with 29 "A"s to reach 169 nt:
    self.query_fasta = self.query_fasta.strip().split('\n', 1)[1] + 29 * "A"
    self.args['query'] = self.query_fasta
    name, seq = blast._format_seq(self.args)
    formatted_name = "Undefined"
    # NOTE(review): there are no commas between the string literals below,
    # so Python concatenates them and this list holds a SINGLE joined
    # string, not three fragments. Since this test reportedly passes,
    # _format_seq presumably returns a one-element list here — but TODO
    # confirm that commas were not accidentally dropped.
    formatted_seq = [
        ">Undefined_0\nAATCATTCCATTGATTAGACGATGGTTACACTTGGTTCACGTCGTGCGCGTTTCCCGTGTTCCCTCTAGACGTAGAAGTG\n"
        ">Undefined_1\nCTTGGTTCACGTCGTGCGCGTTTCCCGTGTTCCCTCTAGACGTAGAAGTGTTGGACTTTTTTTTTTGGGTGTTGTGCTGC\n"
        ">Undefined_2\nTCCCTCTAGACGTAGAAGTGTTGGACTTTTTTTTTTGGGTGTTGTGCTGCTATAAGCTGCTACTGCTGATTGAGGAAATT"
        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    ]
    if name == formatted_name and seq == formatted_seq:
        assertion = True
    else:
        # Dump the observed values to ease debugging on mismatch:
        print(name)
        print(seq)
        assertion = False
    self.assertTrue(assertion)
def test_parse_xml(self):
    """_parse_xml over three fixture reports: no hits, non-SRA hits, and a
    standard report yielding one spot for SRR1812889.
    """
    utils.log("Testing r2g.online.blast _parse_xml.")
    xml_dir = "{}/data".format(os.path.split(os.path.abspath(__file__))[0])
    xml_files = [
        "{}/no_result.xml".format(xml_dir),
        "{}/other_result.xml".format(xml_dir),
        "{}/standard_result.xml".format(xml_dir),
        # "{}/error_result.xml".format(xml_dir)
    ]
    parsed_results = [{}, {}, {'SRR1812889': [25821753]}]
    assertion = False
    # zip keeps each fixture paired with its expectation (replaces the
    # original range(len(...)) indexing):
    for xml_file, expected in zip(xml_files, parsed_results):
        with open(xml_file, 'r') as inf:
            download_list = blast._parse_xml(inf.read(), self.args)
        if download_list == expected:
            assertion = True
        else:
            assertion = False
            # Typo fixed in the log message ("occured" -> "occurred"):
            utils.log("Error occurred while parsing {}".format(xml_file))
            break
    self.assertTrue(assertion)
def test_fastq_dump(self):
    """fastq_dump must return the exact read pair and log for one known spot."""
    utils.log("Testing r2g.online.fetch fastq_dump.")
    minimal_args = {'query': "ATGC", 'verbose': False, 'stage': 'butterfly'}
    apps = utils.preflight(minimal_args)
    # Expected mate-pair reads for spot 232339 of SRR1812889:
    expected_fastq = {
        '1': '@FCC2U5KACXX:6:1101:9243:74192/1\n'
             'CACGTCGTGCGCGTTTTCCGTGTTCCCTCTAGCAGACCTCAAGGTTTTGGATTTTTTTTTGTGTGCTCAGTGCCAAAGTTGCTGATTGTC\n'
             '+SRR1812889.232339 FCC2U5KACXX:6:1101:9243:74192 length=90\n'
             'BB@FFFFDHHHHHHIJJJJJGHHHGIJIJIHJJJIJJJIJHIIJAHIJIICHHHHHDDDD?BCDDCDDDDCDCDDACCCCDDDDDCDCCD\n',
        '2': '@FCC2U5KACXX:6:1101:9243:74192/2\n'
             'TCCGGGAATCCACAGCAGCTCAGCAATGCGGGATTTTCCACTGCCCGATAAAAACAAGTTCTACTACTGATGATTTTTCACTTTCAGCTA\n'
             '+SRR1812889.232339 FCC2U5KACXX:6:1101:9243:74192 length=90\n'
             'CCCFFFFFHHHHHJJJJJJJIJJIJJIJJJJJIIJJJJIGIIJJJJJHIHHHFFFFDECEEEEDEDDDDDDDEEFEDDDCDDDDDCCDDD\n'
    }
    expected_log = ("SRR1812889 232339-232339:\n"
                    "b'Read 1 spots for SRR1812889\\nWritten 1 spots for SRR1812889\\n'----")
    utils.log("Testing fastq-dump.")
    fetched_fastq, fetched_log = fetch.fastq_dump('SRR1812889', 232339, 232339, apps)
    self.assertEqual((expected_fastq, expected_log), (fetched_fastq, fetched_log))
def test_check_apps(self):
    """Exercise utils.preflight's config-file resolution in two situations.

    Situation 1: tools absent from $PATH but config_files[0] (the package
    path.json) holds valid paths — preflight must return them unchanged.
    Situation 2: config_files[0] is read-only and invalid, config_files[1]
    points at a non-executable Trinity — preflight must fall back to the
    mocked interactive prompts and rewrite config_files[1].
    All environment changes are reverted at the end.
    """
    utils.log("Testing r2g.utils.utils configure files.")
    changing_app_json = deepcopy(self.app_json)
    # SITUATION 1: apps are not in $PATH and config_files[0] is configured.
    os.environ['PATH'] = '/usr/bin'
    os.chmod(self.config_files[0], S_IWUSR | S_IREAD)
    with open(self.config_files[0], 'w') as outf:
        json.dump(self.app_json, outf, indent=4, separators=(',', ': '))
    parsed_app_json = utils.preflight(self.args)
    if parsed_app_json == self.app_json:
        assertion1 = True
    else:
        assertion1 = False
    # SITUATION 2: apps are not in $PATH, config_files[0] is not writable, and both two configs are not configured.
    # Trinity is not found:
    with open(self.config_files[0], 'w') as outf:
        changing_app_json["Trinity"] = "/"
        json.dump(changing_app_json, outf, indent=4, separators=(',', ': '))
    # make the config_files[0] readable only:
    os.chmod(self.config_files[0], S_IREAD | S_IRGRP | S_IROTH)
    # Trinity is not executable:
    with open(self.config_files[1], 'w') as outf:
        changing_app_json["Trinity"] = "{}/data".format(os.path.split(os.path.abspath(__file__))[0])
        json.dump(changing_app_json, outf, indent=4, separators=(',', ': '))
    os.chmod("{}/data/Trinity".format(os.path.split(os.path.abspath(__file__))[0]), S_IREAD | S_IRGRP | S_IROTH)
    # Replace the interactive prompts with mocks that answer "yes" and
    # return the known-good tool paths:
    choose_yes = mock.Mock(return_value=True)
    trinity_dir = mock.Mock(return_value=self.app_json['Trinity'])
    fastq_dump_dir = mock.Mock(return_value=self.app_json['fastq-dump'])
    chromedriver_dir = mock.Mock(return_value=self.app_json['chromedriver'])
    utils._ask_yes_or_no = choose_yes
    utils._input_trinity_dir = trinity_dir
    utils._input_fastq_dump_dir = fastq_dump_dir
    utils._input_webdriver_dir = chromedriver_dir
    parsed_app_json = utils.preflight(self.args)
    # preflight should have persisted the corrected paths to the user config:
    with open(self.config_files[1], 'r') as inf:
        read_app_json = json.load(inf)
    if parsed_app_json == self.app_json and read_app_json == self.app_json:
        assertion2 = True
    else:
        assertion2 = False
    # Restore everything:
    os.environ['PATH'] = deepcopy(self.path)
    os.chmod(self.config_files[0], S_IWUSR | S_IREAD)
    utils.delete_everything(self.config_files[1])
    with open(self.config_files[0], 'w') as outf:
        outf.write("")
    utils.log("assertion 1 is {}".format(assertion1))
    utils.log("assertion 2 is {}".format(assertion2))
    self.assertTrue(assertion1 & assertion2)
def test_query_cut_error_2(self):
    """blast.query must reject a non-numeric --cut value."""
    utils.log("Raising r2g.online.blast query Error 2.")
    endpoint = "http://127.0.0.1:4444/wd/hub"
    # "X,J" cannot be parsed as fragment/overlap lengths:
    self.args["cut"] = "X,J"
    with self.assertRaises(errors.InputError):
        blast.query(self.args, endpoint)
def qblast(
        program,
        srx,  # only accept SRX
        query,
        query_from=None,
        query_to=None,
        max_num_seq=500,
        expect=10.0,
        repeat_filter=None,  # filter out low complexity regions
        short_query=None,
        word_size=None,
        job_title=None,
        format_type="XML",
        browser="http://127.0.0.1:4444/wd/hub",
        proxies=(None, None),  # (webdriver_proxy, general_proxy)
        verbose=False,
):
    """BLAST search using the selenium module:
    Submits the query through a Chrome webdriver, then polls NCBI over
    plain HTTP (reusing the browser's cookies) until the report is ready,
    and returns the report text (XML by default).

    Some useful parameters:
    - program        megaBlast, blastn, discoMegablast, or tblastn (capital sensitive)
    - sra            Which sra database to search against (srr or srx).
    - sequence       The sequence to search.
    - max_num_seq    The number of hits that NCBI returned.
    - expect         An expect value cutoff. Default 10.0.
    - repeat_filter  "L" turns on filtering low complexity regions. Default no filtering.
    - word_size      default: 28 for blastn, 6 for tblastn
    - format_type    "HTML", "Text", "ASN.1", or "XML". Default "XML".

    Raises errors.QueryError when NCBI reports the search failed or the
    ready results cannot be fetched.
    """
    # - base url:
    # https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?PAGE_TYPE=BlastSearch&BLAST_SPEC=SRA&DB_GROUP=Exp&
    # 1) PROGRAM = ['blastn', 'tblastn', 'tblastx']
    # 2) BLAST_PROGRAMS = ['megaBlast', 'blastn', 'discoMegablast']
    # e.g.
    # PROGRAM=blastn&BLAST_PROGRAMS=megaBlast&NUM_ORG=1&EQ_MENU=SRX000001
    # PROGRAM=tblastn&NUM_ORG=2&EQ_MENU=SRX000001&EQ_MENU1=SRX000002
    # Step 1 - Submit queries using the selenium module:
    url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?PAGE_TYPE=BlastSearch&BLAST_SPEC=SRA&DB_GROUP=Exp"
    url += _add_eq_menus(srx)
    url += _add_program(program)
    chrome = _setup_chrome_webdriver(browser=browser, proxy=proxies[0])
    time.sleep(4)
    # Optional search parameters are appended to the URL; the query text
    # itself is typed into the form below instead:
    submit_params = [
        # ("QUERY", query),
        # ("QUERY_FROM", query_from),
        # ("QUERY_TO", query_to),
        ("MAX_NUM_SEQ", max_num_seq),
        ("EXPECT", expect),
        ("FILTER", repeat_filter),
        ("SHORT_QUERY_ADJUST", short_query),
        ("WORD_SIZE", word_size),
        ("JOB_TITLE", job_title)
    ]
    for p in submit_params:
        if p[1] is not None:
            url += "&{}={}".format(p[0], p[-1])
    chrome.get(url)
    time.sleep(4)
    chrome.find_element_by_name("QUERY").send_keys(query)
    if query_from is not None and query_to is not None:
        chrome.find_element_by_name("QUERY_FROM").send_keys(query_from)
        chrome.find_element_by_name("QUERY_TO").send_keys(query_to)
    time.sleep(4)
    chrome.find_element_by_class_name('blastbutton').click()
    wait_page = chrome.page_source
    try:
        rid, status, job_title, entrez_query, rtoe, max_num_seq = _parse_qblast_wait_page(
            wait_page)
    except errors.QueryError:
        # In my experience, the first submit may be blocked somehow, so try to submit again:
        time.sleep(4)
        chrome.find_element_by_class_name('blastbutton').click()
        wait_page = chrome.page_source
        rid, status, job_title, entrez_query, rtoe, max_num_seq = _parse_qblast_wait_page(
            wait_page)
    # Carry the browser session's cookies over to the requests session so
    # polling happens in the same NCBI session; the browser is no longer
    # needed after this point:
    cookies = chrome.get_cookies()
    _previous = time.time()
    chrome.quit()
    # Step 2 - Poll results from NCBI:
    # Actually, all parameters for polling results can be obtained from the wait page.
    # --
    # Poll NCBI until the results are ready.
    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # --
    # Could start with a 10s delay, but expect most short queries
    # will take longer thus at least 70s with delay. Therefore,
    # start with 20s delay, thereafter once a minute.
    poll_params = [
        ("RID", rid),
        ("JOB_TITLE", job_title),
        ("ENTREZ_QUERY", entrez_query),
        ('MAX_NUM_SEQ', max_num_seq),
        ("CMD", "Get"),
    ]
    poll_params = [p for p in poll_params if p[1] is not None]
    delay = 20  # seconds
    session = requests.Session()
    for c in cookies:
        session.cookies.set(c['name'], c['value'])
    not_done_yet = True
    while not_done_yet:
        # Throttle: sleep until at least `delay` seconds after the previous
        # request.
        current = time.time()
        wait = _previous + delay - current
        if wait > 0:
            time.sleep(wait)
            _previous = current + wait
        else:
            _previous = current
        # delay by at least 60 seconds only if running the request against the public NCBI API
        if delay < 60:
            # Wasn't a quick return, must wait at least a minute
            delay = 60
        try:
            poll_response = session.get(
                "https://blast.ncbi.nlm.nih.gov/Blast.cgi",
                params=poll_params,
                headers=headers,
                timeout=120,
                proxies=proxies[-1],
            )
        except Exception as err:
            # Transient network error: log and fall through to the next
            # polling round.
            utils.log(
                "WARNING: Couldn't poll results from NCBI. {}. "
                "But don't panic, we will retry and are almost there.".format(
                    err),
                verbose=verbose,
                attr="debug")
        else:
            if poll_response.ok:
                poll_rid, poll_status, _, _, _, _ = _parse_qblast_wait_page(
                    poll_response.content.decode("utf-8"))
                utils.log("RID: {}, Status: {}.".format(poll_rid, poll_status),
                          verbose, "debug")
                if poll_rid == rid:
                    if poll_status.lower() in ["waiting", "searching"]:
                        continue
                    elif poll_status.lower() == "failed":
                        # Extract NCBI's error paragraph and strip inner tags:
                        err_msg = _search_keyword(
                            r'(<p class="error">.+?</p>)',
                            poll_response.content.decode("utf-8"), ">NA<")
                        err_msg = ''.join(
                            re.findall(r'>(.+?)<',
                                       err_msg))  # remove inside links <a></a>
                        raise errors.QueryError(
                            'Retrieving results failed. Error message from NCBI: "{}".'
                            .format(err_msg))
                    elif poll_status.lower() == "ready":
                        # Inner loop: fetch the report itself, retrying until
                        # the response really is a BLAST document:
                        poll_params.append(("FORMAT_TYPE", format_type))
                        while not_done_yet:
                            try:
                                poll_response = session.get(
                                    "https://blast.ncbi.nlm.nih.gov/Blast.cgi",
                                    params=poll_params,
                                    headers=headers,
                                    timeout=120,
                                    proxies=proxies[-1])
                            except Exception as err:
                                raise errors.QueryError(
                                    "Although the query was submitted, "
                                    "but the results couldn't be retrieved. {}"
                                    .format(err))
                            else:
                                if poll_response.ok:
                                    poll_format = _search_keyword(
                                        r'<!DOCTYPE ([\w]+?) PUBLIC',
                                        poll_response.content.decode("utf-8"),
                                        "NA")
                                    if poll_format.lower() == "blastoutput":
                                        blastoutput = poll_response.content.decode(
                                            "utf-8")  # XML
                                        not_done_yet = False
                                        break
                                    else:
                                        utils.log(
                                            "WARNING: Although the results are ready, "
                                            "they can't be retrieved somehow. "
                                            "Don't panic, we will retry and are almost there.",
                                            verbose=verbose,
                                            attr="debug")
                                        continue
                                else:
                                    utils.log(
                                        "WARNING: Although the query was submitted, "
                                        "but the results couldn't be retrieved probably because of network issues. "
                                        "Status code: {}.".format(
                                            poll_response.status_code),
                                        verbose=verbose,
                                        attr="debug")
                    else:
                        utils.log(
                            "WARNING: Something wrong while retrieving results from NCBI. "
                            "RID: {}. Status: {}. "
                            "But don't panic, we will retry and are almost there."
                            .format(poll_rid, poll_status),
                            verbose=verbose,
                            attr="debug")
                else:
                    utils.log(
                        "WARNING: The submitted RID ({}) "
                        "is different from the polled one ({}). "
                        "But don't panic, we will try to retrieve results again."
                        .format(rid, poll_rid),
                        verbose=verbose,
                        attr="debug")
            else:
                utils.log(
                    "WARNING: Couldn't get results from NCBI. Status code: {}. "
                    "But don't panic, we will retry and are almost there.".
                    format(poll_response.status_code),
                    verbose=verbose,
                    attr="debug")
    return blastoutput
def test_parse_args(self):
    """parse_arguments on a canonical command line must reproduce self.args."""
    utils.log("Testing r2g.utils.utils _parse_args")
    argv = ("r2g -o OUTPUT -s SRXNNNNNN -q ATGC "
            "--cut 80,50 -p blastn --CPU 4 --retry").split()
    self.assertEqual(utils.parse_arguments(argv), self.args)
def query(args, webdriver):
    """BLAST every sequence chunk against every SRX in args['sra'].

    Returns (query_name, download_list) where download_list maps SRA
    accessions to matched spot numbers (cleaned up by _clear_up_list).
    Each qblast call is retried up to args['retry'] times; polite spacing
    of at least ~10 seconds between NCBI submissions is enforced via the
    `interval` bookkeeping. Raises errors.QueryError when a chunk exhausts
    its retries.
    """
    download_list = {}
    name, seq_chunks = _format_seq(args)
    utils.log(seq_chunks, args['verbose'], 'debug')
    # De-duplicate the comma-separated accession list while keeping order:
    SRAs = {}.fromkeys(args['sra'].strip().split(',')).keys()
    formatted_SRAs = NCBIWWW_selenium.check_sra_validity(SRAs, proxy=args["proxy"])
    # formatted_SRAs = {species1: {srx1: [srr...], srx2: [srr...]}, species2: ...}
    interval = 10
    for i in formatted_SRAs.items():
        for j in i[-1].items():
            srx = j[0]
            srr = ','.join(j[-1])
            current = 0
            for chunk in seq_chunks:
                current += 1
                utils.processing(current, len(seq_chunks),
                                 "{} - {} ({})".format(i[0], srx, srr),
                                 "percent")
                r = -1
                err = ''
                while r < int(args['retry']):
                    # Do not contact the server more often than once every 10 seconds:
                    if interval < 10:
                        time.sleep(11 - interval)
                    start_time = time.time()
                    if len(err) > 0:
                        # utils.log("Retrying...", shift="\n")
                        utils.log("Retrying...")
                    try:
                        result = NCBIWWW_selenium.qblast(
                            program=args["program"],
                            srx=srx,
                            query=chunk,
                            # Spread the hit budget across chunks:
                            max_num_seq=(args["max_num_seq"] //
                                         (len(seq_chunks) * 20) + 1),
                            expect=args["evalue"],
                            # format_type='Tabular'
                            # Don't know why the number of returned hits can't be determined when the format is Tabular.
                            # So the XML format is required:
                            format_type='XML',
                            browser=webdriver,
                            proxies=(args["chrome_proxy"], args["proxy"]),
                            verbose=args["verbose"])
                        if args['verbose']:
                            with open(
                                    os.path.join(args['outdir'],
                                                 "{}.xml".format(srx)),
                                    'w') as outf:
                                outf.write(result)
                    except Exception as e:
                        err = str(e)
                        r += 1
                        utils.log("Error msg while querying: {}.".format(err),
                                  shift="\n")
                    else:
                        err = ''
                        break
                if len(err) > 0:
                    raise errors.QueryError(
                        "Couldn't get results from NCBI. Errors above must be investigated."
                    )
                else:
                    # Merge this chunk's spots into the running download list:
                    result = _parse_xml(result, args)
                    for sra in result.keys():
                        spots = deepcopy(download_list.get(sra, []))
                        spots += result[sra]
                        download_list[sra] = deepcopy(spots)
                # Time since the last submission, used to throttle the next one:
                interval = time.time() - start_time
    download_list = _clear_up_list(download_list)
    return name, download_list