def run_status_bar(iteration_func, infpaths):
    # Function prints a status bar while processing fastq file(s).
    # :param iteration_func: function that manipulates fastq data;
    # :param infpaths: collection of paths to input files;
    # :type infpaths: list<str>;

    # Configure initial parameters of the status bar.
    nreads = src.fastq.count_reads(infpaths)
    bar_len = _get_bar_len()
    next_print_num = int(nreads * 0.01)
    inc_num = next_print_num
    i = 0

    sys.stdout.write('{} - [>{}] 0/{} (0%)'.format(getwt(), ' ' * bar_len, nreads))
    sys.stdout.flush()

    # Start processing
    for fastq_records in src.fastq.fastq_generator(infpaths):
        iteration_func(fastq_records)
        i += 1

        # Update the status bar
        if i > next_print_num:
            # Get the new length of the terminal window
            bar_len = _get_bar_len()
            done_ratio = i / nreads
            sys.stdout.write('\r{} - [{}>{}] {}/{} ({}%)'.format(
                getwt(),
                '=' * int(bar_len * done_ratio),
                ' ' * int(bar_len * (1 - done_ratio)),
                i, nreads, int(done_ratio * 100)))
            sys.stdout.flush()
            next_print_num += inc_num
        # end if
    # end for

    # Update the status bar one last time
    bar_len = _get_bar_len()
    done_ratio = i / nreads
    sys.stdout.write('\r{} - [{}{}] {}/{} ({}%)\n'.format(
        getwt(),
        '=' * int(bar_len * done_ratio),
        ' ' * int(bar_len * (1 - done_ratio)),
        i, nreads, int(done_ratio * 100)))
    sys.stdout.flush()
# end def run_status_bar
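
# `_get_bar_len` is referenced above but not defined in this section.
# Below is a minimal sketch of what it might look like, assuming the bar
# should span the terminal width minus room for the surrounding
# "{time} - [...] i/n (p%)" decorations. The 40-column reserve and the
# 10-column floor are illustrative assumptions, not taken from the source.
import shutil

def _get_bar_len():
    # Return the number of columns available for the bar itself.
    # shutil.get_terminal_size falls back to the given default
    # if the terminal size cannot be queried.
    term_width = shutil.get_terminal_size((80, 24)).columns
    return max(10, term_width - 40)
# end def _get_bar_len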
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until the BLAST server accomplishes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by the BLAST server as needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: one-element list holding the current packet (id) number to send;
    # :type pack_to_send: list<int>;
    # :param filename: basename of the current FASTA file;
    # :type filename: str;
    #
    # Returns a tuple: (XML response of type 'str', or None on error; BlastError).

    print()
    print("Requesting current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid, filename, pack_to_send[0],
    ))

    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time("Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # The server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished..."
            .format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url, "BLAST response")

        # if the server asks to wait
        if "Status=WAITING" in resp_content:
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # indicate progress with a dot every 10 seconds
            for i in range(1, 7):
                sleep(10)
                printn("\r{} - The request is being processed. Waiting{}".format(
                    getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if the job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if the job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into the result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url, "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is not None
                outdir_path = os.path.dirname(
                    logging.getLoggerClass().root.handlers[0].baseFilename)  # tricky trick
                response_num = len(tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num + 1`
                txt_hpath = os.path.join(
                    outdir_path, "prober_blast_response_{}.txt".format(response_num + 1))
                # Write the text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # the job has probably failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if

        # Execution should not reach here
        printlog_error_time("Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve the XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url, "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from the last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split the packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
# end def wait_for_align
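
# A minimal sketch (not from the source) of how a caller might drive
# `wait_for_align`, assuming `BlastError` is a simple wrapper exposing its
# numeric code as `err.code`: code 1 means "resend the same packet",
# code 2 means "split the packet and resubmit". `submit_packet` and
# `split_packet` are hypothetical helpers named here for illustration only.
def align_with_retries(packet, pack_to_send, filename):
    while True:
        rid, rtoe = submit_packet(packet)  # hypothetical submission helper
        xml_text, err = wait_for_align(rid, rtoe, pack_to_send, filename)
        if err.code == 0:
            return xml_text  # alignment succeeded
        elif err.code == 2 and len(packet) > 1:
            packet = split_packet(packet)  # hypothetical: shrink the packet
        # end if
        # otherwise just resend the same packet
    # end while
# end def align_with_retries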
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta):
    # Function downloads a set of records from GenBank according to the accessions passed to it.
    # The downloaded FASTA file will be placed in the 'db_dir' directory and named 'local_seq_set.fasta'.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param db_dir: path to the directory in which the downloaded FASTA file will be placed;
    # :type db_dir: str;
    # :param local_fasta: path to the file with reference sequences to be included in the database;
    # :type local_fasta: str;

    # Path to the file with the current chunk (see below: "100 accession numbers...")
    tmp_fasta = os.path.join(db_dir, "tmp.fasta")

    accessions = tuple(set(acc_dict.keys()))
    if len(accessions) == 0:  # just in case
        return
    # end if

    # 100 accession numbers per request in order not to make the URL too long.
    # Download genomes in chunks of 100 sequences.
    max_accnum = 100
    i = 0
    accnum = len(accessions)

    while i < accnum:
        curr_accessions = accessions[i:i + max_accnum]  # slice a chunk
        accs_del_comma = ','.join(curr_accessions)  # accessions must be comma-separated in the URL

        # E-utilities provide a way to download records from GenBank by accessions.
        retrieve_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
                        "db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma))
        log_info("Retrieve URL: `{}`".format(retrieve_url))

        # The GNU wget utility is safer, but it may be absent :)
        wget_util = "wget"
        util_found = False
        for d in os.environ["PATH"].split(os.pathsep):
            if os.path.isdir(d) and wget_util in os.listdir(d):
                util_found = True
                break
            # end if
        # end for

        print()
        printlog_info("{} - Downloading {} reference sequences...".format(
            getwt(), len(curr_accessions)))

        if util_found:
            # If we have wget -- just use it
            wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format(retrieve_url, tmp_fasta)
            pipe = sp_Popen(wget_cmd, shell=True)
            pipe.communicate()
            if pipe.returncode != 0:
                printlog_error_time("Error occurred while downloading reference sequences")
                platf_depend_exit(pipe.returncode)
            # end if
        else:
            # If wget is not available -- download sequences with Python's own tools
            stop_wait = Event()  # a flag that signals the waiter function to stop executing

            def download_waiter(stop_wait):
                """
                Function waits until the 'tmp.fasta' chunk file is downloaded.
                It prints the size of the downloaded data to the console during downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait until downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2  # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get the size of the downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)  # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1)  # instant updates are not necessary
                # end while

                # Print the total size of the downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can pass this exception -- we do delete this file if downloading crashes.
                    # And this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(getwt(), fsize))
            # end def download_waiter

            error = True
            while error:
                try:
                    waiter = Thread(target=download_waiter, args=(stop_wait,))  # create the thread
                    stop_wait.set()  # raise the flag
                    waiter.start()  # start waiting
                    urllib.request.urlretrieve(retrieve_url, tmp_fasta)  # retrieve the FASTA file
                except OSError as err:
                    printlog_error_time("Error occurred while downloading fasta file.")
                    printlog_error(str(err))
                    printlog_error("`barapost-local.py` will try again in 30 seconds")
                    if os.path.exists(tmp_fasta):
                        os.unlink(tmp_fasta)
                    # end if
                    sleep(30)
                else:
                    error = False
                finally:
                    stop_wait.clear()  # lower the flag
                    waiter.join()  # the main thread will wait until the waiter ends its work
                # end try
            # end while
        # end if

        printlog_info_time("Downloading is completed")

        # Write the chunk to the result fasta file
        with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile:
            outfile.write(infile.read())
        # end with

        # Remove the temporary chunk file
        os.unlink(tmp_fasta)

        i += max_accnum  # go to the next chunk
    # end while
# end def retrieve_fastas_by_acc
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database using utilities from the 'blast+' toolkit
    # according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to the current result directory
    #     (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to the file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in the database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from the command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use an index ("true"/"false");
    # :type use_index: str;
    #
    # Returns the path to the created database.

    # Path to the directory in which the database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to the taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If the db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is its content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for

                reply = input("""\nPress ENTER to start classification using the existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>""")

                if reply == "":
                    # Do not build a database, just return the path to it.
                    printlog_info("You have chosen to use the existing database.")
                    # Return the path to the DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # Remove everything after '.fasta'
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(".fasta")[1]
                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(map(
                        lambda f: os.path.join(f, "classification.tsv"),
                        old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty the database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from the taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If the accession file does not exist and execution has reached here -- everything is OK --
    # we are building a database from the user's files only.
    if len(acc_dict) != 0:
        print()
        print("""Following sequences (and all replicons related to them) will be downloaded from Genbank
for further taxonomic classification on your local machine:\n""")
        printlog_info("Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if acc not in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different lengths
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict))
                   + " " * 10 + "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(db_dir, "local_seq_set.fasta")  # path to the downloaded FASTA file

    add_lambda_phage(local_fasta, taxonomy_path)  # add the lambda phage control sequence

    retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to the database
    if len(your_own_fasta_lst) != 0:

        # This variable counts sequences from local files.
        # It is necessary to prevent duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
        assemblies = list()  # this list will contain paths to assembly files (SPAdes or a5)

        # Iterate over a reversed view so that removing items does not skip any
        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                # Get the first line of the file (the first seq ID)
                first_seq_id = fmt_func(fasta_file.readline())
            # end with

            # if we've got a SPAdes assembly
            if re.search(spades_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got an a5 assembly
            if re.search(a5_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file.
        # Find the common prefix of all assembly paths and remove it from assembly names.
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''), assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to the database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments to "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them reliably with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix, '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first word (space-separated) to be
                        # the sequence ID and throws an error if there are duplicated IDs.
                        # To prevent such duplication, we create our own sequence IDs:
                        # 'OWN_SEQ_<NUMBER>' -- and write them at the beginning of FASTA record names.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with a '.1' (or '.2', whatever)
    # terminus by blastn. There is no '.1' terminus in the taxonomy file.
    # Therefore, we prune the accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name)  # remove non-ASCII chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure the command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occurred while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creation started")
        # Configure the command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occurred while creating the database index")
            platf_depend_exit(exit_code)
        # end if
        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip the downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))
    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form the .fasta.gz file 'by hand'
        with open(local_fasta, 'rb') as fasta_file, \
                open_as_gzip(local_fasta + ".gz", "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove the source FASTA file, not the database
    # end if

    return local_fasta
# end def build_local_db