def download_waiter(stop_wait):
    """
    Function waits until 'local_fasta' file is downloaded.
    It prints size of downloaded data to console during downloading.
    This function just waits -- it won't bring you the menu :).

    :param stop_wait: event that stays set while downloading is in progress;
        the downloader thread clears it to tell this waiter to stop.
    """
    # Wait until downloading starts
    while not os.path.exists(tmp_fasta):
        if not stop_wait.is_set():
            # Downloader gave up before creating the file -- nothing to watch
            return
        # end if
        sleep(1)
    # end while

    MB_size = 1024**2 # we will divide by it to get megabytes

    # Bug fix: initialize `fsize` so the final report below cannot raise
    # NameError when the loop never runs and getsize() fails afterwards.
    fsize = 0.0

    while stop_wait.is_set():
        # Get size of downloaded data
        fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) # get megabytes
        printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
        sleep(1) # instant updates are not necessary
    # end while

    # Print total size of downloaded file (it can be deleted by this time)
    try:
        fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
    except OSError:
        # We can pass this exception -- we do delete this file if downloading crashes
        # And this function just waits :)
        pass
    # end try
    printlog_info("\r{} - {} MB downloaded ".format(
        getwt(), fsize))
def verify_taxids(taxid_list):
    # Verify TaxIDs passed to prober with the `-g` option.
    #
    # Each TaxID is resolved against the NCBI Taxonomy Browser and the organism
    # name is parsed from the HTML response. The resulting strings are meant to
    # be included into BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified
    #     to be integers during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)>"

    organisms = list()
    if len(taxid_list) == 0:
        return organisms
    # end if

    printlog_info("Verifying TaxIDs:")

    for curr_taxid in taxid_list:
        printn(" {} - ".format(curr_taxid))
        try:
            response_html = lingering_https_get_request(
                "www.ncbi.nlm.nih.gov",
                "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(curr_taxid),
                "taxonomy")
            organism_name = re.search(r"Taxonomy browser \((.+?)\)",
                                      response_html).group(1)
        except AttributeError:
            # re.search returned None: the page has no organism name
            printlog_error("\aError: TaxID not found")
            printlog_error(
                "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
            )
            platf_depend_exit(1)
        except OSError as oserr:
            printlog_error("Something is wrong with connection:")
            printlog_error(str(oserr))
            platf_depend_exit(-2)
        else:
            print(organism_name)
            log_info("{} - {}".format(curr_taxid, organism_name))
            organisms.append("{} (taxid:{})".format(organism_name, curr_taxid))
        # end try
    # end for

    print('-' * 30 + '\n')
    return organisms
def launch_single_thread_binning(fpath_list, binning_func, tax_annot_res_dir,
                                 sens, min_qual, min_qlen, min_pident,
                                 min_coverage, no_trash):
    # Run binning sequentially (single thread) over all files in `fpath_list`,
    # delegating the per-file work to `binning_func`.
    #
    # :param fpath_list: list of paths to files to process;
    # :type fpath_list: list<str>;
    # :param binning_func: function that performs binning of one file;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns a list with one statistics item (whatever `binning_func` returns)
    # per input file, in input order.

    stats_per_file = list()
    total = len(fpath_list)

    for file_num, curr_fpath in enumerate(fpath_list, start=1):
        curr_stats = binning_func(curr_fpath, tax_annot_res_dir, sens,
                                  min_qual, min_qlen, min_pident,
                                  min_coverage, no_trash)
        stats_per_file.append(curr_stats)

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} `{}` is binned."\
            .format(file_num, total, os.path.basename(curr_fpath)))
        printn(" Working...")
    # end for

    return stats_per_file
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet number to send; a 1-element mutable
    #     container -- the code below reads it as `pack_to_send[0]`;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple `(xml_text_or_None, BlastError)`:
    #   (xml_text, BlastError(0)) -- success, XML response ('str');
    #   (None, BlastError(1)) -- job expired / data lost, resubmit packet;
    #   (None, BlastError(2)) -- job failed, split packet and resubmit.

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Width of the status-line tail that must be overwritten on each refresh
    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            # "\033[%dD" is an ANSI escape moving the cursor back, so the dots
            # printed below overwrite the padding spaces in place.
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # Print one progress dot every 10 seconds (6 dots per minute)
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                # Small countdown "ladder" printed purely for visual effect
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # Output directory is recovered from the root logger's file
                # handler path ("tricky trick" -- the log lives in outdir).
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here: none of the known Status values
        # matched, which means the response format is unexpected.
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)

    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files: records are read in batches of `n_thr`,
    # filtered, and written to their destination files under `write_lock`.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param n_thr: batch size -- number of records extracted per iteration;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns tuple (seqs_pass, QL_seqs_fail, align_seqs_fail) of counters.

    # Output directory is recovered from the root logger's file handler path
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0 # counter for sequences, which pass filters
    QL_seqs_fail = 0 # counter for too short or too low-quality sequences
    align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Pick record reader/writer pair matching the input format
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:]) # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name] # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                        # NOTE(review): each iteration overwrites the previous
                        # entry for `read_name`, so only the LAST '&&'-hit's
                        # file receives the record -- confirm this is intended.
                        to_write[read_name] = (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def parse_align_results_xml(xml_text, qual_dict, acc_dict, taxonomy_path):
    # Function parses BLAST xml response and returns tsv lines containing gathered information:
    #   1. Query name.
    #   2. Hit name formatted by 'format_taxonomy_name()' function.
    #   3. Hit accession.
    #   4. Length of query sequence.
    #   5. Length of alignment.
    #   6. Percent of identity.
    #   7. Percent of gaps.
    #   8. E-value.
    #   9. Average quality of a read (if source file is FASTQ).
    #   10. Read accuracy (%) (if source file is FASTQ).
    #
    # :param xml_text: XML text with results of alignment;
    # :type xml_text: str;
    # :param qual_dict: dict, which maps sequence IDs to their quality;
    # :type qual_dict: dict<str: float>;
    # :param acc_dict: dictionary containing accession data of hits;
    #     mutated in place: hit counts are incremented / entries created;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param taxonomy_path: path to DBM file with taxonomy;
    # :type taxonomy_path: str;
    #
    # Returns list<str>.

    result_tsv_lines = list()

    # /=== Parse BLAST XML response ===/

    root = ElementTree.fromstring(xml_text) # get tree instance

    # Iterate over "Iteration" and "Iteration_hits" nodes
    for iter_elem, iter_hit in zip(root.iter("Iteration"),
                                   root.iter("Iteration_hits")):
        # "Iteration" node contains query name information
        query_name = sys.intern(iter_elem.find("Iteration_query-def").text)
        query_len = iter_elem.find("Iteration_query-len").text

        avg_quality = qual_dict[query_name]
        if avg_quality != '-':
            # Phred formula: expected miscall probability from mean quality
            miscall_prop = round(10**(avg_quality / -10), 3)
            accuracy = round(100 * (1 - miscall_prop),
                             2) # expected percent of correctly called bases
            qual_info_to_print = " Average quality of this read is {}, i.e. accuracy is {}%;\n".format(
                avg_quality, accuracy)
        else:
            # If FASTA file is processing, print dashes in quality columns
            avg_quality = "-"
            accuracy = "-" # expected percent of correctly called bases
            qual_info_to_print = ""
        # end if

        # Check if there are any hits
        chck_h = iter_hit.find("Hit")

        if chck_h is None:
            # If there is no hit for current sequence
            print(
                "\n{} -- No significant similarity found;\n Query length - {};"
                .format(query_name, query_len))
            result_tsv_lines.append('\t'.join(
                (query_name, "No significant similarity found", "-", query_len,
                 "-", "-", "-", "-", str(avg_quality), str(accuracy))))
        else:
            # If there are any hits, node "Iteration_hits" contains at least one "Hit" child
            # Get first-best bitscore and iterate over hits that have the same (i.e. the highest) bitscore:
            top_bitscore = next(
                chck_h.find("Hit_hsps").iter("Hsp")).find("Hsp_bit-score").text

            annotations = list()
            hit_accs = list()

            # NOTE: `align_len`, `pident`, `gaps`, `evalue` and the ratios are
            # intentionally read after this loop -- they keep the values from
            # the last top-bitscore hit processed.
            for hit in iter_hit:
                # Find the first HSP
                hsp = next(hit.find("Hit_hsps").iter("Hsp"))

                if hsp.find("Hsp_bit-score").text != top_bitscore:
                    break
                # end if

                # Get full hit name (e.g. "Erwinia amylovora strain S59/5, complete genome")
                hit_def = remove_bad_chars(hit.find("Hit_def").text)
                annotations.append(hit_def)

                curr_acc = sys.intern(hit.find("Hit_accession").text)
                hit_accs.append(curr_acc) # get hit accession

                # Get taxonomy
                find_taxonomy(curr_acc, hit_def, taxonomy_path)

                # Update accession dictionary
                try:
                    acc_dict[curr_acc][1] += 1
                except KeyError:
                    acc_dict[curr_acc] = [hit_def, 1]
                # end try

                align_len = hsp.find("Hsp_align-len").text.strip()
                pident = hsp.find(
                    "Hsp_identity").text # get number of matched nucleotides
                gaps = hsp.find("Hsp_gaps").text # get number of gaps
                evalue = hsp.find("Hsp_evalue").text # get e-value

                pident_ratio = round(float(pident) / int(align_len) * 100, 2)
                gaps_ratio = round(float(gaps) / int(align_len) * 100, 2)
            # end for

            # Divide annotations and accessions with '&&'
            annotations = '&&'.join(annotations)
            hit_accs = '&&'.join(hit_accs)

            print("""\n{} - {}
    Query length - {} nt;
    Identity - {}/{} ({}%); Gaps - {}/{} ({}%);""".format(
                query_name, annotations, query_len, pident, align_len,
                pident_ratio, gaps, align_len, gaps_ratio))

            # Append new tsv line containing recently collected information
            result_tsv_lines.append('\t'.join(
                (query_name, annotations, hit_accs, query_len, align_len,
                 pident, gaps, evalue, str(avg_quality), str(accuracy))))
        # end if
        printn(qual_info_to_print)
    # end for

    return result_tsv_lines
def _reformat_legacy_file(legacy_tax_path):
    # Convert a legacy shelve-based taxonomy file to the new TSV format.
    #
    # The legacy file is opened read-only, each (accession -> taxonomy) record
    # is rewritten as a tab-separated line into `<legacy_tax_path>.tsv`, and the
    # legacy file is then renamed with a `_deprecated` suffix. Exits the program
    # on a corrupted legacy file or on an unexpected record type.
    #
    # :param legacy_tax_path: path to the legacy shelve taxonomy file;
    # :type legacy_tax_path: str;

    import shelve

    # Check if this file is corrupted: opening it read-only is enough to
    # trigger an OSError from the underlying dbm module.
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            # The file was written with gdbm, which is not installed here
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, \
            open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                # Old structured record: flatten to a taxonomy string first
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join(
                    (acc, tax_str)
                )))
            elif isinstance(taxonomy_from_file, str):
                # Already a plain taxonomy string -- write as is
                new_tax_file.write("{}\n".format('\t'.join(
                    (acc, taxonomy_from_file)
                )))
            else:
                # Execution must not reach here: unknown legacy record type
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with
    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    # Rename the legacy file out of the way; failure to rename is non-fatal.
    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.
    # Runs in a worker process: shares `file_counter` (guarded by
    # `counter_lock`) and console output (guarded by `print_lock`) with the
    # sibling workers.
    #
    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of FASTQ of FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script
        # If 'look_around' returns None -- there is no data from previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data[
                "n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"] # result tsv file should be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                # Count FASTA records by counting '>' header lines
                num_seqs = len(
                    tuple(
                        filter(
                            lambda l: True if l.startswith('>') else False,
                            map(fmt_func,
                                how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            # File is already fully processed by a previous run -- only bump
            # the shared progress counter and report.
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value # save to local var and release lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    # Remove this worker's temporary query file (named by PID)
    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
def map_f5reads_2_taxann(f5_fpaths, tsv_taxann_lst, tax_annot_res_dir):
    # Function performs mapping of all reads stored in input FAST5 files
    # to existing TSV files containing taxonomic annotation info.
    #
    # It creates a DBM index file (shelve) mapping each FAST5 path to a dict
    # {tsv_path: [read IDs annotated in that TSV]}.
    #
    # :param f5_fpaths: list of paths to current FAST5 files;
    # :type f5_fpaths: list<str>;
    # :param tsv_taxann_lst: list of paths to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name) # name of directory that will contain indices

    for f5_path in f5_fpaths:
        # File validation:
        # RuntimeError will be raised if FAST5 file is broken.
        try:
            # File existence checking is performed while parsing CL arguments.
            # Therefore, this if-statement will trigger only if f5_path's file
            # is not a valid HDF5 file.
            if not h5py.is_hdf5(f5_path):
                raise RuntimeError(
                    "file is not of HDF5 (i.e. not FAST5) format")
            # end if

            f5_file = h5py.File(f5_path, 'r')

            # Try a single iteration to force h5py to actually read the file
            for _ in f5_file:
                break
            # end for
        except RuntimeError as runterr:
            with print_lock:
                printlog_error_time("Error: FAST5 file is broken")
                printlog_error("Reading the file `{}` failed.".format(
                    os.path.basename(f5_path)))
                printlog_error("Reason: {}".format(str(runterr)))
                printlog_error("Omitting this file...")
                print()
            # end with
            # NOTE(review): `return` aborts the remaining files in
            # `f5_fpaths`, not just the broken one -- confirm intended.
            return
        # end try

        readids_to_seek = list(fast5_readids(f5_file))
        idx_dict = dict() # dictionary for index

        # This saving is needed to compare with 'len(readids_to_seek)'
        # after all TSV will be looked through in order to
        # determine if some reads miss taxonomic annotation.
        len_before = len(readids_to_seek)

        # Iterate over TSV-taxann file
        for tsv_taxann_fpath in tsv_taxann_lst:
            with open(tsv_taxann_fpath, 'r') as taxann_file:

                # Get all read IDs in current TSV (first tab-separated column)
                readids_in_tsv = list(
                    map(lambda l: l.split('\t')[0], taxann_file.readlines()))

                # Iterate over all other reads in current FAST5
                # ('reversed' is necessary because we remove items from list in this loop)
                for readid in reversed(readids_to_seek):
                    fmt_id = fmt_read_id(readid)[1:]
                    if fmt_id in readids_in_tsv:
                        # If not first -- write data to dict (and to index later)
                        try:
                            idx_dict[tsv_taxann_fpath].append(
                                "read_" + fmt_id) # append to existing list
                        except KeyError:
                            idx_dict[tsv_taxann_fpath] = [
                                "read_" + fmt_id
                            ] # create a new list
                        finally:
                            # Either way the read is accounted for -- stop seeking it
                            readids_to_seek.remove(readid)
                        # end try
                    # end if
                # end for
            # end with
            if len(readids_to_seek) == 0:
                break # all reads mapped -- no need to scan remaining TSVs
            # end if
        # end for

        # Save info about reads, for which classification is not found
        # in any of classification files
        if len(readids_to_seek) != 0:
            not_fount_key = 'CLASSIF_NOT_FOUND'
            idx_dict[not_fount_key] = list()
            for readid in readids_to_seek:
                fmt_id = fmt_read_id(readid)[1:]
                idx_dict[not_fount_key].append("read_" + fmt_id)
            # end for
        # end if

        # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation
        # for some reads! And we will write their IDs to 'missing_reads_lst.txt' file.
        if len(readids_to_seek) == len_before:
            with print_lock:
                printlog_error_time(
                    "Error: some reads from FAST5 file not found")
                printlog_error("This FAST5 file: `{}`".format(f5_path))
                printlog_error(
                    "Some reads have not undergone taxonomic annotation.")
                missing_log = "missing_reads_lst.txt"
                printlog_error(
                    "List of missing reads are in following file: `{}`".format(
                        missing_log))
                with open(missing_log, 'w') as missing_logfile:
                    missing_logfile.write(
                        "Missing reads from file `{}`:\n\n".format(f5_path))
                    for readid in readids_to_seek:
                        missing_logfile.write(fmt_read_id(readid) + '\n')
                    # end for
                # Remove the (now inconsistent) index directory before exiting
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                    os.rmdir(index_dirpath)
                except OSError as oserr:
                    printlog_error_time(
                        "Error occured while removing index directory: {}".
                        format(oserr))
                finally:
                    platf_depend_exit(3)
                # end try
            # end with
        # end if

        with write_lock:
            try:
                # Open index files appending to existing data ('c' parameter)
                with open_shelve(os.path.join(index_dirpath, index_name),
                                 'c') as index_f5_2_tsv:
                    # Update index
                    index_f5_2_tsv[f5_path] = idx_dict
                # end with
            except OSError as oserr:
                printlog_error_time(
                    "Error: cannot create index file `{}`".format(
                        os.path.join(index_dirpath, index_name)))
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
        # end with

        sys.stdout.write('\r')
        printlog_info_time("File `{}` is processed.".format(
            os.path.basename(f5_path)))
        printn(" Working...")
"\nWarning! Binning FAST5 files in parallel doesn't give any profit.") print("Number of threads is switched to 1.") n_thr = 1 # end if if len(fast5_list) == 0 and untwist_fast5: print( "\nWarning! No FAST5 file has been given to barapost-binning's input.") print("Therefore, `-u` (`--untwist-fast5`) flag does not make any sense.") print("Ignoring it.\n") untwist_fast5 = False # end if # Make sure that each file meant to be processed has it's directory with TSV result file # generated by prober and barapost. printn("Primary validation...") if not untwist_fast5: for fpath in fast5_list: # Get number of directories in 'tax_annot_res_dir' where results of current FAST5 # baraposting are located. possible_fast5_resdirs_num = len( glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath)))) if possible_fast5_resdirs_num == 1: continue # OK elif possible_fast5_resdirs_num == 0: # there is no such a directory print() printlog_error_time( "Error: classification for following FAST5 file is missing:") printlog_error(" `{}`".format(fpath))
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir, blast_algorithm,
            use_index, db_path):
    # Function performs "few_files"-parallel mode: each input file is split
    # into `n_thr` parts and the parts are BLASTed by a process pool.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of FASTQ of FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script
        # If 'look_around' returns None -- there is no data from previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # result tsv file should be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                # Count FASTA records by counting '>' header lines
                num_seqs = len(
                    tuple(
                        filter(
                            lambda l: True if l.startswith('>') else False,
                            map(fmt_func,
                                how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(
                    os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        # NOTE(review): if num_seqs < n_thr this yields packet_size == 0 --
        # confirm that the packet generators tolerate a zero packet size.
        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            # Bug fix: this was `return`, which aborted the whole loop and
            # silently skipped all remaining input files. `continue` skips
            # only the current (already processed) file, matching the
            # behaviour of `process_paral`.
            continue
        # end if

        # Get number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr,
                       initializer=init_proc_single_file_in_paral,
                       initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file,
                     [(file_part, tsv_res_path, packet_size,
                       tax_annot_res_dir, blast_algorithm, use_index, db_path)
                      for file_part in packet_generator(
                          fq_fa_path, file_part_size, num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    # "Untwisting" means that reads are looked up through the FAST5-to-TSV
    #   index built earlier, so annotation for a read may live in a TSV file
    #   produced for a *different* source file.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns a 3-tuple of counters: (seqs_pass, QL_seqs_fail, align_seqs_fail).

    # Output directory is wherever the root logger writes its log file.
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    # File objects of lazily-opened output files, keyed by destination path.
    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file (None -- trash output disabled)
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file (None -- trash output disabled)
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file
        #   is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        # Touch the file once so a broken file raises here, not mid-binning.
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list(from_f5.keys())  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    # NOTE(review): `readids_to_seek` is never read after being filled below
    #   (the binning loop iterates the index instead) -- looks like dead code;
    #   confirm against callers before removing.
    for read_name in fast5_readids(from_f5):
        # Get rid of "read_"
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index: it maps this FAST5 file to the TSV files that
    #   hold taxonomic annotation for its reads.
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if not f5_path in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():

        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                # `fmt_read_id(...)[1:]` strips the leading '>'/'@' character.
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try to make new index file (press ENTER on corresponding prompt)."
                )
                printlog_error(
                    "Or, if does not work for you, make sure that taxonomic annotation info \
for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Read failed quality/length filter -- goes to QL trash.
                # Get name of result FASTQ file to write this read in
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Read failed identity/coverage filter -- goes to align trash.
                # Get name of result FASTQ file to write this read in
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                # Read passed all filters: copy it into one file per hit.
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of result FASTQ file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for
    # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files (trash paths may be None when no_trash is set)
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Single-thread binning of one FASTQ (or FASTA) file.
    #
    # :param fq_fa_path: path to FASTQ (of FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns a 3-tuple of counters: (seqs_pass, QL_seqs_fail, align_seqs_fail).

    # Binned files are written next to the log file of the root logger.
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    # Counters: passed all filters / failed quality-length / failed alignment.
    seqs_pass = 0
    QL_seqs_fail = 0
    align_seqs_fail = 0

    # Lazily-opened output file objects, keyed by destination path.
    srt_file_dict = dict()

    # Locate taxonomic annotation produced earlier for this input file.
    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Pick the record reader/writer pair matching the input format.
    input_is_fastq = is_fastq(fq_fa_path)
    if input_is_fastq:
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Quality/length filter and its trash destination (None disables trash).
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    QL_trash_fpath = None
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path,
                                            min_qual, min_qlen,)
    # end if

    # Alignment identity/coverage filter and its trash destination.
    align_filter = get_align_filter(min_pident, min_coverage)
    align_trash_fpath = None
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    # end if

    for seq_record in seq_records_generator(fq_fa_path):
        # ID of the sequence, with the leading '>'/'@' stripped.
        read_name = sys.intern(fmt_read_id(seq_record["seq_id"])[1:])

        # Find the annotation line corresponding to this sequence.
        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters and route the record to the proper output file.
        if not QL_filter(vals_to_filter):
            # Too short or too low-quality -- QL trash file.
            QL_seqs_fail += 1
            if QL_trash_fpath not in srt_file_dict:
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath], seq_record)
        elif not align_filter(vals_to_filter):
            # Identity/coverage below threshold -- align trash file.
            align_seqs_fail += 1
            if align_trash_fpath not in srt_file_dict:
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath], seq_record)
        else:
            # Passed both filters: write the record into one binned file per
            #   hit ("&&" separates multiple hits of a single query sequence).
            for hit_name in hit_names.split("&&"):
                binned_file_path = os.path.join(
                    outdir_path,
                    "{}.fast{}".format(hit_name,
                                       'q' if input_is_fastq else 'a'))
                if binned_file_path not in srt_file_dict:
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path], seq_record)
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close every output file that was actually opened
    #   (trash entries may be None when no_trash is set).
    for file_obj in srt_file_dict.values():
        if file_obj is not None:
            file_obj.close()
        # end if
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
# The main goal of multiprocessing is to isolate processes from one another. # # Two situations are available: # 1. Number of threads <= number of files meant to be processed ('many_files'-parallel mode): # Files will be distribured equally among processes. # Processes interact with one another only while printing things to the console # for user's entertainment. # 2. Number of threads > number of files meant to be processed ('few_files'-parallel mode): # Files will be processed one by one. They will be divided into equal blocks, # and these blocks will be distributed among processes. # Processes interact with one another while writing to result file and # while printing things to the console. print() printlog_info_time("Starting classification.") printn(" Working...") if n_thr <= len(fq_fa_list): if n_thr != 1: # Proceed 'many_files'-parallel processing from src.barapost_local_modules.parallel_mult_files import process process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path) else: # Proceed single-thread processing
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file without untwisting: annotation for every read
    #   is expected to be in the TSV file produced for this very FAST5 file.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns a 3-tuple of counters: (seqs_pass, QL_seqs_fail, align_seqs_fail).

    # Binned files are written next to the log file of the root logger.
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    # File objects of lazily-opened output files, keyed by destination path.
    srt_file_dict = dict()

    # Directory holding results of the annotation run for this FAST5 file.
    new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep,
                                       get_checkstr(f5_path)))[0]
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file (None -- trash output disabled)
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path,
                                            min_qual, min_qlen,)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file (None -- trash output disabled)
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file
        #   is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        # Touch the file once so a broken file raises here, not mid-binning.
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    # (the previous `enumerate` index was unused and has been dropped)
    for read_name in fast5_readids(from_f5):

        try:
            # Omit 'read_' in the beginning of FAST5 group's name.
            # FIX: intern the already-sliced ID (was `sys.intern(...)[1:]`,
            #   which interned the unsliced string and then discarded it) --
            #   now consistent with the other binning functions.
            hit_names, *vals_to_filter = resfile_lines[sys.intern(
                fmt_read_id(read_name)[1:])]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(fmt_read_id(read_name)))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n")
            platf_depend_exit(1)
        # end try

        # If read is found in TSV file:
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Get name of result FASTQ file to write this read in
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Get name of result FASTQ file to write this read in.
            # BUGFIX: this condition previously tested `QL_trash_fpath`, so if
            #   a QL-failed read had already opened the QL trash file, the
            #   align trash file was never opened and the lookup below raised
            #   KeyError. Test `align_trash_fpath` -- the key actually used.
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
        else:
            for hit_name in hit_names.split("&&"):  # there can be multiple hits for single query sequence
                # Get name of result FASTQ file to write this read in
                binned_file_path = os.path.join(outdir_path,
                                                "{}.fast5".format(hit_name))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[binned_file_path])
            # end for
            seqs_pass += 1
        # end if
    # end for

    from_f5.close()

    # Close all binned files (trash paths may be None when no_trash is set)
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    #   according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #   (each processed file has it's own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;
    #   NOTE(review): compared against the string "true" below, not a bool.
    #
    # Returns path to created database.
    #
    # Side effects: interactive prompt when the DB directory is not empty;
    #   runs `makeblastdb` (and optionally `makembindex`) via os.system;
    #   gzips the downloaded FASTA file at the end.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory exists, ask the user whether to reuse or rebuild.
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info(" `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )
                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")
                    # Return path to DB located in this directory.
                    # NOTE(review): `next(iter(...))` picks an arbitrary file
                    #   from the directory; assumes all DB files share the
                    #   same '<name>.fasta' stem -- confirm.
                    dbpath = next(iter(os.listdir(db_dir)))
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(
                        ".fasta")[1]  # remove all after '.fasta'
                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new one:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))
                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if
                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for
                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If accession file does not exist and execution has reached here -- everything is OK --
    #   we are building a database from user's files only.
    if len(acc_dict) != 0:
        print()
        print("""Following sequences (and all replicons related to them) will be downloaded from Genbank
for further taxonomic classification on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        # Look up taxonomy for accessions that are not in the file yet.
        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length, hence the pad-and-backspace
            #   trick to keep the progress line clean.
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1,
                                             len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary for not allowing duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assembly made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated y SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated y a5
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)

        # Iterate in reverse so removing items does not skip elements.
        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove these file from list -- they will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assemblies files in multi-fasta file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments to "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them securely with regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers first word (sep. is space) as sequence ID
                        #   and throws an error if there are duplicated IDs.
                        # In order not to allow this duplication we'll create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with '.1'
    #   (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in taxonomy file.
    # Therefore we will prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    # Atomically-ish swap the corrected file in place of the original.
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if
        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        # Prefer the external gzip utility when it was found on PATH.
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta, 'rb') as fasta_file, open_as_gzip(
                local_fasta + ".gz", "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta