def gzip_outfiles(outdir): # Function gzips all fastq files in directory `outdir`. # # :param outdir: path to outdir; # :type outdir: str; # Get gzipping function gzip_func = _get_gzip_func() print() printlog_info_time('Gzipping output files...') # Get fastq files is_fastq = lambda x: not re.match(r'.+\.f(ast)?q$', x) is None fq_fpaths = filter(is_fastq, glob.iglob(os.path.join(outdir, '*'))) # Gzip them! for fpath in fq_fpaths: try: gzip_func(fpath) except OSError as err: printlog_info('Error: cannot gzip file `{}`: {}.'.format( fpath, err)) platf_depend_exit(1) # end try # end for printlog_info_time('Output files are gzipped.')
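# A minimal sketch of a pure-Python fallback that `_get_gzip_func()` could return
# when no system `gzip` utility is available (the helper name below is hypothetical;
# the real selection logic lives in `_get_gzip_func`):
import gzip
import os
import shutil

def _gzip_with_gzip_module(fpath):
    # Compress `fpath` into `fpath + '.gz'` and remove the uncompressed source
    with open(fpath, 'rb') as plain_file, gzip.open(fpath + '.gz', 'wb') as gz_file:
        shutil.copyfileobj(plain_file, gz_file)
    # end with
    os.unlink(fpath)
# end def _gzip_with_gzip_module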
def main(): args = handle_args() src.filesystem.make_outdir(args['o']) config_logging(args['o'], __version__, __last_update_date__) report_run_params(args) run_task_chain(args) if args['z']: src.compression.gzip_outfiles(args['o']) # end if printlog_info_time('Work is completed.')
def count_reads(fq_fpaths): # Function counts reads in the "forward" fastq file of a pair. # :param fq_fpaths: list of paths to fastq files; # :type fq_fpaths: list<str>; # Returns number of reads in "forward" file # (assuming that there are as many reads in "reverse" file). printlog_info_time('Counting reads...') # Get open function for "forward" file open_func = src.compression.provide_open_funcs(fq_fpaths)[0] # Count reads in "forward" file with open_func(fq_fpaths[0]) as infile: nreads = sum(1 for _ in infile) // 4 # end with printlog_info_time('{} reads.'.format(nreads)) return nreads
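# Equivalent minimal sketch of the counting logic without the project's helpers
# (assumes an uncompressed FASTQ file; a gzipped file would need `gzip.open`
# in text mode instead):
def _count_reads_plain(fq_fpath):
    # Each FASTQ record occupies exactly 4 lines, hence the integer division
    with open(fq_fpath) as infile:
        return sum(1 for _ in infile) // 4
# end def _count_reads_plain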
def add_lambda_phage(local_fasta, taxonomy_path): # Function adds control sequence of nanopore lambda phage DNA-CS # to 'local_fasta'. # # :param local_fasta: path to file with reference sequences to be included in database; # :type local_fasta: str; # :param taxonomy_path: path to taxonomy file; # :type taxonomy_path: str; print() printlog_info_time("Adding lambda phage control sequence...") # sys.path[0] is directory containing the script that was used to invoke the Python interpreter. # We will use it to get path to file with lambda's sequence. lambda_fpath = os.path.join(os.path.dirname(sys.path[0]), "lambda_control", "nanopore_lambda_DNA-CS_control.fasta.gz") # Check file existence if not os.path.exists(lambda_fpath): printlog_error_time( "Error: cannot find lambda phage control sequence: '{}'".format( lambda_fpath)) platf_depend_exit(1) # end if # Read lambda's sequence with open_as_gzip(lambda_fpath, 'rb') as lambda_file: lambda_fasta = lambda_file.read() # end with # Write it to db fasta file with open(local_fasta, 'wb') as db_fasta_file: db_fasta_file.write(lambda_fasta) # end with # Save lambda's taxonomy taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA", "Lambda-phage-nanopore-control") printlog_info_time(" ok")
def launch_single_thread_binning(fpath_list, binning_func, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function launches single-thread binning, performed by function 'binning_func'. # # :param fpath_list: list of paths to files to process; # :type fpath_list: list<str>; # :param binning_func: function that performs binning; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: logical value. True if user does NOT want to output trash files; # :type no_trash: bool; res_stats = list() num_files_total = len(fpath_list) # Bin files in a single thread: for i, fq_fa_path in enumerate(fpath_list): res_stats.append( binning_func(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash)) sys.stdout.write('\r') printlog_info_time("File #{}/{} `{}` is binned."\ .format(i+1, num_files_total, os.path.basename(fq_fa_path))) printn(" Working...") # end for return res_stats
def verify_cl_accessions(accs_to_download, acc_dict): # Function checks existence of GenBank records that correspond to accessions # specified with '-s' option. After checking, the function fills 'acc_dict'. # :param accs_to_download: list of accessions from command line ('-s'); # :type accs_to_download: list<str>; # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>}; # :type acc_dict: dict<str: tuple<str>>; check_connection("https://www.ncbi.nlm.nih.gov/") printlog_info_time("Verifying `-s` accessions...") sys.stdout.write("0/{}".format(len(accs_to_download))) for i, acc in enumerate(accs_to_download): server = "eutils.ncbi.nlm.nih.gov" url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc) text = lingering_https_get_request(server, url, "record's name", acc) name = re.search( r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text) if name is None: printlog_info( "Cannot find GenBank record with accession '{}'".format(acc)) platf_depend_exit(1) else: name = name.group(1) # end if acc_dict[acc] = name sys.stdout.write("\r{}/{}".format(i + 1, len(accs_to_download))) # end for print() printlog_info_time("OK.")
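# Standalone sketch of the e-summary lookup performed above, using only the
# standard library (`lingering_https_get_request` adds logging and retries on
# top of this; the function name below is hypothetical):
import http.client
import re

def _fetch_record_title(acc):
    conn = http.client.HTTPSConnection("eutils.ncbi.nlm.nih.gov")
    conn.request("GET", "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc))
    text = conn.getresponse().read().decode("utf-8")
    conn.close()
    match = re.search(r"<Item Name=\"Title\" Type=\"String\">(.+)</Item>", text)
    return None if match is None else match.group(1)
# end def _fetch_record_title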
def search_for_related_replicons(acc_dict): # Function searches for replicons related to those in 'hits_to_download.tsv' # or specified with '-s' option. # :param acc_dict: dictionary containing accession data of hits; # :type acc_dict: dict<str: tuple<str, str, int>>; print() printlog_info_time("Searching for related replicons...") start_accs = tuple( acc_dict.keys()) # accessions, which were "discovered" by prober for i, acc in enumerate(start_accs): printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc])) # Search for related replicons: try: related_repls = _get_related_replicons(acc, acc_dict) except AttributeError: printlog_error_time( "Parsing error: cannot find replicons related to {}.".format( acc)) printlog_error("Please, contact the developer") platf_depend_exit(1) else: related_repls = _deduplicate_replicons(related_repls, acc) # end try for rel_acc, rel_def in related_repls: acc_dict[rel_acc] = rel_def # end for # end for print() if len(start_accs) != len(acc_dict): # there are some new replicons printlog_info_time("{} related replicons have been found.".\ format(len(acc_dict) - len(start_accs))) else: printlog_info_time("No related replicons found.") # end if print() # end def search_for_related_replicons
def wait_for_align(rid, rtoe, pack_to_send, filename): # Function waits until BLAST server accomplishes the request. # # :param rid: Request ID to wait for; # :type rid: str; # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request; # :type rtoe: int; # :param pack_to_send: current packet (id) number to send; # :type pack_to_send: int; # :param filename: basename of current FASTA file; # :type filename: str; # # Returns XML response ('str'). print() print("Requesting current query status. Request ID: {}".format(rid)) print(" `{}`; Submission #{}".format(filename, pack_to_send[0])) log_info("Requesting current query status.") log_info("Request ID: {}; `{}`; Submission #{}".format( rid, filename, pack_to_send[0], )) # RTOE can be zero at the very beginning of resumption if rtoe > 0: printlog_info_time( "BLAST server estimates that alignment will be accomplished in {} seconds" .format(rtoe)) printlog_info_time( "Waiting for {}+3 (+3 extra) seconds...".format(rtoe)) # Server might be wrong -- we will give it 3 extra seconds sleep(rtoe + 3) printlog_info_time( "{} seconds have passed. Checking if alignment is accomplished...". format(rtoe + 3)) # end if server = "blast.ncbi.nlm.nih.gov" wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid whtspc_len = 6 + len("(requesting)") while True: resp_content = lingering_https_get_request(server, wait_url, "BLAST response") # if server asks to wait if "Status=WAITING" in resp_content: printn("\r{} - The request is being processed. Waiting{}{}".format( getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len)) # indicate every 10 seconds with a dot for i in range(1, 7): sleep(10) printn( "\r{} - The request is being processed. Waiting{}".format( getwt(), '.' * i)) # end for printn("(requesting)") continue elif "Status=FAILED" in resp_content: # if job failed print() printlog_info_time("Job failed\a\n") printlog_info("Resending this packet.") return None, BlastError(2) elif "Status=UNKNOWN" in resp_content: # if job expired print() printlog_info_time("Job expired\a\n") printlog_info("Resending this packet.") return None, BlastError(1) # if results are ready elif "Status=READY" in resp_content: print() printlog_info("Result for query `{}` #{} is ready!".format( filename, pack_to_send[0])) # if there are hits if "ThereAreHits=yes" in resp_content: for i in range(15, 0, -5): print('-' * i) # end for print("-\nRetrieving results...") # Retrieve human-readable text and put it into result directory retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid txt_align_res = lingering_https_get_request( server, retrieve_text_url, "text version of BLAST response") # Count already existing plain text files in outdir: is_txt_response = lambda f: not re.search( r"prober_blast_response_[0-9]+\.txt", f) is None outdir_path = os.path.dirname(logging.getLoggerClass( ).root.handlers[0].baseFilename) # tricky trick response_num = len( tuple(filter(is_txt_response, os.listdir(outdir_path)))) # Current txt response file will have number `response_num+1` txt_hpath = os.path.join( outdir_path, "prober_blast_response_{}.txt".format(response_num + 1)) # Write text result for a human to read with open(txt_hpath, 'w') as txt_file: txt_file.write(txt_align_res) # end with elif "ThereAreHits=no" in resp_content: # if there are no hits printlog_info_time("There are no hits. It happens.\n")
else: # probably, job has failed if execution reaches here print() printlog_info_time("Job failed\a\n") printlog_info("Resending this packet.") return None, BlastError(2) # end if break # end if # Execution should not reach here printlog_error_time( "Fatal error (-122). Please contact the developer.\a\n") platf_depend_exit(-122) # end while # Retrieve XML result retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid xml_text = lingering_https_get_request(server, retrieve_xml_url, "XML BLAST response") if "Bad Gateway" in xml_text: print() printlog_info_time("Bad Gateway. Data from last packet has been lost.") printlog_info("Resending this packet.") return None, BlastError(1) elif "Status=FAILED" in xml_text: print() printlog_info_time("BLAST error: request failed") printlog_info("Resending this packet.") return None, BlastError(2) elif "to start it again" in xml_text: print() printlog_info_time("BLAST error") printlog_info("Resending this packet.") return None, BlastError(2) elif "[blastsrv4.REAL]" in xml_text: blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text) blastsrv4_str = "" if blastsrv4_match is None else ": {}".format( blastsrv4_match.group(1)) printlog_info_time("BLAST server error{}".format(blastsrv4_str)) # Error code 2 indicates that we need to split packet and resubmit return None, BlastError(2) # end if return xml_text, BlastError(0)
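# The polling loop above branches on plain-text status markers in the server
# response. A minimal sketch of that classification, isolated from the waiting
# and result-retrieval logic (the helper name is hypothetical):
def _classify_blast_status(resp_content):
    if "Status=WAITING" in resp_content:
        return "waiting"    # keep polling
    elif "Status=FAILED" in resp_content:
        return "failed"     # resend the packet (BlastError(2))
    elif "Status=UNKNOWN" in resp_content:
        return "expired"    # resend the packet (BlastError(1))
    elif "Status=READY" in resp_content:
        return "ready"      # retrieve text and XML results
    return "unexpected"     # treated as a failure
# end def _classify_blast_status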
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function for parallel binning FASTQ and FASTA files. # Actually bins multiple files. # # :param fq_fa_lst: lsit of paths to FASTQ (of FASTA) file meant to be processed; # :type fq_fa_lst: list<str>; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: loical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname( logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage for fq_fa_path in fq_fa_lst: new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir) tsv_res_fpath = get_res_tsv_fpath(new_dpath) taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path) # Configure path to trash file if is_fastq(fq_fa_path): seq_records_generator = fastq_records write_fun = write_fastq_record else: seq_records_generator = fasta_records write_fun = write_fasta_record # end if # Make filter for quality and length QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath( fq_fa_path, outdir_path, min_qual, min_qlen, ) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # Create an iterator that will yield records seq_records_iterator = iter(seq_records_generator(fq_fa_path)) # Dict for storing batches of sequences meant to be written to output files: to_write = dict() stop = False # for outer while-loop while not stop: # Extract batch of records of 'n_thr' size and find their destination paths: for _ in range(n_thr): try: fastqa_rec = next(seq_records_iterator) except StopIteration: stop = True # for outer while-loop break # end try read_name = sys.intern(fmt_read_id( fastqa_rec["seq_id"])[1:]) # get ID of the sequence try: hit_names, *vals_to_filter = resfile_lines[ read_name] # find hit corresponding to this sequence except KeyError: printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\ .format(read_name)) printlog_error("This TSV file: `{}`".format(tsv_res_fpath)) printlog_error( "Make sure that this read has been already processed by \ `barapost-prober.py` and `barapost-local.py`.") platf_depend_exit(1) # end try # If read is found in TSV file: if not QL_filter(vals_to_filter): # Place this sequence to QL trash file to_write[read_name] = (fastqa_rec, QL_trash_fpath) QL_seqs_fail += 1 elif not align_filter(vals_to_filter): # Place this sequence to QL trash file to_write[read_name] = (fastqa_rec, align_trash_fpath) 
align_seqs_fail += 1 else: for hit_name in hit_names.split("&&"): # Get name of result FASTQ file to write this read in binned_file_path = os.path.join( outdir_path, "{}.fast{}".format( hit_name, 'q' if is_fastq(fq_fa_path) else 'a')) to_write[read_name] = (fastqa_rec, binned_file_path) # end for seqs_pass += 1 # end if # end for # Write batch of records to output files: with write_lock: for record, fpath in to_write.values(): write_fun(fpath, record) # end for # end with to_write.clear() # end while with write_lock: # Write the rest of 'uneven' data to output files: if len(to_write) != 0: for record, fpath in to_write.values(): write_fun(fpath, record) # end for # end if sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format( os.path.basename(fq_fa_path))) printn(" Working...") # end with # end for return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def send_request(request, pack_to_send, packet_size, packet_mode, filename, tmp_fpath): # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi" # and then waits for satisfaction of the request and retrieves response text. # # :param request: request data (it is a dict that `configure_request()` function returns); # :type request: dict<dict>; # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now; # :type pack_to_send: int; # :param packet_size: number of sequences in the packet; # :type packet_size: int; # # Returns XML text of type 'str' with BLAST response. payload = request["payload"] headers = request["headers"] server = "blast.ncbi.nlm.nih.gov" url = "/blast/Blast.cgi" error = True while error: try: conn = http.client.HTTPSConnection(server) # create a connection conn.request("POST", url, payload, headers) # send the request response = conn.getresponse() # get the response response_text = str(response.read(), "utf-8") # get response text except OSError as oserr: printlog_info_time( "`https://blast.ncbi.nlm.nih.gov` is not available.") printlog_info(str(oserr)) printlog_info( "barapost will try to connect again in 30 seconds...\n") sleep(30) # if no exception occurred else: error = False # end try # end while try: rid = re.search(r"RID = (.+)", response_text).group(1) # get Request ID rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group( 1)) # get time to wait provided by the NCBI server except AttributeError: printlog_error_time("It seems that NCBI has denied your request.") printlog_error("Response is in file `request_denial_response.html`") with open("request_denial_response.html", 'w') as den_file: den_file.write(response_text) # end with platf_depend_exit(1) finally: conn.close() # end try # Save temporary data with open(tmp_fpath, 'w') as tmpfile: tmpfile.write("Request_ID: {}\n".format(rid)) tmpfile.write("Packet_size: {}\n".format(packet_size)) tmpfile.write("Packet_mode: {}".format(packet_mode)) # end with # Wait for results of alignment return wait_for_align(rid, rtoe, pack_to_send, filename)
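# Isolated sketch of the RID/RTOE parsing done above. It raises AttributeError
# on a denial page (no "RID =" line), which is exactly what the
# `except AttributeError` branch catches (the helper name is hypothetical):
import re

def _parse_rid_rtoe(response_text):
    rid = re.search(r"RID = (.+)", response_text).group(1)
    rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(1))
    return rid, rtoe
# end def _parse_rid_rtoe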
def map_f5reads_2_taxann(f5_fpaths, tsv_taxann_lst, tax_annot_res_dir): # Function perform mapping of all reads stored in input FAST5 files # to existing TSV files containing taxonomic annotation info. # # It creates an DBM index file. # # :param f5_fpaths: list of paths to current FAST5 file; # :type f5_fpaths: list<str>; # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation; # :type tsv_taxann_lst: list<str>; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; index_dirpath = os.path.join( tax_annot_res_dir, index_name) # name of directory that will contain indicies for f5_path in f5_fpaths: # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existance checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError( "file is not of HDF5 (i.e. not FAST5) format") # end if f5_file = h5py.File(f5_path, 'r') for _ in f5_file: break # end for except RuntimeError as runterr: with print_lock: printlog_error_time("Error: FAST5 file is broken") printlog_error("Reading the file `{}` failed.".format( os.path.basename(f5_path))) printlog_error("Reason: {}".format(str(runterr))) printlog_error("Omitting this file...") print() # end with return # end try readids_to_seek = list(fast5_readids(f5_file)) idx_dict = dict() # dictionary for index # This saving is needed to compare with 'len(readids_to_seek)' # after all TSV will be looked through in order to # determine if some reads miss taxonomic annotation. len_before = len(readids_to_seek) # Iterate over TSV-taxann file for tsv_taxann_fpath in tsv_taxann_lst: with open(tsv_taxann_fpath, 'r') as taxann_file: # Get all read IDs in current TSV readids_in_tsv = list( map(lambda l: l.split('\t')[0], taxann_file.readlines())) # Iterate over all other reads in current FAST5 # ('reversed' is necessary because we remove items from list in this loop) for readid in reversed(readids_to_seek): fmt_id = fmt_read_id(readid)[1:] if fmt_id in readids_in_tsv: # If not first -- write data to dict (and to index later) try: idx_dict[tsv_taxann_fpath].append( "read_" + fmt_id) # append to existing list except KeyError: idx_dict[tsv_taxann_fpath] = [ "read_" + fmt_id ] # create a new list finally: readids_to_seek.remove(readid) # end try # end if # end for # end with if len(readids_to_seek) == 0: break # end if # end for # Save info about reads, for which classification if not found # in any of classification files if len(readids_to_seek) != 0: not_fount_key = 'CLASSIF_NOT_FOUND' idx_dict[not_fount_key] = list() for readid in readids_to_seek: fmt_id = fmt_read_id(readid)[1:] idx_dict[not_fount_key].append("read_" + fmt_id) # end for # end if # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation # for some reads! And we will write their IDs to 'missing_reads_lst.txt' file. 
if len(readids_to_seek) == len_before: with print_lock: printlog_error_time( "Error: some reads from FAST5 file not found") printlog_error("This FAST5 file: `{}`".format(f5_path)) printlog_error( "Some reads have not undergone taxonomic annotation.") missing_log = "missing_reads_lst.txt" printlog_error( "List of missing reads are in following file: `{}`".format( missing_log)) with open(missing_log, 'w') as missing_logfile: missing_logfile.write( "Missing reads from file `{}`:\n\n".format(f5_path)) for readid in readids_to_seek: missing_logfile.write(fmt_read_id(readid) + '\n') # end for try: for path in glob(os.path.join(index_dirpath, '*')): os.unlink(path) # end for os.rmdir(index_dirpath) except OSError as oserr: printlog_error_time( "Error occured while removing index directory: {}". format(oserr)) finally: platf_depend_exit(3) # end try # end with # end if with write_lock: try: # Open index files appending to existing data ('c' parameter) with open_shelve(os.path.join(index_dirpath, index_name), 'c') as index_f5_2_tsv: # Update index index_f5_2_tsv[f5_path] = idx_dict # end with except OSError as oserr: printlog_error_time( "Error: cannot create index file `{}`".format( os.path.join(index_dirpath, index_name))) printlog_error(str(oserr)) platf_depend_exit(1) # end try # end with sys.stdout.write('\r') printlog_info_time("File `{}` is processed.".format( os.path.basename(f5_path))) printn(" Working...")
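# Sketch of the index layout written above (the paths are hypothetical examples).
# The shelve is keyed by FAST5 file path; each value maps a TSV annotation file
# to the reads from that FAST5 file that were annotated in it, and the special
# 'CLASSIF_NOT_FOUND' key collects reads that have no annotation at all:
#
#   index_f5_2_tsv['/data/run1/batch_0.fast5'] = {
#       '/outdir/batch_0/classification.tsv': ['read_<id_1>', 'read_<id_2>'],
#       'CLASSIF_NOT_FOUND': ['read_<id_3>'],
#   }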
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path, nfiles): # Function performs 'many_files'-parallel mode of barapost-local.py. # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed; # :type fq_fa_list: list<str>; # :param packet_size: number of sequences processed by blast in a single launching; # :type packet_size: int; # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation; # :type tax_annot_res_dir: str; # :param blast_algorithm: blast algorithm to use; # :type blast_algorithm: str; # :param use_index: logic value indicationg whether to use indes; # :type use_index: bool; # :param db_path: path to database; # :type db_path: str; # :param nfiles: total number of files; # :type nfiles: int; queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp") # Iterate over source FASTQ and FASTA files for fq_fa_path in fq_fa_list: # Create the result directory with the name of FASTQ of FASTA file being processed: new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir) # "hname" means human readable name (i.e. without file path and extention) infile_hname = os.path.basename(fq_fa_path) infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1) # Look around and ckeck if there are results of previous runs of this script # If 'look_around' is None -- there is no data from previous run previous_data = look_around(new_dpath, fq_fa_path) if previous_data is None: # If there is no data from previous run num_done_seqs = 0 # number of successfully processed sequences tsv_res_path = os.path.join( new_dpath, "classification.tsv") # form result tsv file path else: # if there is data from previous run num_done_seqs = previous_data[ "n_done_reads"] # get number of successfully processed sequences tsv_res_path = previous_data[ "tsv_respath"] # result tsv file sholud be the same as during previous run # end if how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)] if is_fastq(fq_fa_path): packet_generator = fastq_packets num_seqs = sum( 1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record else: packet_generator = fasta_packets try: num_seqs = len( tuple( filter( lambda l: True if l.startswith('>') else False, map(fmt_func, how_to_open(fq_fa_path).readlines())))) except UnicodeDecodeError as err: with print_lock: print() printlog_warning("Warning: current file is broken: {}."\ .format(str(err))) printlog_warning("File: `{}`".format( os.path.abspath(fq_fa_path))) printlog_warning("This file will not be processed.") continue # end with # end try # end if if num_seqs == num_done_seqs: with counter_lock: file_counter.value += 1 i = file_counter.value # save to local var and release lock # end with with print_lock: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\ format(i, nfiles, fq_fa_path)) printlog_info("Omitting it.") printn("Working...") # end with continue # end if for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs): # Blast the packet align_xml_text = launch_blastn(packet["fasta"], blast_algorithm, use_index, queries_tmp_dir, db_path) # Cnfigure result TSV lines result_tsv_lines = parse_align_results_xml(align_xml_text, packet["qual"]) # Write the result to tsv write_classification(result_tsv_lines, tsv_res_path) # end for with counter_lock: file_counter.value += 1 i = file_counter.value # save to local var and release lock # end with 
with print_lock: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) is processed.".\ format(i, nfiles, os.path.basename(fq_fa_path))) printn("Working...") # end with # end for query_fpath = os.path.join(queries_tmp_dir, "query{}_tmp.fasta".format(os.getpid())) remove_tmp_files(query_fpath)
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path): # Function preforms "few_files"-parallel mode. # # :param fq_fa_list: list of paths to files meant to be processed; # :type fq_fa_list: list<str>; # :param n_thr: number of threads to launch; # :type n_thr: int; # :param packet_size: number of sequences processed by blast in a single launching; # :type packet_size: int; # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation; # :type tax_annot_res_dir: str; # :param blast_algorithm: blast algorithm to use; # :type blast_algorithm: str; # :param use_index: logic value indicationg whether to use indes; # :type use_index: bool; # :param db_path: path to database; # :type db_path: str; nfiles = len(fq_fa_list) for i, fq_fa_path in enumerate(fq_fa_list): # Create the result directory with the name of FASTQ of FASTA file being processed: new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir) # "hname" means human readable name (i.e. without file path and extention) infile_hname = os.path.basename(fq_fa_path) infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1) # Look around and ckeck if there are results of previous runs of this script # If 'look_around' is None -- there is no data from previous run previous_data = look_around(new_dpath, fq_fa_path) if previous_data is None: # If there is no data from previous run num_done_seqs = 0 # number of successfully processed sequences tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path else: # if there is data from previous run num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences tsv_res_path = previous_data["tsv_respath"] # result tsv file sholud be the same as during previous run # end if how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ] fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ] if is_fastq(fq_fa_path): packet_generator = fastq_packets num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record else: packet_generator = fasta_packets try: num_seqs = len(tuple(filter(lambda l: True if l.startswith('>') else False, map(fmt_func, how_to_open(fq_fa_path).readlines())))) except UnicodeDecodeError as err: print() printlog_warning("Warning: current file is broken: {}."\ .format(str(err))) printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path))) printlog_warning("This file will not be processed.") continue # end try # end if packet_size = min(packet_size, num_seqs // n_thr) if num_seqs == num_done_seqs: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\ .format(i+1, nfiles, fq_fa_path)) printlog_info("Omitting it.") printn("Working...") return # end if # Get number of seqeunces to pass to each thread file_part_size = num_seqs // n_thr if num_seqs % n_thr != 0: file_part_size += 1 # end if pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral, initargs=(mp.Lock(), mp.Lock(),)) pool.starmap(process_part_of_file, [(file_part, tsv_res_path, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path) for file_part in packet_generator(fq_fa_path, file_part_size, num_done_seqs)]) # Reaping zombies pool.close() pool.join() sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) is processed.".\ format(i+1, nfiles, os.path.basename(fq_fa_path))) printn("Working...") # end for
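# The chunking above is a plain ceiling division: e.g. 1003 sequences split
# among 4 threads gives file parts of 251 sequences each (the last part being
# smaller). An equivalent one-liner, shown only for illustration:
#
#   file_part_size = -(-num_seqs // n_thr)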
# Check if there is legacy taxonomy file and, if so, reformat it to new (TSV) format legacy_taxonomy_handling.check_deprecated_taxonomy(tax_annot_res_dir) if n_thr != 1: from src.spread_files_equally import spread_files_equally # end if # |=== Proceed "FAST5-untwisting" if it is enabled ===| printlog_info('-' * 30) print() if untwist_fast5 and not use_old_index: printlog_info_time("Untwisting started.") printn(" Working...") from src.binning_modules.binning_spec import get_tsv_taxann_lst tsv_taxann_lst = get_tsv_taxann_lst(tax_annot_res_dir) if n_thr == 1: for f5_path in fast5_list: utw_module.map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir) sys.stdout.write('\r') printlog_info_time("File `{}` is processed.".format( os.path.basename(f5_path))) printn(" Working...") # end for else:
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst, accs_to_download, use_index): # Function creates a database with utilities from 'blast+' toolkit # according to acc_dict and your_own_fasta_lst. # # :param tax_annot_res_dir: path to current result directory # (each processed file has it's own result directory); # :type tax_annot_res_dir: str; # :param acc_fpath: path to file "hits_to_download.tsv"; # :type acc_fpath: str; # :param your_own_fasta_lst: list of user's fasta files to be included in database; # :type your_own_fasta_lst: list<str>; # :param accs_to_download: list of accessions from command line ('-s'); # :type accs_to_download: list<str>; # :param use_index: whether to use index; # :type use_index: str; # Returns path to created database. # Path to directory in which database will be placed db_dir = os.path.join(tax_annot_res_dir, "local_database") # Path to DBM taxonomy file taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") try: os.makedirs(db_dir) except OSError: #If this directory exists while True: if len(os.listdir(db_dir)) == 0: # If db directory is empty -- break and build a database break else: print() printlog_info("Database directory is not empty:") printlog_info(" `{}`".format(os.path.abspath(db_dir))) printlog_info("Here is it's content:") for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))): printlog_info(" {}. `{}`".format(i + 1, fname)) # end for reply = input( """\nPress ENTER to start classification using existing database. Enter 'r' to remove all files in this directory and create the database from the beginning:>>""" ) if reply == "": # Do not build a database, just return path to it. printlog_info("You have chosen to use extant database.") # Return path to DB located in this directory dbpath = next(iter(os.listdir(db_dir))) dbpath = dbpath.partition(".fasta")[0] + dbpath.partition( ".fasta")[1] # remove all after '.fasta' return os.path.join(db_dir, dbpath) elif reply == 'r': printlog_info("You have chosen to rebuild the database.") # Rename old classification files and write actual data to new one: old_classif_dirs = filter( lambda d: os.path.exists( os.path.join(d, "classification.tsv")), glob(os.path.join(tax_annot_res_dir, "*"))) old_classif_files = tuple( map(lambda f: os.path.join(f, "classification.tsv"), old_classif_dirs)) if len(old_classif_files) > 0: print() printlog_info("Renaming old classification files:") for classif_file in old_classif_files: rename_file_verbosely(classif_file) # end for # end if # Empty database directory for file in glob("{}{}*".format(db_dir, os.sep)): os.unlink(file) # end for # Break from the loop in order to build a database break else: print("Invalid reply: `{}`\n".format(reply)) continue # end if # end if # end while # end try # It is a dictionary of accessions and record names. # Accessions are keys, record names are values. acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download) if len(accs_to_download) != 0: verify_cl_accessions(accs_to_download, acc_dict) # end if # Retrieve already existing taxonomy data from taxonomy file tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path) # If accession file does not exist and execution has reached here -- everything is OK -- # we are building a database from user's files only. 
if len(acc_dict) != 0: print() print("""Following sequences (and all replicons related to them) will be downloaded from Genbank for further taxonomic classification on your local machine:\n""") printlog_info( "Following sequences (and all replicons related to them) \ will be downloaded from Genbank for further taxonomic classification \ on your local machine:") for i, acc in enumerate(acc_dict.keys()): printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc])) # end for search_for_related_replicons(acc_dict) printlog_info_time("Completing taxonomy file...") for i, acc in enumerate(acc_dict.keys()): if not acc in tax_exist_accs: taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path) # end if # Accessions can be of different length printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict)) + " " * 10 + "\b" * 10) # end for print() printlog_info_time("Taxonomy file is consistent.") # end if local_fasta = os.path.join( db_dir, "local_seq_set.fasta") # path to downloaded FASTA file add_lambda_phage(local_fasta, taxonomy_path) # add lambda phage control sequence retrieve_fastas_by_acc( acc_dict, db_dir, local_fasta) # download main fasta data from GenBank # Add 'your own' fasta files to database if not len(your_own_fasta_lst) == 0: # This variable counts sequences from local files. # It is necessary for not allowing duplicated accessions. own_seq_counter = 0 # Check if these files are assembly made by SPAdes or a5 spades_patt = r">NODE_[0-9]+" # this pattern will match sequence IDs generated y SPAdes a5_patt = r">scaffold_[0-9]+" # this pattern will match sequence IDs generated y a5 assemblies = list( ) # this list will contain paths to assembly files (SPAdes or a5) for own_fasta_path in reversed(your_own_fasta_lst): how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: first_seq_id = fmt_func(fasta_file.readline( )) # get the first line in file (the first seq ID) # end with # if we've got SPAdes assembly if not re.search(spades_patt, first_seq_id) is None: assemblies.append(own_fasta_path) # Remove these file from list -- they will be processed in a specific way your_own_fasta_lst.remove(own_fasta_path) continue # end if # if we've got a5 assembly if not re.search(a5_patt, first_seq_id) is None: assemblies.append(own_fasta_path) your_own_fasta_lst.remove(own_fasta_path) continue # end if # end for # Include assemblies files in multi-fasta file # Find common prefix of all assembly paths and remove it from assembly names if len(assemblies) > 1: assemblies_formatted = tuple( map(lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies)) common_prefix = find_common_prefix(assemblies_formatted) assemblies_formatted = tuple( map(lambda f: f.replace(common_prefix, ''), assemblies_formatted)) elif len(assemblies) > 0: common_prefix = '' assemblies_formatted = tuple(map(os.path.basename, assemblies)) # end if # Add assembled sequences to database with open(local_fasta, 'a') as fasta_db: for i, assm_path in enumerate(assemblies): printlog_info("Adding `{}` to database...".format( os.path.basename(assm_path))) assm_name_fmt = assemblies_formatted[i] how_to_open = OPEN_FUNCS[is_gzipped(assm_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)] with how_to_open(assm_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # You can find comments to "OWN_SEQ..." below. 
# Paths will be written to seq IDs in following way: # some-happy-path.fastq-- # in order to retrieve them securely with regex later. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) own_def = "{}--".format( assm_name_fmt.replace(common_prefix, '')) + line[1:] own_def = remove_bad_chars(own_def) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, own_def) line = ">" + "{} {}".format(own_acc, own_def) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with with open(local_fasta, 'a') as fasta_db: for own_fasta_path in your_own_fasta_lst: printlog_info("Adding `{}` to database...".format( os.path.basename(own_fasta_path))) how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # 'makeblastdb' considers first word (sep. is space) as sequence ID # and throws an error if there are duplicated IDs. # In order not to allow this duplication we'll create our own sequence IDs: # 'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, line[1:]) line = ">" + own_acc + ' ' + remove_bad_chars( line[1:]) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with # end if # 'lcl|ACCESSION...' entries can be given with '.1' # (or '.2', whatever) terminus by blastn. # There is no '.1' terminus in taxonomy file. # Therefore we will prune accessions in advance. print() printn("{} - Formatting accessions...".format(getwt())) log_info("Formatting accessions...") corrected_path = os.path.join(db_dir, "corrected_seqs.fasta") with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file: for line in source_file: if line.startswith('>'): line = line.strip() acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2]) acc = acc.partition('.')[0] seq_name = remove_bad_chars(seq_name) seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name) # remove non-ascii chars line = ' '.join((acc, seq_name)) + '\n' # end if dest_file.write(line) # end for # end with os.unlink(local_fasta) os.rename(corrected_path, local_fasta) sys.stdout.write("\r{} - Formatting accessions... 
ok".format(getwt())) log_info("Formatting accessions done.") # Configure command line make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format( local_fasta) exit_code = os.system(make_db_cmd) # make a blast-format database if exit_code != 0: printlog_error_time("Error occured while making the database") platf_depend_exit(exit_code) # end if print("\033[1A{} - Database is successfully created: `{}`\n".format( getwt(), local_fasta)) log_info("Database is successfully created: `{}`".format(local_fasta)) if use_index == "true": printlog_info_time("Database index creating started") # Configure command line make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format( local_fasta) exit_code = os.system( make_index_cmd) # create an index for the database if exit_code != 0: printlog_info_time("Error occured while creating database index") platf_depend_exit(exit_code) # end if printlog_info_time("Database index has been successfully created") # end if # Gzip downloaded FASTA file printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta)) if gzip_util_found: os.system("{} -v {}".format(gzip_util, local_fasta)) else: # form .fasta.gz file 'by hand' with open(local_fasta, 'rb') as fasta_file, open_as_gzip(local_fasta + ".gz", "wb") as fagz_file: shutil_copyfileobj(fasta_file, fagz_file) # end with os.unlink(local_fasta) # remove source FASTA file, not the database # end if return local_fasta
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function bins FAST5 file with untwisting. # # :param f5_path: path to FAST5 file meant to be processed; # :type f5_path: str; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: loical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname( logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage srt_file_dict = dict() index_dirpath = os.path.join( tax_annot_res_dir, index_name) # name of directory that will contain indicies # Make filter for quality and length QL_filter = get_QL_filter(f5_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath( f5_path, outdir_path, min_qual, min_qlen, ) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existance checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError("file is not of HDF5 (i.e. 
not FAST5) format") # end if from_f5 = h5py.File(f5_path, 'r') for _ in from_f5: break # end for except RuntimeError as runterr: printlog_error_time("FAST5 file is broken") printlog_error("Reading the file `{}` crashed.".format( os.path.basename(f5_path))) printlog_error("Reason: {}".format(str(runterr))) printlog_error("Omitting this file...") print() # Return zeroes -- inc_val won't be incremented and this file will be omitted return (0, 0, 0) # end try # singleFAST5 and multiFAST5 files should be processed in different ways # "Raw" group always in singleFAST5 root and never in multiFAST5 root if "Raw" in from_f5.keys(): f5_cpy_func = copy_single_f5 else: f5_cpy_func = copy_read_f5_2_f5 # end if readids_to_seek = list(from_f5.keys()) # list of not-binned-yet read IDs # Fill the list 'readids_to_seek' for read_name in fast5_readids(from_f5): # Get rid of "read_" readids_to_seek.append(sys.intern(read_name)) # end for # Walk through the index index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r') if not f5_path in index_f5_2_tsv.keys(): printlog_error_time( "Source FAST5 file `{}` not found in index".format(f5_path)) printlog_error("Try to rebuild index") platf_depend_exit(1) # end if for tsv_path in index_f5_2_tsv[f5_path].keys(): read_names = index_f5_2_tsv[f5_path][tsv_path] taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path) for read_name in read_names: try: hit_names, *vals_to_filter = resfile_lines[sys.intern( fmt_read_id(read_name)[1:])] except KeyError: printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\ .format(fmt_read_id(read_name)[1:])) printlog_error( "It is stored in `{}` FAST5 file".format(f5_path)) printlog_error( "Try to make new index file (press ENTER on corresponding prompt)." ) printlog_error( "Or, if does not work for you, make sure that taxonomic annotation info \ for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`." ) index_f5_2_tsv.close() platf_depend_exit(1) # end try if not QL_filter(vals_to_filter): # Get name of result FASTQ file to write this read in if QL_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath]) QL_seqs_fail += 1 elif not align_filter(vals_to_filter): # Get name of result FASTQ file to write this read in if align_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath]) align_seqs_fail += 1 else: for hit_name in hit_names.split( "&&" ): # there can be multiple hits for single query sequence # Get name of result FASTQ file to write this read in binned_file_path = os.path.join( outdir_path, "{}.fast5".format(hit_name)) if binned_file_path not in srt_file_dict.keys(): srt_file_dict = update_file_dict( srt_file_dict, binned_file_path) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path]) # end for seqs_pass += 1 # end if # end for from_f5.close() index_f5_2_tsv.close() # Close all binned files for file_obj in filter(lambda x: not x is None, srt_file_dict.values()): file_obj.close() # end for sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format( os.path.basename(f5_path))) printn(" Working...") return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta): # Function downloads set of records from Genbank according to accessions passed to it. # Downloaded FASTA file will be placed in 'db_dir' directory and named 'local_seq_set.fasta' # :param acc_dict: dictionary comntaining accession data of hits; # :type acc_dict: dict<str: tuple<str, str, int>>; # :param db_dir: path to directory in which downloaded FASTA file will be placed; # :type db_dir: str; # :param local_fasta: path to file with reference sequences to be included in database; # :type local_fasta: str; # Path to file with current chunk (see below "100 accession numbers...") tmp_fasta = os.path.join(db_dir, "tmp.fasta") accessions = tuple(set(acc_dict.keys())) if len(accessions) == 0: # just in case return # end if # 100 accession numbers in order not to make too long URL # Download genomes by chunks of 100 sequences. max_accnum = 100 i = 0 accnum = len(accessions) while i < accnum: curr_accessions = accessions[i:i + max_accnum] # slice chunk accs_del_comma = ','.join( curr_accessions) # accessions must be separated by comma in url # E-utilities provide a possibility to download records from Genbank by accessions. retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\ db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma) log_info("Retrieve URL: `{}`".format(retrieve_url)) # GNU wget utility is safer, but there can be presence of absence of it :) wget_util = "wget" util_found = False for d in os.environ["PATH"].split(os.pathsep): if os.path.isdir(d) and wget_util in os.listdir(d): util_found = True break # end if # end for print() printlog_info("{} - Downloading {} reference sequences...".format( getwt(), len(curr_accessions))) if util_found: # If we have wget -- just use it wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format( retrieve_url, tmp_fasta) pipe = sp_Popen(wget_cmd, shell=True) pipe.communicate() if pipe.returncode != 0: printlog_error_time( "Error occured while downloading reference sequences") platf_depend_exit(pipe.returncode) # end if else: # If there are no wget -- we will download sequences with Python disposal stop_wait = Event( ) # a flag variable that will signal waiter-function to stop executing def download_waiter(stop_wait): """ Function waits untill 'local_fasta' file is downloaded. It prints size of downloaded data to console during downloading. This function just waits -- it won't bring you the menu :). 
""" # Wait untill downloading starts while not os.path.exists(tmp_fasta): if not stop_wait.is_set(): return # end if sleep(1) # end while MB_size = 1024**2 # we will divide by it to get megabytes while stop_wait.is_set(): # Get size of downloaded data fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) # get megabytes printn("\r{} - {} MB downloaded ".format(getwt(), fsize)) sleep(1) # instant updates are not necessary # end while # Print total size of downloaded file (it can be deleted by this time) try: fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) except OSError: # We can pass this ecxeption -- we do delete this file if downloading crushes # And this function just waits :) pass # end try printlog_info("\r{} - {} MB downloaded ".format( getwt(), fsize)) # end def download_waiter error = True while error: try: waiter = Thread(target=download_waiter, args=(stop_wait, )) # create thread stop_wait.set() # raise the flag waiter.start() # start waiting urllib.request.urlretrieve( retrieve_url, tmp_fasta) # retrieve FASTA file except OSError as err: printlog_error_time( "Error occured while downloading fasta file.") printlog_error(str(err)) printlog_error( "`barapost-local.py` will try again in 30 seconds") if os.path.exists(tmp_fasta): os.unlink(tmp_fasta) # end if sleep(30) else: error = False finally: stop_wait.clear() # lower the flag waiter.join( ) # main thread will wait until waiter function ends it's work # end try # end while # end if printlog_info_time("Downloading is completed") # Write chunk to result fasta file with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile: outfile.write(infile.read()) # end with # Remove temp chunk file os.unlink(tmp_fasta) i += max_accnum # go to next chunk
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function bins FAST5 file without untwisting. # # :param f5_path: path to FAST5 file meant to be processed; # :type f5_path: str; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: logical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage srt_file_dict = dict() new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0] tsv_res_fpath = get_res_tsv_fpath(new_dpath) taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path) # Make filter for quality and length QL_filter = get_QL_filter(f5_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existence checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
# end if from_f5 = h5py.File(f5_path, 'r') for _ in from_f5: break # end for except RuntimeError as runterr: printlog_error_time("FAST5 file is broken") printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path))) printlog_error("Reason: {}".format( str(runterr) )) printlog_error("Omitting this file...") print() # Return zeroes -- inc_val won't be incremented and this file will be omitted return (0, 0, 0) # end try # singleFAST5 and multiFAST5 files should be processed in different ways # "Raw" group always in singleFAST5 root and never in multiFAST5 root if "Raw" in from_f5.keys(): f5_cpy_func = copy_single_f5 else: f5_cpy_func = copy_read_f5_2_f5 # end if for _, read_name in enumerate(fast5_readids(from_f5)): try: hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name))[1:]] # omit 'read_' in the beginning of FAST5 group's name except KeyError: printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\ .format(fmt_read_id(read_name))) printlog_error("This TSV file: `{}`".format(tsv_res_fpath)) printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n") platf_depend_exit(1) # end try # If read is found in TSV file: if not QL_filter(vals_to_filter): QL_seqs_fail += 1 # Get name of result FAST5 file to write this read in if QL_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath]) elif not align_filter(vals_to_filter): align_seqs_fail += 1 # Get name of result FAST5 file to write this read in if align_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath]) else: for hit_name in hit_names.split("&&"): # there can be multiple hits for single query sequence # Get name of result FAST5 file to write this read in binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name)) if binned_file_path not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, binned_file_path) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path]) # end for seqs_pass += 1 # end if # end for from_f5.close() # Close all binned files for file_obj in filter(lambda x: not x is None, srt_file_dict.values()): file_obj.close() # end for sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path))) printn(" Working...") return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def crosstalks_runner(args): # Runner function for crosstalk detection task. # # :param args: arguments for crosstalk detection task; # :type args: CrosstalksArguments; # # Returns two collections: # 1. A collection of valid (non-crosstalk) paths. # 2. A collection of trash (crosstalk) paths. def write_positive(fastq_records, valid_files, statistics): # Function for writing non-crosstalk fastq records. src.fastq.write_fastq_records(fastq_records, valid_files) statistics.increment_positive() # end def crosstalk_write_positive def write_negative(fastq_records, trash_files, statistics): # Function for writing crosstalk fastq records. src.fastq.write_fastq_records(fastq_records, trash_files) statistics.increment_negative() # end def crosstalk_write_positive def crosstalk_iteration(fastq_records, primers, threshold, max_offset, write_positive, write_negative): not_crosstalk, fastq_records = crosstalk_pipe(primers, fastq_records, threshold, max_offset) if not_crosstalk: write_positive(fastq_records) else: write_negative(fastq_records) # end if # end def printlog_info_time('Start cross-talk detection.') # Get crosstalk pipe crosstalk_pipe = pcp.provide_crosstalk_pipe(args.cut_off_primers) # Cnofigure and open output files valid_fpaths, trash_fpaths = ofn.get_crosstalk_outfpaths( args.outdir, args.infpaths) valid_files = src.filesystem.open_files_for_appending(valid_fpaths) trash_files = src.filesystem.open_files_for_appending(trash_fpaths) # Create BinaryStatistics object crosstalk_statistics = src.binary_statistics.BinaryStatistics() # Connect non-crosstalk output files and statistics to `write_positive` function crosstalk_write_positive = functools.partial( write_positive, valid_files=valid_files, statistics=crosstalk_statistics) # Connect crosstalk output files and statistics to `write_negative` function crosstalk_write_negative = functools.partial( write_negative, trash_files=trash_files, statistics=crosstalk_statistics) # Connect arguments to `crosstalk_iteration` function # for status_bar to be wrapped around it. loaded_crosstalk_iteration = functools.partial( crosstalk_iteration, primers=args.primers, threshold=args.threshold, max_offset=args.max_offset, write_positive=crosstalk_write_positive, write_negative=crosstalk_write_negative) # Proceed src.status_bar.run_status_bar(loaded_crosstalk_iteration, args.infpaths) # Close files for file_collection in (valid_files, trash_files): src.filesystem.close_files(file_collection) # end for printlog_info_time('{} ({}%) cross-talks detected.'\ .format(crosstalk_statistics.negative_stat, crosstalk_statistics.get_negative_percents()) ) return valid_fpaths, trash_fpaths
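# The `functools.partial` calls above pre-bind the output files and the
# statistics object, so `run_status_bar` only has to supply the fastq records.
# A minimal illustration of the same pattern (all names here are hypothetical):
import functools

def _write(records, files, stats):
    files.append(records)
    stats.append(len(records))
# end def _write

def _partial_demo():
    files, stats = [], []
    bound_write = functools.partial(_write, files=files, stats=stats)
    bound_write(['@read1', 'ACGT', '+', 'IIII'])  # only the records are passed
    return files, stats
# end def _partial_demo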
def ngmerge_runner(args):
    # Runner function for the NGmerge task.
    #
    # :param args: arguments for the NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    #   1. A collection of valid ("merged") paths.
    #   2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into the working directory --
    #   we will temporarily go to the output directory
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output files' names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(args.infpaths[0])

    # Configure command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
                merged_basename, unmerged_prefix, args.n_thr,
                args.min_overlap, args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print('NGmerge is doing its job silently...')
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8')

    if pipe.returncode != 0:
        # Error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError) as err:
        printlog_error('Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir)  # return to the old directory

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
                round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    # The first returned value must be a collection.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(
        glob.glob(os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    return [merged_fpath], unmerged_fpaths
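For illustration, here is the kind of stderr text the two regular expressions above are written against, together with the same extraction logic. The numbers are made up and the exact NGmerge wording may vary between versions:

import re

# Hypothetical verbose output; only the two counted lines matter here.
example_stderr = (
    'Fragments (pairs of reads) analyzed: 125000\n'
    'Successfully stitched: 118734\n'
)
lines = example_stderr.splitlines()
reads_processed = int(re.search(r'Fragments \(pairs of reads\) analyzed: ([0-9]+)', lines[0]).group(1))
merged_reads = int(re.search(r'Successfully stitched: ([0-9]+)', lines[1]).group(1))
print(reads_processed, merged_reads)  # 125000 118734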
# Proceeding.
# The main goal of multiprocessing is to isolate processes from one another.
#
# Two situations are possible:
#   1. Number of threads <= number of files meant to be processed ('many_files' parallel mode):
#      Files will be distributed equally among processes.
#      Processes interact with one another only while printing things to the console
#      for the user's entertainment.
#   2. Number of threads > number of files meant to be processed ('few_files' parallel mode):
#      Files will be processed one by one. Each file will be divided into equal blocks,
#      and these blocks will be distributed among processes.
#      Processes interact with one another while writing to the result file and
#      while printing things to the console.
# A minimal sketch of the 'many_files' idea follows this block.

print()
printlog_info_time("Starting classification.")
printn(" Working...")

if n_thr <= len(fq_fa_list):
    if n_thr != 1:
        # Proceed with 'many_files' parallel processing
        from src.barapost_local_modules.parallel_mult_files import process
        process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
                blast_algorithm, use_index, db_path)
    else:
        # Proceed with single-thread processing
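A minimal sketch of the 'many_files' mode described above: whole files are handed to worker processes, so the workers stay isolated except for console output. This is an illustration with a hypothetical per-file worker, not the project's `process` function:

import multiprocessing as mp
import os

def classify_one_file(fpath):
    # Hypothetical per-file worker; the real code runs classification here.
    return fpath, os.path.getsize(fpath)

def classify_many_files(fq_fa_list, n_thr):
    # Distribute whole files among `n_thr` worker processes.
    with mp.Pool(processes=n_thr) as pool:
        return pool.map(classify_one_file, fq_fa_list)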
def _get_record_title(record_id):
    # Function retrieves the title (aka definition) and accession
    #   of a GenBank record by a given accession or GI number.
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    # Returns a tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi"  # utility name

    # Configure URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes (I never figured out why) this XML arrives empty, and StopIteration emerges.
    # So, if we just repeat the request, everything is going to be ok.
    error = True
    print_ok = False
    while error:
        # Send the request and get the response
        summary = lingering_https_get_request(eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse the XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different "Name" attributes.
        # They are children of the "DocSum" element, which is
        #   the first child of the root.
        try:
            docsum = next(iter(root))
        except StopIteration:
            print()
            printlog_info_time("Failed to retrieve data for record {}. Trying again..."
                .format(record_id))
            print_ok = True  # print this "ok" only after a successful attempt following a failure
        else:
            if print_ok:
                printlog_info("ok")
            # end if
            error = False
        # end try
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        if item.attrib["Name"] == "Title":
            record_title = item.text
        elif item.attrib["Name"] == "AccessionVersion":
            # Remove the version just in case
            record_acc = re.search(r"(.*)\.[0-9]+", item.text).group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        printlog_error_time("Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc
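For reference, an abridged e-summary payload of the shape the parsing above expects (the field values are invented), with the same Title/AccessionVersion lookup:

from xml.etree import ElementTree

# Abridged, illustrative e-summary XML; values are made up.
example_xml = """<eSummaryResult>
  <DocSum>
    <Id>123456</Id>
    <Item Name="Title" Type="String">Example bacterium strain X chromosome, complete genome</Item>
    <Item Name="AccessionVersion" Type="String">NZ_XX000000.1</Item>
  </DocSum>
</eSummaryResult>"""

root = ElementTree.fromstring(example_xml)
docsum = next(iter(root))  # the first child is DocSum
for item in docsum.iter("Item"):
    if item.attrib["Name"] == "Title":
        print("Title:", item.text)
    elif item.attrib["Name"] == "AccessionVersion":
        print("Accession:", item.text)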
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning of FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to the FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for the quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for the length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for the alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for the alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences that pass the filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure the records generator and the write function
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make the filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure the path to the QL trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path, min_qual, min_qlen)
    else:
        QL_trash_fpath = None
    # end if

    # Make the filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure the path to the alignment trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(fastq_rec["seq_id"])[1:])  # get the ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]  # find the hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has already been processed by "
                "`barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply the filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence into the QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath], fastq_rec)  # write the current read to the trash file
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence into the alignment trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath], fastq_rec)  # write the current read to the trash file
        else:
            # There can be multiple hits for a single query sequence
            for hit_name in hit_names.split("&&"):
                # Get the name of the result file to write this read in
                binned_file_path = os.path.join(outdir_path,
                    "{}.fast{}".format(hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path], fastq_rec)  # write the current read to the binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)