def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning of FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0        # counter for sequences that pass filters
    QL_seqs_fail = 0     # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit
                         #   with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

        # Configure record generator and write function
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path,
                                                min_qual, min_qlen)
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for outer while-loop

        while not stop:

            # Extract a batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    # Find hit corresponding to this sequence
                    hit_names, *vals_to_filter = resfile_lines[read_name]
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error("Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence in the QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence in the align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    # There can be multiple hits for a single query sequence
                    for hit_name in hit_names.split("&&"):
                        # Get name of the result file to write this read to
                        binned_file_path = os.path.join(outdir_path,
                            "{}.fast{}".format(hit_name,
                                               'q' if is_fastq(fq_fa_path) else 'a'))
                        # Key the batch on (read, hit), so that a multi-hit read
                        #   is written to every destination file, not only the last one:
                        to_write["{}&&{}".format(read_name, hit_name)] = \
                            (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of the 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
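
# --- Illustrative usage sketch (not part of the original module) ---
# bin_fastqa_file() above relies on a module-level `write_lock` shared between
# worker processes. One way to wire that up with multiprocessing is sketched
# below; `init_bin_worker`, `run_binning_in_parallel` and the round-robin
# chunking are assumptions for illustration, not the project's actual launcher.

import multiprocessing as mp


def init_bin_worker(lock):
    # Expose the same inter-process lock to every worker under the
    # module-level name that bin_fastqa_file() expects.
    global write_lock
    write_lock = lock
# end def


def run_binning_in_parallel(fq_fa_list, tax_annot_res_dir, sens, n_thr,
                            min_qual, min_qlen, min_pident, min_coverage,
                            no_trash):
    # Deal input files round-robin into `n_thr` roughly equal chunks
    chunks = [fq_fa_list[i::n_thr] for i in range(n_thr)]
    with mp.Pool(n_thr, initializer=init_bin_worker,
                 initargs=(mp.Lock(),)) as pool:
        results = pool.starmap(bin_fastqa_file,
                               [(chunk, tax_annot_res_dir, sens, n_thr,
                                 min_qual, min_qlen, min_pident, min_coverage,
                                 no_trash) for chunk in chunks])
    # end with
    # Each worker returns (seqs_pass, QL_seqs_fail, align_seqs_fail);
    # sum the counters column-wise to get the grand totals.
    return tuple(map(sum, zip(*results)))
# end def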
    if probing_batch_size % packet_size > 0:  # and this is the ceiling
        packs_at_all += 1
    # end if

    # Pretty string to print "Request 3 out of 7" if not 'send_all'
    out_of_n = {
        "msg": " out of {}".format(packs_at_all),
        "npacks": packs_at_all
    }
    # end if

    # Ordinal number of the packet meant to be sent
    #   (it will be incremented after every successful submission)
    pack_to_send = [1]  # it should be mutable

    # Choose appropriate record generator:
    packet_generator = fastq_packets if is_fastq(fq_fa_path) else fasta_packets

    # Iterate over packets in current file
    for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs,
                                   packet_mode, saved_packet_size,
                                   saved_packet_mode, max_seq_len,
                                   probing_batch_size):

        # Assume that we need to submit the current packet
        #   (i.e. that we cannot merely request results)
        send = True

        # If the current packet has already been sent, we can try merely requesting results
        if saved_RID is not None:
            send = retrieve_ready_job(saved_RID, packet, packet_size,
                                      packet_mode, pack_to_send,
                                      seqs_processed, fq_fa_path, tmp_fpath,
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs the 'many_files' parallel mode of barapost-local.py.
    #
    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check whether there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # if there is no data from a previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath,
                                        "classification.tsv")  # form result tsv file path
        else:  # if there is data from a previous run
            # Get number of successfully processed sequences
            num_done_seqs = previous_data["n_done_reads"]
            # Result tsv file should be the same as during the previous run
            tsv_res_path = previous_data["tsv_respath"]
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                # end with
                continue
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release the lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):
            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)
            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])
            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release the lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
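
# --- Illustrative usage sketch (not part of the original module) ---
# process_paral() relies on module-level `print_lock`, `counter_lock` and a
# shared `file_counter`. A minimal way to wire those up and launch the
# 'many_files' mode is sketched below; `init_many_files_worker`,
# `launch_many_files_mode` and the round-robin split are assumptions for
# illustration only.

import multiprocessing as mp


def init_many_files_worker(print_lock_, counter_lock_, file_counter_):
    # Install the shared synchronization primitives under the
    # module-level names that process_paral() expects.
    global print_lock, counter_lock, file_counter
    print_lock = print_lock_
    counter_lock = counter_lock_
    file_counter = file_counter_
# end def


def launch_many_files_mode(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
                           blast_algorithm, use_index, db_path):
    nfiles = len(fq_fa_list)
    shared_counter = mp.Value('i', 0)  # shared count of finished files
    chunks = [fq_fa_list[i::n_thr] for i in range(n_thr)]
    with mp.Pool(n_thr, initializer=init_many_files_worker,
                 initargs=(mp.Lock(), mp.Lock(), shared_counter)) as pool:
        pool.starmap(process_paral,
                     [(chunk, packet_size, tax_annot_res_dir, blast_algorithm,
                       use_index, db_path, nfiles) for chunk in chunks])
    # end with
# end def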
from glob import glob

try:
    opts, args = getopt.gnu_getopt(sys.argv[1:], "hvr:d:o:s:q:m:i:c:ut:n", [
        "help", "version", "taxannot-resdir=", "indir=", "outdir=",
        "binning-sensitivity=", "min-qual=", "min-seq-len=", "min-pident=",
        "min-coverage=", "untwist-fast5", "threads=", "no-trash"
    ])
except getopt.GetoptError as gerr:
    print(str(gerr))
    platf_depend_exit(2)
# end try

from src.filesystem import is_fasta, is_fastq, is_fast5
is_fastqa = lambda f: is_fasta(f) or is_fastq(f)

from datetime import datetime
now = datetime.now().strftime("%Y-%m-%d %H.%M.%S")

# |== Default parameters: ==|

fq_fa_list = list()   # list of paths to input FASTQ and FASTA files
fast5_list = list()   # list of paths to input FAST5 files
tax_annot_res_dir = "barapost_result"  # path to directory with classification results
indir_path = None     # path to input directory
outdir_path = "binning_result_{}".format(
    now.replace(' ', '_'))  # path to output directory
sens = ("genus", 5)   # binning sensitivity
min_qual = 10         # minimum mean read quality to keep
min_qlen = None       # minimum sequence length to keep
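
# --- Illustrative sketch (an assumption, not the script's actual loop) ---
# The parsed `opts` are typically folded into the defaults above in a
# dispatch loop like this; only two of the flags declared in gnu_getopt()
# are shown, the rest are handled analogously.
for opt, arg in opts:
    if opt in ("-q", "--min-qual"):
        try:
            min_qual = float(arg)  # e.g. `-q 15` keeps reads of mean quality >= 15
        except ValueError:
            print("Invalid minimum quality: `{}`".format(arg))
            platf_depend_exit(1)
        # end try
    elif opt in ("-o", "--outdir"):
        outdir_path = arg
    # end if
# end for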
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs the "few_files" parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check whether there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # if there is no data from a previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath,
                                        "classification.tsv")  # form result tsv file path
        else:  # if there is data from a previous run
            # Get number of successfully processed sequences
            num_done_seqs = previous_data["n_done_reads"]
            # Result tsv file should be the same as during the previous run
            tsv_res_path = previous_data["tsv_respath"]
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(
                    lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            continue  # proceed to the next file instead of returning
        # end if

        # Get number of sequences to pass to each thread (ceiling division)
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr,
                       initializer=init_proc_single_file_in_paral,
                       initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file,
                     [(file_part, tsv_res_path, packet_size, tax_annot_res_dir,
                       blast_algorithm, use_index, db_path)
                      for file_part in packet_generator(fq_fa_path,
                                                        file_part_size,
                                                        num_done_seqs)])

        # Reap the zombie processes
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
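
# --- Illustrative sketch (an assumption, not the project's code) ---
# process() above splits a file into n_thr parts by ceiling division
# (file_part_size = num_seqs // n_thr, plus one if there is a remainder;
# equivalently -(-num_seqs // n_thr)) and hands two fresh mp.Lock() objects
# to `init_proc_single_file_in_paral`, which is defined elsewhere in the
# project. An initializer of the shape process() assumes would look like the
# hypothetical sketch below:

def init_proc_single_file_in_paral_sketch(write_lock_, print_lock_):
    # Expose the shared locks to process_part_of_file() workers under
    # module-level names (the names themselves are an assumption).
    global write_lock, print_lock
    write_lock = write_lock_
    print_lock = print_lock_
# end def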
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning of FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0        # counter for sequences that pass filters
    QL_seqs_fail = 0     # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit
                         #   with too low identity or coverage

    srt_file_dict = dict()  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure record generator and write function
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path,
                                            min_qual, min_qlen)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(
            fastq_rec["seq_id"])[1:])  # get ID of the sequence

        try:
            # Find hit corresponding to this sequence
            hit_names, *vals_to_filter = resfile_lines[read_name]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence in the QL trash file
            if QL_trash_fpath not in srt_file_dict:
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath],
                      fastq_rec)  # write current read to trash file
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence in the align trash file
            if align_trash_fpath not in srt_file_dict:
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath],
                      fastq_rec)  # write current read to trash file
        else:
            # There can be multiple hits for a single query sequence
            for hit_name in hit_names.split("&&"):
                # Get name of the result file to write this read to
                binned_file_path = os.path.join(outdir_path,
                    "{}.fast{}".format(hit_name,
                                       'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict:
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path],
                          fastq_rec)  # write current read to binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
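
# --- Illustrative usage sketch (not part of the original module) ---
# In single-thread mode, the binner above is simply called once per input
# file and the per-file counters are accumulated. The driver below is an
# assumption for illustration; its argument names mirror the function
# signature above.

def bin_all_files_single_thread(fq_fa_list, tax_annot_res_dir, sens, min_qual,
                                min_qlen, min_pident, min_coverage, no_trash):
    totals = [0, 0, 0]  # seqs_pass, QL_seqs_fail, align_seqs_fail
    for fq_fa_path in fq_fa_list:
        counters = bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens,
                                   min_qual, min_qlen, min_pident,
                                   min_coverage, no_trash)
        totals = [t + c for t, c in zip(totals, counters)]
    # end for
    return tuple(totals)
# end def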