def get_res_tsv_fpath(new_dpath):
    # Function returns path to the current TSV result file.
    # Binning will be performed according to this file.
    # :param new_dpath: current result directory;
    # :type new_dpath: str;

    is_similar_to_tsv_res = lambda f: f == "classification.tsv"

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error("Please make sure you have performed taxonomic annotation \
of the following file: `{}` with `barapost-prober.py` and/or `barapost-local.py`"\
            .format(os.path.basename(new_dpath)))
        printlog_error("Also, this error might occur if you forgot to specify \
the result directory generated by `barapost-prober.py` with the `-r` option.")
        platf_depend_exit(0)
    # end if

    # The most recent file will be the first in the sorted list
    tsv_res_fpath = list(
        filter(is_similar_to_tsv_res, sorted(os.listdir(new_dpath))))[0]

    return os.path.join(new_dpath, tsv_res_fpath)
def create_result_directory(fq_fa_path, outdir_path):
    # Function creates a result directory named according
    #   to how the source FASTQ or FASTA file is named.
    #
    # :param fq_fa_path: path to source FASTQ or FASTA file;
    # :type fq_fa_path: str;
    # :param outdir_path: path to directory, in which the result directory will be created;
    # :type outdir_path: str;
    #
    # Returns 'str' path to the newly created result directory.

    # dpath means "directory path"
    new_dpath = os.path.join(outdir_path, os.path.basename(fq_fa_path))  # get rid of absolute path
    new_dpath = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$", new_dpath).group(1)  # get rid of extension

    if not os.path.exists(new_dpath):
        try:
            os.makedirs(new_dpath)
        except OSError as oserr:
            printlog_error_time(
                "Error: can't create result directory: `{}`".format(new_dpath))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end if

    return new_dpath
def provide_open_funcs(fpaths):
    # Function returns opening function(s) for input file(s).
    #
    # :param fpaths: collection of paths to input files;
    # :type fpaths: list<str>;

    open_funcs = list()

    try:
        for fpath in fpaths:
            # Check if input file is gzipped
            if _is_gzipped(fpath):
                open_funcs.append(
                    functools.partial(gzip.open, mode='rt', encoding='utf-8'))
            # Check if input file is bzip2-compressed
            elif _is_bzipped(fpath):
                open_funcs.append(
                    functools.partial(bz2.open, mode='rt', encoding='utf-8'))
            # Check if input file is a plain text file
            elif _is_plain_text(fpath):
                open_funcs.append(
                    functools.partial(open, mode='r', encoding='utf-8'))
            else:
                # Raise a super terrifying exception
                raise _InvalidFileError("Error: cannot read file `{}`: \
it is neither a plain text file, nor gzipped, nor bzip2-compressed.".format(fpath))
            # end if
        # end for
    except _InvalidFileError as err:
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return open_funcs
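# A minimal usage sketch for provide_open_funcs() (the file names below are
#   hypothetical illustrations, not part of the pipeline):
def _example_provide_open_funcs():
    fpaths = ['reads_R1.fastq.gz', 'reads_R2.fastq']  # hypothetical input files
    open_funcs = provide_open_funcs(fpaths)
    for fpath, open_func in zip(fpaths, open_funcs):
        # Each returned callable opens its file in text mode,
        #   transparently decompressing it if necessary.
        with open_func(fpath) as handle:
            print(handle.readline().strip())
        # end with
    # end for
# end def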
def rename_file_verbosely(file):
    # Function verbosely renames a file (or a directory) given to it.
    # :param file: path to the file (directory) meant to be renamed;
    # :type file: str;

    if not os.path.exists(file):
        return None
    # end if

    # Path to the file's parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories too
    if os.path.isdir(file):
        is_analog = lambda f: re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is not None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        is_analog = lambda f: re.search(r"(.*)\..*$",
            os.path.basename(file)).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in 'pardir' that have names analogous to the name of 'file':
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute a new number
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
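# An illustration of the renaming scheme above (names are hypothetical):
#   `outdir/reads.fastq`       --> `outdir/reads_old_1.fastq`
#   `outdir/reads_old_1.fastq` --> `outdir/reads_old_2.fastq`
# i.e. the first rename appends an "_old_<N>" suffix, and subsequent renames
#   merely substitute a new number.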
def remove_tmp_files(*paths):
    # Function removes files passed to it.
    # :param paths: an array-like collection of paths to files;
    # :type paths: list<str>;

    for path in paths:
        if os.path.exists(path):
            try:
                os.unlink(path)
            except OSError as oserr:
                printlog_error_time("Error: cannot remove file `{}`".format(path))
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
        # end if
    # end for
def update_file_dict(srt_file_dict, new_fpath):
    try:
        if new_fpath is not None:
            srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
        else:
            srt_file_dict[new_fpath] = None  # handle no_trash
        # end if
    except OSError as oserr:
        printlog_error_time("Error occurred while opening one of the result files")
        printlog_error("Erroneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return srt_file_dict
def fastq_generator(fq_fpaths):
    # Function yields fastq records.
    # It does not create a new FastqRecord object each time.
    # Instead, it just updates the extant objects.
    # :param fq_fpaths: list of paths to input fastq files;
    # :type fq_fpaths: list<str>, tuple<str>;
    # Yields list of FastqRecord-s, list<FastqRecord>.

    # Get opening functions for all input files
    open_funcs = src.compression.provide_open_funcs(fq_fpaths)

    # Open input files and create FastqRecord objects for forward and reverse reads.
    fq_files = list()
    fq_records = list()
    for fpath, open_func in zip(fq_fpaths, open_funcs):
        fq_files.append(open_func(fpath))
        fq_records.append(FastqRecord(None, None, None, None))
    # end for

    eof = False
    while not eof:
        for fq_record, fq_file in zip(fq_records, fq_files):
            # Update FastqRecord
            fq_record.update_record(fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip(),
                                    fq_file.readline().strip())
        # end for
        if fq_records[0].read_name == '':
            eof = True  # end of file
        else:
            # Validate fastq record(s)
            for fq_record in fq_records:
                error_response = fq_record.validate_fastq()
                if error_response is not None:
                    printlog_error('Fastq error: {}'.format(error_response))
                    platf_depend_exit(1)
                # end if
            # end for
            yield fq_records
        # end if
    # end while

    # Close input files.
    for fq_file in fq_files:
        fq_file.close()
    # end for
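# A usage sketch for fastq_generator() (paths are hypothetical). Keep in mind
#   that the yielded FastqRecord objects are reused between iterations, so
#   copy any data you need to retain:
def _example_fastq_generator():
    fq_fpaths = ['sample_R1.fastq.gz', 'sample_R2.fastq.gz']  # hypothetical paired files
    for fq_records in fastq_generator(fq_fpaths):
        for fq_record in fq_records:
            print(fq_record.read_name)
        # end for
    # end for
# end def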
def look_around(new_dpath, fq_fa_path):
    # Function looks around in order to check if there are results from previous runs of this script.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "tsv_respath": path_to_tsv_file_from_previous_run (str),
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file (int),
    # }
    #
    # :param new_dpath: path to the current result directory (corresponding to fq_fa_path);
    # :type new_dpath: str;
    # :param fq_fa_path: path to the current FASTA file;
    # :type fq_fa_path: str;

    # "hname" means human readable name (i.e. without file path and extension)
    fasta_hname = os.path.basename(fq_fa_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(1)  # get rid of '.fasta' extension

    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    num_done_reads = 0  # variable to keep number of successfully processed sequences

    if os.path.exists(tsv_res_fpath):
        with open(tsv_res_fpath, 'r') as res_file:
            # There can be invalid information in the result file
            try:
                lines = res_file.readlines()
                num_done_reads = len(lines) - 1  # the first line is a header
            except OSError as err:
                printlog_error_time("Data in classification file `{}` is broken. Reason:"\
                    .format(tsv_res_fpath))
                printlog_error(str(err))
                printlog_error("Starting from the beginning.")
                rename_file_verbosely(tsv_res_fpath)
                return None
            # end try
        # end with
    else:
        return None
    # end if

    return {
        "tsv_respath": tsv_res_fpath,
        "n_done_reads": num_done_reads,
    }
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy by given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # If we are missing lambda phage taxonomy -- just add it
        save_taxonomy_directly(taxonomy_path, acc, "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # If the sequence is an "own seq" -- check the fasta file

        # Get the necessary title line from `local_seq_set.fasta`
        # First, find the fasta file (it may be compressed)
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))
        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time("Error: cannot recover taxonomy for the following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error("You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add a taxonomy line for {} to the file `{}`".format(acc, taxonomy_path))
            printlog_error("  and run the program again.")
            platf_depend_exit(1)
        # end try

        # Find the line starting with `acc`
        how_to_open = OPEN_FUNCS[is_gzipped(local_fasta)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            search_for = b">" + bytes(acc, 'ascii') + b" "
        else:
            search_for = ">{} ".format(acc)
        # end if

        with how_to_open(local_fasta) as fasta_file:
            for line in fasta_file:
                if line.startswith(search_for):
                    seq_name = fmt_func(line).partition(' ')[2]  # get the name of the sequence
                    save_taxonomy_directly(taxonomy_path, acc, seq_name)
                    break
                # end if
            # end for
        # end with
    else:
        # Try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
    # end if
def search_for_related_replicons(acc_dict):
    # Function searches for replicons related to those in 'hits_to_download.tsv'
    #   or specified with the '-s' option.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;

    print()
    printlog_info_time("Searching for related replicons...")

    start_accs = tuple(acc_dict.keys())  # accessions, which were "discovered" by prober

    for i, acc in enumerate(start_accs):
        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            printlog_error_time(
                "Parsing error: cannot find replicons related to {}.".format(acc))
            printlog_error("Please, contact the developer")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try
        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    if len(start_accs) != len(acc_dict):  # there are some new replicons
        printlog_info_time("{} related replicons have been found.".\
            format(len(acc_dict) - len(start_accs)))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()
# end def search_for_related_replicons
def whether_to_build_index(index_dirpath):
    # Function checks if there are any files in the index directory.
    # If there are any, it asks the user whether to create a new index or to use the old one.
    # :param index_dirpath: path to index directory;
    # :type index_dirpath: str;

    use_old_index = False

    if len(os.listdir(index_dirpath)) != 0:
        printlog_info("Index file created by `-u` option already exists (left from a previous run).")

        error = True
        while error:
            reply = input("""  Press ENTER to make a new index file
  or enter 'u' to use the old index file:>>""")
            if reply == "":
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                except OSError as oserr:
                    printlog_error_time("Error: cannot remove old index files!")
                    printlog_error(str(oserr))
                    platf_depend_exit(1)
                # end try
                error = False
            elif reply == 'u':
                use_old_index = True
                error = False
            else:
                print("Invalid reply!\n")
            # end if
        # end while
        printlog_info("You have chosen to {} index file.".format(
            "use old" if use_old_index else "make new"))
        print()
    # end if

    return use_old_index
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir, db_path):
    """
    Function launches the 'blastn' utility from the "BLAST+" toolkit
      and returns its response.

    :param packet: FASTA data meant to be processed by 'blastn';
    :type packet: str;
    :param blast_algorithm: blastn algorithm to use;
    :type blast_algorithm: str;
    :param use_index: logical value indicating whether to use index;
    :type use_index: bool;
    :param queries_tmp_dir: path to directory with query files;
    :type queries_tmp_dir: str;
    :param db_path: path to database;
    :type db_path: str;
    """
    # PID of the current process won't change, so we can use it to mark query files.
    # 'packet's are too large to pass them to 'subprocess.Popen' as stdin,
    #   therefore we need to use these query files.
    query_path = os.path.join(queries_tmp_dir, "query{}_tmp.fasta".format(os.getpid()))

    with open(query_path, 'w') as query_file:
        query_file.write(packet)
    # end with

    # Configure command line
    blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\
        .format(query_path, db_path, blast_algorithm, use_index)

    pipe = sp.Popen(blast_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout_stderr = pipe.communicate()

    if pipe.returncode != 0:
        printlog_error_time("Error occurred while aligning a sequence against the local database")
        printlog_error(stdout_stderr[1].decode("utf-8"))
        platf_depend_exit(pipe.returncode)
    # end if

    return stdout_stderr[0].decode("utf-8")
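# A usage sketch for launch_blastn() (all argument values below are hypothetical;
#   'use_index' is interpolated into the command line as-is, so a string form
#   like "false" is assumed here):
#
#     xml_response = launch_blastn(packet_fasta_str, "megablast", "false",
#         "/tmp/queries_tmp_dir", "local_database/local_seq_set.fasta")
#
# The returned value is the XML (`-outfmt 5`) output of 'blastn', which is
#   parsed downstream.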
def copy_single_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    #   from 'from_f5' single-FAST5 file to 'to_f5' multi-FAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    # Handle no_trash
    if to_f5 is None:
        return
    # end if

    try:
        read_group = read_name
        to_f5.create_group(read_group)  # create group in destination multi-FAST5 file

        # Copy "UniqueGlobalKey" to the root of the newly created group
        for ugk_subgr in from_f5["UniqueGlobalKey"]:
            from_f5.copy("UniqueGlobalKey/" + ugk_subgr, to_f5[read_group])
        # end for

        # Get the data array group in the single-FAST5 file
        read_number_group = "Raw/Reads/" + next(iter(from_f5["Raw"]["Reads"]))
        # Its name in the multi-FAST5 file
        read_number = re.search(r"(Read_[0-9]+)", read_number_group).group(1)

        # Copy the group to the multi-FAST5 file
        from_f5.copy(from_f5[read_number_group], to_f5[read_group])
        # Move the data array to the "Raw" group, as it is in multi-FAST5 files
        to_f5.move("{}/{}".format(read_group, read_number), "{}/Raw".format(read_group))

        # Copy everything else to the newly created group
        for group in from_f5:
            if group != "Raw" and group != "UniqueGlobalKey":
                from_f5.copy(group, to_f5["/{}".format(read_group)])
            # end if
        # end for
    except ValueError as err:
        printlog_error_time("Error: `{}`".format(str(err)))
        printlog_error("Reason is probably the following:")
        printlog_error("  the read being copied to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
        return
    # end try
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with the `-g` option.
    # It requests the NCBI Taxonomy Browser and parses the organism's name from the HTML response.
    # What is more, this function configures the `organisms` list, which will be included into BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified to be integers
    #   during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns a list of strings of the following format: "<tax_name> (taxid:<TaxID>)"

    organisms = list()
    if len(taxid_list) > 0:
        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov",
                    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(taxid),
                    "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)", tax_resp).group(1)
            except AttributeError:
                printlog_error("\aError: TaxID not found")
                printlog_error("Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi")
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with the connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')
    # end if

    return organisms
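# A usage sketch for verify_taxids() (TaxID 562 is Escherichia coli in NCBI Taxonomy):
#
#     organisms = verify_taxids(["562"])
#     # organisms == ["Escherichia coli (taxid:562)"]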
def copy_read_f5_2_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    #   from 'from_f5' multi-FAST5 file to 'to_f5' multi-FAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    if to_f5 is not None:  # handle no_trash
        try:
            from_f5.copy(read_name, to_f5)
        except ValueError as err:
            printlog_error_time("Error: `{}`".format(str(err)))
            printlog_error("Reason is probably the following:")
            printlog_error("  the read being copied to the result file is already in this file.")
            printlog_error("ID of the read: `{}`".format(read_name))
            printlog_error("File: `{}`".format(to_f5.filename))
            return
        # end try
    # end if
def get_primers_seqs(primers_fpath):
    # Function for obtaining primer sequence(s).
    # If primers_fpath is None, it returns the default primers:
    #   Illumina 16S V3-V4 primers.
    # Otherwise it parses primers from the provided fasta file.

    # Use Illumina V3-V4 primers by default
    if primers_fpath is None:
        primers = ('CCTACGGGNGGCWGCAG', 'GACTACHVGGGTATCTAATCC')
    else:
        primers = list()

        # Get lines
        try:
            with open(primers_fpath, 'r') as primers_file:
                lines = primers_file.readlines()
            # end with
        except OSError as oserror:
            printlog_error('Error while reading file of primers: {}'\
                .format(oserror))
            platf_depend_exit(1)
        # end try

        # Remove blank lines (lines still carry trailing newlines here, hence the strip)
        lines = list(filter(lambda x: x.strip() != '', lines))

        # There must be 1 or 2 primers in the primers file.
        if len(lines) not in (2, 4):
            printlog_error('Error: invalid format of the primers file. \
It should be a single-record (2 lines in total) or a two-record (4 lines in total) fasta file. \
But there are {} lines in your file.'.format(len(lines)))
            platf_depend_exit(1)
        # end if

        bases = 'AGCTUTYSWKMBDHVN'

        # Validate sequence(s).
        for i in range(1, len(lines), 2):
            seq = lines[i].strip().upper()
            # 'fullmatch' ensures that the whole sequence consists of permitted characters
            if re.fullmatch(r'[{}]+'.format(bases), seq) is None:
                printlog_error('Error: invalid character in a primer sequence. \
Here is the invalid primer sequence: `{}`. Permitted characters: `{}`'\
                    .format(seq, bases))
                platf_depend_exit(1)
            # end if
            primers.append(seq)
        # end for
    # end if

    return primers
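# Expected layout of a custom primers file: a plain fasta file with one primer
#   (2 lines) or two primers (4 lines). A sketch using the default Illumina
#   16S V3-V4 primer sequences from above:
#
#     >forward_primer
#     CCTACGGGNGGCWGCAG
#     >reverse_primer
#     GACTACHVGGGTATCTAATCC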
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #   again and again if the request fails.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of the url;
    # :type url: str;
    # :param request_for: some comment for the error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns the obtained response decoded as UTF-8 ('str').

    error = True

    # We can get a spurious 404 or suchlike due to the instability of NCBI servers.
    # Let's give it 3 attempts (with 15-second spans in between),
    #   and if all of them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    while error:
        try:
            conn = http.client.HTTPSConnection(server, timeout=30)  # create a connection
            conn.request("GET", url)  # send the request
            response = conn.getresponse()  # get the response

            if response.code != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(response.code, response.reason))
                    printlog_error("It may be due to unstable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                    sleep(15)
                    continue  # try again
                else:
                    printlog_error("Cannot find {} for {}.".format(request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.code, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if

            resp_content = str(response.read(), "utf-8")  # get response text
        except (OSError,\
                http.client.RemoteDisconnected,\
                socket.gaierror,\
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if request_for is not None:
                comment_str += " requesting for {}".format(request_for)
                if acc is not None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(server + url, comment_str))
            printlog_info(str(err))
            printlog_info("The program will sleep for 30 seconds and try to connect again.")
            sleep(30)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            conn.close()
        # end try
    # end while

    return resp_content
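# A usage sketch for lingering_https_get_request() (the accession is hypothetical);
#   this mirrors how download_taxonomy() below requests a GenBank summary page:
#
#     gb_summary = lingering_https_get_request(
#         "www.ncbi.nlm.nih.gov",
#         "/nuccore/{}".format("CP000000"),  # hypothetical accession
#         "GenBank summary", "CP000000")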
def _reformat_legacy_file(legacy_tax_path):
    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join((acc, tax_str))))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join((acc, taxonomy_from_file))))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # Function looks around in order to check if there are results from previous run(s) of this script,
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value decreasing size of the probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to infile_path) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #   This parameter is necessary because it is included in the names of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: amount of sequences meant to be processed in a single run;
    # :type probing_batch_size: int;

    # "hname" means human readable name (i.e. without file path and extension)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(1)  # get rid of `.fasta` extension

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname), blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from a previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info("A result file from a previous run is found in the directory:")
        printlog_info("   `{}`".format(new_dpath))
        # Politely ask whether to continue from the last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from the result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a header
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError("There must be 10 columns separated by tabs in file `classification.tsv`")
                # end if
            except Exception as err:
                printlog_error_time("Data in classification file `{}` not found or broken. Reason:"\
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))

                # If the reason is known -- print the erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are the numbers of the improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if

                # Ask the user whether to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info("You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info("{} sequences have been already processed".format(num_done_seqs))
            # end try
        # end if

        # Collect information from the accession file
        if os.path.exists(acc_fpath):
            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[9:]  # omit description and head of the table
                    local_files_filtered = list(
                        filter(lambda x: not os.path.exists(x), lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        if len(vals) == 1:
                            acc_dict[acc] = ["No definition of the sequence provided", 1]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [vals[1].strip(), int(vals[2].strip())]
                        # end if
                    # end for
                # end with
            except Exception as err:
                printlog_error_time("Data in accession file `{}` not found or broken. Reason:"\
                    .format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))

                # Ask the user whether to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info("You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                print()
                printlog_info("Here are the GenBank records encountered during previous run(s):")
                for acc, other_info in sorted(acc_dict.items(), key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get packet size, packet mode and RID from the temp file.
        # There can be invalid information in the tmp file, or the tmp file may not exist.
        try:
            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with
            RID_save = re.search(r"Request_ID: (.+)", temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)", temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})", temp_lines[2]).group(1).strip())
        except (AttributeError, OSError):
            # There is no need to disturb the user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that the user won't modify his/her probing batch size between erroneous runs:
            #   subtract num_done_seqs if probing_batch_size > num_done_seqs.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from the previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    return None
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves the taxonomy of a hit from NCBI.
    # Moreover, it saves this taxonomy in the taxonomy TSV file:
    #   <accession>\t<taxonomy_str>
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    # Get TaxID of the organism from the GenBank summary:
    gb_summary = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)

    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time("Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get the taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock"\
        .format(taxid)
    taxonomy_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                taxonomy_url, "taxonomy", hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # Get all taxonomic names of the organism
    taxonomy = re.findall(tax_rank_pattern, taxonomy_text)

    # We will convert ranks to lowercase just in case.
    # First, convert tuples to lists in order to be able to change them:
    taxonomy = list(map(list, taxonomy))

    # Convert ranks to lowercase:
    for i in range(len(taxonomy)):
        taxonomy[i][0] = taxonomy[i][0].lower()  # just in case
    # end for

    # We will leave only the following taxonomic ranks: domain, phylum, class, order, family, genus.
    # The species name requires special handling; it will be added later.
    ranks_to_select = ranks[:-1]

    # Remove redundant ranks:
    taxonomy = filter(lambda x: x[0].lower() in ranks_to_select, taxonomy)

    # Convert back to tuples:
    taxonomy = list(map(tuple, taxonomy))

    # E.g., this record has no appropriate ranks: CP034535.
    # Merely save its definition in that case.
    if len(taxonomy) == 0:
        # Save taxonomy
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        return
    # end if

    # Check if the species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if match_direct_species is not None:
        # If the species name is specified like other ranks, merely add it to the list:
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse the species name from the page title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)
        # Get words
        title = title.split(' ')

        # We will take all these words as the species name.
        # Viruses often have unpredictable names.
        #   Example: MN908947
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower() == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3.
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i in range(len(ranks)):
        if len(taxonomy) < i + 1:  # for this (missing in the end): AC150248
            taxonomy.append((ranks[i], ""))
        elif taxonomy[i][0] != ranks[i]:  # for this (missing in the middle): MN908947
            taxonomy.insert(i, (ranks[i], ""))
        # end if
    # end for

    # A tuple will be a bit faster
    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
    # end with
def map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir):
    # Function performs mapping of all reads stored in input FAST5 files
    #   to existing TSV files containing taxonomic annotation info.
    #
    # It creates a DBM index file.
    #
    # :param f5_path: path to current FAST5 file;
    # :type f5_path: str;
    # :param tsv_taxann_lst: list of paths to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(tax_annot_res_dir, index_name)  # directory that will contain indices

    # File validation:
    #   RuntimeError will be raised if the FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if the file at f5_path is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        f5_file = h5py.File(f5_path, 'r')

        for _ in f5_file:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        return
    # end try

    readids_to_seek = list(fast5_readids(f5_file))
    idx_dict = dict()  # dictionary for the index

    # This length is saved in order to compare it with 'len(readids_to_seek)'
    #   after all TSV files have been looked through, in order to
    #   determine whether some reads miss taxonomic annotation.
    len_before = len(readids_to_seek)

    # Iterate over TSV taxonomic annotation files
    for tsv_taxann_fpath in tsv_taxann_lst:
        with open(tsv_taxann_fpath, 'r') as taxann_file:

            # Get all read IDs in the current TSV
            readids_in_tsv = list(map(lambda l: l.split('\t')[0], taxann_file.readlines()))

            # Iterate over the remaining reads of the current FAST5
            #   ('reversed' is necessary because we remove items from the list in this loop)
            for readid in reversed(readids_to_seek):
                fmt_id = fmt_read_id(readid)[1:]
                if fmt_id in readids_in_tsv:
                    # Write data to the dict (and to the index later)
                    try:
                        idx_dict[tsv_taxann_fpath].append("read_" + fmt_id)  # append to the existing list
                    except KeyError:
                        idx_dict[tsv_taxann_fpath] = ["read_" + fmt_id]  # create a new list
                    finally:
                        readids_to_seek.remove(readid)
                    # end try
                # end if
            # end for
        # end with
        if len(readids_to_seek) == 0:
            break
        # end if
    # end for

    # If nothing has changed after all TSV files have been checked, then taxonomic annotation
    #   is missing for some reads! We will write their IDs to the 'missing_reads_lst.txt' file.
    if len(readids_to_seek) == len_before:
        printlog_error_time("reads from FAST5 file not found")
        printlog_error("FAST5 file: `{}`".format(f5_path))
        printlog_error("Some reads have not undergone taxonomic annotation.")
        missing_log = "missing_reads_lst.txt"
        printlog_error("The list of missing reads is in the following file:")
        printlog_error("{}".format(missing_log))
        with open(missing_log, 'w') as missing_logfile:
            missing_logfile.write("Missing reads from file '{}':\n\n".format(f5_path))
            for readid in readids_to_seek:
                missing_logfile.write(fmt_read_id(readid) + '\n')
            # end for
        # end with
        try:
            for path in glob(os.path.join(index_dirpath, '*')):
                os.unlink(path)
            # end for
            os.rmdir(index_dirpath)
        except OSError as oserr:
            printlog_error("Error occurred while removing index directory: {}".format(oserr))
        finally:
            platf_depend_exit(3)
        # end try
    # end if

    try:
        # Open index files, appending to existing data ('c' parameter)
        with open_shelve(os.path.join(index_dirpath, index_name), 'c') as index_f5_2_tsv:
            # Update index
            index_f5_2_tsv[f5_path] = idx_dict
        # end with
    except OSError as oserr:
        printlog_error_time("Error: cannot create index file `{}`"\
            .format(os.path.join(index_dirpath, index_name)))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
def configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download):
    # Function configures the accession dictionary according to the accession file
    #   generated by 'barapost-prober.py':
    #   keys are accessions, values are sequence names (definitions).
    #
    # :param acc_fpath: path to accession file generated by 'barapost-prober.py';
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of paths to user's fasta files;
    # :type your_own_fasta_lst: list<str>;
    #
    # Returns the accession dictionary described above.

    acc_dict = dict()

    # If the database will be created only from "your own" FASTA files -- return an empty dict
    if acc_fpath is not None:
        with open(acc_fpath, 'r') as acc_file:
            lines = acc_file.readlines()
            for line_idx, line in enumerate(lines):
                line = line.strip()
                # Ignore empty lines, commented lines and the head of the table:
                if line != "" and not line.startswith('#') and not line.startswith("ACCESSION"):
                    line_splt = line.split('\t')
                    acc = sys.intern(line_splt[0].partition('.')[0])
                    if re.match(GB_ACC_PATTERN, acc) is not None:
                        # We have encountered a GenBank accession number
                        try:
                            if len(line_splt) == 1:  # just an accession
                                name = "No definition of the sequence provided"
                            else:
                                name = line_splt[1]
                            # end if
                            acc_dict[acc] = name
                        except IndexError as err:
                            printlog_error_time("Error: invalid data in file `{}`!".format(acc_fpath))
                            printlog_error("Here is that invalid line:\n  `{}`".format(line))
                            printlog_error(str(err))
                            platf_depend_exit(1)
                        # end try
                    else:
                        # If it's not a GenBank accession number,
                        #   it is probably a path to a reference file.
                        if os.path.exists(line):
                            your_own_fasta_lst.append(line)
                        else:
                            printlog_error_time("Error in file `{}`.".format(acc_fpath))
                            printlog_error("Line #{} looks like a path to a reference file, but this file does not exist."\
                                .format(line_idx + 1))
                            printlog_error("Here is this invalid line:\n  `{}`".format(line))
                            platf_depend_exit(1)
                        # end if
                    # end if
                # end if
            # end for
        # end with
    # end if

    if len(your_own_fasta_lst) == 0 and len(acc_dict) == 0 and len(accs_to_download) == 0:
        printlog_error_time("Error: no accession information found in file `{}`".format(acc_fpath))
        platf_depend_exit(1)
    # end if

    return acc_dict
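# A sketch of lines that configure_acc_dict() understands (values are
#   hypothetical; columns are tab-separated):
#
#     # Comment lines, empty lines and the "ACCESSION..." table header are ignored.
#     CP000001<TAB>Some organism, complete genome
#     /path/to/own_reference.fasta
#
# The first data line yields an acc_dict entry; the second one is appended to
#   'your_own_fasta_lst' if the file exists.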
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning of FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for the quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for the length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for the alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for the alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences that pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure the record generator and the write function
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make the filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure the path to the corresponding trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path, min_qual, min_qlen)
    else:
        QL_trash_fpath = None
    # end if

    # Make the filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure the path to the corresponding trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(fastq_rec["seq_id"])[1:])  # get ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]  # find the hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in the TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence into the QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath], fastq_rec)  # write current read to trash file
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence into the align trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath], fastq_rec)  # write current read to trash file
        else:
            for hit_name in hit_names.split("&&"):  # there can be multiple hits for a single query sequence
                # Get the name of the result file to write this read into
                binned_file_path = os.path.join(outdir_path,
                    "{}.fast{}".format(hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path], fastq_rec)  # write current read to binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
printn("Primary validation...") if not untwist_fast5: for fpath in fast5_list: # Get number of directories in 'tax_annot_res_dir' where results of current FAST5 # baraposting are located. possible_fast5_resdirs_num = len( glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath)))) if possible_fast5_resdirs_num == 1: continue # OK elif possible_fast5_resdirs_num == 0: # there is no such a directory print() printlog_error_time( "Error: classification for following FAST5 file is missing:") printlog_error(" `{}`".format(fpath)) printlog_error( "Try running barapost-binning with `-u` (`--untwist-fast5`) flag." ) print() platf_depend_exit(5) else: # there are multiple directories where prober-barapost results can be located printlog_error_time( "Error: multiple result directories match FAST5 file meant to be binned" ) printlog_error("File: `{}`".format(os.path.basename(fpath))) printlog_error("Directories:") for d in glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath))): printlog_error(d) # end for
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins a FAST5 file with untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for the quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for the length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for the alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for the alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences that pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(tax_annot_res_dir, index_name)  # directory that contains indices

    # Make the filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure the path to the corresponding trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen)
    else:
        QL_trash_fpath = None
    # end if

    # Make the filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure the path to the corresponding trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if the FAST5 file is broken.
    try:
        # File existence checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if the file at f5_path is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    #   ("Raw" group is always in the singleFAST5 root and never in the multiFAST5 root)
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list()  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if f5_path not in index_f5_2_tsv.keys():
        printlog_error_time("Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild the index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():

        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error("It is stored in the `{}` FAST5 file".format(f5_path))
                printlog_error("Try to make a new index file (press ENTER on the corresponding prompt).")
                printlog_error("Or, if that does not work for you, make sure that taxonomic annotation info \
for this read is present in one of the TSV files generated by `barapost-prober.py` and `barapost-local.py`.")
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Place this read into the QL trash file
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Place this read into the align trash file
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split("&&"):  # there can be multiple hits for a single query sequence
                    # Get the name of the result FAST5 file to write this read into
                    binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for
    # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning of FASTQ and FASTA files.
    # It actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param min_qual: threshold for the quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for the length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for the alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for the alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences that pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

        # Configure the record generator and the write function
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make the filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure the path to the corresponding trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path, min_qual, min_qlen)
        else:
            QL_trash_fpath = None
        # end if

        # Make the filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure the path to the corresponding trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for the outer while-loop

        while not stop:

            # Extract a batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for the outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[read_name]  # find the hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in the TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error("Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If the read is found in the TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence into the QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence into the align trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):  # there can be multiple hits for a single query sequence
                        # Get the name of the result file to write this read into
                        binned_file_path = os.path.join(outdir_path,
                            "{}.fast{}".format(hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                        to_write[read_name] = (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write the batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of the 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def send_request(request, pack_to_send, packet_size, packet_mode, filename, tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #   and then waits for the request to be satisfied and retrieves the response text.
    #
    # :param request: request data (the dict that the `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now;
    # :type pack_to_send: int;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet mode;
    # :type packet_mode: int;
    # :param filename: name of the source file;
    # :type filename: str;
    # :param tmp_fpath: path to the temporary file in which the request state is saved;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with the BLAST response.

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    while error:
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time("`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info("barapost will try to connect again in 30 seconds...\n")
            sleep(30)
        # if no exception occurred
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)", response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(1))  # get time to wait provided by the NCBI server
    except AttributeError:
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("The response is in the file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for the results of the alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
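# The temporary file written above has exactly this layout (values are hypothetical):
#
#     Request_ID: ABCD1234XYZ
#     Packet_size: 100
#     Packet_mode: 0
#
# look_around() later recovers these values with the corresponding "Request_ID",
#   "Packet_size" and "Packet_mode" regexes in order to resume an interrupted run.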
def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns a dictionary where keys are IDs of sequences
    #     meant to be binned, and values are lists of the following format:
    #     [formatted hit name, quality, query length, identity, coverage].
    #
    # :param tsv_res_fpath: path to current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: tuple<str, int>;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    resfile_lines = dict()

    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:

        brpst_resfile.readline()  # pass the head of the table
        line = brpst_resfile.readline().strip()  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            try:
                quality = float(splt[8])  # we will filter by quality
            except ValueError as verr:
                if splt[8] == '-':
                    # Keep the minus sign if there is no quality information.
                    # No error will be raised.
                    quality = splt[8]
                else:
                    printlog_error_time("Query quality parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("Query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            try:
                pident = float(splt[5])  # we will filter by identity
            except ValueError as verr:
                if splt[5] == '-':
                    # Keep the minus sign if there is no identity information.
                    # No error will be raised.
                    pident = splt[5]
                else:
                    printlog_error_time(
                        "Alignment percent of identity parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                coverage = float(splt[4])  # we will filter by coverage
            except ValueError as verr:
                if splt[4] == '-':
                    # Keep the minus sign if there is no coverage information.
                    # No error will be raised.
                    coverage = splt[4]
                else:
                    printlog_error_time("Alignment coverage parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning(
                    "Can't find taxonomy for reference sequence `{}`".format(
                        hit_acc))
                printlog_warning("Trying to recover taxonomy.")

                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))

                # Update tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

                # Format again -- with the new tax_dict
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
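# To make the column indices parsed above concrete: a data row of
#   `classification.tsv` is expected to look roughly like this
#   (tab-separated; the values below are made up, and columns that
#   `configure_resfile_lines` does not read are elided):
#
#   read_1 <TAB> Escherichia coli <TAB> CP009685.1 <TAB> 1050 <TAB> 98.5 <TAB> 97.3 <TAB> ... <TAB> 14.9
#
#   i.e. column 0 is the read name, 1 -- hit name, 2 -- hit accession,
#   3 -- query length, 4 -- coverage, 5 -- percent of identity,
#   8 -- read quality. Such a row yields a dictionary entry like:
#
#   resfile_lines["read_1"] == ["Escherichia", 14.9, 1050, 97.3, 98.5]
#
#   (the first element depends on `sens` and the taxonomy file).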
def format_taxonomy_name(hit_acc, hit_def, sens, tax_dict):
    # Function formats taxonomy name according to the chosen binning sensitivity.
    #
    # :param hit_acc: accession(s) of best hit(s);
    # :type hit_acc: str;
    # :param hit_def: annotation of best hit;
    # :type hit_def: str;
    # :param sens: sensitivity returned by 'get_classif_sensibility()' function.
    #     Its first element can be one of the following strings: "genus", "species";
    # :type sens: tuple<str, int>;
    # :param tax_dict: taxonomy dictionary returned by function 'src.taxonomy.get_tax_dict';
    # :type tax_dict: dict;
    #
    # Returns formatted hit name of 'str' type.

    # If there is no hit, we know exactly what to do:
    if hit_def == "No significant similarity found":
        return "unknown"
    # end if

    best_hit_annots = list()  # list of strings that will be names of binned files

    for acc, annotation in zip(hit_acc.split('&&'), hit_def.split('&&')):

        # Get taxonomy
        try:
            taxonomy = tax_dict[acc]
        except KeyError:
            raise NoTaxonomyError()
        # end try

        # If it is a proper tuple-formatted taxonomy, find the rank name for the filename
        if isinstance(taxonomy, tuple):
            best_hit_annots.append(find_rank_for_filename(sens, taxonomy))
            if sens[0] == "species":
                # Prepend the genus name to the species name
                genus_sens = ("genus", sens[1] - 1)
                genus_name = find_rank_for_filename(genus_sens, taxonomy)
                species_name = best_hit_annots[-1]
                best_hit_annots[-1] = "{}_{}".format(genus_name, species_name)
            # end if
        # Otherwise consider sequence ID
        elif isinstance(taxonomy, str):
            # Check if hit is a sequence from a SPAdes or an a5 assembly:
            spades_match_obj = re.search(SPADES_PATT, annotation)
            a5_match_obj = re.search(A5_PATT, annotation)
            if spades_match_obj is not None:
                if sens[0] != "species":
                    contig_info = spades_match_obj.group(1)
                    taxonomy = taxonomy.replace('--' + contig_info, '')
                # end if
            elif a5_match_obj is not None:
                if sens[0] != "species":
                    contig_info = a5_match_obj.group(1)
                    taxonomy = taxonomy.replace('--' + contig_info, '')
                # end if
            # end if
            # Append the (possibly trimmed) taxonomy string
            best_hit_annots.append(taxonomy)
        else:
            # Execution must not reach here
            printlog_error_time("Fatal error 8754.")
            printlog_error("Please, contact the developer.")
            platf_depend_exit(8754)
        # end if
    # end for

    # Replace symbols not allowed in filenames
    best_hit_annots = map(remove_bad_chars, best_hit_annots)

    # Return deduplicated names, sorted so that file names are deterministic
    return "&&".join(sorted(set(best_hit_annots)))
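# `remove_bad_chars` is defined elsewhere in the package; for reference, a
#   minimal sketch of what such a helper has to guarantee (the exact character
#   set and the replacement character are assumptions):
import re

def _remove_bad_chars_sketch(name):
    # Substitute characters that are unsafe in file names
    #   (and runs of whitespace) with underscores.
    return re.sub(r'[\\/:*?"<>|\s]+', '_', name.strip())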
def ngmerge_runner(args):
    # Runner function for the NGmerge task.
    #
    # :param args: arguments for NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    #   1. A collection of valid ("merged") paths.
    #   2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into the working directory,
    #   so we will temporarily go to the output directory.
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output files' names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(
        args.infpaths[0])

    # Configure command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
                merged_basename, unmerged_prefix, args.n_thr,
                args.min_overlap, args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print('NGmerge is doing its job silently...')
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8')  # run NGmerge

    if pipe.returncode != 0:
        # error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError, IndexError) as err:
        printlog_error(
            'Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir)  # return to the old directory

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
                round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(
        glob.glob(
            os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    # The first returned value must be a collection.
    return [merged_fpath], unmerged_fpaths
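# A hypothetical invocation sketch for `ngmerge_runner`: `NGmergeArguments`
#   is defined elsewhere, so the namedtuple below only mirrors the attributes
#   the runner actually reads; all paths and parameter values are illustrative.
#   The flag meanings in the comments follow the command string built above.
import collections

_DemoNGmergeArgs = collections.namedtuple(
    '_DemoNGmergeArgs',
    ['ngmerge', 'infpaths', 'outdir', 'n_thr',
     'min_overlap', 'mismatch_frac', 'phred_offset'])

def _demo_run_ngmerge():
    args = _DemoNGmergeArgs(
        ngmerge='NGmerge',  # path to the NGmerge executable
        infpaths=['reads_R1.fastq', 'reads_R2.fastq'],
        outdir='outdir',
        n_thr=4,
        min_overlap=20,     # `-m`: minimum overlap of paired reads
        mismatch_frac=0.1,  # `-p`: fraction of mismatches allowed in overlap
        phred_offset=33)    # `-q`: FASTQ quality offset
    merged_fpaths, unmerged_fpaths = ngmerge_runner(args)
    return merged_fpaths, unmerged_fpaths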