def get_res_tsv_fpath(new_dpath):
    # Function returns path to the current classification TSV file.
    # Binning will be performed according to this file.
    #
    # :param new_dpath: current result directory;
    # :type new_dpath: str;
    #
    # Returns path (str) to `classification.tsv` inside `new_dpath`.
    # Raises IndexError if the file is absent (unchanged from the original behaviour).

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error("Please make sure you have performed taxonomic \
annotation of the following file: `{}` \
with `barapost-prober.py` and/or `barapost-local.py`".format(
            os.path.basename(new_dpath)))
        printlog_error(
            "Also this error might occur if you forget to specify result directory \
generated by `barapost-prober.py` with `-r` option.")
        platf_depend_exit(0)
    # end if

    # The redundant `lambda f: True if ... else False` filter is replaced with a
    # plain comprehension. Taking element [0] keeps the original IndexError
    # behaviour when no classification file is present.
    matches = [f for f in sorted(os.listdir(new_dpath)) if f == "classification.tsv"]
    tsv_res_fpath = matches[0]

    return os.path.join(new_dpath, tsv_res_fpath)
def create_result_directory(fq_fa_path, outdir_path):
    # Create (if needed) the per-input result directory and return its path.
    # The directory is named after the source FASTQ/FASTA file with its
    # (optionally gzipped) extension stripped.
    #
    # :param fq_fa_path: path to source FASTQ or FASTA file;
    # :type fq_fa_path: str;
    # :param outdir_path: path to directory in which result_directory will be created;
    # :type outdir_path: str;
    #
    # Returns 'str' path to the result directory.

    # Build the candidate path from the bare file name, then cut off the
    # `.fasta`/`.fastq` (possibly `.gz`-compressed) extension.
    candidate = os.path.join(outdir_path, os.path.basename(fq_fa_path))
    result_dpath = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$",
                             candidate).group(1)

    if os.path.exists(result_dpath):
        return result_dpath
    # end if

    try:
        os.makedirs(result_dpath)
    except OSError as oserr:
        printlog_error_time(
            "Error: can't create result directory: `{}`".format(result_dpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try

    return result_dpath
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Recover a missing taxonomy entry for the given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # Missing lambda-phage control taxonomy -- just add it.
        save_taxonomy_directly(taxonomy_path, acc, "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # An "own seq": its name is stored in the local database fasta file
        # (which may be gzip-compressed). Locate that file first.
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))

        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time(
                "Error: cannot recover taxonomy for following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error(
                "You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add taxonomy line for {} to file `{}`".format(
                acc, taxonomy_path))
            printlog_error(" and run the program again.")
            platf_depend_exit(1)
        # end try

        # Scan the fasta file for the title line starting with `>ACC `.
        opener = OPEN_FUNCS[is_gzipped(local_fasta)]
        formatter = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            prefix = b">" + bytes(acc, 'ascii') + b" "
        else:
            prefix = ">{} ".format(acc)
        # end if

        with opener(local_fasta) as fasta_file:
            for line in fasta_file:
                if not line.startswith(prefix):
                    continue
                # end if
                # Everything after the first space is the sequence name.
                seq_name = formatter(line).partition(' ')[2]
                save_taxonomy_directly(taxonomy_path, acc, seq_name)
                break
            # end for
        # end with
    else:
        # A regular accession: try to find taxonomy in NCBI.
        download_taxonomy(acc, hit_def, taxonomy_path)
    # end if
def _is_redundant(nc_acc, accs):
    # Function checks if "NC-or-NW"-record is redundant (if it's non-RefSeq copy already exists in acc_dict).
    #
    # :param nc_acc: accession number of NC-record;
    # :type nc_acc: str;
    # :param accs: tuple of accession numbers;
    # :type accs: tuple<str>;
    #
    # Returns tuple (identical_genbank_accession, bool: accession already in `accs`).
    # On failure of any parsing step the program exits via platf_depend_exit(1).

    # NOTE(review): `summary` is fetched but never used below -- looks like a
    # leftover from refactoring (or a deliberate warm-up request); confirm before removing.
    summary = lingering_https_get_request(
        "www.ncbi.nlm.nih.gov",
        "/nuccore/{}?report=genbank&log$=seqview".format(nc_acc), "summary",
        nc_acc)

    try:
        # Find link to Identical GenBank Record
        # Firstly, get GI number of NC seqeunce:
        get_gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        nc_gi_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                 get_gi_url,
                                                 "GI of {}".format(nc_acc),
                                                 nc_acc)
        nc_gi_text = nc_gi_text.replace('\n', '')
        # The GI number is rendered inside a <pre> element of the response page.
        nc_gi_re = re.search(r"\<pre\>([0-9]+).*\</pre\>", nc_gi_text)
        if nc_gi_re is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        nc_gi = nc_gi_re.group(1)

        # Retrieve identical GenBank sequence accession number.
        # NCBI redirects these requests and provides necessary location in headers.
        # So, we'll follow thin link.
        identical_gb_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            nc_gi)
        redirect_text = _ling_https_getreq_handl_301(
            "www.ncbi.nlm.nih.gov", identical_gb_link,
            "link to identical genbank sequence", nc_acc)

        # Get accession number from the response text
        pattern = r"\<pre\>(.*).*\</pre\>"
        ident_acc_re = re.search(pattern, redirect_text.replace('\n', ''))
        if ident_acc_re is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        # Drop the version suffix (".1", ".2", ...) from the accession.
        ident_acc = ident_acc_re.group(1).partition('.')[0]
    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
    else:
        # Redundant if the identical GenBank accession is already collected.
        return ident_acc, ident_acc in accs
def rename_file_verbosely(file):
    # Verbosely rename a file (or a directory), appending/advancing an
    # "_old_<number>" suffix so previous results are preserved.
    #
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;
    #
    # Returns the new path (str), or None if `file` does not exist.

    if not os.path.exists(file):
        return None
    # end if

    # Parent directory of the target -- analog counting happens there.
    parent_dir = os.path.abspath(os.path.dirname(file))

    if os.path.isdir(file):
        # Directories keep their full name; no extension handling needed.
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        # Files: split name and extension so the suffix lands before the ext.
        is_analog = lambda f: re.search(r"(.*)\..*$",
                                        os.path.basename(file)).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # How many entries in the parent directory look like analogs of `file`.
    analog_count = len(list(filter(is_analog, os.listdir(parent_dir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # No suffix yet -- append "_old_<number>".
        new_name = name_itself + "_old_" + str(analog_count) + ext
    else:
        # A suffix exists -- merely substitute a new number.
        old_number = re.search(r"_old_([0-9]+)", file).group(1)
        new_name = file.replace(old_number, str(analog_count + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
def update_file_dict(srt_file_dict, new_fpath):
    # Register a sorted-output file in the dict, opening it in append mode.
    # A None path is stored as-is with a None handle (no_trash mode).
    #
    # :param srt_file_dict: mapping of file path -> open file handle (or None);
    # :param new_fpath: path of the file to open and register, or None;
    #
    # Returns the (mutated) srt_file_dict.
    try:
        if new_fpath is None:
            srt_file_dict[new_fpath] = None  # handle no_trash
        else:
            # Intern the path: these keys are compared/looked up frequently.
            srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
        # end if
    except OSError as oserr:
        printlog_error_time("Error occured while opening one of result files")
        printlog_error("Errorneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return srt_file_dict
def remove_tmp_files(*paths):
    # Function removes files passed to it.
    #
    # :param paths: an array-like collection of paths of files;
    # :type paths: list<str>;
    #
    # Nonexistent paths are silently skipped; a failed unlink is fatal.
    for path in paths:
        if not os.path.exists(path):
            continue
        # end if
        try:
            os.unlink(path)
        except OSError as oserr:
            # Bug fix: previously `.format(path)` was applied to the RETURN
            # VALUE of printlog_error_time, so the message was logged with a
            # bare `{}` placeholder and the path never appeared in it.
            printlog_error_time("Error: cannot remove file `{}`".format(path))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end for
def look_around(new_dpath, fq_fa_path):
    # Check whether results from a previous run of this script exist.
    #
    # Returns None if there is no result from previous run.
    # Otherwise returns a dict of the following structure:
    # {
    #     "tsv_respath": path to TSV file from previous run (str),
    #     "n_done_reads": number of successfully processed sequences (int),
    # }
    #
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param fq_fa_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type fq_fa_path: str;

    # "hname" means human readable name (i.e. without file path and extention).
    # Kept for parity with the original code even though only the side effect
    # (an AttributeError on a non-matching name) is observable here.
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a",
                            os.path.basename(fq_fa_path)).group(1)

    # Path to the classification file of a previous run, if any.
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    if not os.path.exists(tsv_res_fpath):
        return None
    # end if

    num_done_reads = 0  # number of successfully processed sequences
    with open(tsv_res_fpath, 'r') as res_file:
        # There can be invalid information in result file
        try:
            num_done_reads = len(res_file.readlines()) - 1  # the first line is a head
        except OSError as err:
            printlog_error_time("Data in classification file `{}` is broken. Reason:"\
                .format(tsv_res_fpath))
            printlog_error(str(err))
            printlog_error("Starting from the beginning.")
            rename_file_verbosely(tsv_res_fpath)
            return None
        # end try
    # end with

    return {
        "tsv_respath": tsv_res_fpath,
        "n_done_reads": num_done_reads,
    }
def copy_single_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    # from 'from_f5' singleFAST5 file to 'to_f5' multiFAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;
    #
    # Returns None. On a ValueError (read already present in destination)
    # the error is reported and the read is skipped.

    # Handle no_trash: a None destination means this read is being discarded.
    if to_f5 is None:
        return
    # end if

    try:
        read_group = read_name
        to_f5.create_group(read_group)  # create group in destination multi_FAST5 file

        # Copy "UniqueGlobalKey" to root of recently created group
        for ugk_subgr in from_f5["UniqueGlobalKey"]:
            from_f5.copy("UniqueGlobalKey/"+ugk_subgr, to_f5[read_group])
        # end for

        # Get data array in single-FAST5 file.
        # A single-FAST5 file holds exactly one entry under "Raw/Reads"
        # -- presumably; `next(iter(...))` takes whichever comes first.
        read_number_group = "Raw/Reads/"+next(iter(from_f5["Raw"]["Reads"]))
        # It's name in multi-FAST5 file
        read_number = re.search(r"(Read_[0-9]+)", read_number_group).group(1)

        # Copy group to multi-FAST5 file
        from_f5.copy(from_f5[read_number_group], to_f5[read_group])
        # Move data array to "Raw" group, as it is in multi-FAST5 files
        to_f5.move("{}/{}".format(read_group, read_number),
                   "{}/Raw".format(read_group))

        # Copy everything else to recently created group
        for group in from_f5:
            if group != "Raw" and group != "UniqueGlobalKey":
                from_f5.copy(group, to_f5["/{}".format(read_group)])
            # end if
        # end for
    except ValueError as err:
        # h5py raises ValueError when the destination name already exists.
        printlog_error_time("Error: `{}`".format(str(err)))
        printlog_error("Reason is probably the following:")
        printlog_error(" read that is copying to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
        return
    # end try
def whether_to_build_index(index_dirpath):
    # Function checks if there are any files in index directory.
    # If there are any, it asks a user whether to create a new index or to use old one.
    #
    # :param index_dirpath: path to index directory;
    # :type index_dirpath: str;
    #
    # Returns True if the user chose to reuse the old index, False otherwise.

    use_old_index = False

    if len(os.listdir(index_dirpath)) == 0:
        # Nothing left from previous runs -- a new index will be built.
        return use_old_index
    # end if

    printlog_info(
        "Index file created by `-u` option already exists (left from previous run)."
    )

    while True:
        reply = input("""
Press ENTER to make new index file
  or enter 'u' to use old index file:>>""")
        if reply == "":
            # Rebuild: wipe old index files first.
            try:
                for path in glob(os.path.join(index_dirpath, '*')):
                    os.unlink(path)
                # end for
            except OSError as oserr:
                printlog_error_time("Error: cannot remove old index files!")
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
            break
        elif reply == 'u':
            use_old_index = True
            break
        else:
            print("Invalid reply!\n")
        # end if
    # end while

    printlog_info("You have chosen to {} index file.".format(
        "use old" if use_old_index else "make new"))
    print()

    return use_old_index
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir, db_path):
    """
    Launch the 'blastn' utility from the "BLAST+" toolkit and return its output.

    :param packet: FASTA data meant to be processed by 'blastn';
    :type packet: str;
    :param blast_algorithm: blastn algorithm to use;
    :type blast_algorithm: str;
    :param use_index: logic value indicating whether to use index;
    :type use_index: bool;
    :param queries_tmp_dir: path to directory with query files;
    :type queries_tmp_dir: str;
    :param db_path: path to database;
    :type db_path: str;

    Returns blastn's stdout decoded as UTF-8 (XML, outfmt 5).
    """

    # Packets are too large to pass to 'subprocess.Popen' via stdin,
    # so the query goes through a temporary file. The current PID is a
    # stable, per-process marker for that file's name.
    query_path = os.path.join(queries_tmp_dir,
                              "query{}_tmp.fasta".format(os.getpid()))
    with open(query_path, 'w') as query_file:
        query_file.write(packet)
    # end with

    # Configure command line
    blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\
        .format(query_path, db_path, blast_algorithm, use_index)

    pipe = sp.Popen(blast_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    out, err = pipe.communicate()

    if pipe.returncode != 0:
        printlog_error_time(
            "Error occured while aligning a sequence against local database")
        printlog_error(err.decode("utf-8"))
        platf_depend_exit(pipe.returncode)
    # end if

    return out.decode("utf-8")
def copy_read_f5_2_f5(from_f5, read_name, to_f5):
    # Copy a read with ID 'read_name' from one multiFAST5 file to another.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    # A None destination means this read is being discarded (no_trash).
    if to_f5 is None:
        return
    # end if

    try:
        from_f5.copy(read_name, to_f5)
    except ValueError as err:
        # h5py raises ValueError when the read already exists in `to_f5`.
        printlog_error_time("Error: `{}`".format(str(err)))
        printlog_error("Reason is probably the following:")
        printlog_error(" read that is copying to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
    # end try
    return
def add_lambda_phage(local_fasta, taxonomy_path):
    # Write the control sequence of nanopore lambda phage DNA-CS into the
    # database fasta file and record its taxonomy.
    #
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    print()
    printlog_info_time("Adding lambda phage control sequence...")

    # sys.path[0] is directory containing the script that was used to invoke
    # the Python interpreter; the control sequence ships next to it.
    control_fpath = os.path.join(os.path.dirname(sys.path[0]),
                                 "lambda_control",
                                 "nanopore_lambda_DNA-CS_control.fasta.gz")

    if not os.path.exists(control_fpath):
        printlog_error_time(
            "Error: cannot find lambda phage control sequence: '{}'".format(
                control_fpath))
        platf_depend_exit(1)
    # end if

    # Read the gzipped control sequence and append-free write it into the db file.
    with open_as_gzip(control_fpath, 'rb') as control_file:
        control_fasta = control_file.read()
    # end with
    with open(local_fasta, 'wb') as db_fasta_file:
        db_fasta_file.write(control_fasta)
    # end with

    # Record lambda's taxonomy.
    taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA",
                                    "Lambda-phage-nanopore-control")
    printlog_info_time(" ok")
# generated by prober and barapost. printn("Primary validation...") if not untwist_fast5: for fpath in fast5_list: # Get number of directories in 'tax_annot_res_dir' where results of current FAST5 # baraposting are located. possible_fast5_resdirs_num = len( glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath)))) if possible_fast5_resdirs_num == 1: continue # OK elif possible_fast5_resdirs_num == 0: # there is no such a directory print() printlog_error_time( "Error: classification for following FAST5 file is missing:") printlog_error(" `{}`".format(fpath)) printlog_error( "Try running barapost-binning with `-u` (`--untwist-fast5`) flag." ) print() platf_depend_exit(5) else: # there are multiple directories where prober-barapost results can be located printlog_error_time( "Error: multiple result directories match FAST5 file meant to be binned" ) printlog_error("File: `{}`".format(os.path.basename(fpath))) printlog_error("Directories:") for d in glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath))): printlog_error(d)
def _parse_float_keep_dash(raw_value, field_label):
    # Parse one numeric column of the classification table.
    # A '-' placeholder (no data for this column) is kept as-is so that it
    # can be propagated downstream; any other non-float value is fatal.
    #
    # :param raw_value: raw column value from the TSV row;
    # :type raw_value: str;
    # :param field_label: human-readable column name used in error messages;
    # :type field_label: str;
    if raw_value == '-':
        # Keep minus as quality if there is no quality information.
        return raw_value
    # end if
    try:
        return float(raw_value)
    except ValueError as verr:
        printlog_error_time("{} parsing error".format(field_label))
        printlog_error(str(verr))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(1)
    # end try
# end def


def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns dictionary, where keys are sequence (i.e. sequences meant to be binned) IDs,
    # and values are corresponding hit names.
    #
    # :param tsv_res_fpath: path to current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;
    #
    # The three float columns (quality, identity, coverage) share one parsing
    # helper; length is parsed separately because a '-' placeholder is NOT
    # allowed for it (this preserves the original behaviour exactly).

    resfile_lines = dict()
    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:
        brpst_resfile.readline()  # pass the head of the table
        line = brpst_resfile.readline().strip()  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            # we will filter by quality
            quality = _parse_float_keep_dash(splt[8], "query quality")

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            # we will filter by identity and coverage
            pident = _parse_float_keep_dash(splt[5],
                                            "Alignment percent of identity")
            coverage = _parse_float_keep_dash(splt[4], "alignment coverage")

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning(
                    "Can't find taxonomy for reference sequence `{}`".format(
                        hit_acc))
                printlog_warning("Trying to recover taxonomy.")
                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))
                # Update tax_dict and format again -- with new tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    # according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #     (each processed file has it's own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index ("true"/"false");
    # :type use_index: str;
    #
    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory exists -- ask the user what to do with it.
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )
                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")
                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # remove all after '.fasta'
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(
                        ".fasta")[1]
                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new one:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))
                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if
                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for
                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If accession file does not exist and execution has reached here -- everything is OK --
    # we are building a database from user's files only.
    if len(acc_dict) != 0:
        print()
        print("""Following sequences (and all replicons related to them) will be downloaded from Genbank
for further taxonomic classification on your local machine:\n""")
        printlog_info("Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1,
                                             len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary for not allowing duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assembly made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated y SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated y a5
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)

        # Iterating in reverse lets us remove the current element safely.
        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove these file from list -- they will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assemblies files in multi-fasta file
        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments to "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them securely with regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers first word (sep. is space) as sequence ID
                        # and throws an error if there are duplicated IDs.
                        # In order not to allow this duplication we'll create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with '.1'
    # (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in taxonomy file.
    # Therefore we will prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    # Replace the original fasta with the accession-corrected one.
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if
        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))
    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
if blast_algorithm == "megaBlast": blast_algorithm = "megablast" elif blast_algorithm == "discoMegablast": blast_algorithm = "dc-megablast" # end if import src.legacy_taxonomy_handling as legacy_taxonomy_handling # Form path to taxonomy file: taxonomy_dir = os.path.join(tax_annot_res_dir, "taxonomy") if not os.path.isdir(taxonomy_dir): try: os.makedirs(taxonomy_dir) except OSError as err: printlog_error_time( "Error: cannot create taxonomy directory `{}`".format( taxonomy_dir)) printlog_error_time(str(err)) platf_depend_exit(1) # end try # end if taxonomy_path = os.path.join(taxonomy_dir, "taxonomy.tsv") # Check if there is legacy taxonomy file and, if so, reformat it to new (TSV) format legacy_taxonomy_handling.check_deprecated_taxonomy(tax_annot_res_dir) from src.barapost_local_modules.build_local_db import build_local_db # Indexed discontiguous searches are not supported: # https://www.ncbi.nlm.nih.gov/books/NBK279668/#usermanual.Megablast_indexed_searches if use_index == "true" and blast_algorithm == "dc-megablast":
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # Function looks around in order to check if there are results from previous run(s) of this script
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from previous run
    #   (or if the user declines to resume / chooses to start over).
    # If there are results from previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successfull_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value decreasing size of probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #     This parameter is necessary because it is included in name of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    #     NOTE(review): mutated in place -- hits from the previous run's accession
    #     file are merged into it;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: amount of sequences meant to be processed in a single run;
    # :type probing_batch_size: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of `.fasta` extention

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname),
                                        blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info(
            "A result file from previous run is found in the directory:")
        printlog_info(" `{}`".format(new_dpath))
        # Allow politely to continue from last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        # Archive stale results instead of overwriting them silently.
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a head
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError(
                        "There must be 10 colums separated by tabs in file `classification.tsv`"
                    )
                # end if
            except Exception as err:
                printlog_error_time(
                    "\nData in classification file `{}` not found or broken. Reason:"
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))
                # If the reason is known -- print erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if
                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                # Result file is sane -- report resume point.
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info(
                    "{} sequences have been already processed".format(
                        num_done_seqs))
            # end try
        # end if

        # Collect information from accession file
        if os.path.exists(acc_fpath):
            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[
                        9:]  # omit description and head of the table
                    local_files_filtered = list(
                        filter(lambda x: False if os.path.exists(x) else True,
                               lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        # Tolerate rows with 1, 2, or 3 columns:
                        #   (accession [, definition [, hit count]])
                        if len(vals) == 1:
                            acc_dict[acc] = [
                                "No definition of the sequence provided", 1
                            ]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [
                                vals[1].strip(),
                                int(vals[2].strip())
                            ]
                        # end if
                    # end for
                # end with
            except Exception as err:
                printlog_error_time(
                    "Data in accession file `{}` not found or broken. Reason:".
                    format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))
                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                # Accession file is sane -- show the hits seen so far,
                #   most frequently hit records first.
                print()
                printlog_info(
                    "Here are Genbank records encountered during previous run(s):"
                )
                for acc, other_info in sorted(acc_dict.items(),
                                              key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get packet size, number of the last sent packet and RID from temp file.
        # There can be invalid information in tmp file or tmp file may not exist
        try:
            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with
            RID_save = re.search(r"Request_ID: (.+)",
                                 temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)",
                          temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})",
                          temp_lines[2]).group(1).strip())
        except (AttributeError, OSError):
            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that a user won't modify his/her brobing_batch size between erroneous runs:
            #   subtract num_done_reads if probing_batch_size > num_done_reads.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    # No previous results (fresh start).
    return None
def map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir):
    # Function perform mapping of all reads stored in input FAST5 files
    #   to existing TSV files containing taxonomic annotation info.
    #
    # It creates an DBM index file (via `open_shelve`) mapping the FAST5 path
    #   to a dict {tsv_path: [read IDs found in that TSV]}.
    # On failure it either returns early (broken FAST5) or terminates the
    #   whole program (missing annotation / index write error).
    #
    # :param f5_path: path to current FAST5 file;
    # :type f5_path: str;
    # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if
        f5_file = h5py.File(f5_path, 'r')
        # Touch the file once: iterating forces h5py to actually read,
        #   surfacing corruption as RuntimeError.
        for _ in f5_file:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        return
    # end try

    readids_to_seek = list(fast5_readids(f5_file))
    idx_dict = dict()  # dictionary for index

    # This saving is needed to compare with 'len(readids_to_seek)'
    #   after all TSV will be looked through in order to
    #   determine if some reads miss taxonomic annotation.
    len_before = len(readids_to_seek)

    # Iterate over TSV-taxann files
    for tsv_taxann_fpath in tsv_taxann_lst:
        with open(tsv_taxann_fpath, 'r') as taxann_file:
            # Get all read IDs in current TSV (first tab-separated column)
            readids_in_tsv = list(
                map(lambda l: l.split('\t')[0], taxann_file.readlines()))

            # Iterate over all other reads in current FAST5
            #   ('reversed' is necessary because we remove items from list in this loop)
            for readid in reversed(readids_to_seek):
                fmt_id = fmt_read_id(readid)[1:]
                if fmt_id in readids_in_tsv:
                    # If not first -- write data to dict (and to index later)
                    try:
                        idx_dict[tsv_taxann_fpath].append(
                            "read_" + fmt_id)  # append to existing list
                    except KeyError:
                        idx_dict[tsv_taxann_fpath] = ["read_" + fmt_id
                                                      ]  # create a new list
                    finally:
                        # Found or not-yet-listed: either way this read is resolved.
                        readids_to_seek.remove(readid)
                    # end try
                # end if
            # end for
        # end with
        if len(readids_to_seek) == 0:
            break
        # end if
    # end for

    # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation
    #   for some reads! And we will write their IDs to 'missing_reads_lst.txt' file.
    if len(readids_to_seek) == len_before:
        printlog_error_time("reads from FAST5 file not found")
        printlog_error("FAST5 file: `{}`".format(f5_path))
        printlog_error("Some reads have not undergone taxonomic annotation.")
        missing_log = "missing_reads_lst.txt"
        printlog_error("List of missing reads are in following file:")
        printlog_error("{}".format(missing_log))
        with open(missing_log, 'w') as missing_logfile:
            missing_logfile.write(
                "Missing reads from file '{}':\n\n".format(f5_path))
            for readid in readids_to_seek:
                missing_logfile.write(fmt_read_id(readid) + '\n')
            # end for
        # Remove the (now inconsistent) index directory, then abort in any case.
        try:
            for path in glob(os.path.join(index_dirpath, '*')):
                os.unlink(path)
            # end for
            os.rmdir(index_dirpath)
        except OSError as oserr:
            printlog_error(
                "Error occured while removing index directory: {}".format(
                    oserr))
        finally:
            platf_depend_exit(3)
        # end try
    # end if

    try:
        # Open index files appending to existing data ('c' parameter)
        with open_shelve(os.path.join(index_dirpath, index_name),
                         'c') as index_f5_2_tsv:
            # Update index
            index_f5_2_tsv[f5_path] = idx_dict
        # end with
    except OSError as oserr:
        printlog_error_time("Error: cannot create index file `{}`"\
            .format(os.path.join(index_dirpath, index_name)))
        printlog_error(str(oserr))
        platf_depend_exit(1)
def send_request(request, pack_to_send, packet_size, packet_mode, filename,
                 tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #   and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request_data (it is a dict that `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: current number (like id) of packet meant to be sent now
    #     (ordinal number of packet);
    # :type pack_to_send: int;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet mode, saved to the temporary file for resumption;
    # :type packet_mode: int;
    # :param filename: name of the source file, passed through to `wait_for_align`;
    # :type filename: str;
    # :param tmp_fpath: path to temporary file where RID/packet info is saved;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with BLAST response
    #     (whatever `wait_for_align` returns).

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    # Retry indefinitely: NCBI may be temporarily unreachable.
    while error:
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
            sleep(30)
        # if no exception occured
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)",
                        response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(
            1))  # get time to wait provided by the NCBI server
    except AttributeError:
        # No RID/RTOE in the response -- NCBI rejected the submission.
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data so an interrupted run can be resumed
    #   (parsed back by `look_around`).
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
def configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download):
    # Function configures accession dictionary according to accession file
    #   generated by 'barapost-prober.py': keys are accessions, values are
    #   sequence definitions (names).
    #
    # :param acc_fpath: path to accession file generated by 'barapost-prober.py'
    #     (may be None if the database is built only from user's FASTA files);
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of paths to user's fasta files;
    #     mutated in place: reference-file paths found in the accession file
    #     are appended to it;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: accessions requested for download;
    # :type accs_to_download: list<str>;
    #
    # Returns accession dictionary described above.

    acc_dict = dict()

    # If database will be created only from 'your own' FASTA files --
    #   there is nothing to parse and an empty dict is returned.
    if acc_fpath is not None:
        with open(acc_fpath, 'r') as acc_file:
            all_lines = acc_file.readlines()

            for line_idx, raw_line in enumerate(all_lines):
                line = raw_line.strip()

                # Ignore empty lines, commented lines and head of the table.
                if line == "" or line.startswith('#') \
                        or line.startswith("ACCESSION"):
                    continue
                # end if

                line_splt = line.split('\t')
                # Accession version (after the dot) is dropped.
                acc = sys.intern(line_splt[0].partition('.')[0])

                if re.match(GB_ACC_PATTERN, acc) is not None:
                    # GenBank accession number encountered.
                    try:
                        name = ("No definition of the sequence provided"
                                if len(line_splt) == 1 else line_splt[1])
                        acc_dict[acc] = name
                    except IndexError as err:
                        printlog_error_time(
                            "Error: invalid data in file `{}`!".format(
                                acc_fpath))
                        printlog_error(
                            "Here is that invalid line:\n `{}`".format(line))
                        printlog_error(str(err))
                        platf_depend_exit(1)
                    # end try
                elif os.path.exists(line):
                    # Not a GenBank accession -- a path to an existing
                    #   reference file.
                    your_own_fasta_lst.append(line)
                else:
                    # Looks like a path, but no such file on disk.
                    printlog_error_time(
                        "Error in file `{}`.".format(acc_fpath))
                    printlog_error("Line #{} looks like path to reference file, but this file does not exist."\
                        .format(line_idx+1))
                    printlog_error(
                        "Here is this invalid line:\n `{}`".format(line))
                    platf_depend_exit(1)
                # end if
            # end for
        # end with
    # end if

    # Nothing at all to build a database from -- fatal.
    if not your_own_fasta_lst and not acc_dict and not accs_to_download:
        printlog_error_time(
            "Error: no accession information found in file `{}`".format(
                acc_fpath))
        platf_depend_exit(1)
    # end if

    return acc_dict
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta):
    # Function downloads set of records from Genbank according to accessions passed to it.
    # Downloaded FASTA file will be placed in 'db_dir' directory and named 'local_seq_set.fasta'
    #
    # Records are fetched in chunks of 100 via NCBI E-utilities (efetch),
    #   preferring GNU wget when it is on PATH and falling back to
    #   urllib otherwise. Chunks are appended to `local_fasta`.
    #
    # :param acc_dict: dictionary comntaining accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param db_dir: path to directory in which downloaded FASTA file will be placed;
    # :type db_dir: str;
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;

    # Path to file with current chunk (see below "100 accession numbers...")
    tmp_fasta = os.path.join(db_dir, "tmp.fasta")

    accessions = tuple(set(acc_dict.keys()))
    if len(accessions) == 0:  # just in case
        return
    # end if

    # 100 accession numbers in order not to make too long URL
    # Download genomes by chunks of 100 sequences.
    max_accnum = 100
    i = 0
    accnum = len(accessions)

    while i < accnum:
        curr_accessions = accessions[i:i + max_accnum]  # slice chunk

        accs_del_comma = ','.join(
            curr_accessions)  # accessions must be separated by comma in url

        # E-utilities provide a possibility to download records from Genbank by accessions.
        retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\
db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma)
        log_info("Retrieve URL: `{}`".format(retrieve_url))

        # GNU wget utility is safer, but there can be presence of absence of it :)
        wget_util = "wget"
        util_found = False
        for d in os.environ["PATH"].split(os.pathsep):
            if os.path.isdir(d) and wget_util in os.listdir(d):
                util_found = True
                break
            # end if
        # end for

        print()
        printlog_info("{} - Downloading {} reference sequences...".format(
            getwt(), len(curr_accessions)))

        if util_found:
            # If we have wget -- just use it
            wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format(
                retrieve_url, tmp_fasta)
            pipe = sp_Popen(wget_cmd, shell=True)
            pipe.communicate()
            if pipe.returncode != 0:
                printlog_error_time(
                    "Error occured while downloading reference sequences")
                platf_depend_exit(pipe.returncode)
            # end if
        else:
            # If there are no wget -- we will download sequences with Python disposal
            stop_wait = Event(
            )  # a flag variable that will signal waiter-function to stop executing

            def download_waiter(stop_wait):
                """
                Function waits untill 'local_fasta' file is downloaded.
                It prints size of downloaded data to console during downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait untill downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2  # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get size of downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size,
                                  1)  # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1)  # instant updates are not necessary
                # end while

                # Print total size of downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can pass this ecxeption -- we do delete this file if downloading crushes
                    # And this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(
                    getwt(), fsize))
            # end def download_waiter

            error = True
            while error:
                try:
                    waiter = Thread(target=download_waiter,
                                    args=(stop_wait, ))  # create thread
                    stop_wait.set()  # raise the flag
                    waiter.start()  # start waiting
                    urllib.request.urlretrieve(
                        retrieve_url, tmp_fasta)  # retrieve FASTA file
                except OSError as err:
                    printlog_error_time(
                        "Error occured while downloading fasta file.")
                    printlog_error(str(err))
                    printlog_error(
                        "`barapost-local.py` will try again in 30 seconds")
                    if os.path.exists(tmp_fasta):
                        os.unlink(tmp_fasta)
                    # end if
                    sleep(30)
                else:
                    error = False
                finally:
                    stop_wait.clear()  # lower the flag
                    waiter.join(
                    )  # main thread will wait until waiter function ends it's work
                # end try
            # end while
        # end if

        printlog_info_time("Downloading is completed")

        # Write chunk to result fasta file
        with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile:
            outfile.write(infile.read())
        # end with

        # Remove temp chunk file
        os.unlink(tmp_fasta)
        i += max_accnum  # go to next chunk
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves taxonomy of a hit from NCBI.
    # Moreover, it saves this taxonomy in file `taxonomy_path` (TSV):
    #     <accession>\t<taxonomy_str>
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    # Get TaxID of the organism from GenBank summary:
    gb_summary = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)
    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time(
            "Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock".format(
        taxid)
    taxonomy_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                taxonomy_url, "taxonomy",
                                                hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # Get all taxonomic names of the organism
    taxonomy = re.findall(tax_rank_pattern, taxonomy_text)

    # We will convert ranks to lowercase just in case.
    # Firstly convert tuples to lists in order to change them:
    taxonomy = list(map(lambda x: list(x), taxonomy))

    # Remove odd information from beginnig of names:
    for i in range(len(taxonomy)):
        taxonomy[i][0] = taxonomy[i][0].lower()  # just in case
    # end for

    # We will leave only following taxonomic ranks: domain, phylum, class, order, family, genus.
    # Species name requires special handling, it will be added later.
    ranks_to_select = ranks[:-1]

    # Remove redundant ranks:
    taxonomy = filter(lambda x: x[0].lower() in ranks_to_select, taxonomy)

    # Convert back to tuples:
    taxonomy = list(map(lambda x: tuple(x), taxonomy))

    # E.g., this record has no appropriate ranks: CP034535
    # Merely save it's definition and return.
    if len(taxonomy) == 0:
        # Save taxonomy
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        # BUGFIX: without this `return` execution fell through, appended a
        #   species and rank padding, and wrote a second (bogus) taxonomy
        #   line for the same accession.
        return
    # end if

    # Check if species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if not match_direct_species is None:
        # If species name is specified like other ranks, merely add it to list:
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse species name from title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)

        # Get words
        title = title.split(' ')

        # We will take all this words as species name.
        # Viruses also often have unpredictable names.
        #   Example: MN908947
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower(
            ) == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i in range(len(ranks)):
        if len(taxonomy) < i + 1:  # for this (missing in the end): AC150248
            taxonomy.append((ranks[i], ""))
        elif taxonomy[i][0] != ranks[
                i]:  # for this (mising in the middle): MN908947
            taxonomy.insert(i, (ranks[i], ""))
        # end if
    # end for

    # It will be a bit faster
    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    #
    # Reads are routed to per-taxon `.fast5` files in the output directory,
    #   or to "trash" files if they fail the quality/length or
    #   identity/coverage filters.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns tuple (seqs_pass, QL_seqs_fail, align_seqs_fail) <int, int, int>.

    # Output directory is derived from where the root logger writes its file.
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if
        from_f5 = h5py.File(f5_path, 'r')
        # Touch the file once to surface corruption early.
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list(from_f5.keys())  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        # Get rid of "read_"
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if not f5_path in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():
        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try to make new index file (press ENTER on corresponding prompt)."
                )
                printlog_error(
                    "Or, if does not work for you, make sure that taxonomic annotation info \
for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Quality/length filter failed -- route to QL trash file.
                # Get name of result FASTQ file to write this read in
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Identity/coverage filter failed -- route to align trash file.
                # Get name of result FASTQ file to write this read in
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of result FASTQ file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for
    # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def _reformat_legacy_file(legacy_tax_path):
    # Function converts a legacy shelve-based taxonomy file to the new TSV format.
    #
    # The legacy file maps accession -> taxonomy (either a rank tuple or a
    #   plain string); each entry becomes one `<acc>\t<taxonomy_str>` TSV line.
    # The legacy file is then renamed with a `_deprecated` suffix
    #   (best effort -- a rename failure is not fatal).
    #
    # :param legacy_tax_path: path to the legacy shelve taxonomy file;
    # :type legacy_tax_path: str;

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        # A missing gdbm backend is a known, user-fixable cause.
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error(
                "Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error(
                "The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)
    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(
            new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            # Legacy values are either rank tuples (need serialization)
            #   or ready-made taxonomy strings.
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join((acc, tax_str))))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join(
                    (acc, taxonomy_from_file))))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time(
            "Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error(
            "But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(
            legacy_tax_path, os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
def format_taxonomy_name(hit_acc, hit_def, sens, tax_dict):
    # Function formats taxonomy name according to chosen sensibility of binning.
    #
    # :param hit_acc: accession(s) of best hit(s), joined with '&&' if several;
    # :type hit_acc: str;
    # :param hit_def: annotation of best hit(s), joined with '&&' if several;
    # :type hit_def: str;
    # :param sens: sensibility returned by 'get_classif_sensibility()' function.
    #   sens[0] is the rank name ("genus" or "species"), sens[1] its index;
    # :param tax_dict: taxonomy dictionary returned by function 'src.taxonomy.get_tax_dict';
    # :type tax_dict: dict;
    #
    # Returns formatted hit name of 'str' type;

    # No hit at all -- the read is binned as "unknown"
    if hit_def == "No significant similarity found":
        return "unknown"
    # end if

    annot_names = list()  # names of binned files, one per best hit

    for acc, annot in zip(hit_acc.split('&&'), hit_def.split('&&')):
        # Taxonomy must be known for every hit accession
        try:
            tax_entry = tax_dict[acc]
        except KeyError:
            raise NoTaxonomyError()
        # end try

        if isinstance(tax_entry, tuple):
            # Tuple-formatted taxonomy: pick the rank name for the filename
            rank_name = find_rank_for_filename(sens, tax_entry)
            if sens[0] == "species":
                # Prefix species with its genus: "<genus>_<species>"
                genus_name = find_rank_for_filename(("genus", sens[1] - 1), tax_entry)
                rank_name = "{}_{}".format(genus_name, rank_name)
            # end if
            annot_names.append(rank_name)
        elif isinstance(tax_entry, str):
            # Plain-string taxonomy (sequence ID). If the hit is a contig from
            #   a SPAdes or a5 assembly, strip the contig suffix unless the user
            #   asked for species-level binning.
            if sens[0] != "species":
                for assembly_patt in (SPADES_PATT, A5_PATT):
                    contig_match = re.search(assembly_patt, annot)
                    if contig_match is not None:
                        tax_entry = tax_entry.replace('--' + contig_match.group(1), '')
                        break
                    # end if
                # end for
            # end if
            annot_names.append(tax_entry)
        else:
            # Execution must not reach here
            printlog_error_time("Fatal error 8754.")
            printlog_error("Please, contact the developer.")
            platf_depend_exit(8754)
        # end if
    # end for

    # Strip characters not allowed in filenames and deduplicate
    annot_names = map(remove_bad_chars, annot_names)
    return "&&".join(set(annot_names))
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param n_thr: batch size -- number of records extracted before each locked write;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    # Output directory is where the root log file lives
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Pick record generator and write function matching the file format
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))

        # Batch of (record, destination path) pairs meant to be written to
        #   output files under the lock.
        # BUGFIX: this used to be a dict keyed by read name, which silently
        #   dropped all but the last destination when a read had several
        #   '&&'-joined best hits. A list preserves every destination, matching
        #   the single-thread implementation.
        to_write = list()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):
                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write.append((fastqa_rec, QL_trash_fpath))
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to align trash file
                    to_write.append((fastqa_rec, align_trash_fpath))
                    align_seqs_fail += 1
                else:
                    # A read may have several best hits -- write it to every
                    #   corresponding binned file
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                        to_write.append((fastqa_rec, binned_file_path))
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write:
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
        min_pident, min_coverage, no_trash):
    # Function bins FAST5 file without untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # maps output file path to open FAST5 file object

    new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0]
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        # Iterating once actually touches the file and surfaces corruption
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format( str(runterr) ))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    for read_name in fast5_readids(from_f5):

        try:
            # omit 'read_' in the beginning of FAST5 group's name
            hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name)[1:])]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(fmt_read_id(read_name)))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n")
            platf_depend_exit(1)
        # end try

        # If read is found in TSV file:
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Open the QL trash file on first use
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # BUGFIX: the membership test used to check `QL_trash_fpath`,
            #   so the align trash file was never opened once the QL trash
            #   file existed, raising KeyError on the next line.
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
        else:
            for hit_name in hit_names.split("&&"):  # there can be multiple hits for single query sequence
                # Get name of result FAST5 file to write this read in
                binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path])
            # end for
            seqs_pass += 1
        # end if
    # end for

    from_f5.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request, polling its
    #   status and saving the human-readable result once it is ready.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet (id) number to send;
    #   NOTE(review): accessed as `pack_to_send[0]`, so this is an indexable
    #   container (presumably a one-element mutable counter), not a plain int;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple (XML response 'str' or None, BlastError);
    #   None response means the packet must be resent.

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server migth be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Width of the text erased by the ANSI cursor-back escape below
    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            # "\033[%dD" moves the cursor back to overwrite the status line
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # indicate each 10 seconds with a dot, 60 seconds between polls
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                # Decorative countdown separator before retrieval
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # Output dir is derived from the root log file location
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Curent txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    # Classify transient server-side failures; None response triggers a resend
    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicated that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (of FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    # Output directory is where the root log file lives
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0        # sequences that pass all filters
    QL_seqs_fail = 0     # too short or too low-quality sequences
    align_seqs_fail = 0  # sequences aligned with too low identity or coverage

    out_files = dict()  # maps output file path to its open file object

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Pick record generator and write function matching the file format
    if is_fastq(fq_fa_path):
        seq_records_generator, write_fun = fastq_records, write_fastq_record
    else:
        seq_records_generator, write_fun = fasta_records, write_fasta_record
    # end if

    # Quality/length filter and its trash file (None disables trash output)
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    QL_trash_fpath = None if no_trash else get_QL_trash_fpath(
        fq_fa_path,
        outdir_path,
        min_qual,
        min_qlen,
    )

    # Identity/coverage filter and its trash file
    align_filter = get_align_filter(min_pident, min_coverage)
    align_trash_fpath = None if no_trash else get_align_trash_fpath(
        fq_fa_path, outdir_path, min_pident, min_coverage)

    for seq_record in seq_records_generator(fq_fa_path):

        # ID of the sequence (leading '>'/'@' dropped)
        read_name = sys.intern(fmt_read_id(seq_record["seq_id"])[1:])

        # Find the annotation line corresponding to this sequence
        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters and collect destination path(s) for this record
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            dest_paths = (QL_trash_fpath,)
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            dest_paths = (align_trash_fpath,)
        else:
            seqs_pass += 1
            # there can be multiple hits for single query sequence
            dest_paths = tuple(
                os.path.join(
                    outdir_path, "{}.fast{}".format(
                        hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                for hit_name in hit_names.split("&&"))
        # end if

        for dest_path in dest_paths:
            # Open the destination file on first use
            if dest_path not in out_files:
                out_files = update_file_dict(out_files, dest_path)
            # end if
            write_fun(out_files[dest_path], seq_record)
        # end for
    # end for

    # Close all binned files (trash entries may be None when disabled)
    for file_obj in filter(lambda fobj: fobj is not None, out_files.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)