def format_protein_db(self, input_file_path, output_file_path):
    """Decompress the protein FASTA and build DIAMOND / BLAST search databases from it.

    The gzip-compressed file at `input_file_path` is expanded into a temporary
    FASTA file. For every search tool found on the system, a fresh `DB_DIAMOND`
    or `DB_BLAST` directory is created under `self.COG_data_dir` (an existing
    one is deleted first) and the tool's `makedb` is run on the temporary file.
    A warning is printed for each tool that is not installed.

    `output_file_path` is not read by this method; it is kept so that the
    signature stays the same for existing callers.

    The temporary FASTA file is removed when the method exits, whether it
    finished normally or raised.
    """
    progress.new('Formatting raw files')
    progress.update('Decompressing protein sequences')

    temp_fasta_path = filesnpaths.get_temp_file_path()

    try:
        try:
            # stream the archive in chunks rather than loading all of it into memory
            with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
        finally:
            # close the progress display even when decompression fails
            progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

            # always start from an empty directory
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            # adjacent literals (not a backslash continuation) so that no source
            # indentation ends up inside the message shown to the user
            self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')

            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            # NOTE(review): `output_db_path` already ends with 'COG', so this resolves to
            # DB_BLAST/COG/COG.fa -- confirm this is the location the database readers expect.
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")
    finally:
        # never leave the decompressed copy behind, not even after a failure
        if os.path.exists(temp_fasta_path):
            os.remove(temp_fasta_path)
def format_protein_db(self, input_file_path, output_file_path):
    """Decompress the protein FASTA and build DIAMOND / BLAST search databases from it.

    The gzip-compressed file at `input_file_path` is expanded into a temporary
    FASTA file. For every search tool found on the system, a fresh `DB_DIAMOND`
    or `DB_BLAST` directory is created under `self.COG_data_dir` (an existing
    one is deleted first) and the tool's `makedb` is run on the temporary file.
    A warning is printed for each tool that is not installed.

    `output_file_path` is not read by this method; it is kept so that the
    signature stays the same for existing callers.

    Raises a `ConfigError` (chained to the underlying exception) when the
    input file cannot be decompressed. The temporary FASTA file is removed when
    the method exits, whether it finished normally or raised.
    """
    progress.new('Formatting raw files')
    progress.update('Decompressing protein sequences')

    temp_fasta_path = filesnpaths.get_temp_file_path()

    try:
        try:
            # stream the archive in chunks rather than loading all of it into memory
            with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
        except Exception as e:
            progress.end()
            # keep the original exception attached as the cause for debugging
            raise ConfigError(f"Something went wrong while decompressing the downloaded file :/ It is likely that "
                              f"the download failed and only part of the file was downloaded. If you would like to "
                              f"try again, please run the setup command with the flag `--reset`. Here is what the "
                              f"downstream library said: '{e}'.") from e

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

            # always start from an empty directory
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            self.run.warning("DIAMOND does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')

            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            # NOTE(review): `output_db_path` already ends with 'COG', so this resolves to
            # DB_BLAST/COG/COG.fa -- confirm this is the location the database readers expect.
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")
    finally:
        # the decompressed copy (possibly partial) must not outlive this call
        if os.path.exists(temp_fasta_path):
            os.remove(temp_fasta_path)
def format_protein_db(self, input_file_path, output_file_path):
    """Turn the compressed protein sequences into DIAMOND and BLAST search databases.

    Steps, in order:

      1. `input_file_path` (gzip) is decompressed into a temporary FASTA file.
      2. If `diamond` is available, `DB_DIAMOND` under `self.COG_data_dir` is
         recreated from scratch and `Diamond.makedb` is run; otherwise a
         warning is shown.
      3. If both `makeblastdb` and `blastp` are available, `DB_BLAST` under
         `self.COG_data_dir` is recreated from scratch and `BLAST.makedb` is
         run; otherwise a warning is shown.
      4. The temporary FASTA file is deleted, also when any step above raised.

    `output_file_path` is accepted for signature compatibility but is not read
    anywhere in this method.
    """
    progress.new('Formatting raw files')
    progress.update('Decompressing protein sequences')

    temp_fasta_path = filesnpaths.get_temp_file_path()

    try:
        try:
            # chunked copy: avoids holding the entire decompressed file in memory
            with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
        finally:
            # the progress display is closed on success and on failure alike
            progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

            # wipe whatever a previous run left behind
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            # implicit string concatenation keeps source indentation out of the message,
            # which a backslash continuation inside the literal would not
            self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')

            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)
            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            # NOTE(review): `output_db_path` already ends with 'COG', so this resolves to
            # DB_BLAST/COG/COG.fa -- confirm this is the location the database readers expect.
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")
    finally:
        # clean up the temporary FASTA regardless of how we got here
        if os.path.exists(temp_fasta_path):
            os.remove(temp_fasta_path)
def create_search_databases(self):
    """Build a nucleotide BLAST database for every FASTA listed in `self.ctx.anticodons`.

    Existing `.nhr`, `.nin` and `.nsq` files next to each FASTA are deleted first.
    The gzipped FASTA files are then copied into a temporary directory,
    decompressed there, and passed to `BLAST.makedb(dbtype='nucl')`. The three
    resulting database files are moved into the directory of the original
    FASTA, and the temporary directory is removed at the end.

    Raises a `ConfigError` when a copied FASTA file cannot be found or when
    BLAST did not produce one of the three expected files.
    """
    self.progress.new("Creating search databases")
    self.progress.update("Removing any database that still exists in the output directory...")

    database_suffixes = ['.nhr', '.nin', '.nsq']

    for suffix in database_suffixes:
        for entry in self.ctx.anticodons.values():
            stale_database_path = entry['db'] + suffix
            if os.path.exists(stale_database_path):
                os.remove(stale_database_path)

    # compressing and decompressing FASTA files changes their hash and makes them look
    # modified in git. to avoid that, the databases are generated in a temporary directory.
    temp_dir = filesnpaths.get_temp_directory_path()

    self.progress.update("Copying FASTA files to %s ..." % (temp_dir))

    # maps the base name of each FASTA file to the path of its gzipped copy under `temp_dir`
    new_paths = {}
    for entry in self.ctx.anticodons.values():
        fasta_path = entry['db']
        fasta_name = os.path.basename(fasta_path)
        new_paths[fasta_name] = shutil.copy(fasta_path + '.gz', os.path.join(temp_dir, fasta_name + '.gz'))

    # NOTE: the lookups below assume the FASTA base names are the anticodon names
    missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons
                           if not os.path.exists(new_paths[anticodon])]

    if missing_FASTA_files:
        raise ConfigError("Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                          "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                          "an anvi'o programmer working on this problem this very moment, please get in touch with one.")

    self.progress.update("Decompressing FASTA files in %s" % (temp_dir))

    new_paths = {name: utils.gzip_decompress_file(gz_path, keep_original=False)
                 for name, gz_path in new_paths.items()}

    for anticodon in self.ctx.anticodons:
        self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

        FASTA_file_path_for_anticodon = new_paths[anticodon]
        target_directory = os.path.dirname(FASTA_file_path_for_anticodon)

        # create a BLAST search database for `FASTA_file_path_for_anticodon`
        blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                      run=run_quiet,
                      progress=progress_quiet,
                      num_threads=self.num_threads)
        blast.log_file_path = os.path.join(target_directory, '%s.log' % anticodon)
        blast.makedb(dbtype='nucl')

        for suffix in database_suffixes:
            generated_file_path = FASTA_file_path_for_anticodon + suffix

            if not os.path.exists(generated_file_path):
                raise ConfigError("Something went wrong and BLAST did not create the database file it was supposed to "
                                  "for %s :(" % anticodon)

            shutil.move(generated_file_path, os.path.dirname(self.ctx.anticodons[anticodon]['db']))

    shutil.rmtree(temp_dir)

    self.progress.end()
    self.run.info_single("Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
                         "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
                         "caveats to it just like every other computational approach you use to make sense of complex 'omics "
                         "data. To better understand those caveats you should read our online documentation a bit. If you see "
                         "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
                         "results from this workflow, thank to those who contributed to the GTDB.",
                         nl_after=1,
                         mc="green")
def find(self, sequence, sequence_name="(a sequence does not have a name)", display_palindromes=False):
    """Find palindromes in a single sequence, and populate `self.palindromes`

    The upper-cased sequence is written to a FASTA file in a temporary directory,
    a nucleotide BLAST database is built from it, and `blastn` is run with
    `strand='minus'`. Each HSP in the XML output becomes a `Palindrome`, which is
    filtered on distance, length, mismatches and gaps before it is appended to
    `self.palindromes[sequence_name]`.

    The member function `process` may be a better one to call with an `args` object.
    See `anvi-search-palindromes` for example usage.

    A `ConfigError` is raised when `sequence_name` is already a key of
    `self.palindromes`. For a sequence that is shorter than
    `2 * min_palindrome_length + min_distance` a warning is printed, but the
    search is still carried out.

    The temporary directory is removed at the end unless `anvio.DEBUG` is set.
    """
    if sequence_name in self.palindromes:
        raise ConfigError(f"The sequence '{sequence_name}' is already in `self.palindromes`.")

    self.palindromes[sequence_name] = []

    sequence = sequence.upper()
    sequence_length = len(sequence)

    if sequence_length < self.min_palindrome_length * 2 + self.min_distance:
        # only a warning: execution continues with the search below
        self.progress.reset()
        self.run.warning(f"The sequence '{sequence_name}', which is only {sequence_length} nts long, is too short "
                         f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                         f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it.")

    # setup BLAST job
    BLAST_search_tmp_dir = filesnpaths.get_temp_directory_path()
    fasta_file_path = os.path.join(BLAST_search_tmp_dir, 'sequence.fa')
    log_file_path = os.path.join(BLAST_search_tmp_dir, 'blast-log.txt')
    results_file_path = os.path.join(BLAST_search_tmp_dir, 'hits.xml')

    with open(fasta_file_path, 'w') as fasta_file:
        fasta_file.write(f'>sequence\n{sequence}\n')

    # run blast
    blast = BLAST(fasta_file_path, search_program='blastn', run=run_quiet, progress=progress_quiet)
    blast.evalue = 10
    blast.num_threads = self.num_threads
    blast.min_pct_id = 100 - self.max_num_mismatches
    blast.search_output_path = results_file_path
    blast.log_file_path = log_file_path
    blast.makedb(dbtype='nucl')

    # short palindromes in long sequences can be slow; tell the user once
    needs_performance_warning = (self.min_palindrome_length < 20
                                 and sequence_length > 10000
                                 and not self.user_is_warned_for_potential_performance_issues)
    if needs_performance_warning:
        self.progress.reset()
        self.run.warning(f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                         f"in a sequence that is {pp(len(sequence))} nts long. If your palindrome search takes a VERY long time "
                         f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                         f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                         f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                         f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                         f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                         header="ONE-TIME PERFORMANCE WARNING")
        self.user_is_warned_for_potential_performance_issues = True

    blast.blast(outputfmt='5', word_size=self.blast_word_size, strand='minus')

    # parse the BLAST XML output: walk every HSP of every hit of every iteration
    root = ET.parse(blast.search_output_path).getroot()
    all_hsps = (hsp
                for iteration in root.findall('BlastOutput_iterations/Iteration')
                for hit in iteration.findall('Iteration_hits/Hit')
                for hsp in hit.findall('Hit_hsps/Hsp'))

    for hsp_xml in all_hsps:
        p = Palindrome(run=self.run)

        p.sequence_name = sequence_name

        # start coordinates are shifted by one, end coordinates are used as reported
        p.first_start = int(hsp_xml.find('Hsp_query-from').text) - 1
        p.first_end = int(hsp_xml.find('Hsp_query-to').text)
        p.first_sequence = hsp_xml.find('Hsp_qseq').text

        # on the hit side `hit-to` supplies the start and `hit-from` the end
        p.second_start = int(hsp_xml.find('Hsp_hit-to').text) - 1
        p.second_end = int(hsp_xml.find('Hsp_hit-from').text)
        p.second_sequence = hsp_xml.find('Hsp_hseq').text

        p.distance = p.second_start - p.first_start

        # every hit also shows up as its reverse complement. the negative-distance
        # test keeps only one of the two copies; the other test drops hits whose
        # two halves are closer than the minimum distance.
        if p.distance < 0 or p.distance < self.min_distance:
            continue

        # special case: a zero-distance palindrome that starts strictly inside
        # one that was already recorded for this sequence is discarded.
        if p.distance == 0:
            is_internal = any(p.first_start > known.first_start and p.first_start < known.first_end
                              for known in self.palindromes[sequence_name])
            if is_internal:
                continue

        alignment_length = int(hsp_xml.find('Hsp_align-len').text)
        p.length = alignment_length

        if p.length < self.min_palindrome_length:
            # too short to be of interest
            continue

        p.num_gaps = int(hsp_xml.find('Hsp_gaps').text)
        p.num_mismatches = alignment_length - int(hsp_xml.find('Hsp_identity').text)
        p.midline = ''.join('|' if base == p.second_sequence[position] else 'x'
                            for position, base in enumerate(p.first_sequence))

        if p.num_mismatches <= self.max_num_mismatches and p.num_gaps <= 0:
            # few enough mismatches and no gaps: the hit is kept as a single palindrome
            p_list = [p]
        else:
            # too many mismatches, or gaps: see the docstring of `get_split_palindromes`.
            # that function gets a chance to salvage something from this hit.
            p_list = self.get_split_palindromes(p, display_palindromes=display_palindromes)

        for sp in p_list:
            if anvio.DEBUG or display_palindromes or self.verbose:
                self.progress.reset()
                sp.display()

            self.palindromes[sequence_name].append(sp)

    # clean after yourself
    if not anvio.DEBUG:
        filesnpaths.shutil.rmtree(BLAST_search_tmp_dir)
    else:
        self.run.info("BLAST temporary dir kept", BLAST_search_tmp_dir, nl_before=1, mc='red')
def create_search_databases(self):
    """Build a nucleotide BLAST database for every FASTA listed in `self.ctx.anticodons`.

    Every file that matches `<db path>.*` and does not end with `.gz` is
    deleted first. The gzipped FASTA files are then copied into a temporary
    directory, decompressed there, and passed to `BLAST.makedb(dbtype='nucl')`.
    All files matching `<decompressed FASTA>.*` are moved into the directory of
    the original FASTA, and the temporary directory is removed at the end.

    Raises a `ConfigError` when a copied FASTA file cannot be found or when no
    file is found after `makedb` has run. In the latter case the temporary
    directory is left in place; the error message refers the user to it.
    """
    self.progress.new("Creating search databases")
    self.progress.update("Removing any database that still exists in the output directory...")

    for entry in self.ctx.anticodons.values():
        anticodon_base_path = entry['db']
        for existing_path in glob.glob(anticodon_base_path + '.*'):
            # the gzipped FASTA is the source of truth, everything else is regenerated
            if not existing_path.endswith('.gz'):
                os.remove(existing_path)

    # compressing and decompressing FASTA files changes their hash and makes them look
    # modified in git. to avoid that, the databases are generated in a temporary directory.
    temp_dir = filesnpaths.get_temp_directory_path()

    self.progress.update("Copying FASTA files to %s ..." % (temp_dir))

    # maps the base name of each FASTA file to the path of its gzipped copy under `temp_dir`
    new_paths = {}
    for entry in self.ctx.anticodons.values():
        fasta_path = entry['db']
        fasta_name = os.path.basename(fasta_path)
        new_paths[fasta_name] = shutil.copy(fasta_path + '.gz', os.path.join(temp_dir, fasta_name + '.gz'))

    # NOTE: the lookups below assume the FASTA base names are the anticodon names
    missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons
                           if not os.path.exists(new_paths[anticodon])]

    if missing_FASTA_files:
        raise ConfigError("Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                          "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                          "an anvi'o programmer working on this problem this very moment, please get in touch with one.")

    self.progress.update("Decompressing FASTA files in %s" % (temp_dir))

    new_paths = {name: utils.gzip_decompress_file(gz_path, keep_original=False)
                 for name, gz_path in new_paths.items()}

    for anticodon in self.ctx.anticodons:
        self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

        FASTA_file_path_for_anticodon = new_paths[anticodon]

        # create a BLAST search database for `FASTA_file_path_for_anticodon`
        blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                      run=run_quiet,
                      progress=progress_quiet,
                      num_threads=self.num_threads)
        blast.log_file_path = os.path.join(os.path.dirname(FASTA_file_path_for_anticodon), '%s.log' % anticodon)
        blast.makedb(dbtype='nucl')

        # whatever BLAST wrote next to the FASTA file counts as database output
        files_generated = glob.glob(FASTA_file_path_for_anticodon + '.*')

        if not files_generated:
            raise ConfigError(f"Even though the process to generate BLAST database files for '{anticodon}' has officially ended, "
                              f"anvi'o is unable to find any files generated by BLAST in the temporary directory it was working "
                              f"with :( This is as confusing to anvi'o as it probably sounds to you. A likely explanation is that "
                              f"something went wrong with the `makeblastdb` step. Please go into the following directory, and run "
                              f"`makeblastdb -in AAA -dbtype nucl; ls AAA*` manually to see what happens: '{temp_dir}'.")

        destination_directory = os.path.dirname(self.ctx.anticodons[anticodon]['db'])
        for file_path in files_generated:
            shutil.move(file_path, destination_directory)

    shutil.rmtree(temp_dir)

    self.progress.end()
    self.run.info_single("Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
                         "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
                         "caveats to it just like every other computational approach you use to make sense of complex 'omics "
                         "data. To better understand those caveats you should read our online documentation a bit. If you see "
                         "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
                         "results from this workflow, thank to those who contributed to the GTDB.",
                         nl_after=1,
                         mc="green")