def decompress_files(self):
    """Decompress the auxiliary Pfam data files in `self.pfam_data_dir`.

    Decompressing Pfam-A.hmm.gz is not necessary, because the HMMer
    class works with .gz files directly.
    """
    for name in ('Pfam.version.gz', 'Pfam-A.clans.tsv.gz'):
        compressed_path = os.path.join(self.pfam_data_dir, name)
        utils.gzip_decompress_file(compressed_path)
        # the decompressed copy is all we need from here on
        os.remove(compressed_path)
def is_database_exists(self):
    """Checks if database files exist and decompresses them if compressed

    This function verifies that pfam_data_dir contains the Pfam hmm profiles and
    checks whether they are compressed or not. If they are compressed, we
    decompress them and run hmmpress.
    """
    hmm_path = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')
    compressed_hmm_path = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

    if not (os.path.exists(compressed_hmm_path) or os.path.exists(hmm_path)):
        raise ConfigError("It seems you do not have Pfam database installed, please run 'anvi-setup-pfams' to download it.")

    # here we check if the HMM profile is compressed so we can decompress it for next time
    if os.path.exists(compressed_hmm_path):
        self.run.warning("Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before "
                         "running HMMs.")
        utils.gzip_decompress_file(compressed_hmm_path, keep_original=False)

        # re-index the freshly decompressed profiles so hmmscan/hmmsearch can use them
        cmd_line = ['hmmpress', hmm_path]
        log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt')
        ret_val = utils.run_command(cmd_line, log_file_path)

        if ret_val:
            raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. "
                              "Check out the log file ('%s') to see what went wrong." % (log_file_path))
        else:
            # getting rid of the log file because hmmpress was successful
            os.remove(log_file_path)
def decompress_files(self):
    """Decompresses Pfam HMM profiles."""
    for file_name in self.files:
        full_path = os.path.join(self.pfam_data_dir, file_name)

        # files that are not gzipped need no work
        if not full_path.endswith('.gz'):
            continue

        decompressed_path = full_path[:-3]

        if not os.path.exists(full_path):
            if os.path.exists(decompressed_path):
                # a previous setup already unpacked this file; nothing to do
                self.run.warning("It seems the file at %s is already decompressed. You are probably seeing "
                                 "this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will "
                                 "simply skip decompressing this file at this time. But if you think there is an issue, you can "
                                 "re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag." % (decompressed_path))
                continue

            raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running "
                              "`anvi-setup-pfams` using the --reset flag." % (full_path))

        utils.gzip_decompress_file(full_path)
        os.remove(full_path)
def decompress_files(self):
    """Decompresses and runs hmmpress on Pfam HMM profiles."""
    for file_name in self.files:
        full_path = os.path.join(self.pfam_data_dir, file_name)

        # files that are not gzipped need no work
        if not full_path.endswith('.gz'):
            continue

        decompressed_path = full_path[:-3]

        if not os.path.exists(full_path):
            if os.path.exists(decompressed_path):
                # a previous setup already unpacked this file; nothing to do
                self.run.warning("It seems the file at %s is already decompressed. You are probably seeing "
                                 "this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will "
                                 "simply skip decompressing this file at this time. But if you think there is an issue, you can "
                                 "re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag." % (decompressed_path))
                continue

            raise ConfigError("Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running "
                              "`anvi-setup-pfams` using the --reset flag." % (full_path))

        utils.gzip_decompress_file(full_path)
        os.remove(full_path)

    # index every decompressed HMM profile with `hmmpress`; the log file path
    # is loop-invariant, so it is computed once
    log_file_path = os.path.join(self.pfam_data_dir, '00_hmmpress_log.txt')
    for hmm_file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')):
        ret_val = utils.run_command(['hmmpress', hmm_file_path], log_file_path)

        if ret_val:
            raise ConfigError("Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. "
                              "Check out the log file ('%s') to see what went wrong." % (log_file_path))

        # getting rid of the log file because hmmpress was successful
        os.remove(log_file_path)
def create_search_databases(self):
    """Creates all the search databases

    Copies each anticodon FASTA (as .gz) into a temporary directory,
    decompresses it there, builds a nucleotide BLAST database from it, and
    moves the resulting database files back next to the original FASTA.

    Raises
    ======
    ConfigError
        If an expected FASTA copy is missing, or BLAST did not produce the
        expected database files.
    """

    self.progress.new("Creating search databases")

    self.progress.update("Removing any database that still exists in the output directory...")
    # the base db paths are needed several times below; compute them once
    database_base_paths = [s['db'] for s in self.ctx.anticodons.values()]
    for prefix in ['.nhr', '.nin', '.nsq']:
        # a plain loop instead of a side-effect-only list comprehension
        for database_path in [base_path + prefix for base_path in database_base_paths]:
            if os.path.exists(database_path):
                os.remove(database_path)

    # compresssing and decompressing FASTA files changes their hash and make them look like
    # modified in git. to avoid that, we will do the database generation in a temporary directory.
    temp_dir = filesnpaths.get_temp_directory_path()

    self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
    # maps the basename of each FASTA file (which matches the anticodon keys
    # of self.ctx.anticodons) to the path of its gzipped copy under temp_dir
    new_paths = {os.path.basename(fasta_path): shutil.copy((fasta_path + '.gz'), os.path.join(temp_dir, os.path.basename(fasta_path) + '.gz'))
                 for fasta_path in database_base_paths}

    missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons if not os.path.exists(new_paths[anticodon])]
    if len(missing_FASTA_files):
        raise ConfigError("Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                          "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                          "an anvi'o programmer working on this problem this very moment, please get in touch with one.")

    self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
    # replace each gzipped copy with its decompressed path
    new_paths = {anticodon: utils.gzip_decompress_file(new_paths[anticodon], keep_original=False) for anticodon in new_paths}

    for anticodon in self.ctx.anticodons:
        self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

        FASTA_file_path_for_anticodon = new_paths[anticodon]

        # create a BLAST search database for `FASTA_file_path_for_anticodon`
        blast = BLAST(query_fasta=FASTA_file_path_for_anticodon, run=run_quiet, progress=progress_quiet, num_threads=self.num_threads)
        blast.log_file_path = os.path.join(os.path.dirname(FASTA_file_path_for_anticodon), '%s.log' % anticodon)
        blast.makedb(dbtype='nucl')

        for prefix in ['.nhr', '.nin', '.nsq']:
            if not os.path.exists(FASTA_file_path_for_anticodon + prefix):
                raise ConfigError("Something went wrong and BLAST did not create the database file it was supposed to "
                                  "for %s :(" % anticodon)
            else:
                # move the database file back next to the original FASTA
                shutil.move(FASTA_file_path_for_anticodon + prefix, os.path.dirname(self.ctx.anticodons[anticodon]['db']))

    shutil.rmtree(temp_dir)

    self.progress.end()
    self.run.info_single("Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
                         "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
                         "caveats to it just like every other computational approach you use to make sense of complex 'omics "
                         "data. To better understand those caveats you should read our online documentation a bit. If you see "
                         "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
                         "results from this workflow, thank to those who contributed to the GTDB.", nl_after=1, mc="green")
def create_search_databases(self):
    """Creates all the search databases

    Copies each anticodon FASTA (as .gz) into a temporary directory,
    decompresses it there, builds a nucleotide BLAST database from it, and
    moves every file BLAST generated back next to the original FASTA.

    Raises
    ======
    ConfigError
        If an expected FASTA copy is missing, or `makeblastdb` produced no
        output files at all.
    """

    self.progress.new("Creating search databases")

    self.progress.update("Removing any database that still exists in the output directory...")
    for anticodon_base_path in [b['db'] for b in self.ctx.anticodons.values()]:
        # a plain loop instead of a side-effect-only list comprehension; the
        # .gz source files themselves are kept
        for stale_file_path in glob.glob(anticodon_base_path + '.*'):
            if not stale_file_path.endswith('.gz'):
                os.remove(stale_file_path)

    # compresssing and decompressing FASTA files changes their hash and make them look like
    # modified in git. to avoid that, we will do the database generation in a temporary directory.
    temp_dir = filesnpaths.get_temp_directory_path()

    self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
    # maps the basename of each FASTA file (which matches the anticodon keys
    # of self.ctx.anticodons) to the path of its gzipped copy under temp_dir
    new_paths = {os.path.basename(fasta_path): shutil.copy((fasta_path + '.gz'), os.path.join(temp_dir, os.path.basename(fasta_path) + '.gz'))
                 for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]}

    missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons if not os.path.exists(new_paths[anticodon])]
    if len(missing_FASTA_files):
        raise ConfigError("Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                          "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                          "an anvi'o programmer working on this problem this very moment, please get in touch with one.")

    self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
    # replace each gzipped copy with its decompressed path
    new_paths = {anticodon: utils.gzip_decompress_file(new_paths[anticodon], keep_original=False) for anticodon in new_paths}

    for anticodon in self.ctx.anticodons:
        self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

        FASTA_file_path_for_anticodon = new_paths[anticodon]

        # create a BLAST search database for `FASTA_file_path_for_anticodon`
        blast = BLAST(query_fasta=FASTA_file_path_for_anticodon, run=run_quiet, progress=progress_quiet, num_threads=self.num_threads)
        blast.log_file_path = os.path.join(os.path.dirname(FASTA_file_path_for_anticodon), '%s.log' % anticodon)
        blast.makedb(dbtype='nucl')

    # glob already returns a list; no need to wrap it in a comprehension
        files_generated = glob.glob(FASTA_file_path_for_anticodon + '.*')
        if not files_generated:
            raise ConfigError(f"Even though the process to generate BLAST database files for '{anticodon}' has officially ended, "
                              f"anvi'o is unable to find any files generated by BLAST in the temporary directory it was working "
                              f"with :( This is as confusing to anvi'o as it probably sounds to you. A likely explanation is that "
                              f"something went wrong with the `makeblastdb` step. Please go into the following directory, and run "
                              f"`makeblastdb -in AAA -dbtype nucl; ls AAA*` manually to see what happens: '{temp_dir}'.")
        else:
            # move everything BLAST generated back next to the original FASTA
            for file_path in files_generated:
                shutil.move(file_path, os.path.dirname(self.ctx.anticodons[anticodon]['db']))

    shutil.rmtree(temp_dir)

    self.progress.end()
    self.run.info_single("Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
                         "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
                         "caveats to it just like every other computational approach you use to make sense of complex 'omics "
                         "data. To better understand those caveats you should read our online documentation a bit. If you see "
                         "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
                         "results from this workflow, thank to those who contributed to the GTDB.", nl_after=1, mc="green")