def download(self):
    """Download every file named in `self.files`, then verify and unpack them.

    Each file is fetched from `<self.database_url>/<file_name>` and stored
    under the same name inside `self.pfam_data_dir`. Once all downloads are
    done, `self.confirm_downloaded_files()` and `self.decompress_files()`
    are called, in that order.
    """
    self.run.info("Database URL", self.database_url)

    for file_name in self.files:
        source_url = self.database_url + '/' + file_name
        destination = os.path.join(self.pfam_data_dir, file_name)
        utils.download_file(source_url, destination, progress=self.progress, run=self.run)

    self.confirm_downloaded_files()
    self.decompress_files()
def download(self):
    """Retrieve the database files and prepare them for use.

    Reports the database URL, downloads each entry of `self.files` from that
    URL into `self.pfam_data_dir`, and finally runs the instance's
    confirmation and decompression steps.
    """

    def fetch(name):
        # One remote file -> one local file with the same name.
        utils.download_file(
            self.database_url + '/' + name,
            os.path.join(self.pfam_data_dir, name),
            progress=self.progress,
            run=self.run,
        )

    self.run.info("Database URL", self.database_url)

    for entry in self.files:
        fetch(entry)

    self.confirm_downloaded_files()
    self.decompress_files()
def get_raw_data(self):
    """Download the raw NCBI files that are not on disk yet.

    If `self.raw_NCBI_files_dir` does not exist it is created, and
    `COG_DATA_VERSION` is written to `self.COG_data_dir_version`. Then every
    entry of `self.files` that carries a 'url' key is downloaded into that
    directory, unless a file with that name is already present. Entries
    without a 'url' key are skipped.
    """
    if not os.path.exists(self.raw_NCBI_files_dir):
        os.mkdir(self.raw_NCBI_files_dir)

        # A context manager guarantees the version file is flushed and closed
        # right away instead of whenever the handle gets garbage collected.
        with open(self.COG_data_dir_version, 'w') as version_file:
            version_file.write(COG_DATA_VERSION)

    for file_name in self.files:
        file_info = self.files[file_name]
        if 'url' not in file_info:
            continue

        file_path = J(self.raw_NCBI_files_dir, file_name)
        if not os.path.exists(file_path):
            # `progress` and `run` are not attributes of self here; they are
            # presumably module-level objects -- TODO confirm.
            utils.download_file(file_info['url'], file_path, progress=progress, run=run)
def download_interacdome_files(self):
    """Download the confident and representable non-redundant InteracDome datasets

    `self.interacdome_files` is iterated as (path, url) pairs; each url is
    downloaded to `path` inside `self.interacdome_data_dir`.

    These datasets can be found at the interacdome webpage:
    https://interacdome.princeton.edu/
    """
    for relative_path, source_url in self.interacdome_files.items():
        destination = os.path.join(self.interacdome_data_dir, relative_path)

        # NOTE(review): certificate verification is switched off for these
        # downloads -- presumably deliberate for this host; confirm.
        utils.download_file(
            source_url,
            destination,
            check_certificate=False,
            progress=self.progress,
            run=self.run,
        )
def get_raw_data(self):
    """Make sure the raw NCBI files are available locally.

    Creates `self.raw_NCBI_files_dir` when it is missing and, at that point,
    records `COG_DATA_VERSION` in the file `self.COG_data_dir_version`.
    Afterwards, each item in `self.files` that has a 'url' key is downloaded
    to the raw files directory if no file of that name exists there yet.
    """
    raw_dir_is_new = not os.path.exists(self.raw_NCBI_files_dir)

    if raw_dir_is_new:
        os.mkdir(self.raw_NCBI_files_dir)

        # Close the handle explicitly (via `with`) so the version string is
        # on disk before anything else runs; the bare open().write() form
        # leaves that to the garbage collector.
        with open(self.COG_data_dir_version, 'w') as handle:
            handle.write(COG_DATA_VERSION)

    for file_name in self.files:
        if 'url' not in self.files[file_name]:
            # Nothing to download for this entry.
            continue

        file_path = J(self.raw_NCBI_files_dir, file_name)
        if os.path.exists(file_path):
            continue

        # `progress` and `run` are looked up outside of self; presumably they
        # are defined at module level -- TODO confirm.
        utils.download_file(self.files[file_name]['url'], file_path, progress=progress, run=run)
def check_database(self):
    """Check for the .bin version of the database, creating it if possible.

    Sets `self.database_path` to the `.bin` path inside `self.database_dir`.
    If that file exists nothing else happens. Otherwise:

    - if no `.pir` file exists either, pdb_95.pir.gz is downloaded into
      `self.database_dir` and decompressed with gzip;
    - if a `.pir` file exists (possibly as a result of that download), it
      is binarized with `self.run_binarize_database`.

    Raises ConfigError when `self.modeller_database` has an extension other
    than `.bin`, `.pir`, or none.
    """
    extensionless, extension = os.path.splitext(self.modeller_database)
    if extension not in (".bin", ".pir", ""):
        raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

    bin_db_path = J(self.database_dir, extensionless + ".bin")
    pir_db_path = J(self.database_dir, extensionless + ".pir")

    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    self.database_path = bin_db_path

    if bin_exists:
        return

    # From here on the .bin file is known to be missing.
    if not pir_exists:
        self.progress.clear()
        # Adjacent string literals are used instead of backslash continuations
        # inside one literal, which would leak stray backslashes/indentation
        # into the message shown to the user.
        self.run.warning(
            "Anvi'o looked in {} for a database with the name {} and with an extension "
            "of either .bin or .pir, but didn't find anything matching that "
            "criteria. Anvi'o will try and download the best database it knows of from "
            "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
            "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
            "database".format(self.database_dir, self.modeller_database))

        db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
        utils.download_file(
            "https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
        utils.run_command(
            ['gzip', '-d', db_download_path],
            log_file_path=filesnpaths.get_temp_file_path())

        # NOTE(review): the download is always named pdb_95; if
        # self.modeller_database has a different name this re-check stays
        # False and nothing gets binarized -- confirm that is intended.
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if pir_exists:
        self.progress.clear()
        self.run.warning(
            "Your database is not in binary format. That means accessing its contents is slower "
            "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)
def check_database(self):
    """Check for the .bin version of the database; binarize a .pir if needed.

    `self.database_path` is set to `<self.database_dir>/<name>.bin`, where
    `<name>` is `self.modeller_database` without its extension. When the
    .bin file is already there the method returns immediately. When neither
    .bin nor .pir is found, pdb_95.pir.gz is downloaded to
    `self.database_dir` and gunzipped. Whenever a .pir file is then present,
    `self.run_binarize_database` is called to produce the .bin file.

    Raises ConfigError if the extension of `self.modeller_database` is not
    one of '.bin', '.pir', or empty.
    """
    extensionless, extension = os.path.splitext(self.modeller_database)
    if extension not in (".bin", ".pir", ""):
        raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

    bin_db_path = J(self.database_dir, extensionless + ".bin")
    pir_db_path = J(self.database_dir, extensionless + ".pir")

    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    self.database_path = bin_db_path

    if bin_exists:
        return

    # No .bin file past this point, so only the .pir state matters.
    if not pir_exists:
        self.progress.clear()
        # The message is assembled from adjacent literals; continuing a
        # single literal with backslashes would embed stray
        # backslashes/indentation in the text the user sees.
        self.run.warning(
            "Anvi'o looked in {} for a database with the name {} and with an extension "
            "of either .bin or .pir, but didn't find anything matching that "
            "criteria. We'll try and download the best database we know of from "
            "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
            "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
            "database".format(self.database_dir, self.modeller_database))

        db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
        utils.download_file("https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
        utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

        # NOTE(review): gunzip yields pdb_95.pir; for any other database name
        # this lookup presumably still fails and the method ends without a
        # usable database -- verify against callers.
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if pir_exists:
        self.progress.clear()
        self.run.warning(
            "Your database is not in binary format. That means accessing its contents is slower "
            "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)
def check_database(self):
    """Setup the database files

    Downloads the .pir file if it is missing
    Binarizes .pir file if .bin is missing
    Creates the .dmnd file if it is missing

    All paths are built from `self.database_dir` and `self.modeller_database`.
    Binarization is (re)run whenever either the .pir or the .bin file was
    missing on entry.
    """
    bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
    pir_db_path = J(self.database_dir, self.modeller_database + ".pir")

    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if not (bin_exists and pir_exists):
        if not pir_exists:
            # Download .pir
            self.run.warning(
                "Anvi'o looked in {} for a database with the name {} and with an extension "
                "of either .bin or .pir, but didn't find anything matching that "
                "criteria. Anvi'o will try and download the best database it knows of from "
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                "database".format(self.database_dir, self.modeller_database))

            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file(
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
            utils.run_command(
                ['gzip', '-d', db_download_path],
                log_file_path=filesnpaths.get_temp_file_path())

        # Binarize .pir (make .bin)
        self.run.warning(
            "Your database is not in binary format. That means accessing its contents is slower "
            "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)

    dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')
    if os.path.exists(dmnd_db_path):
        return

    self.run.warning("Your diamond database does not exist. It will be created.")

    # Convert the .pir database to FASTA with the helper script.
    script_name = "pir_to_fasta.py"
    self.copy_script_to_directory(script_name)

    fasta_path = J(self.database_dir, self.modeller_database + '.fa')
    # No extension here although the existence check above looks for
    # '.dmnd' -- presumably diamond appends it itself; TODO confirm.
    dmnd_path = J(self.database_dir, self.modeller_database)

    command = [self.executable, script_name, pir_db_path, fasta_path]
    self.run_command(command, script_name=script_name, rename_log=False)

    # Rewrite the FASTA with '-' removed and '.' replaced by 'X'.
    temp = u.FastaOutput(filesnpaths.get_temp_file_path())
    fasta = u.SequenceSource(fasta_path)
    try:
        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))
    finally:
        # Close both handles before the file is moved: shutil.move() falls
        # back to copy+delete across filesystems, and copying a file whose
        # writer is still open can miss data that has not been flushed yet.
        fasta.close()
        temp.close()

    shutil.move(temp.output_file_path, fasta_path)

    driver = diamond.Diamond(
        query_fasta=fasta_path,
        run=terminal.Run(verbose=False),
        progress=terminal.Progress(verbose=False),
    )
    driver.makedb(output_file_path=dmnd_path)

    # The intermediate FASTA is not needed after makedb has run.
    os.remove(fasta_path)