def initETE3Database(database_directory, ETE3DBTAXAFILE, logging):
    """Initialize the ETE3 taxonomy database, guarded by a lock file.

    A ``.lock`` file inside ``database_directory`` signals that an init is in
    progress. If no lock exists, one is created. Otherwise this call polls
    once a minute until the lock disappears or until its modification time is
    at least 1000 seconds old, at which point the lock is treated as stale and
    removed. In both cases the taxonomy database is then (re)built.

    Args:
        database_directory: Directory in which the ``.lock`` file lives.
        ETE3DBTAXAFILE: Path assigned to ``NCBITaxa().dbfile`` before the
            taxonomy update is run.
        logging: Logger-like object providing ``info`` and ``warning``.

    Raises:
        Whatever ``NCBITaxa()`` / ``update_taxonomy_database()`` raise; the
        lock file is removed before the exception propagates.
    """
    stale_after_seconds = 1000  # ~16 min, matches the log messages below
    poll_interval_seconds = 60

    lockfilepath = os.path.join(database_directory, ".lock")

    if not os.path.exists(lockfilepath):
        with open(file=lockfilepath, mode="w"):
            pass
        logging.info("Placed lock file at {}".format(lockfilepath))
    else:
        while os.path.exists(lockfilepath):
            try:
                elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            except OSError:
                # The lock vanished between the exists() check and the stat:
                # the other process has just finished, so stop waiting.
                break
            logging.info("Lock file found at {}. Waiting for other processes to finish ete3 database init ...".format(
                lockfilepath))
            logging.info(
                "Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time / 60)))
            if elapsed_time >= stale_after_seconds:
                logging.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(
                        int(elapsed_time / 60)))
                try:
                    # If the previous process failed, nothing is running and
                    # the lock is stale: clear it ourselves.
                    os.remove(lockfilepath)
                except OSError:
                    # Another process removed the file first; carry on.
                    pass
                break
            time.sleep(poll_interval_seconds)  # recheck every 1 min
        logging.info("Lock file no longer exists. Assuming init process completed successfully")

    try:
        ncbi = NCBITaxa()
        ncbi.dbfile = ETE3DBTAXAFILE
        ncbi.update_taxonomy_database()
    finally:
        # Always release the lock, even when the update fails, so that other
        # processes are not blocked until the stale-lock timeout expires.
        try:
            os.remove(lockfilepath)
            logging.info("Lock file removed.")
        except OSError:
            logging.warning("Lock file is already removed by some other process.")

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logging.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except OSError:
        # No residual archive in the working directory (or it is not removable).
        pass
    logging.info("ETE3 database init completed successfully.")
def main():
    """Download and build the databases into the configured directory.

    Steps, in order:
      1. Create the database directory if needed and take a ``.lock`` file.
         If a lock already exists, wait (polling every minute) until it is
         removed or is at least 1000 seconds old, then return 0 without
         rebuilding anything.
      2. Download the data archive, trying each configured mirror in turn.
      3. Extract the archive and every ``.gz`` file in the directory.
      4. Build the BLAST databases and the mash sketch.
      5. Initialize the ete3 taxonomy database.
      6. Write the status file and remove the lock.

    Returns:
        0 on success, or when another process held the lock.

    Raises:
        SystemExit: with code -1 when the download, the database build or the
            ete3 initialization fails (the lock file is removed first).
    """
    args = arguments()

    database_directory = os.path.abspath(args.database_directory)

    if not os.path.exists(database_directory):
        os.mkdir(database_directory)
    else:
        logger.info("Database directory folder already exists at {}".format(
            database_directory))

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    def remove_lock():
        """Best-effort removal of the lock file; return True if it was removed."""
        try:
            os.remove(lockfilepath)
        except OSError:
            return False
        return True

    def abort(message):
        """Log ``message``, release the lock and exit with status -1."""
        logger.error(message)
        # A missing lock must not mask the real failure or skip the exit.
        remove_lock()
        sys.exit(-1)

    if not os.path.exists(lockfilepath):
        try:
            with open(file=lockfilepath, mode="w"):
                pass
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            # Deliberately non-fatal: report the problem and keep going.
            logger.error(
                "Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?"
                .format(lockfilepath))
            logger.error("{}".format(e))
    else:
        while os.path.exists(lockfilepath):
            try:
                elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            except OSError:
                # The lock vanished between the exists() check and the stat:
                # the other process has just finished, so stop waiting.
                break
            logger.info(
                "Lock file found at {}. Waiting for other processes to finish database init ..."
                .format(lockfilepath))
            logger.info(
                "Elapsed time {} min. Will continue processing after 16 min mark."
                .format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logger.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ..."
                    .format(int(elapsed_time / 60)))
                # If the previous process failed, nothing is running and the
                # lock is stale; it may also already be gone, which is fine.
                remove_lock()
                break
            time.sleep(60)  # recheck every 1 min if lock file was removed
        logger.info(
            "Lock file no longer exists. Assuming init process completed successfully"
        )
        return 0

    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash
    # sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    os.makedirs(database_directory, exist_ok=True)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    # Try every mirror in order; give up only when none of them worked.
    last_error = "no database mirrors are configured"
    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            break
        except Exception as e:
            last_error = e
            logger.warning(
                "Download from mirror {} failed with error {}".format(
                    db_mirror, str(e)))
    else:
        abort("Download failed with error {}. Removing lock file".format(
            str(last_error)))

    logger.info(
        "Downloading databases successful, now building databases at {}".
        format(database_directory))
    extract(zip_file, database_directory)

    files = [
        prepend_db_dir(f) for f in os.listdir(database_directory)
        if f.endswith('.gz')
    ]

    for file in files:
        extract(file, database_directory)

    # Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file,
                                   database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger,
                                 True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file,
                        mash_db_file,
                        num_threads=num_threads)
    except Exception as e:
        # NOTE(review): this message mentions downloading although the step
        # that failed builds the databases; text kept as-is for log stability.
        logger.error(
            'Downloading databases failed, please check your internet connection and retry'
        )
        abort("Process failed with error {}. Removing lock file".format(e))

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(
            os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        abort("Init of ete3 library failed with error {}. Removing lock file".
              format(e))

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info(
            "Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job."
        )
    except OSError:
        # No residual archive in the working directory (or it is not removable).
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))

    if not remove_lock():
        logger.warning("Lock file is already removed by some other process.")

    logger.info("MOB init completed successfully")
    return 0