Example #1
0
def initETE3Database(database_directory, ETE3DBTAXAFILE, logging):
    """Initialise the ETE3 taxonomy database, coordinated through a lock file.

    A ``.lock`` file inside ``database_directory`` signals that a process is
    currently running the init. If the lock already exists, this call polls
    once a minute until the lock disappears or its modification time is at
    least 1000 seconds old (in which case it is treated as stale and removed),
    and then proceeds with the init itself.

    Args:
        database_directory: Directory in which the ``.lock`` file is placed.
        ETE3DBTAXAFILE: Path assigned to ``NCBITaxa.dbfile`` before
            ``update_taxonomy_database()`` is called.
        logging: Logger-like object providing ``info()`` and ``warning()``.

    Raises:
        OSError: If the lock file cannot be created for a reason other than
            it already existing (e.g. the directory is missing).
        Exception: Anything raised by the ETE3 update is propagated after the
            lock placed by this call has been released.
    """
    lockfilepath = os.path.join(database_directory, ".lock")
    stale_after_seconds = 1000  # the "16 min mark" mentioned in the log messages
    poll_interval_seconds = 60
    lock_acquired = False

    try:
        # O_CREAT | O_EXCL creates the file only if it does not exist, as one
        # atomic step, so two processes cannot both believe they own the lock.
        os.close(os.open(lockfilepath, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666))
    except FileExistsError:
        while True:
            try:
                elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            except OSError:
                # The lock vanished between two polls: the other process is done.
                break
            logging.info("Lock file found at {}. Waiting for other processes to finish ete3 database init ...".format(
                lockfilepath))
            logging.info(
                "Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time / 60)))
            if elapsed_time >= stale_after_seconds:
                logging.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(
                        int(elapsed_time / 60)))
                try:
                    # The previous owner most likely died; clear its stale lock.
                    os.remove(lockfilepath)
                except OSError:
                    # Another waiting process removed it first.
                    pass
                break
            time.sleep(poll_interval_seconds)
        logging.info("Lock file no longer exists. Assuming init process completed successfully")
    else:
        lock_acquired = True
        logging.info("Placed lock file at {}".format(lockfilepath))

    try:
        ncbi = NCBITaxa()
        ncbi.dbfile = ETE3DBTAXAFILE
        ncbi.update_taxonomy_database()
    finally:
        # Release the lock even when the update fails, so other processes are
        # not left waiting for the stale timeout. Only the process that placed
        # the lock removes it; a process that merely waited must not delete a
        # lock that now belongs to someone else.
        if lock_acquired:
            try:
                os.remove(lockfilepath)
                logging.info("Lock file removed.")
            except OSError:
                logging.warning("Lock file is already removed by some other process.")

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logging.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except OSError:
        # No leftover archive in the working directory; nothing to clean up.
        pass
    logging.info("ETE3 database init completed successfully.")
Example #2
0
def _remove_init_lock(lockfilepath):
    """Delete the init lock file; return False if it could not be removed."""
    try:
        os.remove(lockfilepath)
    except OSError:
        # Typically the file is already gone (removed by another process).
        return False
    return True


def _abort_init(lockfilepath, message):
    """Log ``message`` as an error, release the lock and exit with status -1."""
    logger.error(message)
    _remove_init_lock(lockfilepath)
    sys.exit(-1)


def _wait_for_init_lock(lockfilepath):
    """Poll once a minute until the lock file disappears or is >= 1000 s old."""
    while True:
        try:
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
        except OSError:
            # The lock vanished between two polls: the other process is done.
            break
        logger.info(
            "Lock file found at {}. Waiting for other processes to finish database init ..."
            .format(lockfilepath))
        logger.info(
            "Elapsed time {} min. Will continue processing after 16 min mark."
            .format(int(elapsed_time / 60)))
        if elapsed_time >= 1000:
            logger.info(
                "Elapsed time {} min. Assuming previous process completed all init steps. Continue ..."
                .format(int(elapsed_time / 60)))
            # The previous owner most likely died; clear its stale lock.
            _remove_init_lock(lockfilepath)
            break
        time.sleep(60)
    logger.info(
        "Lock file no longer exists. Assuming init process completed successfully"
    )


def main():
    """Download and build the databases inside the configured directory.

    The directory is taken from ``arguments().database_directory``. A
    ``.lock`` file in that directory marks an init in progress: if one already
    exists, this call waits for it to disappear (or go stale) and returns 0
    without doing any work. Otherwise it downloads the archive from the first
    working entry of ``config['db_mirrors']``, extracts it, builds the BLAST
    and mash databases, updates the ETE3 taxonomy database, writes
    ``status.txt`` and removes the lock.

    Returns:
        0 on success, or after waiting for another process's init.

    Raises:
        SystemExit: With status -1 when the download, extraction, database
            build or ETE3 step fails; the lock file is removed first.
    """
    args = arguments()

    database_directory = os.path.abspath(args.database_directory)

    if os.path.exists(database_directory):
        logger.info("Database directory folder already exists at {}".format(
            database_directory))
    else:
        # makedirs also creates missing parents, and exist_ok tolerates another
        # process creating the directory at the same moment.
        os.makedirs(database_directory, exist_ok=True)

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    try:
        # O_CREAT | O_EXCL creates the file only if it does not exist, as one
        # atomic step, so two processes cannot both believe they own the lock.
        os.close(
            os.open(lockfilepath, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666))
        logger.info("Placed lock file at {}".format(lockfilepath))
    except FileExistsError:
        # Someone else is initialising: wait for them and rely on their result.
        _wait_for_init_lock(lockfilepath)
        return 0
    except OSError as e:
        # Historical behaviour: report the problem and carry on without a
        # lock; the following steps fail on their own if the directory is
        # genuinely unusable.
        logger.error(
            "Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?"
            .format(lockfilepath))
        logger.error("{}".format(e))

    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    for db_mirror in config['db_mirrors']:
        logger.info('Trying mirror {}'.format(db_mirror))
        try:
            download_to_file(db_mirror, zip_file)
        except Exception as e:
            # Fall through to the next mirror instead of giving up at once.
            logger.error(
                "Download from mirror {} failed with error {}.".format(
                    db_mirror, e))
            continue
        break
    else:
        # Reached only when no mirror succeeded (or none is configured).
        _abort_init(lockfilepath,
                    "Download failed for all mirrors. Removing lock file")

    logger.info(
        "Downloading databases successful, now building databases at {}".
        format(database_directory))

    try:
        extract(zip_file, database_directory)
        # The main archive may itself contain gzipped members; unpack them too.
        archives = [
            prepend_db_dir(name) for name in os.listdir(database_directory)
            if name.endswith('.gz')
        ]
        for archive in archives:
            extract(archive, database_directory)
    except Exception as e:
        _abort_init(
            lockfilepath,
            "Extracting databases failed with error {}. Removing lock file".
            format(e))

    #Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file,
                                   database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger,
                                 True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file,
                        mash_db_file,
                        num_threads=num_threads)
    except Exception as e:
        logger.error('Building blast and mash databases failed')
        _abort_init(
            lockfilepath,
            "Process failed with error {}. Removing lock file".format(e))

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(
            os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        _abort_init(
            lockfilepath,
            "Init of ete3 library failed with error {}. Removing lock file".
            format(e))

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info(
            "Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job."
        )
    except OSError:
        # No leftover archive in the working directory; nothing to clean up.
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))

    # Release the lock only after the status file has been closed, so a
    # waiting process never observes "no lock" before the status is on disk.
    if not _remove_init_lock(lockfilepath):
        logger.warning("Lock file is already removed by some other process.")

    logger.info("MOB init completed successfully")
    return 0