def test_to_database_nofolder_per_genome(caplog):
    """
    Test behavior when the folder <section>/bacteria exists, but there are no folders inside
    -> should exit with error message

    The scenario is run for both NCBI sections: refseq, then genbank.
    """
    caplog.set_level(logging.DEBUG)
    for section in ("refseq", "genbank"):
        # Forget records of the previous section: otherwise the assertions below could
        # be satisfied by messages logged during the previous iteration.
        caplog.clear()
        outdir = os.path.join(GENEPATH, f"out-{section}")
        empty_dir = os.path.join(outdir, section, "bacteria")
        os.makedirs(empty_dir)
        with pytest.raises(SystemExit):
            downg.to_database(outdir, section)
        # Check error message is as expected
        assert "ERROR" in caplog.text
        assert (f"The folder supposed to contain genomes downloaded from NCBI {section} "
                f"(test/data/prepare/generated_by_unit-tests/out-{section}/{section}/bacteria) "
                "exists but is empty") in caplog.text
        assert "Check that you really downloaded sequences (fna.gz)" in caplog.text
def test_to_database_nofolder_refseq(caplog):
    """
    Test behavior when the folder that should contain the downloaded genomes
    (<section>/bacteria) does not exist
    -> should exit with error message

    Note: despite the name of this test, the section requested here is 'genbank'.
    """
    caplog.set_level(logging.DEBUG)
    # Messages which must all be found in the log once to_database has exited
    expected_messages = (
        "ERROR",
        ("The folder containing genomes downloaded from NCBI genbank "
         "(test/data/prepare/generated_by_unit-tests/genbank/bacteria) "
         "does not exist."),
        ("Check that you really downloaded sequences (fna.gz) and that they are "
         "in this folder"),
    )
    with pytest.raises(SystemExit):
        downg.to_database(GENEPATH, "genbank")
    logged = caplog.text
    for message in expected_messages:
        assert message in logged
def test_to_database_several_genomes(caplog):
    """
    Test behavior when the folder refseq/bacteria exists, there are subfolders inside,
    but 1 of them contains more than 1 genome: warning message informing that this
    genome will be ignored
    """
    # Must be set before running to_database: changing the level afterwards has no
    # effect on records which were already emitted.
    caplog.set_level(logging.DEBUG)
    out_dir = os.path.join(GENEPATH, "genomes")
    refseq_dir = os.path.join(DATA_TEST_DIR, "genomes")
    # Copy content of refseq in genomes test data to output folder that will be used
    shutil.copytree(refseq_dir, out_dir)
    # Create a new gz file in one of the genome directories
    to_create_filename = "ACOR002.0519.bis.fna.gz"  # Name of file that must be created
    to_fill_dir = "ACOR002"  # Directory containing file to create
    to_create_path = os.path.join(out_dir, "refseq", "bacteria", to_fill_dir,
                                  to_create_filename)
    # Create empty gz file (context manager guarantees the handle is closed)
    with open(to_create_path, "w"):
        pass
    # Run to_database, and check that only 2 genomes were considered
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")
    # Check that a warning message was raised, indicating that genome is ignored
    assert "WARNING" in caplog.text
    assert ("Problem with genome in ACOR002: several compressed fasta files found. "
            "This genome will be ignored.") in caplog.text
    # The ambiguous genome is absent from the database, the 2 others are present
    assert not os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
def test_to_database_1empty_genome_folder(caplog):
    """
    Test behavior when the folder refseq/bacteria exists and has 1 subfolder per genome,
    but 1 of those subfolders does not contain any file: a warning must inform that
    this genome will be ignored, and the other genomes must be added to the database.
    """
    caplog.set_level(logging.DEBUG)
    source_genomes = os.path.join(DATA_TEST_DIR, "genomes")
    out_dir = os.path.join(GENEPATH, "1empty_genome_folder")
    # Work on a copy of the genomes test data
    shutil.copytree(source_genomes, out_dir)
    # Empty 1 genome directory by deleting the only file it contains
    emptied_genome_dir = os.path.join(out_dir, "refseq", "bacteria", "ACOR003")
    os.remove(os.path.join(emptied_genome_dir, "ACOR003.0519.fna.gz"))

    # Run to_database: only the 2 remaining genomes must be counted
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")

    # Check that a warning message was raised, indicating that genome is ignored
    logged = caplog.text
    assert "WARNING" in logged
    assert ("Problem with genome in ACOR003: no compressed fasta file downloaded. "
            "This genome will be ignored.") in logged

    # Genome of the emptied folder is not in the database, the 2 others are
    assert not os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
    for kept_genome in ("ACOR001.0519.fna", "ACOR002.0519.fna"):
        assert os.path.isfile(os.path.join(db_dir, kept_genome))
def test_to_database_1genome_wrong_format(caplog):
    """
    Test behavior when the folder refseq/bacteria exists, there is 1 genome per
    subfolder, but 1 genome cannot be unzipped
    """
    # Must be set before running to_database: changing the level afterwards has no
    # effect on records which were already emitted.
    caplog.set_level(logging.DEBUG)
    out_dir = os.path.join(GENEPATH, "genomes")
    refseq_dir = os.path.join(DATA_TEST_DIR, "genomes")
    # Copy content of refseq in genomes test data to output folder that will be used
    shutil.copytree(refseq_dir, out_dir)
    # Name of directory directly containing the original gz file
    to_corrupt_dir = "ACOR001"
    to_corrupt_filename = "ACOR001.0519.fna.gz"
    to_corrupt_path = os.path.join(out_dir, "refseq", "bacteria", to_corrupt_dir,
                                   to_corrupt_filename)
    # Create fake gz file (txt file). The context manager guarantees that the content
    # is flushed and the handle closed before to_database reads the file.
    with open(to_corrupt_path, "w") as false_gz:
        false_gz.write("This is not a gz file")
    # Run to_database
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")
    # Check that an error message was raised, indicating that genome is ignored
    assert "ERROR" in caplog.text
    assert ("Error while trying to uncompress "
            "test/data/prepare/generated_by_unit-tests/genomes/Database_init/ACOR001.0519.fna.gz. "
            "This genome will be ignored") in caplog.text
    # Check that there are only 2 files in the database, and that they correspond
    # to uncompressed gz files
    list_db = os.listdir(db_dir)
    assert len(list_db) == 2
    assert not os.path.isfile(os.path.join(db_dir, to_corrupt_filename))
    assert os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
def test_to_database():
    """
    Test that all fna.gz files are uncompressed and moved to a created
    Database_init folder
    """
    out_dir = os.path.join(DATA_TEST_DIR, "genomes")
    db_dir = os.path.join(out_dir, "Database_init")
    try:
        nb_gen, db_init_dir = downg.to_database(out_dir, "refseq")
        assert os.path.isdir(db_dir)
        files_all = glob.glob(os.path.join(db_dir, "*"))
        files_fna = glob.glob(os.path.join(db_dir, "*.fna"))
        # Check that all files in result database are .fna files
        assert len(files_all) == len(files_fna)
        # And that there are exactly 3 of them
        assert len(files_fna) == 3
        # Check that we have as many genomes as expected, and that the output database
        # has the expected name
        assert nb_gen == 3
        assert db_init_dir == db_dir
        # Each one of the 3 genomes must have its own uncompressed file
        assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
        assert os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
        assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
    finally:
        # The database is created inside the test data folder: always remove it, even
        # when an assertion fails, so that it does not leak into the test data.
        if os.path.isdir(db_dir):
            shutil.rmtree(db_dir)
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels,
         ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
         l90, nbcont, cutn, min_dist, max_dist, verbose, quiet):
    """
    Main method, constructing the draft dataset for the given species

    verbosity:
    - default 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
      .log.err contains warning and more
    - 1: same as 0 + WARNING in stderr
    - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
    - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
      from info to debug

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download. If it is the path to an existing file, the name of
        this file (without extension) is used to name outputs.
    levels : str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        NCBI section to use. It names the download folder
        (<outdir>/<ncbi_section>/bacteria).
    outdir : str
        path to output directory (where created database will be saved).
        If empty, named after the species (or taxid, or strains).
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N').
        If empty, <outdir>/tmp_files is used.
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user already has the database and quality of each genome
        (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before (columns to_annotate,
        gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each when there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : float
        lower limit of distance between 2 genomes to keep them
    max_dist : float
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity level, see the list above
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    object
        value returned by fg.write_outputfiles (presumably the path to the file listing
        the genomes kept -- to confirm in that function)
    """
    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))
    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxid given by user, use taxID (if given)
    # to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"
    # Default outdir is species name if given, or species taxID if not
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    # A single if/elif/else chain guarantees that 'level' is always defined.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    elif verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    else:
        level = logging.DEBUG
    # Only the file name is formatted: applying str.format to the whole joined path
    # would fail (or alter the path) when outdir itself contains '{' or '}'.
    logfile_base = os.path.join(outdir, f"PanACoTA_prepare_{species_linked}")
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare',
                                             log_details=True, verbose=verbose, quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += "cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.back'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/refeq/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from refseq/bacteria folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section,
                                                    ncbi_species_name, ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains, levels,
                                                    outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        # info-file not given or missing -> error and exit.
        # 'not info_file' is checked first because os.path.exists(None) raises TypeError.
        if not info_file or not os.path.exists(info_file):
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the "
                "right name/path, or remove the '--mash-only option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info(("You want to run only mash steps. Getting information "
                     "from {}").format(info_file))
        genomes = utils.read_genomes_info(info_file, species_linked)

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
                                min_dist, max_dist, threads, quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file