Python to_databaseの例、PanACoTA.prepare_module.download_genomes_func.to_database Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database_nofolder_per_genome(caplog):
    """
    Test behavior when the folder <section>/bacteria exists, but there are no folders inside
    -> should exit with error message

    The same scenario is checked for the two NCBI sections: 'refseq' and 'genbank'.
    """
    caplog.set_level(logging.DEBUG)
    for section in ("refseq", "genbank"):
        # Drop the records captured for the previous section: otherwise the generic
        # assertions below ("ERROR", "Check that you really...") would be satisfied by
        # messages of the previous run, whatever the current run logs.
        caplog.clear()
        outdir = os.path.join(GENEPATH, f"out-{section}")
        empty_dir = os.path.join(outdir, section, "bacteria")
        os.makedirs(empty_dir)
        with pytest.raises(SystemExit):
            downg.to_database(outdir, section)
        # Check error message is as expected
        assert "ERROR" in caplog.text
        assert (
            f"The folder supposed to contain genomes downloaded from NCBI {section} "
            f"(test/data/prepare/generated_by_unit-tests/out-{section}/{section}/bacteria) "
            "exists but is empty") in caplog.text
        assert (
            "Check that you really downloaded sequences (fna.gz)") in caplog.text

コード例 #2

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database_nofolder_refseq(caplog):
    """
    Test behavior when the folder that should contain the downloaded genomes does not exist
    (the call below asks for the 'genbank' section, whose folder is not in GENEPATH)
    -> should exit with error message
    """
    # Pieces of the error message that must all be found in the captured logs
    expected_messages = (
        "ERROR",
        ("The folder containing genomes downloaded from NCBI genbank "
         "(test/data/prepare/generated_by_unit-tests/genbank/bacteria) "
         "does not exist."),
        ("Check that you really downloaded sequences (fna.gz) and that they are "
         "in this folder"),
    )

    caplog.set_level(logging.DEBUG)
    with pytest.raises(SystemExit):
        downg.to_database(GENEPATH, "genbank")

    logged = caplog.text
    for expected in expected_messages:
        assert expected in logged

コード例 #3

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database_several_genomes(caplog):
    """
    Test behavior when the folder refseq/bacteria exists, there are subfolders inside,
    but 1 of them contains more than 1 genome: warning message informing that this
    genome will be ignored
    """
    # The capture level must be set before running the tested function, otherwise
    # it has no effect on the records collected during the call
    caplog.set_level(logging.DEBUG)

    out_dir = os.path.join(GENEPATH, "genomes")
    refseq_dir = os.path.join(DATA_TEST_DIR, "genomes")
    # Copy content of refseq in genomes test data to output folder that will be used
    shutil.copytree(refseq_dir, out_dir)

    # Create a new gz file in one of the genome directories
    to_create_filename = "ACOR002.0519.bis.fna.gz"  # Name of file that must be created
    to_fill_dir = "ACOR002"  # Directory containing file to create
    to_create_path = os.path.join(out_dir, "refseq", "bacteria", to_fill_dir,
                                  to_create_filename)
    # Create empty gz file (nothing to write: opening it in 'w' mode is enough)
    with open(to_create_path, "w"):
        pass

    # Run to_database, and check that only 2 genomes were considered
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")

    # Check that a warning message was raised, indicating that genome is ignored
    assert "WARNING" in caplog.text
    assert (
        "Problem with genome in ACOR002: several compressed fasta files found. "
        "This genome will be ignored.") in caplog.text
    # The genome with 2 compressed files is absent from the database, the 2 others are there
    assert not os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))

コード例 #4

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database_1empty_genome_folder(caplog):
    """
    Test behavior when the folder refseq/bacteria exists, there are subfolders inside,
    but 1 of them is empty: warning message informing that this genome will be ignored
    """
    caplog.set_level(logging.DEBUG)

    # Work on a copy of the genomes test data, so that the reference data is not altered
    source_dir = os.path.join(DATA_TEST_DIR, "genomes")
    out_dir = os.path.join(GENEPATH, "1empty_genome_folder")
    shutil.copytree(source_dir, out_dir)

    # Empty 1 genome directory by deleting the only compressed file it contains
    emptied_genome_dir = os.path.join(out_dir, "refseq", "bacteria", "ACOR003")
    os.remove(os.path.join(emptied_genome_dir, "ACOR003.0519.fna.gz"))

    # Run to_database: only the 2 remaining genomes must be counted
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")

    # Check that a warning message was raised, indicating that genome is ignored
    logged = caplog.text
    assert "WARNING" in logged
    assert (
        "Problem with genome in ACOR003: no compressed fasta file downloaded. "
        "This genome will be ignored.") in logged

    # The ignored genome is not in the database, the 2 other ones are
    assert not os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
    for kept_genome in ("ACOR001.0519.fna", "ACOR002.0519.fna"):
        assert os.path.isfile(os.path.join(db_dir, kept_genome))

コード例 #5

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database_1genome_wrong_format(caplog):
    """
    Test behavior when the folder refseq/bacteria exists, there is 1 genome per subfolder,
    but 1 genome cannot be unzipped
    """
    # The capture level must be set before running the tested function, otherwise
    # it has no effect on the records collected during the call
    caplog.set_level(logging.DEBUG)

    out_dir = os.path.join(GENEPATH, "genomes")
    refseq_dir = os.path.join(DATA_TEST_DIR, "genomes")
    # Copy content of refseq in genomes test data to output folder that will be used
    shutil.copytree(refseq_dir, out_dir)

    # Name of directory directly containing the original gz file
    to_corrupt_dir = "ACOR001"
    to_corrupt_filename = "ACOR001.0519.fna.gz"
    to_corrupt_path = os.path.join(out_dir, "refseq", "bacteria",
                                   to_corrupt_dir, to_corrupt_filename)
    # Overwrite the gz file with a fake one (plain text file)
    with open(to_corrupt_path, "w") as false_gz:
        false_gz.write("This is not a gz file")

    # Run to_database
    nb_gen, db_dir = downg.to_database(out_dir, "refseq")
    assert nb_gen == 2
    assert db_dir == os.path.join(out_dir, "Database_init")

    # Check that an error message was raised, indicating that genome is ignored
    assert "ERROR" in caplog.text
    assert (
        "Error while trying to uncompress "
        "test/data/prepare/generated_by_unit-tests/genomes/Database_init/ACOR001.0519.fna.gz. "
        "This genome will be ignored") in caplog.text
    # Check that there are only 2 files in the database, and that they correspond
    # to uncompressed gz files
    list_db = os.listdir(db_dir)
    assert len(list_db) == 2
    assert not os.path.isfile(os.path.join(db_dir, to_corrupt_filename))
    assert os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
    assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))

コード例 #6

0

ファイルを表示

ファイル: test_download.py プロジェクト: gem-pasteur/PanACoTA

def test_to_database():
    """
    Test that all fna.gz files are uncompressed and moved to a created Database_init folder
    """
    out_dir = os.path.join(DATA_TEST_DIR, "genomes")
    db_dir = os.path.join(out_dir, "Database_init")
    try:
        nb_gen, db_init_dir = downg.to_database(out_dir, "refseq")
        assert os.path.isdir(db_dir)
        files_all = glob.glob(os.path.join(db_dir, "*"))
        files_fna = glob.glob(os.path.join(db_dir, "*.fna"))
        # Check that all files in result database are .fna files
        assert len(files_all) == len(files_fna)
        # And that there are exactly 3 of them
        assert len(files_fna) == 3
        # Check that we have as many genomes as expected, and that the output database has the
        # expected name
        assert nb_gen == 3
        assert db_init_dir == db_dir
        # Each one of the 3 genomes must have its uncompressed file in the database
        assert os.path.isfile(os.path.join(db_dir, "ACOR001.0519.fna"))
        assert os.path.isfile(os.path.join(db_dir, "ACOR002.0519.fna"))
        assert os.path.isfile(os.path.join(db_dir, "ACOR003.0519.fna"))
    finally:
        # The database is created inside the test data folder: always remove it, even
        # when an assertion fails, so that it does not pollute the following runs.
        # ignore_errors: the folder may not exist if to_database itself failed.
        shutil.rmtree(db_dir, ignore_errors=True)

コード例 #7

0

ファイルを表示

def _get_species_linked(ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains):
    """
    Build the label used to name the default output directory, the log files and
    the output files, from the first available information among: species name,
    species taxid, taxid, strains. Returns "NA" if none of them is given.
    """
    # Species name: spaces and '/' are replaced by '_'
    if ncbi_species_name:
        return "_".join(ncbi_species_name.split()).replace("/", "_")
    # if species name not given by user, use species taxID (if given)
    if ncbi_species_taxid:
        return str(ncbi_species_taxid)
    # if neither species name nor species taxID given by user, use taxID (if given)
    if ncbi_taxid:
        return str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    if ncbi_strains:
        # Strains given in a file: use the file name without its extension
        if os.path.isfile(ncbi_strains):
            return os.path.splitext(os.path.basename(ncbi_strains))[0]
        # Strains given as a string: spaces -> '_', '/' -> '-', ',' -> '_and_'
        strains = "_".join(ncbi_strains.split())
        return strains.replace("/", "-").replace(",", "_and_")
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    return "NA"


def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains,
         levels, ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir,
         only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose,
         quiet):
    """
    Main method, constructing the draft dataset for the given species

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download (either a string, or the path to an existing file)
    levels: str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        NCBI section to use (name of the subfolder of 'outdir' in which downloaded
        genomes are expected: outdir/<ncbi_section>/bacteria)
    outdir : str
        path to output directory (where created database will be saved).
        If empty, the species label (name, taxid or strains) is used.
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N').
        If empty, outdir/tmp_files is used.
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user already has the database and quality of each genome (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before (columns to_annotate,
        gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each time there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : int or float
        lower limit of distance between 2 genomes to keep them
    max_dist : int or float
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity:
        - default 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
          .log.err contains warning and more
        - 1: same as 0 + WARNING in stderr
        - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
        - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
          from info to debug
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    info_file
        value returned by ``fg.write_outputfiles`` (presumably the file listing the
        genomes kept after the mash step -- to confirm against that function)

    Raises
    ------
    SystemExit
        (exit code 1) if the sequences to analyse cannot be found (with 'norefseq'),
        or if the info file is missing (with 'only_mash')
    """
    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    species_linked = _get_species_linked(ncbi_species_name, ncbi_species_taxid,
                                         ncbi_taxid, ncbi_strains)
    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose >= 15:
        # for verbose >= 15, write everything
        level = logging.DEBUG
    elif verbose >= 2:
        # for 2 <= verbose < 15, ignore only debug
        level = utils.detail_lvl()  # int corresponding to detail level
    else:
        level = logging.INFO
    # Only the file name is formatted: 'outdir' is given by the user and could
    # itself contain '{' or '}' characters
    logfile_base = os.path.join(outdir, f"PanACoTA_prepare_{species_linked}")
    logfile_base, logger = utils.init_logger(logfile_base,
                                             level,
                                             'prepare',
                                             log_details=True,
                                             verbose=verbose,
                                             quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += "cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with a '.back' suffix
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in Database_init, genomes must be in
                    # outdir/<ncbi_section>/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from <ncbi_section>/bacteria folder to Database_init
                    dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked,
                                                    ncbi_section,
                                                    ncbi_species_name,
                                                    ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains,
                                                    levels, outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90,
                                   nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        # info-file not given or missing -> error and exit
        # ('not info_file' is tested first: os.path.exists(None) raises a TypeError)
        if not info_file or not os.path.exists(info_file):
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the  "
                "right name/path, or remove the '--mash-only option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info("You want to run only mash steps. Getting information "
                    f"from {info_file}")
        genomes = utils.read_genomes_info(
            info_file,
            species_linked,
        )

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir,
                                species_linked, min_dist, max_dist, threads,
                                quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file