Esempio n. 1
0
def test_run_all_prodigal_error_train():
    """
    Prodigal training fails -> no genome is annotated.

    The genome used for training (H299_H561.fasta) is too small to train on, so
    the annotation step must return False for every genome, and the log queue
    must contain exactly the 4 messages leading up to the training error.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')
    train_genome = "H299_H561.fasta"
    other_genome = "A_H738.fasta"
    train_path = os.path.join(GEN_PATH, train_genome)
    other_path = os.path.join(GEN_PATH, other_genome)
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {}
    genomes[train_genome] = ["test_runall_1by1_1", train_path, train_path, 12656, 3, 0]
    genomes[other_genome] = ["test_runall_1by1_2", other_path, other_path, 456464645, 1, 465]
    final = afunc.run_annotation_all(genomes,
                                     8,
                                     False,
                                     GENEPATH,
                                     train_genome,
                                     prodigal_only=True,
                                     quiet=True)
    # Training failed: nothing can be annotated
    for gname in (train_genome, other_genome):
        assert not final[gname]
    # Logs are produced sequentially here, so their order is known
    expected_logs = [
        "Annotating all genomes with prodigal",
        ("Prodigal will train using "
         "test/data/annotate/genomes/H299_H561.fasta"),
        ("prodigal command: prodigal -i "
         "test/data/annotate/genomes/H299_H561.fasta -t "
         "test/data/annotate/generated_by_unit-tests/H299_H561.fasta.trn"),
        ("Error while trying to train prodigal on H299_H561.fasta. See "
         "test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta.trn-prodigal-train.log.err."),
    ]
    log_queue = logger[0]
    assert log_queue.qsize() == 4
    for expected in expected_logs:
        assert log_queue.get().message == expected
Esempio n. 2
0
def test_run_all_prodigal():
    """
    Prodigal runs without problem on all genomes.

    Training is done on A_H738.fasta, then both genomes are annotated. The 4 first
    logs (up to the end of training) are ordered; the 6 annotation logs can come
    in any order (ex: start1, start2, end2, end1).
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')
    first_genome = "H299_H561.fasta"
    second_genome = "A_H738.fasta"
    first_path = os.path.join(GEN_PATH, first_genome)
    second_path = os.path.join(GEN_PATH, second_genome)
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {}
    genomes[first_genome] = ["test_runall_1by1_1", first_path, first_path, 12656, 3, 0]
    genomes[second_genome] = ["test_runall_1by1_2", second_path, second_path,
                              456464645, 1, 465]
    final = afunc.run_annotation_all(genomes,
                                     8,
                                     False,
                                     GENEPATH,
                                     second_genome,
                                     prodigal_only=True,
                                     quiet=True)
    for gname in (first_genome, second_genome):
        assert final[gname]
    log_queue = logger[0]
    assert log_queue.qsize() == 10

    # Training step: ordered logs
    training_logs = [
        "Annotating all genomes with prodigal",
        "Prodigal will train using test/data/annotate/genomes/A_H738.fasta",
        ("prodigal command: prodigal -i "
         "test/data/annotate/genomes/A_H738.fasta -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn"),
        "End training on test/data/annotate/genomes/A_H738.fasta",
    ]
    for expected in training_logs:
        assert log_queue.get().message == expected

    # Annotation step: we cannot know the order of the logs, as 'genomes' is a dict
    # and as computation is done in parallel. Only check that they all exist.
    messages = [log_queue.get().message for _ in range(6)]
    annotation_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/H299_H561.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal commands
        ("Prodigal command: prodigal -i test/data/annotate/genomes/H299_H561.fasta "
         "-d test/data/annotate/generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        # End annotation
        ("End annotating test_runall_1by1_1 (from test/data/annotate/genomes/"
         "H299_H561.fasta)"),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in annotation_logs:
        assert expected in messages
Esempio n. 3
0
def test_run_all_parallel_prokka_more_threads():
    """
    Run prokka with more threads than genomes (6 threads for 2 genomes, so that
    each prokka command is expected to use 3 cpus).

    H299_H561.fasta must be annotated, while genome1.fasta must be reported as
    failed, with an error on its .faa files.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')
    ok_genome, bad_genome = "H299_H561.fasta", "genome1.fasta"
    ok_path = os.path.join(GEN_PATH, ok_genome)
    bad_path = os.path.join(GEN_PATH, bad_genome)
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {
        ok_genome: ["test_runall_1by1_1", ok_path, ok_path, 12656, 3, 1],
        bad_genome: ["test_runall_1by1_2", bad_path, bad_path, 456464645, 4, 1],
    }
    final = afunc.run_annotation_all(genomes, 6, False, GENEPATH, "nofile.trn")
    assert final[ok_genome]
    assert not final[bad_genome]
    log_queue = logger[0]
    # Expected number of logs:
    # -> starting log -> 1 log
    # -> for genome ok : start annotate, prokka cmd, end annotate -> 3 logs
    # -> for genome not ok : start annotate, prokka cmd, problem, end annotate -> 4 logs
    assert log_queue.qsize() == 8
    assert log_queue.get().message == "Annotating all genomes with prokka"
    # All remaining logs: their order is unknown, as 'genomes' is a dict and as
    # computation is done in parallel. Only check that each expected one exists.
    messages = [log_queue.get().message for _ in range(7)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/genome1.fasta "
         "with Prokka"),
        # Prokka commands
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prokkaRes --cpus 3 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "genome1.fasta-prokkaRes --cpus 3 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # End annotation
        ("End annotating test_runall_1by1_1 from "
         "test/data/annotate/genomes/H299_H561.fasta."),
        ("End annotating test_runall_1by1_2 from "
         "test/data/annotate/genomes/genome1.fasta."),
        # Error for the genome which cannot be annotated
        "test_runall_1by1_2 genome1.fasta: several .faa files",
    )
    for expected in expected_logs:
        assert expected in messages
Esempio n. 4
0
def test_run_all_prokka_parallel_less_threads():
    """
    Run prokka with less threads than genomes (4 threads for 5 genomes; each
    prokka command is expected to use 2 cpus).

    H299_H561 and A_H738 must be annotated. The 3 'genome*' sequences must be
    reported as failed, each with an error on its .faa files.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')
    # (genome file, name, size, nbcont) for each genome to annotate
    descriptions = [
        ("H299_H561.fasta", "test_runall_1by1_1", 12656, 3),
        ("A_H738.fasta", "test_runall_1by1_2", 456464645, 1),
        ("genome1.fasta", "test_runall_1by1_3", 456464645, 4),
        ("genome2.fasta", "test_runall_1by1_4", 456464645, 3),
        ("genome3.fasta", "test_runall_1by1_5", 456464645, 1),
    ]
    gnames = [gfile for gfile, _, _, _ in descriptions]
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {}
    for gfile, name, size, nbcont in descriptions:
        gpath = os.path.join(GEN_PATH, gfile)
        genomes[gfile] = [name, gpath, gpath, size, nbcont, 1]
    final = afunc.run_annotation_all(genomes, 4, False, GENEPATH, "nofile.trn")
    # 2 first genomes are annotated, the 3 others are not
    for annotated in gnames[:2]:
        assert final[annotated]
    for failed in gnames[2:]:
        assert not final[failed]
    log_queue = logger[0]
    # Expected number of logs:
    # -> starting log -> 1 log
    # -> for each genome ok (2 first ones): start annotate, prokka cmd, end annotate -> 6 logs
    # -> for each genome not ok (3 others):
    #           start annotate, prokka cmd, problem, end annotate -> 12 logs
    assert log_queue.qsize() == 19
    assert log_queue.get().message == "Annotating all genomes with prokka"
    # All remaining logs: their order is unknown, as 'genomes' is a dict and as
    # computation is done in parallel. Only check that a sample of them exists.
    messages = [log_queue.get().message for _ in range(18)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/A_H738.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_4 "
         "from test/data/annotate/genomes/genome2.fasta "
         "with Prokka"),
        # Prokka commands
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/A_H738.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "genome1.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_3 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # End annotation
        ("End annotating test_runall_1by1_1 from "
         "test/data/annotate/genomes/H299_H561.fasta."),
        ("End annotating test_runall_1by1_3 from "
         "test/data/annotate/genomes/genome1.fasta."),
        ("End annotating test_runall_1by1_5 from "
         "test/data/annotate/genomes/genome3.fasta."),
        # Errors for genomes which cannot be annotated
        "test_runall_1by1_3 genome1.fasta: several .faa files",
        "test_runall_1by1_4 genome2.fasta: several .faa files",
        "test_runall_1by1_5 genome3.fasta: several .faa files",
    )
    for expected in expected_logs:
        assert expected in messages
Esempio n. 5
0
def test_run_all_1by1_prokka():
    """
    Check that when running with 3 threads for 2 genomes, prokka annotates the
    genomes one by one, and True is returned for each genome.

    -> Runs 1 by 1, with prokka using 3 cpus
    The logs of a given genome (start, prokka command, end) must be contiguous:
    (start1, cmd1, end1, start2, cmd2, end2) or (start2, cmd2, end2, start1, cmd1, end1)
    """
    logger = my_logger("test_runall_1by1_1")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_1by1')
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1 = "H299_H561.fasta"
    gpath1 = os.path.join(GEN_PATH, genome1)
    genome2 = "A_H738.fasta"
    gpath2 = os.path.join(GEN_PATH, genome2)
    genomes = {
        genome1: ["test_runall_1by1_1", gpath1, gpath1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", gpath2, gpath2, 456464645, 1, 465]
    }
    threads = 3
    force = False
    trn_file = "nofile.trn"
    annot_folder = os.path.join(GENEPATH, "annot-folder")
    os.makedirs(annot_folder)
    final = afunc.run_annotation_all(genomes, threads, force, annot_folder,
                                     trn_file)
    assert final[genome1]
    assert final[genome2]
    q = logger[0]
    # 1 starting log + (start, prokka cmd, end) for each of the 2 genomes
    assert q.qsize() == 7
    assert q.get().message == 'Annotating all genomes with prokka'
    # Messages for start and end annotation of the different genomes.
    # Start messages use the same format as the one checked in the parallel prokka tests.
    message_start_annot1 = (
        "Start annotating test_runall_1by1_1 "
        "from test/data/annotate/genomes/H299_H561.fasta "
        "with Prokka")
    message_cmd1 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/H299_H561.fasta-prokkaRes --cpus 3")
    message_end_annot1 = (
        "End annotating test_runall_1by1_1 from test/data/annotate/genomes/"
        "H299_H561.fasta.")
    message_start_annot2 = (
        "Start annotating test_runall_1by1_2 "
        "from test/data/annotate/genomes/A_H738.fasta "
        "with Prokka")
    message_cmd2 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/A_H738.fasta-prokkaRes --cpus 3")
    message_end_annot2 = (
        "End annotating test_runall_1by1_2 from test/data/annotate/genomes/"
        "A_H738.fasta.")
    # start message -> (expected prefix of the prokka command log, expected end message)
    expected_after_start = {
        message_start_annot1: (message_cmd1, message_end_annot1),
        message_start_annot2: (message_cmd2, message_end_annot2),
    }
    # Check logs. We do not assume which genome is annotated first, but as genomes are
    # annotated 1 by 1, a started genome must be ended before the next one starts.
    for _ in range(2):
        started = q.get().message
        # Membership test: 'started == msg1 or msg2' would always be true, as a non-empty
        # string is truthy. Popping the entry also ensures each genome is seen only once.
        assert started in expected_after_start
        cmd_prefix, message_end = expected_after_start.pop(started)
        assert q.get().message.startswith(cmd_prefix)
        assert q.get().message == message_end
Esempio n. 6
0
def test_run_all_prodigal_outexists_error():
    """
    The training file already exists, as well as the (empty) prodigal result
    folder of each genome, and force option is off.

    -> no genome is annotated, and errors on the content of the existing result
    folders are logged.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')
    toto_genome = "toto.fasta"
    ah738_genome = "A_H738.fasta"
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {}
    genomes[toto_genome] = ["test_runall_1by1_1", toto_genome, toto_genome, 12656, 3, 0]
    genomes[ah738_genome] = ["test_runall_1by1_2", ah738_genome, ah738_genome,
                             456464645, 1, 465]
    # Create the (empty) prodigal result directories
    for resdir in ("A_H738.fasta-prodigalRes", "toto.fasta-prodigalRes"):
        os.makedirs(os.path.join(GENEPATH, resdir))
    # Put an already existing training file in the output folder
    existing_trn = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    shutil.copyfile(existing_trn, os.path.join(GENEPATH, "toto.fasta.trn"))
    final = afunc.run_annotation_all(genomes,
                                     1,
                                     False,
                                     GENEPATH,
                                     toto_genome,
                                     prodigal_only=True,
                                     quiet=False)
    for gname in (toto_genome, ah738_genome):
        assert not final[gname]
    log_queue = logger[0]
    assert log_queue.qsize() == 15
    # Ordered logs, before annotation starts
    first_logs = [
        "Annotating all genomes with prodigal",
        "Prodigal will train using toto.fasta",
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will "
         "be used to annotate all genomes."),
    ]
    for expected in first_logs:
        assert log_queue.get().message == expected
    # Remaining logs: we cannot know their order, as 'genomes' is a dict
    # and as computation is done in parallel. Only check that they exist.
    messages = [log_queue.get().message for _ in range(12)]
    expected_logs = (
        # First genome (toto.fasta)
        ("Start annotating test_runall_1by1_1 "
         "(from toto.fasta sequence) with Prodigal"),
        ("Prodigal results folder test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes already exists."),
        "test_runall_1by1_1 toto.fasta: no or several .faa file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .ffn file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .gff file(s)",
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
        # Second genome (A_H738.fasta)
        ("Start annotating test_runall_1by1_2 "
         "(from A_H738.fasta sequence) with Prodigal"),
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
    )
    for expected in expected_logs:
        assert expected in messages
Esempio n. 7
0
def test_run_all_prodigal_train_exists_ok():
    """
    A training file already exists in the output folder for the genome to train on
    (toto.fasta): it must be detected and used to annotate all genomes.

    Expected results: the prodigal run on toto.fasta ends with an error (False
    returned for this genome), while A_H738.fasta is annotated (True returned).
    """
    logger = my_logger("test_run_prodigal_train_exist_error")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_train_exist_error')
    toto_genome = "toto.fasta"
    ah738_genome = "A_H738.fasta"
    toto_path = os.path.join(GEN_PATH, toto_genome)
    ah738_path = os.path.join(GEN_PATH, ah738_genome)
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genomes = {}
    genomes[toto_genome] = ["test_runall_1by1_1", toto_path, toto_path, 12656, 3, 0]
    genomes[ah738_genome] = ["test_runall_1by1_2", ah738_path, ah738_path,
                             456464645, 1, 465]
    # Copy trn file to outdir, so that panacota detects that it already exists
    existing_trn = os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn")
    shutil.copyfile(existing_trn, os.path.join(GENEPATH, "toto.fasta.trn"))
    # Run annotation all
    final = afunc.run_annotation_all(genomes,
                                     8,
                                     False,
                                     GENEPATH,
                                     toto_genome,
                                     prodigal_only=True,
                                     quiet=False)
    assert not final[toto_genome]
    assert final[ah738_genome]
    log_queue = logger[0]
    assert log_queue.qsize() == 9
    # Ordered logs, before annotation starts
    first_logs = [
        "Annotating all genomes with prodigal",
        ("Prodigal will train using "
         "test/data/annotate/genomes/toto.fasta"),
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will be used "
         "to annotate all genomes."),
    ]
    for expected in first_logs:
        assert log_queue.get().message == expected
    # Annotation logs: we cannot know their order, as 'genomes' is a dict
    # and as computation is done in parallel. Only check that they all exist.
    messages = [log_queue.get().message for _ in range(6)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/toto.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal commands
        ("Prodigal command: prodigal -i test/data/annotate/genomes/toto.fasta "
         "-d test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        # End of annotation: error for toto.fasta, regular end for A_H738.fasta
        ("Error while trying to run prodigal. See "
         "test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigal.log.err."),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in expected_logs:
        assert expected in messages
Esempio n. 8
0
def main(cmd,
         list_file,
         db_path,
         res_dir,
         name,
         date,
         l90=100,
         nbcont=999,
         cutn=5,
         threads=1,
         force=False,
         qc_only=False,
         from_info=None,
         tmp_dir=None,
         res_annot_dir=None,
         verbose=0,
         quiet=False,
         prodigal_only=False,
         small=False):
    """
    Main method, doing all steps:

    1. analyze genomes (nb contigs, L90, rows of N...)
    2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    3. rename genomes with strain number in decreasing quality
    4. annotate genome with prokka or only prodigal
    5. format annotated genomes

    If option '-Q': ends at step 2.
    If option '--info <genome_info file name>' option: starts at step 2

    verbosity:

    - defaut 0 : stdout contains INFO, stderr contains ERROR.
    - 1: stdout contains INFO, stderr contains WARNING and ERROR
    - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
    - >=15: Add DEBUG in stdout

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    list_file : str
        file containing the list of genome files, 1 genome per line, separated by a
        space if a genome is split in several fasta files. This file can also
        specify date and/or species information, according to the format described
        in documentation.
    db_path : str
        Path to the folder containing all the fasta files which will be annotated
    res_dir : str
        Path to the folder which will contain result folders and files
    name : str
        4 alpha numeric characters, describing the species (for example ESCO). Used by default
        if no species name is given in list_file line.
    date : str
        4 alpha numeric characters, defining the default date, for strains where it is not specified
        in the list_file
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut each time there are at least cutn 'N' in a row. Don't cut if equal to 0
    threads : int
        max number of threads to use
    force : bool
        If True, overwrite previous results, if False keep what is already calculated
    qc_only : bool
        If True, do only quality control, if False, also do annotation
    from_info : str
        File containing information on genomes and their quality information (from prepare step)
    tmp_dir : str or None
        Path to folder where tmp files must be saved. None to use the default tmp folder
    res_annot_dir : str or None
        Path to folder where are the prokka/prodigal result folders for the genomes. None
        to use the default prokka/prodigal folder
    verbose : int
        verbosity:
        default (0): info in stdout, error and more in stderr
        1 = add warnings in stderr
        2 = like 1 + add DETAIL to stdout (by default only INFO)
        >15: add debug to stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    prodigal_only : bool
        True -> run only prodigal. False -> run prokka
    small : bool
        True -> use -p meta option with prodigal

    Returns
    -------
    (genomes, kept_genomes, skipped, skipped_format) : tuple
        with:

        - genomes: dict with all genomes in list_file:
          {genome: [gembase_name, path_split_gembase, gsize, nbcont, L90]}
        - kept_genomes: dict with all genomes kept for annotation (same format as genomes)
        - skipped: list of genomes skipped because they had a problem in annotation step
        - skipped_format : list of genomes skipped because they had a problem in format step
    """
    # import needed packages
    import shutil
    import logging
    from PanACoTA.annotate_module import genome_seq_functions as gfunc
    from PanACoTA.annotate_module import annotation_functions as pfunc
    from PanACoTA.annotate_module import general_format_functions as ffunc
    from PanACoTA import utils
    from PanACoTA import __version__ as version
    # Check that needed softs are installed
    prokka = utils.check_installed("prokka")
    prodigal = utils.check_installed("prodigal")
    if prodigal_only:
        soft = "prodigal"
    else:
        soft = "prokka"

    changed = cutn != 0
    if not qc_only:  # pragma: no cover
        # If user using prokka: check prokka is installed and in the path
        if not prodigal_only and not prokka:
            print(
                "Prokka is not installed. 'PanACoTA annotate' cannot run. Install prokka "
                "to be able to annotate genomes. If you only need syntactical annotation, "
                "check that prodigal is installed, and add '--prodigal' option."
            )
            sys.exit(1)
        if prodigal_only and not prodigal:
            print(
                "Prodigal is not installed. 'PanACoTA annotate' cannot run. Install "
                "prodigal to be able to annotate genomes. If you also need functional "
                "annotation, check that prokka is installed, and remove '--prodigal' "
                "option.")
            sys.exit(1)

    # By default, all tmp files (split sequences, renamed sequences, prokka/prodigal results) will
    # be saved in the given <res_dir>/tmp_files.
    # Create output (results, tmp...) directories if not already existing
    if not tmp_dir:
        tmp_dir = os.path.join(res_dir, "tmp_files")
    if not res_annot_dir:
        res_annot_dir = tmp_dir
    os.makedirs(res_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(res_annot_dir, exist_ok=True)

    # If force was set, remove result folders (Proteins, Replicons, Genes, LSTINFO, gff)
    if force:
        shutil.rmtree(os.path.join(res_dir, "LSTINFO"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Proteins"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Genes"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Replicons"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "gff3"), ignore_errors=True)
    # If not --force, check that result folders do not already contain results
    else:
        utils.check_out_dirs(res_dir)

    # get only filename of list_file, without extension
    if list_file:
        listfile_base = os.path.basename(os.path.splitext(list_file)[0])
    else:
        list_file = from_info
        listfile_base = os.path.basename(os.path.splitext(list_file)[0])

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(res_dir, "PanACoTA-annotate_" + listfile_base)
    logfile_base = utils.init_logger(logfile_base,
                                     level,
                                     name='annotate',
                                     log_details=True,
                                     verbose=verbose,
                                     quiet=quiet)
    logger = logging.getLogger('annotate')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # STEP 1. analyze genomes (nb contigs, L90, rows of N...)
    # If already info on genome ('--info <file>' option), skip this step
    # If no info on genomes, read them and get needed information
    if not from_info:
        # Read genome names.
        # genomes = {genome: [spegenus.date]}
        genomes = utils.read_genomes(list_file, name, date, db_path, tmp_dir,
                                     logger)
        if not genomes:
            logger.error(
                ("We did not find any genome listed in {} in the folder {}. "
                 "Please check your list to give valid genome "
                 "names.").format(list_file, db_path))
            sys.exit(1)
        # Get L90, nbcontig, size for all genomes, and cut at row of cutn 'N' if asked
        # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]
        gfunc.analyse_all_genomes(genomes,
                                  db_path,
                                  tmp_dir,
                                  cutn,
                                  soft,
                                  logger,
                                  quiet=quiet)
    # --info <filename> option given: read information (L90, nb contigs...) from this file.
    else:
        # genomes = {genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]}
        # orig_path is the path to the original sequence
        # and to_annotate_path the path to the sequence to annotate (once split etc.)
        # Here, both are the same, as we take given sequences as is.
        genomes = utils.read_genomes_info(from_info, name, date, logger)

    # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]}
    # Plot L90 and nb_contigs distributions
    gfunc.plot_distributions(genomes, res_dir, listfile_base, l90, nbcont)
    # Get list of genomes kept (according to L90 and nbcont thresholds)
    kept_genomes = {
        genome: info
        for genome, info in genomes.items()
        if info[-2] <= nbcont and info[-1] <= l90
    }
    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file,
                             res_dir)

    if not kept_genomes:
        logger.info("No genome kept for annotation.")
        return "", 0
    # Info on folder containing original sequences
    if not from_info:
        logger.info(
            f"-> Original sequences folder ('orig_name' column): {db_path} ")
        logger.info(
            f"\t-> If original sequence not found in {db_path}, "
            f"look for it in {tmp_dir}, as it must be a concatenation of several "
            "input sequence files.")
        if cutn == 0:
            logger.info(
                "-> Sequences used for annotation ('to_annotate' column) are the "
                "same as the previous ones (original sequences).")
        else:
            logger.info(
                f"-> Folder with sequence files that will be used for annotation "
                f"('to_annotate' column): {tmp_dir}")
    # If only QC, stop here.
    if qc_only:
        # Write information on genomes that would be annotated with the current
        # parameters if not QC_only:
        # orig_name, to_annotate, gsize, nb_conts, L90
        utils.write_genomes_info(genomes, [], list_file, res_dir, qc=True)
        logger.info("QC only done.")
        return "", 0

    # STEP 3. Rename genomes kept, ordered by decreasing quality
    first_gname = gfunc.rename_all_genomes(kept_genomes)
    # kept_genomes = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                 gsize, nbcont, L90]}
    # first_gname = name of the first genome
    # Write lstinfo file (list of genomes kept with info on L90 etc.)
    outlst = utils.write_lstinfo(list_file, kept_genomes, res_dir)

    # STEP 4. Annotate all kept genomes
    results = pfunc.run_annotation_all(kept_genomes,
                                       threads,
                                       force,
                                       res_annot_dir,
                                       first_gname,
                                       prodigal_only,
                                       small=small,
                                       quiet=quiet)
    # Information on genomes to format
    # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #               gsize, nbcont, L90]}
    results_ok = {
        genome: info
        for genome, info in kept_genomes.items() if results[genome]
    }
    # If no genome was ok, no need to format them. Just print that no genome was annotated,
    # end program.
    if not results_ok:
        logger.error(
            "Error: No genome was correctly annotated, no need to format them."
        )
        sys.exit(1)
    # list of genomes skipped because annotation had problems: no format step run
    skipped = [genome for (genome, ok) in results.items() if not ok]
    # At least 1 genome was not annotated: write a message to warn on it
    if skipped:
        utils.write_warning_skipped(skipped,
                                    prodigal_only=prodigal_only,
                                    logfile=logfile_base)

    # STEP 5. Format genomes annotated
    # Here, we have at least 1 genome annotated (otherwise,
    # it would already have stopped because results_ok is empty)
    # Initialize list of genomes skipped because something went wrong while formatting.
    skipped_format = []
    # Generate database (folders Proteins, Genes, Replicons, LSTINFO)
    skipped_format = ffunc.format_genomes(results_ok,
                                          res_dir,
                                          res_annot_dir,
                                          prodigal_only,
                                          threads,
                                          quiet=quiet)
    # At least one genome could not be formatted -> warn user
    if skipped_format:
        utils.write_warning_skipped(skipped_format,
                                    do_format=True,
                                    prodigal_only=prodigal_only,
                                    logfile=logfile_base)
    logger.info("Annotation step done.")
    return outlst, len(kept_genomes) - len(skipped) - len(skipped_format)