Ejemplo n.º 1
0
def test_postalign_missgenome(caplog):
    """
    Post-alignment must abort when the list of genomes is incomplete.

    The alignment folder holds the nucleotide alignment of every requested family, but only
    three genome names are handed to post_alignment. The call is expected to end with a
    SystemExit, to leave neither the concatenated nor the grouped nucleotide file on disk,
    and to log that DNA alignments could not be grouped by genome.
    """
    caplog.set_level(logging.DEBUG)
    # Inputs of the run
    dataset = "TESTpost"
    families = [1, 8, 11]
    genomes = ["GEN2.1017.00001", "GEN4.1111.00001", "GENO.1017.00001"]
    with_proteins = False
    silent = False
    out_root = os.path.join(GENEPATH, "test_post-align")
    ali_dir = os.path.join(out_root, "aldir_post-align")
    os.makedirs(ali_dir)
    ali_prefix = os.path.join(ali_dir, dataset)
    # Fill the alignment folder with one nucleotide alignment per family,
    # renamed with the dataset prefix
    for subdir, fam in (("exp_aldir", 1), ("exp_aldir-pers", 8), ("exp_aldir-pers", 11)):
        basename = f"mafft-prt2nuc.{fam}.aln"
        shutil.copyfile(os.path.join(EXPPATH, subdir, basename),
                        os.path.join(ali_dir, dataset + "-" + basename))
    # The run itself must stop with an exit
    with pytest.raises(SystemExit):
        pal.post_alignment(families, genomes, ali_prefix, out_root, dataset,
                           with_proteins, silent)
    # No concatenated nucleotide file must be left after the failure
    assert not os.path.isfile(os.path.join(ali_dir, dataset + "-complete.nucl.cat.aln"))
    # No alignment grouped by genome either
    phylo_dir = os.path.join(out_root, "Phylo-" + dataset)
    assert not os.path.isfile(os.path.join(phylo_dir, dataset + ".nucl.grp.aln"))
    # Messages expected in the captured log
    logged = caplog.text
    for message in (
            "Concatenating all nucl alignment files",
            "Grouping nucleic alignments per genome",
            "An error occurred. We could not group DNA alignments by genome.",
    ):
        assert message in logged
Ejemplo n.º 2
0
def test_postalign_aa_missalign(caplog):
    """
    Post-alignment must abort when a protein alignment file is missing.

    Protein alignments are requested (prot_ali is True) but only the nucleotide
    alignment files are put in the alignment folder. The call is expected to end with a
    SystemExit and to log the path of the aa alignment file that could not be found
    (the one of family 1).
    """
    caplog.set_level(logging.DEBUG)
    # Inputs of the run
    dataset = "TESTpost"
    families = [1, 8, 11]
    genomes = ["GEN2.1017.00001", "GEN4.1111.00001",
               "GENO.1017.00001", "GENO.1216.00002"]
    out_root = os.path.join(GENEPATH, "test_post-align_missalign")
    ali_dir = os.path.join(out_root, "aldir_post-align")
    os.makedirs(ali_dir)
    with_proteins = True
    silent = False
    ali_prefix = os.path.join(ali_dir, dataset)
    # Only nucleotide alignments are provided: (fixture sub-folder, file name)
    nucl_fixtures = [
        ("exp_aldir", "mafft-prt2nuc.1.aln"),
        ("exp_aldir-pers", "mafft-prt2nuc.8.aln"),
        ("exp_aldir-pers", "mafft-prt2nuc.11.aln"),
    ]
    for subdir, basename in nucl_fixtures:
        origin = os.path.join(EXPPATH, subdir, basename)
        target = os.path.join(ali_dir, dataset + "-" + basename)
        shutil.copyfile(origin, target)
    # The run itself must stop with an exit
    with pytest.raises(SystemExit):
        pal.post_alignment(families, genomes, ali_prefix, out_root, dataset,
                           with_proteins, silent)
    # The log must name the missing aa alignment file
    expected_error = (
        "The alignment file test/data/align/generated_by_unit-tests/"
        "test_post-align_missalign/aldir_post-align/TESTpost-mafft-align.1.aln "
        "does not exist. Please check the families you want, and their corresponding "
        "alignment files")
    assert expected_error in caplog.text
Ejemplo n.º 3
0
def test_postalign_error_grpaa(caplog):
    """
    Post-alignment must still return the nucleotide result when grouping aa fails.

    The alignment folder contains:
    - the nucleotide alignment of every family,
    - the aa alignments of families 1 and 11,
    - an already existing concatenated aa file, copied from the '-error' fixture
      (described by the original test as missing one genome for one family).
    Concatenation and grouping are expected to work for nucleotides. For proteins, the
    existing concatenation is reused and grouping fails: an error is logged, but the
    function does not exit and returns the path of the grouped nucleotide alignment.
    """
    caplog.set_level(logging.DEBUG)
    # Inputs of the run
    dataset = "TESTpost"
    families = [1, 8, 11]
    genomes = ["GEN2.1017.00001", "GEN4.1111.00001",
               "GENO.1017.00001", "GENO.1216.00002"]
    with_proteins = True
    silent = False
    out_root = os.path.join(GENEPATH, "test_post-align")
    ali_dir = os.path.join(out_root, "aldir_post-align")
    os.makedirs(ali_dir)
    ali_prefix = os.path.join(ali_dir, dataset)
    # Files to put in the alignment folder:
    # (path components under EXPPATH, suffix appended to the dataset name)
    fixtures = [
        (("exp_aldir", "mafft-prt2nuc.1.aln"), "-mafft-prt2nuc.1.aln"),
        (("exp_aldir-pers", "mafft-prt2nuc.8.aln"), "-mafft-prt2nuc.8.aln"),
        (("exp_aldir-pers", "mafft-prt2nuc.11.aln"), "-mafft-prt2nuc.11.aln"),
        (("exp_aldir", "mafft-align.1.aln"), "-mafft-align.1.aln"),
        (("exp_aldir-pers", "mafft-align.11.aln"), "-mafft-align.11.aln"),
        (("exp_concat_4genomes-fam1-8-11-error.aa.aln",), "-complete.aa.cat.aln"),
    ]
    for src_parts, suffix in fixtures:
        shutil.copyfile(os.path.join(EXPPATH, *src_parts),
                        os.path.join(ali_dir, dataset + suffix))
    # Paths of the files the run is expected to produce
    concat_nucl = os.path.join(ali_dir, dataset + "-complete.nucl.cat.aln")
    grouped_nucl = os.path.join(out_root, "Phylo-" + dataset, dataset + ".nucl.grp.aln")
    # Run post-alignment: no exit, and the grouped nucleotide file is returned
    returned = pal.post_alignment(families, genomes, ali_prefix, out_root, dataset,
                                  with_proteins, silent)
    assert returned == grouped_nucl
    # The concatenated nucleotide file exists and has the expected content
    assert os.path.isfile(concat_nucl)
    assert tutil.compare_order_content(
        concat_nucl, os.path.join(EXPPATH, "exp_concat_4genomes-fam1-8-11.aln"))
    # The nucleotide file grouped by genome exists and has the expected content
    assert os.path.isfile(grouped_nucl)
    assert tutil.compare_order_content(
        grouped_nucl, os.path.join(EXPPATH, "exp_grp_4genomes-fam1-8-11.aln"))
    # Messages expected in the captured log
    logged = caplog.text
    expected_messages = (
        "Concatenating all nucl alignment files",
        "Grouping nucleic alignments per genome",
        ("aa alignments already concatenated in test/data/align/generated_by_unit-tests/"
         "test_post-align/aldir_post-align/TESTpost-complete.aa.cat.aln. "
         "Program will use it for next steps. If you want to redo it, "
         "remove it before running."),
        "Grouping protein alignments per genome",
        "An error occurred. We could not group protein alignments by genome",
    )
    for message in expected_messages:
        assert message in logged
Ejemplo n.º 4
0
def test_postalign(caplog):
    """
    Post-alignment on a complete alignment folder.

    Every nucleotide and aa alignment file of the requested families is available. The run
    is expected to write the concatenated alignments (nucl and aa) in the alignment folder,
    and the alignments grouped by genome (nucl and aa) in a 'Phylo-<dataset>' folder, each
    with the same content as its reference file.
    """
    caplog.set_level(logging.DEBUG)
    # Inputs of the run
    dataset = "TESTpost"
    families = [1, 8, 11]
    genomes = ["GEN2.1017.00001", "GEN4.1111.00001",
               "GENO.1017.00001", "GENO.1216.00002"]
    with_proteins = True
    silent = False
    out_root = os.path.join(GENEPATH, "test_post-align")
    ali_dir = os.path.join(out_root, "aldir_post-align")
    os.makedirs(ali_dir)
    ali_prefix = os.path.join(ali_dir, dataset)
    # Files to put in the alignment folder:
    # (fixture sub-folder, fixture file name, suffix appended to the dataset name)
    fixtures = (
        ("exp_aldir", "mafft-prt2nuc.1.aln", "-mafft-prt2nuc.1.aln"),
        ("exp_aldir-pers", "mafft-prt2nuc.8.aln", "-mafft-prt2nuc.8.aln"),
        ("exp_aldir-pers", "mafft-prt2nuc.11.aln", "-mafft-prt2nuc.11.aln"),
        ("exp_aldir", "mafft-align.1.aln", "-mafft-align.1.aln"),
        ("exp_aldir-pers", "mafft-align.8-completed.aln", "-mafft-align.8.aln"),
        ("exp_aldir-pers", "mafft-align.11.aln", "-mafft-align.11.aln"),
    )
    for subdir, basename, suffix in fixtures:
        shutil.copyfile(os.path.join(EXPPATH, subdir, basename),
                        os.path.join(ali_dir, dataset + suffix))
    # Run post-alignment
    pal.post_alignment(families, genomes, ali_prefix, out_root, dataset,
                       with_proteins, silent)
    # Each produced file must exist and match its reference:
    # concatenated nucl, concatenated aa, grouped nucl, grouped aa
    phylo_dir = os.path.join(out_root, "Phylo-" + dataset)
    produced_vs_reference = (
        (os.path.join(ali_dir, dataset + "-complete.nucl.cat.aln"),
         "exp_concat_4genomes-fam1-8-11.aln"),
        (os.path.join(ali_dir, dataset + "-complete.aa.cat.aln"),
         "exp_concat_4genomes-fam1-8-11.aa.aln"),
        (os.path.join(phylo_dir, dataset + ".nucl.grp.aln"),
         "exp_grp_4genomes-fam1-8-11.aln"),
        (os.path.join(phylo_dir, dataset + ".aa.grp.aln"),
         "exp_grp_4genomes-fam1-8-11.aa.aln"),
    )
    for produced, reference in produced_vs_reference:
        assert os.path.isfile(produced)
        assert tutil.compare_order_content(produced, os.path.join(EXPPATH, reference))
    # Messages expected in the captured log
    logged = caplog.text
    for message in (
            "Concatenating all nucl alignment files",
            "Grouping nucleic alignments per genome",
            "Concatenating all aa alignment files",
            "Grouping protein alignments per genome",
    ):
        assert message in logged
Ejemplo n.º 5
0
def main(cmd, corepers, list_genomes, dname, dbpath, outdir, prot_ali, threads, force, verbose=0,
         quiet=False):
    """
    Run the whole 'align' step: extract sequences, align each family, post-process.

    Parameters
    ----------
    cmd : str
        Command line used to launch the program; only written to the log
    corepers : str
        File containing persistent genome families
    list_genomes : str
        File containing the list of all genomes in the dataset. Only first column is
        considered.
    dname : str
        Dataset name, used to name output files
    dbpath : str
        path to the directory containing 'Proteins' and 'Genes' folders
    outdir : str
        path to the directory where output files must be saved
    prot_ali : bool
        Also give aa alignment of concatenation of persistent proteins
    threads : int
        Max number of threads to use
    force : bool
        If True and `outdir` already exists, remove it entirely before running.
    verbose : int
        Selects the minimum level given to the logger:
        - 0 or 1 (default 0): INFO
        - 2 to 14: 15 (the 'detail' level)
        - >=15: DEBUG
        The value is also forwarded to `utils.init_logger`, which handles how messages
        are dispatched.
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    The value returned by `post_align.post_alignment`.

    The program exits with status 1 if mafft is not found, or if
    `alignment.align_all_families` reports a failure.
    """
    # Packages are imported here, when the function is called
    import logging
    import shutil
    from PanACoTA import utils
    from PanACoTA.align_module import pan_to_pergenome as p2g
    from PanACoTA.align_module import get_seqs as gseqs
    from PanACoTA.align_module import alignment as ali
    from PanACoTA.align_module import post_align as post
    from PanACoTA import __version__ as version

    # mafft is required: stop right away if it cannot be found
    mafft_found = utils.check_installed("mafft")
    if not mafft_found:  # pragma: no cover
        print("mafft is not installed. 'PanACoTA align' cannot run.")
        sys.exit(1)

    # Start from an empty output directory when 'force' is requested
    if force and os.path.isdir(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir, exist_ok=True)

    # Minimum level taken into account by the logger, according to verbosity
    if verbose <= 1:
        # details and debug are ignored
        level = logging.INFO
    elif 2 <= verbose < 15:
        # only debug is ignored; 15 is the int corresponding to detail level
        level = 15
    elif verbose >= 15:
        # everything is written
        level = logging.DEBUG

    # Set up the 'align' logger, and record version and command line
    log_base = os.path.join(outdir, "PanACoTA-align_" + dname)
    utils.init_logger(log_base, level, 'align', log_details=True, verbose=verbose, quiet=quiet)
    logger = logging.getLogger("align")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Per-genome information for the requested families
    per_genome = p2g.get_per_genome(corepers, list_genomes, dname, outdir)
    all_genomes, aldir, listdir, fam_nums = per_genome
    # Generate the files required by the alignment step
    gseqs.get_all_seqs(all_genomes, dname, dbpath, listdir, aldir, fam_nums, quiet)
    prefix = os.path.join(aldir, dname)

    # Align every family; a falsy status means the program must stop
    nb_genomes = len(all_genomes)
    all_aligned = ali.align_all_families(prefix, fam_nums, nb_genomes, dname, quiet, threads)
    if not all_aligned:
        failure_msg = ("At least one alignment did not run well. See detailed log file for "
                       "more information. Program will stop here, alignments won't be "
                       "grouped by genome.")
        logger.error(failure_msg)
        sys.exit(1)

    # Post-process the alignment files and hand back the result
    align_file = post.post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali, quiet)
    logger.info("END")
    return align_file