Ejemplo n.º 1
0
def test_write_genome_prt_exists():
    """
    Check write_genome_file when the prt getEntry file is already present.

    A prt file with wrong content is written beforehand. After the call, both the
    prt and the gen files must match the expected reference files.
    """
    list_dir = os.path.join(GENEPATH, "Listdir")
    align_dir = os.path.join(GENEPATH, "Aldir")
    # Only the list folder is created on disk
    os.makedirs(list_dir)
    dataset, genome = "test_write_genome", "ESCO4"
    genome_members = ALL_PROTS[genome]

    # Output files written by the function; the prt one starts with wrong content
    prt_path = os.path.join(list_dir, f"{dataset}-getEntry_prt_ESCO4.txt")
    gen_path = os.path.join(list_dir, f"{dataset}-getEntry_gen_ESCO4.txt")
    with open(prt_path, "w") as stale:
        stale.write("Wrong prt file\n")

    p2p.write_genome_file(list_dir, align_dir, dataset, genome, genome_members, SEVERAL)

    # Both files must now hold the expected content
    prt_reference = os.path.join(EXPPATH, "exp_getentry-prt-ESCO4_write-prt.txt")
    gen_reference = os.path.join(EXPPATH, "exp_getentry-gen-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(prt_path, prt_reference)
    assert tutil.compare_file_content(gen_reference, gen_path)
Ejemplo n.º 2
0
def test_main_qc():
    """
    Test that when only QC is run, it writes:
    - the list of all genomes with their characteristics
    - the list of genomes that would be discarded for annotation
    - the 2 png files

    The two list files are also compared to their expected content.
    """
    list_file = os.path.join(TEST_DIR, "list_genomes-func-test-default.txt")
    name = "ESCO"
    date = "0417"
    # Keyword options forwarded to annot.main; qc_only restricts the run to the QC step
    l90 = 1
    cutn = 0
    qc_only = True
    assert annot.main("cmd", list_file, GEN_PATH, GENEPATH, name, date, l90=l90,
                      cutn=cutn, qc_only=qc_only) == ("", 0)
    # Check files are here
    lstfile = os.path.join(GENEPATH, "ALL-GENOMES-info-list_genomes-func-test-default.lst")
    exp_lstfile = os.path.join(EXP_DIR, "exp_ALL-GENOMES-QC.lst")
    discardedfile = os.path.join(GENEPATH, "discarded-list_genomes-func-test-default.lst")
    exp_discarded = os.path.join(EXP_DIR, "exp_discarded_QC.lst")
    assert os.path.isfile(lstfile)
    assert os.path.isfile(discardedfile)
    assert os.path.isfile(os.path.join(GENEPATH,
                          "QC_L90-list_genomes-func-test-default.png"))
    assert os.path.isfile(os.path.join(GENEPATH,
                          "QC_nb-contigs-list_genomes-func-test-default.png"))
    # Check content of both list files (all genomes, then discarded genomes)
    assert tutil.compare_file_content(lstfile, exp_lstfile)
    assert tutil.compare_file_content(discardedfile, exp_discarded)
Ejemplo n.º 3
0
def test_write_getentry():
    """
    Check that write_getentry_files, given the persistent gene names of all genomes,
    writes the expected gen and prt getEntry files for genomes ESCO1 to ESCO6.
    """
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only the list folder is created on disk
    os.makedirs(listdir)
    dname = "TEST6"
    p2p.write_getentry_files(ALL_PROTS, SEVERAL, listdir, aldir, dname, ALL_GENOMES)

    genome_numbers = range(1, 7)
    # gen files first: print both paths, then compare expected file to output
    for num in genome_numbers:
        expected = os.path.join(EXPPATH, f"exp_getentry-gen-ESCO{num}.txt")
        written = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO{num}.txt")
        print(expected, written)
        assert tutil.compare_file_content(expected, written)
    # then prt files
    for num in genome_numbers:
        expected = os.path.join(EXPPATH, f"exp_getentry-prt-ESCO{num}.txt")
        written = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO{num}.txt")
        assert tutil.compare_file_content(expected, written)
Ejemplo n.º 4
0
def test_get_all_seqs_prt6(caplog):
    """
    Check get_all_seqs when the alignment folder already holds a prt file (and no gen
    file) for one family.

    An empty prt file is created for family 6 before the call. Afterwards, the prt and
    gen files of families 1 and 6 must exist and match the expected references, and the
    extraction messages must be found in the logs.
    """
    caplog.set_level(logging.DEBUG)
    dname = "TESTgetAllSeq"
    families = [1, 6]
    genomes = [
        "GEN2.1017.00001", "GEN4.1111.00001", "GENO.1017.00001",
        "GENO.1216.00002"
    ]
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    # Output folders
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Empty prt file already present for family 6
    stale_prt = os.path.join(aldir, f"{dname}-current.6.prt")
    with open(stale_prt, "w"):
        pass

    # Put the getentry files (gen, then prt) of each genome in listdir
    for genome in genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{genome}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{genome}.txt")
            shutil.copyfile(source, target)
    assert os.path.isfile(stale_prt)

    gseq.get_all_seqs(genomes, dname, DBPATH, listdir, aldir, families, False)

    # Each family must have a prt and a gen file with the expected content
    for fam in families:
        for ext in ("prt", "gen"):
            produced = os.path.join(aldir, f"{dname}-current.{fam}.{ext}")
            assert os.path.isfile(produced)
            reference = os.path.join(ref_aldir, f"current.{fam}.{ext}")
            assert tutil.compare_file_content(produced, reference)

    # Logs: one global message, and one message per genome
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for genome in genomes:
        assert f"Extracting proteins and genes from {genome}" in caplog.text
Ejemplo n.º 5
0
def test_write_getentry_error(caplog):
    """
    Check write_getentry_files when the protein dict has no entry for 2 genomes
    (ESCO4 and ESCO5).

    The call must exit (SystemExit) and log an error message for each of these genomes.
    The gen and prt files of the other genomes (ESCO1, 2, 3 and 6) must still match
    the expected files.
    """
    caplog.set_level(logging.DEBUG)
    several = {'1': [],
               '2': ["ESCO3"],
               '3': [],
               '4': []}
    all_prots = {"ESCO1": {"ESCO1_00001": '1',
                           "ESCO1_00002": '4'},
                 "ESCO2": {"ESCO2_00001": '1',
                           "ESCO2_22": '2',
                           "ESCO2_456": '4',
                           "ESCO2_46": '3'},
                 "ESCO3": {"ESCO3_1": '2',
                           "ESCO3_12": '1',
                           "ESCO3_4564": '3',
                           "ESCO3_00123": '4',
                           "ESCO3_8": '2'},
                 "ESCO6": {"ESCO6_1": '4',
                           "ESCO6_2": '3',
                           "ESCO6_3": '1'}}
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only the list folder is created on disk
    os.makedirs(listdir)
    dname = "TEST6"
    with pytest.raises(SystemExit):
        p2p.write_getentry_files(all_prots, several, listdir, aldir, dname, ALL_GENOMES)

    # One error message per genome missing from all_prots
    for missing in ("ESCO4", "ESCO5"):
        expected_msg = (f"There is not any protein for genome {missing} in any family! "
                        "The program will close, please fix this problem to be able to run "
                        "the alignments")
        assert expected_msg in caplog.text

    # Files of the genomes present in all_prots: gen files first, then prt files
    written_nums = (1, 2, 3, 6)
    for num in written_nums:
        expected = os.path.join(EXPPATH, f"exp_getentry-gen-ESCO{num}.txt")
        written = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO{num}.txt")
        assert tutil.compare_file_content(expected, written)
    for num in written_nums:
        expected = os.path.join(EXPPATH, f"exp_getentry-prt-ESCO{num}.txt")
        written = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO{num}.txt")
        assert tutil.compare_file_content(expected, written)
Ejemplo n.º 6
0
def test_get_all_seqs(caplog):
    """
    Check get_all_seqs in the default case: the alignment folder is empty, and the
    getentry files of all genomes are present in Listdir.

    For the 2 requested families, the prt and gen files must be created with the
    expected content, and the extraction messages must be found in the logs.
    """
    caplog.set_level(logging.DEBUG)
    dname = "TESTgetAllSeq"
    families = [1, 6]
    genomes = [
        "GEN2.1017.00001", "GEN4.1111.00001", "GENO.1017.00001",
        "GENO.1216.00002"
    ]
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    # Output folders
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Put the getentry files (gen, then prt) of each genome in listdir
    for genome in genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{genome}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{genome}.txt")
            shutil.copyfile(source, target)

    gseq.get_all_seqs(genomes, dname, DBPATH, listdir, aldir, families, False)

    # Each family must have a prt and a gen file with the expected content
    for fam in families:
        for ext in ("prt", "gen"):
            produced = os.path.join(aldir, f"{dname}-current.{fam}.{ext}")
            assert os.path.isfile(produced)
            reference = os.path.join(ref_aldir, f"current.{fam}.{ext}")
            assert tutil.compare_file_content(produced, reference)

    # Logs: one global message, and one message per genome
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for genome in genomes:
        assert f"Extracting proteins and genes from {genome}" in caplog.text
Ejemplo n.º 7
0
def test_extract_seq_out_different():
    """
    Check extract_sequences with an open fasta file and 3 sequences to extract, each
    mapped to its own output file (2 distinct files), both files being in 'files_todo'.
    Each output file must match its expected content.
    """
    first_out = os.path.join(GENEPATH, "test_extract1.prt")
    second_out = os.path.join(GENEPATH, "test_extract2.prt")
    # sequence name -> file in which it must be written
    to_extract = dict([
        ("GEN2.1017.00001.b0001_00001", first_out),
        ("GEN2.1017.00001.i0003_00008", second_out),
        ("GEN2.1017.00001.b0004_00013", first_out),
    ])
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[first_out, second_out])
    # Compare each output file to its expected counterpart
    for produced, exp_name in ((first_out, "exp_extracted1.prt"),
                               (second_out, "exp_extracted2.prt")):
        assert tutil.compare_file_content(produced, os.path.join(EXPPATH, exp_name))
Ejemplo n.º 8
0
def test_get_genome_seqs_outgiven_1col():
    """
    Check get_genome_seqs with a fasta file, a tab file listing only the sequences to
    extract (no filename column) and an output file: the output file must exist and
    match the expected extracted sequences.
    """
    out_path = os.path.join(GENEPATH, "fileout.txt")
    tab_path = os.path.join(TESTPATH, "getentry_all_1column.txt")
    # No file in 'files_todo': the output file is given explicitly
    gseq.get_genome_seqs(FASTA, tab_path, [], out_path)
    assert os.path.isfile(out_path)
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(out_path, reference)
Ejemplo n.º 9
0
def test_write_genome():
    """
    Check that write_genome_file, called for one genome, writes the prt and gen
    getEntry files with the expected content.
    """
    list_dir = os.path.join(GENEPATH, "Listdir")
    align_dir = os.path.join(GENEPATH, "Aldir")
    # Only the list folder is created on disk
    os.makedirs(list_dir)
    dataset, genome = "test_write_genome", "ESCO4"
    genome_members = ALL_PROTS[genome]
    p2p.write_genome_file(list_dir, align_dir, dataset, genome, genome_members, SEVERAL)

    # Compare the prt file, then the gen file, to their expected content
    prt_path = os.path.join(list_dir, f"{dataset}-getEntry_prt_ESCO4.txt")
    prt_reference = os.path.join(EXPPATH, "exp_getentry-prt-ESCO4_write-prt.txt")
    gen_path = os.path.join(list_dir, f"{dataset}-getEntry_gen_ESCO4.txt")
    gen_reference = os.path.join(EXPPATH, "exp_getentry-gen-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(prt_path, prt_reference)
    assert tutil.compare_file_content(gen_path, gen_reference)
Ejemplo n.º 10
0
def test_get_genome_seqs_outgiven_2cols():
    """
    Check get_genome_seqs with a fasta file, a 2-column tab file (sequence to extract
    and file it should go to) and an explicit output file: the explicit output file
    must exist and match the expected content holding all extracted sequences.
    The output file is removed at the end of the test.
    """
    out_path = os.path.join(GENEPATH, "fileout.txt")
    tab_path = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    # No file in 'files_todo': the output file is given explicitly
    gseq.get_genome_seqs(FASTA, tab_path, [], out_path)
    assert os.path.isfile(out_path)
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(out_path, reference)
    # Clean up
    os.remove(out_path)
Ejemplo n.º 11
0
def test_get_genome_all_seqs():
    """
    Check get_genome_seqs with a fasta file and a 2-column tab file (sequence to extract
    and file it should go to), both target files being in 'files_todo': each target
    file must exist and match its expected content.
    """
    tab_path = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    targets = [os.path.join(GENEPATH, f"file{idx}.txt") for idx in (1, 2)]
    gseq.get_genome_seqs(FASTA, tab_path, targets)
    # file1.txt <-> exp_extracted1.prt, file2.txt <-> exp_extracted2.prt
    for idx, produced in enumerate(targets, start=1):
        reference = os.path.join(EXPPATH, f"exp_extracted{idx}.prt")
        assert os.path.isfile(produced)
        assert tutil.compare_file_content(produced, reference)
Ejemplo n.º 12
0
def test_get_genome_seqs_1notasked():
    """
    Check get_genome_seqs with a fasta file and a 2-column tab file (sequence to extract
    and file it should go to), when only 1 of the 2 target files is in 'files_todo':
    the requested file must be written with the expected content, and the other one
    must not be created.
    """
    tab_path = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    asked = os.path.join(GENEPATH, "file1.txt")
    not_asked = os.path.join(GENEPATH, "file2.txt")
    gseq.get_genome_seqs(FASTA, tab_path, [asked])
    # Requested file exists with the expected content
    assert os.path.isfile(asked)
    reference = os.path.join(EXPPATH, "exp_extracted1.prt")
    assert tutil.compare_file_content(asked, reference)
    # The file left out of 'files_todo' is absent
    assert not os.path.isfile(not_asked)
Ejemplo n.º 13
0
def test_extract_seq_out_given():
    """
    Check extract_sequences with an open fasta file, a list of 3 sequences to extract
    and an open output file: the output file must match the expected extracted
    sequences.
    """
    wanted = [
        "GEN2.1017.00001.b0001_00001",
        "GEN2.1017.00001.i0003_00008",
        "GEN2.1017.00001.b0004_00013",
    ]
    out_path = os.path.join(GENEPATH, "test_extract_out-given.prt")
    # The fasta is opened first, then the output file
    with open(FASTA, "r") as fasta_handle:
        with open(out_path, "w") as out_handle:
            gseq.extract_sequences(wanted, fasta_handle, outf=out_handle)
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(out_path, reference)
Ejemplo n.º 14
0
def test_extract_seq_out_allsame():
    """
    Check extract_sequences with an open fasta file and 3 sequences to extract, all
    mapped to the same output file (which is in 'files_todo'): this file must match
    the expected extracted sequences.
    """
    single_out = os.path.join(GENEPATH, "test_extract1.prt")
    sequences = (
        "GEN2.1017.00001.b0001_00001",
        "GEN2.1017.00001.i0003_00008",
        "GEN2.1017.00001.b0004_00013",
    )
    # Every sequence goes to the same file
    to_extract = dict.fromkeys(sequences, single_out)
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[single_out])
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(single_out, reference)
Ejemplo n.º 15
0
def test_extract_seq_out_different_notasked():
    """
    Check extract_sequences with an open fasta file and 3 sequences to extract, mapped
    to 2 distinct output files, when only the first file is in 'files_todo': this file
    must match its expected content, and the other file must not be created.
    """
    asked = os.path.join(GENEPATH, "test_extract1.prt")
    not_asked = os.path.join(GENEPATH, "test_extract2.prt")
    # sequence name -> file in which it must be written
    to_extract = dict([
        ("GEN2.1017.00001.b0001_00001", asked),
        ("GEN2.1017.00001.i0003_00008", not_asked),
        ("GEN2.1017.00001.b0004_00013", asked),
    ])
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[asked])
    reference = os.path.join(EXPPATH, "exp_extracted1.prt")
    assert tutil.compare_file_content(asked, reference)
    # The file left out of 'files_todo' is absent
    assert not os.path.isfile(not_asked)
Ejemplo n.º 16
0
def test_compare_all(caplog):
    """
    Check that compare_all, run on an existing sketch file, creates the matrix file and
    the mash log file, and that the matrix has the expected content.
    """
    test_files = os.path.join(DATA_TEST_DIR, "test_files")
    sketch_prefix = os.path.join(test_files, "test_mash_output")
    matrix_path = os.path.join(GENEPATH, "matrix_from_test_compare_all.txt")
    log_path = os.path.join(GENEPATH, "mashlog_from_test_compare_all.log")

    # The input msh file must be there before running the comparison
    assert os.path.isfile(sketch_prefix + ".msh")

    filterg.compare_all(sketch_prefix, matrix_path, "matrix", log_path, 1)

    # Both output files are created
    for produced in (matrix_path, log_path):
        assert os.path.isfile(produced)

    # The matrix has the expected content
    reference_matrix = os.path.join(test_files, "test_matrix_mash.txt")
    assert tutil.compare_file_content(matrix_path, reference_matrix)
Ejemplo n.º 17
0
def test_get_all_seqs_prtgen6(caplog):
    """
    Check get_all_seqs when the alignment folder already holds both a prt and a gen file
    for one family (family 6), as well as mafft and prt2nuc files for this family and a
    concatenate file.

    Expected after the call:
    - prt and gen files of family 1 exist and match the expected references
    - the 4 pre-existing files of family 6 are still present and still empty
    - the concatenate file has been removed
    - the extraction messages are found in the logs
    """
    caplog.set_level(logging.DEBUG)
    dname = "TESTgetAllSeq"
    families = [1, 6]
    genomes = [
        "GEN2.1017.00001", "GEN4.1111.00001", "GENO.1017.00001",
        "GENO.1216.00002"
    ]
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    # Output folders
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Pre-existing (empty) files: prt, gen, mafft and prt2nuc of family 6 + concatenate
    fam6_files = [
        os.path.join(aldir, f"{dname}-current.6.prt"),
        os.path.join(aldir, f"{dname}-current.6.gen"),
        os.path.join(aldir, f"{dname}-mafft-align.6.aln"),
        os.path.join(aldir, f"{dname}-mafft-prt2nuc.6.aln"),
    ]
    concat = os.path.join(aldir, f"{dname}-complete.cat.aln")
    for path in fam6_files + [concat]:
        with open(path, "w"):
            pass

    # Put the getentry files (gen, then prt) of each genome in listdir
    for genome in genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{genome}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{genome}.txt")
            shutil.copyfile(source, target)

    gseq.get_all_seqs(genomes, dname, DBPATH, listdir, aldir, families, False)

    # Family 1: prt and gen files exist and are as expected
    for ext in ("prt", "gen"):
        produced = os.path.join(aldir, f"{dname}-current.1.{ext}")
        assert os.path.isfile(produced)
        reference = os.path.join(ref_aldir, f"current.1.{ext}")
        assert tutil.compare_file_content(produced, reference)

    # Family 6: all files are present and empty
    for path in fam6_files:
        assert os.path.isfile(path)
        with open(path, "r") as handle:
            assert not handle.readlines()

    # The concatenate file was removed
    assert not os.path.isfile(concat)

    # Logs: one global message, and one message per genome
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for genome in genomes:
        assert f"Extracting proteins and genes from {genome}" in caplog.text