def test_Otyping(caplog):
    """
    Giving E.coli fasta genomes with truncated wzx and wzy genes with
    reference coverage <50 predict O and H antigens.

    Runs ectyper on Data/Escherichia_O26H11.fasta with species verification
    enabled, then reads the second line of the generated output.tsv. Its third
    tab-separated field (O-type) must be "-" and its fourth (H-type) must be
    "H11".

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/Escherichia_O26H11.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  output=tmpdir,
                  debug=False)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            # Index 1 skips the first line (presumably the header row).
            secondrow = outfp.readlines()[1].split("\t")

    Otype = secondrow[2]
    Htype = secondrow[3]

    assert Otype == "-", "Expected no call but reported O-type:" + Otype
    assert Htype == "H11", "Expected H11 but reported H-type:" + Htype
def test_valid_fastq_file_with_verify(caplog):
    """
    Run ectyper on the Escherichia.fastq test file with species verification
    switched on and check that "Escherichia coli" shows up in the captured log.

    The original note describes the input as a valid fastq file with low
    genome coverage.

    :param caplog: pytest log-capture fixture
    :return: None
    """
    fastq_path = os.path.join(TEST_ROOT, 'Data/Escherichia.fastq')
    set_input(fastq_path, verify=True)

    ectyper.run_program()

    captured_log = caplog.text
    assert "Escherichia coli" in captured_log
def test_integration_validfasta_noverify(caplog):
    """
    Fasta input must be processed without failure when the E.coli species
    verify function (--verify) is turned off, as per issue #76
    (https://github.com/phac-nml/ecoli_serotyping/issues/76).

    Runs ectyper on Data/Escherichia.fna with verify=False and looks for the
    expected tab-separated result fragments in the captured log.

    :param caplog: pytest log-capture fixture
    :return: None
    """
    fasta_path = os.path.join(TEST_ROOT, 'Data/Escherichia.fna')
    set_input(fasta_path, verify=False)

    ectyper.run_program()

    captured_log = caplog.text
    expected_fragments = (
        "O103\tH2\tO103:H2",
        "Escherichia\t-\tO103\tH2",
    )
    for fragment in expected_fragments:
        assert fragment in captured_log
def test_integration_invalid_file(caplog):
    """
    Feed the program a file that carries a fasta file name but is not fasta.

    Runs ectyper on Data/test_dir/badfasta.fasta and expects the message
    "Non fasta / fastq file" in the DEBUG-level captured log.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)

    bad_fasta_path = os.path.join(TEST_ROOT, 'Data/test_dir/badfasta.fasta')
    set_input(input=bad_fasta_path)
    ectyper.run_program()

    captured_log = caplog.text
    assert "Non fasta / fastq file" in captured_log
def test_integration_yersinia(caplog):
    """
    Make sure a non-E. coli genome is categorized as such.

    Runs ectyper on Data/Yersinia.fasta using set_input's defaults and expects
    both the species name "Yersinia pestis" and the flag
    "WARNING (WRONG SPECIES)" in the captured log.

    :param caplog: pytest log-capture fixture
    :return: None
    """
    yersinia_path = os.path.join(TEST_ROOT, 'Data/Yersinia.fasta')
    set_input(yersinia_path)

    ectyper.run_program()

    captured_log = caplog.text
    for expected in ("Yersinia pestis", "WARNING (WRONG SPECIES)"):
        assert expected in captured_log
def test_valid_fastq_file(caplog):
    """
    Given a valid fastq file, get the correct results.

    Runs ectyper on Data/Escherichia.fastq with species verification off and
    expects the serotype "O22:H8" in the captured log.

    :param caplog: pytest log-capture fixture
    :return: None
    """
    fastq_path = os.path.join(TEST_ROOT, 'Data/Escherichia.fastq')
    set_input(fastq_path, verify=False)

    ectyper.run_program()

    captured_log = caplog.text
    assert "O22:H8" in captured_log
def test_integration_no_file():
    """
    Run the program with an empty input string.

    Expects ectyper.run_program() to raise FileNotFoundError carrying the
    message "No files were found to run on".

    :return: None
    """
    set_input('')

    with pytest.raises(FileNotFoundError) as raised:
        ectyper.run_program()

    # These checks sit after the with-block: anything placed after the raising
    # call inside the block would never execute.
    assert raised.type == FileNotFoundError
    assert str(raised.value) == "No files were found to run on"
def test_integration_valid_file(caplog):
    """
    Make sure a valid E. coli fasta passes.

    Runs ectyper on Data/Escherichia.fna using set_input's defaults, echoes
    the captured log to stdout, and expects the QC flag "PASS (REPORTABLE)",
    the serotype "O103:H2" and the species "Escherichia coli" in that log.

    :param caplog: pytest log-capture fixture
    :return: None
    """
    fasta_path = os.path.join(TEST_ROOT, 'Data/Escherichia.fna')
    set_input(fasta_path)

    ectyper.run_program()

    captured_log = caplog.text
    # Printed so the full log is visible in pytest's captured stdout.
    print(captured_log)

    for expected in ("PASS (REPORTABLE)", "O103:H2", "Escherichia coli"):
        assert expected in captured_log
def test_non_existing_accession_in_meta(caplog):
    """
    Exercise a genome whose accession is missing from the reference metadata.

    Per the original author's note, GCA_900059685.2 (Streptococcus pneumoniae,
    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/059/685/GCA_900059685.2_12291_5_44)
    is not present in assembly_summary_refseq.txt, which makes it a good
    candidate for testing the species identification function.

    Runs ectyper on Data/GCA_900059685.2.fna with verify=False and expects the
    message "No O and H antigen determinant E.coli genes were found" in the
    DEBUG-level captured log.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)

    genome_path = os.path.join(TEST_ROOT, 'Data/GCA_900059685.2.fna')
    set_input(input=genome_path, verify=False)
    ectyper.run_program()

    captured_log = caplog.text
    assert "No O and H antigen determinant E.coli genes were found" in captured_log
def test_multiple_directories(caplog):
    """
    Check a number of small files, some good, some bad, within a nested
    directory structure.

    Runs ectyper on the whole Data/test_dir directory with species
    verification enabled. For every expected (file name, reason) pair, at
    least one captured log line must contain the file name, followed by
    "WARNING (WRONG SPECIES)", followed by the reason.

    :param caplog: Capture logging output for pytest
    :return: None
    """
    the_dir = os.path.join(TEST_ROOT, 'Data/test_dir')
    set_input(the_dir, cores=4, verify=True, print_sequence=True)

    ectyper.run_program()

    # Shared middle part of every expected log line.
    wrong_species = r".+WARNING\s+\(WRONG\s+SPECIES\).+"

    # (file-name regex fragment, trailing message). The names are used as
    # regex fragments, so the "." in "sample.fasta" matches any character.
    expected_lines = [
        ("sample2", "Sample identified as -"),  # O148:H44
        ("sample3", "Sample identified as -"),  # O148:H44
        ("sample4", "Sample identified as -"),  # O148:H44
        ("badfasta", "Non fasta / fastq file"),
        ("sample.fasta", "Non fasta / fastq file"),
        ("sampletar", "Non fasta / fastq file"),
        ("test_junk", "Non fasta / fastq file"),
    ]

    # The log does not change between checks, so split it only once.
    log_lines = caplog.text.splitlines()

    for name, reason in expected_lines:
        pattern = r".+" + name + wrong_species + reason
        assert any(re.match(pattern, line) for line in log_lines), (
            "No log line matched pattern: " + pattern)
def test_Ecoli_O17H18(caplog):
    """
    Type Data/EscherichiaO17H18.fasta and check the mixed O-type report.

    Runs ectyper with species verification and debug enabled, then expects the
    second line of output.tsv to contain the species "Escherichia coli", the
    mixed O-type "O77/O17/O44/O106", the H-type "H18", the combined serotype
    and the "WARNING MIXED O-TYPE" flag as consecutive tab-separated fields.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/EscherichiaO17H18.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=False,
                  verify=True,
                  debug=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            rows = outfp.readlines()

    secondrow = rows[1]  # check only second row
    assert "Escherichia coli\tO77/O17/O44/O106\tH18\tO77/O17/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow
def test_Shigella_typing(caplog):
    """
    Check that a Shigella genome is reported with the right species.

    Runs ectyper on Data/DRR015915_Shigella_boydii.fasta with species
    verification and debug enabled, then expects the second tab-separated
    field on the second line of output.tsv to equal "Shigella boydii".

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/DRR015915_Shigella_boydii.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  debug=True,
                  verify=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            # Index 1 skips the first line (presumably the header row).
            secondrow = outfp.readlines()[1].split("\t")

    species = secondrow[1]
    assert species == "Shigella boydii"
def test_closeOalles_O42_O28(caplog):
    """
    Check the report for a genome whose O-type call is the pair O42/O28.

    Runs ectyper on Data/EscherichiaO28H5.fasta with species verification
    enabled, prints the second line of output.tsv and expects it to match
    "Escherichia coli" followed by the tab-separated fields "O42/O28", "H25"
    and "O42/O28:H25".

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/EscherichiaO28H5.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  debug=False,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            # Index 1 skips the first line (presumably the header row).
            secondrow = outfp.readlines()[1]

    # Shown in pytest's captured stdout when the assertion below fails.
    print(secondrow)
    assert re.match(r".+Escherichia coli.+O42\/O28\tH25\tO42\/O28:H25", secondrow)
def test_Ealbertii_1(caplog):
    """
    Test 1 of 3 on Escherichia albertii samples (ESC_HA8355AA_AS, O65:H5).

    Runs ectyper on Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta with species
    verification enabled and expects the second line of output.tsv to contain
    both "Escherichia albertii" and "WARNING (WRONG SPECIES)".

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    # NOTE(review): the original carried an unexplained "#error" marker on
    # this test - confirm whether it is still relevant.
    LOG.info(
        "Starting 1 of 3 test on EnteroBase on sample ESC_HA8355AA_AS: Escherichia albertii O65:H5"
    )
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/ESC_HA8355AA_AS_Ealberii_O65H5.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            rows = outfp.readlines()

    secondrow = rows[1]  # skip the header line
    assert "Escherichia albertii" in secondrow
    assert "WARNING (WRONG SPECIES)" in secondrow
def test_Ealbertii_3(caplog):
    """
    Test 3 of 3 on Escherichia albertii samples (O49:NM).

    Runs ectyper on Data/Ealbertii_O49NM.fasta with species verification
    enabled and expects the second line of output.tsv to contain both
    "Escherichia albertii" and "WARNING (WRONG SPECIES)".

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    # Author's note: O49 cannot be typed here, due to poor sequence quality or
    # (presumably) uncertainty in the wet-lab O49 typing.
    LOG.info(
        "Starting 3 of 3 test Escherichia albertii O49:NM"
    )
    caplog.set_level(logging.DEBUG)
    file = os.path.join(TEST_ROOT, 'Data/Ealbertii_O49NM.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            rows = outfp.readlines()

    secondrow = rows[1]  # check only second row
    assert "Escherichia albertii" in secondrow
    assert "WARNING (WRONG SPECIES)" in secondrow
def test_Ealbertii_2():
    """
    Test 2 of 3 on Escherichia albertii samples (ESC_HA8509AA_AS, O5:H5).

    Runs ectyper on Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta with species
    verification enabled and expects the second line of output.tsv to contain
    both "Escherichia albertii" and "WARNING (WRONG SPECIES)".

    :return: None
    """
    # NOTE(review): the original carried an unexplained "#error" marker on
    # this test - confirm whether it is still relevant.
    LOG.info(
        "Starting 2 of 3 test on EnteroBase on sample on ESC_HA8509AA_AS: Escherichia albertii O5:H5"
    )
    file = os.path.join(TEST_ROOT, 'Data/ESC_HA8509AA_AS_EalbertiiO5H5.fasta')

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            rows = outfp.readlines()

    secondrow = rows[1]  # check only second row
    assert "Escherichia albertii" in secondrow
    assert "WARNING (WRONG SPECIES)" in secondrow
def test_mixofspecies(caplog):
    """
    Type a comma-separated mix of inputs from different species in one run.

    Passes Campylobacter.fasta, Salmonella.fasta and Escherichia.fastq to
    ectyper as a single comma-joined input string with species verification
    enabled. From every non-header row of output.tsv it collects the species
    (field 1), serotype (field 4) and QC flag (field 5), then checks them
    against the expected values in row order.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)
    file = ",".join([
        os.path.join(TEST_ROOT, 'Data/Campylobacter.fasta'),
        os.path.join(TEST_ROOT, 'Data/Salmonella.fasta'),
        os.path.join(TEST_ROOT, 'Data/Escherichia.fastq'),
    ])

    # TemporaryDirectory instead of mkdtemp: the output folder is removed when
    # the block exits, so repeated test runs do not pile up on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        set_input(input=file,
                  cores=4,
                  print_sequence=True,
                  verify=True,
                  output=tmpdir)
        ectyper.run_program()

        with open(os.path.join(tmpdir, "output.tsv")) as outfp:
            rows = outfp.readlines()

    rows = rows[1:]  # remove header line

    serovars = []
    species = []
    QCflag = []
    for row in rows:
        rowlist = row.split("\t")
        # Shown in pytest's captured stdout when an assertion below fails.
        print(rowlist)
        species.append(rowlist[1])
        serovars.append(rowlist[4])
        QCflag.append(rowlist[5])

    # This equality also pins the number of data rows to exactly three, which
    # keeps the zip() below from silently skipping a missing row.
    assert serovars == ['-:-', 'O22:H8', '-:-']

    expectedspecies_list = [
        "Campylobacter jejuni", "Escherichia coli", "Salmonella enterica"
    ]
    for expected, reported in zip(expectedspecies_list, species):
        # re.match anchors at the start only, so the reported species may
        # carry extra text after the expected name.
        assert re.match(expected, reported)

    assert QCflag == [
        "WARNING (WRONG SPECIES)", "PASS (REPORTABLE)",
        "WARNING (WRONG SPECIES)"
    ]
def test_failed_species_identification_nospeciesverify(caplog):
    """
    Type GCF_001672015.1 with species verification switched off.

    Runs ectyper on Data/GCF_001672015.1.fna with verify=False and expects the
    tab-separated result fragment for that genome ("-" species, "-" O-type,
    "H8", "-:H8", "-") in the DEBUG-level captured log.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)

    genome_path = os.path.join(TEST_ROOT, 'Data/GCF_001672015.1.fna')
    set_input(input=genome_path, verify=False)
    ectyper.run_program()

    expected_fragment = "GCF_001672015.1\t-\t-\tH8\t-:H8\t-"
    assert expected_fragment in caplog.text
def test_failed_species_identification(caplog):
    """
    Type GCF_001672015.1 with species verification switched on.

    Runs ectyper on Data/GCF_001672015.1.fna with verify=True and expects the
    tab-separated result fragment for that genome ("Escherichia coli", "-"
    O-type, "H8", "-:H8", "WARNING (-:H TYPING)") in the DEBUG-level captured
    log.

    :param caplog: pytest log-capture fixture, switched to DEBUG level
    :return: None
    """
    caplog.set_level(logging.DEBUG)

    genome_path = os.path.join(TEST_ROOT, 'Data/GCF_001672015.1.fna')
    set_input(input=genome_path, verify=True)
    ectyper.run_program()

    expected_fragment = "GCF_001672015.1\tEscherichia coli\t-\tH8\t-:H8\tWARNING (-:H TYPING)"
    assert expected_fragment in caplog.text