def test_init_ok(self):
    '''Test init with good input

    Loads the FASTA and TSV on their own first and checks the metadata,
    the sequences, and that the rename and extra-parameter mappings are
    empty.  Then loads again with a rename file and a parameters file and
    checks that both mappings are populated.
    '''
    ref_fasta = os.path.join(data_dir, 'reference_data_init_ok.in.fa')
    ref_tsv = os.path.join(data_dir, 'reference_data_init_ok.in.tsv')

    gene1_meta = sequence_metadata.SequenceMetadata(
        'gene1\t1\t0\tR2S\t.\tconfers killer rabbit resistance')
    gene2_meta = sequence_metadata.SequenceMetadata(
        "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability")

    gene1_entry = {
        'seq_type': 'p',
        'variant_only': False,
        'n': {},
        'p': {1: {gene1_meta}},
        '.': set(),
    }
    gene2_entry = {
        'seq_type': 'p',
        'variant_only': False,
        'n': {},
        'p': {41: {gene2_meta}},
        '.': set(),
    }
    want_metadata = {'gene1': gene1_entry, 'gene2': gene2_entry}

    # Without the optional files
    plain = reference_data.ReferenceData([ref_fasta], [ref_tsv])
    self.assertEqual(want_metadata, plain.metadata)

    want_seqs = {
        'gene1': pyfastaq.sequences.Fasta('gene1', 'CATCGTCGTCTATCGTCGTCCTAG'),
        'gene2': pyfastaq.sequences.Fasta('gene2', 'AAAAACCCCGGGGTTTT'),
    }
    self.assertEqual(want_seqs, plain.sequences)
    self.assertEqual({}, plain.ariba_to_original_name)
    self.assertEqual({}, plain.extra_parameters)

    # With the rename and parameters files
    rename_tsv = os.path.join(data_dir, 'reference_data_init_ok.rename.tsv')
    params_json = os.path.join(data_dir, 'reference_data_init_ok.params.json')
    with_extras = reference_data.ReferenceData(
        [ref_fasta],
        [ref_tsv],
        rename_file=rename_tsv,
        parameters_file=params_json,
    )
    want_renames = {'gene1': 'original_gene1', 'gene2': 'original_gene2'}
    self.assertEqual(want_renames, with_extras.ariba_to_original_name)
    want_params = {'foo': 'bar', 'spam': 'eggs'}
    self.assertEqual(want_params, with_extras.extra_parameters)
def test_init_fails(self):
    '''Test __init__ fails when it should

    Each bad input combination is checked in its own assertRaises context.
    If several constructor calls share one context, the first exception
    leaves the block and the later calls are never executed, so they
    would not be tested at all.
    '''
    empty_fasta = os.path.join(data_dir, 'reference_data_init_fails.empty.fa')
    empty_tsv = os.path.join(data_dir, 'reference_data_init_fails.empty.tsv')
    fasta = os.path.join(data_dir, 'reference_data_init_fails.in.fa')

    bad_inputs = [
        (empty_fasta, empty_tsv),
        (fasta, empty_tsv),
    ]
    for fasta_file, tsv_file in bad_inputs:
        with self.assertRaises(reference_data.Error):
            reference_data.ReferenceData([fasta_file], [tsv_file])
def test_full_run_multiple_vars_in_codon(self):
    '''Test complete run where there is a codon with a SNP and an indel

    Runs a Cluster on a fresh copy of the fixture directory and compares
    the report lines it produces with the two expected lines.
    '''
    want_lines = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1',
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars')
    workdir = 'tmp.cluster_test_full_run_multiple_vars'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=292,
        total_reads_bases=20900,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
    '''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and sample has the var

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line.
    '''
    # The snp is already known, so it is reported on the known-snp line;
    # no additional 'HET' line is expected.
    want_lines = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
    ]
    ref_fasta = os.path.join(
        data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv')
    fixture_dir = os.path.join(
        data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc')
    workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_smtls_snp_varonly_gene(self):
    '''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does have variant

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line.
    '''
    # The snp is already known, so it is reported on the known-snp line;
    # no additional 'HET' line is expected.
    want_lines = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene')
    workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_smtls_snp_varonly_gene_2(self):
    '''test complete run where samtools calls a snp in a variant only gene

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single 'HET' report line.
    '''
    # Named "_2" because test_full_run_smtls_snp_varonly_gene possibly
    # exercises the same functionality; both tests are kept regardless.
    want_lines = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
    ]
    ref_fasta = os.path.join(
        data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv')
    fixture_dir = os.path.join(
        data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2')
    workdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_gene_start_mismatch(self):
    '''test complete run where gene extended because too different at end for full nucmer match

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line whose variant columns are all '.'.
    '''
    want_lines = [
        'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
    ]
    ref_fasta = os.path.join(
        data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.metadata.tsv')
    fixture_dir = os.path.join(
        data_dir, 'cluster_test_full_run_ok_gene_start_mismatch')
    workdir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=112,
        total_reads_bases=1080,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_variants_only_variant_is_present(self):
    '''test complete run of cluster on a variants only gene when variant is present

    Runs a Cluster on a fresh copy of the fixture directory and expects
    two report lines, one for R3S and one for I5A.
    '''
    want_lines = [
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t71\t73\tGCG\t17;17;17\tG;C;G\t17;17;17\tvariants_only1:1:1:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_ok_variants_only.present.metadata.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only')
    workdir = 'tmp.cluster_test_full_run_ok_variants_only.present'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=66,
        total_reads_bases=3300,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_presence_absence(self):
    '''test complete run of cluster on a presence absence gene

    Runs a Cluster on a fresh copy of the fixture directory and expects
    four report lines (A10V, a synonymous change, R3S and I5A).
    '''
    want_lines = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t30\tGCG\t83\t85\tGTG\t22;22;21\tG;T;G\t22;22;21\tpresence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t52\t54\tATT\t107\t109\tATC\t31;31;32\tA;T;C\t31;31;32\t.\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t62\t64\tCGC\t18;17;17\tC;G;C\t18;17;17\tpresence_absence1:1:0:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t68\t70\tGCG\t18;20;20\tG;C;G\t18;20;20\tpresence_absence1:1:0:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
    ]
    ref_fasta = os.path.join(
        data_dir, 'cluster_test_full_run_ok_presence_absence.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_ok_presence_absence.metadata.tsv')
    fixture_dir = os.path.join(
        data_dir, 'cluster_test_full_run_ok_presence_absence')
    workdir = 'tmp.cluster_test_full_run_ok_presence_absence'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=64,
        total_reads_bases=3200,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_non_coding(self):
    '''test complete run of cluster on a noncoding sequence

    Runs a Cluster on a fresh copy of the fixture directory and expects
    six report lines.  maxDiff is lifted so that a mismatch in these long
    lines is shown in full.
    '''
    want_lines = [
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t14\t14\tA\t74\t74\tT\t19\tT\t19\tnoncoding1:0:0:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t61\t61\tG\t121\t121\tT\t24\tT\t24\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t82\t82\tA\t143\t143\tC\t23\tC\t23\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t108\t108\tT\t168\t168\tC\t17\tC\t17\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\tG\t19\tnoncoding1:0:0:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\tG\t19\tnoncoding1:0:0:G9T:.:wild type in ref and reads\tgeneric description of noncoding1',
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_ok_non_coding.metadata.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding')
    workdir = 'tmp.test_full_run_ok_non_coding'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=72,
        total_reads_bases=3600,
    )
    clust.run()

    self.maxDiff = None
    self.assertEqual(want_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_sequence(self):
    '''Test sequence

    Looks up 'seq1' in the loaded reference data and compares it with the
    expected Fasta record.
    '''
    ref_fasta = os.path.join(data_dir, 'reference_data_sequence.in.fa')
    ref_tsv = os.path.join(data_dir, 'reference_data_sequence.in.tsv')
    want_seq = pyfastaq.sequences.Fasta('seq1', 'ATGTTTTAA')

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])
    got_seq = refdata.sequence('seq1')

    self.assertEqual(want_seq, got_seq)
def test_full_run_ref_not_in_cluster(self):
    '''test complete run of cluster when nearest ref is outside cluster

    Expects a single placeholder report line and the 'ref_seq_choose_fail'
    status flag set, with 'assembly_fail' not set.
    '''
    ref_fasta = os.path.join(
        data_dir, 'cluster_test_full_run_ref_not_in_cluster.in.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_ref_not_in_cluster.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    workdir = 'tmp.test_full_run_ref_not_in_cluster'
    all_refs_fa = os.path.join(
        data_dir, 'cluster_test_full_run_ref_not_in_cluster.all_refs.fa')
    fixture_dir = os.path.join(
        data_dir, 'cluster_test_full_run_ref_not_in_cluster')

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=72,
        total_reads_bases=3600,
        all_ref_seqs_fasta=all_refs_fa,
    )
    clust.run()

    # Seven leading columns followed by 24 placeholder columns
    leading_columns = ['.', '.', '.', '.', '1024', '72', 'cluster_name']
    want_line = '\t'.join(leading_columns + ['.'] * 24)
    self.assertEqual([want_line], clust.report_lines)

    flags = clust.status_flag
    self.assertTrue(flags.has('ref_seq_choose_fail'))
    self.assertFalse(flags.has('assembly_fail'))
    shutil.rmtree(workdir)
def test_full_run_assembly_fail(self):
    '''test complete run of cluster when assembly fails

    Expects a single placeholder report line and the 'assembly_fail'
    status flag set, with 'ref_seq_choose_fail' not set.
    '''
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.in.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    workdir = 'tmp.test_full_run_assembly_fail'
    fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail')

    # Start from a clean copy of the fixture directory
    shutil.rmtree(workdir, ignore_errors=True)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=4,
        total_reads_bases=304,
    )
    clust.run()

    # Seven leading columns followed by 24 placeholder columns
    leading_columns = ['.', '.', '.', '.', '64', '4', 'cluster_name']
    want_line = '\t'.join(leading_columns + ['.'] * 24)
    self.assertEqual([want_line], clust.report_lines)

    flags = clust.status_flag
    self.assertFalse(flags.has('ref_seq_choose_fail'))
    self.assertTrue(flags.has('assembly_fail'))
    shutil.rmtree(workdir)
def test_cluster_w_cdhit_nocluster(self):
    '''Test cluster_with_cd_hit do not run cdhit

    With nocluster=True every sequence is expected in its own cluster.
    The returned mapping and the written clusters file are both checked,
    then the output files are removed.
    '''
    ref_fasta = os.path.join(
        data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.fa')
    ref_tsv = os.path.join(
        data_dir, 'reference_data_cluster_w_cdhit_nocluster.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    outprefix = 'tmp.test_cluster_with_cdhit_nocluster'
    want_clusters = {
        '0': {'noncoding1'},
        '1': {'noncoding2'},
        '2': {'presence_absence1'},
        '3': {'presence_absence2'},
        '4': {'presence_absence3'},
        '5': {'presence_absence4'},
    }
    got_clusters = refdata.cluster_with_cdhit(outprefix, nocluster=True)
    self.assertEqual(want_clusters, got_clusters)

    want_clusters_file = os.path.join(
        data_dir, 'reference_data_cluster_w_cdhit_nocluster.expect.tsv')
    got_clusters_file = outprefix + '.clusters.tsv'
    files_match = filecmp.cmp(
        want_clusters_file, got_clusters_file, shallow=False)
    self.assertTrue(files_match)

    # Remove the clusters file first, then the per-type FASTA outputs
    os.unlink(got_clusters_file)
    for suffix in (
        '.all.fa',
        '.gene.fa',
        '.gene.varonly.fa',
        '.noncoding.fa',
        '.noncoding.varonly.fa',
    ):
        os.unlink(outprefix + suffix)
def test_all_non_wild_type_variants(self):
    '''Test all_non_wild_type_variants

    Covers a variant-only gene, a presence/absence gene, a non-coding
    sequence, and a name that is not in the reference data (for which
    empty 'n' and 'p' dictionaries are expected).
    '''
    tsv_file = os.path.join(
        data_dir, 'reference_data_test_all_non_wild_type_variants.tsv')
    ref_fasta = os.path.join(
        data_dir, 'reference_data_test_all_non_wild_type_variants.ref.fa')
    refdata = reference_data.ReferenceData([ref_fasta], [tsv_file])

    make_meta = sequence_metadata.SequenceMetadata
    v1 = make_meta('var_only_gene\t1\t1\tP3Q\t.\tref has wild type P')
    v2 = make_meta('var_only_gene\t1\t1\tG4I\t.\tref has wild type F')
    v3 = make_meta(
        'var_only_gene\t1\t1\tI5V\t.\tref has variant V instead of I')
    v4 = make_meta('var_only_gene\t1\t1\tF6I\t.\tref has wild type F')
    p1 = make_meta('presence_absence_gene\t1\t0\tN2I\t.\tref has wild type N')
    p2 = make_meta(
        'presence_absence_gene\t1\t0\tA4G\t.\tref has variant G instead of A')
    n1 = make_meta('non_coding\t0\t0\tA2C\t.\tref has wild type A')
    n2 = make_meta('non_coding\t0\t0\tC4T\t.\tref has variant T instead of C')

    var_only_expected = {
        'n': {},
        'p': {2: {v1}, 3: {v2}, 4: {v3}, 5: {v4}},
    }
    pres_abs_expected = {
        'n': {},
        'p': {1: {p1}, 3: {p2}},
    }
    non_coding_expected = {
        'n': {1: {n1}, 3: {n2}},
        'p': {},
    }
    unknown_expected = {'n': {}, 'p': {}}

    cases = [
        (var_only_expected, 'var_only_gene'),
        (pres_abs_expected, 'presence_absence_gene'),
        (non_coding_expected, 'non_coding'),
        (unknown_expected, 'not_a_known_sequence'),
    ]
    for want, seq_name in cases:
        self.assertEqual(want, refdata.all_non_wild_type_variants(seq_name))
def test_get_variants_variants_only(self):
    '''test get_variants variants only

    The metadata TSV is generated on the fly from three SequenceMetadata
    objects, loaded into a ReferenceData, and then deleted.  The deletion
    is done in a finally clause so the temporary file does not stay behind
    in the working directory when writing or loading it raises.
    '''
    meta1 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
    meta2 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
    meta3 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)')

    metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
    fasta_in = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_variants_only.fa')
    try:
        with open(metadata_tsv, 'w') as f:
            for meta in (meta1, meta2, meta3):
                print(meta, file=f)
        refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
    finally:
        # The file may not exist if open() itself failed
        if os.path.exists(metadata_tsv):
            os.unlink(metadata_tsv)

    nucmer_snp_file = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
    v2 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
    v3 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))

    ctg_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 41)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }
    ref_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 41)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }

    expected = {
        'contig1': [
            (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
            (None, 'p', None, None, None, {meta1}, set()),
            (None, 'p', None, None, None, {meta3}, set()),
        ],
        'contig2': [(None, 'p', None, None, None, {meta3}, set())],
    }

    a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
    got = a_variants.get_variants(
        'variants_only', ctg_nucmer_coords, ref_nucmer_coords)
    self.assertEqual(expected, got)
def test_write_seqs_to_fasta(self):
    '''Test write_seqs_to_fasta

    Writes three named sequences to a temporary FASTA file, compares it
    with the expected file by content, and removes it.
    '''
    ref_fasta = os.path.join(
        data_dir, 'reference_data_test_write_seqs_to_fasta.in.fa')
    ref_tsv = os.path.join(
        data_dir, 'reference_data_test_write_seqs_to_fasta.in.tsv')
    want_file = os.path.join(
        data_dir, 'reference_data_test_write_seqs_to_fasta.expected.fa')
    out_file = 'tmp.test.reference_data.write_seqs_to_fasta.out.fa'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])
    refdata.write_seqs_to_fasta(out_file, {'seq1', 'seq4', 'seq5'})

    files_match = filecmp.cmp(want_file, out_file, shallow=False)
    self.assertTrue(files_match)
    os.unlink(out_file)
def test_one_var_one_ctg_noncdg(self):
    '''test _get_one_variant_for_one_contig_non_coding

    Three nucmer variants are each passed to the method together with the
    metadata of the 'non_coding' reference sequence, and both returned
    values (the variant tuple and the set of used metadata) are checked.
    '''
    fasta_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.fa')
    tsv_in = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    ref_sequence_name = 'non_coding'
    refdata_var_dict = refdata.metadata[ref_sequence_name]

    v0 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    # ref has A at position 3, which is variant type. This gives contig
    # the wild type C. Shouldn't report
    v1 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    # ref has T at position 5, which is wild type. This gives contig
    # variant type A. Should report
    v2 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    meta0 = sequence_metadata.SequenceMetadata(
        'non_coding\t0\t0\tC3A\tid1\tref has variant type A')
    meta2 = sequence_metadata.SequenceMetadata(
        'non_coding\t0\t0\tT5A\tid1\tref has wild type T')

    # The three lists below are parallel: entry i of each belongs together
    mummer_variants = [v0, v1, v2]

    expected_tuples = [
        (1, 'n', 'T2A', 'SNP', [v0], set(), set()),  # 0
        None,  # 1
        (4, 'n', 'T5A', 'SNP', [v2], {meta2}, set()),  # 2
    ]

    expected_used_variants = [
        set(),  # 0
        {meta0},  # 1
        {meta2},  # 2
    ]

    # zip() stops at the shortest list, so check the lengths explicitly.
    # assertEqual is used because a bare assert is removed under "python -O".
    self.assertEqual(len(mummer_variants), len(expected_tuples))
    self.assertEqual(len(mummer_variants), len(expected_used_variants))

    cases = zip(mummer_variants, expected_tuples, expected_used_variants)
    for mummer_variant, expected_tuple, expected_used in cases:
        got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding(
            refdata_var_dict, mummer_variant)
        self.assertEqual(expected_tuple, got_tuple)
        self.assertEqual(expected_used, got_used_variants)
def test_full_run_smtls_snp_varonly_nonc(self):
    '''test complete run where samtools calls a snp in a presence/absence noncoding sequence

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single 'HET' report line.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
    tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=148,
        total_reads_bases=13320,
    )
    c.run()
    expected = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_sequence_type(self):
    '''Test sequence_type

    Each sequence name is checked against the expected pair returned by
    sequence_type().
    '''
    ref_fasta = os.path.join(data_dir, 'reference_data_sequence_type.in.fa')
    ref_tsv = os.path.join(data_dir, 'reference_data_sequence_type.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    cases = (
        ('gene', ('p', False)),
        ('gene.var_only', ('p', True)),
        ('noncoding', ('n', False)),
        ('noncoding.var_only', ('n', True)),
    )
    for seq_name, want in cases:
        got = refdata.sequence_type(seq_name)
        self.assertEqual(want, got)
def test_full_run_insert_codon(self):
    '''Test complete run where there is a inserted codon

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single 'INS' report line.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.fa')
    tsv_in = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_test_full_insert_codon'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_test_full_run_insert_codon'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=292,
        total_reads_bases=20900,
    )
    c.run()
    expected = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t108\t108\t92.31\tcluster_name.l15.c30.ctg.1\t1115\t19.9\t0\t.\tp\t.\t0\tS25_M26insELI\tINS\t73\t73\tA\t554\t554\tG\t24\tG\t24\t.\tGeneric description of presence_absence1'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_full_run_partial_assembly(self):
    '''Test complete run where only part of the ref gene is present in the reads

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line whose variant columns are all '.'.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.fa')
    tsv_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_test_full_run_partial_assembly'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=278,
        total_reads_bases=15020,
    )
    c.run()
    expected = [
        'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_full_run_smtls_known_snp_presabs_nonc(self):
    '''test complete run where samtools calls a snp in a presence/absence noncoding sequence at a known snp position

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line for the known G18A snp.
    '''
    want_lines = [
        'ref_seq\tref_seq\t0\t0\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tG18A\t0\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:0:G18A:.:Description of G18A\tGeneric description of ref_seq'
    ]
    ref_fasta = os.path.join(
        data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc.tsv')
    fixture_dir = os.path.join(
        data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc')
    workdir = 'tmp.cluster_test_full_run_smtls_known_snp_presabs_nonc'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    common.rmtree(workdir)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    common.rmtree(workdir)
def test_full_run_smtls_snp_presabs_gene(self):
    '''test complete run where samtools calls a snp in a presence/absence gene

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single 'HET' report line.
    '''
    want_lines = [
        'ref_gene\tref_gene\t1\t0\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene')
    workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_gene'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    common.rmtree(workdir)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    common.rmtree(workdir)
def test_full_run_ok_variants_only_variant_not_present_always_report(self):
    '''test complete run of cluster on a variants only gene when variant not present but always report variant

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line for R3S.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
    tsv_in = os.path.join(
        data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_full_run_varonly.not_present.always_report'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=66,
        total_reads_bases=3300,
    )
    c.run()
    expected = [
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_full_run_choose_ref_fail(self):
    '''test complete run of cluster when choosing ref seq fails

    Expects a single placeholder report line and the 'ref_seq_choose_fail'
    status flag set, with 'assembly_fail' not set.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.in.fa')
    tsv_in = os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.in.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.test_full_run_choose_ref_fail'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        total_reads=2,
        total_reads_bases=108,
        spades_other_options='--only-assembler',
    )
    c.run()

    # Seven leading columns followed by 24 placeholder columns
    expected = '\t'.join(
        ['.', '.', '.', '.', '1024', '2', 'cluster_name'] + ['.'] * 24)
    self.assertEqual([expected], c.report_lines)
    self.assertTrue(c.status_flag.has('ref_seq_choose_fail'))
    self.assertFalse(c.status_flag.has('assembly_fail'))
    shutil.rmtree(tmpdir)
def test_full_run_delete_codon(self):
    '''Test complete run where there is a deleted codon

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single 'DEL' report line.
    '''
    want_lines = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t117\t117\t92.31\tcluster_name.l15.c30.ctg.1\t1104\t20.0\t0\t.\tp\t.\t0\tR25_A26del\tDEL\t73\t73\tA\t553\t553\tA\t27\tA\t27\t.\tGeneric description of presence_absence1',
    ]
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_delete_codon.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_delete_codon.tsv')
    fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_delete_codon')
    workdir = 'tmp.cluster_test_full_delete_codon'

    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Start from a clean copy of the fixture directory
    common.rmtree(workdir)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=292,
        total_reads_bases=20900,
    )
    clust.run()

    self.assertEqual(want_lines, clust.report_lines)
    common.rmtree(workdir)
def test_full_run_no_reads_after_filtering(self):
    '''test complete run of cluster when filtering removes all reads

    Expects a single placeholder report line and the 'assembly_fail'
    status flag set, with 'ref_seq_choose_fail' not set.
    '''
    ref_fasta = os.path.join(
        data_dir, 'cluster_test_full_run_no_reads_after_filtering.in.fa')
    ref_tsv = os.path.join(
        data_dir, 'cluster_test_full_run_no_reads_after_filtering.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    workdir = 'tmp.test_full_run_no_reads_after_filtering'
    fixture_dir = os.path.join(
        data_dir, 'cluster_test_full_run_no_reads_after_filtering')

    # Start from a clean copy of the fixture directory
    common.rmtree(workdir)
    shutil.copytree(fixture_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refdata,
        total_reads=0,
        total_reads_bases=0,
    )
    clust.run()

    # Seven leading columns followed by 24 placeholder columns
    leading_columns = ['.', '.', '.', '.', '64', '0', 'cluster_name']
    want_line = '\t'.join(leading_columns + ['.'] * 24)
    self.assertEqual([want_line], clust.report_lines)

    flags = clust.status_flag
    self.assertFalse(flags.has('ref_seq_choose_fail'))
    self.assertTrue(flags.has('assembly_fail'))
    common.rmtree(workdir)
def test_full_run_known_smtls_snp_presabs_gene(self):
    '''test complete run where samtools calls a snp at a known snp location in a presence/absence gene

    Runs a Cluster on a fresh copy of the fixture directory and expects a
    single report line for the known M6I snp.
    '''
    fasta_in = os.path.join(
        data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.fa')
    tsv_in = os.path.join(
        data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene'
    # A directory left behind by an earlier failed run would make
    # copytree raise, so clear it first.
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(
        os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene'),
        tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=148,
        total_reads_bases=13320,
    )
    c.run()

    # We shouldn't get an extra 'HET' line because we already know about
    # the snp, so it is included in the report of the known snp
    expected = [
        'ref_gene\tref_gene\t1\t0\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tM6I\t0\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:0:M6I:.:Description of M6I snp\t.'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_init_fail_files_missing(self):
    '''test init_fail_files_missing

    Each fixture directory is copied to a temporary directory in turn, and
    constructing a Cluster on that copy must raise cluster.Error.
    '''
    refdata_fa = os.path.join(data_dir, 'cluster_test_init_refdata.fa')
    metadata_tsv = os.path.join(data_dir, 'cluster_test_init_refdata.tsv')
    refdata = reference_data.ReferenceData([refdata_fa], [metadata_tsv])

    dirs = [
        'cluster_test_init_no_refs_fa',
        'cluster_test_init_no_reads_1',
        'cluster_test_init_no_reads_2',
    ]
    dirs = [os.path.join(data_dir, d) for d in dirs]

    # The same temporary directory name is reused for every fixture
    tmpdir = 'tmp.cluster_test_init_fail_files_missing'
    for d in dirs:
        # A directory left behind by an earlier failed run would make
        # copytree raise, so clear it first.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(d, tmpdir)
        with self.assertRaises(cluster.Error):
            cluster.Cluster(
                tmpdir,
                'name',
                refdata=refdata,
                total_reads=42,
                total_reads_bases=4242,
            )
        shutil.rmtree(tmpdir)