def test_set_assembly_kmer(self):
    '''test _set_assembly_kmer: an explicit kmer is kept, otherwise 5 is expected for this data'''
    workdir = os.path.join(data_dir, 'cluster_test_set_assembly_kmer')
    # Each case: (constructor keyword arguments, assembly_kmer expected afterwards)
    cases = (
        ({'assembly_kmer': 42}, 42),
        ({}, 5),
    )

    clean_cluster_dir(workdir)
    for ctor_kwargs, wanted_kmer in cases:
        clust = cluster.Cluster(workdir, 'name', **ctor_kwargs)
        self.assertEqual(clust.assembly_kmer, wanted_kmer)
        clean_cluster_dir(workdir)
def test_full_run_multiple_vars_in_codon(self):
    '''Complete run on data where a single codon has both a SNP and an indel'''
    wanted_lines = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1',
    ]
    workdir = 'tmp.cluster_test_full_run_multiple_vars'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=292,
        total_reads_bases=20900,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_smtls_snp_varonly_gene(self):
    '''Complete run: samtools calls a snp at a known snp position in a variant only gene that does have the variant'''
    # The snp is already known, so it is reported on the known-snp line and
    # no additional 'HET' line should appear
    wanted_lines = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
    ]
    workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
    ref_fasta = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
    '''Complete run: samtools calls a snp at a known snp position in a presence/absence noncoding sequence, sample has the variant'''
    # The snp is already known, so it is reported on the known-snp line and
    # no additional 'HET' line should appear
    wanted_lines = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
    ]
    workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_gene_start_mismatch(self):
    '''Complete run where the gene is extended because its end is too different for a full nucmer match'''
    wanted_lines = [
        'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
    ]
    workdir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.metadata.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=112,
        total_reads_bases=1080,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_smtls_snp_varonly_gene_2(self):
    '''Complete run where samtools calls a snp in a variant only gene'''
    # Named _2 because test_full_run_smtls_snp_varonly_gene is thought to
    # cover the same functionality; both tests are kept anyway
    wanted_lines = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
    ]
    workdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
    ref_fasta = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=148,
        total_reads_bases=13320,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_variants_only_variant_is_present(self):
    '''Complete cluster run on a variants only gene when the variant is present'''
    wanted_lines = [
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t71\t73\tGCG\t17;17;17\tG;C;G\t17;17;17\tvariants_only1:1:1:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
    ]
    workdir = 'tmp.cluster_test_full_run_ok_variants_only.present'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.present.metadata.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=66,
        total_reads_bases=3300,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_presence_absence(self):
    '''Complete cluster run on a presence absence gene'''
    wanted_lines = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t30\tGCG\t83\t85\tGTG\t22;22;21\tG;T;G\t22;22;21\tpresence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t52\t54\tATT\t107\t109\tATC\t31;31;32\tA;T;C\t31;31;32\t.\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t62\t64\tCGC\t18;17;17\tC;G;C\t18;17;17\tpresence_absence1:1:0:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t68\t70\tGCG\t18;20;20\tG;C;G\t18;20;20\tpresence_absence1:1:0:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
    ]
    workdir = 'tmp.cluster_test_full_run_ok_presence_absence'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.metadata.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=64,
        total_reads_bases=3200,
    )
    clust.run()
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_full_run_ok_non_coding(self):
    '''Complete cluster run on a noncoding sequence'''
    wanted_lines = [
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t14\t14\tA\t74\t74\tT\t19\tT\t19\tnoncoding1:0:0:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t61\t61\tG\t121\t121\tT\t24\tT\t24\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t82\t82\tA\t143\t143\tC\t23\tC\t23\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t108\t108\tT\t168\t168\tC\t17\tC\t17\t.\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\tG\t19\tnoncoding1:0:0:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
        'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\tG\t19\tnoncoding1:0:0:G9T:.:wild type in ref and reads\tgeneric description of noncoding1'
    ]
    workdir = 'tmp.test_full_run_ok_non_coding'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.metadata.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=72,
        total_reads_bases=3600,
    )
    clust.run()

    # The report lines are long: lift the diff size limit so that a failure
    # shows the complete difference
    self.maxDiff = None
    self.assertEqual(wanted_lines, clust.report_lines)
    shutil.rmtree(workdir)
def test_get_read_counts(self):
    '''test _get_read_counts on input where it succeeds'''
    workdir = os.path.join(data_dir, 'cluster_test_get_read_counts')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    read_count = clust._get_read_counts()
    self.assertEqual(2, read_count)
    clean_cluster_dir(workdir)
def test_get_mummer_variants(self):
    '''test _get_mummer_variants

    Runs _get_mummer_variants twice on the generic test cluster: first with a
    .snps file for which an empty result is expected, then with one for
    which four variants, grouped per contig, are expected.
    '''
    cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(cluster_dir)
    c = cluster.Cluster(cluster_dir, 'name')

    # Case 1: input for which no variants are expected
    snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.none.snps')
    shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
    c._get_mummer_variants()
    self.assertEqual(c.mummer_variants, {})
    clean_cluster_dir(cluster_dir)

    # Case 2: input for which variants are expected on two contigs
    snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.snp.snps')
    v1 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
    v2 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2'))
    v3 = pymummer.variant.Variant(pymummer.snp.Snp('40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
    v4 = pymummer.variant.Variant(pymummer.snp.Snp('2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
    expected = {
        'contig1': [[v4], [v3, v1]],
        'contig2': [[v2]],
    }
    # A single copy is sufficient: nothing between here and the call below
    # touches the destination file
    shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
    c._get_mummer_variants()
    self.assertEqual(c.mummer_variants, expected)
    clean_cluster_dir(cluster_dir)
def test_full_run_ref_not_in_cluster(self):
    '''Complete cluster run when the nearest ref is outside the cluster'''
    workdir = 'tmp.test_full_run_ref_not_in_cluster'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster.in.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster.in.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])
    every_ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster.all_refs.fa')

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=72,
        total_reads_bases=3600,
        all_ref_seqs_fasta=every_ref_fasta,
    )
    clust.run()

    # One report line is expected, with every column a '.' placeholder
    # apart from columns 5-7
    wanted_fields = ['.'] * 4 + ['1024', '72', 'cluster_name'] + ['.'] * 24
    wanted_line = '\t'.join(wanted_fields)
    self.assertEqual([wanted_line], clust.report_lines)
    self.assertTrue(clust.status_flag.has('ref_seq_choose_fail'))
    self.assertFalse(clust.status_flag.has('assembly_fail'))
    shutil.rmtree(workdir)
def test_gene_covered_by_complete_contig_with_orf(self):
    '''test _gene_covered_by_complete_contig_with_orf'''
    def to_alignment(fields):
        # Build an alignment object from the tab-joined list of fields
        return pymummer.alignment.Alignment('\t'.join(fields))

    workdir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
    gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
    clust.gene = gene

    hit_1_to_39 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
    hit_1_to_20 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
    hit_21_to_39 = ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2']

    # Three parallel lists: entry i of each one belongs to test case i
    hits_per_case = [
        {'contig1': [to_alignment(hit_1_to_39)]},
        {'contig1': [to_alignment(hit_1_to_39)]},
        {'contig2': [to_alignment(hit_1_to_20)]},
        {'contig2': [to_alignment(hit_1_to_20), to_alignment(hit_21_to_39)]},
    ]
    wanted_per_case = [True, False, False, False]
    assembly_per_case = [
        {'contig1': gene},
        {'contig1': gene_no_orf},
        {'contig1': gene},
        {'contig1': gene, 'contig2': pyfastaq.sequences.Fasta('contig2', 'ACGT')},
    ]
    assert len(wanted_per_case) == len(hits_per_case) == len(assembly_per_case)

    for assembly, hits, wanted in zip(assembly_per_case, hits_per_case, wanted_per_case):
        clust.final_assembly = assembly
        clust.nucmer_hits = hits
        self.assertEqual(clust._gene_covered_by_complete_contig_with_orf(), wanted)

    clean_cluster_dir(workdir)
def test_get_samtools_variants(self):
    '''test _get_samtools_variants

    Points the cluster's final_assembly_vcf and final_assembly_read_depths
    at fixture files in data_dir, then checks the dictionary returned for a
    list of (sequence name, position) tuples.
    '''
    cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(cluster_dir)
    c = cluster.Cluster(cluster_dir, 'name')
    c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variants.vcf')
    c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_samtools_variants.read_depths.gz')
    positions = [
        ('16__cat_2_M35190.scaffold.1', 92),
        ('16__cat_2_M35190.scaffold.1', 179),
        ('16__cat_2_M35190.scaffold.1', 263),
        ('16__cat_2_M35190.scaffold.6', 93),
    ]
    # Expected result: sequence name -> position -> 4-tuple
    expected = {
        '16__cat_2_M35190.scaffold.1': {
            92: ('T', 'A', 123, '65,58'),
            179: ('A', 'T', 86, '41,45'),
            263: ('G', 'C', 97, '53,44'),
        },
        '16__cat_2_M35190.scaffold.6': {
            93: ('T', 'G', 99, '56,43'),
        },
    }
    got = c._get_samtools_variants(positions)
    self.assertEqual(expected, got)
    # Tidy the shared cluster directory afterwards, as the other tests that
    # use 'cluster_test_generic' do
    clean_cluster_dir(cluster_dir)
def test_nucmer_hits_to_ref_coords(self):
    '''test _nucmer_hits_to_ref_coords, over all contigs and restricted to one contig'''
    workdir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')

    hit_fields = [
        ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
        ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
        ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
    ]
    alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in hit_fields]
    # First two hits are on contig1, the last one on contig2
    clust.nucmer_hits = {
        'contig1': alignments[:2],
        'contig2': alignments[2:],
    }

    # No contig given
    coords_all = clust._nucmer_hits_to_ref_coords()
    wanted_all = [
        pyfastaq.intervals.Interval(0, 41),
        pyfastaq.intervals.Interval(99, 109),
        pyfastaq.intervals.Interval(99, 141),
    ]
    self.assertEqual(coords_all, wanted_all)

    # Restricted to contig2
    coords_contig2 = clust._nucmer_hits_to_ref_coords(contig='contig2')
    wanted_contig2 = [pyfastaq.intervals.Interval(99, 109)]
    self.assertEqual(coords_contig2, wanted_contig2)

    clean_cluster_dir(workdir)
def test_nucmer_hits_to_scaff_coords(self):
    '''test _nucmer_hits_to_scaff_coords'''
    workdir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')

    hit_fields = [
        ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
        ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
        ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
        ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff2'],
    ]
    alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in hit_fields]
    # First three hits are on scaff1, the last one on scaff2
    clust.nucmer_hits = {
        'scaff1': alignments[:3],
        'scaff2': alignments[3:],
    }

    scaff_coords = clust._nucmer_hits_to_scaff_coords()
    wanted = {
        'scaff1': [
            pyfastaq.intervals.Interval(0, 41),
            pyfastaq.intervals.Interval(49, 51),
        ],
        'scaff2': [
            pyfastaq.intervals.Interval(0, 41),
        ],
    }
    self.assertEqual(scaff_coords, wanted)
    clean_cluster_dir(workdir)
def test_full_run_assembly_fail(self):
    '''Complete cluster run when the assembly fails'''
    workdir = 'tmp.test_full_run_assembly_fail'
    ref_fasta = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.in.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.in.tsv')
    refs = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the test cluster directory
    shutil.rmtree(workdir, ignore_errors=True)
    source_dir = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail')
    shutil.copytree(source_dir, workdir)

    clust = cluster.Cluster(
        workdir,
        'cluster_name',
        refs,
        total_reads=4,
        total_reads_bases=304,
    )
    clust.run()

    # One report line is expected, with every column a '.' placeholder
    # apart from columns 5-7
    wanted_fields = ['.'] * 4 + ['64', '4', 'cluster_name'] + ['.'] * 24
    wanted_line = '\t'.join(wanted_fields)
    self.assertEqual([wanted_line], clust.report_lines)
    self.assertFalse(clust.status_flag.has('ref_seq_choose_fail'))
    self.assertTrue(clust.status_flag.has('assembly_fail'))
    shutil.rmtree(workdir)
def test_get_read_counts_fail(self):
    '''test _get_read_counts raises cluster.Error on the failing test data'''
    workdir = os.path.join(data_dir, 'cluster_test_get_read_counts_fail')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    self.assertRaises(cluster.Error, clust._get_read_counts)
    clean_cluster_dir(workdir)
def test_get_best_gene_by_alignment_score(self):
    '''test _get_best_gene_by_alignment_score returns the name '1' for the test data'''
    workdir = os.path.join(data_dir, 'cluster_test_get_best_gene_by_alignment_score')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    self.assertEqual(clust._get_best_gene_by_alignment_score(), '1')
    clean_cluster_dir(workdir)
def test_assemble_with_spades_fail(self):
    '''test _assemble_with_spades copes with spades failing'''
    workdir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    gene_fasta = os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa')
    shutil.copyfile(gene_fasta, clust.gene_fa)
    clust._assemble_with_spades()
    # After the failed assembly the status flag is expected to be 64
    flag_number = clust.status_flag.to_number()
    self.assertEqual(flag_number, 64)
    clean_cluster_dir(workdir)
def test_get_total_alignment_score(self):
    '''test _get_total_alignment_score gives 3000 for sequence '1' of the test data'''
    workdir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    self.assertEqual(clust._get_total_alignment_score('1'), 3000)
    clean_cluster_dir(workdir)
def test_gene_coverage_unique(self):
    '''test _gene_coverage_unique

    Expects True when there is one hit, on contig1, spanning positions 1-10
    of the 10-base gene, and False once a second hit on contig2 spanning
    positions 1-5 is added.
    '''
    cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(cluster_dir)
    c = cluster.Cluster(cluster_dir, 'name')
    c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
    hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
    hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']

    # Only the contig1 hit
    c.nucmer_hits = {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]}
    self.assertTrue(c._gene_coverage_unique())

    # Add the contig2 hit as well
    c.nucmer_hits['contig2'] = [pymummer.alignment.Alignment('\t'.join(hit2))]
    self.assertFalse(c._gene_coverage_unique())

    # Tidy the shared cluster directory afterwards, as the other tests that
    # use 'cluster_test_generic' do
    clean_cluster_dir(cluster_dir)
def test_fix_contig_orientation(self):
    '''test _fix_contig_orientation writes the expected final assembly fasta'''
    workdir = os.path.join(data_dir, 'cluster_test_fix_contig_orientation')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')

    # Put the input scaffolds and the gene in place
    input_scaffolds = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.in.fa')
    wanted_scaffolds = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.out.fa')
    shutil.copyfile(input_scaffolds, clust.gapfilled_scaffolds)
    gene_fasta = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.gene.fa')
    shutil.copyfile(gene_fasta, clust.gene_fa)

    clust._fix_contig_orientation()
    files_match = filecmp.cmp(wanted_scaffolds, clust.final_assembly_fa, shallow=False)
    self.assertTrue(files_match)
    clean_cluster_dir(workdir)
def test_scaffold_with_sspace(self):
    '''test _scaffold_with_sspace creates the scaffolds file'''
    workdir = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    contigs_fasta = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa')
    shutil.copyfile(contigs_fasta, clust.assembly_contigs)
    clust._scaffold_with_sspace()
    scaffolds_made = os.path.exists(clust.scaffolder_scaffolds)
    self.assertTrue(scaffolds_made)
    clean_cluster_dir(workdir)
def test_gap_fill_with_gapfiller_with_gaps(self):
    '''test _gap_fill_with_gapfiller, using scaffolds that have gaps'''
    workdir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    gapped_scaffolds = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa')
    shutil.copyfile(gapped_scaffolds, clust.scaffolder_scaffolds)
    clust.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
    clust._gap_fill_with_gapfiller()
    output_made = os.path.exists(clust.gapfilled_scaffolds)
    self.assertTrue(output_made)
    clean_cluster_dir(workdir)
def test_rename_scaffolds(self):
    '''test _rename_scaffolds writes the expected fasta file'''
    workdir = os.path.join(data_dir, 'cluster_test_rename_scaffolds')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    clust.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')

    scaffolds_in = os.path.join(data_dir, 'cluster_test_rename_scaffolds.in.fa')
    wanted_out = os.path.join(data_dir, 'cluster_test_rename_scaffolds.out.fa')
    renamed_out = 'tmp.fa'
    clust._rename_scaffolds(scaffolds_in, renamed_out)

    files_match = filecmp.cmp(wanted_out, renamed_out, shallow=False)
    self.assertTrue(files_match)
    os.unlink(renamed_out)
    clean_cluster_dir(workdir)
def test_get_samtools_variant_positions(self):
    '''test _get_samtools_variant_positions

    Points the cluster's final_assembly_vcf at a fixture VCF file in
    data_dir and checks the returned list of (sequence name, position)
    tuples.
    '''
    cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(cluster_dir)
    c = cluster.Cluster(cluster_dir, 'name')
    c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variant_positions.vcf')
    expected = [
        ('16__cat_2_M35190.scaffold.1', 92),
        ('16__cat_2_M35190.scaffold.1', 179),
        ('16__cat_2_M35190.scaffold.1', 263),
        ('16__cat_2_M35190.scaffold.6', 93),
    ]
    self.assertEqual(expected, c._get_samtools_variant_positions())
    # Tidy the shared cluster directory afterwards, as the other tests that
    # use 'cluster_test_generic' do
    clean_cluster_dir(cluster_dir)
def test_full_run_smtls_snp_varonly_nonc(self):
    '''test complete run where samtools calls a snp in a presence/absence noncoding sequence

    The cluster is run on a copy of the test data directory, with
    spades_other_options='--only-assembler', and a single report line is
    expected.
    '''
    fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
    tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
    # Remove anything left behind by an earlier failed run: copytree raises
    # if the destination directory already exists
    shutil.rmtree(tmpdir, ignore_errors=True)
    shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir)
    c = cluster.Cluster(
        tmpdir,
        'cluster_name',
        refdata,
        spades_other_options='--only-assembler',
        total_reads=148,
        total_reads_bases=13320,
    )
    c.run()
    expected = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
    ]
    self.assertEqual(expected, c.report_lines)
    shutil.rmtree(tmpdir)
def test_init_fail_files_missing(self):
    '''test init_fail_files_missing

    Constructing a Cluster must raise cluster.Error for each of the listed
    directories under data_dir.
    '''
    # Every entry needs its own trailing comma: adjacent string literals
    # without one are silently joined into a single name
    dirs = [
        'cluster_test_directorynotexist',
        'cluster_test_init_no_genes_fa',
        'cluster_test_init_no_reads_1',
        'cluster_test_init_no_reads_2',
    ]
    dirs = [os.path.join(data_dir, d) for d in dirs]
    for d in dirs:
        clean_cluster_dir(d)
        with self.assertRaises(cluster.Error):
            cluster.Cluster(d, 'name')
        clean_cluster_dir(d)
def test_filter_mummer_variants(self):
    '''test filter_mummer_variants'''
    def to_variant(snp_line):
        # Build a variant object from one tab-separated snp line
        return pymummer.variant.Variant(pymummer.snp.Snp(snp_line))

    workdir = os.path.join(data_dir, 'cluster_test_generic')
    clean_cluster_dir(workdir)
    clust = cluster.Cluster(workdir, 'name')
    clust.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')

    snp_lines = [
        '4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig',
        '6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig',
        '12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig',
    ]
    var_pos4, var_pos6, var_pos12 = [to_variant(line) for line in snp_lines]

    clust.mummer_variants = {'contig': [[var_pos4, var_pos6], var_pos12]}
    clust._filter_mummer_variants()
    # Only the first group is expected to remain after filtering
    wanted = {'contig': [[var_pos4, var_pos6]]}
    self.assertEqual(wanted, clust.mummer_variants)
    clean_cluster_dir(workdir)