Example #1
0
 def test_set_assembly_kmer(self):
     '''test _set_assembly_kmer

     Checks the assembly_kmer attribute of a Cluster, first when the kmer
     is given to the constructor and then when it is left out.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_set_assembly_kmer')

     # Kmer passed explicitly: the same value is expected on the object.
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name', assembly_kmer=42)
     self.assertEqual(clust.assembly_kmer, 42)

     # Kmer not passed: this test data is expected to give a kmer of 5.
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     self.assertEqual(clust.assembly_kmer, 5)
     clean_cluster_dir(cluster_dir)
Example #2
0
    def test_full_run_multiple_vars_in_codon(self):
        '''Test complete run where there is a codon with a SNP and an indel'''
        # The reference fasta, metadata tsv and cluster directory all share
        # one path prefix under data_dir.
        prefix = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars')
        refdata = reference_data.ReferenceData([prefix + '.fa'],
                                               [prefix + '.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.cluster_test_full_run_multiple_vars'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=292, total_reads_bases=20900)
        clust.run()

        # Expected tab-separated report lines, in order.
        want = [
            'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1',
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #3
0
    def test_full_run_smtls_snp_varonly_gene(self):
        '''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does have variant'''
        # Fasta, tsv and cluster directory share one prefix under data_dir.
        prefix = os.path.join(data_dir,
                              'cluster_full_run_smtls_snp_varonly_gene')
        refdata = reference_data.ReferenceData([prefix + '.fa'],
                                               [prefix + '.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=148, total_reads_bases=13320)
        clust.run()

        # The snp is already a known one, so it should be reported on the
        # known-snp line only; no additional 'HET' line is expected.
        want = [
            'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #4
0
    def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
        '''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and sample has the var'''
        # Fasta, tsv and cluster directory share one prefix under data_dir.
        prefix = os.path.join(
            data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc')
        refdata = reference_data.ReferenceData([prefix + '.fa'],
                                               [prefix + '.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=148, total_reads_bases=13320)
        clust.run()

        # The snp is already a known one, so it should be reported on the
        # known-snp line only; no additional 'HET' line is expected.
        want = [
            'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #5
0
 def test_full_run_ok_gene_start_mismatch(self):
     '''test complete run where gene extended because too different at end for full nucmer match'''
     # Fasta, metadata tsv and cluster directory share one prefix.
     prefix = os.path.join(
         data_dir, 'cluster_test_full_run_ok_gene_start_mismatch')
     refdata = reference_data.ReferenceData([prefix + '.fa'],
                                            [prefix + '.metadata.tsv'])

     # Run against a fresh scratch copy of the cluster directory.
     workdir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch'
     shutil.rmtree(workdir, ignore_errors=True)
     shutil.copytree(prefix, workdir)

     clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                             total_reads=112, total_reads_bases=1080)
     clust.run()

     # A single report line with every variant column set to '.'.
     want = [
         'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
     ]
     self.assertEqual(want, clust.report_lines)
     shutil.rmtree(workdir)
Example #6
0
 def test_full_run_smtls_snp_varonly_gene_2(self):
     '''test complete run where samtools calls a snp in a variant only gene'''
     # The _2 suffix: test_full_run_smtls_snp_varonly_gene is thought to
     # cover the same functionality, but both tests are kept anyway.
     prefix = os.path.join(data_dir,
                           'cluster_full_run_smtls_snp_varonly_gene_2')
     refdata = reference_data.ReferenceData([prefix + '.fa'],
                                            [prefix + '.tsv'])

     # Run against a fresh scratch copy of the cluster directory.
     workdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
     shutil.rmtree(workdir, ignore_errors=True)
     shutil.copytree(prefix, workdir)

     clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                             total_reads=148, total_reads_bases=13320)
     clust.run()

     # A single 'HET' report line is expected.
     want = [
         'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
     ]
     self.assertEqual(want, clust.report_lines)
     shutil.rmtree(workdir)
Example #7
0
    def test_full_run_ok_variants_only_variant_is_present(self):
        '''test complete run of cluster on a variants only gene when variant is present'''
        # Fasta, metadata tsv and cluster directory share one prefix.
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ok_variants_only')
        refdata = reference_data.ReferenceData(
            [prefix + '.fa'], [prefix + '.present.metadata.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.cluster_test_full_run_ok_variants_only.present'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=66, total_reads_bases=3300)
        clust.run()

        # Expected tab-separated report lines, in order.
        want = [
            'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
            'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t71\t73\tGCG\t17;17;17\tG;C;G\t17;17;17\tvariants_only1:1:1:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #8
0
    def test_full_run_ok_presence_absence(self):
        '''test complete run of cluster on a presence absence gene'''
        # Fasta, metadata tsv and cluster directory share one prefix.
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ok_presence_absence')
        refdata = reference_data.ReferenceData([prefix + '.fa'],
                                               [prefix + '.metadata.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.cluster_test_full_run_ok_presence_absence'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=64, total_reads_bases=3200)
        clust.run()

        # Expected tab-separated report lines, in order.
        want = [
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t30\tGCG\t83\t85\tGTG\t22;22;21\tG;T;G\t22;22;21\tpresence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t52\t54\tATT\t107\t109\tATC\t31;31;32\tA;T;C\t31;31;32\t.\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t62\t64\tCGC\t18;17;17\tC;G;C\t18;17;17\tpresence_absence1:1:0:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t68\t70\tGCG\t18;20;20\tG;C;G\t18;20;20\tpresence_absence1:1:0:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #9
0
    def test_full_run_ok_non_coding(self):
        '''test complete run of cluster on a noncoding sequence'''
        # Fasta, metadata tsv and cluster directory share one prefix.
        prefix = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding')
        refdata = reference_data.ReferenceData([prefix + '.fa'],
                                               [prefix + '.metadata.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.test_full_run_ok_non_coding'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=72, total_reads_bases=3600)
        clust.run()

        # The report lines are long; show the complete diff on failure.
        self.maxDiff = None
        want = [
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t14\t14\tA\t74\t74\tT\t19\tT\t19\tnoncoding1:0:0:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t61\t61\tG\t121\t121\tT\t24\tT\t24\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t82\t82\tA\t143\t143\tC\t23\tC\t23\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t108\t108\tT\t168\t168\tC\t17\tC\t17\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\tG\t19\tnoncoding1:0:0:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\tG\t19\tnoncoding1:0:0:G9T:.:wild type in ref and reads\tgeneric description of noncoding1'
        ]
        self.assertEqual(want, clust.report_lines)
        shutil.rmtree(workdir)
Example #10
0
 def test_get_read_counts(self):
     '''test _get_read_counts pass'''
     cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     # This test directory is expected to give a read count of 2.
     got_count = clust._get_read_counts()
     self.assertEqual(2, got_count)
     clean_cluster_dir(cluster_dir)
Example #11
0
    def test_get_mummer_variants(self):
        '''test _get_mummer_variants

        Runs _get_mummer_variants twice on the same Cluster: once with a
        .snps file that yields no variants, and once with a .snps file whose
        variants are expected to be grouped per contig.
        '''
        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
        clean_cluster_dir(cluster_dir)
        c = cluster.Cluster(cluster_dir, 'name')
        # Both cases place their input at this path before the call.
        snps_dest = c.assembly_vs_gene_coords + '.snps'

        # Case 1: no variants are expected from this input.
        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.none.snps')
        shutil.copyfile(snp_file, snps_dest)
        c._get_mummer_variants()
        self.assertEqual(c.mummer_variants, {})

        # Case 2: variants on two contigs. The input only needs to be copied
        # into place once before the call.
        clean_cluster_dir(cluster_dir)
        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.snp.snps')
        v1 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
        v2 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2'))
        v3 = pymummer.variant.Variant(pymummer.snp.Snp('40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
        v4 = pymummer.variant.Variant(pymummer.snp.Snp('2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
        expected = {
            'contig1': [[v4], [v3, v1]],
            'contig2': [[v2]]
        }
        shutil.copyfile(snp_file, snps_dest)
        c._get_mummer_variants()
        self.assertEqual(c.mummer_variants, expected)
        clean_cluster_dir(cluster_dir)
Example #12
0
    def test_full_run_ref_not_in_cluster(self):
        '''test complete run of cluster when nearest ref is outside cluster'''
        # All inputs for this test share one path prefix under data_dir.
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ref_not_in_cluster')
        refdata = reference_data.ReferenceData([prefix + '.in.fa'],
                                               [prefix + '.in.tsv'])
        workdir = 'tmp.test_full_run_ref_not_in_cluster'
        all_refs_fa = prefix + '.all_refs.fa'

        # Run against a fresh scratch copy of the cluster directory.
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=72, total_reads_bases=3600,
                                all_ref_seqs_fasta=all_refs_fa)
        clust.run()

        # One placeholder report line: only the flag, read count and cluster
        # name columns carry values; every other column is '.'.
        fields = ['.'] * 4 + ['1024', '72', 'cluster_name'] + ['.'] * 24
        want = '\t'.join(fields)
        self.assertEqual([want], clust.report_lines)

        # Choosing the reference failed; the assembly did not.
        self.assertTrue(clust.status_flag.has('ref_seq_choose_fail'))
        self.assertFalse(clust.status_flag.has('assembly_fail'))
        shutil.rmtree(workdir)
Example #13
0
 def test_gene_covered_by_complete_contig_with_orf(self):
     '''test _gene_covered_by_complete_contig_with_orf

     Feeds four (final_assembly, nucmer_hits) combinations to the same
     Cluster and checks the boolean returned for each.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
     gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
     clust.gene = gene

     def to_alignment(fields):
         # Build an Alignment from the tab-joined fields of one hit.
         return pymummer.alignment.Alignment('\t'.join(fields))

     hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
     hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
     hit3 = ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2']

     # The three lists below are parallel: entry i of each forms case i.
     nucmer_hits = [
         {'contig1': [to_alignment(hit1)]},
         {'contig1': [to_alignment(hit1)]},
         {'contig2': [to_alignment(hit2)]},
         {'contig2': [to_alignment(hit2), to_alignment(hit3)]},
     ]
     expected = [True, False, False, False]
     assemblies = [
         {'contig1': gene},
         {'contig1': gene_no_orf},
         {'contig1': gene},
         {'contig1': gene, 'contig2': pyfastaq.sequences.Fasta('contig2', 'ACGT')}
     ]
     assert len(expected) == len(nucmer_hits) == len(assemblies)

     for assembly, hits, want in zip(assemblies, nucmer_hits, expected):
         clust.final_assembly = assembly
         clust.nucmer_hits = hits
         self.assertEqual(clust._gene_covered_by_complete_contig_with_orf(), want)
     clean_cluster_dir(cluster_dir)
Example #14
0
    def test_get_samtools_variants(self):
        '''test _get_samtools_variants

        Points the Cluster at a fixed VCF and read-depths file, asks for the
        variants at four (sequence name, position) pairs and checks the
        returned dictionary of sequence name -> position -> tuple.
        '''
        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
        clean_cluster_dir(cluster_dir)
        c = cluster.Cluster(cluster_dir, 'name')
        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variants.vcf')
        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_samtools_variants.read_depths.gz')
        positions = [
            ('16__cat_2_M35190.scaffold.1', 92),
            ('16__cat_2_M35190.scaffold.1', 179),
            ('16__cat_2_M35190.scaffold.1', 263),
            ('16__cat_2_M35190.scaffold.6', 93)
        ]
        expected = {
            '16__cat_2_M35190.scaffold.1': {
                92: ('T', 'A', 123, '65,58'),
                179: ('A', 'T', 86, '41,45'),
                263: ('G', 'C', 97, '53,44'),
            },
            '16__cat_2_M35190.scaffold.6': {
                93: ('T', 'G', 99, '56,43')
            }
        }

        got = c._get_samtools_variants(positions)
        self.assertEqual(expected, got)
        # Clean up on the way out, as the other tests sharing this
        # directory do.
        clean_cluster_dir(cluster_dir)
Example #15
0
    def test_nucmer_hits_to_ref_coords(self):
        '''test _nucmer_hits_to_ref_coords

        Checks the intervals returned for all contigs and for one contig
        selected with the contig argument.
        '''
        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
        clean_cluster_dir(cluster_dir)
        clust = cluster.Cluster(cluster_dir, 'name')

        # Two hits on contig1 and one on contig2, as tab-joined fields.
        hit_fields = [
            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
        ]
        alignments = [
            pymummer.alignment.Alignment('\t'.join(fields))
            for fields in hit_fields
        ]
        clust.nucmer_hits = {
            'contig1': [alignments[0], alignments[1]],
            'contig2': [alignments[2]],
        }

        # No contig given: intervals from both contigs are expected.
        want_all = [
            pyfastaq.intervals.Interval(0, 41),
            pyfastaq.intervals.Interval(99, 109),
            pyfastaq.intervals.Interval(99, 141),
        ]
        self.assertEqual(clust._nucmer_hits_to_ref_coords(), want_all)

        # Restricted to contig2: only its single interval is expected.
        want_contig2 = [pyfastaq.intervals.Interval(99, 109)]
        self.assertEqual(clust._nucmer_hits_to_ref_coords(contig='contig2'),
                         want_contig2)
        clean_cluster_dir(cluster_dir)
Example #16
0
 def test_nucmer_hits_to_scaff_coords(self):
     '''test _nucmer_hits_to_scaff_coords

     Checks the per-scaffold intervals returned for three hits on scaff1
     and one hit on scaff2.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     hit_fields = [
         ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
         ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
         ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
         ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff2'],
     ]
     alignments = [
         pymummer.alignment.Alignment('\t'.join(fields))
         for fields in hit_fields
     ]
     clust.nucmer_hits = {
         'scaff1': alignments[:3],
         'scaff2': alignments[3:],
     }

     # The first two scaff1 hits are expected to come back as the single
     # interval 0-41.
     want = {
         'scaff1': [
             pyfastaq.intervals.Interval(0, 41),
             pyfastaq.intervals.Interval(49, 51),
         ],
         'scaff2': [
             pyfastaq.intervals.Interval(0, 41),
         ],
     }
     self.assertEqual(clust._nucmer_hits_to_scaff_coords(), want)
     clean_cluster_dir(cluster_dir)
Example #17
0
    def test_full_run_assembly_fail(self):
        '''test complete run of cluster when assembly fails'''
        # All inputs for this test share one path prefix under data_dir.
        prefix = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail')
        refdata = reference_data.ReferenceData([prefix + '.in.fa'],
                                               [prefix + '.in.tsv'])

        # Run against a fresh scratch copy of the cluster directory.
        workdir = 'tmp.test_full_run_assembly_fail'
        shutil.rmtree(workdir, ignore_errors=True)
        shutil.copytree(prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', refdata,
                                total_reads=4, total_reads_bases=304)
        clust.run()

        # One placeholder report line: only the flag, read count and cluster
        # name columns carry values; every other column is '.'.
        fields = ['.'] * 4 + ['64', '4', 'cluster_name'] + ['.'] * 24
        want = '\t'.join(fields)
        self.assertEqual([want], clust.report_lines)

        # The assembly failed; choosing the reference did not.
        self.assertFalse(clust.status_flag.has('ref_seq_choose_fail'))
        self.assertTrue(clust.status_flag.has('assembly_fail'))
        shutil.rmtree(workdir)
Example #18
0
 def test_get_read_counts_fail(self):
     '''test _get_read_counts fail'''
     cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts_fail')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     # Counting reads in this test directory is expected to raise.
     self.assertRaises(cluster.Error, clust._get_read_counts)
     clean_cluster_dir(cluster_dir)
Example #19
0
 def test_get_best_gene_by_alignment_score(self):
     '''test _get_best_gene_by_alignment_score'''
     cluster_dir = os.path.join(
         data_dir, 'cluster_test_get_best_gene_by_alignment_score')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     # For this test directory the name '1' is expected back.
     self.assertEqual(clust._get_best_gene_by_alignment_score(), '1')
     clean_cluster_dir(cluster_dir)
Example #20
0
 def test_assemble_with_spades_fail(self):
     '''test _assemble_with_spades handles spades fail'''
     cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     # Put the gene fasta where the Cluster expects it, then assemble.
     gene_fasta = os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa')
     shutil.copyfile(gene_fasta, clust.gene_fa)
     clust._assemble_with_spades()

     # The failure is expected to show up as status flag number 64.
     flag_number = clust.status_flag.to_number()
     self.assertEqual(flag_number, 64)
     clean_cluster_dir(cluster_dir)
Example #21
0
 def test_get_total_alignment_score(self):
     '''test _get_total_alignment_score'''
     cluster_dir = os.path.join(
         data_dir, 'cluster_test_get_total_alignment_score')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     # The score for the name '1' in this test directory should be 3000.
     self.assertEqual(clust._get_total_alignment_score('1'), 3000)
     clean_cluster_dir(cluster_dir)
Example #22
0
 def test_gene_coverage_unique(self):
     '''test _gene_coverage_unique

     A single hit covering the gene should count as unique coverage;
     adding a second hit from another contig over the same region of the
     gene should not.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
     clean_cluster_dir(cluster_dir)
     c = cluster.Cluster(cluster_dir, 'name')
     c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
     hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
     hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
     c.nucmer_hits = {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]}
     self.assertTrue(c._gene_coverage_unique())
     c.nucmer_hits['contig2'] = [pymummer.alignment.Alignment('\t'.join(hit2))]
     self.assertFalse(c._gene_coverage_unique())
     # Clean up on the way out, as the other tests sharing this directory do.
     clean_cluster_dir(cluster_dir)
Example #23
0
 def test_fix_contig_orientation(self):
     '''test _fix_contig_orientation

     Compares the final assembly fasta written by the call against a
     stored expected fasta file.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_fix_contig_orientation')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     prefix = os.path.join(data_dir, 'cluster_test_fix_contig_orientation')
     scaffolds_in = prefix + '.in.fa'
     want_fasta = prefix + '.out.fa'

     # Put the input scaffolds and gene where the Cluster expects them.
     shutil.copyfile(scaffolds_in, clust.gapfilled_scaffolds)
     shutil.copyfile(prefix + '.gene.fa', clust.gene_fa)
     clust._fix_contig_orientation()

     # shallow=False forces a byte-by-byte content comparison.
     same = filecmp.cmp(want_fasta, clust.final_assembly_fa, shallow=False)
     self.assertTrue(same)
     clean_cluster_dir(cluster_dir)
Example #24
0
 def test_scaffold_with_sspace(self):
     '''test _scaffold_with_sspace

     Only checks that the scaffolds file exists after the call.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     # Put the contigs where the Cluster expects them before scaffolding.
     contigs_fasta = os.path.join(
         data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa')
     shutil.copyfile(contigs_fasta, clust.assembly_contigs)

     clust._scaffold_with_sspace()
     made_scaffolds = os.path.exists(clust.scaffolder_scaffolds)
     self.assertTrue(made_scaffolds)
     clean_cluster_dir(cluster_dir)
Example #25
0
 def test_gap_fill_with_gapfiller_with_gaps(self):
     '''test _gap_fill_with_gapfiller with gaps

     Only checks that the gap-filled scaffolds file exists after the call.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')

     # Put the gapped scaffolds where the Cluster expects them.
     gapped_fasta = os.path.join(
         data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa')
     shutil.copyfile(gapped_fasta, clust.scaffolder_scaffolds)

     clust.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
     clust._gap_fill_with_gapfiller()
     made_gapfilled = os.path.exists(clust.gapfilled_scaffolds)
     self.assertTrue(made_gapfilled)
     clean_cluster_dir(cluster_dir)
Example #26
0
 def test_rename_scaffolds(self):
     '''test _rename_scaffolds

     Renames an input fasta into a temporary file and compares that file
     against a stored expected fasta.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_rename_scaffolds')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     clust.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')

     prefix = os.path.join(data_dir, 'cluster_test_rename_scaffolds')
     fasta_in = prefix + '.in.fa'
     want_fasta = prefix + '.out.fa'
     got_fasta = 'tmp.fa'

     clust._rename_scaffolds(fasta_in, got_fasta)
     # shallow=False forces a byte-by-byte content comparison.
     same = filecmp.cmp(want_fasta, got_fasta, shallow=False)
     self.assertTrue(same)
     os.unlink(got_fasta)
     clean_cluster_dir(cluster_dir)
Example #27
0
 def test_get_samtools_variant_positions(self):
     '''test _get_samtools_variant_positions

     Points the Cluster at a fixed VCF and checks the list of
     (sequence name, position) tuples returned.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
     clean_cluster_dir(cluster_dir)
     c = cluster.Cluster(cluster_dir, 'name')
     c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variant_positions.vcf')
     expected = [
         ('16__cat_2_M35190.scaffold.1', 92),
         ('16__cat_2_M35190.scaffold.1', 179),
         ('16__cat_2_M35190.scaffold.1', 263),
         ('16__cat_2_M35190.scaffold.6', 93)
     ]
     self.assertEqual(expected, c._get_samtools_variant_positions())
     # Clean up on the way out, as the other tests sharing this directory do.
     clean_cluster_dir(cluster_dir)
Example #28
0
 def test_full_run_smtls_snp_varonly_nonc(self):
     '''test complete run where samtools calls a snp in a presence/absence noncoding sequence'''
     fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
     tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
     refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
     tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
     # Remove any directory left by an earlier aborted run: copytree raises
     # if its destination already exists.
     shutil.rmtree(tmpdir, ignore_errors=True)
     shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir)
     c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
     c.run()
     # A single 'HET' report line is expected.
     expected = [
         'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
     ]
     self.assertEqual(expected, c.report_lines)
     shutil.rmtree(tmpdir)
Example #29
0
 def test_init_fail_files_missing(self):
     '''test init_fail_files_missing

     Constructing a Cluster on each of the listed directories must raise
     cluster.Error.
     '''
     # Every entry needs its own trailing comma: adjacent string literals
     # are silently concatenated into one (wrong) directory name.
     dirs = [
         'cluster_test_directorynotexist',
         'cluster_test_init_no_genes_fa',
         'cluster_test_init_no_reads_1',
         'cluster_test_init_no_reads_2',
     ]
     dirs = [os.path.join(data_dir, d) for d in dirs]
     for d in dirs:
         clean_cluster_dir(d)
         with self.assertRaises(cluster.Error):
             cluster.Cluster(d, 'name')
         clean_cluster_dir(d)
Example #30
0
 def test_filter_mummer_variants(self):
     '''test filter_mummer_variants

     After filtering, only the first variant group of the contig is
     expected to remain.
     '''
     cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
     clean_cluster_dir(cluster_dir)
     clust = cluster.Cluster(cluster_dir, 'name')
     clust.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')

     def to_variant(snp_line):
         # Build a Variant from one tab-separated snp line.
         return pymummer.variant.Variant(pymummer.snp.Snp(snp_line))

     v1 = to_variant('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig')
     v2 = to_variant('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig')
     v3 = to_variant('12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig')

     # NOTE(review): v3 is a bare Variant here, not a one-element list like
     # the [v1, v2] group - confirm this is intended.
     clust.mummer_variants = {'contig': [[v1, v2], v3]}
     clust._filter_mummer_variants()

     want = {'contig': [[v1, v2]]}
     self.assertEqual(want, clust.mummer_variants)
     clean_cluster_dir(cluster_dir)