Example #1
0
 def test_write_report_tsv(self):
     '''Test write_report_tsv

     Loads the fixture report, writes it back out with _write_report_tsv and
     checks that the written file is byte-for-byte identical to the input.
     '''
     infile = os.path.join(data_dir, 'report_filter_test_write_report.tsv')
     tmpfile = 'tmp.test.report_filter.write_report.tsv'
     rf = report_filter.ReportFilter(infile=infile)
     try:
         rf._write_report_tsv(tmpfile)
         self.assertTrue(filecmp.cmp(tmpfile, infile, shallow=False))
     finally:
         # Remove the scratch file even when the write or the comparison
         # fails, so a failing run does not leave it in the working directory.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Example #2
0
 def test_run(self):
     '''Test run

     Runs the filter on the input fixture and checks that the file it writes
     is byte-for-byte identical to the expected fixture.
     '''
     infile = os.path.join(data_dir, 'report_filter_test_run.in.tsv')
     expected_file = os.path.join(data_dir, 'report_filter_test_run.expected.tsv')
     tmpfile = 'tmp.test.report_filter.run.out.tsv'
     rf = report_filter.ReportFilter(infile=infile)
     try:
         rf.run(tmpfile)
         self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
     finally:
         # Remove the scratch file even when run() or the comparison fails,
         # so a failing run does not leave it in the working directory.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Example #3
0
 def test_filter_list_of_dicts_all_fail(self):
     '''_filter_list_of_dicts must return an empty list for these two rows'''
     checker = report_filter.ReportFilter()
     to_dict = report_filter.ReportFilter._report_line_to_dict
     # The two report lines differ only in the tenth column (88.42 vs 78.42).
     report_lines = (
         'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t88.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
         'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
     )
     rows = [to_dict(text) for text in report_lines]
     self.assertEqual([], checker._filter_list_of_dicts(rows))
Example #4
0
 def test_filter_list_of_dicts_with_pass(self):
     '''_filter_list_of_dicts must keep only the middle row of the three given'''
     checker = report_filter.ReportFilter(ignore_not_has_known_variant=True)
     to_dict = report_filter.ReportFilter._report_line_to_dict
     report_lines = (
         'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
         'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC46T\t1\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C46T\tfree text',
         'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
     )
     dropped_first, kept, dropped_last = [to_dict(text) for text in report_lines]
     result = checker._filter_list_of_dicts([dropped_first, kept, dropped_last])
     self.assertEqual([kept], result)
Example #5
0
 def test_filter_list_of_dicts_with_essential(self):
     '''Test _filter_list_of_dicts with an essential line but all others fail

     Neither input row is expected back unchanged. The single expected row
     copies the first 13 columns of line1, holds '.' in the following 17
     columns, and keeps the trailing free text.
     '''
     rf = report_filter.ReportFilter(ignore_not_has_known_variant=True)
     line1 = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
     line2 = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
     dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
     dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
     expected_line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t' + '\t'.join(['.'] * 17) + '\tfree text'
     expected = [report_filter.ReportFilter._report_line_to_dict(expected_line)]
     # Sanity check on the fixture itself. Uses a unittest assertion rather
     # than a bare assert so it still runs under `python -O` and reports
     # through the test runner.
     # NOTE(review): presumably _report_line_to_dict yields None for a line
     # it cannot parse -- confirm against its implementation.
     self.assertNotEqual([None], expected)
     got = rf._filter_list_of_dicts([dict1, dict2])
     self.assertEqual(expected, got)
Example #6
0
    def test_filter_dicts(self):
        '''Check the contents of rf.report after calling _filter_dicts

        A report covering four references is installed on the filter; after
        filtering only two rows of ref1 and the single row of ref2 are
        expected to remain (ref3 and ref4 are absent from the expected dict).
        '''
        def make_row(pc_ident, ref_base_assembled, has_known_var, var_type):
            # Fresh row (and fresh Flag) on every call, so the rows in the
            # input report and in the expected result are distinct objects.
            return {
                'flag': flag.Flag(27),
                'pc_ident': pc_ident,
                'ref_base_assembled': ref_base_assembled,
                'known_var': '1',
                'has_known_var': has_known_var,
                'var_type': var_type,
            }

        rf = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)

        # Row with every report column present; this same object is used in
        # both the input report and the expected result.
        ref_2_dict = dict.fromkeys(report.columns, '.')
        ref_2_dict.update({
            'pc_ident': 91.0,
            'ref_base_assembled': 10,
            'has_known_var': '0',
            'flag': flag.Flag(27),
            'var_type': '.',
        })

        ref1_rows = [
            make_row(91.0, 9, '1', 'SNP'),
            make_row(91.5, 11, '1', 'HET'),
            make_row(89.0, 10, '1', 'SNP'),
            make_row(90.0, 11, '0', 'SNP'),
            make_row(90.0, 11, '1', 'SNP'),
        ]
        ref3_rows = [make_row(84.0, 10, '0', 'SNP')]
        ref4_rows = [
            {'flag': flag.Flag(64), 'pc_ident': '.', 'ref_base_assembled': '.', 'known_var': '.', 'has_known_var': '.', 'var_type': '.'},
        ]
        rf.report = {
            'ref1': {'ref1.scaff1': ref1_rows},
            'ref2': {'ref2.scaff1': [ref_2_dict]},
            'ref3': {'ref3.scaff1': ref3_rows},
            'ref4': {'ref4.scaff1': ref4_rows},
        }

        expected = {
            'ref1': {
                'ref1.scaff1': [
                    make_row(91.5, 11, '1', 'HET'),
                    make_row(90.0, 11, '1', 'SNP'),
                ]
            },
            'ref2': {'ref2.scaff1': [ref_2_dict]},
        }

        rf._filter_dicts()
        self.assertEqual(expected, rf.report)
Example #7
0
 def test_flag_passes_filter(self):
     '''Check _flag_passes_filter against an exclusion list of two flag names'''
     checker = report_filter.ReportFilter()
     excluded = ['assembly_fail', 'ref_seq_choose_fail']

     # A default-constructed flag passes, and still passes after adding
     # 'assembled', which is not in the exclusion list.
     accepted = flag.Flag()
     self.assertTrue(checker._flag_passes_filter(accepted, excluded))
     accepted.add('assembled')
     self.assertTrue(checker._flag_passes_filter(accepted, excluded))

     # A fresh flag carrying either excluded name must be rejected.
     for bad_name in ('assembly_fail', 'ref_seq_choose_fail'):
         rejected = flag.Flag()
         rejected.add(bad_name)
         self.assertFalse(checker._flag_passes_filter(rejected, excluded))
Example #8
0
    def test_report_dict_passes_essential_filters(self):
        '''Check _report_dict_passes_essential_filters on three report lines

        Relative to the first line (expected True), the second has 0 in the
        ninth column and the third has 78.42 in the tenth column; both are
        expected to give False.
        '''
        to_dict = report_filter.ReportFilter._report_line_to_dict
        cases = (
            ('ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text', True),
            ('ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t0\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text', False),
            ('ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text', False),
        )
        # Parse every line up front, before any filter object is created.
        parsed_cases = [(to_dict(text), verdict) for text, verdict in cases]

        for row, verdict in parsed_cases:
            checker = report_filter.ReportFilter()
            outcome = checker._report_dict_passes_essential_filters(row)
            self.assertEqual(verdict, outcome)
Example #9
0
    def test_report_dict_passes_non_essential_filters_synonymous(self):
        '''Check _report_dict_passes_non_essential_filters for each ref_ctg_effect / remove_synonymous_snps pair'''
        # Each case: (ref_ctg_effect value, remove_synonymous_snps, expected result).
        # Only 'SYN' combined with remove_synonymous_snps=True expects False.
        cases = (
            ('.', True, True),
            ('.', False, True),
            ('SNP', True, True),
            ('SNP', False, True),
            ('SYN', True, False),
            ('SYN', False, True),
        )

        for effect, drop_synonymous, should_pass in cases:
            row = {'known_var': '1', 'ref_ctg_effect': effect, 'has_known_var': '1'}
            checker = report_filter.ReportFilter(remove_synonymous_snps=drop_synonymous)
            outcome = checker._report_dict_passes_non_essential_filters(row)
            self.assertEqual(should_pass, outcome)
Example #10
0
    def test_init_good_file(self):
        '''Loading the good fixture must populate rf.report as expected

        The expected report is a two-level dict (first key 'cluster1' /
        'cluster2', second key the scaffold name) whose values are lists of
        parsed report lines.
        '''
        def parse(*fields):
            # Join the columns with tabs and parse them as one report line.
            return report_filter.ReportFilter._report_line_to_dict('\t'.join(fields))

        fixture = os.path.join(data_dir, 'report_filter_test_init_good.tsv')
        rf = report_filter.ReportFilter(infile=fixture)

        cluster1_scaffold1 = [
            parse(
                'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1',
                '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1',
                'SNP', 'n', 'C42T', '0', '.', '.', '42', '42', 'C', '142', '142',
                'C', '500', 'C', '500', 'a:n:C42T:id1:foo', 'free_text',
            ),
            parse(
                'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1',
                '1000', '999', '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1',
                'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151',
                'C', '542', 'C', '542', 'a:n:A51G:id2:bar', 'free_text2',
            ),
        ]
        cluster1_scaffold2 = [
            parse(
                'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1',
                '1000', '999', '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1',
                'SNP', 'n', 'A51G', '0', '.', '.', '51', '51', 'C', '151', '151',
                'C', '542', 'C', '542', 'a:n:A51G:id3:spam', 'free_text3',
            ),
        ]
        cluster2_scaffold1 = [
            parse(
                'ariba_cluster2', 'cluster2', '1', '0', '179', '20000', 'cluster2',
                '1042', '1042', '42.42', 'cluster2.scaffold.1', '1442', '20.2',
                '1', 'SNP', 'p', 'I42L', '1', 'I42L', 'NONSYN', '112', '112', 'C',
                '442', '442', 'T', '300', 'T', '290', 'a:v:I42L:id4:eggs',
                'free_text3',
            ),
        ]

        expected = {
            'cluster1': {
                'cluster1.scaffold.1': cluster1_scaffold1,
                'cluster1.scaffold.2': cluster1_scaffold2,
            },
            'cluster2': {
                'cluster2.scaffold.1': cluster2_scaffold1,
            },
        }

        self.assertEqual(expected, rf.report)
Example #11
0
    def test_report_dict_passes_non_essential_filters_known_vars(self):
        '''Check _report_dict_passes_non_essential_filters for each known_var / has_known_var pair'''
        # Each case: (known_var, has_known_var, ignore_not_has_known_variant,
        # expected result). The only case expecting False is known_var '1',
        # has_known_var '0' with ignore_not_has_known_variant=True.
        cases = (
            ('.', '.', True, True),
            ('.', '.', False, True),
            ('0', '0', True, True),
            ('0', '0', False, True),
            ('1', '0', True, False),
            ('1', '1', True, True),
            ('1', '0', False, True),
            ('1', '1', False, True),
        )

        for known, has_known, ignore_missing, should_pass in cases:
            row = {'known_var': known, 'has_known_var': has_known}
            checker = report_filter.ReportFilter(ignore_not_has_known_variant=ignore_missing)
            outcome = checker._report_dict_passes_non_essential_filters(row)
            self.assertEqual(should_pass, outcome)
Example #12
0
    def _run(self):
        '''Run the whole pipeline from inside self.outdir.

        Steps, in order: write the versions file, map and cluster the reads,
        assemble the clusters (skipped with a warning when no reads mapped or
        the insert size could not be determined), write the full and filtered
        reports, write the concatenated fasta files, concatenate the cluster
        log files (if any), clean up, and write the MLST reports.

        The working directory is changed to self.outdir for the duration of
        the call and restored afterwards, even if an exception is raised.

        Raises:
            Error: if self.clusters_all_ran_ok is false after the assembly
                stage.
        '''
        # Remember where we started so the finally clause can return there.
        cwd = os.getcwd()
        try:
            os.chdir(self.outdir)
            self.write_versions_file(cwd)
            self._map_and_cluster_reads()
            # Reset here; checked again below before concatenating logs.
            # NOTE(review): presumably _init_and_run_clusters fills this in
            # -- confirm, since nothing in this method assigns it otherwise.
            self.log_files = None

            if len(self.cluster_to_dir) > 0:
                got_insert_data_ok = self._set_insert_size_data()
                if not got_insert_data_ok:
                    # Warn on stderr and skip the assembly step; execution
                    # continues with the reporting steps below.
                    print('WARNING: not enough proper read pairs (found ' +
                          str(self.proper_pairs) +
                          ') to determine insert size.',
                          file=sys.stderr)
                    print(
                        'This probably means that very few reads were mapped at all. No local assemblies will be run',
                        file=sys.stderr)
                    if self.verbose:
                        print(
                            'Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.',
                            flush=True)
                else:
                    if self.verbose:
                        print('{:_^79}'.format(' Assembling each cluster '))
                        print('Will run',
                              self.threads,
                              'cluster(s) in parallel',
                              flush=True)
                    self._init_and_run_clusters()
                    if self.verbose:
                        print('Finished assembling clusters\n')
            else:
                # No clusters at all: warn and continue to the reporting steps.
                if self.verbose:
                    print('No reads mapped. Skipping all assemblies',
                          flush=True)
                print(
                    'WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run',
                    file=sys.stderr)

            # Abort before writing any report if a cluster failed.
            if not self.clusters_all_ran_ok:
                raise Error('At least one cluster failed! Stopping...')

            if self.verbose:
                print('{:_^79}'.format(' Writing reports '), flush=True)
                print('Making', self.report_file_all_tsv)
            self._write_report(self.clusters, self.report_file_all_tsv)

            # The filtered report is derived from the full report just written.
            if self.verbose:
                print('Making', self.report_file_filtered)
            rf = report_filter.ReportFilter(infile=self.report_file_all_tsv)
            rf.run(self.report_file_filtered)

            if self.verbose:
                print()
                print(
                    '{:_^79}'.format(' Writing fasta of assembled sequences '),
                    flush=True)
                print(self.catted_assembled_seqs_fasta,
                      'and',
                      self.catted_genes_matching_refs_fasta,
                      flush=True)
            self._write_catted_assembled_seqs_fasta(
                self.catted_assembled_seqs_fasta)
            self._write_catted_genes_matching_refs_fasta(
                self.catted_genes_matching_refs_fasta)
            self._write_catted_assemblies_fasta(self.catted_assemblies_fasta)

            # Only concatenate logs when some were recorded since the reset above.
            if self.log_files is not None:
                clusters_log_file = os.path.join(self.outdir,
                                                 'log.clusters.gz')
                if self.verbose:
                    print()
                    print('{:_^79}'.format(' Catting cluster log files '),
                          flush=True)
                    print('Writing file', clusters_log_file, flush=True)
                common.cat_files(self.log_files, clusters_log_file)

            if self.verbose:
                print()
                print('{:_^79}'.format(' Cleaning files '), flush=True)
            self._clean()

            # MLST reports are built from the filtered report file.
            Clusters._write_mlst_reports(self.mlst_profile_file,
                                         self.report_file_filtered,
                                         self.mlst_reports_prefix,
                                         verbose=self.verbose)

            if self.clusters_all_ran_ok and self.verbose:
                print('\nAll done!\n')
        finally:
            # Always return to the caller's working directory.
            os.chdir(cwd)
Example #13
0
 def test_init_bad_file(self):
     '''Constructing a ReportFilter from the bad fixture must raise report_filter.Error'''
     bad_report = os.path.join(data_dir, 'report_filter_test_init_bad.tsv')
     self.assertRaises(report_filter.Error, report_filter.ReportFilter, infile=bad_report)