def test_write_report_tsv(self):
    '''Test _write_report_tsv round-trips a report file unchanged'''
    input_path = os.path.join(data_dir, 'report_filter_test_write_report.tsv')
    output_path = 'tmp.test.report_filter.write_report.tsv'
    filterer = report_filter.ReportFilter(infile=input_path)
    filterer._write_report_tsv(output_path)
    # Writing back an unfiltered report should reproduce the input byte-for-byte.
    self.assertTrue(filecmp.cmp(output_path, input_path, shallow=False))
    os.unlink(output_path)
def test_run(self):
    '''Test run produces the expected filtered report'''
    input_path = os.path.join(data_dir, 'report_filter_test_run.in.tsv')
    expected_path = os.path.join(data_dir, 'report_filter_test_run.expected.tsv')
    output_path = 'tmp.test.report_filter.run.out.tsv'
    filterer = report_filter.ReportFilter(infile=input_path)
    filterer.run(output_path)
    self.assertTrue(filecmp.cmp(expected_path, output_path, shallow=False))
    os.unlink(output_path)
def test_filter_list_of_dicts_all_fail(self):
    '''Test _filter_list_of_dicts where all fail'''
    filterer = report_filter.ReportFilter()
    # Both lines fail the percent-identity filter (88.42 and 78.42 are too low).
    failing_lines = [
        'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t88.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
        'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
    ]
    records = [report_filter.ReportFilter._report_line_to_dict(line) for line in failing_lines]
    self.assertEqual([], filterer._filter_list_of_dicts(records))
def test_filter_list_of_dicts_with_pass(self):
    '''Test _filter_list_of_dicts with a line that passes'''
    filterer = report_filter.ReportFilter(ignore_not_has_known_variant=True)
    # Only the second line has has_known_var == 1 and good identity, so only it survives.
    lines = [
        'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
        'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC46T\t1\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C46T\tfree text',
        'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t500\t12.1\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text',
    ]
    records = [report_filter.ReportFilter._report_line_to_dict(line) for line in lines]
    got = filterer._filter_list_of_dicts(records)
    self.assertEqual([records[1]], got)
def test_filter_list_of_dicts_with_essential(self):
    '''Test _filter_list_of_dicts with an essential line but all others fail.

    When every row for a cluster fails the non-essential filters but at
    least one passes the essential ones, the filter should keep a single
    stripped-down row (variant columns replaced by dots).
    '''
    rf = report_filter.ReportFilter(ignore_not_has_known_variant=True)
    line1 = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
    line2 = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
    dict1 = report_filter.ReportFilter._report_line_to_dict(line1)
    dict2 = report_filter.ReportFilter._report_line_to_dict(line2)
    expected_line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t' + '\t'.join(['.'] * 17) + '\tfree text'
    expected = [report_filter.ReportFilter._report_line_to_dict(expected_line)]
    # Guard against _report_line_to_dict silently returning None for the
    # expected line. Was a bare `assert`, which is stripped under `python -O`
    # and gives no useful message; use the unittest assertion instead.
    self.assertNotEqual([None], expected)
    got = rf._filter_list_of_dicts([dict1, dict2])
    self.assertEqual(expected, got)
def test_filter_dicts(self):
    '''Test _filter_dicts'''
    filterer = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)

    def make_row(pc_ident, assembled, known, has_known, vtype, flag_num=27):
        # Build one report row with only the fields _filter_dicts inspects.
        return {
            'flag': flag.Flag(flag_num),
            'pc_ident': pc_ident,
            'ref_base_assembled': assembled,
            'known_var': known,
            'has_known_var': has_known,
            'var_type': vtype,
        }

    # A full-width row (every report column) that passes all filters.
    ref2_row = {column: '.' for column in report.columns}
    ref2_row['pc_ident'] = 91.0
    ref2_row['ref_base_assembled'] = 10
    ref2_row['has_known_var'] = '0'
    ref2_row['flag'] = flag.Flag(27)
    ref2_row['var_type'] = '.'

    filterer.report = {
        'ref1': {
            'ref1.scaff1': [
                make_row(91.0, 9, '1', '1', 'SNP'),   # fails: too few ref bases assembled
                make_row(91.5, 11, '1', '1', 'HET'),  # passes
                make_row(89.0, 10, '1', '1', 'SNP'),  # fails: identity too low
                make_row(90.0, 11, '1', '0', 'SNP'),  # fails: known var not found
                make_row(90.0, 11, '1', '1', 'SNP'),  # passes
            ]
        },
        'ref2': {'ref2.scaff1': [ref2_row]},
        'ref3': {
            'ref3.scaff1': [
                make_row(84.0, 10, '1', '0', 'SNP'),  # fails: identity too low
            ]
        },
        'ref4': {
            'ref4.scaff1': [
                make_row('.', '.', '.', '.', '.', flag_num=64),  # fails on flag
            ]
        },
    }

    expected = {
        'ref1': {
            'ref1.scaff1': [
                make_row(91.5, 11, '1', '1', 'HET'),
                make_row(90.0, 11, '1', '1', 'SNP'),
            ]
        },
        'ref2': {'ref2.scaff1': [ref2_row]},
    }

    filterer._filter_dicts()
    self.assertEqual(expected, filterer.report)
def test_flag_passes_filter(self):
    '''Test _flag_passes_filter'''
    filterer = report_filter.ReportFilter()
    excluded = ['assembly_fail', 'ref_seq_choose_fail']

    # An empty flag passes, and so does one with an unrelated bit set.
    test_flag = flag.Flag()
    self.assertTrue(filterer._flag_passes_filter(test_flag, excluded))
    test_flag.add('assembled')
    self.assertTrue(filterer._flag_passes_filter(test_flag, excluded))

    # Any flag carrying an excluded bit must be rejected.
    for bad_bit in excluded:
        failing_flag = flag.Flag()
        failing_flag.add(bad_bit)
        self.assertFalse(filterer._flag_passes_filter(failing_flag, excluded))
def test_report_dict_passes_essential_filters(self):
    '''Test _report_dict_passes_essential_filters'''
    passing_line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
    # Same as passing_line, but zero reference bases assembled.
    no_assembly_line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t0\t98.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
    # Same as passing_line, but percent identity below the cutoff.
    low_ident_line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t78.42\tcluster1.scaffold.1\t400\t12.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\tDescription_of_variant C42T\tfree text'
    cases = [
        (passing_line, True),
        (no_assembly_line, False),
        (low_ident_line, False),
    ]
    for line, expected in cases:
        record = report_filter.ReportFilter._report_line_to_dict(line)
        filterer = report_filter.ReportFilter()
        self.assertEqual(expected, filterer._report_dict_passes_essential_filters(record))
def test_report_dict_passes_non_essential_filters_synonymous(self):
    '''Test _report_dict_passes_non_essential_filters with synonymous AA changes'''
    # (ref_ctg_effect, remove_synonymous_snps option, expected pass/fail).
    # Only a SYN effect combined with the removal option should fail.
    cases = [
        ('.', True, True),
        ('.', False, True),
        ('SNP', True, True),
        ('SNP', False, True),
        ('SYN', True, False),
        ('SYN', False, True),
    ]
    for effect, remove_syn, expected in cases:
        record = {'known_var': '1', 'ref_ctg_effect': effect, 'has_known_var': '1'}
        filterer = report_filter.ReportFilter(remove_synonymous_snps=remove_syn)
        self.assertEqual(expected, filterer._report_dict_passes_non_essential_filters(record))
def test_init_good_file(self):
    '''test __init__ on good input file'''
    input_path = os.path.join(data_dir, 'report_filter_test_init_good.tsv')
    filterer = report_filter.ReportFilter(infile=input_path)

    # The four data lines expected to have been loaded from the input file.
    line1 = '\t'.join([
        'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1', '1000', '999',
        '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'C42T', '0', '.',
        '.', '42', '42', 'C', '142', '142', 'C', '500', 'C', '500', 'a:n:C42T:id1:foo', 'free_text',
    ])
    line2 = '\t'.join([
        'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1', '1000', '999',
        '99.42', 'cluster1.scaffold.1', '1300', '10.5', '1', 'SNP', 'n', 'A51G', '0', '.',
        '.', '51', '51', 'C', '151', '151', 'C', '542', 'C', '542', 'a:n:A51G:id2:bar', 'free_text2',
    ])
    line3 = '\t'.join([
        'ariba_cluster1', 'cluster1', '0', '0', '27', '10000', 'cluster1', '1000', '999',
        '99.42', 'cluster1.scaffold.2', '1300', '12.4', '1', 'SNP', 'n', 'A51G', '0', '.',
        '.', '51', '51', 'C', '151', '151', 'C', '542', 'C', '542', 'a:n:A51G:id3:spam', 'free_text3',
    ])
    line4 = '\t'.join([
        'ariba_cluster2', 'cluster2', '1', '0', '179', '20000', 'cluster2', '1042', '1042',
        '42.42', 'cluster2.scaffold.1', '1442', '20.2', '1', 'SNP', 'p', 'I42L', '1', 'I42L',
        'NONSYN', '112', '112', 'C', '442', '442', 'T', '300', 'T', '290', 'a:v:I42L:id4:eggs', 'free_text3',
    ])

    to_dict = report_filter.ReportFilter._report_line_to_dict
    expected = {
        'cluster1': {
            'cluster1.scaffold.1': [to_dict(line1), to_dict(line2)],
            'cluster1.scaffold.2': [to_dict(line3)],
        },
        'cluster2': {
            'cluster2.scaffold.1': [to_dict(line4)],
        },
    }
    self.assertEqual(expected, filterer.report)
def test_report_dict_passes_non_essential_filters_known_vars(self):
    '''Test _report_dict_passes_non_essential_filters with known vars'''
    # (known_var, has_known_var, ignore_not_has_known_variant option, expected).
    # The only failing case is a known variant that was not found, when the
    # ignore option is switched on.
    cases = [
        ('.', '.', True, True),
        ('.', '.', False, True),
        ('0', '0', True, True),
        ('0', '0', False, True),
        ('1', '0', True, False),
        ('1', '1', True, True),
        ('1', '0', False, True),
        ('1', '1', False, True),
    ]
    for known, has_known, ignore_missing, expected in cases:
        record = {'known_var': known, 'has_known_var': has_known}
        filterer = report_filter.ReportFilter(ignore_not_has_known_variant=ignore_missing)
        self.assertEqual(expected, filterer._report_dict_passes_non_essential_filters(record))
def _run(self):
    '''Run the whole clustering/assembly/reporting pipeline inside self.outdir.

    Maps and clusters reads, runs per-cluster assemblies (when enough proper
    read pairs exist to estimate insert size), writes the raw and filtered
    reports, fasta files of assembled sequences, MLST reports, and finally
    cleans up. Raises Error if any cluster fails.
    '''
    cwd = os.getcwd()
    try:
        # Work inside the output directory; restored in the finally block
        # even if anything below raises.
        os.chdir(self.outdir)
        self.write_versions_file(cwd)
        self._map_and_cluster_reads()
        self.log_files = None

        if len(self.cluster_to_dir) > 0:
            got_insert_data_ok = self._set_insert_size_data()
            if not got_insert_data_ok:
                # Too few proper pairs to estimate insert size, so local
                # assemblies cannot be run at all.
                print('WARNING: not enough proper read pairs (found ' + str(self.proper_pairs) + ') to determine insert size.', file=sys.stderr)
                print('This probably means that very few reads were mapped at all. No local assemblies will be run', file=sys.stderr)
                if self.verbose:
                    print('Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.', flush=True)
            else:
                if self.verbose:
                    print('{:_^79}'.format(' Assembling each cluster '))
                    print('Will run', self.threads, 'cluster(s) in parallel', flush=True)
                self._init_and_run_clusters()
                if self.verbose:
                    print('Finished assembling clusters\n')
        else:
            if self.verbose:
                print('No reads mapped. Skipping all assemblies', flush=True)
            print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)

        if not self.clusters_all_ran_ok:
            # Fixed: this message was previously split across a physical
            # newline inside the string literal, which is a syntax error.
            raise Error('At least one cluster failed! Stopping...')

        if self.verbose:
            print('{:_^79}'.format(' Writing reports '), flush=True)
            print('Making', self.report_file_all_tsv)
        self._write_report(self.clusters, self.report_file_all_tsv)

        if self.verbose:
            print('Making', self.report_file_filtered)
        rf = report_filter.ReportFilter(infile=self.report_file_all_tsv)
        rf.run(self.report_file_filtered)

        if self.verbose:
            print()
            print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True)
            print(self.catted_assembled_seqs_fasta, 'and', self.catted_genes_matching_refs_fasta, flush=True)
        self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta)
        self._write_catted_genes_matching_refs_fasta(self.catted_genes_matching_refs_fasta)
        self._write_catted_assemblies_fasta(self.catted_assemblies_fasta)

        if self.log_files is not None:
            # Combine the per-cluster logs into a single gzipped file.
            clusters_log_file = os.path.join(self.outdir, 'log.clusters.gz')
            if self.verbose:
                print()
                print('{:_^79}'.format(' Catting cluster log files '), flush=True)
                print('Writing file', clusters_log_file, flush=True)
            common.cat_files(self.log_files, clusters_log_file)

        if self.verbose:
            print()
            print('{:_^79}'.format(' Cleaning files '), flush=True)
        self._clean()

        Clusters._write_mlst_reports(self.mlst_profile_file, self.report_file_filtered, self.mlst_reports_prefix, verbose=self.verbose)

        if self.clusters_all_ran_ok and self.verbose:
            print('\nAll done!\n')
    finally:
        # Always return to the caller's original working directory.
        os.chdir(cwd)
def test_init_bad_file(self):
    '''test __init__ on bad input file'''
    bad_input = os.path.join(data_dir, 'report_filter_test_init_bad.tsv')
    # Loading a malformed report must raise the module's Error.
    with self.assertRaises(report_filter.Error):
        report_filter.ReportFilter(infile=bad_input)