コード例 #1
0
    def test_load_ref_data_from_dir(self):
        '''Check Clusters._load_reference_data_from_dir on a known directory.

        Compares the loaded sequences, genetic code, metadata and cluster
        membership with hand-written expectations.
        '''
        ref_dir = os.path.join(data_dir, 'clusters_load_ref_data_from_dir')
        refdata, cluster_members = \
            clusters.Clusters._load_reference_data_from_dir(ref_dir)

        # Sequences, keyed by name, plus the genetic code.
        want_seqs = {
            name: pyfastaq.sequences.Fasta(name, bases)
            for name, bases in [
                ('variants_only1', 'atggcgtgcgatgaataa'),
                ('presabs1', 'atgatgatgagcccggcgatggaaggcggctag'),
                ('noncoding1', 'ACGTA'),
            ]
        }
        self.assertEqual(want_seqs, refdata.sequences)
        self.assertEqual(11, refdata.genetic_code)

        # One metadata record is expected per reference sequence.
        presabs_meta = sequence_metadata.SequenceMetadata(
            'presabs1\t1\t0\t.\t.\tpresabs1 description')
        variants_only_meta = sequence_metadata.SequenceMetadata(
            'variants_only1\t1\t1\tC2I\t.\tdescription of variants_only1 C2I')
        noncoding_meta = sequence_metadata.SequenceMetadata(
            'noncoding1\t0\t0\t.\t.\t.')

        want_metadata = {
            'presabs1': {
                'seq_type': 'p',
                'variant_only': False,
                '.': {presabs_meta},
                'n': {},
                'p': {},
            },
            'variants_only1': {
                'seq_type': 'p',
                'variant_only': True,
                '.': set(),
                'n': {},
                'p': {1: {variants_only_meta}},
            },
            'noncoding1': {
                'seq_type': 'n',
                'variant_only': False,
                '.': {noncoding_meta},
                'n': {},
                'p': {},
            },
        }
        self.assertEqual(want_metadata, refdata.metadata)

        # Each sequence is expected in its own cluster.
        want_clusters = dict([
            ('0', {'presabs1'}),
            ('1', {'variants_only1'}),
            ('2', {'noncoding1'}),
        ])
        self.assertEqual(want_clusters, cluster_members)
コード例 #2
0
    def test_init_ok(self):
        '''ReferenceData construction succeeds on well-formed input.

        Checked first without, then with, the optional rename and
        parameters files.
        '''
        def protein_entry(position, meta):
            # Expected metadata for a protein sequence with one variant.
            return {
                'seq_type': 'p',
                'variant_only': False,
                'n': {},
                'p': {position: {meta}},
                '.': set(),
            }

        fasta_path = os.path.join(data_dir, 'reference_data_init_ok.in.fa')
        tsv_path = os.path.join(data_dir, 'reference_data_init_ok.in.tsv')
        gene1_meta = sequence_metadata.SequenceMetadata(
            'gene1\t1\t0\tR2S\t.\tconfers killer rabbit resistance')
        gene2_meta = sequence_metadata.SequenceMetadata(
            "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability"
        )
        want_metadata = {
            'gene1': protein_entry(1, gene1_meta),
            'gene2': protein_entry(41, gene2_meta),
        }

        # Without the optional files: the rename map and extra parameters
        # are expected to be empty.
        plain = reference_data.ReferenceData([fasta_path], [tsv_path])
        self.assertEqual(want_metadata, plain.metadata)

        want_seqs = {
            name: pyfastaq.sequences.Fasta(name, bases)
            for name, bases in [
                ('gene1', 'CATCGTCGTCTATCGTCGTCCTAG'),
                ('gene2', 'AAAAACCCCGGGGTTTT'),
            ]
        }
        self.assertEqual(want_seqs, plain.sequences)
        self.assertEqual({}, plain.ariba_to_original_name)
        self.assertEqual({}, plain.extra_parameters)

        # With the optional rename and parameters files.
        with_extras = reference_data.ReferenceData(
            [fasta_path], [tsv_path],
            rename_file=os.path.join(
                data_dir, 'reference_data_init_ok.rename.tsv'),
            parameters_file=os.path.join(
                data_dir, 'reference_data_init_ok.params.json'))
        self.assertEqual(
            {'gene1': 'original_gene1', 'gene2': 'original_gene2'},
            with_extras.ariba_to_original_name)
        self.assertEqual({'foo': 'bar', 'spam': 'eggs'},
                         with_extras.extra_parameters)
コード例 #3
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_get_variants_variants_only(self):
        '''test get_variants variants only

        Writes three metadata records to a temporary TSV file, loads them
        into a ReferenceData together with the test FASTA, and compares the
        output of AssemblyVariants.get_variants for two contigs with the
        expected tuples.
        '''
        meta1 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)'
        )
        meta2 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)'
        )
        meta3 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)'
        )

        fasta_in = os.path.join(
            data_dir, 'assembly_variants_test_get_variants_variants_only.fa')
        metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
        try:
            with open(metadata_tsv, 'w') as f:
                for meta in (meta1, meta2, meta3):
                    print(meta, file=f)

            refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
        finally:
            # Always remove the temporary file, even when writing or loading
            # fails, so a failing run does not leave it in the working
            # directory.
            if os.path.exists(metadata_tsv):
                os.unlink(metadata_tsv)

        nucmer_snp_file = os.path.join(
            data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
        v2 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
        v3 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))

        ctg_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 41)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }

        ref_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 41)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }
        expected = {
            'contig1': [
                (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
                (None, 'p', None, None, None, {meta1}, set()),
                (None, 'p', None, None, None, {meta3}, set()),
            ],
            'contig2': [(None, 'p', None, None, None, {meta3}, set())],
        }

        a_variants = assembly_variants.AssemblyVariants(
            refdata, nucmer_snp_file)
        got = a_variants.get_variants('variants_only', ctg_nucmer_coords,
                                      ref_nucmer_coords)
        self.assertEqual(expected, got)
コード例 #4
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_one_var_one_ctg_noncdg(self):
        '''test _get_one_variant_for_one_contig_non_coding

        Runs the method once per nucmer variant and checks both the returned
        tuple and the returned set of used reference variants.
        '''
        fasta_in = os.path.join(data_dir,
                                'assembly_variants_one_var_one_ctg_noncdg.fa')
        tsv_in = os.path.join(data_dir,
                              'assembly_variants_one_var_one_ctg_noncdg.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        ref_sequence_name = 'non_coding'
        refdata_var_dict = refdata.metadata[ref_sequence_name]

        v0 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

        # ref has A at position 3, which is variant type. This gives contig the wild type C. Shouldn't report
        v1 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

        # ref has T at position 5, which is wild type. This gives contig variant type A. Should report
        v2 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

        meta0 = sequence_metadata.SequenceMetadata(
            'non_coding\t0\t0\tC3A\tid1\tref has variant type A')
        meta2 = sequence_metadata.SequenceMetadata(
            'non_coding\t0\t0\tT5A\tid1\tref has wild type T')

        mummer_variants = [v0, v1, v2]

        expected_tuples = [
            (1, 'n', 'T2A', 'SNP', [v0], set(), set()),  #0
            None,  #1
            (4, 'n', 'T5A', 'SNP', [v2], {meta2}, set()),  #2
        ]

        expected_used_variants = [
            set(),  #0
            {meta0},  #1
            {meta2},  #2
        ]

        # Sanity-check the fixtures with unittest assertions rather than a
        # bare assert, which is stripped when Python runs with -O.
        self.assertEqual(len(mummer_variants), len(expected_tuples))
        self.assertEqual(len(mummer_variants), len(expected_used_variants))

        cases = zip(mummer_variants, expected_tuples, expected_used_variants)
        for mummer_variant, expected_tuple, expected_used in cases:
            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding(
                refdata_var_dict, mummer_variant)
            self.assertEqual(expected_tuple, got_tuple)
            self.assertEqual(expected_used, got_used_variants)
コード例 #5
0
    def _load_metadata_tsv(cls, filename, metadata_dict):
        '''Load one metadata TSV file into metadata_dict, modifying it in place.

        Each line is parsed with sequence_metadata.SequenceMetadata. A line
        that fails to parse is reported on stderr and skipped.

        filename: file to read. If None, nothing is read and a new empty
            dict is returned (metadata_dict is left untouched).
        metadata_dict: dict keyed by sequence name. Each value holds
            'seq_type', 'variant_only', 'n' and 'p' (each mapping variant
            position -> set of metadata), and '.' (set of metadata that has
            no variant).

        Returns metadata_dict, or {} when filename is None.
        Raises Error if a line's seq_type or variant_only disagrees with what
        is already stored for the same sequence name.
        '''
        if filename is None:
            return {}

        f = pyfastaq.utils.open_file_read(filename)

        try:
            for line in f:
                try:
                    metadata = sequence_metadata.SequenceMetadata(line)
                except Exception:
                    # Deliberately best-effort: a bad line is reported and
                    # skipped. Exception (not a bare except) so that
                    # KeyboardInterrupt and SystemExit still propagate.
                    print('Problem with this line of metadata, which will be ignored:', line.rstrip(), file=sys.stderr)
                    continue

                if metadata.name not in metadata_dict:
                    metadata_dict[metadata.name] = {
                        'seq_type': metadata.seq_type,
                        'variant_only': metadata.variant_only,
                        'n': {},
                        'p': {},
                        '.': set()
                    }
                else:
                    existing = metadata_dict[metadata.name]
                    if metadata.seq_type != existing['seq_type'] or metadata.variant_only != existing['variant_only']:
                        raise Error('Inconsistent metadata for sequence ' + metadata.name + '. Cannot continue')

                entry = metadata_dict[metadata.name]
                if metadata.variant is None:
                    entry['.'].add(metadata)
                else:
                    # Group variants by position under this record's seq_type.
                    entry[metadata.seq_type].setdefault(metadata.variant.position, set()).add(metadata)
        finally:
            # Close the handle on the Error path too, not just on success.
            pyfastaq.utils.close(f)

        return metadata_dict
コード例 #6
0
    def test_all_non_wild_type_variants(self):
        '''Check all_non_wild_type_variants for each kind of sequence.

        Covers a variant-only gene, a presence/absence gene, a non-coding
        sequence and a name that is not in the reference data.
        '''
        def parse(line):
            # Shorthand for building one expected metadata record.
            return sequence_metadata.SequenceMetadata(line)

        refdata = reference_data.ReferenceData(
            [os.path.join(
                data_dir,
                'reference_data_test_all_non_wild_type_variants.ref.fa')],
            [os.path.join(
                data_dir,
                'reference_data_test_all_non_wild_type_variants.tsv')])

        var_p3q = parse('var_only_gene\t1\t1\tP3Q\t.\tref has wild type P')
        var_g4i = parse('var_only_gene\t1\t1\tG4I\t.\tref has wild type F')
        var_i5v = parse(
            'var_only_gene\t1\t1\tI5V\t.\tref has variant V instead of I')
        var_f6i = parse('var_only_gene\t1\t1\tF6I\t.\tref has wild type F')
        pres_n2i = parse(
            'presence_absence_gene\t1\t0\tN2I\t.\tref has wild type N')
        pres_a4g = parse(
            'presence_absence_gene\t1\t0\tA4G\t.\tref has variant G instead of A'
        )
        nonc_a2c = parse('non_coding\t0\t0\tA2C\t.\tref has wild type A')
        nonc_c4t = parse(
            'non_coding\t0\t0\tC4T\t.\tref has variant T instead of C')

        # (sequence name, expected result), checked in this order.
        cases = [
            ('var_only_gene', {
                'n': {},
                'p': {2: {var_p3q}, 3: {var_g4i}, 4: {var_i5v}, 5: {var_f6i}},
            }),
            ('presence_absence_gene', {
                'n': {},
                'p': {1: {pres_n2i}, 3: {pres_a4g}},
            }),
            ('non_coding', {
                'n': {1: {nonc_a2c}, 3: {nonc_c4t}},
                'p': {},
            }),
            ('not_a_known_sequence', {'n': {}, 'p': {}}),
        ]

        for seq_name, want in cases:
            self.assertEqual(want,
                             refdata.all_non_wild_type_variants(seq_name))
コード例 #7
0
    def test_rename_metadata_set(self):
        '''_rename_metadata_set gives every record in the set the new name.'''
        original_lines = [
            'foo 1\t1\t0\t.\t.\tdescription',
            'foo 1\t1\t0\tI42L\t.\tspam eggs',
        ]
        renamed_lines = [
            'new_name\t1\t0\t.\t.\tdescription',
            'new_name\t1\t0\tI42L\t.\tspam eggs',
        ]

        to_rename = {
            sequence_metadata.SequenceMetadata(x) for x in original_lines
        }
        want = {
            sequence_metadata.SequenceMetadata(x) for x in renamed_lines
        }

        renamed = reference_data.ReferenceData._rename_metadata_set(
            to_rename, 'new_name')
        self.assertEqual(want, renamed)
コード例 #8
0
 def test_load_input_check_seq_names_ok(self):
     '''_load_input_files_and_check_seq_names accepts consistent input.

     Two FASTA files and two metadata files are loaded; the merged
     sequences and metadata are compared with the expected dicts.
     '''
     suffixes = ['1', '2']
     fasta_paths = []
     for suffix in suffixes:
         fasta_paths.append(os.path.join(
             data_dir,
             'reference_data_load_input_check_seq_names.good.fa.' + suffix))
     metadata_paths = []
     for suffix in suffixes:
         metadata_paths.append(os.path.join(
             data_dir,
             'reference_data_load_input_check_seq_names.good.csv.' + suffix))

     want_seqs = {
         name: pyfastaq.sequences.Fasta(name, bases)
         for name, bases in [('seq1', 'ACGT'), ('seq2', 'TTTT')]
     }

     seq1_meta = sequence_metadata.SequenceMetadata(
         'seq1\t0\t0\tA1G\t.\tfree text')
     seq2_meta = sequence_metadata.SequenceMetadata(
         "seq2\t0\t0\t.\t.\tspam eggs")
     want_meta = {
         'seq1': {
             'seq_type': 'n',
             'variant_only': False,
             'n': {0: {seq1_meta}},
             'p': {},
             '.': set(),
         },
         'seq2': {
             'seq_type': 'n',
             'variant_only': False,
             'n': {},
             'p': {},
             '.': {seq2_meta},
         },
     }

     loaded = reference_data.ReferenceData._load_input_files_and_check_seq_names(
         fasta_paths, metadata_paths)
     loaded_seqs, loaded_meta = loaded
     self.assertEqual(want_seqs, loaded_seqs)
     self.assertEqual(want_meta, loaded_meta)
コード例 #9
0
 def test_init_on_good_input(self):
     '''SequenceMetadata parses every field of a well-formed line.'''
     line = 'gene\t1\t0\tI42L\tid\tspam spam wonderful spam'
     parsed = sequence_metadata.SequenceMetadata(line)

     # Fields on the record itself.
     self.assertEqual(parsed.name, 'gene')
     self.assertEqual(parsed.seq_type, 'p')
     self.assertEqual(parsed.variant_only, False)

     # Fields on the parsed variant.
     variant = parsed.variant
     self.assertEqual(variant.wild_value, 'I')
     self.assertEqual(variant.variant_value, 'L')
     self.assertEqual(variant.identifier, 'id')

     self.assertEqual(parsed.free_text, 'spam spam wonderful spam')
コード例 #10
0
    def test_str(self):
        '''str() of a SequenceMetadata reproduces the line it was built from.'''
        round_trip_lines = (
            'gene1\t1\t1\tA42G\tid1\tspam',
            'gene2\t0\t0\t.\t.\t.',
            'gene3\t0\t0\t.\t.\teggs',
            'gene4\t1\t0\tI42K\tid\tthis mutation kills tardigrades',
        )

        for original in round_trip_lines:
            rendered = str(sequence_metadata.SequenceMetadata(original))
            self.assertEqual(original, rendered)
コード例 #11
0
    def test_load_all_metadata_tsvs(self):
        '''_load_all_metadata_tsvs merges records from several TSV files.'''
        tsv_paths = []
        for suffix in ['1', '2']:
            tsv_paths.append(os.path.join(
                data_dir,
                'reference_data_load_all_metadata_tsvs.' + suffix + '.tsv'))

        gene1_a42g = sequence_metadata.SequenceMetadata(
            'gene1\t0\t0\tA42G\t.\tfree text')
        gene1_g13t = sequence_metadata.SequenceMetadata(
            'gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')
        gene2_i42l = sequence_metadata.SequenceMetadata(
            "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability"
        )

        want = {
            'gene1': {
                'seq_type': 'n',
                'variant_only': False,
                'n': {12: {gene1_g13t}, 41: {gene1_a42g}},
                'p': {},
                '.': set(),
            },
            'gene2': {
                'seq_type': 'p',
                'variant_only': False,
                'n': {},
                'p': {41: {gene2_i42l}},
                '.': set(),
            },
        }

        loaded = reference_data.ReferenceData._load_all_metadata_tsvs(
            tsv_paths)
        self.assertEqual(want, loaded)
コード例 #12
0
    def test_to_string(self):
        '''to_string joins the record's fields with the requested separator.'''
        records = [
            ('gene1', '0', '0', 'A42G', 'id1', 'spam'),
            ('gene2', '0', '0', '.', '.', '.'),
            ('gene3', '0', '0', '.', '.', 'eggs'),
            ('gene4', '1', '0', 'I42K', 'id',
             'this mutation kills tardigrades'),
        ]
        separators = ('_', '\t')

        for fields in records:
            # Always parse from the tab-separated form, then render with
            # each separator in turn.
            parsed = sequence_metadata.SequenceMetadata('\t'.join(fields))
            for sep in separators:
                rendered = parsed.to_string(separator=sep)
                self.assertEqual(sep.join(fields), rendered)
コード例 #13
0
    def test_init_fails_on_bad_lines(self):
        '''SequenceMetadata construction raises on malformed lines.

        Lines with the wrong number of columns must raise
        sequence_metadata.Error; lines with bad field values must raise the
        error type listed beside them.
        '''
        wrong_column_count = [
            'only one column. There can NOT be only one\n',
            'two\tcolumns is not enough\n',
            'three\tcolumns\tis still not enough\n',
            'four\tcolumns\tis\talso not enough\n',
            'five\tcolumns\tis\talso\tnot enough\n',
            'seven\tcolumns\tis\tone\tmore\tthan\nwe want',
        ]
        bad_field_values = [
            ('gene\tx\t0\t.\t.\tfoo\n', sequence_metadata.Error),
            ('gene\t1\t2\t.\t.\tfoo\n', sequence_metadata.Error),
            ('gene\t1\t1\tI42\t.\tfoo\n', sequence_variant.Error),
        ]

        # Same order as before: column-count failures first, then the rest.
        cases = [(bad_line, sequence_metadata.Error)
                 for bad_line in wrong_column_count]
        cases.extend(bad_field_values)

        for bad_line, error_type in cases:
            with self.assertRaises(error_type):
                sequence_metadata.SequenceMetadata(bad_line)
コード例 #14
0
    def test_has_variant(self):
        '''has_variant result for each metadata line against one sequence.'''
        # translation: MYC*
        sequence = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA')

        # Metadata line -> expected has_variant(sequence) result.
        cases = (
            ('gene1\t0\t0\t.\t.\t.', False),
            ('gene1\t0\t0\tA2T\t.\t,', True),
            ('gene1\t0\t0\tT2A\t.\t.', False),
            ('gene1\t1\t0\tI2Y\t.\t.', True),
            ('gene1\t1\t0\tY2I\t.\t.', False),
        )

        for metadata_line, want in cases:
            record = sequence_metadata.SequenceMetadata(metadata_line)
            got = record.has_variant(sequence)
            self.assertEqual(want, got)
コード例 #15
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_get_remaining_known_ref_variants_amino_acids(self):
        '''_get_remaining_known_ref_variants with amino acid variants.

        Passes known variants, a set of already-used variants and two nucmer
        intervals, and checks which variants come back.
        '''
        metadata_lines = [
            'gene1\t1\t0\tD2E\tid1\tfoo bar',
            'gene1\t1\t0\tD3E\tid1\tfoo bar baz',
            'gene1\t1\t0\tD3I\tid1\tfoo bar baz spam',
            'gene1\t1\t0\tD10E\tid1\tfoo bar baz spam egg',
            'gene1\t1\t0\tD14E\tid1\tfoo bar baz spam egg chips',
            'gene1\t1\t0\tD15E\tid1\tfoo bar baz spam egg chips',
            'gene1\t1\t0\tD40E\tid1\tfoo bar baz spam egg chips',
        ]
        d2e, d3e, d3i, d10e, d14e, d15e, d40e = [
            sequence_metadata.SequenceMetadata(x) for x in metadata_lines
        ]

        # Keyed by the positions used in the original fixture.
        known = {
            1: {d2e},
            2: {d3e, d3i},
            9: {d10e},
            13: {d14e},
            14: {d15e},
            39: {d40e},
        }
        already_used = {d3i, d14e}
        coords = [
            pyfastaq.intervals.Interval(6, 25),
            pyfastaq.intervals.Interval(30, 100),
        ]

        want = []
        for remaining in (d3e, d15e):
            want.append((None, 'p', None, None, None, {remaining}, set()))

        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(
            known, already_used, coords)
        self.assertEqual(want, got)
コード例 #16
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_get_remaining_known_ref_variants_nucleotides(self):
        '''_get_remaining_known_ref_variants with nucleotide variants.

        Passes known variants, a set of already-used variants and two nucmer
        intervals, and checks which variants come back.
        '''
        metadata_lines = [
            'gene1\t0\t0\tA2C\tid1\tfoo bar',
            'gene1\t0\t0\tA3C\tid1\tfoo bar baz',
            'gene1\t0\t0\tA3T\tid1\tfoo bar baz spam',
            'gene1\t0\t0\tA10C\tid1\tfoo bar baz spam egg',
            'gene1\t0\t0\tA14C\tid1\tfoo bar baz spam egg chips',
            'gene1\t0\t0\tA15C\tid1\tfoo bar baz spam egg chips',
            'gene1\t0\t0\tA40C\tid1\tfoo bar baz spam egg chips',
        ]
        a2c, a3c, a3t, a10c, a14c, a15c, a40c = [
            sequence_metadata.SequenceMetadata(x) for x in metadata_lines
        ]

        # Keyed by the positions used in the original fixture.
        known = {
            1: {a2c},
            2: {a3c, a3t},
            9: {a10c},
            13: {a14c},
            14: {a15c},
            39: {a40c},
        }
        already_used = {a3t, a14c}
        coords = [
            pyfastaq.intervals.Interval(2, 13),
            pyfastaq.intervals.Interval(30, 100),
        ]

        want = []
        for remaining in (a3c, a10c, a40c):
            want.append((None, 'n', None, None, None, {remaining}, set()))

        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(
            known, already_used, coords)
        self.assertEqual(want, got)
コード例 #17
0
    def test_rename_sequences(self):
        '''Test rename_sequences

        Calls rename_sequences, compares the file it writes with the expected
        file, then checks the metadata, sequences and rename_dict held by the
        ReferenceData object afterwards.
        '''
        fasta_in = os.path.join(data_dir, 'reference_data_rename_sequences.fa')
        tsv_in = os.path.join(data_dir,
                              'reference_data_rename_sequences_metadata.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmp_out = 'tmp.test_rename_sequences.out'
        expected_file = os.path.join(
            data_dir, 'reference_data_test_rename_sequences.out')
        try:
            refdata.rename_sequences(tmp_out)
            self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
        finally:
            # Remove the output file even when the comparison fails or
            # rename_sequences raises, so failed runs leave nothing behind.
            if os.path.exists(tmp_out):
                os.unlink(tmp_out)

        # Expected metadata line for each sequence name after renaming.
        metadata_lines = {
            'noncoding1':
            'noncoding1\t0\t0\t.\t.\toriginal name "noncoding1 blah"',
            'pres_abs1_1':
            'pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"',
            'pres_abs1':
            'pres_abs1\t0\t0\t.\t.\toriginal name "pres\'abs1"',
            'pres_abs2':
            'pres_abs2\t0\t0\t.\t.\toriginal name "pres_abs2"',
            'pres_abs3':
            'pres_abs3\t0\t0\t.\t.\toriginal name "pres!abs3"',
            'var_only1_2':
            'var_only1_2\t0\t0\t.\t.\toriginal name "var_only1 hello"',
            'var_only1':
            'var_only1\t0\t0\t.\t.\toriginal name "var,only1"',
            'var_only1_1':
            'var_only1_1\t0\t0\t.\t.\toriginal name "var:only1 boo"',
            'var_only2':
            'var_only2\t0\t0\t.\t.\toriginal name "var_only2"',
        }

        # Every sequence is expected to carry exactly one variant-free record.
        expected_meta = {
            name: {
                'seq_type': 'n',
                'variant_only': False,
                'n': {},
                'p': {},
                '.': {sequence_metadata.SequenceMetadata(line)},
            }
            for name, line in metadata_lines.items()
        }

        self.maxDiff = None
        self.assertEqual(set(expected_meta.keys()),
                         set(refdata.metadata.keys()))
        self.assertEqual(expected_meta, refdata.metadata)

        expected_seqs_dict = {
            name: pyfastaq.sequences.Fasta(name, bases)
            for name, bases in [
                ('noncoding1', 'AAAA'),
                ('pres_abs1_1', 'ACGT'),
                ('pres_abs1', 'CCCC'),
                ('pres_abs2', 'TTTT'),
                ('pres_abs3', 'GGGG'),
                ('var_only1_2', 'AAAA'),
                ('var_only1', 'GGGG'),
                ('var_only1_1', 'CCCC'),
                ('var_only2', 'TTTT'),
            ]
        }

        self.assertEqual(expected_seqs_dict, refdata.sequences)

        expected_rename_dict = {
            'pres!abs3': 'pres_abs3',
            'pres\'abs1': 'pres_abs1',
            'pres_abs1': 'pres_abs1_1',
            'var,only1': 'var_only1',
            'var:only1': 'var_only1_1',
            'var_only1': 'var_only1_2',
        }

        self.assertEqual(expected_rename_dict, refdata.rename_dict)
コード例 #18
0
    def test_rename_names_in_metadata(self):
        '''_rename_names_in_metadata renames only the sequences in rename_dict.

        gene1 is renamed to new_gene1 and all of its records are expected to
        carry the new name; gene2 is expected to be left unchanged.
        '''
        def parse(line):
            # Shorthand for building one metadata record.
            return sequence_metadata.SequenceMetadata(line)

        # Records under the original name.
        old_a42g = parse('gene1\t0\t0\tA42G\t.\tfree text')
        old_a42t = parse('gene1\t0\t0\tA42T\t.\tfree text2')
        old_no_variant = parse('gene1\t0\t0\t.\t.\tfree text3')
        old_g13t = parse(
            'gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')
        gene2_i42l = parse(
            "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability"
        )

        # The same records under the new name.
        new_a42g = parse('new_gene1\t0\t0\tA42G\t.\tfree text')
        new_a42t = parse('new_gene1\t0\t0\tA42T\t.\tfree text2')
        new_no_variant = parse('new_gene1\t0\t0\t.\t.\tfree text3')
        new_g13t = parse(
            'new_gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')

        before = {
            'gene1': {
                'n': {12: {old_g13t}, 41: {old_a42g, old_a42t}},
                'p': {},
                '.': {old_no_variant},
            },
            'gene2': {
                'n': {},
                'p': {41: {gene2_i42l}},
                '.': set(),
            },
        }

        want = {
            'new_gene1': {
                'n': {12: {new_g13t}, 41: {new_a42g, new_a42t}},
                'p': {},
                '.': {new_no_variant},
            },
            'gene2': {
                'n': {},
                'p': {41: {gene2_i42l}},
                '.': set(),
            },
        }

        renamed = reference_data.ReferenceData._rename_names_in_metadata(
            before, {'gene1': 'new_gene1'})
        self.assertEqual(want, renamed)
コード例 #19
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_one_var_one_ctg_cdg(self):
        '''test _get_one_variant_for_one_contig_coding'''
        fasta_in = os.path.join(data_dir,
                                'assembly_variants_one_var_one_ctg_cdg.fa')
        tsv_in = os.path.join(data_dir,
                              'assembly_variants_one_var_one_ctg_cdg.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        ref_sequence_name = 'presence_absence'
        ref_sequence = refdata.sequence(ref_sequence_name)
        refdata_var_dict = refdata.metadata[ref_sequence_name]

        v0 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '6\tT\tA\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        v1 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        v2 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '18\tG\tT\t18\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        v3 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '21\tC\tT\t21\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        v4 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '7\tA\tT\t7\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        v5 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '12\tA\tC\t11\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))

        v6 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        self.assertTrue(
            v6.update_indel(
                pymummer.snp.Snp(
                    '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))

        v7 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        self.assertTrue(
            v7.update_indel(
                pymummer.snp.Snp(
                    '4\t.\tA\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))

        v8 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        self.assertTrue(
            v8.update_indel(
                pymummer.snp.Snp(
                    '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v8.update_indel(
                pymummer.snp.Snp(
                    '6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))

        v9 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        self.assertTrue(
            v9.update_indel(
                pymummer.snp.Snp(
                    '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v9.update_indel(
                pymummer.snp.Snp(
                    '6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v9.update_indel(
                pymummer.snp.Snp(
                    '7\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v9.update_indel(
                pymummer.snp.Snp(
                    '8\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v9.update_indel(
                pymummer.snp.Snp(
                    '9\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))

        v10 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
        self.assertTrue(
            v10.update_indel(
                pymummer.snp.Snp(
                    '4\t.\tT\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))
        self.assertTrue(
            v10.update_indel(
                pymummer.snp.Snp(
                    '4\t.\tT\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
            ))

        mummer_variants = [[v0], [v1], [v2], [v3], [v4], [v5], [v6], [v7],
                           [v8], [v9], [v10]]

        meta0 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)'
        )
        meta4 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)'
        )

        expected_tuples = [
            (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()),  #0
            None,  #1
            (5, 'p', 'M6I', 'NONSYN', [v2], set(), set()),  #2
            (6, 'p', '.', 'SYN', [v3], set(), set()),  #3
            (2, 'p', 'R3trunc', 'TRUNC', [v4], set(), {meta4}),  #4
            None,  #5
            (1, 'p', 'D2fs', 'FSHIFT', [v6], set(), {meta0}),  #6
            (1, 'p', 'D2fs', 'FSHIFT', [v7], set(), {meta0}),  #7
            (1, 'p', 'D2del', 'DEL', [v8], set(), {meta0}),  #8
            (1, 'p', 'D2_R3del', 'DEL', [v9], set(), {meta0}),  #9
            (1, 'p', 'D2_R3insI', 'INS', [v10], set(), {meta0})  #10
        ]

        expected_used_variants = [
            refdata_var_dict['p'][1],  #0
            refdata_var_dict['p'][2],  #1
            set(),  #2
            set(),  #3
            refdata_var_dict['p'][2],  #4
            refdata_var_dict['p'][3],  #5
            refdata_var_dict['p'][1],  #6
            refdata_var_dict['p'][1],  #7
            refdata_var_dict['p'][1],  #8
            refdata_var_dict['p'][1],  #9
            refdata_var_dict['p'][1],  #10
        ]

        assert len(mummer_variants) == len(expected_tuples) == len(
            expected_used_variants)

        for i in range(len(mummer_variants)):
            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_coding(
                ref_sequence, refdata_var_dict, mummer_variants[i])
            self.assertEqual(expected_tuples[i], got_tuple)
            self.assertEqual(expected_used_variants[i], got_used_variants)
コード例 #20
0
ファイル: assembly_variants_test.py プロジェクト: ys4/ariba
    def test_get_variants_presence_absence(self):
        '''test get_variants presence absence genes

        Builds a ReferenceData from a fixture FASTA plus a metadata TSV that
        is written on the fly, then checks that AssemblyVariants.get_variants
        returns the expected per-contig list of tuples for the
        'presence_absence' reference sequence.
        '''
        # Metadata records for the reference sequence. The free-text last
        # column of each record describes the scenario it is meant to cover.
        meta1 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tD2E\tid1\tref has wild type D, contig has var (GAT=D, GAA=E)'
        )
        meta2 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tS3R\tid1\tref has variant type R, contig has wild (AGA=R, AGT=S)'
        )
        meta3 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tD4E\tid1\tref has variant type E, contig has var (GAA=E, GAC=D)'
        )
        meta4 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tA5D\tid1\tref has wild type A, contig has var (GCG=A, GAC=D)'
        )
        meta5 = sequence_metadata.SequenceMetadata(
            'presence_absence\t1\t0\tR13S\tid1\tref and qry have wild type')

        metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv'
        fasta_in = os.path.join(
            data_dir,
            'assembly_variants_test_get_variants_presence_absence.fa')

        # The scratch TSV lives in the current working directory, so it must
        # be removed even when writing it or loading the reference data
        # fails; otherwise a failing run leaves the file behind.
        try:
            with open(metadata_tsv, 'w') as f:
                for meta in (meta1, meta2, meta3, meta4, meta5):
                    print(meta, file=f)

            refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
        finally:
            # Guarded so that a failed open() is not masked by a
            # FileNotFoundError raised from the cleanup itself.
            if os.path.exists(metadata_tsv):
                os.unlink(metadata_tsv)

        nucmer_snp_file = os.path.join(
            data_dir,
            'assembly_variants_test_get_variants_presence_absence.snps')

        # The two variants that the expected contig1 result groups together
        # under the single 'A5D' entry below.
        v2 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
        v3 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))

        # Per-contig intervals handed to get_variants; the same intervals are
        # used for the reference side and the contig side.
        ref_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 30)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }

        ctg_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 30)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }

        # Expected result, keyed by contig name. Entries whose first field is
        # None carry only a set of metadata records and no variant list.
        expected = {
            'contig1': [
                (4, 'p', 'A5D', 'NONSYN', [v2, v3], {meta4}, set()),
                (None, 'p', None, None, None, {meta1}, set()),
                (None, 'p', None, None, None, {meta3}, set()),
            ],
            'contig2': [
                (None, 'p', None, None, None, {meta3}, set()),
                (None, 'p', None, None, None, {meta4}, set()),
                (None, 'p', None, None, None, {meta5}, set()),
            ],
        }

        a_variants = assembly_variants.AssemblyVariants(
            refdata, nucmer_snp_file)
        got = a_variants.get_variants('presence_absence', ctg_nucmer_coords,
                                      ref_nucmer_coords)
        self.assertEqual(expected, got)