Esempio n. 1
0
 def test_constructor_interval_metadata_len(self):
     # For upper bounds 1, 2 and 3: an object built from a populated
     # IntervalMetadata must report having interval metadata and expose it
     # as an IntervalMetadata instance.
     for upper_bound in range(1, 4):
         interval_md = IntervalMetadata(upper_bound)
         interval_md.add([(0, 1)], metadata={'a': 'b'})
         built = self._interval_metadata_constructor_(upper_bound,
                                                      interval_md)
         self.assertTrue(built.has_interval_metadata())
         self.assertIsInstance(built.interval_metadata, IntervalMetadata)
Esempio n. 2
0
def _parse_record(lines):
    """Parse one record into a sequence ID and its terminator features.

    Parameters
    ----------
    lines : sequence of str
        Lines of one record. The second whitespace-separated token of the
        first line is used as the sequence ID.

    Returns
    -------
    tuple
        ``(sid, imd)``, where ``imd`` is an ``IntervalMetadata`` created
        without an upper bound and holding one feature per terminator row.
    """
    sid = lines[0].split()[1]
    # a line that does not start with two spaces opens a new gene group
    group_by_gene = split(split_head,
                          is_head=lambda text: not text.startswith('  '))
    imd = IntervalMetadata(None)
    for block in group_by_gene(lines):
        # a group made of the gene line alone has no predicted terminator
        if len(block) == 1:
            continue
        gene_id = block[0].split()[0]
        rows = iter(block[1:])
        for term_row in rows:
            fields = term_row.split()
            term_id = '{}_{}'.format(fields[0], fields[1])
            # the row right after a terminator row holds its hairpin
            # sequence; its whitespace-separated parts are joined with '/'
            hairpin = '/'.join(next(rows).split())
            start = int(fields[2])
            end = int(fields[4])
            strand = fields[5]
            if strand == '-':
                # swap the two coordinates for minus-strand terminators
                start, end = end, start
            imd.add(
                [(start, end)],
                metadata={
                    'ID': term_id,
                    'gene_id': gene_id,
                    'confidence': fields[7],
                    'strand': strand,
                    'source': 'TransTermHP',
                    'sequence': hairpin,
                    'type': 'terminator',
                })
    return sid, imd
Esempio n. 3
0
 def test_ne_diff_bounds(self):
     # Two IntervalMetadata objects with upper bounds 10 and 9 receive the
     # very same interval and must still compare as not equal.
     longer = IntervalMetadata(10)
     shorter = IntervalMetadata(9)
     shared = dict(bounds=[(0, 1)], metadata={'spam': 'foo'})
     for target in (longer, shorter):
         target.add(**shared)
     self.assertReallyNotEqual(longer, shorter)
Esempio n. 4
0
 def test_ne_diff_bounds(self):
     # Same interval added to both objects; only the upper bounds differ
     # (10 vs 9), which must be enough to make them unequal.
     first, second = IntervalMetadata(10), IntervalMetadata(9)
     interval_kwargs = {'bounds': [(0, 1)], 'metadata': {'spam': 'foo'}}
     first.add(**interval_kwargs)
     second.add(**interval_kwargs)
     self.assertReallyNotEqual(first, second)
Esempio n. 5
0
 def test_init_copy_from(self):
     # Constructing from self.im_1 with each upper bound must equal a
     # fresh object to which the sagA interval was added by hand.
     for upper in (None, 99, 999):
         copied = IntervalMetadata(upper, self.im_1)
         expected = IntervalMetadata(upper)
         sag_bounds = [(1, 2), (4, self.upper_bound)]
         expected.add(bounds=sag_bounds,
                      metadata=dict(gene='sagA', bound=0))
         self.assertEqual(copied, expected)
    def test_complement_without_reverse_non_empty(self):
        # complement() is checked first on a bare sequence and then on a
        # sequence carrying metadata, positional metadata and interval
        # metadata; the expected result keeps all three unchanged.
        for combo in self.all_combos_comp_qual:
            constructor, seq_str, comp_str, qual = combo
            self.assertEqual(constructor(seq_str).complement(),
                             constructor(comp_str))

            im = IntervalMetadata(len(seq_str))
            im.add([(0, 1)], metadata={'gene': 'p53'})
            annotated = constructor(
                seq_str,
                metadata=dict(id='foo', description='bar'),
                positional_metadata=dict(quality=qual),
                interval_metadata=im)
            observed = annotated.complement()
            expected = constructor(
                comp_str,
                metadata=dict(id='foo', description='bar'),
                positional_metadata=dict(quality=qual),
                interval_metadata=im)
            self.assertEqual(observed, expected)
Esempio n. 7
0
    def test_compute_trna_score(self):
        # The score is checked at three stages of the same IntervalMetadata:
        # empty, after adding 16 tRNA features, and after adding 4 more.
        # Scores are floats, so they are compared with assertAlmostEqual
        # instead of exact equality.
        imd = IntervalMetadata(None)

        def add_trnas(amino_acids):
            # add one tRNA feature per amino-acid name, all on (0, 12)
            for name in amino_acids:
                imd.add([(0, 12)],
                        metadata={
                            'type': 'tRNA',
                            'product': 'tRNA-' + name
                        })

        # no tRNA feature at all
        self.assertAlmostEqual(compute_trna_score([imd]), 0.1)

        add_trnas([
            'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met',
            'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'
        ])
        self.assertAlmostEqual(compute_trna_score([imd]), 0.6)

        add_trnas(['Ala', 'Arg', 'Asn', 'Asp'])
        self.assertAlmostEqual(compute_trna_score([imd]), 1)
    def test_complement_with_reverse_non_empty(self):
        # complement(reverse=True) is checked on a bare sequence and on an
        # annotated one. For the annotated case the expected quality values
        # are reversed and the interval (0, 1) is expected at the far end,
        # (length - 1, length).
        for combo in self.all_combos_rev_comp_qual:
            constructor, seq_str, rev_comp_str, qual = combo
            self.assertEqual(constructor(seq_str).complement(reverse=True),
                             constructor(rev_comp_str))

            length = len(seq_str)
            im = IntervalMetadata(length)
            im.add([(0, 1)], metadata={'gene': 'p53'})
            im_rc = IntervalMetadata(length)
            im_rc.add([(length - 1, length)], metadata={'gene': 'p53'})

            original = constructor(
                seq_str,
                metadata=dict(id='foo', description='bar'),
                positional_metadata=dict(quality=qual),
                interval_metadata=im)
            observed = original.complement(reverse=True)
            expected = constructor(
                rev_comp_str,
                metadata=dict(id='foo', description='bar'),
                positional_metadata=dict(quality=list(qual)[::-1]),
                interval_metadata=im_rc)
            self.assertEqual(observed, expected)

            # the source sequence keeps an equal but distinct copy of `im`
            self.assertIsNot(original.interval_metadata, im)
            self.assertEqual(original.interval_metadata, im)
Esempio n. 9
0
def _parse_record(lines):
    """Parse one record into a sequence ID and its interval metadata.

    The sequence ID is the text before the first tab of the first line.
    Every line is handed to ``_parse_line`` and the returned bounds and
    metadata are added as one feature to an ``IntervalMetadata`` created
    without an upper bound.

    Returns
    -------
    tuple
        ``(seq_id, imd)``.
    """
    imd = IntervalMetadata(None)
    seq_id = lines[0].partition('\t')[0]
    # map() is lazy, so each line is parsed right before it is added
    for bounds, md in map(_parse_line, lines):
        imd.add(bounds, metadata=md)
    return seq_id, imd
    def test_complement_with_reverse_non_empty(self):
        # Reverse complement of a bare sequence, then of a sequence with
        # metadata, quality scores and one interval on its first position.
        for (constructor, seq_str, rev_comp_str,
             qual) in self.all_combos_rev_comp_qual:
            bare_rc = constructor(seq_str).complement(reverse=True)
            self.assertEqual(bare_rc, constructor(rev_comp_str))

            n = len(seq_str)
            im = IntervalMetadata(n)
            im.add([(0, 1)], metadata={'gene': 'p53'})
            # the same feature, expected on the last position after reversal
            im_rc = IntervalMetadata(n)
            im_rc.add([(n - 1, n)], metadata={'gene': 'p53'})

            source_kwargs = {
                'metadata': {'id': 'foo', 'description': 'bar'},
                'positional_metadata': {'quality': qual},
                'interval_metadata': im,
            }
            original = constructor(seq_str, **source_kwargs)
            annotated_rc = original.complement(reverse=True)

            expected_kwargs = {
                'metadata': {'id': 'foo', 'description': 'bar'},
                'positional_metadata': {'quality': list(qual)[::-1]},
                'interval_metadata': im_rc,
            }
            self.assertEqual(annotated_rc,
                             constructor(rev_comp_str, **expected_kwargs))
            # the original object must hold its own, unchanged copy of `im`
            self.assertIsNot(original.interval_metadata, im)
            self.assertEqual(original.interval_metadata, im)
Esempio n. 11
0
    def test_create_faa(self):
        # A DNA sequence with a single CDS feature on (0, 120) is passed to
        # create_faa; the file written to self.o must contain one FASTA
        # record whose header is the feature ID followed by its product.
        cds = IntervalMetadata(None)
        cds_md = {
            'type': 'CDS',
            'product': 'Homoserine kinase',
            'ID': '1_1'
        }
        cds.add([(0, 120)], metadata=cds_md)
        nucleotides = (
            'ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGTGCTC'
            'GGGGCGGCGGTGACACCCGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCG'
            'GCAGAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGATAAGCTGCCGTCAGAACCA'
            'CGGGAAAATATCGTTTATCAGTGCTGGGAGCGTTTTTGCCTGGAGCTGGGCAAGCAAATT'
            'CCAGTGGCGATGACTCTGGAAAAGAATATGCCGATCGGCTCGGGCTTAGGCTCCAGCGCC'
            'TGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTGCGGCAAGCCACTTAATGAC'
            'ACCCGTTTGCTGGCTTTGATGGGCGAGCTGGAAGGACGTATCTCCGGCAGCATTCATTAC'
            'GACAACGTGGCACCGTGTTTTCTTGGTGGTATGCAGTTGATGATCGAAGAAAACGACATC'
            'ATCAGCCAGCAAGTGCCAGGGTTTGATGAGTGGCTGTGGGTGCTGGCGTATCCGGGAATT'
            'AAAGTCTCGACGGCAGAAGCCCGGGCTATTTTACCGGCGCAGTATCGCCGCCAGGATTGC'
            'ATTGCGCACGGGCGACATCTGGCTGGCTTCATTCACGCCTGCTATTCCCGTCAGCCTGAG'
            'CTTGCCGCGAAGCTGATGAAAGATGTTATCGCTGAACCCTACCGTGAACGGTTACTGCCT'
            'GGCTTCCGGCAGGCGCGGCAGGCGGTCGCGGAAATCGGCGCGGTAGCGAGCGGTATCTCC'
            'GGCTCCGGCCCGACCTTGTTCGCTCTATGTGACAAGCCGGATACCGCCCAGCGCGTTGCC'
            'GACTGGTTGGGTAAGAACTACCTGCAAAATCAGGAAGGTTTTGTTCATATTTGCCGGCTG'
            'GATACGGCGGGCGCACGAGTACTGGAAAACTAA')
        seq = DNA(nucleotides, interval_metadata=cds)
        create_faa([seq], self.o)

        header = '>1_1 Homoserine kinase\n'
        protein = 'MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEA\n'
        with open(self.o) as written:
            content = written.read()
        self.assertEqual(header + protein, content)
Esempio n. 12
0
def _parse_record(lines):
    '''Build interval metadata from the lines of one record.

    Each line is passed to ``_parse_line``; the bounds and metadata it
    returns are added as one feature to an ``IntervalMetadata`` created
    without an upper bound, which is then returned.
    '''
    record_md = IntervalMetadata(None)
    # map() is lazy, so each line is parsed right before it is added
    for feature_bounds, feature_md in map(_parse_line, lines):
        record_md.add(feature_bounds, metadata=feature_md)
    return record_md
Esempio n. 13
0
 def test_raise_subregion(self):
     # A feature made of two sub-regions must be rejected with
     # GFF3FormatError when skip_subregion is False.
     split_gene = IntervalMetadata(None)
     split_gene.add([(0, 3), (7, 9)], metadata={'type': 'gene'})
     with io.StringIO() as buf, self.assertRaises(GFF3FormatError):
         _serialize_interval_metadata(
             split_gene, seq_id='a', fh=buf, skip_subregion=False)
Esempio n. 14
0
def _parse_record(lines, length):
    '''Build an IntervalMetadata object from tab-separated feature lines.

    Parameters
    ----------
    lines : iterable of str
        Feature lines; each must split into exactly 9 tab-separated
        columns.
    length : int or None
        Passed to ``IntervalMetadata`` as its only constructor argument.

    Returns
    -------
    IntervalMetadata
        One feature per input line.

    Raises
    ------
    GFF3FormatError
        If a line does not have 9 columns, or if its phase column is
        neither an integer nor ``'.'``.
    '''
    interval_metadata = IntervalMetadata(length)
    for line in lines:
        columns = line.split('\t')
        if len(columns) != 9:
            raise GFF3FormatError(
                'do not have 9 columns in this line: "%s"' % line)
        # the first column (the sequence ID) is the same for every feature
        # of the record, so it is not stored
        (_, source, feature_type, start, end,
         score, strand, phase, attributes) = columns
        metadata = {'source': source,
                    'type': feature_type,
                    'score': score,
                    'strand': strand}
        # '.' means no phase; any other value has to be an integer
        if phase != '.':
            try:
                metadata['phase'] = int(phase)
            except ValueError:
                raise GFF3FormatError(
                    'unknown value for phase column: {!r}'.format(phase))
        metadata.update(_parse_attr(attributes))

        # the start column is shifted down by one; the end is used as is
        interval_metadata.add([(int(start)-1, int(end))],
                              metadata=metadata)

    return interval_metadata
Esempio n. 15
0
def _parse_record(lines, length):
    '''Turn tab-separated feature lines into an IntervalMetadata object.

    ``length`` is handed to the ``IntervalMetadata`` constructor. Every
    line must have exactly 9 tab-separated columns, otherwise
    ``GFF3FormatError`` is raised. Columns 2, 3, 6 and 7 are stored under
    the keys ``source``, ``type``, ``score`` and ``strand``; column 8 is
    stored as integer ``phase`` unless it is ``'.'`` (any other non-integer
    value raises ``GFF3FormatError``); column 9 is expanded with
    ``_parse_attr``. Columns 4 and 5 give the bounds.
    '''
    # column positions of the values copied verbatim into the metadata;
    # column 0 (the sequence ID) repeats on every line and is not stored
    verbatim = (('source', 1), ('type', 2), ('score', 5), ('strand', 6))

    interval_metadata = IntervalMetadata(length)
    for line in lines:
        columns = line.split('\t')
        if len(columns) != 9:
            raise GFF3FormatError('do not have 9 columns in this line: "%s"' %
                                  line)
        metadata = {key: columns[pos] for key, pos in verbatim}

        phase = columns[7]
        try:
            phase_number = int(phase)
        except ValueError:
            # the only accepted non-integer phase is the '.' placeholder
            if phase != '.':
                raise GFF3FormatError(
                    'unknown value for phase column: {!r}'.format(phase))
        else:
            metadata['phase'] = phase_number
        metadata.update(_parse_attr(columns[8]))

        start = int(columns[3]) - 1
        end = int(columns[4])
        interval_metadata.add([(start, end)], metadata=metadata)

    return interval_metadata
Esempio n. 16
0
def _parse_record(lines):
    '''Return the sequence ID and interval metadata of one record.

    The sequence ID is the third whitespace-separated token of the first
    line. Each line is parsed with ``_parse_line`` and added as a feature
    to an ``IntervalMetadata`` created without an upper bound.
    '''
    record_md = IntervalMetadata(None)
    first_line_tokens = lines[0].split()
    seq_id = first_line_tokens[2]
    # map() is lazy, so each line is parsed right before it is added
    for feature_bounds, feature_md in map(_parse_line, lines):
        record_md.add(feature_bounds, metadata=feature_md)
    return seq_id, record_md
Esempio n. 17
0
    def test_ne_only_one_is_empty(self):
        # One object is built with a populated IntervalMetadata, the other
        # with none supplied; they must compare as not equal.
        populated_md = IntervalMetadata(self.upper_bound)
        populated_md.add(**self.intvls[0])
        with_interval = self._interval_metadata_constructor_(
            self.upper_bound, populated_md)

        without_interval = self._interval_metadata_constructor_(
            self.upper_bound)

        self.assertReallyNotEqual(with_interval, without_interval)
Esempio n. 18
0
 def test_upper_bound_is_none(self):
     # Without an upper bound, adding a very large interval must work,
     # while _reverse() and concat() must raise TypeError.
     unbounded = IntervalMetadata(None)
     # should not raise error
     unbounded.add([(0, 10 ** 9)])
     self.assertIsNone(unbounded.upper_bound)

     message = 'upper bound is `None`'
     with self.assertRaisesRegex(TypeError, message):
         unbounded._reverse()
     with self.assertRaisesRegex(TypeError, message):
         IntervalMetadata.concat([self.im_1, unbounded])
Esempio n. 19
0
    def test_eq_populated_differently(self):
        # The same interval reaches the two objects by different routes:
        # through the constructor for the first, through a later add() on
        # its interval_metadata for the second. They must compare equal.
        seeded_md = IntervalMetadata(self.upper_bound)
        seeded_md.add(**self.intvls[0])
        via_constructor = self._interval_metadata_constructor_(
            self.upper_bound, seeded_md)

        via_add = self._interval_metadata_constructor_(self.upper_bound)
        via_add.interval_metadata.add(**self.intvls[0])

        self.assertReallyEqual(via_constructor, via_add)
Esempio n. 20
0
 def test_raise_subregion(self):
     # Serializing a two-part feature with skip_subregion=False must
     # raise GFF3FormatError.
     im = IntervalMetadata(None)
     two_parts = [(0, 3), (7, 9)]
     im.add(two_parts, metadata={'type': 'gene'})
     options = dict(seq_id='a', skip_subregion=False)
     with io.StringIO() as fh:
         with self.assertRaises(GFF3FormatError):
             _serialize_interval_metadata(im, fh=fh, **options)
Esempio n. 21
0
    def test_eq_basic(self):
        # Two objects built independently from equal inputs must be equal.
        built = []
        for _ in range(2):
            interval_md = IntervalMetadata(self.upper_bound)
            interval_md.add(**self.intvls[0])
            built.append(self._interval_metadata_constructor_(
                self.upper_bound, interval_md))

        left, right = built
        self.assertReallyEqual(left, right)
Esempio n. 22
0
 def test_init_copy_from(self):
     # For each upper bound, copy-constructing from self.im_1 must give
     # the same result as adding the sagA interval to an empty object.
     for bound in [None, 99, 999]:
         from_copy = IntervalMetadata(bound, self.im_1)

         by_hand = IntervalMetadata(bound)
         sag_md = {'gene': 'sagA', 'bound': 0}
         by_hand.add(bounds=[(1, 2), (4, self.upper_bound)],
                     metadata=sag_md)

         self.assertEqual(from_copy, by_hand)
Esempio n. 23
0
    def test_transcribe_preserves_all_metadata(self):
        # Transcribing an annotated DNA must give an RNA that carries the
        # same metadata, positional metadata and interval metadata.
        im = IntervalMetadata(4)
        im.add([(0, 2)], metadata={'gene': 'p53'})

        def annotations():
            # new dicts on every call; only `im` is shared between the two
            # sequences
            return dict(metadata={'foo': 'bar'},
                        positional_metadata={'foo': range(4)},
                        interval_metadata=im)

        expected_rna = RNA('AGUU', **annotations())
        dna = DNA('AGTT', **annotations())
        self.assertEqual(dna.transcribe(), expected_rna)
Esempio n. 24
0
 def test_upper_bound_is_none(self):
     # An IntervalMetadata without upper bound accepts a huge interval,
     # but refuses to be reversed or concatenated (TypeError).
     im = IntervalMetadata(None)
     # should not raise error
     huge_interval = [(0, 1000000000)]
     im.add(huge_interval)
     self.assertIsNone(im.upper_bound)

     failing_calls = (
         lambda: im._reverse(),
         lambda: IntervalMetadata.concat([self.im_1, im]),
     )
     for call in failing_calls:
         with self.assertRaisesRegex(
                 TypeError, r'upper bound is `None`'):
             call()
Esempio n. 25
0
    def test_interval_metadata_to_gff3_missing_field(self):
        # The feature metadata only holds type, ID and Name; the expected
        # line has '.' in the source, score, strand and phase columns.
        want = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=gene00001;Name=EDEN'
        imd = IntervalMetadata(9)
        feature_md = {'type': 'gene', 'ID': 'gene00001', 'Name': 'EDEN'}
        imd.add([(0, 9)], metadata=feature_md)
        with io.StringIO() as buf:
            _interval_metadata_to_gff3(imd, buf, seq_id='ctg123')
            # keep only the non-comment lines: comments are not stored in
            # IntervalMetadata, so they are not compared
            feature_lines = []
            for text in buf.getvalue().splitlines():
                if not text.startswith('#'):
                    feature_lines.append(text)

        self.assertEqual([want], feature_lines)
Esempio n. 26
0
    def test_interval_metadata_to_gff3_multiple_values(self):
        # A list value under 'db_xref' is expected to be written as one
        # comma-separated 'Dbxref' attribute.
        want = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tDbxref=GO:000152,GO:001234'
        imd = IntervalMetadata(9)
        xrefs = ['GO:000152', 'GO:001234']
        imd.add([(0, 9)], metadata={'type': 'gene', 'db_xref': xrefs})
        with io.StringIO() as buf:
            _interval_metadata_to_gff3(imd, buf, seq_id='ctg123')
            written = buf.getvalue().splitlines()
            # comments are not stored in IntervalMetadata, so only the
            # non-comment lines are compared
            feature_lines = list(
                filter(lambda text: not text.startswith('#'), written))

        self.assertEqual([want], feature_lines)
Esempio n. 27
0
    def test_init_nondefault_parameters(self):
        # Build a sequence with every optional argument supplied and check
        # that each one can be read back from the matching attribute.
        raw = '.-ABCXYZ'
        im = IntervalMetadata(8)
        im.add([(1, 8)], metadata={'gene': 'p53'})
        seq = ExampleGrammaredSequence(
            raw,
            metadata=dict(id='foo'),
            positional_metadata=dict(quality=range(8)),
            interval_metadata=im)

        expected_values = np.array('.-ABCXYZ', dtype='c')
        npt.assert_equal(seq.values, expected_values)
        self.assertEqual(seq.metadata, {'id': 'foo'})
        expected_positional = pd.DataFrame({'quality': range(8)})
        assert_data_frame_almost_equal(seq.positional_metadata,
                                       expected_positional)
        self.assertEqual(seq.interval_metadata, im)
    def test_init_nondefault_parameters(self):
        # All optional constructor arguments are given; values, metadata,
        # positional metadata and interval metadata are then verified.
        im = IntervalMetadata(8)
        im.add([(1, 8)], metadata={'gene': 'p53'})
        init_kwargs = {
            'metadata': {'id': 'foo'},
            'positional_metadata': {'quality': range(8)},
            'interval_metadata': im,
        }
        seq = ExampleGrammaredSequence('.-ABCXYZ', **init_kwargs)

        npt.assert_equal(seq.values,
                         np.array('.-ABCXYZ', dtype='c'))
        self.assertEqual(seq.metadata, {'id': 'foo'})
        quality_frame = pd.DataFrame({'quality': range(8)})
        assert_data_frame_almost_equal(seq.positional_metadata,
                                       quality_frame)
        self.assertEqual(seq.interval_metadata, im)
Esempio n. 29
0
    def test_interval_metadata_to_gff3_escape(self):
        # The ID 'a;=&,b' is expected in the output with ';', '=', '&' and
        # ',' percent-encoded.
        want = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=a%3B%3D%26%2Cb'
        imd = IntervalMetadata(9)
        reserved_md = {'type': 'gene', 'ID': 'a;=&,b'}
        imd.add([(0, 9)], metadata=reserved_md)
        with io.StringIO() as buf:
            _interval_metadata_to_gff3(imd, buf, seq_id='ctg123')
            # comment lines are left out of the comparison because comments
            # are not stored in IntervalMetadata
            feature_lines = []
            for text in buf.getvalue().splitlines():
                if text.startswith('#'):
                    continue
                feature_lines.append(text)

        self.assertEqual([want], feature_lines)
Esempio n. 30
0
    def test_interval_metadata_to_gff3_escape(self):
        # Characters ';', '=', '&' and ',' inside an ID value must appear
        # percent-encoded in the serialized line.
        expected_lines = ['ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=a%3B%3D%26%2Cb']
        imd = IntervalMetadata(9)
        imd.add([(0, 9)], metadata=dict(type='gene', ID='a;=&,b'))
        with io.StringIO() as fh:
            _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
            serialized = fh.getvalue()
            # comments are not stored in IntervalMetadata, so lines that
            # start with '#' are filtered out before comparing
            observed_lines = list(
                filter(lambda ln: not ln.startswith('#'),
                       serialized.splitlines()))

        self.assertEqual(expected_lines, observed_lines)
Esempio n. 31
0
    def test_parse(self):
        # Parse 'transtermhp.tt'; for the first two records check the
        # sequence ID and the number of features, then compare the second
        # record's interval metadata feature by feature.
        fp = get_data_path('transtermhp.tt')
        expected_summary = [('gi|556503834|ref|NC_000913.3|1', 12),
                            ('gi|556503834|ref|NC_000913.3|2', 4)]
        for (obs_sid, obs_imd), (exp_sid, exp_count) in zip(
                _generator(fp), expected_summary):
            self.assertEqual(obs_sid, exp_sid)
            self.assertEqual(obs_imd.num_interval_features, exp_count)

        # (bounds, metadata) of every terminator expected for the 2nd
        # sequence
        second_record_features = [
            ([(7857, 7876)], {
                'strand': '+',
                'confidence': '95',
                'sequence':
                'TGCCCACGATTAAAG/GTGGCCGC/CCTG/GCGGTCAC/TTCTTTGAGAAAAGG',
                'source': 'TransTermHP',
                'ID': 'TERM_1',
                'gene_id': '2_7',
                'type': 'terminator'
            }),
            ([(8919, 8958)], {
                'strand': '-',
                'confidence': '100',
                'sequence':
                'AATGAGCCAGAATAA/GCTAAGGTTGAAGGGGC/TGGAAC/GCCCCTTCAACCTTAGC/AGTAGCGTGGGATGA',
                'source': 'TransTermHP',
                'ID': 'TERM_2',
                'gene_id': '2_9',
                'type': 'terminator'
            }),
            ([(9258, 9273)], {
                'strand': '+',
                'confidence': '89',
                'sequence':
                'GGCAGAAACAAAAAA/TCCCCG/GACT/CGGGGA/TTTATGTACAAGAGG',
                'ID': 'TERM_3',
                'gene_id': '2_9',
                'source': 'TransTermHP',
                'type': 'terminator'
            }),
            ([(9258, 9273)], {
                'strand': '-',
                'confidence': '100',
                'sequence':
                'GGCAGAAACAAAAAA/TCCCCG/GACT/CGGGGA/TTTATGTACAAGAGG',
                'ID': 'TERM_4',
                'gene_id': '2_9',
                'source': 'TransTermHP',
                'type': 'terminator'
            }),
        ]
        exp_imd = IntervalMetadata(None)
        for feature_bounds, feature_md in second_record_features:
            exp_imd.add(feature_bounds, metadata=feature_md)

        # obs_imd still refers to the last record unpacked by the loop above
        self.assertEqual(exp_imd, obs_imd)
Esempio n. 32
0
    def test_interval_metadata_to_gff3_missing_field(self):
        # Only type, ID and Name are supplied; the expected output line
        # carries '.' in the source, score, strand and phase columns.
        expected_lines = [
            'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=gene00001;Name=EDEN'
        ]
        imd = IntervalMetadata(9)
        imd.add([(0, 9)],
                metadata=dict(type='gene', ID='gene00001', Name='EDEN'))
        with io.StringIO() as fh:
            _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
            serialized = fh.getvalue()
            # lines starting with '#' are comments, which are not stored in
            # IntervalMetadata, so they are excluded from the comparison
            observed_lines = list(
                filter(lambda ln: not ln.startswith('#'),
                       serialized.splitlines()))

        self.assertEqual(expected_lines, observed_lines)
Esempio n. 33
0
    def test_interval_metadata_to_gff3_multiple_values(self):
        # test multiple values of db_xref are correctly serialized: the
        # list is expected as a single comma-separated Dbxref attribute
        expected_lines = [
            'ctg123\t.\tgene\t1\t9\t.\t.\t.\tDbxref=GO:000152,GO:001234'
        ]
        imd = IntervalMetadata(9)
        gene_md = {'type': 'gene'}
        gene_md['db_xref'] = ['GO:000152', 'GO:001234']
        imd.add([(0, 9)], metadata=gene_md)
        with io.StringIO() as fh:
            _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
            # comments are not stored in IntervalMetadata, hence only the
            # uncommented lines are collected
            observed_lines = []
            for ln in fh.getvalue().splitlines():
                if not ln.startswith('#'):
                    observed_lines.append(ln)

        self.assertEqual(expected_lines, observed_lines)
Esempio n. 34
0
    def test_filter_partial_genes(self):
        # Write two records to a GFF3 file, run filter_partial_genes on it
        # and compare what is read back with the expected ('seq2', imd3)
        # pair, which holds only the feature whose 'partial' is '00'.
        def gene_md(partial, phase, strand, feature_type, score):
            # metadata of one Prodigal feature, keys in a fixed order
            return {
                'partial': partial,
                'phase': phase,
                'source': 'Prodigal_v2.6.3',
                'strand': strand,
                'type': feature_type,
                'score': score
            }

        in_fp = join(self.tmpd, 'in.gff')
        out_fp = join(self.tmpd, 'out.gff')

        imd1 = IntervalMetadata(None)
        imd1.add([(0, 100)], metadata=gene_md('01', 0, '.', '.', '.'))

        imd2 = IntervalMetadata(None)
        imd2.add([(200, 300)], metadata=gene_md('10', 1, '-', 'CDS', '1'))
        imd2.add([(2000, 3000)], metadata=gene_md('00', 1, '.', '.', '.'))

        imd3 = IntervalMetadata(None)
        imd3.add([(2000, 3000)], metadata=gene_md('00', 1, '.', '.', '.'))

        records = (('seq1', imd1), ('seq2', imd2))
        # write() is given a generator, exactly as before
        write((record for record in records), into=in_fp, format='gff3')
        filter_partial_genes(in_fp, out_fp)

        expected_records = [('seq2', imd3)]
        observed_records = read(out_fp, format='gff3')
        for observed, expected in zip(observed_records, expected_records):
            self.assertEqual(observed, expected)
    def test_complement_without_reverse_non_empty(self):
        # Complement a bare sequence, then one annotated with metadata,
        # quality scores and one interval; annotations must come through
        # unchanged.
        for (constructor, seq_str, comp_str,
             qual) in self.all_combos_comp_qual:
            bare_comp = constructor(seq_str).complement()
            self.assertEqual(bare_comp, constructor(comp_str))

            im = IntervalMetadata(len(seq_str))
            im.add([(0, 1)], metadata={'gene': 'p53'})

            def annotated(sequence):
                # build a sequence with new annotation dicts on every call
                return constructor(
                    sequence,
                    metadata={'id': 'foo', 'description': 'bar'},
                    positional_metadata={'quality': qual},
                    interval_metadata=im)

            annotated_comp = annotated(seq_str).complement()
            self.assertEqual(annotated_comp, annotated(comp_str))
Esempio n. 36
0
    def test_parse(self):
        # Records parsed from 'aragorn.txt' are compared, in order, with
        # three expected (sequence ID, interval metadata) pairs; the third
        # expected interval metadata is empty.
        def trna(start, end, strand, product):
            # keyword arguments for add() describing one expected tRNA
            return {
                'bounds': [(start, end)],
                'metadata': {
                    'strand': strand,
                    'type': 'tRNA',
                    'source': 'Aragorn',
                    'product': product
                }
            }

        imd1 = IntervalMetadata(None)
        imd1.add(**trna(237929, 238006, '+', 'tRNA-Ile'))
        imd1.add(**trna(238048, 238124, '+', 'tRNA-Ala'))
        imd2 = IntervalMetadata(None)
        imd2.add(**trna(4954141, 4954228, '-', 'tRNA-Ser'))
        imd3 = IntervalMetadata(None)
        expected_records = (
            ('NC_016822.1', imd1),
            ('NC_016833.1', imd2),
            ('NC_016834.1', imd3),
        )

        parsed = _generator(get_data_path('aragorn.txt'))

        for expected, observed in zip(expected_records, parsed):
            exp_id, exp_imd = expected
            obs_id, obs_imd = observed
            self.assertEqual(exp_id, obs_id)
            self.assertEqual(exp_imd, obs_imd)
    def test_parse(self):
        # Records parsed from 'tandem_repeats_finder.txt' are compared, in
        # order, with two expected (sequence ID, interval metadata) pairs.
        def repeat(start, end, unit):
            # keyword arguments for add() describing one expected repeat
            return {
                'bounds': [(start, end)],
                'metadata': {
                    'source': 'Tandem_Repeats_Finder',
                    'repeat': unit,
                    'type': 'tandem_repeat'
                }
            }

        imd1 = IntervalMetadata(None)
        for spec in ((252151, 252184, 'T'), (261169, 261210, 'CTCTGA')):
            imd1.add(**repeat(*spec))
        imd2 = IntervalMetadata(None)
        imd2.add(**repeat(172614, 172703, 'AACAGCCGC'))
        expected_records = (('NC_016822.1', imd1), ('NC_016833.1', imd2))

        parsed = _generator(get_data_path('tandem_repeats_finder.txt'))

        for expected, observed in zip(expected_records, parsed):
            exp_id, exp_imd = expected
            obs_id, obs_imd = observed
            self.assertEqual(exp_id, obs_id)
            self.assertEqual(exp_imd, obs_imd)
Esempio n. 38
0
    def test_parse(self):
        # Records parsed from 'cmscan.txt' are compared, in order, with
        # three expected (sequence ID, interval metadata) pairs.
        # Each entry: sequence ID, then (bounds, metadata) per feature.
        expected_features = [
            ('NC_016822.1', [
                ([(3588441, 3588818)], {
                    'ncRNA_class': 'RNaseP_bact_a',
                    'type': 'ncRNA',
                    'strand': '-',
                    'db_xref': 'RF00010',
                    'source': 'Rfam'
                }),
                ([(3355449, 3355633)], {
                    'ncRNA_class': '5S_rRNA',
                    'type': 'rRNA',
                    'strand': '+',
                    'product': '5s_rRNA',
                    'db_xref': 'RF00001',
                    'source': 'Rfam'
                }),
            ]),
            ('NC_016833.1', [
                ([(85215, 85384)], {
                    'ncRNA_class': 'LSU_rRNA_bacteria',
                    'type': 'rRNA',
                    'strand': '+',
                    'product': '23s_rRNA',
                    'db_xref': 'RF02541',
                    'source': 'Rfam'
                }),
            ]),
            ('NC_016834.1', [
                ([(8739, 8777)], {
                    'ncRNA_class': 'SSU_rRNA_bacteria',
                    'type': 'rRNA',
                    'strand': '+',
                    'product': '16s_rRNA',
                    'db_xref': 'RF00177',
                    'source': 'Rfam'
                }),
            ]),
        ]
        expected_records = []
        for seq_id, features in expected_features:
            record_md = IntervalMetadata(None)
            for feature_bounds, feature_md in features:
                record_md.add(bounds=feature_bounds, metadata=feature_md)
            expected_records.append((seq_id, record_md))

        parsed = _generator(get_data_path('cmscan.txt'))

        for expected, observed in zip(expected_records, parsed):
            exp_id, exp_imd = expected
            obs_id, obs_imd = observed
            self.assertEqual(exp_id, obs_id)
            self.assertEqual(exp_imd, obs_imd)
    def test_serialize_location(self):
        # Each case adds one interval to the same IntervalMetadata and
        # checks the location string produced for the returned interval.
        imd = IntervalMetadata(9)
        minus = {'metadata': {'strand': '-'}}
        # (positional args of add, keyword args of add, expected location)
        cases = [
            (([(0, 1)],), {}, '1'),
            (([(0, 2)], [(True, True)]), {}, '<1..>2'),
            (([(0, 2)], [(False, True)]), {}, '1..>2'),
            (([(0, 2)], [(True, False)]), {}, '<1..2'),
            (([(0, 2), (3, 9)],), minus, 'complement(join(1..2,4..9))'),
            (([(0, 2), (3, 9)], [(True, False), (False, True)]), minus,
             'complement(join(<1..2,4..>9))'),
        ]
        for add_args, add_kwargs, location in cases:
            interval = imd.add(*add_args, **add_kwargs)
            self.assertEqual(_serialize_location(interval), location)
    def test_serialize_location(self):
        # Intervals are added one at a time to a single IntervalMetadata;
        # after each add the serialized location is compared with the
        # expected string.
        imd = IntervalMetadata(9)

        def check(location, *add_args, **add_kwargs):
            # add the interval, then assert on its serialized location
            added = imd.add(*add_args, **add_kwargs)
            self.assertEqual(_serialize_location(added), location)

        check('1', [(0, 1)])
        # the second positional argument flags the start and/or the end
        check('<1..>2', [(0, 2)], [(True, True)])
        check('1..>2', [(0, 2)], [(False, True)])
        check('<1..2', [(0, 2)], [(True, False)])

        two_parts = [(0, 2), (3, 9)]
        check('complement(join(1..2,4..9))',
              two_parts, metadata={'strand': '-'})
        check('complement(join(<1..2,4..>9))',
              [(0, 2), (3, 9)], [(True, False), (False, True)],
              metadata={'strand': '-'})
Esempio n. 41
0
    def test_parse(self):
        # Records parsed from 'rnammer.gff' are compared, in order, with
        # two expected (sequence ID, interval metadata) pairs.
        def rrna(start, end, product, strand, score):
            # keyword arguments for add() describing one expected rRNA
            return {
                'bounds': [(start, end)],
                'metadata': {
                    'source': 'RNAmmer-1.2',
                    'type': 'rRNA',
                    'product': product,
                    'strand': strand,
                    'score': score
                }
            }

        imd1 = IntervalMetadata(None)
        imd1.add(**rrna(0, 2853, '23s_rRNA', '-', '3222.8'))
        imd1.add(**rrna(2924, 3040, '5s_rRNA', '+', '80.8'))
        imd2 = IntervalMetadata(None)
        imd2.add(**rrna(77272, 78834, '16s_rRNA', '+', '1984.2'))

        expected_records = (('NZ_JXDA01000005.1', imd1),
                            ('NZ_JXDA01000001.1', imd2))

        parsed = _generator(get_data_path('rnammer.gff'))

        for expected, observed in zip(expected_records, parsed):
            exp_id, exp_imd = expected
            obs_id, obs_imd = observed
            self.assertEqual(exp_id, obs_id)
            self.assertEqual(exp_imd, obs_imd)
Esempio n. 42
0
class GFF3IOTests(TestCase):
    """Shared fixtures for the GFF3 I/O tests."""

    def setUp(self):
        """Prepare data paths, interval metadata and annotated sequences."""
        self.multi_fp = get_data_path('gff3_multi_record')
        self.single_fp = get_data_path('gff3_single_record')

        # One keyword-argument dict per feature, ready for
        # ``IntervalMetadata.add(**feature)``.
        chromosome = dict(
            bounds=[(0, 4641652)],
            metadata={'source': 'European Nucleotide Archive',
                      'type': 'chromosome',
                      'score': '.',
                      'strand': '.',
                      'ID': 'chromosome:Chromosome',
                      'Alias': 'U00096.3',
                      'Is_circular': 'true'})
        promoter = dict(
            bounds=[(147, 148)],
            metadata={'source': 'regulondb_feature',
                      'type': 'biological_region',
                      'score': '.',
                      'strand': '+',
                      'external_name':
                      'Promoter thrLp (RegulonDB:ECK120010236)',
                      'logic_name': 'regulondb_promoter'})
        gene = dict(
            bounds=[(336, 2799)],
            metadata={'source': 'Prodigal_v2.60',
                      'type': 'gene',
                      'score': '1.8',
                      'strand': '+',
                      'phase': 0,
                      'ID': '1_1',
                      'gc_cont': '0.427'})
        cds = dict(
            bounds=[(336, 2799)],
            metadata={'source': 'Prodigal_v2.60',
                      'type': 'CDS',
                      'score': '333.8',
                      'strand': '+',
                      'phase': 0,
                      'ID': '1_2',
                      'Parent': '1_1',
                      'rbs_motif': 'GGAG/GAGG',
                      'rbs_spacer': '5-10bp'})
        split_gene = dict(
            bounds=[(0, 50), (55, 100)],
            metadata={'source': 'Prodigal_v2.60',
                      'type': 'gene',
                      'score': '1.8',
                      'strand': '+',
                      'phase': 0,
                      'ID': '1_1',
                      'gene': 'FXR receptor'})

        # Bounded container with the chromosome-level features.
        self.upper_bound = 4641652
        self.imd1 = IntervalMetadata(self.upper_bound)
        for feature in (chromosome, promoter):
            self.imd1.add(**feature)

        # Unbounded containers with the gene-level features.
        self.imd2 = IntervalMetadata(None)
        for feature in (gene, cds):
            self.imd2.add(**feature)

        self.imd3 = IntervalMetadata(None)
        self.imd3.add(**split_gene)

        # Sequence fixture carrying a single annotated gene.
        self.seq_fp = get_data_path('gff3_dna')
        seq_metadata = {'id': 'NC_1',
                        'description': 'species X'}
        self.seq = Sequence('ATGCATGCATGC', metadata=seq_metadata)
        gene1_metadata = {'source': 'Prodigal_v2.60',
                          'type': 'gene',
                          'score': '.',
                          'strand': '+',
                          'phase': 0,
                          'ID': 'gene1',
                          'Name': 'FXR'}
        self.seq.interval_metadata.add([(0, 9)], metadata=gene1_metadata)
        self.dna = DNA(self.seq)
Esempio n. 43
0
    def test_eq_ne(self):
        """Equality ignores insertion order but not metadata values."""
        def build(*features):
            # Fresh IntervalMetadata (upper bound 10) holding ``features``,
            # added in the order given.
            container = IntervalMetadata(10)
            for bounds, metadata in features:
                container.add(metadata=metadata, bounds=bounds)
            return container

        reference = build(
            ([(0, 2), (4, 7)], {'gene': 'sagA', 'bound': '0'}),
            ([(3, 5)], {'gene': 'sagB', 'bound': '3'}))

        # The ordering shouldn't matter
        reordered = build(
            ([(3, 5)], {'gene': 'sagB', 'bound': '3'}),
            ([(0, 2), (4, 7)], {'gene': 'sagA', 'bound': '0'}))

        # Same bounds as ``reference``; sagA has a different 'bound' value.
        altered = build(
            ([(0, 2), (4, 7)], {'gene': 'sagA', 'bound': '3'}),
            ([(3, 5)], {'gene': 'sagB', 'bound': '3'}))

        self.assertReallyEqual(reference, reordered)
        self.assertReallyNotEqual(reference, altered)
Esempio n. 44
0
    def test_compute_rrna_score(self):
        """Check ``compute_rrna_score`` on several sets of interval metadata.

        Scores are floats, so they are compared with ``assertAlmostEqual``:
        exact equality would depend on the order in which the implementation
        happens to add up its terms.
        """
        imd1 = IntervalMetadata(None)
        imd1.add([(0, 100)], metadata={'type': 'rRNA', 'product': '5s_rRNA'})
        imd1.add([(0, 725)], metadata={'type': 'rRNA', 'product': '16s_rRNA'})
        # a non-rRNA feature alongside the rRNA ones
        imd1.add([(0, 12)], metadata={'type': 'tRNA'})
        imd2 = IntervalMetadata(None)
        imd2.add([(0, 10)], metadata={'type': 'rRNA', 'product': '5s_rRNA'})
        imd2.add([(0, 1450)], metadata={'type': 'rRNA', 'product': '16s_rRNA'})
        imd3 = IntervalMetadata(None)
        imd3.add([(0, 2900)], metadata={'type': 'rRNA', 'product': '23s_rRNA'})

        obs = compute_rrna_score([imd1, imd2, imd3])
        exp = 0.1 + sum([0.3] * 3)
        self.assertAlmostEqual(obs, exp)

        obs = compute_rrna_score([imd1, imd2])
        exp = 0.1 + sum([0.3] * 2)
        self.assertAlmostEqual(obs, exp)

        # no input at all
        obs = compute_rrna_score([])
        self.assertAlmostEqual(obs, 0.1)

        obs = compute_rrna_score([imd2])
        self.assertAlmostEqual(obs, 0.5)

        obs = compute_rrna_score([imd1])
        self.assertAlmostEqual(obs, 0.6)
Esempio n. 45
0
class TestIntervalMetadata(unittest.TestCase, ReallyEqualMixin):
    """Unit tests for ``IntervalMetadata``.

    Covers construction, copying, concatenation, merging, sorting, adding,
    querying, dropping, reversing, comparison and ``repr``.
    """

    def setUp(self):
        # All fixtures share one upper bound. ``im_empty`` has no intervals;
        # ``im_1`` and ``im_2`` are populated by constructing ``Interval``
        # objects against them (one and two intervals respectively, as
        # ``test_num_interval_features`` asserts).
        self.upper_bound = 10
        self.im_empty = IntervalMetadata(self.upper_bound)
        self.im_1 = IntervalMetadata(self.upper_bound)
        self.im_1_1 = Interval(
            interval_metadata=self.im_1,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA',  'bound': 0})
        self.im_2 = IntervalMetadata(self.upper_bound)
        self.im_2_1 = Interval(
            interval_metadata=self.im_2,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA',  'bound': 0})
        # The mutable 'spam' list lets the copy tests tell shallow copies
        # from deep ones.
        self.im_2_2 = Interval(
            interval_metadata=self.im_2,
            bounds=[(3, 5)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_copy_empty(self):
        obs = copy(self.im_empty)
        self.assertEqual(obs, self.im_empty)
        self.assertIsNot(obs._intervals, self.im_empty._intervals)
        self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree)

    def test_copy(self):
        obs = copy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1._interval_metadata, i2._interval_metadata)
            self.assertIsNot(i1.metadata, i2.metadata)
            # shallow copy: the metadata values themselves are shared
            for k in i1.metadata:
                self.assertIs(i1.metadata[k], i2.metadata[k])

    def test_deepcopy(self):
        obs = deepcopy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1.metadata, i2.metadata)

        # Select the last pair explicitly (the interval carrying the mutable
        # 'spam' list) instead of relying on the loop variables leaking out
        # of the ``for`` statement above.
        last = self.im_2.num_interval_features - 1
        i1, i2 = obs._intervals[last], self.im_2._intervals[last]

        # Mutating the original's nested list must not affect the deep copy.
        i2.metadata['spam'].append(1)
        self.assertEqual(i2.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0, 1]})
        self.assertEqual(i1.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_deepcopy_memo_is_respected(self):
        # deepcopy() is expected to record the copied objects in ``memo``.
        memo = {}
        deepcopy(self.im_1, memo)
        self.assertGreater(len(memo), 2)

    def test_init(self):
        self.assertFalse(self.im_empty._is_stale_tree)
        self.assertEqual(self.im_empty._intervals, [])

    def test_init_upper_bound_lt_lower_bound(self):
        # test that no exception is raised
        IntervalMetadata(0)

        with self.assertRaises(ValueError):
            IntervalMetadata(-1)

    def test_upper_bound_is_none(self):
        im = IntervalMetadata(None)
        # should not raise error
        im.add([(0, 1000000000)])
        self.assertIsNone(im.upper_bound)
        # operations that need a concrete upper bound must fail
        with self.assertRaisesRegex(
                TypeError, r'upper bound is `None`'):
            im._reverse()
        with self.assertRaisesRegex(
                TypeError, r'upper bound is `None`'):
            IntervalMetadata.concat([self.im_1, im])

    def test_init_copy_from(self):
        for i in [None, 99, 999]:
            obs = IntervalMetadata(i, self.im_1)
            exp = IntervalMetadata(i)
            exp.add(bounds=[(1, 2), (4, self.upper_bound)],
                    metadata={'gene': 'sagA',  'bound': 0})
            self.assertEqual(obs, exp)

    def test_init_copy_from_empty(self):
        for i in [None, 0, 9, 99, 999]:
            obs = IntervalMetadata(i, self.im_empty)
            exp = IntervalMetadata(i)
            self.assertEqual(obs, exp)
            # test it is shallow copy
            self.assertIsNot(obs._intervals, self.im_empty._intervals)
            self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree)

    def test_init_copy_from_shallow_copy(self):
        obs = IntervalMetadata(self.upper_bound, self.im_2)
        self.assertEqual(self.im_2, obs)
        # test it is shallow copy
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)
        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1._interval_metadata, i2._interval_metadata)
            self.assertIsNot(i1.metadata, i2.metadata)
            for k in i1.metadata:
                self.assertIs(i1.metadata[k], i2.metadata[k])

    def test_init_copy_from_error(self):
        # ``im_2`` holds an interval ending at ``upper_bound``, so copying it
        # into a container with a smaller upper bound must be rejected.
        i = self.upper_bound - 1
        with self.assertRaisesRegex(
                ValueError, r'larger than upper bound \(%r\)' % i):
            IntervalMetadata(i, self.im_2)

    def test_num_interval_features(self):
        self.assertEqual(self.im_empty.num_interval_features, 0)
        self.assertEqual(self.im_1.num_interval_features, 1)
        self.assertEqual(self.im_2.num_interval_features, 2)

    def test_duplicate(self):
        '''Test query and drop methods on duplicate Intervals.'''
        intvl_1 = self.im_empty.add([(1, 2)])
        intvl_2 = self.im_empty.add([(1, 2)])
        self.assertEqual(len(list(self.im_empty.query([(1, 2)]))), 2)
        self.im_empty.drop([intvl_1])
        self.assertEqual(len(self.im_empty._intervals), 1)
        # identity, not just equality: the surviving object is ``intvl_2``
        self.assertIs(self.im_empty._intervals[0], intvl_2)

    def test_duplicate_bounds(self):
        # An interval with repeated bounds is returned only once by a query.
        intvl = self.im_empty.add([(1, 2), (1, 2)])
        intvls = list(self.im_empty.query([(1, 2)]))
        self.assertEqual(len(intvls), 1)
        self.assertIs(intvl, intvls[0])

    def test_concat_empty(self):
        for i in 0, 1, 2:
            obs = IntervalMetadata.concat([self.im_empty] * i)
            exp = IntervalMetadata(self.upper_bound * i)
            self.assertEqual(obs, exp)

        obs = IntervalMetadata.concat([])
        self.assertEqual(obs, IntervalMetadata(0))

    def test_concat(self):
        im1 = IntervalMetadata(3)
        im2 = IntervalMetadata(4)
        im3 = IntervalMetadata(5)
        im1.add([(0, 2)], [(True, True)])
        im2.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
        im2.add([(2, 4)], metadata={'gene': 'sagB'})
        im3.add([(1, 5)], [(False, True)], {'gene': 'sagC'})
        obs = IntervalMetadata.concat([im1, im2, im3])

        # Expected: upper bounds add up (3 + 4 + 5) and each interval is
        # shifted by the summed upper bounds of the preceding objects.
        exp = IntervalMetadata(12)
        exp.add(bounds=[(0, 2)], fuzzy=[(True, True)])
        exp.add(bounds=[(3, 6)], fuzzy=[(True, False)],
                metadata={'gene': 'sagA'})
        exp.add(bounds=[(5, 7)], metadata={'gene': 'sagB'})
        exp.add(bounds=[(8, 12)], fuzzy=[(False, True)],
                metadata={'gene': 'sagC'})
        self.assertEqual(obs, exp)

    def test_merge(self):
        # empty + empty
        im = IntervalMetadata(self.upper_bound)
        self.im_empty.merge(im)
        self.assertEqual(self.im_empty, im)
        # empty + non-empty
        self.im_empty.merge(self.im_1)
        self.assertEqual(self.im_empty, self.im_1)
        # non-empty + non-empty
        self.im_empty.merge(self.im_2)
        self.im_2.merge(self.im_1)
        self.assertEqual(self.im_empty, self.im_2)

    def test_merge_unequal_upper_bounds(self):
        n = 3
        im1 = IntervalMetadata(n)
        for im in [self.im_empty, self.im_1]:
            with self.assertRaisesRegex(
                    ValueError,
                    r'not equal \(%d != %d\)' % (self.upper_bound, n)):
                im.merge(im1)

    def test_merge_to_unbounded(self):
        # Merging into an unbounded object works whatever the source's bound.
        for im in [self.im_empty, self.im_1, IntervalMetadata(None)]:
            obs = IntervalMetadata(None)
            obs.merge(im)
            self.assertIsNone(obs.upper_bound)
            self.assertEqual(obs._intervals, im._intervals)

    def test_merge_unbounded_to_bounded(self):
        im = IntervalMetadata(None)
        with self.assertRaisesRegex(
                ValueError,
                r'Cannot merge an unbound IntervalMetadata object '
                'to a bounded one'):
            self.im_1.merge(im)
        # original im is not changed
        self.assertIsNone(im.upper_bound)
        self.assertEqual(im._intervals, [])

    def test_sort(self):
        # Third interval for ``im_2``; created only to give the sort
        # something to reorder.
        interval = Interval(
            self.im_2,
            [(1, 2), (3, 8)],
            metadata={'gene': 'sagA',  'bound': 0})
        im = deepcopy(self.im_2)
        self.im_2.sort(False)
        # check sorting does not have other side effects
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [self.im_2_2, self.im_2_1, interval])

        self.im_2.sort()
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [interval, self.im_2_1, self.im_2_2])

        self.im_empty.sort()
        self.assertEqual(self.im_empty, IntervalMetadata(self.upper_bound))

    def test_add_eq_upper_bound(self):
        self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound)],
                          metadata={'gene': 'sagA',  'bound': 0})
        self.assertTrue(self.im_empty._is_stale_tree)
        interval = self.im_empty._intervals[0]
        self.assertEqual(interval.bounds, [(1, 2), (4, self.upper_bound)])
        self.assertEqual(interval.metadata, {'gene': 'sagA', 'bound': 0})
        self.assertIsInstance(self.im_empty._interval_tree, IntervalTree)

    def test_add_gt_upper_bound(self):
        with self.assertRaises(ValueError):
            self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound+1)],
                              metadata={'gene': 'sagA',  'bound': 0})

    def test_add_eq_start_end_bound(self):
        for i in 0, 1, self.upper_bound:
            # test that no exception is raised
            self.im_empty.add(bounds=[(i, i)],
                              metadata={'gene': 'sagA',  'bound': 0})

    def test_query_attribute(self):
        # An empty query dict is expected to match every interval.
        # Materialize the result and compare lengths first: zip() alone would
        # pass silently if the query returned too few (or no) intervals.
        intervals = list(self.im_2._query_attribute({}))
        self.assertEqual(len(intervals), len(self.im_2._intervals))
        for i, j in zip(intervals, self.im_2._intervals):
            self.assertEqual(i, j)

        intervals = list(self.im_2._query_attribute(None))
        self.assertEqual(len(intervals), 0)

        # Querying with an interval's own metadata finds only that interval.
        for i in self.im_2._intervals:
            intervals = list(self.im_2._query_attribute(i.metadata))
            self.assertEqual(len(intervals), 1)
            self.assertEqual(intervals[0], i)

    def test_query_interval(self):
        intervals = list(self.im_2._query_interval((1, 2)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

        intervals = list(self.im_2._query_interval((3, 4)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        # Compare via repr() so that the result order does not matter.
        intervals = {repr(i) for i in self.im_2._query_interval((1, 7))}
        self.assertEqual(len(intervals), 2)
        self.assertSetEqual(intervals,
                            {repr(i) for i in self.im_2._intervals})

    def test_query_interval_upper_bound(self):
        intervals = list(self.im_2._query_interval((self.upper_bound-1,
                                                    self.upper_bound)))
        self.assertEqual(intervals, [self.im_2_1])

    def test_query(self):
        intervals = list(self.im_2.query(bounds=[(1, 5)],
                                         metadata={'gene': 'sagA'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

    def test_query_empty(self):
        # A query without any criteria returns nothing.
        intervals = list(self.im_1.query())
        self.assertEqual(len(intervals), 0)

    def test_query_no_hits(self):
        intervals = list(self.im_2.query(bounds=[(self.upper_bound, 200)]))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(bounds=[(1, 2)],
                                         metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

    def test_query_interval_only(self):
        for loc in [[(1, 7)],
                    [(1, 2), (3, 4)]]:
            intervals = list(self.im_2.query(bounds=loc))
            self.assertEqual(len(intervals), 2)
            self.assertEqual(intervals[0], self.im_2_1)
            self.assertEqual(intervals[1], self.im_2_2)

    def test_query_metadata_only(self):
        intervals = list(self.im_2.query(metadata={'gene': 'sagB'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        intervals = list(self.im_2.query(metadata={'bound': 0}))
        self.assertEqual(len(intervals), 2)
        self.assertEqual(intervals[0], self.im_2_1)
        self.assertEqual(intervals[1], self.im_2_2)

    def test_drop(self):
        intvl = self.im_2._intervals[0]
        self.im_2.drop([intvl])
        self.assertEqual(len(self.im_2._intervals), 1)
        self.assertEqual(self.im_2._intervals[0], self.im_2_2)
        # test the intvl was set to dropped
        self.assertTrue(intvl.dropped)

    def test_drop_all(self):
        self.im_2.drop(self.im_2._intervals)
        self.assertEqual(self.im_2, self.im_empty)

    def test_drop_negate(self):
        # With ``negate=True`` the listed interval is kept and the other one
        # is dropped.
        intvl = self.im_2._intervals[0]
        self.im_2.drop([intvl], negate=True)
        self.assertEqual(len(self.im_2._intervals), 1)
        self.assertEqual(self.im_2._intervals[0], intvl)
        # test the dropped intvl was set to dropped
        self.assertTrue(self.im_2_2.dropped)

    def test_reverse(self):
        self.im_2._reverse()
        # Build the expected, mirrored intervals inside ``im_empty``. The
        # Interval objects are created only for their effect on that
        # container, so their return values are not kept.
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(0, 6), (8, 9)],
            metadata={'gene': 'sagA',  'bound': 0})
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(5, 7)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})
        self.assertEqual(self.im_2, self.im_empty)

    def test_eq_ne(self):
        im1 = IntervalMetadata(10)
        im1.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])
        im1.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        # The ordering shouldn't matter
        im2 = IntervalMetadata(10)
        im2.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])
        im2.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])

        # Same bounds as im1, but sagA has a different 'bound' value.
        im3 = IntervalMetadata(10)
        im3.add(metadata={'gene': 'sagA', 'bound': '3'},
                bounds=[(0, 2), (4, 7)])
        im3.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        self.assertReallyEqual(im1, im2)
        self.assertReallyNotEqual(im1, im3)

    def test_ne_diff_bounds(self):
        # Identical intervals, different upper bounds (10 vs 9).
        im1 = IntervalMetadata(10)
        im2 = IntervalMetadata(9)
        intvl = {'bounds': [(0, 1)], 'metadata': {'spam': 'foo'}}
        im1.add(**intvl)
        im2.add(**intvl)
        self.assertReallyNotEqual(im1, im2)

    def test_repr(self):
        exp = '''0 interval features
-------------------'''
        self.assertEqual(repr(self.im_empty), exp)

        self.im_empty.add([(1, 2)], metadata={'gene': 'sagA'})

        # From here on ``exp`` is a regular expression: the value shown as
        # ``interval_metadata=<...>`` is matched as any run of digits.
        exp = ("1 interval feature\n"
               "------------------\n"
               r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(1, 2\)\], "
               r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagA'}\)")
        self.assertRegex(repr(self.im_empty), exp)

        self.im_empty.add([(3, 4)], metadata={'gene': 'sagB'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagC'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagD'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagE'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagF'})
        # With six features the expected repr shows the first two and the
        # last two, separated by a "..." line.
        exp = ("6 interval features\n"
               "-------------------\n"
               r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(1, 2\)\], "
               r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagA'}\)\n"
               r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
               r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagB'}\)\n"
               r"...\n"
               r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
               r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagE'}\)\n"
               r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
               r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagF'}\)")
        self.assertRegex(repr(self.im_empty), exp)
Esempio n. 46
0
    def setUp(self):
        # to test ID line
        self.id = (
            # This is a derived record (non-coding, rRNA and spacer records)
            # (feature level record:
            # http://www.ebi.ac.uk/ena/browse/feature-level-products
            # TODO: a Uniprot record?
            ([
                'ID   AB000684.1:1..275:rRNA; SV 1; linear; '
                'genomic DNA; STD; ENV; 275 BP.'
            ], {
                'division': 'ENV',
                'mol_type': 'genomic DNA',
                'shape': 'linear',
                'locus_name': 'AB000684.1:1..275:rRNA',
                'unit': 'bp',
                'size': 275,
                'version': 1,
                'class': 'STD',
                'date': None
            }),
            # A standard record
            (['ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.'], {
                'division': 'PRO',
                'mol_type': 'mRNA',
                'shape': 'linear',
                'locus_name': 'M14399',
                'unit': 'bp',
                'size': 63,
                'version': 1,
                'class': 'STD',
                'date': None
            }))

        # define a single DNA record (with no interval metadata)
        # M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.
        self.single = (
            'gtgaaacaaagcactattgcactggctgtcttaccgttactgtttacccctgtgacaaaagcc',
            {
                'LOCUS': {
                    'locus_name': 'M14399',
                    'class': 'STD',
                    'division': 'PRO',
                    'mol_type': 'mRNA',
                    'shape': 'linear',
                    'size': 63,
                    'unit': 'bp',
                    'version': 1,
                    'date': None
                }
            }, None, DNA)

        # define a single protein record (uniprot)
        self.protein = (
            'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKG'
            'LIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAY'
            'NLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFK'
            'ALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWK'
            'FTPL', {
                'LOCUS': {
                    'locus_name': '001R_FRG3G',
                    'status': 'Reviewed',
                    'size': 256,
                    'unit': 'aa'
                }
            }, None, Protein)

        # define a single DNA record uppercase (filepath)
        self.single_upper_fp = get_data_path('embl_single_record_upper')

        # define a single RNA record lower
        self.single_lower_fp = get_data_path('embl_single_record_lower')

        # define a single RNA record file path
        self.single_rna_fp = get_data_path('embl_single_record')

        # define a http://www.ebi.ac.uk/ena/browse/feature-level-products
        self.feature_level_fp = get_data_path("embl_feature_level_record")

        # define a interval metadata (see skbio.metadata.IntervalMetadata)
        imd = IntervalMetadata(63)

        # then add interval object to interval metadata. Add source
        imd.add(
            [(0, 63)], [(False, False)], {
                'db_xref': '"taxon:562"',
                'mol_type': '"mRNA"',
                'organism': '"Escherichia coli"',
                'type': 'source',
                'strand': '+',
                '__location': '1..63'
            })

        imd.add(
            [(0, 63)],
            # the second True is beacause exact location is not known
            [(False, True)],
            {
                'phase':
                0,
                'db_xref': [
                    '"GOA:P00634"', '"InterPro:IPR001952"',
                    '"InterPro:IPR017849"', '"InterPro:IPR017850"',
                    '"InterPro:IPR018299"', '"PDB:1AJA"', '"PDB:1AJB"',
                    '"PDB:1AJC"', '"PDB:1AJD"', '"PDB:1ALH"', '"PDB:1ALI"',
                    '"PDB:1ALJ"', '"PDB:1ALK"', '"PDB:1ANI"', '"PDB:1ANJ"',
                    '"PDB:1B8J"', '"PDB:1ED8"', '"PDB:1ED9"', '"PDB:1ELX"',
                    '"PDB:1ELY"', '"PDB:1ELZ"', '"PDB:1EW8"', '"PDB:1EW9"',
                    '"PDB:1HJK"', '"PDB:1HQA"', '"PDB:1KH4"', '"PDB:1KH5"',
                    '"PDB:1KH7"', '"PDB:1KH9"', '"PDB:1KHJ"', '"PDB:1KHK"',
                    '"PDB:1KHL"', '"PDB:1KHN"', '"PDB:1URA"', '"PDB:1URB"',
                    '"PDB:1Y6V"', '"PDB:1Y7A"', '"PDB:2ANH"', '"PDB:2G9Y"',
                    '"PDB:2GA3"', '"PDB:2MLX"', '"PDB:2MLY"', '"PDB:2MLZ"',
                    '"PDB:3BDF"', '"PDB:3BDG"', '"PDB:3BDH"', '"PDB:3CMR"',
                    '"PDB:3DPC"', '"PDB:3DYC"', '"PDB:3TG0"', '"PDB:4KM4"',
                    '"PDB:4YR1"', '"PDB:5C66"', '"PDB:5GAD"', '"PDB:5GAF"',
                    '"PDB:5GAG"', '"PDB:5GAH"', '"PDB:5JTL"', '"PDB:5JTM"',
                    '"PDB:5JTN"', '"PDB:5JTO"', '"PDB:5JTP"',
                    '"UniProtKB/Swiss-Prot:P00634"'
                ],
                '__location':
                '1..>63',
                'strand':
                '+',
                'note':
                '"alkaline phosphatase signal peptide"',
                'protein_id':
                '"AAA23431.1"',
                'transl_table':
                '11',
                'translation':
                '"MKQSTIALAVLPLLFTPVTKA"',
                'type':
                'CDS'
            })

        self.single_rna = (
            'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
            {
                'LOCUS': {
                    'locus_name': 'M14399',
                    'class': 'STD',
                    'division': 'PRO',
                    'mol_type': 'mRNA',
                    'shape': 'linear',
                    'size': 63,
                    'unit': 'bp',
                    'version': 1,
                    'date': '02-SEP-1999'
                },
                'ACCESSION':
                'M14399;',  # accessions (could be more than one)
                'VERSION':
                'M14399.1',  # a genbank like version
                'DATE': [
                    "16-JUL-1988 (Rel. 16, Created)",
                    "02-SEP-1999 (Rel. 60, Last updated, Version 3)"
                ],
                'DBSOURCE':
                'MD5; c9b40131b8622946b5aafdf5473b3d43.',
                'DEFINITION':
                "E.coli alkaline phosphatase signal mRNA, 5' end.",
                'KEYWORDS':
                "alkaline phosphatase; signal peptide.",
                'SOURCE': {
                    "ORGANISM":
                    "Escherichia coli",
                    'taxonomy':
                    "Bacteria; Proteobacteria; "
                    "Gammaproteobacteria; Enterobacterales; "
                    "Enterobacteriaceae; Escherichia."
                },
                'REFERENCE': [{
                    'AUTHORS':
                    'Gray G.L., Baldridge J.S., '
                    'McKeown K.S., Heyneker H.L., '
                    'Chang C.N.;',
                    'JOURNAL':
                    'Gene 39(2-3):247-254(1985).',
                    'REFERENCE':
                    '1  (bases 1 to 63)',
                    'TITLE':
                    '"Periplasmic production of correctly '
                    'processed human growth hormone in '
                    'Escherichia coli: natural and bacterial '
                    'signal sequences are '
                    'interchangeable";',
                    'PUBMED':
                    '3912261'
                }],
                'CROSS_REFERENCE':
                ['DOI; 10.1016/0378-1119(85)'
                 '90319-1. PUBMED; 3912261.']
            },
            imd,
            RNA)

        # define a multi record. File path
        self.multi_fp = get_data_path('embl_multi_records')

        # define interval metadata (as single metadata)
        imd1 = imd

        # define interal metadata for multi 2
        imd2 = IntervalMetadata(743)

        # then add interval object to interval metadata. Add source
        imd2.add(
            [(0, 743)], [(False, False)], {
                'organism': '"Ruditapes philippinarum"',
                'type': 'source',
                '__location': '1..743',
                'strand': '+',
                'mol_type': '"mRNA"',
                'db_xref': '"taxon:129788"'
            })

        imd2.add(
            [(57, 444)], [(False, False)], {
                'translation':
                '"MPGGKAGKDSGKAKAKAVSRSARAGLQFPVGRIHRHLKNRT'
                'TSHG RVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRI'
                'TPRHLQLAIRGDEELDSLIKAT IAGGGVIPHIHKSLIGKKG'
                'GQQAK"',
                'type':
                'CDS',
                '__location':
                '58..444',
                'protein_id':
                '"APY18893.1"',
                'strand':
                '+',
                'phase':
                0,
                'product':
                '"histone"'
            })

        # multi object
        self.multi = (
            (
                'GTGAAACAAAGCACTATTGCACTGGCTGTCTTACCGTTACTGTTTACCCCTGTGACAAAAGCC',
                {
                    'LOCUS': {
                        'locus_name': 'M14399',
                        'class': 'STD',
                        'division': 'PRO',
                        'mol_type': 'mRNA',
                        'shape': 'linear',
                        'size': 63,
                        'unit': 'bp',
                        'version': 1,
                        'date': '02-SEP-1999'
                    },
                    'ACCESSION':
                    'M14399;',  # accessions (could be more than one)
                    'VERSION':
                    'M14399.1',  # a genbank like version
                    'DATE': [
                        "16-JUL-1988 (Rel. 16, Created)",
                        "02-SEP-1999 (Rel. 60, Last updated, Version 3)"
                    ],
                    'DBSOURCE':
                    'MD5; c9b40131b8622946b5aafdf5473b3d43.',
                    'DEFINITION':
                    "E.coli alkaline phosphatase signal mRNA, 5' end.",
                    'KEYWORDS':
                    "alkaline phosphatase; signal peptide.",
                    'SOURCE': {
                        "ORGANISM":
                        "Escherichia coli",
                        'taxonomy':
                        "Bacteria; Proteobacteria; "
                        "Gammaproteobacteria; Enterobacterales; "
                        "Enterobacteriaceae; Escherichia."
                    },
                    'REFERENCE': [{
                        'AUTHORS':
                        'Gray G.L., Baldridge J.S., '
                        'McKeown K.S., Heyneker H.L., '
                        'Chang C.N.;',
                        'JOURNAL':
                        'Gene 39(2-3):247-254(1985).',
                        'REFERENCE':
                        '1  (bases 1 to 63)',
                        'TITLE':
                        '"Periplasmic production of correctly '
                        'processed human growth hormone in '
                        'Escherichia coli: natural and '
                        'bacterial signal sequences are '
                        'interchangeable";',
                        'PUBMED':
                        '3912261'
                    }],
                    'CROSS_REFERENCE':
                    ['DOI; 10.1016/0378-1119(85)'
                     '90319-1. PUBMED; 3912261.']
                },
                imd1,
                DNA),
            ('TGTGCACAGTCTACGCGTCATCTTGAAAGAAAGAACTACACTACTCCAAAAATAATCATGCC'
             'TGGTGGAAAAGCTGGTAAAGATTCCGGAAAGGCCAAGGCTAAGGCAGTGTCAAGGTCCGCAA'
             'GAGCTGGCTTACAGTTTCCAGTCGGACGTATTCACAGGCATTTGAAGAACAGAACCACTAGC'
             'CACGGTCGTGTTGGAGCTACAGCAGCCGTTTACAGTGCAGCAATCCTTGAATACCTGACCGC'
             'CGAAGTGCTTGAGTTGGCTGGAAACGCAAGTAAAGATCTCAAAGTAAAGAGAATCACCCCAC'
             'GTCACTTGCAGTTGGCAATCAGAGGAGATGAAGAGTTGGATTCCCTAATTAAAGCCACAATC'
             'GCTGGTGGTGGTGTTATTCCACATATCCACAAGTCACTTATTGGCAAGAAGGGAGGTCAGCA'
             'AGCCAAATAAATTGGACATACTCATTCATCAGGGAACAATGTGTAGTGAATGTGTTAAAAAG'
             'AACAATCTCATTGTGTAGCTCTTTAGTTTTATATGAATGTGTTAACATGGTCATTCACATCG'
             'TATGACTCATAGAATCATCTGTGTATCATTTCATCCTCTCATTTTATAGCTCCTCATTTTCC'
             'TTAGACTCATTAAAATTTTTATCTCGGAAAAATGTTTTTTCTACAATTTTAGCATTCATTTA'
             'TCTTCATCTTGCTTTTATGTTTAATAAAACGAACTTATAATACCAAAAAAAAAAAAAAAAA', {
                 'ACCESSION':
                 'KX454487;',
                 'VERSION':
                 'KX454487.1',
                 'COMMENT':
                 '##Assembly-Data-START##\nSequencing Technology '
                 ':: Sanger dideoxy sequencing\n##Assembly-Data-END##',
                 'DATE': [
                     '02-FEB-2017 (Rel. 131, Created)',
                     '02-FEB-2017 (Rel. 131, Last updated, Version 1)'
                 ],
                 'DBSOURCE':
                 'MD5; cbc730cf7a8d694b50fb7dd6b993ae0d.',
                 'DEFINITION':
                 'Ruditapes philippinarum histone mRNA, '
                 'complete cds.',
                 'KEYWORDS':
                 '.',
                 'LOCUS': {
                     'locus_name': 'KX454487',
                     'class': 'STD',
                     'division': 'INV',
                     'mol_type': 'mRNA',
                     'shape': 'linear',
                     'size': 743,
                     'unit': 'bp',
                     'version': 1,
                     'date': '02-FEB-2017'
                 },
                 'REFERENCE': [{
                     'AUTHORS':
                     'Yang D., Zhao J., Wang Q.;',
                     'JOURNAL':
                     'Submitted (27-JUN-2016) to the INSDC. Key '
                     'Laboratory of Coastal Zone Environment Processes '
                     'and Ecological Remediation, Yantai Institute '
                     'of Coastal Zone Research (YIC), Chinese Academy '
                     'of Sciences (CAS), 17 Chunhui Road, Laishan '
                     'District, Yantai, Shandong 264003, China',
                     'REFERENCE':
                     '1  (bases 1 to 743)',
                     'TITLE':
                     ';'
                 }],
                 'CROSS_REFERENCE': [None],
                 'SOURCE': {
                     'ORGANISM':
                     'Ruditapes philippinarum',
                     'taxonomy':
                     'Eukaryota; Metazoa; Lophotrochozoa; Mollusca; '
                     'Bivalvia; Heteroconchia; Euheterodonta; '
                     'Veneroida; Veneroidea; Veneridae; Ruditapes.'
                 }
             }, imd2, DNA))

        # define the feature level product obj
        self.feature_level = (
            'AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUC'
            'GAGCGGCAGCACAGAGGAACUUGUUCCUUGGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCU'
            'AGGAAAUUGCCCUGAUGUGGGGGAUAACCAUUGGAAACGAUGGCUAAUACCGCAUGAUGCCU'
            'ACGGGCCAAAGAGGGGGACCUUCUGGCCUCUCGCGUCAGGAUAUGCCUAGGUGGGAUUAGCU'
            'AGUUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGAUCAGC'
            'CACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACA'
            'AUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUA'
            'CUUUCAGUCGUGAGGAAGGUGGUGUUGUUAAUAGCAGCAUCAUUUGACGUUAGCGACAGAAG'
            'AAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCGAGCGUUAAUCGGA'
            'AUUACUGGGCGUAAAGCGCAUGCAGGUGGUGGAUUAAGUCAGAUGUGAAAGCCCGGGGCUCA'
            'ACCUCGGAACCGCAUUUGAAACUGGUUCACUAGAGUACUGUAGAGGGGGGUAGAAUUUCAGG'
            'UGUAGCGGUGAAAUGCGUAGAGAUCUGAAGGAAUACCGGUGGCGAAGGCGGCCCCCUGGACA'
            'GAUACUGACACUCAGAUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCA'
            'CGCCGUAAACGAUGUCUACUUGGAGGUUGUGGCCUUGAGCCGUGGCUUUCGGAGCUAACGCG'
            'UUAAGUAGACCGCCUGGGGAGUACGGUCGCAAGAUUAAAACUCAAAUGAAUUGACGGGGGCC'
            'CGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGAACCUUACCUACUCUUG'
            'ACAUCCAGAGAAGCCAGCGGAGACGCAGGUGUGCCUUCGGGAGCUCUGAGACAGGUGCUGCA'
            'UGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUA'
            'UCCUUGUUUGCCAGCGAGUCAUGUCGGGAACUCCAGGGAGACUGCCGGUGAUAAACCGGAGG'
            'AAGGUGGGGACGACGUCAAGUCAUCAUGGCCCUUACGAGUAGGGCUACACACGUGCUACAAU'
            'GGCGCAUACAGAGGGCAGCAAGCUAGCGAUAGUGAGCGAAUCCCAAAAAGUGCGUCGUAGUC'
            'CGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUAGAUCAGAAU'
            'GCUACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGCUG'
            'CAAAAGAAGUGGGUAGUUUAACCUUUCGGGGAGGACGCUCACCACUUUGUGGUUCAUGACUG'
            'GGGUGAAGUCGUAACAAGGUAGCGCUAGGGGAACCUGGCGCUGGAUCACCUCCUUA', {
                'DATE': [
                    '02-JUN-2014 (Rel. 121, Created)',
                    '04-FEB-2016 (Rel. 127, Last updated, Version 5)'
                ],
                'DBSOURCE':
                'SILVA-LSU; LK021130. SILVA-SSU; LK021130. MD5; '
                'afd116bf2c1a13acbf40d63d82f0218c. BioSample; '
                'SAMEA3865288.',
                'DEFINITION':
                'Vibrio anguillarum 16S rRNA',
                'KEYWORDS':
                '.',
                'LOCUS': {
                    'locus_name': 'LK021130.1:74067..75610:rRNA',
                    'class': 'STD',
                    'division': 'PRO',
                    'mol_type': 'genomic DNA',
                    'shape': 'linear',
                    'size': 1544,
                    'unit': 'bp',
                    'version': 1,
                    'date': '04-FEB-2016'
                },
                'PARENT_ACCESSION':
                'LK021130.1',
                'VERSION':
                'LK021130.1',
                'PROJECT_IDENTIFIER':
                'Project:PRJEB5701;',
                'REFERENCE': [{
                    'AUTHORS':
                    'Holm K.;',
                    'JOURNAL':
                    'Submitted (26-MAR-2014) to the INSDC. '
                    'Norstruct, Dept of Chemistry, University of '
                    'Tromso, Science Park 3, NO-9037 Tromso, NORWAY.',
                    'TITLE':
                    ';',
                    'REFERENCE':
                    '1'
                }, {
                    'AUTHORS':
                    'Holm K.O., Nilsson K., Hjerde E., Willassen '
                    'N.P., Milton D.L.;',
                    'JOURNAL':
                    'Stand Genomic Sci. 10:60-60(2015).',
                    'TITLE':
                    '"Complete genome sequence of Vibrio anguillarum '
                    'strain NB10, a virulent isolate from the Gulf '
                    'of Bothnia";',
                    'REFERENCE':
                    '2',
                    'PUBMED':
                    '26380645'
                }],
                'CROSS_REFERENCE':
                [None, 'DOI; 10.1186/s40793-015-0060-7. PUBMED; 26380645.'],
                'SOURCE': {
                    'ORGANISM':
                    'Vibrio anguillarum',
                    'taxonomy':
                    'Bacteria; Proteobacteria; Gammaproteobacteria; '
                    'Vibrionales; Vibrionaceae; Vibrio.'
                }
            }, None, RNA)

        # get the feature level file without FT
        self.feature_level_fp = get_data_path(
            "embl_feature_level_record_no_FT")

        # get a genbank file in order to do file conversion
        self.genbank_fp = get_data_path('genbank_single_record')

        # a embl constructed sequence file path
        self.embl_constructed_fp = get_data_path("embl_constructed")

        # a simple embl version to perform embl->gb->embl conversion
        self.single_rna_simple_fp = get_data_path("embl_single_record_simple")
Esempio n. 47
0
    def setUp(self):
        """Build the fixtures shared by the tests of this test case.

        Populates ``self`` with two kinds of attributes:

        * expected values -- ``self.id`` holds ``(ID lines, expected dict)``
          pairs, while ``self.single``, ``self.protein``,
          ``self.single_rna``, ``self.feature_level`` and each element of
          ``self.multi`` are 4-tuples of ``(sequence string, metadata dict,
          IntervalMetadata or None, sequence class)``;
        * data-file paths -- every ``*_fp`` attribute is the result of
          ``get_data_path`` for a named data file.
        """
        # Fixtures for the ID line: each entry pairs the raw ID line(s) with
        # the dict of fields expected for them.
        self.id = (
            # This is a derived record (non-coding, rRNA and spacer records),
            # i.e. a feature level record:
            # http://www.ebi.ac.uk/ena/browse/feature-level-products
            # TODO: a Uniprot record?
            (['ID   AB000684.1:1..275:rRNA; SV 1; linear; '
              'genomic DNA; STD; ENV; 275 BP.'],
             {'division': 'ENV', 'mol_type': 'genomic DNA', 'shape': 'linear',
              'locus_name': 'AB000684.1:1..275:rRNA', 'unit': 'bp',
              'size': 275, 'version': 1, 'class': 'STD', 'date': None}),
            # A standard record
            (['ID   M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.'],
             {'division': 'PRO', 'mol_type': 'mRNA', 'shape': 'linear',
              'locus_name': 'M14399', 'unit': 'bp',
              'size': 63, 'version': 1, 'class': 'STD', 'date': None}))

        # define a single DNA record (with no interval metadata: the third
        # element of the tuple is None); only the LOCUS metadata is given
        # M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.
        self.single = (
            'gtgaaacaaagcactattgcactggctgtcttaccgttactgtttacccctgtgacaaaagcc',
            {'LOCUS': {'locus_name': 'M14399',
                       'class': 'STD',
                       'division': 'PRO',
                       'mol_type': 'mRNA',
                       'shape': 'linear',
                       'size': 63,
                       'unit': 'bp',
                       'version': 1,
                       'date': None}},
            None,
            DNA)

        # define a single protein record (uniprot), again without interval
        # metadata
        self.protein = (
            'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKG'
            'LIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAY'
            'NLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFK'
            'ALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWK'
            'FTPL',
            {'LOCUS': {'locus_name': '001R_FRG3G',
                       'status': 'Reviewed',
                       'size': 256,
                       'unit': 'aa'}},
            None,
            Protein)

        # file path of a single record with an uppercase sequence
        self.single_upper_fp = get_data_path('embl_single_record_upper')

        # file path of a single record with a lowercase sequence
        self.single_lower_fp = get_data_path('embl_single_record_lower')

        # file path of a single RNA record
        self.single_rna_fp = get_data_path('embl_single_record')

        # file path of a feature level product record, see
        # http://www.ebi.ac.uk/ena/browse/feature-level-products
        # NOTE(review): self.feature_level_fp is assigned again near the end
        # of this method, so this value is overwritten before setUp returns
        # -- confirm whether a separate attribute name was intended.
        self.feature_level_fp = get_data_path("embl_feature_level_record")

        # define an interval metadata (see skbio.metadata.IntervalMetadata)
        # for the 63 bp record M14399
        imd = IntervalMetadata(63)

        # then add interval objects to the interval metadata. Add the source
        # feature first; arguments are passed positionally as bounds, a list
        # of boolean pairs and the metadata dict
        imd.add([(0, 63)],
                [(False, False)],
                {'db_xref': '"taxon:562"',
                 'mol_type': '"mRNA"',
                 'organism': '"Escherichia coli"',
                 'type': 'source',
                 'strand': '+',
                 '__location': '1..63'})

        # add the CDS feature
        imd.add([(0, 63)],
                # the second True is because the exact location is not known
                # (the location string below is '1..>63')
                [(False, True)],
                {'phase': 0,
                 'db_xref': ['"GOA:P00634"',
                             '"InterPro:IPR001952"',
                             '"InterPro:IPR017849"',
                             '"InterPro:IPR017850"',
                             '"InterPro:IPR018299"',
                             '"PDB:1AJA"',
                             '"PDB:1AJB"',
                             '"PDB:1AJC"',
                             '"PDB:1AJD"',
                             '"PDB:1ALH"',
                             '"PDB:1ALI"',
                             '"PDB:1ALJ"',
                             '"PDB:1ALK"',
                             '"PDB:1ANI"',
                             '"PDB:1ANJ"',
                             '"PDB:1B8J"',
                             '"PDB:1ED8"',
                             '"PDB:1ED9"',
                             '"PDB:1ELX"',
                             '"PDB:1ELY"',
                             '"PDB:1ELZ"',
                             '"PDB:1EW8"',
                             '"PDB:1EW9"',
                             '"PDB:1HJK"',
                             '"PDB:1HQA"',
                             '"PDB:1KH4"',
                             '"PDB:1KH5"',
                             '"PDB:1KH7"',
                             '"PDB:1KH9"',
                             '"PDB:1KHJ"',
                             '"PDB:1KHK"',
                             '"PDB:1KHL"',
                             '"PDB:1KHN"',
                             '"PDB:1URA"',
                             '"PDB:1URB"',
                             '"PDB:1Y6V"',
                             '"PDB:1Y7A"',
                             '"PDB:2ANH"',
                             '"PDB:2G9Y"',
                             '"PDB:2GA3"',
                             '"PDB:2MLX"',
                             '"PDB:2MLY"',
                             '"PDB:2MLZ"',
                             '"PDB:3BDF"',
                             '"PDB:3BDG"',
                             '"PDB:3BDH"',
                             '"PDB:3CMR"',
                             '"PDB:3DPC"',
                             '"PDB:3DYC"',
                             '"PDB:3TG0"',
                             '"PDB:4KM4"',
                             '"PDB:4YR1"',
                             '"PDB:5C66"',
                             '"PDB:5GAD"',
                             '"PDB:5GAF"',
                             '"PDB:5GAG"',
                             '"PDB:5GAH"',
                             '"PDB:5JTL"',
                             '"PDB:5JTM"',
                             '"PDB:5JTN"',
                             '"PDB:5JTO"',
                             '"PDB:5JTP"',
                             '"UniProtKB/Swiss-Prot:P00634"'],
                 '__location': '1..>63',
                 'strand': '+',
                 'note': '"alkaline phosphatase signal peptide"',
                 'protein_id': '"AAA23431.1"',
                 'transl_table': '11',
                 'translation': '"MKQSTIALAVLPLLFTPVTKA"',
                 'type': 'CDS'})

        # the full single RNA record: lowercase RNA sequence, complete
        # metadata dict, the interval metadata built above and the RNA class
        self.single_rna = (
            'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
            {'LOCUS': {'locus_name': 'M14399',
                       'class': 'STD',
                       'division': 'PRO',
                       'mol_type': 'mRNA',
                       'shape': 'linear',
                       'size': 63,
                       'unit': 'bp',
                       'version': 1,
                       'date': '02-SEP-1999'},
             'ACCESSION': 'M14399;',  # accessions (could be more than one)
             'VERSION': 'M14399.1',  # a genbank like version
             'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                      "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
             'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
             'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
             'KEYWORDS': "alkaline phosphatase; signal peptide.",
             'SOURCE': {"ORGANISM": "Escherichia coli",
                        'taxonomy': "Bacteria; Proteobacteria; "
                        "Gammaproteobacteria; Enterobacterales; "
                        "Enterobacteriaceae; Escherichia."},
             'REFERENCE': [{'AUTHORS': 'Gray G.L., Baldridge J.S., '
                                       'McKeown K.S., Heyneker H.L., '
                                       'Chang C.N.;',
                            'JOURNAL': 'Gene 39(2-3):247-254(1985).',
                            'REFERENCE': '1  (bases 1 to 63)',
                            'TITLE': '"Periplasmic production of correctly '
                                     'processed human growth hormone in '
                                     'Escherichia coli: natural and bacterial '
                                     'signal sequences are '
                                     'interchangeable";',
                            'PUBMED': '3912261'}],
             'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                                 '90319-1. PUBMED; 3912261.']},
            imd,
            RNA)

        # define a multi record. File path
        self.multi_fp = get_data_path('embl_multi_records')

        # interval metadata for the first multi record: imd1 is another name
        # for the very same object as imd (an alias, not a copy)
        imd1 = imd

        # define interval metadata for the second multi record (743 bp)
        imd2 = IntervalMetadata(743)

        # then add interval objects to the interval metadata. Add the source
        imd2.add([(0, 743)],
                 [(False, False)],
                 {'organism': '"Ruditapes philippinarum"',
                  'type': 'source',
                  '__location': '1..743',
                  'strand': '+',
                  'mol_type': '"mRNA"',
                  'db_xref': '"taxon:129788"'})

        # add the CDS feature; note that the expected translation string
        # contains embedded spaces
        imd2.add([(57, 444)],
                 [(False, False)],
                 {'translation': '"MPGGKAGKDSGKAKAKAVSRSARAGLQFPVGRIHRHLKNRT'
                                 'TSHG RVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRI'
                                 'TPRHLQLAIRGDEELDSLIKAT IAGGGVIPHIHKSLIGKKG'
                                 'GQQAK"',
                  'type': 'CDS',
                  '__location': '58..444',
                  'protein_id': '"APY18893.1"',
                  'strand': '+',
                  'phase': 0,
                  'product': '"histone"'})

        # multi object: one 4-tuple per expected record. The first record
        # carries the same metadata as self.single_rna but with an uppercase
        # DNA sequence and the DNA class; the second one is KX454487
        self.multi = (
            ('GTGAAACAAAGCACTATTGCACTGGCTGTCTTACCGTTACTGTTTACCCCTGTGACAAAAGCC',
             {'LOCUS': {'locus_name': 'M14399',
                        'class': 'STD',
                        'division': 'PRO',
                        'mol_type': 'mRNA',
                        'shape': 'linear',
                        'size': 63,
                        'unit': 'bp',
                        'version': 1,
                        'date': '02-SEP-1999'},
              'ACCESSION': 'M14399;',  # accessions (could be more than one)
              'VERSION': 'M14399.1',  # a genbank like version
              'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                       "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
              'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
              'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
              'KEYWORDS': "alkaline phosphatase; signal peptide.",
              'SOURCE': {"ORGANISM": "Escherichia coli",
                         'taxonomy': "Bacteria; Proteobacteria; "
                         "Gammaproteobacteria; Enterobacterales; "
                         "Enterobacteriaceae; Escherichia."},
              'REFERENCE': [{'AUTHORS': 'Gray G.L., Baldridge J.S., '
                                        'McKeown K.S., Heyneker H.L., '
                                        'Chang C.N.;',
                             'JOURNAL': 'Gene 39(2-3):247-254(1985).',
                             'REFERENCE': '1  (bases 1 to 63)',
                             'TITLE': '"Periplasmic production of correctly '
                                      'processed human growth hormone in '
                                      'Escherichia coli: natural and '
                                      'bacterial signal sequences are '
                                      'interchangeable";',
                             'PUBMED': '3912261'}],
              'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                                  '90319-1. PUBMED; 3912261.']},
             imd1,
             DNA),
            ('TGTGCACAGTCTACGCGTCATCTTGAAAGAAAGAACTACACTACTCCAAAAATAATCATGCC'
             'TGGTGGAAAAGCTGGTAAAGATTCCGGAAAGGCCAAGGCTAAGGCAGTGTCAAGGTCCGCAA'
             'GAGCTGGCTTACAGTTTCCAGTCGGACGTATTCACAGGCATTTGAAGAACAGAACCACTAGC'
             'CACGGTCGTGTTGGAGCTACAGCAGCCGTTTACAGTGCAGCAATCCTTGAATACCTGACCGC'
             'CGAAGTGCTTGAGTTGGCTGGAAACGCAAGTAAAGATCTCAAAGTAAAGAGAATCACCCCAC'
             'GTCACTTGCAGTTGGCAATCAGAGGAGATGAAGAGTTGGATTCCCTAATTAAAGCCACAATC'
             'GCTGGTGGTGGTGTTATTCCACATATCCACAAGTCACTTATTGGCAAGAAGGGAGGTCAGCA'
             'AGCCAAATAAATTGGACATACTCATTCATCAGGGAACAATGTGTAGTGAATGTGTTAAAAAG'
             'AACAATCTCATTGTGTAGCTCTTTAGTTTTATATGAATGTGTTAACATGGTCATTCACATCG'
             'TATGACTCATAGAATCATCTGTGTATCATTTCATCCTCTCATTTTATAGCTCCTCATTTTCC'
             'TTAGACTCATTAAAATTTTTATCTCGGAAAAATGTTTTTTCTACAATTTTAGCATTCATTTA'
             'TCTTCATCTTGCTTTTATGTTTAATAAAACGAACTTATAATACCAAAAAAAAAAAAAAAAA',
             {'ACCESSION': 'KX454487;',
              'VERSION': 'KX454487.1',
              'COMMENT': '##Assembly-Data-START##\nSequencing Technology '
                         ':: Sanger dideoxy sequencing\n##Assembly-Data-END##',
              'DATE': ['02-FEB-2017 (Rel. 131, Created)',
                       '02-FEB-2017 (Rel. 131, Last updated, Version 1)'],
              'DBSOURCE': 'MD5; cbc730cf7a8d694b50fb7dd6b993ae0d.',
              'DEFINITION': 'Ruditapes philippinarum histone mRNA, '
                            'complete cds.',
              'KEYWORDS': '.',
              'LOCUS': {'locus_name': 'KX454487',
                        'class': 'STD',
                        'division': 'INV',
                        'mol_type': 'mRNA',
                        'shape': 'linear',
                        'size': 743,
                        'unit': 'bp',
                        'version': 1,
                        'date': '02-FEB-2017'},
              'REFERENCE': [
                {'AUTHORS': 'Yang D., Zhao J., Wang Q.;',
                 'JOURNAL': 'Submitted (27-JUN-2016) to the INSDC. Key '
                            'Laboratory of Coastal Zone Environment Processes '
                            'and Ecological Remediation, Yantai Institute '
                            'of Coastal Zone Research (YIC), Chinese Academy '
                            'of Sciences (CAS), 17 Chunhui Road, Laishan '
                            'District, Yantai, Shandong 264003, China',
                 'REFERENCE': '1  (bases 1 to 743)',
                 'TITLE': ';'}],
              # a single reference, whose expected cross reference is None
              'CROSS_REFERENCE': [None],
              'SOURCE': {
                'ORGANISM': 'Ruditapes philippinarum',
                'taxonomy': 'Eukaryota; Metazoa; Lophotrochozoa; Mollusca; '
                            'Bivalvia; Heteroconchia; Euheterodonta; '
                            'Veneroida; Veneroidea; Veneridae; Ruditapes.'}},
             imd2,
             DNA))

        # define the feature level product obj: an RNA record with no
        # interval metadata (third element is None)
        self.feature_level = (
            'AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUC'
            'GAGCGGCAGCACAGAGGAACUUGUUCCUUGGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCU'
            'AGGAAAUUGCCCUGAUGUGGGGGAUAACCAUUGGAAACGAUGGCUAAUACCGCAUGAUGCCU'
            'ACGGGCCAAAGAGGGGGACCUUCUGGCCUCUCGCGUCAGGAUAUGCCUAGGUGGGAUUAGCU'
            'AGUUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGAUCAGC'
            'CACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACA'
            'AUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUA'
            'CUUUCAGUCGUGAGGAAGGUGGUGUUGUUAAUAGCAGCAUCAUUUGACGUUAGCGACAGAAG'
            'AAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCGAGCGUUAAUCGGA'
            'AUUACUGGGCGUAAAGCGCAUGCAGGUGGUGGAUUAAGUCAGAUGUGAAAGCCCGGGGCUCA'
            'ACCUCGGAACCGCAUUUGAAACUGGUUCACUAGAGUACUGUAGAGGGGGGUAGAAUUUCAGG'
            'UGUAGCGGUGAAAUGCGUAGAGAUCUGAAGGAAUACCGGUGGCGAAGGCGGCCCCCUGGACA'
            'GAUACUGACACUCAGAUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCA'
            'CGCCGUAAACGAUGUCUACUUGGAGGUUGUGGCCUUGAGCCGUGGCUUUCGGAGCUAACGCG'
            'UUAAGUAGACCGCCUGGGGAGUACGGUCGCAAGAUUAAAACUCAAAUGAAUUGACGGGGGCC'
            'CGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGAACCUUACCUACUCUUG'
            'ACAUCCAGAGAAGCCAGCGGAGACGCAGGUGUGCCUUCGGGAGCUCUGAGACAGGUGCUGCA'
            'UGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUA'
            'UCCUUGUUUGCCAGCGAGUCAUGUCGGGAACUCCAGGGAGACUGCCGGUGAUAAACCGGAGG'
            'AAGGUGGGGACGACGUCAAGUCAUCAUGGCCCUUACGAGUAGGGCUACACACGUGCUACAAU'
            'GGCGCAUACAGAGGGCAGCAAGCUAGCGAUAGUGAGCGAAUCCCAAAAAGUGCGUCGUAGUC'
            'CGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUAGAUCAGAAU'
            'GCUACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGCUG'
            'CAAAAGAAGUGGGUAGUUUAACCUUUCGGGGAGGACGCUCACCACUUUGUGGUUCAUGACUG'
            'GGGUGAAGUCGUAACAAGGUAGCGCUAGGGGAACCUGGCGCUGGAUCACCUCCUUA',
            {'DATE': ['02-JUN-2014 (Rel. 121, Created)',
                      '04-FEB-2016 (Rel. 127, Last updated, Version 5)'],
             'DBSOURCE': 'SILVA-LSU; LK021130. SILVA-SSU; LK021130. MD5; '
                         'afd116bf2c1a13acbf40d63d82f0218c. BioSample; '
                         'SAMEA3865288.',
             'DEFINITION': 'Vibrio anguillarum 16S rRNA',
             'KEYWORDS': '.',
             'LOCUS': {'locus_name': 'LK021130.1:74067..75610:rRNA',
                       'class': 'STD',
                       'division': 'PRO',
                       'mol_type': 'genomic DNA',
                       'shape': 'linear',
                       'size': 1544,
                       'unit': 'bp',
                       'version': 1,
                       'date': '04-FEB-2016'},
             'PARENT_ACCESSION': 'LK021130.1',
             'VERSION': 'LK021130.1',
             'PROJECT_IDENTIFIER': 'Project:PRJEB5701;',
             'REFERENCE': [
                {'AUTHORS': 'Holm K.;',
                 'JOURNAL': 'Submitted (26-MAR-2014) to the INSDC. '
                            'Norstruct, Dept of Chemistry, University of '
                            'Tromso, Science Park 3, NO-9037 Tromso, NORWAY.',
                 'TITLE': ';',
                 'REFERENCE': '1'},
                {'AUTHORS': 'Holm K.O., Nilsson K., Hjerde E., Willassen '
                            'N.P., Milton D.L.;',
                 'JOURNAL': 'Stand Genomic Sci. 10:60-60(2015).',
                 'TITLE': '"Complete genome sequence of Vibrio anguillarum '
                          'strain NB10, a virulent isolate from the Gulf '
                          'of Bothnia";',
                 'REFERENCE': '2',
                 'PUBMED': '26380645'}],
             # one entry per reference above: None for the first reference,
             # a DOI/PUBMED string for the second
             'CROSS_REFERENCE': [
                None,
                'DOI; 10.1186/s40793-015-0060-7. PUBMED; 26380645.'],
             'SOURCE': {
                'ORGANISM': 'Vibrio anguillarum',
                'taxonomy': 'Bacteria; Proteobacteria; Gammaproteobacteria; '
                            'Vibrionales; Vibrionaceae; Vibrio.'}},
            None,
            RNA)

        # get the feature level file without FT
        # NOTE(review): this replaces the value assigned to
        # self.feature_level_fp earlier in this method
        self.feature_level_fp = get_data_path(
                "embl_feature_level_record_no_FT")

        # get a genbank file in order to do file conversion
        self.genbank_fp = get_data_path('genbank_single_record')

        # an embl constructed sequence file path
        self.embl_constructed_fp = get_data_path("embl_constructed")

        # a simple embl version to perform embl->gb->embl conversion
        self.single_rna_simple_fp = get_data_path(
                "embl_single_record_simple")
Esempio n. 48
0
    def setUp(self):
        # test locus line
        self.locus = (
            (['LOCUS       NC_005816   9609 bp   '
              'DNA   circular   CON   07-FEB-2015'],
             {'division': 'CON', 'mol_type': 'DNA', 'shape': 'circular',
              'locus_name': 'NC_005816', 'date': '07-FEB-2015',
              'unit': 'bp', 'size': 9609}),
            (['LOCUS       SCU49845   5028 bp   '
              'DNA      PLN   21-JUN-1999'],
             {'division': 'PLN', 'mol_type': 'DNA', 'shape': None,
             'locus_name': 'SCU49845', 'date': '21-JUN-1999',
              'unit': 'bp', 'size': 5028}),
            (['LOCUS       NP_001832   360 aa      '
              'linear   PRI   18-DEC-2001'],
             {'division': 'PRI', 'mol_type': None, 'shape': 'linear',
              'locus_name': 'NP_001832', 'date': '18-DEC-2001',
              'unit': 'aa', 'size': 360}))

        # test single record and read uppercase sequence
        self.single_upper_fp = get_data_path('genbank_single_record_upper')
        self.single_lower_fp = get_data_path('genbank_single_record_lower')
        self.single = (
            'GSREILDFK',
            {'LOCUS': {'date': '23-SEP-1994',
                       'division': 'BCT',
                       'locus_name': 'AAB29917',
                       'mol_type': None,
                       'shape': 'linear',
                       'size': 9,
                       'unit': 'aa'}},
            None,
            Protein)

        self.single_rna_fp = get_data_path('genbank_single_record')
        imd = IntervalMetadata(63)
        imd.add([(0, 63)],
                [(False, False)],
                {'db_xref': '"taxon:562"',
                 'mol_type': '"mRNA"',
                 'organism': '"Escherichia coli"',
                 'type': 'source',
                 'strand': '+',
                 '__location': '1..63'})
        imd.add([(0, 63)],
                [(False, True)],
                {'phase': 0,
                 'db_xref': ['"taxon:562"', '"taxon:561"'],
                 '__location': '1..>63',
                 'strand': '+',
                 'note': '"alkaline phosphatase signal peptide"',
                 'protein_id': '"AAA23431.1"',
                 'transl_table': '11',
                 'translation': '"MKQSTIALAVLPLLFTPVTKA"',
                 'type': 'CDS'})
        self.single_rna = (
            'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
            {'ACCESSION': 'M14399',
             'COMMENT': 'Original source text: E.coli, cDNA to mRNA.',
             'DEFINITION': "alkaline phosphatase signal mRNA, 5' end.",
             'KEYWORDS': 'alkaline phosphatase; signal peptide.',
             'LOCUS': {'date': '26-APR-1993',
                       'division': 'BCT',
                       'locus_name': 'ECOALKP',
                       'mol_type': 'mRNA',
                       'shape': 'linear',
                       'size': 63,
                       'unit': 'bp'},
             'SOURCE': {'ORGANISM': 'Escherichia coli',
                        'taxonomy': 'Bacteria; Proteobacteria; '
                        'Gammaproteobacteria; Enterobacteriales; '
                        'Enterobacteriaceae; Escherichia.'},
             'VERSION': 'M14399.1'},
            imd,
            RNA)

        # test:
        # 1. multiple records in one file
        # 2. lowercase sequence
        # 3. DNA, RNA, Protein type
        # 4. variation of formats
        self.multi_fp = get_data_path('genbank_multi_records')
        imd_pro = IntervalMetadata(9)
        imd_pro.add([(0, 9)], [(False, False)],
                    {'organism': '"Bacteria"',
                     'type': 'source',
                     'strand': '+',
                     '__location': '1..9'},)
        imd_pro.add([(0, 9)], [(False, True)],
                    {'__location': '1..>9',
                     'product': '"L-carnitine amidase"',
                     'strand': '+',
                     'type': 'Protein'})
        imd_dna = IntervalMetadata(9)
        imd_dna.add([(0, 9)], [(False, False)],
                    {'country': '"Brazil: Parana, Paranavai"',
                     'type': 'source',
                     'strand': '+',
                     '__location': '1..9',
                     'environmental_sample': ''})
        imd_dna.add([(1, 8)], [(True, True)],
                    {'__location': 'complement(<2..>8)',
                     'product': '"16S ribosomal RNA"',
                     'strand': '-',
                     'type': 'rRNA'})

        self.multi = (
            ('gsreildfk',
             {'ACCESSION': 'AAB29917',
              'COMMENT': 'Method: direct peptide sequencing.',
              'DBSOURCE': 'accession AAB29917.1',
              'DEFINITION': 'L-carnitine amidase {N-terminal}',
              'KEYWORDS': '.',
              'LOCUS': {'date': '23-SEP-1994',
                        'division': 'BCT',
                        'locus_name': 'AAB29917',
                        'mol_type': None,
                        'shape': 'linear',
                        'size': 9,
                        'unit': 'aa'},
              'REFERENCE': [{'AUTHORS': 'Joeres,U. and Kula,M.R.',
                             'JOURNAL': 'AMB 40 (5), 606-610 (1994)',
                             'PUBMED': '7764422',
                             'REFERENCE': '1  (residues 1 to 9)',
                             'REMARK': 'from the original journal article.',
                             'TITLE': 'a microbial L-carnitine amidase'},
                            {'AUTHORS': 'Joeres,U. and Kula,M.R.',
                             'JOURNAL': 'AMB 40 (5), 606-610 (1994)',
                             'PUBMED': '7764422',
                             'REFERENCE': '1  (residues 1 to 9)',
                             'TITLE': 'a microbial L-carnitine amidase'}],
              'SOURCE': {'ORGANISM': 'Bacteria',
                         'taxonomy': 'Unclassified.'},
              'VERSION': 'AAB29917.1  GI:545426'},
             imd_pro,
             Protein),

            ('catgcaggc',
             {'ACCESSION': 'HQ018078',
              'DEFINITION': 'Uncultured Xylanimonas sp.16S, partial',
              'KEYWORDS': 'ENV.',
              'LOCUS': {'date': '29-AUG-2010',
                        'division': 'ENV',
                        'locus_name': 'HQ018078',
                        'mol_type': 'DNA',
                        'shape': 'linear',
                        'size': 9,
                        'unit': 'bp'},
              'SOURCE': {'ORGANISM': 'uncultured Xylanimonas sp.',
                         'taxonomy': 'Bacteria; Actinobacteria; '
                         'Micrococcales; Promicromonosporaceae; '
                         'Xylanimonas; environmental samples.'},
              'VERSION': 'HQ018078.1  GI:304421728'},
             imd_dna,
             DNA))
Esempio n. 49
0
class IntervalMetadataMixinTests:
    """Reusable tests for objects that expose ``interval_metadata``.

    The tests call ``self._interval_metadata_constructor_`` and the
    unittest-style assertion helpers (including ``assertReallyEqual`` and
    ``assertReallyNotEqual``).  None of these are defined in this class;
    presumably the concrete test case mixing this class in provides them.
    """

    def _set_up(self):
        # Shared fixtures: an empty IntervalMetadata with upper bound 9 and
        # two keyword-argument sets that can be splatted into
        # ``IntervalMetadata.add``.
        # NOTE(review): this is ``_set_up`` rather than ``setUp`` --
        # presumably the concrete test case calls it from its own ``setUp``.
        self.upper_bound = 9
        self.im = IntervalMetadata(self.upper_bound)
        self.intvls = [
            {'bounds': [(0, 1), (2, 9)], 'metadata': {'gene': 'sagA'}},
            {'bounds': [(0, 1)], 'metadata': {'gene': ['a'],
                                              'product': 'foo'}}]

    def test_constructor_invalid(self):
        # A non-IntervalMetadata value (here an empty string) is rejected
        # with a TypeError.
        with self.assertRaisesRegex(TypeError,
                                    'You must provide `IntervalMetadata` '
                                    'object.'):
            self._interval_metadata_constructor_(0, '')

    def test_constructor_interval_metadata_len_mismatch(self):
        # The first constructor argument differs from the upper bound (9)
        # of ``self.im``; the error message must mention both numbers in
        # parentheses, upper bound first.
        for i in [0, 1, 3, 100]:
            # Raw string: ``\(`` is a regex escape, not a string escape.
            with self.assertRaisesRegex(
                    ValueError, r'\(%d\).*\(%d\)' % (self.upper_bound, i)):
                self._interval_metadata_constructor_(i, self.im)

    def test_constructor_interval_metadata_len(self):
        # A populated IntervalMetadata whose upper bound matches the first
        # constructor argument is accepted.
        for n in 1, 2, 3:
            im = IntervalMetadata(n)
            im.add([(0, 1)], metadata={'a': 'b'})
            obj = self._interval_metadata_constructor_(n, im)
            self.assertTrue(obj.has_interval_metadata())
            self.assertIsInstance(obj.interval_metadata, IntervalMetadata)

    def test_constructor_interval_metadata_len_0(self):
        # An empty IntervalMetadata of upper bound 0 is accepted, but the
        # object reports having no interval metadata.
        im = IntervalMetadata(0)
        obj = self._interval_metadata_constructor_(0, im)
        self.assertFalse(obj.has_interval_metadata())

    def test_constructor_no_interval_metadata(self):
        # Both ``None`` and an empty IntervalMetadata leave the object
        # reporting no interval metadata, while the property still returns
        # an IntervalMetadata instance.
        for i, im in [(0, None), (self.upper_bound, self.im)]:
            obj = self._interval_metadata_constructor_(i, im)
            self.assertFalse(obj.has_interval_metadata())
            self.assertIsInstance(obj.interval_metadata, IntervalMetadata)

    def test_constructor_handles_missing_interval_metadata_efficiently(self):
        # When no interval metadata is supplied (omitted or explicitly
        # None), the private attribute stays None.
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertIsNone(obj._interval_metadata)

        obj = self._interval_metadata_constructor_(
            self.upper_bound, interval_metadata=None)
        self.assertIsNone(obj._interval_metadata)

    def test_constructor_makes_shallow_copy_of_interval_metadata(self):
        intvl = self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        # Equal content, but a distinct object.
        self.assertEqual(obj.interval_metadata, self.im)
        self.assertIsNot(obj.interval_metadata, self.im)

        # Changing mutable value of metadata of the old interval
        # also changes obj.
        intvl.metadata['gene'].append('b')
        self.assertEqual(obj.interval_metadata, self.im)

        # Changing old interval doesn't change obj
        intvl.bounds = [(3, 6)]
        self.assertNotEqual(obj.interval_metadata, self.im)

    def test_eq_basic(self):
        # Two objects built from separately created but identically
        # populated IntervalMetadata compare equal.
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        im2 = IntervalMetadata(self.upper_bound)
        im2.add(**self.intvls[0])
        obj2 = self._interval_metadata_constructor_(self.upper_bound, im2)

        self.assertReallyEqual(obj1, obj2)

    def test_eq_populated_differently(self):
        # Passing a populated IntervalMetadata to the constructor and
        # adding the same interval afterwards via the property must give
        # equal objects.
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        obj2.interval_metadata.add(**self.intvls[0])

        self.assertReallyEqual(obj1, obj2)

    def test_eq_handles_missing_positional_metadata_efficiently(self):
        # Comparing two objects without interval metadata must not
        # populate the private attribute on either of them.
        obj1 = self._interval_metadata_constructor_(self.upper_bound)
        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        self.assertReallyEqual(obj1, obj2)

        self.assertIsNone(obj1._interval_metadata)
        self.assertIsNone(obj2._interval_metadata)

    def test_ne_diff_len(self):
        # Objects built with different first arguments (0 vs. 9) differ
        # even though neither has interval metadata.
        obj1 = self._interval_metadata_constructor_(0)
        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        self.assertReallyNotEqual(obj1, obj2)

    def test_ne_only_one_is_empty(self):
        # An object with an interval differs from one without any.
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        obj2 = self._interval_metadata_constructor_(self.upper_bound)

        self.assertReallyNotEqual(obj1, obj2)

    def test_ne(self):
        # Objects holding different intervals are not equal.
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        im2 = IntervalMetadata(self.upper_bound)
        im2.add(**self.intvls[1])
        obj2 = self._interval_metadata_constructor_(self.upper_bound, im2)

        self.assertReallyNotEqual(obj1, obj2)

    def test_copy_interval_metadata_empty(self):
        # Shallow-copying an object built with an empty IntervalMetadata:
        # the copy's private attribute is None, while the original's
        # private attribute still equals the empty fixture.
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj_copy._interval_metadata)
        self.assertEqual(obj._interval_metadata, self.im)

    def test_copy_interval_metadata_none(self):
        # Shallow-copying an object without interval metadata leaves the
        # private attribute None on both original and copy.
        obj = self._interval_metadata_constructor_(self.upper_bound)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj._interval_metadata)
        self.assertIsNone(obj_copy._interval_metadata)

    def test_copy_interval_metadata(self):
        # Shallow copy: the IntervalMetadata, its interval list, each
        # interval and each metadata dict are new objects, but the
        # metadata values themselves are shared.
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNot(obj.interval_metadata,
                         obj_copy.interval_metadata)
        self.assertIsNot(obj.interval_metadata._intervals,
                         obj_copy.interval_metadata._intervals)
        for i, j in zip(obj.interval_metadata._intervals,
                        obj_copy.interval_metadata._intervals):
            self.assertIsNot(i, j)
            self.assertIsNot(i.metadata, j.metadata)
            for k in i.metadata:
                self.assertIs(i.metadata[k], j.metadata[k])

    def test_deepcopy_interval_metadata(self):
        # Deep copy: as for the shallow copy, and additionally the mutable
        # 'gene' list is duplicated.  The 'product' string is expected to
        # be the same object in both copies.
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNot(obj.interval_metadata,
                         obj_copy.interval_metadata)
        self.assertIsNot(obj.interval_metadata._intervals,
                         obj_copy.interval_metadata._intervals)
        for i, j in zip(obj.interval_metadata._intervals,
                        obj_copy.interval_metadata._intervals):
            self.assertIsNot(i, j)
            self.assertIsNot(i.metadata, j.metadata)
            self.assertIsNot(i.metadata['gene'], j.metadata['gene'])
            self.assertIs(i.metadata['product'], j.metadata['product'])

    def test_deepcopy_interval_metadata_empty(self):
        # Deep-copying an object built with an empty IntervalMetadata:
        # the copy's private attribute is None, while the original's
        # private attribute still equals the empty fixture.
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj_copy._interval_metadata)
        self.assertEqual(obj._interval_metadata, self.im)

    def test_deepcopy_interval_metadata_none(self):
        # Deep-copying an object built with ``None`` leaves the private
        # attribute None on both original and copy.
        obj = self._interval_metadata_constructor_(self.upper_bound, None)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj._interval_metadata)
        self.assertIsNone(obj_copy._interval_metadata)

    def test_deepcopy_memo_is_respected(self):
        # Basic test to ensure deepcopy's memo is passed through to recursive
        # deepcopy calls.
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        memo = {}
        copy.deepcopy(obj, memo)
        self.assertGreater(len(memo), 1)

    def test_interval_metadata_getter(self):
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertIsInstance(obj.interval_metadata, IntervalMetadata)
        self.assertEqual(self.im, obj.interval_metadata)

        # Update existing metadata.
        obj.interval_metadata._intervals[0].metadata['gene'] = 'sagB'
        self.assertNotEqual(obj.interval_metadata, self.im)
        self.im._intervals[0].metadata['gene'] = 'sagB'
        self.assertEqual(obj.interval_metadata, self.im)

        # Add new interval feature.
        obj.interval_metadata.add(**self.intvls[1])
        self.im.add(**self.intvls[1])
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_getter_no_interval_metadata(self):
        # Reading the property on an object without interval metadata
        # returns an empty IntervalMetadata and populates the private
        # attribute as a side effect.
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertIsNone(obj._interval_metadata)
        self.assertIsInstance(obj.interval_metadata, IntervalMetadata)
        self.assertEqual(obj.interval_metadata, self.im)
        self.assertIsNotNone(obj._interval_metadata)

    def test_interval_metadata_setter(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)

        self.assertFalse(obj.has_interval_metadata())

        # Assigning an empty IntervalMetadata still counts as "none".
        obj.interval_metadata = self.im
        self.assertFalse(obj.has_interval_metadata())
        self.assertEqual(obj.interval_metadata, self.im)

        # Assigning a populated one flips ``has_interval_metadata``.
        self.im.add(**self.intvls[1])
        obj.interval_metadata = self.im
        self.assertTrue(obj.has_interval_metadata())
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_makes_copy(self):
        intvl = self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound)
        obj.interval_metadata = self.im

        # Equal content, but a distinct object.
        self.assertEqual(obj.interval_metadata, self.im)
        self.assertIsNot(obj.interval_metadata, self.im)

        # Changing mutable value of metadata of the old interval
        # also changes obj.
        intvl.metadata['gene'].append('b')
        self.assertEqual(obj.interval_metadata, self.im)

        # Changing old interval doesn't change obj
        intvl.bounds = [(3, 6)]
        self.assertNotEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_len_mismatch(self):
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        # Assigning an IntervalMetadata with a different upper bound must
        # fail; the message mentions the new bound first, then 9.
        for i in 0, 1, 3, 100:
            # Raw string: ``\(`` is a regex escape, not a string escape.
            with self.assertRaisesRegex(
                    ValueError, r'\(%d\).*\(%d\)' % (i, self.upper_bound)):
                obj.interval_metadata = IntervalMetadata(i)

        # The failed assignments left the existing value untouched.
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_invalid_type(self):
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        # Anything that is not an IntervalMetadata (including None) is
        # rejected by the setter.
        for i in [2, None, '', {}, []]:
            with self.assertRaisesRegex(
                    TypeError,
                    'You must provide `IntervalMetadata` object'):
                obj.interval_metadata = i

        # The failed assignments left the existing value untouched.
        self.assertEqual(self.im, obj.interval_metadata)

    def test_interval_metadata_deleter_empty(self):
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

        # Delete again. test idempotent
        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

    def test_interval_metadata_deleter(self):
        # Deleting populated interval metadata resets the private
        # attribute to None.
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

    def test_has_interval_metadata(self):
        # False when nothing was supplied ...
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertFalse(obj.has_interval_metadata())

        # ... false when an empty IntervalMetadata was supplied ...
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertFalse(obj.has_interval_metadata())

        # ... and true once at least one interval exists.
        self.im.add([(0, 1)])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertTrue(obj.has_interval_metadata())
Esempio n. 50
0
class TestIntervalMetadata(unittest.TestCase, ReallyEqualMixin):
    """Unit tests for ``IntervalMetadata`` (copying, querying, editing)."""

    def setUp(self):
        # Fixtures, all with upper bound 10:
        #   im_empty - no intervals
        #   im_1     - one interval  (im_1_1)
        #   im_2     - two intervals (im_2_1, im_2_2)
        # Constructing an ``Interval`` with ``interval_metadata=...`` is
        # what attaches it to that object; ``test_num_interval_features``
        # checks the resulting counts.
        self.upper_bound = 10
        self.im_empty = IntervalMetadata(self.upper_bound)
        self.im_1 = IntervalMetadata(self.upper_bound)
        self.im_1_1 = Interval(
            interval_metadata=self.im_1,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA', 'bound': 0})
        self.im_2 = IntervalMetadata(self.upper_bound)
        self.im_2_1 = Interval(
            interval_metadata=self.im_2,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA', 'bound': 0})
        self.im_2_2 = Interval(
            interval_metadata=self.im_2,
            bounds=[(3, 5)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_copy_empty(self):
        # A shallow copy of an empty object is equal but has its own
        # interval list and interval tree.
        obs = copy(self.im_empty)
        self.assertEqual(obs, self.im_empty)
        self.assertIsNot(obs._intervals, self.im_empty._intervals)
        self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree)

    def test_copy(self):
        obs = copy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        # Every interval and its containers are new objects, but the
        # metadata values are shared (shallow copy).
        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1._interval_metadata, i2._interval_metadata)
            self.assertIsNot(i1.metadata, i2.metadata)
            for k in i1.metadata:
                self.assertIs(i1.metadata[k], i2.metadata[k])

    def test_deepcopy(self):
        obs = deepcopy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1.metadata, i2.metadata)

        # ``i1``/``i2`` still refer to the last pair visited by the loop
        # (the interval carrying the mutable 'spam' list).  Mutating the
        # original must not leak into the deep copy.
        i2.metadata['spam'].append(1)
        self.assertEqual(i2.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0, 1]})
        self.assertEqual(i1.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_deepcopy_memo_is_respected(self):
        # The memo dict handed to deepcopy must be filled in, i.e. it is
        # passed down rather than ignored.
        memo = {}
        deepcopy(self.im_1, memo)
        self.assertGreater(len(memo), 2)

    def test_init(self):
        # A fresh object has no intervals and its tree is not stale.
        self.assertFalse(self.im_empty._is_stale_tree)
        self.assertEqual(self.im_empty._intervals, [])

    def test_init_upper_bound_lt_lower_bound(self):
        # test that no exception is raised
        IntervalMetadata(0)

        # A negative upper bound is rejected.
        with self.assertRaises(ValueError):
            IntervalMetadata(-1)

    def test_num_interval_features(self):
        self.assertEqual(self.im_empty.num_interval_features, 0)
        self.assertEqual(self.im_1.num_interval_features, 1)
        self.assertEqual(self.im_2.num_interval_features, 2)

    def test_duplicate(self):
        '''Test query and drop methods on duplicate Intervals.'''
        intvl_1 = self.im_empty.add([(1, 2)])
        intvl_2 = self.im_empty.add([(1, 2)])
        self.assertEqual(len(list(self.im_empty.query([(1, 2)]))), 2)
        # Dropping one duplicate must leave exactly the other object.
        self.im_empty.drop([intvl_1])
        self.assertEqual(len(self.im_empty._intervals), 1)
        self.assertIs(self.im_empty._intervals[0], intvl_2)

    def test_duplicate_bounds(self):
        # One interval with a repeated bound is returned only once.
        intvl = self.im_empty.add([(1, 2), (1, 2)])
        intvls = list(self.im_empty.query([(1, 2)]))
        self.assertEqual(len(intvls), 1)
        self.assertIs(intvl, intvls[0])

    def test_concat_empty(self):
        # Concatenating i empty objects gives an empty object whose upper
        # bound is i times the individual upper bound.
        for i in 0, 1, 2:
            obs = IntervalMetadata.concat([self.im_empty] * i)
            exp = IntervalMetadata(self.upper_bound * i)
            self.assertEqual(obs, exp)

        obs = IntervalMetadata.concat([])
        self.assertEqual(obs, IntervalMetadata(0))

    def test_concat(self):
        im1 = IntervalMetadata(3)
        im2 = IntervalMetadata(4)
        im3 = IntervalMetadata(5)
        im1.add([(0, 2)], [(True, True)])
        im2.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
        im2.add([(2, 4)], metadata={'gene': 'sagB'})
        im3.add([(1, 5)], [(False, True)], {'gene': 'sagC'})
        obs = IntervalMetadata.concat([im1, im2, im3])

        # Expected bounds are shifted by the combined upper bounds of the
        # preceding objects (0, then 3, then 3 + 4 = 7).
        exp = IntervalMetadata(12)
        exp.add(bounds=[(0, 2)], fuzzy=[(True, True)])
        exp.add(bounds=[(3, 6)], fuzzy=[(True, False)],
                metadata={'gene': 'sagA'})
        exp.add(bounds=[(5, 7)], metadata={'gene': 'sagB'})
        exp.add(bounds=[(8, 12)], fuzzy=[(False, True)],
                metadata={'gene': 'sagC'})
        self.assertEqual(obs, exp)

    def test_sort(self):
        # Constructing this Interval attaches a third interval to im_2.
        interval = Interval(
            self.im_2,
            [(1, 2), (3, 8)],
            metadata={'gene': 'sagA', 'bound': 0})
        im = deepcopy(self.im_2)
        # NOTE(review): the positional ``False`` is presumably the
        # ``ascending`` flag -- confirm against ``IntervalMetadata.sort``.
        self.im_2.sort(False)
        # check sorting does not have other side effects
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [self.im_2_2, self.im_2_1, interval])

        # The default call yields the reverse order.
        self.im_2.sort()
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [interval, self.im_2_1, self.im_2_2])

        # Sorting an empty object is a no-op.
        self.im_empty.sort()
        self.assertEqual(self.im_empty, IntervalMetadata(self.upper_bound))

    def test_add_eq_upper_bound(self):
        # An interval ending exactly at the upper bound is accepted and
        # marks the interval tree as stale.
        self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound)],
                          metadata={'gene': 'sagA', 'bound': 0})
        self.assertTrue(self.im_empty._is_stale_tree)
        interval = self.im_empty._intervals[0]
        self.assertEqual(interval.bounds, [(1, 2), (4, self.upper_bound)])
        self.assertEqual(interval.metadata, {'gene': 'sagA', 'bound': 0})
        self.assertIsInstance(self.im_empty._interval_tree, IntervalTree)

    def test_add_gt_upper_bound(self):
        # An interval ending past the upper bound is rejected.
        with self.assertRaises(ValueError):
            self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound+1)],
                              metadata={'gene': 'sagA', 'bound': 0})

    def test_add_eq_start_end_bound(self):
        for i in 0, 1, self.upper_bound:
            # test that no exception is raised
            self.im_empty.add(bounds=[(i, i)],
                              metadata={'gene': 'sagA', 'bound': 0})

    def test_query_attribute(self):
        # An empty metadata filter yields intervals in stored order.
        intervals = self.im_2._query_attribute({})
        for i, j in zip(intervals, self.im_2._intervals):
            self.assertEqual(i, j)

        # ``None`` as filter yields nothing.
        intervals = list(self.im_2._query_attribute(None))
        self.assertEqual(len(intervals), 0)

        # Filtering by an interval's own full metadata finds exactly it.
        for i in self.im_2._intervals:
            intervals = list(self.im_2._query_attribute(i.metadata))
            self.assertEqual(len(intervals), 1)
            self.assertEqual(intervals[0], i)

    def test_query_interval(self):
        intervals = list(self.im_2._query_interval((1, 2)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

        intervals = list(self.im_2._query_interval((3, 4)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        # A range touching both intervals returns both; compare via repr
        # in sets so the result order does not matter.
        intervals = {repr(i) for i in self.im_2._query_interval((1, 7))}
        self.assertEqual(len(intervals), 2)
        self.assertSetEqual(intervals,
                            {repr(i) for i in self.im_2._intervals})

    def test_query_interval_upper_bound(self):
        # Querying the last position finds the interval ending at the
        # upper bound.
        intervals = list(self.im_2._query_interval((self.upper_bound-1,
                                                    self.upper_bound)))
        self.assertEqual(intervals, [self.im_2_1])

    def test_query(self):
        # Bounds and metadata filters are combined: both intervals
        # overlap (1, 5) but only one has gene 'sagA'.
        intervals = list(self.im_2.query(bounds=[(1, 5)],
                                         metadata={'gene': 'sagA'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

    def test_query_empty(self):
        # Querying without any criteria yields nothing, even though im_1
        # holds an interval.
        intervals = list(self.im_1.query())
        self.assertEqual(len(intervals), 0)

    def test_query_no_hits(self):
        intervals = list(self.im_2.query(bounds=[(self.upper_bound, 200)]))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(bounds=[(1, 2)],
                                         metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

    def test_query_interval_only(self):
        # Both a single wide range and two separate ranges hit both
        # intervals, each returned once.
        for loc in [[(1, 7)],
                    [(1, 2), (3, 4)]]:
            intervals = list(self.im_2.query(bounds=loc))
            self.assertEqual(len(intervals), 2)
            self.assertEqual(intervals[0], self.im_2_1)
            self.assertEqual(intervals[1], self.im_2_2)

    def test_query_metadata_only(self):
        intervals = list(self.im_2.query(metadata={'gene': 'sagB'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        intervals = list(self.im_2.query(metadata={'bound': 0}))
        self.assertEqual(len(intervals), 2)
        self.assertEqual(intervals[0], self.im_2_1)
        self.assertEqual(intervals[1], self.im_2_2)

    def test_drop(self):
        intvl = self.im_2._intervals[0]
        self.im_2.drop([intvl])
        self.assertEqual(len(self.im_2._intervals), 1)
        self.assertEqual(self.im_2._intervals[0], self.im_2_2)
        # test the intvl was set to dropped
        self.assertTrue(intvl.dropped)

    def test_drop_all(self):
        # Dropping every interval makes the object equal to an empty one.
        self.im_2.drop(self.im_2._intervals)
        self.assertEqual(self.im_2, self.im_empty)

    def test_reverse(self):
        self.im_2._reverse()
        # Build the expected result on im_empty: each bound (start, end)
        # of im_2 mirrored to (upper_bound - end, upper_bound - start).
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(0, 6), (8, 9)],
            metadata={'gene': 'sagA', 'bound': 0})
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(5, 7)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})
        self.assertEqual(self.im_2, self.im_empty)

    def test_eq_ne(self):
        im1 = IntervalMetadata(10)
        im1.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])
        im1.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        # The ordering shouldn't matter
        im2 = IntervalMetadata(10)
        im2.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])
        im2.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])

        # Same bounds as im1, but one metadata value differs.
        im3 = IntervalMetadata(10)
        im3.add(metadata={'gene': 'sagA', 'bound': '3'},
                bounds=[(0, 2), (4, 7)])
        im3.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        self.assertReallyEqual(im1, im2)
        self.assertReallyNotEqual(im1, im3)

    def test_ne_diff_bounds(self):
        # Identical intervals, different upper bounds (10 vs. 9).
        im1 = IntervalMetadata(10)
        im2 = IntervalMetadata(9)
        intvl = {'bounds': [(0, 1)], 'metadata': {'spam': 'foo'}}
        im1.add(**intvl)
        im2.add(**intvl)
        self.assertReallyNotEqual(im1, im2)

    def test_repr(self):
        def interval_pattern(start, end, gene):
            # Regex matching the repr line of one non-fuzzy interval.
            # Raw strings keep ``\(`` / ``\[`` as regex escapes without
            # triggering Python's invalid-escape-sequence warning.
            return (r"Interval\(interval_metadata=<[0-9]+>, "
                    r"bounds=\[\(%d, %d\)\], "
                    r"fuzzy=\[\(False, False\)\], "
                    r"metadata={'gene': '%s'}\)" % (start, end, gene))

        exp = '''0 interval features
-------------------'''
        self.assertEqual(repr(self.im_empty), exp)

        self.im_empty.add([(1, 2)], metadata={'gene': 'sagA'})

        exp = '\n'.join([
            '1 interval feature',
            '------------------',
            interval_pattern(1, 2, 'sagA')])
        self.assertRegex(repr(self.im_empty), exp)

        self.im_empty.add([(3, 4)], metadata={'gene': 'sagB'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagC'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagD'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagE'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagF'})
        # With six intervals the expected pattern lists only the first
        # two and the last two, separated by a '...' line.
        exp = '\n'.join([
            '6 interval features',
            '-------------------',
            interval_pattern(1, 2, 'sagA'),
            interval_pattern(3, 4, 'sagB'),
            '...',
            interval_pattern(3, 4, 'sagE'),
            interval_pattern(3, 4, 'sagF')])
        self.assertRegex(repr(self.im_empty), exp)
Esempio n. 51
0
    def test_concat(self):
        # Concatenating objects with upper bounds 3, 4 and 5 must give one
        # object with upper bound 12 in which every interval is shifted by
        # the combined upper bounds of the objects preceding its source.
        parts = [IntervalMetadata(length) for length in (3, 4, 5)]
        head, middle, tail = parts
        head.add([(0, 2)], [(True, True)])
        middle.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
        middle.add([(2, 4)], metadata={'gene': 'sagB'})
        tail.add([(1, 5)], [(False, True)], {'gene': 'sagC'})
        observed = IntervalMetadata.concat(parts)

        # Keyword arguments for each interval expected after the shift.
        expected_features = [
            {'bounds': [(0, 2)], 'fuzzy': [(True, True)]},
            {'bounds': [(3, 6)], 'fuzzy': [(True, False)],
             'metadata': {'gene': 'sagA'}},
            {'bounds': [(5, 7)], 'metadata': {'gene': 'sagB'}},
            {'bounds': [(8, 12)], 'fuzzy': [(False, True)],
             'metadata': {'gene': 'sagC'}},
        ]
        expected = IntervalMetadata(12)
        for feature in expected_features:
            expected.add(**feature)

        self.assertEqual(observed, expected)
Esempio n. 52
0
    def test_concat(self):
        # Three inputs (upper bounds 3, 4, 5) are joined into one object of
        # upper bound 12; bounds from the second and third inputs are
        # expected to be offset by 3 and 7 respectively.
        first = IntervalMetadata(3)
        second = IntervalMetadata(4)
        third = IntervalMetadata(5)

        first.add([(0, 2)], [(True, True)])
        second.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
        second.add([(2, 4)], metadata={'gene': 'sagB'})
        third.add([(1, 5)], [(False, True)], {'gene': 'sagC'})

        joined = IntervalMetadata.concat([first, second, third])

        wanted = IntervalMetadata(12)
        for add_kwargs in (
                dict(bounds=[(0, 2)], fuzzy=[(True, True)]),
                dict(bounds=[(3, 6)], fuzzy=[(True, False)],
                     metadata={'gene': 'sagA'}),
                dict(bounds=[(5, 7)], metadata={'gene': 'sagB'}),
                dict(bounds=[(8, 12)], fuzzy=[(False, True)],
                     metadata={'gene': 'sagC'})):
            wanted.add(**add_kwargs)

        self.assertEqual(joined, wanted)
Esempio n. 53
0
    def test_eq_ne(self):
        # Equality must ignore the order in which intervals were added but
        # must notice a differing metadata value.
        def build(features):
            # Create an IntervalMetadata of upper bound 10 and add one
            # interval per (gene, bound, bounds) triple, in the given order.
            result = IntervalMetadata(10)
            for gene, bound, bounds in features:
                result.add(metadata={'gene': gene, 'bound': bound},
                           bounds=bounds)
            return result

        im1 = build([('sagA', '0', [(0, 2), (4, 7)]),
                     ('sagB', '3', [(3, 5)])])

        # The ordering shouldn't matter
        im2 = build([('sagB', '3', [(3, 5)]),
                     ('sagA', '0', [(0, 2), (4, 7)])])

        # Same bounds as im1, but sagA carries 'bound': '3' instead of '0'.
        im3 = build([('sagA', '3', [(0, 2), (4, 7)]),
                     ('sagB', '3', [(3, 5)])])

        self.assertReallyEqual(im1, im2)
        self.assertReallyNotEqual(im1, im3)
Esempio n. 54
0
class GFF3IOTests(TestCase):
    """Shared fixtures for GFF3 I/O test cases."""

    def setUp(self):
        """Build the data paths and in-memory objects used as fixtures.

        Sets ``multi_fp``, ``single_fp`` and ``seq_fp`` (data file paths),
        ``upper_bound``, ``imd1``/``imd2``/``imd3`` (``IntervalMetadata``
        objects), ``seq`` (a ``Sequence``) and ``dna`` (a ``DNA`` built from
        ``seq``).
        """
        # NOTE(review): presumably these resolve to GFF3 files shipped with
        # the test data directory -- confirm against get_data_path.
        self.multi_fp = get_data_path('gff3_multi_record')
        self.single_fp = get_data_path('gff3_single_record')

        # Keyword arguments for IntervalMetadata.add, indexed as follows:
        #   0-1 -> imd1, 2-3 -> imd2, 4 -> imd3.
        # Entries 2 and 4 share the ID '1_1'; entry 4 is the only one with
        # more than one bounds pair.
        intvls = [{
            'bounds': [(0, 4641652)],
            'metadata': {
                'source': 'European Nucleotide Archive',
                'type': 'chromosome',
                'score': '.',
                'strand': '.',
                'ID': 'chromosome:Chromosome',
                'Alias': 'U00096.3',
                'Is_circular': 'true'
            }
        }, {
            'bounds': [(147, 148)],
            'metadata': {
                'source': 'regulondb_feature',
                'type': 'biological_region',
                'score': '.',
                'strand': '+',
                'external_name': 'Promoter thrLp (RegulonDB:ECK120010236)',
                'logic_name': 'regulondb_promoter'
            }
        }, {
            'bounds': [(336, 2799)],
            'metadata': {
                'source': 'Prodigal_v2.60',
                'type': 'gene',
                'score': '1.8',
                'strand': '+',
                'phase': 0,
                'ID': '1_1',
                'gc_cont': '0.427'
            }
        }, {
            'bounds': [(336, 2799)],
            'metadata': {
                'source': 'Prodigal_v2.60',
                'type': 'CDS',
                'score': '333.8',
                'strand': '+',
                'phase': 0,
                'ID': '1_2',
                'Parent': '1_1',
                'rbs_motif': 'GGAG/GAGG',
                'rbs_spacer': '5-10bp'
            }
        }, {
            'bounds': [(0, 50), (55, 100)],
            'metadata': {
                'source': 'Prodigal_v2.60',
                'type': 'gene',
                'score': '1.8',
                'strand': '+',
                'phase': 0,
                'ID': '1_1',
                'gene': 'FXR receptor'
            }
        }]

        # Same value as the end coordinate of the chromosome interval above.
        self.upper_bound = 4641652
        self.imd1 = IntervalMetadata(self.upper_bound)
        self.imd1.add(**intvls[0])
        self.imd1.add(**intvls[1])

        # imd2 and imd3 are created with None as the upper bound.
        self.imd2 = IntervalMetadata(None)
        self.imd2.add(**intvls[2])
        self.imd2.add(**intvls[3])

        self.imd3 = IntervalMetadata(None)
        self.imd3.add(**intvls[4])

        self.seq_fp = get_data_path('gff3_dna')
        # 12-character sequence annotated with a single gene interval (0, 9).
        self.seq = Sequence('ATGCATGCATGC',
                            metadata={
                                'id': 'NC_1',
                                'description': 'species X'
                            })
        self.seq.interval_metadata.add(
            [(0, 9)],
            metadata={
                'source': 'Prodigal_v2.60',
                'type': 'gene',
                'score': '.',
                'strand': '+',
                'phase': 0,
                'ID': 'gene1',
                'Name': 'FXR'
            })
        # NOTE(review): presumably DNA(...) carries over the metadata and
        # interval metadata of self.seq -- confirm against the DNA constructor.
        self.dna = DNA(self.seq)