def joined_rec1NNNrec2():
    """
    Builds the 'Rec1_NNN_Rec2' fixture record.

    The sequence is obtained from ``dna_seq('rec1NNNrec2')`` and the record is
    annotated with four forward-strand features: A1, B1, A2 and B2.

    :return: a SeqRecordEM2 with id 'Rec1_NNN_Rec2' and name 'R1R2'
    """
    sequence = dna_seq('rec1NNNrec2')
    # (start, end, id) of each feature, in the order they are attached.
    layout = (
        (0, 5, 'A1'),
        (28, 33, 'B1'),
        (38, 43, 'A2'),
        (66, 71, 'B2'),
    )
    annotated = [
        SeqFeatureEM2(location=FeatureLocation(begin, end), strand=1, id=label)
        for begin, end, label in layout
    ]
    return SeqRecordEM2(sequence,
                        id='Rec1_NNN_Rec2',
                        name='R1R2',
                        features=annotated)
def orfs_to_features(self, start=['ATG'], stop=None, filter=None, add=False):
    """
    Collects the open reading frames of this record as 'ORF' features.

    ORF detection is delegated to ``get_orfs``, which is called once on
    ``self.seq`` (results tagged strand 1) and once on the sequence of
    ``self.reverse_complement()`` (results tagged strand -1). The first two
    items of every ORF returned are used as the start and end of the feature
    location. Coordinates of strand -1 ORFs are used exactly as returned for
    the reverse-complemented sequence; no conversion is applied here.

    NOTE(review): properties of the ORFs themselves (e.g. lengths being a
    multiple of 3, one ORF per frame when no stop codon is present) depend on
    ``get_orfs`` and cannot be confirmed from this method alone.

    :param start: a list of accepted start codons, forwarded to ``get_orfs``
    :param stop: a list of accepted stop codons, forwarded to ``get_orfs``
    :param filter: an object whose ``apply`` method receives the list of ORF
        features and returns the selection to keep; skipped when None
    :param add: if True, the selected ORFs are appended to the record's features
    :return: the list of selected ORFs as SeqFeatureEM2 objects
    """
    # NOTE: the default ``start`` list is shared between calls; it is only
    # forwarded, never modified, by this method.

    def wrap(orfs, strand):
        # Turn raw ORF coordinates into 'ORF' features on the given strand.
        return [
            SeqFeatureEM2(location=FeatureLocation(orf[0], orf[1]),
                          type='ORF',
                          strand=strand) for orf in orfs
        ]

    plus_strand = wrap(self.seq.get_orfs(start=start, stop=stop), 1)
    minus_strand = wrap(
        self.reverse_complement().seq.get_orfs(start=start, stop=stop), -1)
    selected = plus_strand + minus_strand

    if filter is not None:
        selected = filter.apply(selected)
    if add:
        self.features.extend(selected)
    return selected
def dna_stitch_rec1_overlap_rec2():
    """
    Builds the expected record for stitching Rec1 and Rec2 with an overlap.

    Starts from ``joined_rec1_overlap_rec2()`` and adds three forward-strand
    features: 'Rec1' (0-35), 'Rec2' (25-60) and 'stitcher' (22-45).

    :return: the joined record with the three extra features attached
    """
    record = joined_rec1_overlap_rec2()
    # (start, end, id) of the features added on top of the joined record.
    additions = (
        (0, 35, 'Rec1'),
        (25, 60, 'Rec2'),
        (22, 45, 'stitcher'),
    )
    extra = [
        SeqFeatureEM2(location=FeatureLocation(begin, end), strand=1, id=label)
        for begin, end, label in additions
    ]
    record.features += extra
    return record
def dna_stitch_rec1_NNN_rec2rev():
    """
    Builds the expected record for stitching Rec1 to a reversed Rec2.

    Starts from ``joined_rec1NNNrec2()`` and adds three features: 'Rec1'
    (0-35, strand 1), 'Rec2' (38-73, strand -1) and 'stitcher' (30-48,
    strand 1).

    :return: the joined record with the three extra features attached
    """
    record = joined_rec1NNNrec2()
    # (start, end, strand, id) of the features added on top of the joined record.
    additions = (
        (0, 35, 1, 'Rec1'),
        (38, 73, -1, 'Rec2'),
        (30, 48, 1, 'stitcher'),
    )
    extra = [
        SeqFeatureEM2(location=FeatureLocation(begin, end),
                      strand=strand,
                      id=label) for begin, end, strand, label in additions
    ]
    record.features += extra
    return record
def joined_rec1_overlap_rec2():
    """
    Builds the 'Rec1_overlap_Rec2' fixture record.

    The sequence is obtained from ``dna_seq('rec1_overlap_rec2/rec3')`` and the
    record is annotated with four forward-strand features: A1, B1, A2 and B2.

    :return: a SeqRecordEM2 with id 'Rec1_overlap_Rec2' and name 'R1R2'
    """
    sequence = dna_seq('rec1_overlap_rec2/rec3')
    # (start, end, id) of each feature, in the order they are attached.
    layout = (
        (0, 5, 'A1'),
        (28, 33, 'B1'),
        (25, 30, 'A2'),
        (53, 58, 'B2'),
    )
    annotated = [
        SeqFeatureEM2(location=FeatureLocation(begin, end), strand=1, id=label)
        for begin, end, label in layout
    ]
    return SeqRecordEM2(sequence,
                        id='Rec1_overlap_Rec2',
                        name='R1R2',
                        features=annotated)
def add_feature(self, **kwargs):
    r"""
    Adds a feature to the current record according to arguments passed as \*\*kwargs.

    The feature is created with this record as its ``parent`` and appended to
    ``self.features``.

    :param kwargs: keyword arguments to pass to SeqFeatureEM2 class
    """
    # The docstring is a raw string so that the reST-escaped asterisks do not
    # form invalid escape sequences in the Python source.
    self.features.append(SeqFeatureEM2(parent=self, **kwargs))
def dna_rec():
    """
    Builds the 'Rec' fixture record.

    The sequence is obtained from ``dna_seq('rec')`` and the record carries
    seven features (A, B, C, D, F, G, H) spread over strands 1, -1 and 0;
    features B and G additionally have an explicit type ('yyy' and 'xxx').

    :return: a SeqRecordEM2 with id 'Rec' and name 'Rec'
    """
    sequence = dna_seq('rec')
    # (start, end, strand, id, extra keyword arguments) for each feature.
    layout = (
        (0, 2, 1, 'A', {}),
        (6, 10, 1, 'B', {'type': 'yyy'}),
        (15, 20, 1, 'C', {}),
        (20, 30, 1, 'D', {}),
        (6, 10, -1, 'F', {}),
        (18, 25, -1, 'G', {'type': 'xxx'}),
        (16, 19, 0, 'H', {}),
    )
    annotated = [
        SeqFeatureEM2(location=FeatureLocation(begin, end),
                      strand=strand,
                      id=label,
                      **extra) for begin, end, strand, label, extra in layout
    ]
    return SeqRecordEM2(sequence, id='Rec', name='Rec', features=annotated)
class SeqFeatureTests(unittest.TestCase):
    """
    Exercises the positional helpers of SeqFeatureEM2 (lies_within, overlaps,
    covers, intersect, move) on a dummy protein record shared by all tests.
    """

    sprot: SeqRecord = SeqRecord(
        SeqEM2.protein('MYNAMEISFREDHEREIAMWHEREARETHEYALLTHISISEXCELLENT'),
        id='X',
        name='DummyProt')

    # Features d5, d6 and d8 use fuzzy (Before/After) positions; d5 has no type.
    # The trailing comments are the original author's residue annotations.
    # NOTE(review): most of them read as end-inclusive spans, whereas Biopython
    # FeatureLocation ends are exclusive - confirm before relying on them.
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 11),
                      type='domain',
                      id='d1'),  # MYNAMEISFRED
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d2'),  # FREDHEREIAM
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 30),
                      type='domain',
                      id='d3'),  # WHEREARETHEY
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(6, 23),
                      type='domain',
                      id='d4'),  # ISFREDHEREIAMWHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(34, AfterPosition(39)),
                      id='d5'),  # THISIS
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(2), 5),
                      type='domain',
                      id='d6'),  # MYNAME
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(19, 23),
                      type='domain',
                      id='d7'),  # WHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(30), 37),
                      type='domain',
                      id='d8')  # YALLTHI
    ]

    @classmethod
    def test_parent(cls):
        # Feature order is preserved and each feature points back to the record.
        record = cls.sprot
        expected_ids = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
        assert [feat.id for feat in record.features] == expected_ids
        assert record.features[1].parent.id == record.id
        assert record.features[1].parent.name == record.name
        assert record.features[1].parent.seq._data == record.seq._data

    @classmethod
    def test_lies_within(cls):
        # d2 spans 8-18.
        d2 = cls.sprot.features[1]
        assert d2.lies_within(5, 25)
        assert not d2.lies_within(10, 25)
        assert not d2.lies_within(19, 25)

    @classmethod
    def test_lies_within_fuzzy(cls):
        # Fuzzy-ended features (d5, d6) are expected to trigger a UserWarning.
        fuzzy_end, fuzzy_start = cls.sprot.features[4], cls.sprot.features[5]
        with pytest.warns(UserWarning):
            fuzzy_end.lies_within(30, 42)
            fuzzy_start.lies_within(0, 10)

    @classmethod
    def test_overlaps(cls):
        # d3 spans 19-30; overlaps accepts either a range or a single position.
        d3 = cls.sprot.features[2]
        assert d3.overlaps(20, 25)
        assert d3.overlaps(20, 40)
        assert d3.overlaps(20)
        assert not d3.overlaps(35)
        assert not d3.overlaps(2, 5)

    @classmethod
    def test_overlaps_fuzzy(cls):
        # Fuzzy-ended features (d5, d6) are expected to trigger a UserWarning.
        fuzzy_end, fuzzy_start = cls.sprot.features[4], cls.sprot.features[5]
        with pytest.warns(UserWarning):
            fuzzy_end.overlaps(35)
            fuzzy_start.overlaps(3)

    @classmethod
    def test_covers(cls):
        # d4 spans 6-23.
        d4 = cls.sprot.features[3]
        assert d4.covers(15, 20)
        assert not d4.covers(4, 20)

    @classmethod
    def test_covers_fuzzy(cls):
        # Fuzzy-ended features (d5, d6) are expected to trigger a UserWarning.
        fuzzy_end, fuzzy_start = cls.sprot.features[4], cls.sprot.features[5]
        with pytest.warns(UserWarning):
            fuzzy_end.covers(35, 38)
            fuzzy_start.covers(3, 4)

    @classmethod
    def test_intersect(cls):
        # The intersection of two features is checked through its location.
        feats = cls.sprot.features
        d5_and_d8 = feats[4].intersect(feats[7])
        assert d5_and_d8.location == FeatureLocation(34, 37)
        d3_and_d4 = feats[2].intersect(feats[3])
        assert d3_and_d4.location == feats[6].location
        d2_and_d4 = feats[1].intersect(feats[3])
        assert d2_and_d4.location == FeatureLocation(8, 18)

    @classmethod
    def test_intersect_errors(cls):
        # The other feature is built without a parent; intersect is expected
        # to raise a ValueError whose message matches 'Undetermined .*'.
        with pytest.raises(ValueError, match=r'Undetermined .*'):
            orphan = SeqFeatureEM2(location=FeatureLocation(30, 37))
            cls.sprot.features[0].intersect(orphan)

    @classmethod
    def test_intersect_fuzzy(cls):
        # Intersecting from a fuzzy-ended feature is expected to warn.
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[5].intersect(feats[0])

    @classmethod
    def test_move(cls):
        # Moving d1 (0-11) by 5 yields a feature located at 5-16.
        shifted = cls.sprot.features[0].move(5)
        assert shifted.location == FeatureLocation(5, 16)
def test_intersect_errors(cls):
    # The other feature is built without a parent; intersect is expected to
    # raise a ValueError whose message matches 'Undetermined .*'.
    with pytest.raises(ValueError, match=r'Undetermined .*'):
        orphan = SeqFeatureEM2(location=FeatureLocation(30, 37))
        cls.sprot.features[0].intersect(orphan)
def dna_rec2_rev():
    """
    Builds the 'Rec2' fixture record and returns its reverse complement.

    The sequence is obtained from ``dna_seq('rec2')``; the forward record is
    annotated with two forward-strand features, A2 and B2, before
    ``reverse_complement()`` is called on it.

    :return: the result of ``reverse_complement()`` on the 'Rec2' record
    """
    sequence = dna_seq('rec2')
    # (start, end, id) of each feature on the forward record.
    layout = ((0, 5, 'A2'), (28, 33, 'B2'))
    annotated = [
        SeqFeatureEM2(location=FeatureLocation(begin, end), strand=1, id=label)
        for begin, end, label in layout
    ]
    forward = SeqRecordEM2(sequence, id='Rec2', name='R2', features=annotated)
    return forward.reverse_complement()
def dna_rec1():
    """
    Builds the 'Rec1' fixture record.

    The sequence is obtained from ``dna_seq('rec1')`` and the record is
    annotated with two forward-strand features, A1 and B1.

    :return: a SeqRecordEM2 with id 'Rec1' and name 'R1'
    """
    sequence = dna_seq('rec1')
    # (start, end, id) of each feature, in the order they are attached.
    layout = ((0, 5, 'A1'), (28, 33, 'B1'))
    annotated = [
        SeqFeatureEM2(location=FeatureLocation(begin, end), strand=1, id=label)
        for begin, end, label in layout
    ]
    return SeqRecordEM2(sequence, id='Rec1', name='R1', features=annotated)
class GFFtests(unittest.TestCase):
    """
    Checks the conversions between SeqFeatureEM2 objects and the DataFrame
    held by GFF, using a dummy DNA record with three features (one per
    strand value 1, 0 and -1).
    """

    sprot: SeqRecord = SeqRecord(
        SeqEM2.dna('ATGAGTCGGTAACGATGCATGCATGCAGCTGACGC'),
        id='X',
        name='DummyDNA')
    sprot.features = [
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(0, 2),
                      type='start',
                      strand=1,
                      qualifiers={
                          'codon': 'start',
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(8, 18),
                      type='domain',
                      id='d1',
                      strand=0,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      }),
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(16, 30),
                      type='domain',
                      id='d2',
                      strand=-1,
                      qualifiers={
                          'source': '',
                          'phase': '0',
                          'score': '0'
                      })
    ]

    # Expected frame for the first feature alone.
    df0 = DataFrame({
        'seq_id': ['X'],
        'source': [''],
        'type': ['start'],
        'start': ['0'],
        'end': ['2'],
        'score': ['0'],
        'strand': ['+'],
        'phase': ['0'],
        'attributes': ['codon=start;id=<unknown id>']
    })
    # Expected frame for the three features together.
    df1 = DataFrame({
        'seq_id': ['X', 'X', 'X'],
        'source': ['', '', ''],
        'type': ['start', 'domain', 'domain'],
        'start': ['0', '8', '16'],
        'end': ['2', '18', '30'],
        'score': ['0', '0', '0'],
        'strand': ['+', '?', '-'],
        'phase': ['0', '0', '0'],
        'attributes': ['codon=start;id=<unknown id>', 'id=d1', 'id=d2']
    })

    # Column layout expected from a GFF frame, including an empty one.
    GFF_COLUMNS = [
        'seq_id', 'source', 'type', 'start', 'end', 'score', 'strand',
        'phase', 'attributes'
    ]

    @classmethod
    def _empty_frame(cls):
        # Fresh empty frame with the GFF columns, as expected for "no feature".
        return DataFrame(columns=cls.GFF_COLUMNS).reset_index(drop=True)

    @classmethod
    def test_df_from_feature(cls):
        # A single feature gives df0; None gives an empty frame.
        assert_frame_equal(
            GFF.df_from_feature(cls.sprot.features[0]).reset_index(drop=True),
            cls.df0.reset_index(drop=True))
        assert_frame_equal(
            GFF.df_from_feature(None).reset_index(drop=True),
            cls._empty_frame())

    @classmethod
    def test_from_feature_list(cls):
        # The three features give df1; [], None and no argument give an
        # empty frame.
        assert_frame_equal(
            GFF(cls.sprot.features[0:3]).df.reset_index(drop=True),
            cls.df1.reset_index(drop=True))
        assert_frame_equal(GFF([]).df.reset_index(drop=True),
                           cls._empty_frame())
        assert_frame_equal(GFF(None).df.reset_index(drop=True),
                           cls._empty_frame())
        assert_frame_equal(GFF().df.reset_index(drop=True),
                           cls._empty_frame())

    @classmethod
    def test_add_list(cls):
        # Adding the last two features to a GFF holding the first gives df1.
        combined = GFF([cls.sprot.features[0]]).add_feature_list(
            cls.sprot.features[1:3])
        assert_frame_equal(combined.df.reset_index(drop=True),
                           cls.df1.reset_index(drop=True))

    @classmethod
    def test_to_feature_list(cls):
        expected = cls.sprot.features

        # Each conversion is done once, on its own GFF instance, instead of
        # being recomputed for every index.
        with_parent = GFF(input_df=cls.df1).to_feature_list(parents=cls.sprot)
        orphans = GFF(input_df=cls.df1).to_feature_list()
        # Indexing (rather than zip) keeps the IndexError if a conversion
        # returns fewer features than expected.
        for i, feature in enumerate(expected):
            assert str(with_parent[i]) == str(feature)
            assert with_parent[i].parent.id == feature.parent.id
            assert orphans[i].parent is None

        # One parent per feature, given as a list.
        per_feature = GFF(input_df=cls.df1).to_feature_list(
            parents=[cls.sprot, cls.sprot, cls.sprot])
        for i, feature in enumerate(expected):
            assert str(per_feature[i]) == str(feature)

        # A parent list whose length differs from the feature count is rejected.
        with pytest.raises(
                ValueError,
                match=
                r'The number of parents should match the number of features .*'
        ):
            GFF(input_df=cls.df1).to_feature_list(parents=[cls.sprot])