def test_constructor_interval_metadata_len(self):
    '''Objects built with an ``IntervalMetadata`` of length 1-3 expose it.

    For each upper bound a one-feature ``IntervalMetadata`` is handed to
    ``self._interval_metadata_constructor_`` and the resulting object must
    report that it has interval metadata of the right type.
    '''
    for upper_bound in (1, 2, 3):
        interval_md = IntervalMetadata(upper_bound)
        interval_md.add([(0, 1)], metadata={'a': 'b'})

        built = self._interval_metadata_constructor_(upper_bound,
                                                     interval_md)

        self.assertTrue(built.has_interval_metadata())
        self.assertIsInstance(built.interval_metadata, IntervalMetadata)
def _parse_record(lines):
    '''Parse the lines of one record into ``(sequence ID, IntervalMetadata)``.

    The sequence ID is the second whitespace-separated token of the first
    line. The lines are grouped with ``split``/``split_head``, starting a new
    group at every line that does not begin with a space. Inside a group the
    first line carries the gene ID and the remaining lines are consumed in
    pairs: a terminator description line followed by its hairpin sequence
    line.
    '''
    seq_id = lines[0].split()[1]
    group_lines = split(split_head, is_head=lambda s: not s.startswith(' '))
    interval_md = IntervalMetadata(None)

    for block in group_lines(lines):
        # a group holding only its head line has no terminator predicted
        if len(block) == 1:
            continue

        gene_id = block[0].split()[0]
        rows = iter(block[1:])
        for term_row in rows:
            fields = term_row.split()
            term_id = '{}_{}'.format(fields[0], fields[1])
            # the line right after a terminator line is its hairpin sequence
            hairpin = '/'.join(next(rows).split())
            start = int(fields[2])
            end = int(fields[4])
            strand = fields[5]
            if strand == '-':
                # coordinates are swapped for the minus strand
                start, end = end, start
            interval_md.add(
                [(start, end)],
                metadata={
                    'ID': term_id,
                    'gene_id': gene_id,
                    'confidence': fields[7],
                    'strand': strand,
                    'source': 'TransTermHP',
                    'sequence': hairpin,
                    'type': 'terminator',
                })

    return seq_id, interval_md
def test_ne_diff_bounds(self):
    '''Same feature but different upper bounds (10 vs 9) compare unequal.'''
    feature = {'bounds': [(0, 1)], 'metadata': {'spam': 'foo'}}

    longer = IntervalMetadata(10)
    shorter = IntervalMetadata(9)
    for interval_md in (longer, shorter):
        interval_md.add(**feature)

    self.assertReallyNotEqual(longer, shorter)
def test_init_copy_from(self):
    '''Copy-constructing from ``self.im_1`` under several upper bounds.

    For each upper bound (``None``, 99, 999) the copy must equal a fresh
    ``IntervalMetadata`` holding the same single feature.
    '''
    for upper_bound in (None, 99, 999):
        expected = IntervalMetadata(upper_bound)
        expected.add(bounds=[(1, 2), (4, self.upper_bound)],
                     metadata={'gene': 'sagA', 'bound': 0})

        observed = IntervalMetadata(upper_bound, self.im_1)

        self.assertEqual(observed, expected)
def test_complement_without_reverse_non_empty(self):
    '''``complement()`` on non-empty sequences, with and without metadata.

    Iterates ``self.all_combos_comp_qual``; for every combination the bare
    sequence and a fully annotated sequence are complemented and compared
    with the expected complement built from ``comp_str``.
    '''
    for constructor, seq_str, comp_str, qual in self.all_combos_comp_qual:
        # plain sequence, no metadata of any kind
        self.assertEqual(constructor(seq_str).complement(),
                         constructor(comp_str))

        interval_md = IntervalMetadata(len(seq_str))
        interval_md.add([(0, 1)], metadata={'gene': 'p53'})

        def annotated(residues):
            # fresh metadata dicts per call; the interval metadata is shared
            return constructor(
                residues,
                metadata={'id': 'foo', 'description': 'bar'},
                positional_metadata={'quality': qual},
                interval_metadata=interval_md)

        observed = annotated(seq_str).complement()
        self.assertEqual(observed, annotated(comp_str))
def test_compute_trna_score(self):
    '''Check ``compute_trna_score`` as tRNA features accumulate.

    The asserted scores are 0.1 with no features, 0.6 after adding 16 tRNA
    products and 1 after adding the remaining 4.
    '''
    imd = IntervalMetadata(None)

    def add_trnas(amino_acids):
        # every tRNA feature is placed on the same (0, 12) span
        for aa in amino_acids:
            imd.add([(0, 12)],
                    metadata={'type': 'tRNA', 'product': 'tRNA-' + aa})

    self.assertEqual(compute_trna_score([imd]), 0.1)

    add_trnas(['Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'Leu', 'Lys',
               'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'])
    self.assertEqual(compute_trna_score([imd]), 0.6)

    add_trnas(['Ala', 'Arg', 'Asn', 'Asp'])
    self.assertEqual(compute_trna_score([imd]), 1)
def test_complement_with_reverse_non_empty(self):
    '''``complement(reverse=True)`` on non-empty sequences.

    Iterates ``self.all_combos_rev_comp_qual``. The annotated case expects
    the quality values reversed and the single interval feature moved from
    the first position to the last; the source object must keep an equal
    but distinct interval metadata.
    '''
    combos = self.all_combos_rev_comp_qual
    for constructor, seq_str, rev_comp_str, qual in combos:
        bare_rc = constructor(seq_str).complement(reverse=True)
        self.assertEqual(bare_rc, constructor(rev_comp_str))

        n = len(seq_str)
        forward_im = IntervalMetadata(n)
        forward_im.add([(0, 1)], metadata={'gene': 'p53'})
        reversed_im = IntervalMetadata(n)
        reversed_im.add([(n - 1, n)], metadata={'gene': 'p53'})

        original = constructor(
            seq_str,
            metadata={'id': 'foo', 'description': 'bar'},
            positional_metadata={'quality': qual},
            interval_metadata=forward_im)
        observed = original.complement(reverse=True)
        expected = constructor(
            rev_comp_str,
            metadata={'id': 'foo', 'description': 'bar'},
            positional_metadata={'quality': list(qual)[::-1]},
            interval_metadata=reversed_im)
        self.assertEqual(observed, expected)

        # assert the original object is not changed
        self.assertIsNot(original.interval_metadata, forward_im)
        self.assertEqual(original.interval_metadata, forward_im)
def _parse_record(lines):
    '''Parse the lines of one record into ``(sequence ID, IntervalMetadata)``.

    The sequence ID is the first tab-separated field of the first line.
    Every line is passed to ``_parse_line`` and the resulting bounds and
    metadata are added as one interval feature.
    '''
    interval_md = IntervalMetadata(None)
    seq_id = lines[0].split('\t')[0]
    for bounds, metadata in map(_parse_line, lines):
        interval_md.add(bounds, metadata=metadata)
    return seq_id, interval_md
def test_complement_with_reverse_non_empty(self):
    '''Reverse-complement non-empty sequences, bare and fully annotated.

    For every entry of ``self.all_combos_rev_comp_qual`` the annotated
    result must carry reversed quality values and an interval feature
    mirrored to the end of the sequence, while the input object keeps its
    own, equal copy of the interval metadata.
    '''
    for (constructor, seq_str, rev_comp_str,
         qual) in self.all_combos_rev_comp_qual:
        self.assertEqual(constructor(seq_str).complement(reverse=True),
                         constructor(rev_comp_str))

        seq_len = len(seq_str)
        im_in = IntervalMetadata(seq_len)
        im_in.add([(0, 1)], metadata={'gene': 'p53'})
        im_out = IntervalMetadata(seq_len)
        im_out.add([(seq_len - 1, seq_len)], metadata={'gene': 'p53'})

        original = constructor(
            seq_str,
            metadata={'id': 'foo', 'description': 'bar'},
            positional_metadata={'quality': qual},
            interval_metadata=im_in)
        rev_comp = original.complement(reverse=True)
        self.assertEqual(
            rev_comp,
            constructor(
                rev_comp_str,
                metadata={'id': 'foo', 'description': 'bar'},
                positional_metadata={'quality': list(qual)[::-1]},
                interval_metadata=im_out))

        # assert the original object is not changed
        self.assertIsNot(original.interval_metadata, im_in)
        self.assertEqual(original.interval_metadata, im_in)
def test_create_faa(self):
    '''``create_faa`` writes the protein record for an annotated CDS.

    A DNA sequence with a single CDS feature on ``(0, 120)`` is passed to
    ``create_faa`` and the file at ``self.o`` is compared with the expected
    FASTA text.
    '''
    cds_features = IntervalMetadata(None)
    cds_features.add(
        [(0, 120)],
        metadata={'type': 'CDS',
                  'product': 'Homoserine kinase',
                  'ID': '1_1'})
    nucleotides = (
        'ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGTGCTC'
        'GGGGCGGCGGTGACACCCGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCG'
        'GCAGAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGATAAGCTGCCGTCAGAACCA'
        'CGGGAAAATATCGTTTATCAGTGCTGGGAGCGTTTTTGCCTGGAGCTGGGCAAGCAAATT'
        'CCAGTGGCGATGACTCTGGAAAAGAATATGCCGATCGGCTCGGGCTTAGGCTCCAGCGCC'
        'TGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTGCGGCAAGCCACTTAATGAC'
        'ACCCGTTTGCTGGCTTTGATGGGCGAGCTGGAAGGACGTATCTCCGGCAGCATTCATTAC'
        'GACAACGTGGCACCGTGTTTTCTTGGTGGTATGCAGTTGATGATCGAAGAAAACGACATC'
        'ATCAGCCAGCAAGTGCCAGGGTTTGATGAGTGGCTGTGGGTGCTGGCGTATCCGGGAATT'
        'AAAGTCTCGACGGCAGAAGCCCGGGCTATTTTACCGGCGCAGTATCGCCGCCAGGATTGC'
        'ATTGCGCACGGGCGACATCTGGCTGGCTTCATTCACGCCTGCTATTCCCGTCAGCCTGAG'
        'CTTGCCGCGAAGCTGATGAAAGATGTTATCGCTGAACCCTACCGTGAACGGTTACTGCCT'
        'GGCTTCCGGCAGGCGCGGCAGGCGGTCGCGGAAATCGGCGCGGTAGCGAGCGGTATCTCC'
        'GGCTCCGGCCCGACCTTGTTCGCTCTATGTGACAAGCCGGATACCGCCCAGCGCGTTGCC'
        'GACTGGTTGGGTAAGAACTACCTGCAAAATCAGGAAGGTTTTGTTCATATTTGCCGGCTG'
        'GATACGGCGGGCGCACGAGTACTGGAAAACTAA')
    genome = DNA(nucleotides, interval_metadata=cds_features)

    create_faa([genome], self.o)

    expected = ('>1_1 Homoserine kinase\n'
                'MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEA\n')
    with open(self.o) as handle:
        observed = handle.read()
    self.assertEqual(expected, observed)
def _parse_record(lines):
    '''Build an unbounded ``IntervalMetadata`` from the lines of one record.

    Each line is converted by ``_parse_line`` into bounds and a metadata
    dict, which are added as one interval feature.
    '''
    interval_md = IntervalMetadata(None)
    for bounds, metadata in map(_parse_line, lines):
        interval_md.add(bounds, metadata=metadata)
    return interval_md
def test_raise_subregion(self):
    '''A feature with two spans raises when ``skip_subregion=False``.'''
    two_span = IntervalMetadata(None)
    two_span.add([(0, 3), (7, 9)], metadata={'type': 'gene'})

    with io.StringIO() as buffer, self.assertRaises(GFF3FormatError):
        _serialize_interval_metadata(
            two_span, seq_id='a', fh=buffer, skip_subregion=False)
def _parse_record(lines, length):
    '''Parse GFF3 feature lines into an ``IntervalMetadata`` object.

    Parameters
    ----------
    lines : iterable of str
        Tab-separated feature lines; each must have exactly 9 columns.
    length : int or None
        Upper bound handed to ``IntervalMetadata``.

    Returns
    -------
    IntervalMetadata
        One interval feature per input line.

    Raises
    ------
    GFF3FormatError
        If a line does not have 9 columns, or its phase column is neither
        an integer nor ``'.'``.
    '''
    interval_md = IntervalMetadata(length)
    for line in lines:
        fields = line.split('\t')
        if len(fields) != 9:
            raise GFF3FormatError(
                'do not have 9 columns in this line: "%s"' % line)

        # the 1st column is the seq ID of every feature; it is repetitive
        # information and is not stored
        (_, source, feature_type, start, end,
         score, strand, phase, attributes) = fields
        md = {'source': source,
              'type': feature_type,
              'score': score,
              'strand': strand}

        # phase is stored only when it is an int; '.' means "no phase"
        try:
            md['phase'] = int(phase)
        except ValueError:
            if phase != '.':
                raise GFF3FormatError(
                    'unknown value for phase column: {!r}'.format(phase))

        md.update(_parse_attr(attributes))

        # the start coordinate is shifted down by one before being stored
        interval_md.add([(int(start) - 1, int(end))], metadata=md)

    return interval_md
def _parse_record(lines, length):
    '''Turn GFF3 feature lines into an ``IntervalMetadata`` of given length.

    Every line must split on tabs into exactly 9 columns. Columns 2, 3, 6
    and 7 become the ``source``, ``type``, ``score`` and ``strand``
    metadata; column 8 is stored as an integer ``phase`` unless it is
    ``'.'``; column 9 is expanded by ``_parse_attr``. Columns 4 and 5 give
    the bounds, with the start lowered by one.

    Raises ``GFF3FormatError`` for a wrong column count or a phase value
    that is neither an integer nor ``'.'``.
    '''
    result = IntervalMetadata(length)

    for raw in lines:
        cols = raw.split('\t')
        # there should be 9 columns
        if len(cols) != 9:
            raise GFF3FormatError(
                'do not have 9 columns in this line: "%s"' % raw)

        # column 1 (seq ID) repeats for every feature and is not kept
        keys = ('source', 'type', 'score', 'strand')
        values = (cols[1], cols[2], cols[5], cols[6])
        feature_md = dict(zip(keys, values))

        phase_text = cols[7]
        # phase value can only be int or '.'
        try:
            feature_md['phase'] = int(phase_text)
        except ValueError:
            if phase_text != '.':
                raise GFF3FormatError(
                    'unknown value for phase column: {!r}'.format(
                        phase_text))

        feature_md.update(_parse_attr(cols[8]))

        begin, stop = cols[3:5]
        result.add([(int(begin) - 1, int(stop))], metadata=feature_md)

    return result
def _parse_record(lines):
    '''Parse the lines of one record into ``(sequence ID, IntervalMetadata)``.

    The sequence ID is the third whitespace-separated token of the first
    line. Every line is converted by ``_parse_line`` into bounds and
    metadata, which are added as one interval feature.
    '''
    interval_md = IntervalMetadata(None)
    seq_id = lines[0].split()[2]
    for bounds, metadata in map(_parse_line, lines):
        interval_md.add(bounds, metadata=metadata)
    return seq_id, interval_md
def test_ne_only_one_is_empty(self):
    '''An object with one feature differs from one with no features.'''
    build = self._interval_metadata_constructor_

    populated_im = IntervalMetadata(self.upper_bound)
    populated_im.add(**self.intvls[0])

    with_feature = build(self.upper_bound, populated_im)
    without_feature = build(self.upper_bound)

    self.assertReallyNotEqual(with_feature, without_feature)
def test_upper_bound_is_none(self):
    '''Behaviour of an ``IntervalMetadata`` whose upper bound is ``None``.

    Adding a very large interval must succeed and the upper bound stays
    ``None``; ``_reverse`` and ``concat`` must raise ``TypeError``.
    '''
    unbounded = IntervalMetadata(None)
    # should not raise error
    unbounded.add([(0, 1000000000)])
    self.assertIsNone(unbounded.upper_bound)

    expected_msg = 'upper bound is `None`'
    with self.assertRaisesRegex(TypeError, expected_msg):
        unbounded._reverse()
    with self.assertRaisesRegex(TypeError, expected_msg):
        IntervalMetadata.concat([self.im_1, unbounded])
def test_eq_populated_differently(self):
    '''Equality does not depend on how the feature was attached.

    One object receives a pre-populated ``IntervalMetadata`` at
    construction; the other is built empty and gets the same feature added
    through its ``interval_metadata`` attribute.
    '''
    build = self._interval_metadata_constructor_

    prefilled = IntervalMetadata(self.upper_bound)
    prefilled.add(**self.intvls[0])
    built_with = build(self.upper_bound, prefilled)

    filled_later = build(self.upper_bound)
    filled_later.interval_metadata.add(**self.intvls[0])

    self.assertReallyEqual(built_with, filled_later)
def test_raise_subregion(self):
    '''Serializing a two-span feature with ``skip_subregion=False`` raises.'''
    interval_md = IntervalMetadata(None)
    interval_md.add([(0, 3), (7, 9)], metadata={'type': 'gene'})

    out = io.StringIO()
    with out:
        with self.assertRaises(GFF3FormatError):
            _serialize_interval_metadata(interval_md,
                                         seq_id='a',
                                         fh=out,
                                         skip_subregion=False)
def test_eq_basic(self):
    '''Two objects built from equal interval metadata compare equal.'''
    def build_one():
        # a fresh IntervalMetadata holding the first fixture feature
        interval_md = IntervalMetadata(self.upper_bound)
        interval_md.add(**self.intvls[0])
        return self._interval_metadata_constructor_(self.upper_bound,
                                                    interval_md)

    first = build_one()
    second = build_one()
    self.assertReallyEqual(first, second)
def test_init_copy_from(self):
    '''``IntervalMetadata(bound, self.im_1)`` copies the single feature.

    Checked for upper bounds ``None``, 99 and 999.
    '''
    feature = dict(bounds=[(1, 2), (4, self.upper_bound)],
                   metadata={'gene': 'sagA', 'bound': 0})
    for bound in [None, 99, 999]:
        copied = IntervalMetadata(bound, self.im_1)

        reference = IntervalMetadata(bound)
        # pass fresh containers on every iteration
        reference.add(bounds=list(feature['bounds']),
                      metadata=dict(feature['metadata']))

        self.assertEqual(copied, reference)
def test_transcribe_preserves_all_metadata(self):
    '''``DNA.transcribe`` keeps all three kinds of metadata.

    The transcribed ``'AGTT'`` must equal an ``RNA('AGUU')`` carrying the
    same metadata, positional metadata and interval metadata.
    '''
    interval_md = IntervalMetadata(4)
    interval_md.add([(0, 2)], metadata={'gene': 'p53'})

    def annotations():
        # fresh dict/range objects for each sequence; shared interval md
        return {'metadata': {'foo': 'bar'},
                'positional_metadata': {'foo': range(4)},
                'interval_metadata': interval_md}

    expected = RNA('AGUU', **annotations())
    dna = DNA('AGTT', **annotations())

    self.assertEqual(dna.transcribe(), expected)
def test_upper_bound_is_none(self):
    '''An unbounded ``IntervalMetadata`` accepts any interval.

    It must refuse ``_reverse`` and ``concat`` with a ``TypeError``
    mentioning the ``None`` upper bound.
    '''
    im = IntervalMetadata(None)
    # should not raise error
    im.add([(0, 1000000000)])
    self.assertIsNone(im.upper_bound)

    pattern = r'upper bound is `None`'
    failing_calls = (
        im._reverse,
        lambda: IntervalMetadata.concat([self.im_1, im]),
    )
    for call in failing_calls:
        with self.assertRaisesRegex(TypeError, pattern):
            call()
def test_interval_metadata_to_gff3_missing_field(self):
    '''Metadata lacking several fields serializes with ``.`` placeholders.

    The feature only has ``type``, ``ID`` and ``Name``; the expected line
    shows ``.`` in the source, score, strand and phase columns.
    '''
    imd = IntervalMetadata(9)
    imd.add([(0, 9)],
            metadata={'type': 'gene', 'ID': 'gene00001', 'Name': 'EDEN'})

    with io.StringIO() as fh:
        _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
        written = fh.getvalue()

    # only compare the uncommented lines because the comments are not
    # stored in IntervalMetadata
    feature_lines = [line for line in written.splitlines()
                     if not line.startswith('#')]

    exp = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=gene00001;Name=EDEN'
    self.assertEqual([exp], feature_lines)
def test_interval_metadata_to_gff3_multiple_values(self):
    '''Multiple ``db_xref`` values are written as one ``Dbxref`` attribute.

    The expected attribute column joins the two values with a comma.
    '''
    imd = IntervalMetadata(9)
    imd.add([(0, 9)],
            metadata={'type': 'gene',
                      'db_xref': ['GO:000152', 'GO:001234']})

    with io.StringIO() as fh:
        _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
        written = fh.getvalue()

    # only compare the uncommented lines because the comments are not
    # stored in IntervalMetadata
    feature_lines = [line for line in written.splitlines()
                     if not line.startswith('#')]

    exp = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tDbxref=GO:000152,GO:001234'
    self.assertEqual([exp], feature_lines)
def test_init_nondefault_parameters(self):
    '''Constructor stores values and all three kinds of metadata.

    Builds an ``ExampleGrammaredSequence`` with explicit metadata,
    positional metadata and interval metadata and checks each attribute.
    '''
    residues = '.-ABCXYZ'
    interval_md = IntervalMetadata(8)
    interval_md.add([(1, 8)], metadata={'gene': 'p53'})

    seq = ExampleGrammaredSequence(
        residues,
        metadata={'id': 'foo'},
        positional_metadata={'quality': range(8)},
        interval_metadata=interval_md)

    expected_values = np.array('.-ABCXYZ', dtype='c')
    npt.assert_equal(seq.values, expected_values)
    self.assertEqual(seq.metadata, {'id': 'foo'})
    expected_positional = pd.DataFrame({'quality': range(8)})
    assert_data_frame_almost_equal(seq.positional_metadata,
                                   expected_positional)
    self.assertEqual(seq.interval_metadata, interval_md)
def test_interval_metadata_to_gff3_escape(self):
    '''Reserved GFF3 characters in attribute values are percent-escaped.

    The ID ``'a;=&,b'`` is expected to be written as ``a%3B%3D%26%2Cb``.
    '''
    imd = IntervalMetadata(9)
    imd.add([(0, 9)], metadata={'type': 'gene', 'ID': 'a;=&,b'})

    with io.StringIO() as fh:
        _interval_metadata_to_gff3(imd, fh, seq_id='ctg123')
        written = fh.getvalue()

    # only compare the uncommented lines because the comments are not
    # stored in IntervalMetadata
    feature_lines = [line for line in written.splitlines()
                     if not line.startswith('#')]

    exp = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=a%3B%3D%26%2Cb'
    self.assertEqual([exp], feature_lines)
def test_interval_metadata_to_gff3_escape(self):
    '''Check that ``;``, ``=``, ``&`` and ``,`` are escaped in attributes.'''
    expected_line = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=a%3B%3D%26%2Cb'

    interval_md = IntervalMetadata(9)
    interval_md.add([(0, 9)], metadata={'type': 'gene', 'ID': 'a;=&,b'})

    with io.StringIO() as stream:
        _interval_metadata_to_gff3(interval_md, stream, seq_id='ctg123')
        # comment lines are skipped because comments are not stored in
        # IntervalMetadata
        observed = []
        for text in stream.getvalue().splitlines():
            if text.startswith('#'):
                continue
            observed.append(text)

    self.assertEqual([expected_line], observed)
def test_parse(self):
    '''Parse the ``transtermhp.tt`` fixture and verify its records.

    Each yielded record is checked for its sequence ID and feature count;
    the interval metadata of the last record consumed is then compared
    feature by feature with four expected terminators.
    '''
    fp = get_data_path('transtermhp.tt')
    expected_summaries = [('gi|556503834|ref|NC_000913.3|1', 12),
                          ('gi|556503834|ref|NC_000913.3|2', 4)]
    for (sid, imd), (exp_sid, exp_count) in zip(_generator(fp),
                                                expected_summaries):
        self.assertEqual(sid, exp_sid)
        self.assertEqual(imd.num_interval_features, exp_count)

    # test the interval metadata from the 2nd sequence
    terminators = [
        ([(7857, 7876)],
         {'strand': '+',
          'confidence': '95',
          'sequence': 'TGCCCACGATTAAAG/GTGGCCGC/CCTG/GCGGTCAC/TTCTTTGAGAAAAGG',
          'source': 'TransTermHP',
          'ID': 'TERM_1',
          'gene_id': '2_7',
          'type': 'terminator'}),
        ([(8919, 8958)],
         {'strand': '-',
          'confidence': '100',
          'sequence': 'AATGAGCCAGAATAA/GCTAAGGTTGAAGGGGC/TGGAAC/GCCCCTTCAACCTTAGC/AGTAGCGTGGGATGA',
          'source': 'TransTermHP',
          'ID': 'TERM_2',
          'gene_id': '2_9',
          'type': 'terminator'}),
        ([(9258, 9273)],
         {'strand': '+',
          'confidence': '89',
          'sequence': 'GGCAGAAACAAAAAA/TCCCCG/GACT/CGGGGA/TTTATGTACAAGAGG',
          'ID': 'TERM_3',
          'gene_id': '2_9',
          'source': 'TransTermHP',
          'type': 'terminator'}),
        ([(9258, 9273)],
         {'strand': '-',
          'confidence': '100',
          'sequence': 'GGCAGAAACAAAAAA/TCCCCG/GACT/CGGGGA/TTTATGTACAAGAGG',
          'ID': 'TERM_4',
          'gene_id': '2_9',
          'source': 'TransTermHP',
          'type': 'terminator'}),
    ]
    exp_imd = IntervalMetadata(None)
    for bounds, metadata in terminators:
        exp_imd.add(bounds, metadata=metadata)

    self.assertEqual(exp_imd, imd)
def test_interval_metadata_to_gff3_missing_field(self):
    '''Columns without a metadata value are written as ``.``.'''
    expected_line = 'ctg123\t.\tgene\t1\t9\t.\t.\t.\tID=gene00001;Name=EDEN'

    interval_md = IntervalMetadata(9)
    gene_md = {'type': 'gene', 'ID': 'gene00001', 'Name': 'EDEN'}
    interval_md.add([(0, 9)], metadata=gene_md)

    with io.StringIO() as stream:
        _interval_metadata_to_gff3(interval_md, stream, seq_id='ctg123')
        # comment lines are skipped because comments are not stored in
        # IntervalMetadata
        observed = []
        for text in stream.getvalue().splitlines():
            if text.startswith('#'):
                continue
            observed.append(text)

    self.assertEqual([expected_line], observed)
def test_interval_metadata_to_gff3_multiple_values(self):
    '''A list-valued ``db_xref`` becomes a comma-joined ``Dbxref``.'''
    # test multiple values of db_xref are correctly serialized
    expected_line = (
        'ctg123\t.\tgene\t1\t9\t.\t.\t.\tDbxref=GO:000152,GO:001234')

    interval_md = IntervalMetadata(9)
    gene_md = {'type': 'gene', 'db_xref': ['GO:000152', 'GO:001234']}
    interval_md.add([(0, 9)], metadata=gene_md)

    with io.StringIO() as stream:
        _interval_metadata_to_gff3(interval_md, stream, seq_id='ctg123')
        # comment lines are skipped because comments are not stored in
        # IntervalMetadata
        observed = []
        for text in stream.getvalue().splitlines():
            if text.startswith('#'):
                continue
            observed.append(text)

    self.assertEqual([expected_line], observed)
def test_filter_partial_genes(self):
    '''``filter_partial_genes`` drops features flagged as partial.

    Two records are written to a GFF3 file under ``self.tmpd``: ``seq1``
    has a single feature with ``partial`` of ``'01'``; ``seq2`` has one
    feature with ``'10'`` and one with ``'00'``. After filtering, the
    output is expected to contain exactly one record, ``seq2``, holding
    only the ``'00'`` feature.
    '''
    in_fp = join(self.tmpd, 'in.gff')
    out_fp = join(self.tmpd, 'out.gff')

    imd1 = IntervalMetadata(None)
    imd1.add(
        [(0, 100)],
        metadata={
            'partial': '01', 'phase': 0, 'source': 'Prodigal_v2.6.3',
            'strand': '.', 'type': '.', 'score': '.'
        })
    imd2 = IntervalMetadata(None)
    imd2.add(
        [(200, 300)],
        metadata={
            'partial': '10', 'phase': 1, 'source': 'Prodigal_v2.6.3',
            'strand': '-', 'type': 'CDS', 'score': '1'
        })
    imd2.add(
        [(2000, 3000)],
        metadata={
            'partial': '00', 'phase': 1, 'source': 'Prodigal_v2.6.3',
            'strand': '.', 'type': '.', 'score': '.'
        })
    # expected content of seq2 after filtering
    imd3 = IntervalMetadata(None)
    imd3.add(
        [(2000, 3000)],
        metadata={
            'partial': '00', 'phase': 1, 'source': 'Prodigal_v2.6.3',
            'strand': '.', 'type': '.', 'score': '.'
        })

    data = (('seq1', imd1), ('seq2', imd2))
    write(((sid, imd) for sid, imd in data), into=in_fp, format='gff3')

    filter_partial_genes(in_fp, out_fp)

    obs = list(read(out_fp, format='gff3'))
    exp = [('seq2', imd3)]
    # zip() stops at the shorter input, so without this length check an
    # empty output (or one with surplus records) would pass unnoticed
    self.assertEqual(len(obs), len(exp))
    for i, j in zip(obs, exp):
        self.assertEqual(i, j)
def test_complement_without_reverse_non_empty(self):
    '''Complement non-empty sequences, bare and fully annotated.

    For each entry in ``self.all_combos_comp_qual`` the complement of the
    annotated sequence must equal ``comp_str`` built with the same
    metadata, quality values and interval metadata.
    '''
    for combo in self.all_combos_comp_qual:
        constructor, seq_str, comp_str, qual = combo

        plain = constructor(seq_str).complement()
        self.assertEqual(plain, constructor(comp_str))

        im = IntervalMetadata(len(seq_str))
        im.add([(0, 1)], metadata={'gene': 'p53'})

        source = constructor(
            seq_str,
            metadata={'id': 'foo', 'description': 'bar'},
            positional_metadata={'quality': qual},
            interval_metadata=im)
        observed = source.complement()
        expected = constructor(
            comp_str,
            metadata={'id': 'foo', 'description': 'bar'},
            positional_metadata={'quality': qual},
            interval_metadata=im)
        self.assertEqual(observed, expected)
def test_parse(self):
    '''Parse the ``aragorn.txt`` fixture and compare with expected tRNAs.

    Expected records are paired with the generator output in order; the
    third expected record has no features.
    '''
    def trna(strand, product):
        # metadata shared by every expected tRNA feature
        return {'strand': strand,
                'type': 'tRNA',
                'source': 'Aragorn',
                'product': product}

    expected_records = (
        ('NC_016822.1', [([(237929, 238006)], trna('+', 'tRNA-Ile')),
                         ([(238048, 238124)], trna('+', 'tRNA-Ala'))]),
        ('NC_016833.1', [([(4954141, 4954228)], trna('-', 'tRNA-Ser'))]),
        ('NC_016834.1', []),
    )
    exp = []
    for seq_id, features in expected_records:
        interval_md = IntervalMetadata(None)
        for bounds, metadata in features:
            interval_md.add(bounds=bounds, metadata=metadata)
        exp.append((seq_id, interval_md))

    gen = _generator(get_data_path('aragorn.txt'))
    for expected, observed in zip(exp, gen):
        exp_id, exp_imd = expected
        obs_id, obs_imd = observed
        self.assertEqual(exp_id, obs_id)
        self.assertEqual(exp_imd, obs_imd)
def test_parse(self):
    '''Parse ``tandem_repeats_finder.txt`` and compare expected repeats.'''
    def tandem(repeat):
        # metadata shared by every expected tandem-repeat feature
        return {'source': 'Tandem_Repeats_Finder',
                'repeat': repeat,
                'type': 'tandem_repeat'}

    expected_records = (
        ('NC_016822.1', [([(252151, 252184)], tandem('T')),
                         ([(261169, 261210)], tandem('CTCTGA'))]),
        ('NC_016833.1', [([(172614, 172703)], tandem('AACAGCCGC'))]),
    )
    exp = []
    for seq_id, features in expected_records:
        interval_md = IntervalMetadata(None)
        for bounds, metadata in features:
            interval_md.add(bounds=bounds, metadata=metadata)
        exp.append((seq_id, interval_md))

    gen = _generator(get_data_path('tandem_repeats_finder.txt'))
    for expected, observed in zip(exp, gen):
        exp_id, exp_imd = expected
        obs_id, obs_imd = observed
        self.assertEqual(exp_id, obs_id)
        self.assertEqual(exp_imd, obs_imd)
def test_parse(self):
    '''Parse the ``cmscan.txt`` fixture and compare expected ncRNA hits.'''
    expected_records = (
        ('NC_016822.1', [
            ([(3588441, 3588818)],
             {'ncRNA_class': 'RNaseP_bact_a',
              'type': 'ncRNA',
              'strand': '-',
              'db_xref': 'RF00010',
              'source': 'Rfam'}),
            ([(3355449, 3355633)],
             {'ncRNA_class': '5S_rRNA',
              'type': 'rRNA',
              'strand': '+',
              'product': '5s_rRNA',
              'db_xref': 'RF00001',
              'source': 'Rfam'}),
        ]),
        ('NC_016833.1', [
            ([(85215, 85384)],
             {'ncRNA_class': 'LSU_rRNA_bacteria',
              'type': 'rRNA',
              'strand': '+',
              'product': '23s_rRNA',
              'db_xref': 'RF02541',
              'source': 'Rfam'}),
        ]),
        ('NC_016834.1', [
            ([(8739, 8777)],
             {'ncRNA_class': 'SSU_rRNA_bacteria',
              'type': 'rRNA',
              'strand': '+',
              'product': '16s_rRNA',
              'db_xref': 'RF00177',
              'source': 'Rfam'}),
        ]),
    )
    exp = []
    for seq_id, features in expected_records:
        interval_md = IntervalMetadata(None)
        for bounds, metadata in features:
            interval_md.add(bounds=bounds, metadata=metadata)
        exp.append((seq_id, interval_md))

    gen = _generator(get_data_path('cmscan.txt'))
    for expected, observed in zip(exp, gen):
        exp_id, exp_imd = expected
        obs_id, obs_imd = observed
        self.assertEqual(exp_id, obs_id)
        self.assertEqual(exp_imd, obs_imd)
def test_serialize_location(self):
    '''``_serialize_location`` output for single, fuzzy and joined spans.

    Each case adds one interval to the same ``IntervalMetadata`` and
    checks the location string produced for that interval.
    '''
    imd = IntervalMetadata(9)
    minus = {'metadata': {'strand': '-'}}
    # (positional args for add, keyword args for add, expected location)
    cases = [
        (([(0, 1)],), {}, '1'),
        (([(0, 2)], [(True, True)]), {}, '<1..>2'),
        (([(0, 2)], [(False, True)]), {}, '1..>2'),
        (([(0, 2)], [(True, False)]), {}, '<1..2'),
        (([(0, 2), (3, 9)],),
         {'metadata': dict(minus['metadata'])},
         'complement(join(1..2,4..9))'),
        (([(0, 2), (3, 9)], [(True, False), (False, True)]),
         {'metadata': dict(minus['metadata'])},
         'complement(join(<1..2,4..>9))'),
    ]
    for add_args, add_kwargs, expected in cases:
        interval = imd.add(*add_args, **add_kwargs)
        self.assertEqual(_serialize_location(interval), expected)
def test_parse(self):
    '''Parse the ``rnammer.gff`` fixture and compare expected rRNA hits.'''
    def rrna(product, strand, score):
        # metadata shared by every expected rRNA feature
        return {'source': 'RNAmmer-1.2',
                'type': 'rRNA',
                'product': product,
                'strand': strand,
                'score': score}

    expected_records = (
        ('NZ_JXDA01000005.1', [
            ([(0, 2853)], rrna('23s_rRNA', '-', '3222.8')),
            ([(2924, 3040)], rrna('5s_rRNA', '+', '80.8')),
        ]),
        ('NZ_JXDA01000001.1', [
            ([(77272, 78834)], rrna('16s_rRNA', '+', '1984.2')),
        ]),
    )
    exp = []
    for seq_id, features in expected_records:
        interval_md = IntervalMetadata(None)
        for bounds, metadata in features:
            interval_md.add(bounds=bounds, metadata=metadata)
        exp.append((seq_id, interval_md))

    gen = _generator(get_data_path('rnammer.gff'))
    for expected, observed in zip(exp, gen):
        exp_id, exp_imd = expected
        obs_id, obs_imd = observed
        self.assertEqual(exp_id, obs_id)
        self.assertEqual(exp_imd, obs_imd)
class GFF3IOTests(TestCase):
    '''Fixture data for the GFF3 I/O tests.'''

    def setUp(self):
        '''Create fixture paths, interval metadata and sequences.

        ``imd1`` is bounded by ``upper_bound`` and holds the chromosome and
        promoter features; ``imd2`` and ``imd3`` are unbounded and hold the
        gene/CDS pair and the two-span gene respectively. ``seq`` is a
        ``Sequence`` with one gene feature and ``dna`` is built from it.
        '''
        self.multi_fp = get_data_path('gff3_multi_record')
        self.single_fp = get_data_path('gff3_single_record')

        chromosome = {
            'bounds': [(0, 4641652)],
            'metadata': {'source': 'European Nucleotide Archive',
                         'type': 'chromosome',
                         'score': '.',
                         'strand': '.',
                         'ID': 'chromosome:Chromosome',
                         'Alias': 'U00096.3',
                         'Is_circular': 'true'}}
        promoter = {
            'bounds': [(147, 148)],
            'metadata': {'source': 'regulondb_feature',
                         'type': 'biological_region',
                         'score': '.',
                         'strand': '+',
                         'external_name':
                         'Promoter thrLp (RegulonDB:ECK120010236)',
                         'logic_name': 'regulondb_promoter'}}
        gene = {
            'bounds': [(336, 2799)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'gene',
                         'score': '1.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_1',
                         'gc_cont': '0.427'}}
        cds = {
            'bounds': [(336, 2799)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'CDS',
                         'score': '333.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_2',
                         'Parent': '1_1',
                         'rbs_motif': 'GGAG/GAGG',
                         'rbs_spacer': '5-10bp'}}
        two_span_gene = {
            'bounds': [(0, 50), (55, 100)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'gene',
                         'score': '1.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_1',
                         'gene': 'FXR receptor'}}

        self.upper_bound = 4641652
        self.imd1 = IntervalMetadata(self.upper_bound)
        for feature in (chromosome, promoter):
            self.imd1.add(**feature)

        self.imd2 = IntervalMetadata(None)
        for feature in (gene, cds):
            self.imd2.add(**feature)

        self.imd3 = IntervalMetadata(None)
        self.imd3.add(**two_span_gene)

        self.seq_fp = get_data_path('gff3_dna')
        self.seq = Sequence(
            'ATGCATGCATGC',
            metadata={'id': 'NC_1', 'description': 'species X'})
        self.seq.interval_metadata.add(
            [(0, 9)],
            metadata={'source': 'Prodigal_v2.60',
                      'type': 'gene',
                      'score': '.',
                      'strand': '+',
                      'phase': 0,
                      'ID': 'gene1',
                      'Name': 'FXR'})
        self.dna = DNA(self.seq)
def test_eq_ne(self):
    '''Equality ignores insertion order but not metadata values.

    ``im1`` and ``im2`` hold the same two features added in opposite
    order; ``im3`` differs from ``im1`` only in the ``bound`` value of the
    ``sagA`` feature.
    '''
    def sag_a(bound):
        return {'metadata': {'gene': 'sagA', 'bound': bound},
                'bounds': [(0, 2), (4, 7)]}

    def sag_b():
        return {'metadata': {'gene': 'sagB', 'bound': '3'},
                'bounds': [(3, 5)]}

    im1 = IntervalMetadata(10)
    im1.add(**sag_a('0'))
    im1.add(**sag_b())

    # The ordering shouldn't matter
    im2 = IntervalMetadata(10)
    im2.add(**sag_b())
    im2.add(**sag_a('0'))

    im3 = IntervalMetadata(10)
    im3.add(**sag_a('3'))
    im3.add(**sag_b())

    self.assertReallyEqual(im1, im2)
    self.assertReallyNotEqual(im1, im3)
def test_compute_rrna_score(self):
    '''Check ``compute_rrna_score`` on several sets of interval metadata.

    The asserted scores are 0.1 for no input, 0.5 for ``imd2`` alone, 0.6
    for ``imd1`` alone, and ``0.1 + 0.3 * k`` (summed as floats) for the
    two- and three-object inputs.
    '''
    def rrna(product):
        return {'type': 'rRNA', 'product': product}

    imd1 = IntervalMetadata(None)
    imd1.add([(0, 100)], metadata=rrna('5s_rRNA'))
    imd1.add([(0, 725)], metadata=rrna('16s_rRNA'))
    imd1.add([(0, 12)], metadata={'type': 'tRNA'})

    imd2 = IntervalMetadata(None)
    imd2.add([(0, 10)], metadata=rrna('5s_rRNA'))
    imd2.add([(0, 1450)], metadata=rrna('16s_rRNA'))

    imd3 = IntervalMetadata(None)
    imd3.add([(0, 2900)], metadata=rrna('23s_rRNA'))

    self.assertEqual(compute_rrna_score([imd1, imd2, imd3]),
                     0.1 + sum([0.3] * 3))
    self.assertEqual(compute_rrna_score([imd1, imd2]),
                     0.1 + sum([0.3] * 2))
    self.assertEqual(compute_rrna_score([]), 0.1)
    self.assertEqual(compute_rrna_score([imd2]), 0.5)
    self.assertEqual(compute_rrna_score([imd1]), 0.6)
class TestIntervalMetadata(unittest.TestCase, ReallyEqualMixin): def setUp(self): self.upper_bound = 10 self.im_empty = IntervalMetadata(self.upper_bound) self.im_1 = IntervalMetadata(self.upper_bound) self.im_1_1 = Interval( interval_metadata=self.im_1, bounds=[(1, 2), (4, self.upper_bound)], metadata={'gene': 'sagA', 'bound': 0}) self.im_2 = IntervalMetadata(self.upper_bound) self.im_2_1 = Interval( interval_metadata=self.im_2, bounds=[(1, 2), (4, self.upper_bound)], metadata={'gene': 'sagA', 'bound': 0}) self.im_2_2 = Interval( interval_metadata=self.im_2, bounds=[(3, 5)], metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]}) def test_copy_empty(self): obs = copy(self.im_empty) self.assertEqual(obs, self.im_empty) self.assertIsNot(obs._intervals, self.im_empty._intervals) self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree) def test_copy(self): obs = copy(self.im_2) self.assertEqual(obs, self.im_2) self.assertIsNot(obs._intervals, self.im_2._intervals) self.assertIsNot(obs._interval_tree, self.im_2._interval_tree) for i in range(self.im_2.num_interval_features): i1, i2 = obs._intervals[i], self.im_2._intervals[i] self.assertIsNot(i1, i2) self.assertIsNot(i1.bounds, i2.bounds) self.assertIsNot(i1.fuzzy, i2.fuzzy) self.assertIsNot(i1._interval_metadata, i2._interval_metadata) self.assertIsNot(i1.metadata, i2.metadata) for k in i1.metadata: self.assertIs(i1.metadata[k], i2.metadata[k]) def test_deepcopy(self): obs = deepcopy(self.im_2) self.assertEqual(obs, self.im_2) self.assertIsNot(obs._intervals, self.im_2._intervals) self.assertIsNot(obs._interval_tree, self.im_2._interval_tree) for i in range(self.im_2.num_interval_features): i1, i2 = obs._intervals[i], self.im_2._intervals[i] self.assertIsNot(i1, i2) self.assertIsNot(i1.bounds, i2.bounds) self.assertIsNot(i1.fuzzy, i2.fuzzy) self.assertIsNot(i1.metadata, i2.metadata) i2.metadata['spam'].append(1) self.assertEqual(i2.metadata, {'gene': 'sagB', 'bound': 0, 'spam': [0, 1]}) 
self.assertEqual(i1.metadata, {'gene': 'sagB', 'bound': 0, 'spam': [0]}) def test_deepcopy_memo_is_respected(self): memo = {} deepcopy(self.im_1, memo) self.assertGreater(len(memo), 2) def test_init(self): self.assertFalse(self.im_empty._is_stale_tree) self.assertEqual(self.im_empty._intervals, []) def test_init_upper_bound_lt_lower_bound(self): # test that no exception is raised IntervalMetadata(0) with self.assertRaises(ValueError): IntervalMetadata(-1) def test_upper_bound_is_none(self): im = IntervalMetadata(None) # should not raise error im.add([(0, 1000000000)]) self.assertIsNone(im.upper_bound) with self.assertRaisesRegex( TypeError, r'upper bound is `None`'): im._reverse() with self.assertRaisesRegex( TypeError, r'upper bound is `None`'): IntervalMetadata.concat([self.im_1, im]) def test_init_copy_from(self): for i in [None, 99, 999]: obs = IntervalMetadata(i, self.im_1) exp = IntervalMetadata(i) exp.add(bounds=[(1, 2), (4, self.upper_bound)], metadata={'gene': 'sagA', 'bound': 0}) self.assertEqual(obs, exp) def test_init_copy_from_empty(self): for i in [None, 0, 9, 99, 999]: obs = IntervalMetadata(i, self.im_empty) exp = IntervalMetadata(i) self.assertEqual(obs, exp) # test it is shallow copy self.assertIsNot(obs._intervals, self.im_empty._intervals) self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree) def test_init_copy_from_shallow_copy(self): obs = IntervalMetadata(self.upper_bound, self.im_2) self.assertEqual(self.im_2, obs) # test it is shallow copy self.assertIsNot(obs._intervals, self.im_2._intervals) self.assertIsNot(obs._interval_tree, self.im_2._interval_tree) for i in range(self.im_2.num_interval_features): i1, i2 = obs._intervals[i], self.im_2._intervals[i] self.assertIsNot(i1, i2) self.assertIsNot(i1.bounds, i2.bounds) self.assertIsNot(i1.fuzzy, i2.fuzzy) self.assertIsNot(i1._interval_metadata, i2._interval_metadata) self.assertIsNot(i1.metadata, i2.metadata) for k in i1.metadata: self.assertIs(i1.metadata[k], i2.metadata[k]) def 
def test_init_copy_from_error(self):
    """Copying im_2 into an upper bound one below the fixture's fails."""
    i = self.upper_bound - 1
    with self.assertRaisesRegex(
            ValueError, r'larger than upper bound \(%r\)' % i):
        IntervalMetadata(i, self.im_2)


def test_num_interval_features(self):
    """num_interval_features reports 0, 1 and 2 for the three fixtures."""
    self.assertEqual(self.im_empty.num_interval_features, 0)
    self.assertEqual(self.im_1.num_interval_features, 1)
    self.assertEqual(self.im_2.num_interval_features, 2)


def test_duplicate(self):
    """Test query and drop methods on duplicate Intervals."""
    intvl_1 = self.im_empty.add([(1, 2)])
    intvl_2 = self.im_empty.add([(1, 2)])
    self.assertEqual(len(list(self.im_empty.query([(1, 2)]))), 2)
    self.im_empty.drop([intvl_1])
    self.assertEqual(len(self.im_empty._intervals), 1)
    # identity, not equality: the two intervals are otherwise duplicates
    self.assertIs(self.im_empty._intervals[0], intvl_2)


def test_duplicate_bounds(self):
    """An interval with repeated bounds is returned once by query."""
    intvl = self.im_empty.add([(1, 2), (1, 2)])
    intvls = list(self.im_empty.query([(1, 2)]))
    self.assertEqual(len(intvls), 1)
    self.assertIs(intvl, intvls[0])


def test_concat_empty(self):
    """Concatenating 0, 1 or 2 empty objects sums their upper bounds."""
    for i in 0, 1, 2:
        obs = IntervalMetadata.concat([self.im_empty] * i)
        exp = IntervalMetadata(self.upper_bound * i)
        self.assertEqual(obs, exp)

    obs = IntervalMetadata.concat([])
    self.assertEqual(obs, IntervalMetadata(0))


def test_concat(self):
    """Concatenation shifts bounds by the preceding upper bounds."""
    im1 = IntervalMetadata(3)
    im2 = IntervalMetadata(4)
    im3 = IntervalMetadata(5)
    im1.add([(0, 2)], [(True, True)])
    im2.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
    im2.add([(2, 4)], metadata={'gene': 'sagB'})
    im3.add([(1, 5)], [(False, True)], {'gene': 'sagC'})
    obs = IntervalMetadata.concat([im1, im2, im3])

    # im2's intervals are offset by 3 and im3's by 3 + 4 = 7
    exp = IntervalMetadata(12)
    exp.add(bounds=[(0, 2)], fuzzy=[(True, True)])
    exp.add(bounds=[(3, 6)], fuzzy=[(True, False)],
            metadata={'gene': 'sagA'})
    exp.add(bounds=[(5, 7)], metadata={'gene': 'sagB'})
    exp.add(bounds=[(8, 12)], fuzzy=[(False, True)],
            metadata={'gene': 'sagC'})
    self.assertEqual(obs, exp)


def test_merge(self):
    """merge() accumulates the intervals of the merged-in object."""
    # empty + empty
    im = IntervalMetadata(self.upper_bound)
    self.im_empty.merge(im)
    self.assertEqual(self.im_empty, im)
    # empty + non-empty
    self.im_empty.merge(self.im_1)
    self.assertEqual(self.im_empty, self.im_1)
    # non-empty + non-empty
    self.im_empty.merge(self.im_2)
    self.im_2.merge(self.im_1)
    self.assertEqual(self.im_empty, self.im_2)


def test_merge_unequal_upper_bounds(self):
    """Merging objects with different upper bounds raises ValueError."""
    n = 3
    im1 = IntervalMetadata(n)
    for im in [self.im_empty, self.im_1]:
        with self.assertRaisesRegex(
                ValueError,
                r'not equal \(%d != %d\)' % (self.upper_bound, n)):
            im.merge(im1)


def test_merge_to_unbounded(self):
    """An unbounded object accepts bounded and unbounded merges."""
    for im in [self.im_empty, self.im_1, IntervalMetadata(None)]:
        obs = IntervalMetadata(None)
        obs.merge(im)
        self.assertIsNone(obs.upper_bound)
        self.assertEqual(obs._intervals, im._intervals)


def test_merge_unbounded_to_bounded(self):
    """Merging an unbounded object into a bounded one raises ValueError."""
    im = IntervalMetadata(None)
    with self.assertRaisesRegex(
            ValueError,
            r'Cannot merge an unbound IntervalMetadata object '
            'to a bounded one'):
        self.im_1.merge(im)
    # original im is not changed
    self.assertIsNone(im.upper_bound)
    self.assertEqual(im._intervals, [])


def test_sort(self):
    """sort() reorders _intervals without changing equality."""
    # constructing the Interval also registers it on self.im_2; the
    # expected _intervals lists below rely on that
    interval = Interval(
        self.im_2,
        [(1, 2), (3, 8)],
        metadata={'gene': 'sagA', 'bound': 0})
    im = deepcopy(self.im_2)
    self.im_2.sort(False)
    # check sorting does not have other side effects
    self.assertEqual(im, self.im_2)
    self.assertEqual(self.im_2._intervals,
                     [self.im_2_2, self.im_2_1, interval])

    self.im_2.sort()
    self.assertEqual(im, self.im_2)
    self.assertEqual(self.im_2._intervals,
                     [interval, self.im_2_1, self.im_2_2])

    self.im_empty.sort()
    self.assertEqual(self.im_empty, IntervalMetadata(self.upper_bound))


def test_add_eq_upper_bound(self):
    """An interval ending exactly at the upper bound is accepted."""
    self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound)],
                      metadata={'gene': 'sagA', 'bound': 0})
    self.assertTrue(self.im_empty._is_stale_tree)
    interval = self.im_empty._intervals[0]
    self.assertEqual(interval.bounds, [(1, 2), (4, self.upper_bound)])
    self.assertEqual(interval.metadata, {'gene': 'sagA', 'bound': 0})
    self.assertIsInstance(self.im_empty._interval_tree, IntervalTree)


def test_add_gt_upper_bound(self):
    """An interval ending past the upper bound raises ValueError."""
    with self.assertRaises(ValueError):
        self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound+1)],
                          metadata={'gene': 'sagA', 'bound': 0})


def test_add_eq_start_end_bound(self):
    """Zero-length bounds (start == end) can be added."""
    for i in 0, 1, self.upper_bound:
        # test that no exception is raised
        self.im_empty.add(bounds=[(i, i)],
                          metadata={'gene': 'sagA', 'bound': 0})


def test_query_attribute(self):
    """_query_attribute handles empty, None and exact metadata."""
    intervals = self.im_2._query_attribute({})
    for i, j in zip(intervals, self.im_2._intervals):
        self.assertEqual(i, j)

    intervals = list(self.im_2._query_attribute(None))
    self.assertEqual(len(intervals), 0)

    for i in self.im_2._intervals:
        intervals = list(self.im_2._query_attribute(i.metadata))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], i)


def test_query_interval(self):
    """_query_interval returns the intervals matching a (start, end)."""
    intervals = list(self.im_2._query_interval((1, 2)))
    self.assertEqual(len(intervals), 1)
    self.assertEqual(intervals[0], self.im_2_1)

    intervals = list(self.im_2._query_interval((3, 4)))
    self.assertEqual(len(intervals), 1)
    self.assertEqual(intervals[0], self.im_2_2)

    # compared through repr() sets, so result order is not checked
    intervals = {repr(i) for i in self.im_2._query_interval((1, 7))}
    self.assertEqual(len(intervals), 2)
    self.assertSetEqual(intervals,
                        {repr(i) for i in self.im_2._intervals})


def test_query_interval_upper_bound(self):
    """Querying the last position before the upper bound hits im_2_1."""
    intervals = list(self.im_2._query_interval(
        (self.upper_bound-1, self.upper_bound)))
    self.assertEqual(intervals, [self.im_2_1])


def test_query(self):
    """query() with both bounds and metadata returns the one match."""
    intervals = list(self.im_2.query(bounds=[(1, 5)],
                                     metadata={'gene': 'sagA'}))
    self.assertEqual(len(intervals), 1)
    self.assertEqual(intervals[0], self.im_2_1)


def test_query_empty(self):
    """query() without arguments yields nothing."""
    intervals = list(self.im_1.query())
    self.assertEqual(len(intervals), 0)


def test_query_no_hits(self):
    """Queries that match nothing yield empty results."""
    intervals = list(self.im_2.query(bounds=[(self.upper_bound, 200)]))
    self.assertEqual(len(intervals), 0)

    intervals = list(self.im_2.query(metadata={'gene': 'sagC'}))
    self.assertEqual(len(intervals), 0)

    intervals = list(self.im_2.query(bounds=[(1, 2)],
                                     metadata={'gene': 'sagC'}))
    self.assertEqual(len(intervals), 0)


def test_query_interval_only(self):
    """query() by bounds alone returns both im_2 intervals in order."""
    for loc in [[(1, 7)],
                [(1, 2), (3, 4)]]:
        intervals = list(self.im_2.query(bounds=loc))
        self.assertEqual(len(intervals), 2)
        self.assertEqual(intervals[0], self.im_2_1)
        self.assertEqual(intervals[1], self.im_2_2)


def test_query_metadata_only(self):
    """query() by metadata alone filters on the given key/value pairs."""
    intervals = list(self.im_2.query(metadata={'gene': 'sagB'}))
    self.assertEqual(len(intervals), 1)
    self.assertEqual(intervals[0], self.im_2_2)

    intervals = list(self.im_2.query(metadata={'bound': 0}))
    self.assertEqual(len(intervals), 2)
    self.assertEqual(intervals[0], self.im_2_1)
    self.assertEqual(intervals[1], self.im_2_2)


def test_drop(self):
    """drop() removes the given interval and marks it as dropped."""
    intvl = self.im_2._intervals[0]
    self.im_2.drop([intvl])
    self.assertEqual(len(self.im_2._intervals), 1)
    self.assertEqual(self.im_2._intervals[0], self.im_2_2)
    # test the intvl was set to dropped
    self.assertTrue(intvl.dropped)


def test_drop_all(self):
    """Dropping every interval leaves an object equal to the empty one."""
    self.im_2.drop(self.im_2._intervals)
    self.assertEqual(self.im_2, self.im_empty)


def test_drop_negate(self):
    """drop(..., negate=True) keeps the given interval, drops the rest."""
    intvl = self.im_2._intervals[0]
    self.im_2.drop([intvl], negate=True)
    self.assertEqual(len(self.im_2._intervals), 1)
    self.assertEqual(self.im_2._intervals[0], intvl)
    # test the dropped intvl was set to dropped
    self.assertTrue(self.im_2_2.dropped)


def test_reverse(self):
    """_reverse() mirrors the bounds of every interval."""
    self.im_2._reverse()
    # The two Interval objects are built only for their side effect of
    # populating self.im_empty with the expected reversed intervals.
    Interval(
        interval_metadata=self.im_empty,
        bounds=[(0, 6), (8, 9)],
        metadata={'gene': 'sagA', 'bound': 0})
    Interval(
        interval_metadata=self.im_empty,
        bounds=[(5, 7)],
        metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})
    self.assertEqual(self.im_2, self.im_empty)


def test_eq_ne(self):
    """Equality ignores insertion order but not metadata values."""
    im1 = IntervalMetadata(10)
    im1.add(metadata={'gene': 'sagA', 'bound': '0'},
            bounds=[(0, 2), (4, 7)])
    im1.add(metadata={'gene': 'sagB', 'bound': '3'},
            bounds=[(3, 5)])

    # The ordering shouldn't matter
    im2 = IntervalMetadata(10)
    im2.add(metadata={'gene': 'sagB', 'bound': '3'},
            bounds=[(3, 5)])
    im2.add(metadata={'gene': 'sagA', 'bound': '0'},
            bounds=[(0, 2), (4, 7)])

    # same bounds as im1, but sagA carries a different 'bound' value
    im3 = IntervalMetadata(10)
    im3.add(metadata={'gene': 'sagA', 'bound': '3'},
            bounds=[(0, 2), (4, 7)])
    im3.add(metadata={'gene': 'sagB', 'bound': '3'},
            bounds=[(3, 5)])

    self.assertReallyEqual(im1, im2)
    self.assertReallyNotEqual(im1, im3)


def test_ne_diff_bounds(self):
    """Objects differing only in upper bound compare unequal."""
    im1 = IntervalMetadata(10)
    im2 = IntervalMetadata(9)
    intvl = {'bounds': [(0, 1)],
             'metadata': {'spam': 'foo'}}
    im1.add(**intvl)
    im2.add(**intvl)
    self.assertReallyNotEqual(im1, im2)


def test_repr(self):
    """repr() shows a header, a dash rule and an abbreviated listing."""
    # Header and rule sit on separate lines, exactly as in the one- and
    # six-feature expectations below; written with an explicit newline
    # so the expectation does not depend on source-file line breaks.
    exp = ("0 interval features\n"
           "-------------------")
    self.assertEqual(repr(self.im_empty), exp)

    self.im_empty.add([(1, 2)], metadata={'gene': 'sagA'})
    exp = ("1 interval feature\n"
           "------------------\n"
           r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(1, 2\)\], "
           r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagA'}\)")
    self.assertRegex(repr(self.im_empty), exp)

    self.im_empty.add([(3, 4)], metadata={'gene': 'sagB'})
    self.im_empty.add([(3, 4)], metadata={'gene': 'sagC'})
    self.im_empty.add([(3, 4)], metadata={'gene': 'sagD'})
    self.im_empty.add([(3, 4)], metadata={'gene': 'sagE'})
    self.im_empty.add([(3, 4)], metadata={'gene': 'sagF'})
    # with six features only the first two and last two are expected,
    # separated by a three-character line (matched by the regex "...")
    exp = ("6 interval features\n"
           "-------------------\n"
           r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(1, 2\)\], "
           r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagA'}\)\n"
           r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
           r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagB'}\)\n"
           r"...\n"
           r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
           r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagE'}\)\n"
           r"Interval\(interval_metadata=<[0-9]+>, bounds=\[\(3, 4\)\], "
           r"fuzzy=\[\(False, False\)\], metadata={'gene': 'sagF'}\)")
    self.assertRegex(repr(self.im_empty), exp)
def setUp(self):
    """Prepare the EMBL expectations and data-file paths for the tests.

    Every record fixture is a 4-tuple: sequence string, metadata dict,
    interval metadata (or None) and one of DNA / RNA / Protein.
    """
    # ---- ID line cases: (input lines, expected dict) ----
    # NOTE(review): runs of blanks inside these literals may have been
    # collapsed in this copy of the file -- compare with the data files.

    # A derived record (non-coding, rRNA and spacer records), i.e. a
    # feature level record:
    # http://www.ebi.ac.uk/ena/browse/feature-level-products
    # TODO: a Uniprot record?
    derived_id = (
        ['ID AB000684.1:1..275:rRNA; SV 1; linear; '
         'genomic DNA; STD; ENV; 275 BP.'],
        {'division': 'ENV',
         'mol_type': 'genomic DNA',
         'shape': 'linear',
         'locus_name': 'AB000684.1:1..275:rRNA',
         'unit': 'bp',
         'size': 275,
         'version': 1,
         'class': 'STD',
         'date': None})
    # A standard record
    standard_id = (
        ['ID M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.'],
        {'division': 'PRO',
         'mol_type': 'mRNA',
         'shape': 'linear',
         'locus_name': 'M14399',
         'unit': 'bp',
         'size': 63,
         'version': 1,
         'class': 'STD',
         'date': None})
    self.id = (derived_id, standard_id)

    # ---- single DNA record, no interval metadata ----
    # M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.
    single_metadata = {
        'LOCUS': {'locus_name': 'M14399',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'mRNA',
                  'shape': 'linear',
                  'size': 63,
                  'unit': 'bp',
                  'version': 1,
                  'date': None}}
    self.single = (
        'gtgaaacaaagcactattgcactggctgtcttaccgttactgtttacccctgtgacaaaagcc',
        single_metadata,
        None,
        DNA)

    # ---- single protein record (uniprot) ----
    protein_metadata = {
        'LOCUS': {'locus_name': '001R_FRG3G',
                  'status': 'Reviewed',
                  'size': 256,
                  'unit': 'aa'}}
    self.protein = (
        'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKG'
        'LIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAY'
        'NLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFK'
        'ALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWK'
        'FTPL',
        protein_metadata,
        None,
        Protein)

    # ---- data files ----
    # single DNA record, uppercase
    self.single_upper_fp = get_data_path('embl_single_record_upper')
    # single RNA record, lowercase
    self.single_lower_fp = get_data_path('embl_single_record_lower')
    # single RNA record
    self.single_rna_fp = get_data_path('embl_single_record')
    # a http://www.ebi.ac.uk/ena/browse/feature-level-products record
    # NOTE(review): this attribute is assigned again near the end of
    # setUp, so the value set here is overwritten.
    self.feature_level_fp = get_data_path("embl_feature_level_record")

    # ---- interval metadata (see skbio.metadata.IntervalMetadata) ----
    imd = IntervalMetadata(63)
    # first interval: the source feature
    source_feature = {
        'db_xref': '"taxon:562"',
        'mol_type': '"mRNA"',
        'organism': '"Escherichia coli"',
        'type': 'source',
        'strand': '+',
        '__location': '1..63'}
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, False)],
            metadata=source_feature)

    cds_feature = {
        'phase': 0,
        'db_xref': [
            '"GOA:P00634"', '"InterPro:IPR001952"',
            '"InterPro:IPR017849"', '"InterPro:IPR017850"',
            '"InterPro:IPR018299"', '"PDB:1AJA"', '"PDB:1AJB"',
            '"PDB:1AJC"', '"PDB:1AJD"', '"PDB:1ALH"', '"PDB:1ALI"',
            '"PDB:1ALJ"', '"PDB:1ALK"', '"PDB:1ANI"', '"PDB:1ANJ"',
            '"PDB:1B8J"', '"PDB:1ED8"', '"PDB:1ED9"', '"PDB:1ELX"',
            '"PDB:1ELY"', '"PDB:1ELZ"', '"PDB:1EW8"', '"PDB:1EW9"',
            '"PDB:1HJK"', '"PDB:1HQA"', '"PDB:1KH4"', '"PDB:1KH5"',
            '"PDB:1KH7"', '"PDB:1KH9"', '"PDB:1KHJ"', '"PDB:1KHK"',
            '"PDB:1KHL"', '"PDB:1KHN"', '"PDB:1URA"', '"PDB:1URB"',
            '"PDB:1Y6V"', '"PDB:1Y7A"', '"PDB:2ANH"', '"PDB:2G9Y"',
            '"PDB:2GA3"', '"PDB:2MLX"', '"PDB:2MLY"', '"PDB:2MLZ"',
            '"PDB:3BDF"', '"PDB:3BDG"', '"PDB:3BDH"', '"PDB:3CMR"',
            '"PDB:3DPC"', '"PDB:3DYC"', '"PDB:3TG0"', '"PDB:4KM4"',
            '"PDB:4YR1"', '"PDB:5C66"', '"PDB:5GAD"', '"PDB:5GAF"',
            '"PDB:5GAG"', '"PDB:5GAH"', '"PDB:5JTL"', '"PDB:5JTM"',
            '"PDB:5JTN"', '"PDB:5JTO"', '"PDB:5JTP"',
            '"UniProtKB/Swiss-Prot:P00634"'],
        '__location': '1..>63',
        'strand': '+',
        'note': '"alkaline phosphatase signal peptide"',
        'protein_id': '"AAA23431.1"',
        'transl_table': '11',
        'translation': '"MKQSTIALAVLPLLFTPVTKA"',
        'type': 'CDS'}
    # the second fuzzy flag is True because the exact end location is
    # not known
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, True)],
            metadata=cds_feature)

    # ---- single RNA record with interval metadata ----
    single_rna_metadata = {
        'LOCUS': {'locus_name': 'M14399',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'mRNA',
                  'shape': 'linear',
                  'size': 63,
                  'unit': 'bp',
                  'version': 1,
                  'date': '02-SEP-1999'},
        # accessions (could be more than one)
        'ACCESSION': 'M14399;',
        # a genbank like version
        'VERSION': 'M14399.1',
        'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                 "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
        'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
        'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
        'KEYWORDS': "alkaline phosphatase; signal peptide.",
        'SOURCE': {"ORGANISM": "Escherichia coli",
                   'taxonomy': "Bacteria; Proteobacteria; "
                   "Gammaproteobacteria; Enterobacterales; "
                   "Enterobacteriaceae; Escherichia."},
        'REFERENCE': [{
            'AUTHORS': 'Gray G.L., Baldridge J.S., '
                       'McKeown K.S., Heyneker H.L., '
                       'Chang C.N.;',
            'JOURNAL': 'Gene 39(2-3):247-254(1985).',
            'REFERENCE': '1 (bases 1 to 63)',
            'TITLE': '"Periplasmic production of correctly '
                     'processed human growth hormone in '
                     'Escherichia coli: natural and bacterial '
                     'signal sequences are '
                     'interchangeable";',
            'PUBMED': '3912261'}],
        'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                            '90319-1. PUBMED; 3912261.']}
    self.single_rna = (
        'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
        single_rna_metadata,
        imd,
        RNA)

    # ---- multi record ----
    # file path
    self.multi_fp = get_data_path('embl_multi_records')

    # the first record reuses the single-record interval metadata object
    imd1 = imd

    # interval metadata for the second record
    imd2 = IntervalMetadata(743)
    # first interval: the source feature
    multi_source_feature = {
        'organism': '"Ruditapes philippinarum"',
        'type': 'source',
        '__location': '1..743',
        'strand': '+',
        'mol_type': '"mRNA"',
        'db_xref': '"taxon:129788"'}
    imd2.add(bounds=[(0, 743)],
             fuzzy=[(False, False)],
             metadata=multi_source_feature)

    multi_cds_feature = {
        'translation': '"MPGGKAGKDSGKAKAKAVSRSARAGLQFPVGRIHRHLKNRT'
                       'TSHG RVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRI'
                       'TPRHLQLAIRGDEELDSLIKAT IAGGGVIPHIHKSLIGKKG'
                       'GQQAK"',
        'type': 'CDS',
        '__location': '58..444',
        'protein_id': '"APY18893.1"',
        'strand': '+',
        'phase': 0,
        'product': '"histone"'}
    imd2.add(bounds=[(57, 444)],
             fuzzy=[(False, False)],
             metadata=multi_cds_feature)

    # expected first record of the multi file
    multi_first = (
        'GTGAAACAAAGCACTATTGCACTGGCTGTCTTACCGTTACTGTTTACCCCTGTGACAAAAGCC',
        {'LOCUS': {'locus_name': 'M14399',
                   'class': 'STD',
                   'division': 'PRO',
                   'mol_type': 'mRNA',
                   'shape': 'linear',
                   'size': 63,
                   'unit': 'bp',
                   'version': 1,
                   'date': '02-SEP-1999'},
         # accessions (could be more than one)
         'ACCESSION': 'M14399;',
         # a genbank like version
         'VERSION': 'M14399.1',
         'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                  "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
         'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
         'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
         'KEYWORDS': "alkaline phosphatase; signal peptide.",
         'SOURCE': {"ORGANISM": "Escherichia coli",
                    'taxonomy': "Bacteria; Proteobacteria; "
                    "Gammaproteobacteria; Enterobacterales; "
                    "Enterobacteriaceae; Escherichia."},
         'REFERENCE': [{
             'AUTHORS': 'Gray G.L., Baldridge J.S., '
                        'McKeown K.S., Heyneker H.L., '
                        'Chang C.N.;',
             'JOURNAL': 'Gene 39(2-3):247-254(1985).',
             'REFERENCE': '1 (bases 1 to 63)',
             'TITLE': '"Periplasmic production of correctly '
                      'processed human growth hormone in '
                      'Escherichia coli: natural and '
                      'bacterial signal sequences are '
                      'interchangeable";',
             'PUBMED': '3912261'}],
         'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                             '90319-1. PUBMED; 3912261.']},
        imd1,
        DNA)

    # expected second record of the multi file
    multi_second = (
        'TGTGCACAGTCTACGCGTCATCTTGAAAGAAAGAACTACACTACTCCAAAAATAATCATGCC'
        'TGGTGGAAAAGCTGGTAAAGATTCCGGAAAGGCCAAGGCTAAGGCAGTGTCAAGGTCCGCAA'
        'GAGCTGGCTTACAGTTTCCAGTCGGACGTATTCACAGGCATTTGAAGAACAGAACCACTAGC'
        'CACGGTCGTGTTGGAGCTACAGCAGCCGTTTACAGTGCAGCAATCCTTGAATACCTGACCGC'
        'CGAAGTGCTTGAGTTGGCTGGAAACGCAAGTAAAGATCTCAAAGTAAAGAGAATCACCCCAC'
        'GTCACTTGCAGTTGGCAATCAGAGGAGATGAAGAGTTGGATTCCCTAATTAAAGCCACAATC'
        'GCTGGTGGTGGTGTTATTCCACATATCCACAAGTCACTTATTGGCAAGAAGGGAGGTCAGCA'
        'AGCCAAATAAATTGGACATACTCATTCATCAGGGAACAATGTGTAGTGAATGTGTTAAAAAG'
        'AACAATCTCATTGTGTAGCTCTTTAGTTTTATATGAATGTGTTAACATGGTCATTCACATCG'
        'TATGACTCATAGAATCATCTGTGTATCATTTCATCCTCTCATTTTATAGCTCCTCATTTTCC'
        'TTAGACTCATTAAAATTTTTATCTCGGAAAAATGTTTTTTCTACAATTTTAGCATTCATTTA'
        'TCTTCATCTTGCTTTTATGTTTAATAAAACGAACTTATAATACCAAAAAAAAAAAAAAAAA',
        {'ACCESSION': 'KX454487;',
         'VERSION': 'KX454487.1',
         'COMMENT': '##Assembly-Data-START##\nSequencing Technology '
                    ':: Sanger dideoxy sequencing\n##Assembly-Data-END##',
         'DATE': ['02-FEB-2017 (Rel. 131, Created)',
                  '02-FEB-2017 (Rel. 131, Last updated, Version 1)'],
         'DBSOURCE': 'MD5; cbc730cf7a8d694b50fb7dd6b993ae0d.',
         'DEFINITION': 'Ruditapes philippinarum histone mRNA, '
                       'complete cds.',
         'KEYWORDS': '.',
         'LOCUS': {'locus_name': 'KX454487',
                   'class': 'STD',
                   'division': 'INV',
                   'mol_type': 'mRNA',
                   'shape': 'linear',
                   'size': 743,
                   'unit': 'bp',
                   'version': 1,
                   'date': '02-FEB-2017'},
         'REFERENCE': [{
             'AUTHORS': 'Yang D., Zhao J., Wang Q.;',
             'JOURNAL': 'Submitted (27-JUN-2016) to the INSDC. Key '
                        'Laboratory of Coastal Zone Environment Processes '
                        'and Ecological Remediation, Yantai Institute '
                        'of Coastal Zone Research (YIC), Chinese Academy '
                        'of Sciences (CAS), 17 Chunhui Road, Laishan '
                        'District, Yantai, Shandong 264003, China',
             'REFERENCE': '1 (bases 1 to 743)',
             'TITLE': ';'}],
         'CROSS_REFERENCE': [None],
         'SOURCE': {
             'ORGANISM': 'Ruditapes philippinarum',
             'taxonomy': 'Eukaryota; Metazoa; Lophotrochozoa; Mollusca; '
                         'Bivalvia; Heteroconchia; Euheterodonta; '
                         'Veneroida; Veneroidea; Veneridae; Ruditapes.'}},
        imd2,
        DNA)

    # multi object
    self.multi = (multi_first, multi_second)

    # ---- feature level product ----
    feature_level_metadata = {
        'DATE': ['02-JUN-2014 (Rel. 121, Created)',
                 '04-FEB-2016 (Rel. 127, Last updated, Version 5)'],
        'DBSOURCE': 'SILVA-LSU; LK021130. SILVA-SSU; LK021130. MD5; '
                    'afd116bf2c1a13acbf40d63d82f0218c. BioSample; '
                    'SAMEA3865288.',
        'DEFINITION': 'Vibrio anguillarum 16S rRNA',
        'KEYWORDS': '.',
        'LOCUS': {'locus_name': 'LK021130.1:74067..75610:rRNA',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'genomic DNA',
                  'shape': 'linear',
                  'size': 1544,
                  'unit': 'bp',
                  'version': 1,
                  'date': '04-FEB-2016'},
        'PARENT_ACCESSION': 'LK021130.1',
        'VERSION': 'LK021130.1',
        'PROJECT_IDENTIFIER': 'Project:PRJEB5701;',
        'REFERENCE': [
            {'AUTHORS': 'Holm K.;',
             'JOURNAL': 'Submitted (26-MAR-2014) to the INSDC. '
                        'Norstruct, Dept of Chemistry, University of '
                        'Tromso, Science Park 3, NO-9037 Tromso, NORWAY.',
             'TITLE': ';',
             'REFERENCE': '1'},
            {'AUTHORS': 'Holm K.O., Nilsson K., Hjerde E., Willassen '
                        'N.P., Milton D.L.;',
             'JOURNAL': 'Stand Genomic Sci. 10:60-60(2015).',
             'TITLE': '"Complete genome sequence of Vibrio anguillarum '
                      'strain NB10, a virulent isolate from the Gulf '
                      'of Bothnia";',
             'REFERENCE': '2',
             'PUBMED': '26380645'}],
        'CROSS_REFERENCE': [
            None,
            'DOI; 10.1186/s40793-015-0060-7. PUBMED; 26380645.'],
        'SOURCE': {
            'ORGANISM': 'Vibrio anguillarum',
            'taxonomy': 'Bacteria; Proteobacteria; Gammaproteobacteria; '
                        'Vibrionales; Vibrionaceae; Vibrio.'}}
    self.feature_level = (
        'AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUC'
        'GAGCGGCAGCACAGAGGAACUUGUUCCUUGGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCU'
        'AGGAAAUUGCCCUGAUGUGGGGGAUAACCAUUGGAAACGAUGGCUAAUACCGCAUGAUGCCU'
        'ACGGGCCAAAGAGGGGGACCUUCUGGCCUCUCGCGUCAGGAUAUGCCUAGGUGGGAUUAGCU'
        'AGUUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGAUCAGC'
        'CACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACA'
        'AUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUA'
        'CUUUCAGUCGUGAGGAAGGUGGUGUUGUUAAUAGCAGCAUCAUUUGACGUUAGCGACAGAAG'
        'AAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCGAGCGUUAAUCGGA'
        'AUUACUGGGCGUAAAGCGCAUGCAGGUGGUGGAUUAAGUCAGAUGUGAAAGCCCGGGGCUCA'
        'ACCUCGGAACCGCAUUUGAAACUGGUUCACUAGAGUACUGUAGAGGGGGGUAGAAUUUCAGG'
        'UGUAGCGGUGAAAUGCGUAGAGAUCUGAAGGAAUACCGGUGGCGAAGGCGGCCCCCUGGACA'
        'GAUACUGACACUCAGAUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCA'
        'CGCCGUAAACGAUGUCUACUUGGAGGUUGUGGCCUUGAGCCGUGGCUUUCGGAGCUAACGCG'
        'UUAAGUAGACCGCCUGGGGAGUACGGUCGCAAGAUUAAAACUCAAAUGAAUUGACGGGGGCC'
        'CGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGAACCUUACCUACUCUUG'
        'ACAUCCAGAGAAGCCAGCGGAGACGCAGGUGUGCCUUCGGGAGCUCUGAGACAGGUGCUGCA'
        'UGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUA'
        'UCCUUGUUUGCCAGCGAGUCAUGUCGGGAACUCCAGGGAGACUGCCGGUGAUAAACCGGAGG'
        'AAGGUGGGGACGACGUCAAGUCAUCAUGGCCCUUACGAGUAGGGCUACACACGUGCUACAAU'
        'GGCGCAUACAGAGGGCAGCAAGCUAGCGAUAGUGAGCGAAUCCCAAAAAGUGCGUCGUAGUC'
        'CGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUAGAUCAGAAU'
        'GCUACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGCUG'
        'CAAAAGAAGUGGGUAGUUUAACCUUUCGGGGAGGACGCUCACCACUUUGUGGUUCAUGACUG'
        'GGGUGAAGUCGUAACAAGGUAGCGCUAGGGGAACCUGGCGCUGGAUCACCUCCUUA',
        feature_level_metadata,
        None,
        RNA)

    # the feature level file without FT (replaces the path set earlier)
    self.feature_level_fp = get_data_path(
        "embl_feature_level_record_no_FT")

    # a genbank file, used for file conversion
    self.genbank_fp = get_data_path('genbank_single_record')

    # an embl constructed sequence file path
    self.embl_constructed_fp = get_data_path("embl_constructed")

    # a simple embl version to perform embl->gb->embl conversion
    self.single_rna_simple_fp = get_data_path("embl_single_record_simple")
def setUp(self):
    """Prepare the EMBL expectations and data-file paths for the tests.

    Every record fixture is a 4-tuple: sequence string, metadata dict,
    interval metadata (or None) and one of DNA / RNA / Protein.
    """
    # ---- ID line cases: (input lines, expected dict) ----
    # NOTE(review): runs of blanks inside these literals may have been
    # collapsed in this copy of the file -- compare with the data files.

    # A derived record (non-coding, rRNA and spacer records), i.e. a
    # feature level record:
    # http://www.ebi.ac.uk/ena/browse/feature-level-products
    # TODO: a Uniprot record?
    derived_id = (
        ['ID AB000684.1:1..275:rRNA; SV 1; linear; '
         'genomic DNA; STD; ENV; 275 BP.'],
        {'division': 'ENV',
         'mol_type': 'genomic DNA',
         'shape': 'linear',
         'locus_name': 'AB000684.1:1..275:rRNA',
         'unit': 'bp',
         'size': 275,
         'version': 1,
         'class': 'STD',
         'date': None})
    # A standard record
    standard_id = (
        ['ID M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.'],
        {'division': 'PRO',
         'mol_type': 'mRNA',
         'shape': 'linear',
         'locus_name': 'M14399',
         'unit': 'bp',
         'size': 63,
         'version': 1,
         'class': 'STD',
         'date': None})
    self.id = (derived_id, standard_id)

    # ---- single DNA record, no interval metadata ----
    # M14399; SV 1; linear; mRNA; STD; PRO; 63 BP.
    single_metadata = {
        'LOCUS': {'locus_name': 'M14399',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'mRNA',
                  'shape': 'linear',
                  'size': 63,
                  'unit': 'bp',
                  'version': 1,
                  'date': None}}
    self.single = (
        'gtgaaacaaagcactattgcactggctgtcttaccgttactgtttacccctgtgacaaaagcc',
        single_metadata,
        None,
        DNA)

    # ---- single protein record (uniprot) ----
    protein_metadata = {
        'LOCUS': {'locus_name': '001R_FRG3G',
                  'status': 'Reviewed',
                  'size': 256,
                  'unit': 'aa'}}
    self.protein = (
        'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKG'
        'LIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAY'
        'NLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFK'
        'ALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWK'
        'FTPL',
        protein_metadata,
        None,
        Protein)

    # ---- data files ----
    # single DNA record, uppercase
    self.single_upper_fp = get_data_path('embl_single_record_upper')
    # single RNA record, lowercase
    self.single_lower_fp = get_data_path('embl_single_record_lower')
    # single RNA record
    self.single_rna_fp = get_data_path('embl_single_record')
    # a http://www.ebi.ac.uk/ena/browse/feature-level-products record
    # NOTE(review): this attribute is assigned again near the end of
    # setUp, so the value set here is overwritten.
    self.feature_level_fp = get_data_path("embl_feature_level_record")

    # ---- interval metadata (see skbio.metadata.IntervalMetadata) ----
    imd = IntervalMetadata(63)
    # first interval: the source feature
    source_feature = {
        'db_xref': '"taxon:562"',
        'mol_type': '"mRNA"',
        'organism': '"Escherichia coli"',
        'type': 'source',
        'strand': '+',
        '__location': '1..63'}
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, False)],
            metadata=source_feature)

    cds_feature = {
        'phase': 0,
        'db_xref': [
            '"GOA:P00634"', '"InterPro:IPR001952"',
            '"InterPro:IPR017849"', '"InterPro:IPR017850"',
            '"InterPro:IPR018299"', '"PDB:1AJA"', '"PDB:1AJB"',
            '"PDB:1AJC"', '"PDB:1AJD"', '"PDB:1ALH"', '"PDB:1ALI"',
            '"PDB:1ALJ"', '"PDB:1ALK"', '"PDB:1ANI"', '"PDB:1ANJ"',
            '"PDB:1B8J"', '"PDB:1ED8"', '"PDB:1ED9"', '"PDB:1ELX"',
            '"PDB:1ELY"', '"PDB:1ELZ"', '"PDB:1EW8"', '"PDB:1EW9"',
            '"PDB:1HJK"', '"PDB:1HQA"', '"PDB:1KH4"', '"PDB:1KH5"',
            '"PDB:1KH7"', '"PDB:1KH9"', '"PDB:1KHJ"', '"PDB:1KHK"',
            '"PDB:1KHL"', '"PDB:1KHN"', '"PDB:1URA"', '"PDB:1URB"',
            '"PDB:1Y6V"', '"PDB:1Y7A"', '"PDB:2ANH"', '"PDB:2G9Y"',
            '"PDB:2GA3"', '"PDB:2MLX"', '"PDB:2MLY"', '"PDB:2MLZ"',
            '"PDB:3BDF"', '"PDB:3BDG"', '"PDB:3BDH"', '"PDB:3CMR"',
            '"PDB:3DPC"', '"PDB:3DYC"', '"PDB:3TG0"', '"PDB:4KM4"',
            '"PDB:4YR1"', '"PDB:5C66"', '"PDB:5GAD"', '"PDB:5GAF"',
            '"PDB:5GAG"', '"PDB:5GAH"', '"PDB:5JTL"', '"PDB:5JTM"',
            '"PDB:5JTN"', '"PDB:5JTO"', '"PDB:5JTP"',
            '"UniProtKB/Swiss-Prot:P00634"'],
        '__location': '1..>63',
        'strand': '+',
        'note': '"alkaline phosphatase signal peptide"',
        'protein_id': '"AAA23431.1"',
        'transl_table': '11',
        'translation': '"MKQSTIALAVLPLLFTPVTKA"',
        'type': 'CDS'}
    # the second fuzzy flag is True because the exact end location is
    # not known
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, True)],
            metadata=cds_feature)

    # ---- single RNA record with interval metadata ----
    single_rna_metadata = {
        'LOCUS': {'locus_name': 'M14399',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'mRNA',
                  'shape': 'linear',
                  'size': 63,
                  'unit': 'bp',
                  'version': 1,
                  'date': '02-SEP-1999'},
        # accessions (could be more than one)
        'ACCESSION': 'M14399;',
        # a genbank like version
        'VERSION': 'M14399.1',
        'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                 "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
        'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
        'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
        'KEYWORDS': "alkaline phosphatase; signal peptide.",
        'SOURCE': {"ORGANISM": "Escherichia coli",
                   'taxonomy': "Bacteria; Proteobacteria; "
                   "Gammaproteobacteria; Enterobacterales; "
                   "Enterobacteriaceae; Escherichia."},
        'REFERENCE': [{
            'AUTHORS': 'Gray G.L., Baldridge J.S., '
                       'McKeown K.S., Heyneker H.L., '
                       'Chang C.N.;',
            'JOURNAL': 'Gene 39(2-3):247-254(1985).',
            'REFERENCE': '1 (bases 1 to 63)',
            'TITLE': '"Periplasmic production of correctly '
                     'processed human growth hormone in '
                     'Escherichia coli: natural and bacterial '
                     'signal sequences are '
                     'interchangeable";',
            'PUBMED': '3912261'}],
        'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                            '90319-1. PUBMED; 3912261.']}
    self.single_rna = (
        'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
        single_rna_metadata,
        imd,
        RNA)

    # ---- multi record ----
    # file path
    self.multi_fp = get_data_path('embl_multi_records')

    # the first record reuses the single-record interval metadata object
    imd1 = imd

    # interval metadata for the second record
    imd2 = IntervalMetadata(743)
    # first interval: the source feature
    multi_source_feature = {
        'organism': '"Ruditapes philippinarum"',
        'type': 'source',
        '__location': '1..743',
        'strand': '+',
        'mol_type': '"mRNA"',
        'db_xref': '"taxon:129788"'}
    imd2.add(bounds=[(0, 743)],
             fuzzy=[(False, False)],
             metadata=multi_source_feature)

    multi_cds_feature = {
        'translation': '"MPGGKAGKDSGKAKAKAVSRSARAGLQFPVGRIHRHLKNRT'
                       'TSHG RVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRI'
                       'TPRHLQLAIRGDEELDSLIKAT IAGGGVIPHIHKSLIGKKG'
                       'GQQAK"',
        'type': 'CDS',
        '__location': '58..444',
        'protein_id': '"APY18893.1"',
        'strand': '+',
        'phase': 0,
        'product': '"histone"'}
    imd2.add(bounds=[(57, 444)],
             fuzzy=[(False, False)],
             metadata=multi_cds_feature)

    # expected first record of the multi file
    multi_first = (
        'GTGAAACAAAGCACTATTGCACTGGCTGTCTTACCGTTACTGTTTACCCCTGTGACAAAAGCC',
        {'LOCUS': {'locus_name': 'M14399',
                   'class': 'STD',
                   'division': 'PRO',
                   'mol_type': 'mRNA',
                   'shape': 'linear',
                   'size': 63,
                   'unit': 'bp',
                   'version': 1,
                   'date': '02-SEP-1999'},
         # accessions (could be more than one)
         'ACCESSION': 'M14399;',
         # a genbank like version
         'VERSION': 'M14399.1',
         'DATE': ["16-JUL-1988 (Rel. 16, Created)",
                  "02-SEP-1999 (Rel. 60, Last updated, Version 3)"],
         'DBSOURCE': 'MD5; c9b40131b8622946b5aafdf5473b3d43.',
         'DEFINITION': "E.coli alkaline phosphatase signal mRNA, 5' end.",
         'KEYWORDS': "alkaline phosphatase; signal peptide.",
         'SOURCE': {"ORGANISM": "Escherichia coli",
                    'taxonomy': "Bacteria; Proteobacteria; "
                    "Gammaproteobacteria; Enterobacterales; "
                    "Enterobacteriaceae; Escherichia."},
         'REFERENCE': [{
             'AUTHORS': 'Gray G.L., Baldridge J.S., '
                        'McKeown K.S., Heyneker H.L., '
                        'Chang C.N.;',
             'JOURNAL': 'Gene 39(2-3):247-254(1985).',
             'REFERENCE': '1 (bases 1 to 63)',
             'TITLE': '"Periplasmic production of correctly '
                      'processed human growth hormone in '
                      'Escherichia coli: natural and '
                      'bacterial signal sequences are '
                      'interchangeable";',
             'PUBMED': '3912261'}],
         'CROSS_REFERENCE': ['DOI; 10.1016/0378-1119(85)'
                             '90319-1. PUBMED; 3912261.']},
        imd1,
        DNA)

    # expected second record of the multi file
    multi_second = (
        'TGTGCACAGTCTACGCGTCATCTTGAAAGAAAGAACTACACTACTCCAAAAATAATCATGCC'
        'TGGTGGAAAAGCTGGTAAAGATTCCGGAAAGGCCAAGGCTAAGGCAGTGTCAAGGTCCGCAA'
        'GAGCTGGCTTACAGTTTCCAGTCGGACGTATTCACAGGCATTTGAAGAACAGAACCACTAGC'
        'CACGGTCGTGTTGGAGCTACAGCAGCCGTTTACAGTGCAGCAATCCTTGAATACCTGACCGC'
        'CGAAGTGCTTGAGTTGGCTGGAAACGCAAGTAAAGATCTCAAAGTAAAGAGAATCACCCCAC'
        'GTCACTTGCAGTTGGCAATCAGAGGAGATGAAGAGTTGGATTCCCTAATTAAAGCCACAATC'
        'GCTGGTGGTGGTGTTATTCCACATATCCACAAGTCACTTATTGGCAAGAAGGGAGGTCAGCA'
        'AGCCAAATAAATTGGACATACTCATTCATCAGGGAACAATGTGTAGTGAATGTGTTAAAAAG'
        'AACAATCTCATTGTGTAGCTCTTTAGTTTTATATGAATGTGTTAACATGGTCATTCACATCG'
        'TATGACTCATAGAATCATCTGTGTATCATTTCATCCTCTCATTTTATAGCTCCTCATTTTCC'
        'TTAGACTCATTAAAATTTTTATCTCGGAAAAATGTTTTTTCTACAATTTTAGCATTCATTTA'
        'TCTTCATCTTGCTTTTATGTTTAATAAAACGAACTTATAATACCAAAAAAAAAAAAAAAAA',
        {'ACCESSION': 'KX454487;',
         'VERSION': 'KX454487.1',
         'COMMENT': '##Assembly-Data-START##\nSequencing Technology '
                    ':: Sanger dideoxy sequencing\n##Assembly-Data-END##',
         'DATE': ['02-FEB-2017 (Rel. 131, Created)',
                  '02-FEB-2017 (Rel. 131, Last updated, Version 1)'],
         'DBSOURCE': 'MD5; cbc730cf7a8d694b50fb7dd6b993ae0d.',
         'DEFINITION': 'Ruditapes philippinarum histone mRNA, '
                       'complete cds.',
         'KEYWORDS': '.',
         'LOCUS': {'locus_name': 'KX454487',
                   'class': 'STD',
                   'division': 'INV',
                   'mol_type': 'mRNA',
                   'shape': 'linear',
                   'size': 743,
                   'unit': 'bp',
                   'version': 1,
                   'date': '02-FEB-2017'},
         'REFERENCE': [{
             'AUTHORS': 'Yang D., Zhao J., Wang Q.;',
             'JOURNAL': 'Submitted (27-JUN-2016) to the INSDC. Key '
                        'Laboratory of Coastal Zone Environment Processes '
                        'and Ecological Remediation, Yantai Institute '
                        'of Coastal Zone Research (YIC), Chinese Academy '
                        'of Sciences (CAS), 17 Chunhui Road, Laishan '
                        'District, Yantai, Shandong 264003, China',
             'REFERENCE': '1 (bases 1 to 743)',
             'TITLE': ';'}],
         'CROSS_REFERENCE': [None],
         'SOURCE': {
             'ORGANISM': 'Ruditapes philippinarum',
             'taxonomy': 'Eukaryota; Metazoa; Lophotrochozoa; Mollusca; '
                         'Bivalvia; Heteroconchia; Euheterodonta; '
                         'Veneroida; Veneroidea; Veneridae; Ruditapes.'}},
        imd2,
        DNA)

    # multi object
    self.multi = (multi_first, multi_second)

    # ---- feature level product ----
    feature_level_metadata = {
        'DATE': ['02-JUN-2014 (Rel. 121, Created)',
                 '04-FEB-2016 (Rel. 127, Last updated, Version 5)'],
        'DBSOURCE': 'SILVA-LSU; LK021130. SILVA-SSU; LK021130. MD5; '
                    'afd116bf2c1a13acbf40d63d82f0218c. BioSample; '
                    'SAMEA3865288.',
        'DEFINITION': 'Vibrio anguillarum 16S rRNA',
        'KEYWORDS': '.',
        'LOCUS': {'locus_name': 'LK021130.1:74067..75610:rRNA',
                  'class': 'STD',
                  'division': 'PRO',
                  'mol_type': 'genomic DNA',
                  'shape': 'linear',
                  'size': 1544,
                  'unit': 'bp',
                  'version': 1,
                  'date': '04-FEB-2016'},
        'PARENT_ACCESSION': 'LK021130.1',
        'VERSION': 'LK021130.1',
        'PROJECT_IDENTIFIER': 'Project:PRJEB5701;',
        'REFERENCE': [
            {'AUTHORS': 'Holm K.;',
             'JOURNAL': 'Submitted (26-MAR-2014) to the INSDC. '
                        'Norstruct, Dept of Chemistry, University of '
                        'Tromso, Science Park 3, NO-9037 Tromso, NORWAY.',
             'TITLE': ';',
             'REFERENCE': '1'},
            {'AUTHORS': 'Holm K.O., Nilsson K., Hjerde E., Willassen '
                        'N.P., Milton D.L.;',
             'JOURNAL': 'Stand Genomic Sci. 10:60-60(2015).',
             'TITLE': '"Complete genome sequence of Vibrio anguillarum '
                      'strain NB10, a virulent isolate from the Gulf '
                      'of Bothnia";',
             'REFERENCE': '2',
             'PUBMED': '26380645'}],
        'CROSS_REFERENCE': [
            None,
            'DOI; 10.1186/s40793-015-0060-7. PUBMED; 26380645.'],
        'SOURCE': {
            'ORGANISM': 'Vibrio anguillarum',
            'taxonomy': 'Bacteria; Proteobacteria; Gammaproteobacteria; '
                        'Vibrionales; Vibrionaceae; Vibrio.'}}
    self.feature_level = (
        'AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUC'
        'GAGCGGCAGCACAGAGGAACUUGUUCCUUGGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCU'
        'AGGAAAUUGCCCUGAUGUGGGGGAUAACCAUUGGAAACGAUGGCUAAUACCGCAUGAUGCCU'
        'ACGGGCCAAAGAGGGGGACCUUCUGGCCUCUCGCGUCAGGAUAUGCCUAGGUGGGAUUAGCU'
        'AGUUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGAUCAGC'
        'CACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACA'
        'AUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUA'
        'CUUUCAGUCGUGAGGAAGGUGGUGUUGUUAAUAGCAGCAUCAUUUGACGUUAGCGACAGAAG'
        'AAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCGAGCGUUAAUCGGA'
        'AUUACUGGGCGUAAAGCGCAUGCAGGUGGUGGAUUAAGUCAGAUGUGAAAGCCCGGGGCUCA'
        'ACCUCGGAACCGCAUUUGAAACUGGUUCACUAGAGUACUGUAGAGGGGGGUAGAAUUUCAGG'
        'UGUAGCGGUGAAAUGCGUAGAGAUCUGAAGGAAUACCGGUGGCGAAGGCGGCCCCCUGGACA'
        'GAUACUGACACUCAGAUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCA'
        'CGCCGUAAACGAUGUCUACUUGGAGGUUGUGGCCUUGAGCCGUGGCUUUCGGAGCUAACGCG'
        'UUAAGUAGACCGCCUGGGGAGUACGGUCGCAAGAUUAAAACUCAAAUGAAUUGACGGGGGCC'
        'CGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGAACCUUACCUACUCUUG'
        'ACAUCCAGAGAAGCCAGCGGAGACGCAGGUGUGCCUUCGGGAGCUCUGAGACAGGUGCUGCA'
        'UGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUA'
        'UCCUUGUUUGCCAGCGAGUCAUGUCGGGAACUCCAGGGAGACUGCCGGUGAUAAACCGGAGG'
        'AAGGUGGGGACGACGUCAAGUCAUCAUGGCCCUUACGAGUAGGGCUACACACGUGCUACAAU'
        'GGCGCAUACAGAGGGCAGCAAGCUAGCGAUAGUGAGCGAAUCCCAAAAAGUGCGUCGUAGUC'
        'CGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUAGAUCAGAAU'
        'GCUACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGCUG'
        'CAAAAGAAGUGGGUAGUUUAACCUUUCGGGGAGGACGCUCACCACUUUGUGGUUCAUGACUG'
        'GGGUGAAGUCGUAACAAGGUAGCGCUAGGGGAACCUGGCGCUGGAUCACCUCCUUA',
        feature_level_metadata,
        None,
        RNA)

    # the feature level file without FT (replaces the path set earlier)
    self.feature_level_fp = get_data_path(
        "embl_feature_level_record_no_FT")

    # a genbank file, used for file conversion
    self.genbank_fp = get_data_path('genbank_single_record')

    # an embl constructed sequence file path
    self.embl_constructed_fp = get_data_path("embl_constructed")

    # a simple embl version to perform embl->gb->embl conversion
    self.single_rna_simple_fp = get_data_path(
        "embl_single_record_simple")
def setUp(self):
    # Fixtures for the GenBank tests: LOCUS-line cases, data-file paths, and
    # for every record a tuple of
    # (sequence string, metadata dict, interval metadata or None, class).

    # test locus line: raw LOCUS line(s) paired with the expected fields
    circular_dna_locus = (
        ['LOCUS NC_005816 9609 bp '
         'DNA circular CON 07-FEB-2015'],
        {'division': 'CON',
         'mol_type': 'DNA',
         'shape': 'circular',
         'locus_name': 'NC_005816',
         'date': '07-FEB-2015',
         'unit': 'bp',
         'size': 9609})
    shapeless_dna_locus = (
        ['LOCUS SCU49845 5028 bp '
         'DNA PLN 21-JUN-1999'],
        {'division': 'PLN',
         'mol_type': 'DNA',
         'shape': None,
         'locus_name': 'SCU49845',
         'date': '21-JUN-1999',
         'unit': 'bp',
         'size': 5028})
    protein_locus = (
        ['LOCUS NP_001832 360 aa '
         'linear PRI 18-DEC-2001'],
        {'division': 'PRI',
         'mol_type': None,
         'shape': 'linear',
         'locus_name': 'NP_001832',
         'date': '18-DEC-2001',
         'unit': 'aa',
         'size': 360})
    self.locus = (circular_dna_locus, shapeless_dna_locus, protein_locus)

    # test single record and read uppercase sequence
    self.single_upper_fp = get_data_path('genbank_single_record_upper')
    self.single_lower_fp = get_data_path('genbank_single_record_lower')
    single_metadata = {
        'LOCUS': {'date': '23-SEP-1994',
                  'division': 'BCT',
                  'locus_name': 'AAB29917',
                  'mol_type': None,
                  'shape': 'linear',
                  'size': 9,
                  'unit': 'aa'}}
    self.single = ('GSREILDFK', single_metadata, None, Protein)

    # single RNA record carrying a source feature and a CDS feature
    self.single_rna_fp = get_data_path('genbank_single_record')
    imd = IntervalMetadata(63)
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, False)],
            metadata={'db_xref': '"taxon:562"',
                      'mol_type': '"mRNA"',
                      'organism': '"Escherichia coli"',
                      'type': 'source',
                      'strand': '+',
                      '__location': '1..63'})
    imd.add(bounds=[(0, 63)],
            fuzzy=[(False, True)],
            metadata={'phase': 0,
                      'db_xref': ['"taxon:562"', '"taxon:561"'],
                      '__location': '1..>63',
                      'strand': '+',
                      'note': '"alkaline phosphatase signal peptide"',
                      'protein_id': '"AAA23431.1"',
                      'transl_table': '11',
                      'translation': '"MKQSTIALAVLPLLFTPVTKA"',
                      'type': 'CDS'})
    single_rna_metadata = {
        'ACCESSION': 'M14399',
        'COMMENT': 'Original source text: E.coli, cDNA to mRNA.',
        'DEFINITION': "alkaline phosphatase signal mRNA, 5' end.",
        'KEYWORDS': 'alkaline phosphatase; signal peptide.',
        'LOCUS': {'date': '26-APR-1993',
                  'division': 'BCT',
                  'locus_name': 'ECOALKP',
                  'mol_type': 'mRNA',
                  'shape': 'linear',
                  'size': 63,
                  'unit': 'bp'},
        'SOURCE': {'ORGANISM': 'Escherichia coli',
                   'taxonomy': 'Bacteria; Proteobacteria; '
                               'Gammaproteobacteria; Enterobacteriales; '
                               'Enterobacteriaceae; Escherichia.'},
        'VERSION': 'M14399.1'}
    self.single_rna = (
        'gugaaacaaagcacuauugcacuggcugucuuaccguuacuguuuaccccugugacaaaagcc',
        single_rna_metadata,
        imd,
        RNA)

    # test:
    # 1. multiple records in one file
    # 2. lowercase sequence
    # 3. DNA, RNA, Protein type
    # 4. variation of formats
    self.multi_fp = get_data_path('genbank_multi_records')

    imd_pro = IntervalMetadata(9)
    imd_pro.add(bounds=[(0, 9)],
                fuzzy=[(False, False)],
                metadata={'organism': '"Bacteria"',
                          'type': 'source',
                          'strand': '+',
                          '__location': '1..9'})
    imd_pro.add(bounds=[(0, 9)],
                fuzzy=[(False, True)],
                metadata={'__location': '1..>9',
                          'product': '"L-carnitine amidase"',
                          'strand': '+',
                          'type': 'Protein'})

    imd_dna = IntervalMetadata(9)
    imd_dna.add(bounds=[(0, 9)],
                fuzzy=[(False, False)],
                metadata={'country': '"Brazil: Parana, Paranavai"',
                          'type': 'source',
                          'strand': '+',
                          '__location': '1..9',
                          'environmental_sample': ''})
    imd_dna.add(bounds=[(1, 8)],
                fuzzy=[(True, True)],
                metadata={'__location': 'complement(<2..>8)',
                          'product': '"16S ribosomal RNA"',
                          'strand': '-',
                          'type': 'rRNA'})

    # The two expected references differ only in the REMARK entry.
    reference_with_remark = {
        'AUTHORS': 'Joeres,U. and Kula,M.R.',
        'JOURNAL': 'AMB 40 (5), 606-610 (1994)',
        'PUBMED': '7764422',
        'REFERENCE': '1 (residues 1 to 9)',
        'REMARK': 'from the original journal article.',
        'TITLE': 'a microbial L-carnitine amidase'}
    reference_without_remark = {
        'AUTHORS': 'Joeres,U. and Kula,M.R.',
        'JOURNAL': 'AMB 40 (5), 606-610 (1994)',
        'PUBMED': '7764422',
        'REFERENCE': '1 (residues 1 to 9)',
        'TITLE': 'a microbial L-carnitine amidase'}

    protein_metadata = {
        'ACCESSION': 'AAB29917',
        'COMMENT': 'Method: direct peptide sequencing.',
        'DBSOURCE': 'accession AAB29917.1',
        'DEFINITION': 'L-carnitine amidase {N-terminal}',
        'KEYWORDS': '.',
        'LOCUS': {'date': '23-SEP-1994',
                  'division': 'BCT',
                  'locus_name': 'AAB29917',
                  'mol_type': None,
                  'shape': 'linear',
                  'size': 9,
                  'unit': 'aa'},
        'REFERENCE': [reference_with_remark, reference_without_remark],
        'SOURCE': {'ORGANISM': 'Bacteria',
                   'taxonomy': 'Unclassified.'},
        'VERSION': 'AAB29917.1 GI:545426'}
    dna_metadata = {
        'ACCESSION': 'HQ018078',
        'DEFINITION': 'Uncultured Xylanimonas sp.16S, partial',
        'KEYWORDS': 'ENV.',
        'LOCUS': {'date': '29-AUG-2010',
                  'division': 'ENV',
                  'locus_name': 'HQ018078',
                  'mol_type': 'DNA',
                  'shape': 'linear',
                  'size': 9,
                  'unit': 'bp'},
        'SOURCE': {'ORGANISM': 'uncultured Xylanimonas sp.',
                   'taxonomy': 'Bacteria; Actinobacteria; '
                               'Micrococcales; Promicromonosporaceae; '
                               'Xylanimonas; environmental samples.'},
        'VERSION': 'HQ018078.1 GI:304421728'}

    self.multi = (
        ('gsreildfk', protein_metadata, imd_pro, Protein),
        ('catgcaggc', dna_metadata, imd_dna, DNA))
class IntervalMetadataMixinTests:
    """Shared tests for objects that carry an ``interval_metadata`` attribute.

    The tests call ``self._interval_metadata_constructor_(length,
    interval_metadata=None)``, the ``unittest`` assertion methods, and
    ``assertReallyEqual`` / ``assertReallyNotEqual``; none of these are
    defined here, so the concrete test case has to supply them. The fixtures
    read by the tests are created in ``_set_up`` (presumably invoked from
    the concrete test case's ``setUp``).
    """

    def _set_up(self):
        self.upper_bound = 9
        # Empty interval metadata whose length matches ``upper_bound``.
        self.im = IntervalMetadata(self.upper_bound)
        # Keyword arguments for ``IntervalMetadata.add``. The second entry
        # holds a mutable metadata value (a list) so that the shallow vs.
        # deep copy tests have something to tell apart.
        self.intvls = [
            {'bounds': [(0, 1), (2, 9)], 'metadata': {'gene': 'sagA'}},
            {'bounds': [(0, 1)],
             'metadata': {'gene': ['a'], 'product': 'foo'}}]

    def test_constructor_invalid(self):
        with self.assertRaisesRegex(TypeError,
                                    'You must provide `IntervalMetadata` '
                                    'object.'):
            self._interval_metadata_constructor_(0, '')

    def test_constructor_interval_metadata_len_mismatch(self):
        for i in [0, 1, 3, 100]:
            # Raw string: ``\(`` is a regex escape, not a string escape.
            with self.assertRaisesRegex(
                    ValueError,
                    r'\(%d\).*\(%d\)' % (self.upper_bound, i)):
                self._interval_metadata_constructor_(i, self.im)

    def test_constructor_interval_metadata_len(self):
        for n in 1, 2, 3:
            im = IntervalMetadata(n)
            im.add([(0, 1)], metadata={'a': 'b'})
            obj = self._interval_metadata_constructor_(n, im)
            self.assertTrue(obj.has_interval_metadata())
            self.assertIsInstance(obj.interval_metadata, IntervalMetadata)

    def test_constructor_interval_metadata_len_0(self):
        im = IntervalMetadata(0)
        obj = self._interval_metadata_constructor_(0, im)
        self.assertFalse(obj.has_interval_metadata())

    def test_constructor_no_interval_metadata(self):
        for i, im in [(0, None), (self.upper_bound, self.im)]:
            obj = self._interval_metadata_constructor_(i, im)
            self.assertFalse(obj.has_interval_metadata())
            self.assertIsInstance(obj.interval_metadata, IntervalMetadata)

    def test_constructor_handles_missing_interval_metadata_efficiently(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertIsNone(obj._interval_metadata)

        obj = self._interval_metadata_constructor_(
            self.upper_bound, interval_metadata=None)
        self.assertIsNone(obj._interval_metadata)

    def test_constructor_makes_shallow_copy_of_interval_metadata(self):
        intvl = self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        self.assertEqual(obj.interval_metadata, self.im)
        self.assertIsNot(obj.interval_metadata, self.im)

        # Changing mutable value of metadata of the old interval
        # also changes obj.
        intvl.metadata['gene'].append('b')
        self.assertEqual(obj.interval_metadata, self.im)

        # Changing old interval doesn't change obj
        intvl.bounds = [(3, 6)]
        self.assertNotEqual(obj.interval_metadata, self.im)

    def test_eq_basic(self):
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        im2 = IntervalMetadata(self.upper_bound)
        im2.add(**self.intvls[0])
        obj2 = self._interval_metadata_constructor_(self.upper_bound, im2)

        self.assertReallyEqual(obj1, obj2)

    def test_eq_populated_differently(self):
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        obj2.interval_metadata.add(**self.intvls[0])

        self.assertReallyEqual(obj1, obj2)

    # NOTE(review): the name says "positional" but the body checks
    # ``_interval_metadata``; the name is kept so test IDs stay stable.
    def test_eq_handles_missing_positional_metadata_efficiently(self):
        obj1 = self._interval_metadata_constructor_(self.upper_bound)
        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        self.assertReallyEqual(obj1, obj2)

        # Comparing must not have created the attribute as a side effect.
        self.assertIsNone(obj1._interval_metadata)
        self.assertIsNone(obj2._interval_metadata)

    def test_ne_diff_len(self):
        obj1 = self._interval_metadata_constructor_(0)
        obj2 = self._interval_metadata_constructor_(self.upper_bound)
        self.assertReallyNotEqual(obj1, obj2)

    def test_ne_only_one_is_empty(self):
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        obj2 = self._interval_metadata_constructor_(self.upper_bound)

        self.assertReallyNotEqual(obj1, obj2)

    def test_ne(self):
        im1 = IntervalMetadata(self.upper_bound)
        im1.add(**self.intvls[0])
        obj1 = self._interval_metadata_constructor_(self.upper_bound, im1)

        im2 = IntervalMetadata(self.upper_bound)
        im2.add(**self.intvls[1])
        obj2 = self._interval_metadata_constructor_(self.upper_bound, im2)

        self.assertReallyNotEqual(obj1, obj2)

    def test_copy_interval_metadata_empty(self):
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj_copy._interval_metadata)
        self.assertEqual(obj._interval_metadata, self.im)

    def test_copy_interval_metadata_none(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj._interval_metadata)
        self.assertIsNone(obj_copy._interval_metadata)

    def test_copy_interval_metadata(self):
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.copy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNot(obj.interval_metadata,
                         obj_copy.interval_metadata)
        self.assertIsNot(obj.interval_metadata._intervals,
                         obj_copy.interval_metadata._intervals)
        for i, j in zip(obj.interval_metadata._intervals,
                        obj_copy.interval_metadata._intervals):
            self.assertIsNot(i, j)
            self.assertIsNot(i.metadata, j.metadata)
            # Shallow copy: the metadata values themselves are shared.
            for k in i.metadata:
                self.assertIs(i.metadata[k], j.metadata[k])

    def test_deepcopy_interval_metadata(self):
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNot(obj.interval_metadata,
                         obj_copy.interval_metadata)
        self.assertIsNot(obj.interval_metadata._intervals,
                         obj_copy.interval_metadata._intervals)
        for i, j in zip(obj.interval_metadata._intervals,
                        obj_copy.interval_metadata._intervals):
            self.assertIsNot(i, j)
            self.assertIsNot(i.metadata, j.metadata)
            # The list value is duplicated, while the str value is expected
            # to be the very same object in both copies.
            self.assertIsNot(i.metadata['gene'], j.metadata['gene'])
            self.assertIs(i.metadata['product'], j.metadata['product'])

    def test_deepcopy_interval_metadata_empty(self):
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj_copy._interval_metadata)
        self.assertEqual(obj._interval_metadata, self.im)

    def test_deepcopy_interval_metadata_none(self):
        obj = self._interval_metadata_constructor_(self.upper_bound, None)
        obj_copy = copy.deepcopy(obj)

        self.assertEqual(obj, obj_copy)
        self.assertIsNot(obj, obj_copy)

        self.assertIsNone(obj._interval_metadata)
        self.assertIsNone(obj_copy._interval_metadata)

    def test_deepcopy_memo_is_respected(self):
        # Basic test to ensure deepcopy's memo is passed through to recursive
        # deepcopy calls.
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        memo = {}
        copy.deepcopy(obj, memo)
        self.assertGreater(len(memo), 1)

    def test_interval_metadata_getter(self):
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertIsInstance(obj.interval_metadata, IntervalMetadata)
        self.assertEqual(self.im, obj.interval_metadata)

        # Update existing metadata.
        obj.interval_metadata._intervals[0].metadata['gene'] = 'sagB'
        self.assertNotEqual(obj.interval_metadata, self.im)
        self.im._intervals[0].metadata['gene'] = 'sagB'
        self.assertEqual(obj.interval_metadata, self.im)

        # Add new interval feature.
        obj.interval_metadata.add(**self.intvls[1])
        self.im.add(**self.intvls[1])
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_getter_no_interval_metadata(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertIsNone(obj._interval_metadata)
        self.assertIsInstance(obj.interval_metadata, IntervalMetadata)
        self.assertEqual(obj.interval_metadata, self.im)
        # Reading the property is what populates the private attribute.
        self.assertIsNotNone(obj._interval_metadata)

    def test_interval_metadata_setter(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertFalse(obj.has_interval_metadata())

        obj.interval_metadata = self.im
        self.assertFalse(obj.has_interval_metadata())
        self.assertEqual(obj.interval_metadata, self.im)

        self.im.add(**self.intvls[1])
        obj.interval_metadata = self.im
        self.assertTrue(obj.has_interval_metadata())
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_makes_copy(self):
        intvl = self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound)
        obj.interval_metadata = self.im

        self.assertEqual(obj.interval_metadata, self.im)
        self.assertIsNot(obj.interval_metadata, self.im)

        # Changing mutable value of metadata of the old interval
        # also changes obj.
        intvl.metadata['gene'].append('b')
        self.assertEqual(obj.interval_metadata, self.im)

        # Changing old interval doesn't change obj
        intvl.bounds = [(3, 6)]
        self.assertNotEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_len_mismatch(self):
        self.im.add(**self.intvls[1])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        for i in 0, 1, 3, 100:
            # Raw string: ``\(`` is a regex escape, not a string escape.
            with self.assertRaisesRegex(
                    ValueError,
                    r'\(%d\).*\(%d\)' % (i, self.upper_bound)):
                obj.interval_metadata = IntervalMetadata(i)

        # The failed assignments must have left the object untouched.
        self.assertEqual(obj.interval_metadata, self.im)

    def test_interval_metadata_setter_invalid_type(self):
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        for i in [2, None, '', {}, []]:
            with self.assertRaisesRegex(
                    TypeError,
                    'You must provide `IntervalMetadata` object'):
                obj.interval_metadata = i

        self.assertEqual(self.im, obj.interval_metadata)

    def test_interval_metadata_deleter_empty(self):
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

        # Delete again. test idempotent
        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

    def test_interval_metadata_deleter(self):
        self.im.add(**self.intvls[0])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)

        del obj.interval_metadata
        self.assertIsNone(obj._interval_metadata)
        self.assertFalse(obj.has_interval_metadata())

    def test_has_interval_metadata(self):
        obj = self._interval_metadata_constructor_(self.upper_bound)
        self.assertFalse(obj.has_interval_metadata())

        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertFalse(obj.has_interval_metadata())

        self.im.add([(0, 1)])
        obj = self._interval_metadata_constructor_(self.upper_bound, self.im)
        self.assertTrue(obj.has_interval_metadata())
class TestIntervalMetadata(unittest.TestCase, ReallyEqualMixin):
    """Tests for ``IntervalMetadata``: copying, querying, dropping, repr."""

    def setUp(self):
        self.upper_bound = 10
        self.im_empty = IntervalMetadata(self.upper_bound)

        # ``im_1`` ends up with one interval feature and ``im_2`` with two
        # (see ``test_num_interval_features``); the ``Interval`` objects
        # are attached through their ``interval_metadata`` argument.
        self.im_1 = IntervalMetadata(self.upper_bound)
        self.im_1_1 = Interval(
            interval_metadata=self.im_1,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA', 'bound': 0})

        self.im_2 = IntervalMetadata(self.upper_bound)
        self.im_2_1 = Interval(
            interval_metadata=self.im_2,
            bounds=[(1, 2), (4, self.upper_bound)],
            metadata={'gene': 'sagA', 'bound': 0})
        self.im_2_2 = Interval(
            interval_metadata=self.im_2,
            bounds=[(3, 5)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_copy_empty(self):
        obs = copy(self.im_empty)
        self.assertEqual(obs, self.im_empty)
        self.assertIsNot(obs._intervals, self.im_empty._intervals)
        self.assertIsNot(obs._interval_tree, self.im_empty._interval_tree)

    def test_copy(self):
        obs = copy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1._interval_metadata, i2._interval_metadata)
            self.assertIsNot(i1.metadata, i2.metadata)
            # Shallow copy: the metadata values themselves are shared.
            for k in i1.metadata:
                self.assertIs(i1.metadata[k], i2.metadata[k])

    def test_deepcopy(self):
        obs = deepcopy(self.im_2)
        self.assertEqual(obs, self.im_2)
        self.assertIsNot(obs._intervals, self.im_2._intervals)
        self.assertIsNot(obs._interval_tree, self.im_2._interval_tree)

        for i in range(self.im_2.num_interval_features):
            i1, i2 = obs._intervals[i], self.im_2._intervals[i]
            self.assertIsNot(i1, i2)
            self.assertIsNot(i1.bounds, i2.bounds)
            self.assertIsNot(i1.fuzzy, i2.fuzzy)
            self.assertIsNot(i1.metadata, i2.metadata)

        # Only the last interval has the mutable 'spam' value. Pick it
        # explicitly instead of relying on the loop variables leaking out.
        i1, i2 = obs._intervals[-1], self.im_2._intervals[-1]
        i2.metadata['spam'].append(1)
        self.assertEqual(i2.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0, 1]})
        self.assertEqual(i1.metadata,
                         {'gene': 'sagB', 'bound': 0, 'spam': [0]})

    def test_deepcopy_memo_is_respected(self):
        memo = {}
        deepcopy(self.im_1, memo)
        self.assertGreater(len(memo), 2)

    def test_init(self):
        self.assertFalse(self.im_empty._is_stale_tree)
        self.assertEqual(self.im_empty._intervals, [])

    def test_init_upper_bound_lt_lower_bound(self):
        # test that no exception is raised
        IntervalMetadata(0)

        with self.assertRaises(ValueError):
            IntervalMetadata(-1)

    def test_num_interval_features(self):
        self.assertEqual(self.im_empty.num_interval_features, 0)
        self.assertEqual(self.im_1.num_interval_features, 1)
        self.assertEqual(self.im_2.num_interval_features, 2)

    def test_duplicate(self):
        '''Test query and drop methods on duplicate Intervals.'''
        intvl_1 = self.im_empty.add([(1, 2)])
        intvl_2 = self.im_empty.add([(1, 2)])
        self.assertEqual(len(list(self.im_empty.query([(1, 2)]))), 2)
        self.im_empty.drop([intvl_1])
        self.assertEqual(len(self.im_empty._intervals), 1)
        # assertIs reports both objects on failure, unlike assertTrue.
        self.assertIs(self.im_empty._intervals[0], intvl_2)

    def test_duplicate_bounds(self):
        intvl = self.im_empty.add([(1, 2), (1, 2)])
        intvls = list(self.im_empty.query([(1, 2)]))
        self.assertEqual(len(intvls), 1)
        self.assertIs(intvl, intvls[0])

    def test_concat_empty(self):
        for i in 0, 1, 2:
            obs = IntervalMetadata.concat([self.im_empty] * i)
            exp = IntervalMetadata(self.upper_bound * i)
            self.assertEqual(obs, exp)

        obs = IntervalMetadata.concat([])
        self.assertEqual(obs, IntervalMetadata(0))

    def test_concat(self):
        im1 = IntervalMetadata(3)
        im2 = IntervalMetadata(4)
        im3 = IntervalMetadata(5)
        im1.add([(0, 2)], [(True, True)])
        im2.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
        im2.add([(2, 4)], metadata={'gene': 'sagB'})
        im3.add([(1, 5)], [(False, True)], {'gene': 'sagC'})
        obs = IntervalMetadata.concat([im1, im2, im3])

        # Expected bounds are the inputs' bounds offset by the combined
        # length of the preceding inputs (0, 3 and 7).
        exp = IntervalMetadata(12)
        exp.add(bounds=[(0, 2)], fuzzy=[(True, True)])
        exp.add(bounds=[(3, 6)], fuzzy=[(True, False)],
                metadata={'gene': 'sagA'})
        exp.add(bounds=[(5, 7)], metadata={'gene': 'sagB'})
        exp.add(bounds=[(8, 12)], fuzzy=[(False, True)],
                metadata={'gene': 'sagC'})
        self.assertEqual(obs, exp)

    def test_sort(self):
        interval = Interval(
            self.im_2,
            [(1, 2), (3, 8)],
            metadata={'gene': 'sagA', 'bound': 0})
        im = deepcopy(self.im_2)
        self.im_2.sort(False)
        # check sorting does not have other side effects
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [self.im_2_2, self.im_2_1, interval])

        self.im_2.sort()
        self.assertEqual(im, self.im_2)
        self.assertEqual(self.im_2._intervals,
                         [interval, self.im_2_1, self.im_2_2])

        self.im_empty.sort()
        self.assertEqual(self.im_empty, IntervalMetadata(self.upper_bound))

    def test_add_eq_upper_bound(self):
        self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound)],
                          metadata={'gene': 'sagA', 'bound': 0})
        self.assertTrue(self.im_empty._is_stale_tree)
        interval = self.im_empty._intervals[0]
        self.assertEqual(interval.bounds, [(1, 2), (4, self.upper_bound)])
        self.assertEqual(interval.metadata, {'gene': 'sagA', 'bound': 0})
        self.assertIsInstance(self.im_empty._interval_tree, IntervalTree)

    def test_add_gt_upper_bound(self):
        with self.assertRaises(ValueError):
            self.im_empty.add(bounds=[(1, 2), (4, self.upper_bound+1)],
                              metadata={'gene': 'sagA', 'bound': 0})

    def test_add_eq_start_end_bound(self):
        for i in 0, 1, self.upper_bound:
            # test that no exception is raised
            self.im_empty.add(bounds=[(i, i)],
                              metadata={'gene': 'sagA', 'bound': 0})

    def test_query_attribute(self):
        intervals = self.im_2._query_attribute({})
        for i, j in zip(intervals, self.im_2._intervals):
            self.assertEqual(i, j)

        intervals = list(self.im_2._query_attribute(None))
        self.assertEqual(len(intervals), 0)

        for i in self.im_2._intervals:
            intervals = list(self.im_2._query_attribute(i.metadata))
            self.assertEqual(len(intervals), 1)
            self.assertEqual(intervals[0], i)

    def test_query_interval(self):
        intervals = list(self.im_2._query_interval((1, 2)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

        intervals = list(self.im_2._query_interval((3, 4)))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        # Compared through repr so the result order does not matter.
        intervals = {repr(i) for i in self.im_2._query_interval((1, 7))}
        self.assertEqual(len(intervals), 2)
        self.assertSetEqual(intervals,
                            {repr(i) for i in self.im_2._intervals})

    def test_query_interval_upper_bound(self):
        intervals = list(self.im_2._query_interval(
            (self.upper_bound-1, self.upper_bound)))
        self.assertEqual(intervals, [self.im_2_1])

    def test_query(self):
        intervals = list(self.im_2.query(bounds=[(1, 5)],
                                         metadata={'gene': 'sagA'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_1)

    def test_query_empty(self):
        intervals = list(self.im_1.query())
        self.assertEqual(len(intervals), 0)

    def test_query_no_hits(self):
        intervals = list(self.im_2.query(bounds=[(self.upper_bound, 200)]))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

        intervals = list(self.im_2.query(bounds=[(1, 2)],
                                         metadata={'gene': 'sagC'}))
        self.assertEqual(len(intervals), 0)

    def test_query_interval_only(self):
        for loc in [[(1, 7)],
                    [(1, 2), (3, 4)]]:
            intervals = list(self.im_2.query(bounds=loc))
            self.assertEqual(len(intervals), 2)
            self.assertEqual(intervals[0], self.im_2_1)
            self.assertEqual(intervals[1], self.im_2_2)

    def test_query_metadata_only(self):
        intervals = list(self.im_2.query(metadata={'gene': 'sagB'}))
        self.assertEqual(len(intervals), 1)
        self.assertEqual(intervals[0], self.im_2_2)

        intervals = list(self.im_2.query(metadata={'bound': 0}))
        self.assertEqual(len(intervals), 2)
        self.assertEqual(intervals[0], self.im_2_1)
        self.assertEqual(intervals[1], self.im_2_2)

    def test_drop(self):
        intvl = self.im_2._intervals[0]
        self.im_2.drop([intvl])
        self.assertEqual(len(self.im_2._intervals), 1)
        self.assertEqual(self.im_2._intervals[0], self.im_2_2)
        # test the intvl was set to dropped
        self.assertTrue(intvl.dropped)

    def test_drop_all(self):
        self.im_2.drop(self.im_2._intervals)
        self.assertEqual(self.im_2, self.im_empty)

    def test_reverse(self):
        self.im_2._reverse()
        # Build the expected result inside ``im_empty``; the new Interval
        # objects are not needed afterwards, so they are not bound to names.
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(0, 6), (8, 9)],
            metadata={'gene': 'sagA', 'bound': 0})
        Interval(
            interval_metadata=self.im_empty,
            bounds=[(5, 7)],
            metadata={'gene': 'sagB', 'bound': 0, 'spam': [0]})
        self.assertEqual(self.im_2, self.im_empty)

    def test_eq_ne(self):
        im1 = IntervalMetadata(10)
        im1.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])
        im1.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        # The ordering shouldn't matter
        im2 = IntervalMetadata(10)
        im2.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])
        im2.add(metadata={'gene': 'sagA', 'bound': '0'},
                bounds=[(0, 2), (4, 7)])

        im3 = IntervalMetadata(10)
        im3.add(metadata={'gene': 'sagA', 'bound': '3'},
                bounds=[(0, 2), (4, 7)])
        im3.add(metadata={'gene': 'sagB', 'bound': '3'},
                bounds=[(3, 5)])

        self.assertReallyEqual(im1, im2)
        self.assertReallyNotEqual(im1, im3)

    def test_ne_diff_bounds(self):
        im1 = IntervalMetadata(10)
        im2 = IntervalMetadata(9)
        intvl = {'bounds': [(0, 1)], 'metadata': {'spam': 'foo'}}
        im1.add(**intvl)
        im2.add(**intvl)
        self.assertReallyNotEqual(im1, im2)

    def test_repr(self):
        def interval_pattern(start, end, gene):
            # Regex for the repr of one single-span, non-fuzzy interval
            # whose only metadata is a gene name. Raw strings keep the
            # regex escapes from being read as string escapes.
            return (r"Interval\(interval_metadata=<[0-9]+>, "
                    r"bounds=\[\(%d, %d\)\], "
                    r"fuzzy=\[\(False, False\)\], "
                    r"metadata={'gene': '%s'}\)" % (start, end, gene))

        exp = '\n'.join(['0 interval features',
                         '-------------------'])
        self.assertEqual(repr(self.im_empty), exp)

        self.im_empty.add([(1, 2)], metadata={'gene': 'sagA'})

        exp = '\n'.join(['1 interval feature',
                         '------------------',
                         interval_pattern(1, 2, 'sagA')])
        self.assertRegex(repr(self.im_empty), exp)

        self.im_empty.add([(3, 4)], metadata={'gene': 'sagB'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagC'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagD'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagE'})
        self.im_empty.add([(3, 4)], metadata={'gene': 'sagF'})

        # With six features the expected repr shows the first two and the
        # last two, separated by a three-character line.
        exp = '\n'.join(['6 interval features',
                         '-------------------',
                         interval_pattern(1, 2, 'sagA'),
                         interval_pattern(3, 4, 'sagB'),
                         '...',
                         interval_pattern(3, 4, 'sagE'),
                         interval_pattern(3, 4, 'sagF')])
        self.assertRegex(repr(self.im_empty), exp)
def test_concat(self):
    # Three inputs of length 3, 4 and 5; concatenating them is expected to
    # offset each input's bounds by the combined length of the inputs
    # before it (0, 3 and 7).
    first = IntervalMetadata(3)
    second = IntervalMetadata(4)
    third = IntervalMetadata(5)

    first.add([(0, 2)], [(True, True)])
    second.add([(0, 3)], [(True, False)], {'gene': 'sagA'})
    second.add([(2, 4)], metadata={'gene': 'sagB'})
    third.add([(1, 5)], [(False, True)], {'gene': 'sagC'})

    observed = IntervalMetadata.concat([first, second, third])

    expected_features = (
        {'bounds': [(0, 2)], 'fuzzy': [(True, True)]},
        {'bounds': [(3, 6)], 'fuzzy': [(True, False)],
         'metadata': {'gene': 'sagA'}},
        {'bounds': [(5, 7)], 'metadata': {'gene': 'sagB'}},
        {'bounds': [(8, 12)], 'fuzzy': [(False, True)],
         'metadata': {'gene': 'sagC'}},
    )
    expected = IntervalMetadata(12)
    for feature in expected_features:
        expected.add(**feature)

    self.assertEqual(observed, expected)
def test_eq_ne(self):
    def build(*features):
        # Fresh IntervalMetadata of length 10 holding the given
        # (gene, bound, spans) features, added in the order given.
        built = IntervalMetadata(10)
        for gene, bound, spans in features:
            built.add(metadata={'gene': gene, 'bound': bound},
                      bounds=spans)
        return built

    reference = build(('sagA', '0', [(0, 2), (4, 7)]),
                      ('sagB', '3', [(3, 5)]))

    # The ordering shouldn't matter
    reordered = build(('sagB', '3', [(3, 5)]),
                      ('sagA', '0', [(0, 2), (4, 7)]))

    # Same bounds and genes, but sagA carries a different 'bound' value.
    altered = build(('sagA', '3', [(0, 2), (4, 7)]),
                    ('sagB', '3', [(3, 5)]))

    self.assertReallyEqual(reference, reordered)
    self.assertReallyNotEqual(reference, altered)
class GFF3IOTests(TestCase):
    # Fixture holder for the GFF3 tests: data-file paths plus the interval
    # metadata and sequence objects built in setUp.

    def setUp(self):
        self.multi_fp = get_data_path('gff3_multi_record')
        self.single_fp = get_data_path('gff3_single_record')

        # Keyword arguments for IntervalMetadata.add, one dict per feature.
        chromosome = {
            'bounds': [(0, 4641652)],
            'metadata': {'source': 'European Nucleotide Archive',
                         'type': 'chromosome',
                         'score': '.',
                         'strand': '.',
                         'ID': 'chromosome:Chromosome',
                         'Alias': 'U00096.3',
                         'Is_circular': 'true'}}
        promoter = {
            'bounds': [(147, 148)],
            'metadata': {'source': 'regulondb_feature',
                         'type': 'biological_region',
                         'score': '.',
                         'strand': '+',
                         'external_name':
                         'Promoter thrLp (RegulonDB:ECK120010236)',
                         'logic_name': 'regulondb_promoter'}}
        gene = {
            'bounds': [(336, 2799)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'gene',
                         'score': '1.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_1',
                         'gc_cont': '0.427'}}
        cds = {
            'bounds': [(336, 2799)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'CDS',
                         'score': '333.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_2',
                         'Parent': '1_1',
                         'rbs_motif': 'GGAG/GAGG',
                         'rbs_spacer': '5-10bp'}}
        split_gene = {
            'bounds': [(0, 50), (55, 100)],
            'metadata': {'source': 'Prodigal_v2.60',
                         'type': 'gene',
                         'score': '1.8',
                         'strand': '+',
                         'phase': 0,
                         'ID': '1_1',
                         'gene': 'FXR receptor'}}

        self.upper_bound = 4641652

        # imd1 is bounded by upper_bound; imd2 and imd3 are created with
        # None as the upper bound.
        self.imd1 = IntervalMetadata(self.upper_bound)
        for feature in (chromosome, promoter):
            self.imd1.add(**feature)

        self.imd2 = IntervalMetadata(None)
        for feature in (gene, cds):
            self.imd2.add(**feature)

        self.imd3 = IntervalMetadata(None)
        self.imd3.add(**split_gene)

        self.seq_fp = get_data_path('gff3_dna')
        sequence_metadata = {'id': 'NC_1', 'description': 'species X'}
        self.seq = Sequence('ATGCATGCATGC', metadata=sequence_metadata)
        gene_metadata = {'source': 'Prodigal_v2.60',
                         'type': 'gene',
                         'score': '.',
                         'strand': '+',
                         'phase': 0,
                         'ID': 'gene1',
                         'Name': 'FXR'}
        self.seq.interval_metadata.add([(0, 9)], metadata=gene_metadata)
        # The DNA object is built from the already annotated sequence.
        self.dna = DNA(self.seq)