def test_indent_splitter(self):
    """indent_splitter should split lines at correct locations

    Each case passes a list of lines to indent_splitter and compares the
    yielded groups with slices of the input.  The fixtures differ from
    one another in their leading whitespace, as described by the comment
    above each case.
    """
    # NOTE(review): the leading whitespace of these fixtures was
    # reconstructed to agree with the asserted groupings -- confirm
    # against the upstream copy of this test.

    #if lines have same indent, should not group together
    lines = ['abc xxx', 'def yyy']
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0]], [lines[1]]])

    #if second line is indented, should group with first
    lines = ['abc xxx', ' def yyy']
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0], lines[1]]])

    #if both lines indented but second is more, should group with first
    lines = [' abc xxx', '  def yyy']
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0], lines[1]]])

    #if both lines indented equally, should not group
    lines = [' abc xxx', ' def yyy']
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0]], [lines[1]]])

    #for more complex situation, should produce correct grouping
    #(a trailing '-' marks a line expected to start a new group)
    lines = [
        '  xyz',        #0 -
        '  xxx',        #1 -
        '    yyy',      #2
        '   uuu',       #3
        '      iii',    #4
        '  qaz',        #5 -
        '  wsx',        #6 -
        '    az',       #7
        '   sx',        #8
        '        gb',   #9
        '   bg',        #10
        '  aaa',        #11 -
    ]
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0]], lines[1:5], [lines[5]], lines[6:11], [lines[11]]])

    #real example from genbank file
    #(kept as one literal per record line so that re-indenting this
    #method can never change the fixture's own indentation)
    lines = [
        'LOCUS       NT_016354           92123751 bp    DNA     linear   CON 29-AUG-2006',   #0
        'DEFINITION  Homo sapiens chromosome 4 genomic contig, reference assembly.',         #1
        'ACCESSION   NT_016354 NT_006109 NT_006204 NT_006245 NT_006302 NT_006371',           #2
        '            NT_006397 NT_016393 NT_016589 NT_016599 NT_016606 NT_022752',           #3
        '            NT_022753 NT_022755 NT_022760 NT_022774 NT_022797 NT_022803',           #4
        '            NT_022846 NT_022960 NT_025694 NT_028147 NT_029273 NT_030643',           #5
        '            NT_030646 NT_030662 NT_031780 NT_031781 NT_031791 NT_034703',           #6
        '            NT_034705 NT_037628 NT_037629 NT_079512',                               #7
        'VERSION     NT_016354.18  GI:88977422',                                             #8
        'KEYWORDS    .',                                                                     #9
        'SOURCE      Homo sapiens (human)',                                                  #10
        '  ORGANISM  Homo sapiens',                                                          #11
        '            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;',     #12
        '            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;',          #13
        '            Catarrhini; Hominidae; Homo.',                                          #14
        '?',                                                                                 #15
        'REFERENCE   2  (bases 1 to 92123751)',                                              #16
        '  AUTHORS   International Human Genome Sequencing Consortium.',                     #17
        '  TITLE     Finishing the euchromatic sequence of the human genome',                #18
    ]
    self.assertEqual(list(indent_splitter(lines)),
        [[lines[0]], [lines[1]], lines[2:8], [lines[8]], [lines[9]],
         lines[10:15], [lines[15]], lines[16:]])
def test_indent_splitter(self):
    """indent_splitter should split lines at correct locations

    Every case builds a small list of lines, runs it through
    indent_splitter, and checks the resulting groups against slices of
    that same list.  What varies between the fixtures is the leading
    whitespace of each line.
    """
    # NOTE(review): fixture indentation below was rebuilt from the
    # expected groupings; verify it against the upstream test file.

    #if lines have same indent, should not group together
    lines = [
        'abc xxx',
        'def yyy'
    ]
    expected = [[lines[0]], [lines[1]]]
    self.assertEqual(list(indent_splitter(lines)), expected)

    #if second line is indented, should group with first
    lines = [
        'abc xxx',
        ' def yyy'
    ]
    expected = [[lines[0], lines[1]]]
    self.assertEqual(list(indent_splitter(lines)), expected)

    #if both lines indented but second is more, should group with first
    lines = [
        ' abc xxx',
        '  def yyy'
    ]
    expected = [[lines[0], lines[1]]]
    self.assertEqual(list(indent_splitter(lines)), expected)

    #if both lines indented equally, should not group
    lines = [
        ' abc xxx',
        ' def yyy'
    ]
    expected = [[lines[0]], [lines[1]]]
    self.assertEqual(list(indent_splitter(lines)), expected)

    #for more complex situation, should produce correct grouping
    #(lines whose index comment ends in '-' are expected group starts)
    lines = [
        '  xyz',        #0 -
        '  xxx',        #1 -
        '    yyy',      #2
        '   uuu',       #3
        '      iii',    #4
        '  qaz',        #5 -
        '  wsx',        #6 -
        '    az',       #7
        '   sx',        #8
        '        gb',   #9
        '   bg',        #10
        '  aaa',        #11 -
    ]
    expected = [
        [lines[0]],
        lines[1:5],
        [lines[5]],
        lines[6:11],
        [lines[11]],
    ]
    self.assertEqual(list(indent_splitter(lines)), expected)

    #real example from genbank file
    #(one string literal per record line: the fixture's indentation is
    #then independent of how this method itself is indented)
    lines = [
        'LOCUS       NT_016354           92123751 bp    DNA     linear   CON 29-AUG-2006',
        'DEFINITION  Homo sapiens chromosome 4 genomic contig, reference assembly.',
        'ACCESSION   NT_016354 NT_006109 NT_006204 NT_006245 NT_006302 NT_006371',
        '            NT_006397 NT_016393 NT_016589 NT_016599 NT_016606 NT_022752',
        '            NT_022753 NT_022755 NT_022760 NT_022774 NT_022797 NT_022803',
        '            NT_022846 NT_022960 NT_025694 NT_028147 NT_029273 NT_030643',
        '            NT_030646 NT_030662 NT_031780 NT_031781 NT_031791 NT_034703',
        '            NT_034705 NT_037628 NT_037629 NT_079512',
        'VERSION     NT_016354.18  GI:88977422',
        'KEYWORDS    .',
        'SOURCE      Homo sapiens (human)',
        '  ORGANISM  Homo sapiens',
        '            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;',
        '            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;',
        '            Catarrhini; Hominidae; Homo.',
        '?',
        'REFERENCE   2  (bases 1 to 92123751)',
        '  AUTHORS   International Human Genome Sequencing Consortium.',
        '  TITLE     Finishing the euchromatic sequence of the human genome',
    ]
    expected = [
        [lines[0]],         # LOCUS
        [lines[1]],         # DEFINITION
        lines[2:8],         # ACCESSION plus its five continuation lines
        [lines[8]],         # VERSION
        [lines[9]],         # KEYWORDS
        lines[10:15],       # SOURCE, ORGANISM and the taxonomy lines
        [lines[15]],        # the lone '?' line
        lines[16:],         # REFERENCE, AUTHORS, TITLE
    ]
    self.assertEqual(list(indent_splitter(lines)), expected)
# NOTE(review): the statements down to the blank line are the tail of a
# test method whose header (and the definitions of `l` and `l2`) lie
# outside this chunk; re-indent them under that method when merging.
l3_a = Location(10)
l3_b = Location(12)
l3 = Location([l3_a, l3_b])
ll = LocationList([l, l2, l3])
s = ll.extract('ACGTGCAGTCAGTAGCAT')
#               123456789012345678   (1-based position ruler)
self.assertEqual(s, 'G' + 'TGC' + 'CAG')
#check a case where it wraps around
l5_a = Location(16)
l5_b = Location(4)
l5 = Location([l5_a, l5_b])
ll = LocationList([l5])
s = ll.extract('ACGTGCAGTCAGTAGCAT')
self.assertEqual(s, 'CATACGT')

if __name__ == '__main__':
    from sys import argv
    # "<script> x <filename>" dumps the two-level indent_splitter grouping
    # of a file for eyeballing; any other invocation calls main().
    if len(argv) > 2 and argv[1] == 'x':
        filename = argv[2]
        # Close the file deterministically instead of leaking the handle.
        with open(filename) as handle:
            for record in indent_splitter(handle):
                # Single-argument print(...) produces the same output as
                # the print statement on Python 2 and parses on Python 3.
                print('******')
                print(record[0])
                # Split the lines after the first one into sub-groups.
                for sub_block in indent_splitter(record[1:]):
                    print('?????')
                    for line in sub_block:
                        print(line)
    else:
        main()
# NOTE(review): everything above the blank line is the end of a test
# method that starts outside this chunk (`l` and `l2` are defined there);
# it must be re-indented under that method when this is merged.
l3_a = Location(10)
l3_b = Location(12)
l3 = Location([l3_a, l3_b])
ll = LocationList([l, l2, l3])
s = ll.extract('ACGTGCAGTCAGTAGCAT')
#               123456789012345678   (1-based position ruler)
self.assertEqual(s, 'G' + 'TGC' + 'CAG')
#check a case where it wraps around
l5_a = Location(16)
l5_b = Location(4)
l5 = Location([l5_a, l5_b])
ll = LocationList([l5])
s = ll.extract('ACGTGCAGTCAGTAGCAT')
self.assertEqual(s, 'CATACGT')

if __name__ == '__main__':
    from sys import argv
    # Debug mode: "<script> x <filename>" prints how indent_splitter
    # groups the file, one level deep and then within each group.
    # Any other invocation calls main().
    if len(argv) > 2 and argv[1] == 'x':
        filename = argv[2]
        # The context manager closes the file even if printing fails.
        with open(filename) as infile:
            for group in indent_splitter(infile):
                # print(...) with one argument behaves identically under
                # the Python 2 print statement and the Python 3 function.
                print('******')
                print(group[0])
                # Re-split the remaining lines of the group.
                for subgroup in indent_splitter(group[1:]):
                    print('?????')
                    for line in subgroup:
                        print(line)
    else:
        main()