def test_parsers(self):
    """DelimitedSplitter should return function with correct behavior"""
    empty = DelimitedSplitter()
    space = DelimitedSplitter(None)
    semicolon = DelimitedSplitter(';')
    twosplits = DelimitedSplitter(';', 2)
    allsplits = DelimitedSplitter(';', None)
    lastone = DelimitedSplitter(';', -1)
    lasttwo = DelimitedSplitter(';', -2)

    self.assertEqual(empty('a b c'), ['a', 'b c'])
    self.assertEqual(empty('abc'), ['abc'])
    self.assertEqual(empty(' '), [])
    self.assertEqual(empty('a b c'), space('a b c'))
    self.assertEqual(semicolon(' a ; b ; c d'), ['a', 'b ; c d'])
    self.assertEqual(twosplits(' a ; b ; c d'), ['a', 'b', 'c d'])
    self.assertEqual(allsplits(' a ; b ; c;;d;e ;'),
                     ['a', 'b', 'c', '', 'd', 'e', ''])
    self.assertEqual(lastone(' a ; b ; c;;d;e ;'),
                     ['a ; b ; c;;d;e', ''])
    self.assertEqual(lasttwo(' a ; b ; c;;d;e ;'),
                     ['a ; b ; c;;d', 'e', ''])
    self.assertEqual(lasttwo(''), [])
    self.assertEqual(lasttwo('x'), ['x'])
    self.assertEqual(lasttwo('x;'), ['x', ''])

def DefaultDelimitedSplitter(delimiter):
    """Wraps DelimitedSplitter to handle empty records."""
    parser = DelimitedSplitter(delimiter=delimiter)

    def f(line):
        parsed = parser(line)
        if len(parsed) == 1:
            parsed.append('')
        return parsed

    return f

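# A minimal usage sketch (not part of the original module): it illustrates the
# empty-record padding that DefaultDelimitedSplitter's docstring describes,
# matching the DelimitedSplitter behavior exercised in the tests above. The
# ':' delimiter and the sample lines are assumptions made for illustration.
def _demo_default_delimited_splitter():
    """Illustrative only: single-field lines get padded with an empty value."""
    splitter = DefaultDelimitedSplitter(':')
    assert splitter('key : value') == ['key', 'value']      # normal two-field line
    assert splitter('orphan line') == ['orphan line', '']   # padded to two fields
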
def test_splitter(self):
    """StrictFieldWrapper with splitter should use that splitter"""
    fields = ['label', 'count']
    splitter = DelimitedSplitter(':', -1)
    f = StrictFieldWrapper(fields, splitter)
    self.assertEqual(f('n:k:n:a:sd '), {'label': 'n:k:n:a', 'count': 'sd'})
    self.assertEqual(f('nknasd:'), {'label': 'nknasd', 'count': ''})
    self.assertRaises(FieldError, f, '')

def test_full_LOC(self):
    """LOC should behave as expected when initialized with rich data"""
    data = ["abc\t def", " 3 \t n", " abc \txyz\n\n", "x\t5",
            "fgh ", "x\t3 "]

    class rec(MappedRecord):
        Required = {'abc': []}

    maps = {'abc': list_adder, 'x': int_setter, 'fgh': bool_setter}
    label_splitter = DelimitedSplitter('\t')
    constructor = rec
    strict = True
    loc_bad = LineOrientedConstructor(data, label_splitter, maps,
                                      constructor, strict)
    self.assertRaises(FieldError, loc_bad)
    strict = False
    loc_good = LineOrientedConstructor(data, label_splitter, maps,
                                       constructor, strict)
    result = loc_good()
    assert isinstance(result, rec)
    self.assertEqual(result,
                     {'abc': ['def', 'xyz'], '3': 'n', 'fgh': False, 'x': 3})

"""Checks if x is blank.""" return (not x) or x.isspace() CutgSpeciesFinder = LabeledRecordFinder(is_cutg_species_label, ignore=is_blank) CutgFinder = LabeledRecordFinder(is_cutg_label, ignore=is_blank) codon_order = "CGA CGC CGG CGU AGA AGG CUA CUC CUG CUU UUA UUG UCA UCC UCG UCU AGC AGU ACA ACC ACG ACU CCA CCC CCG CCU GCA GCC GCG GCU GGA GGC GGG GGU GUA GUC GUG GUU AAA AAG AAC AAU CAA CAG CAC CAU GAA GAG GAC GAU UAC UAU UGC UGU UUC UUU AUA AUC AUU AUG UGG UAA UAG UGA".split( ) #NOTE: following field order omits Locus/CDS (first field), which needs further #processing. Use zip(field_order, fields[1:]) and handle first field specially. field_order = "GenBank Location Length GenPept Species Description".split() species_label_splitter = DelimitedSplitter(':', -1) def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage): """Yields successive sequences from infile as CodonUsage objects. If strict is True (default), raises RecordError when label or seq missing. """ if not strict: #easier to see logic without detailed error handling for rec in CutgSpeciesFinder(infile): try: label, counts = rec if not is_cutg_species_label(label): continue species, genes = species_label_splitter(label) info = Info({'Species': species, 'NumGenes': int(genes)})
__author__ = "Rob Knight" __copyright__ = "Copyright 2007-2012, The Cogent Project" __credits__ = ["Rob Knight"] __license__ = "GPL" __version__ = "1.5.3" __maintainer__ = "Rob Knight" __email__ = "*****@*****.**" __status__ = "Development" def ll_start(line): """Returns True if line looks like the start of a LocusLink record.""" return line.startswith('>>') LLFinder = LabeledRecordFinder(ll_start) pipes = DelimitedSplitter('|', None) first_pipe = DelimitedSplitter('|') commas = DelimitedSplitter(',', None) first_colon = DelimitedSplitter(':', 1) accession_wrapper = FieldWrapper(['Accession', 'Gi', 'Strain'], pipes) def _read_accession(line): """Reads accession lines: format is Accession | Gi | Strain.""" return MappedRecord(accession_wrapper(line)) rell_wrapper = FieldWrapper(['Description', 'Id', 'IdType', 'Printable'], pipes) def _read_rell(line): """Reads RELL lines: format is Description|Id|IdType|Printable""" return MappedRecord(rell_wrapper(line)) accnum_wrapper = FieldWrapper(['Accession','Gi','Strain','Start','End'], pipes)
        else:
            result[key] = [val]
            labels.append(key)
    return result, labels

def is_clustal_seq_line(line):
    """Returns True if line starts with a non-blank character and is not a
    CLUSTAL or MUSCLE header line.

    Useful for filtering other lines out of the file.
    """
    return line and (not line[0].isspace()) and \
        (not line.startswith('CLUSTAL')) and (not line.startswith('MUSCLE'))

last_space = DelimitedSplitter(None, -1)

def delete_trailing_number(line):
    """Deletes trailing number from a line.

    WARNING: does not preserve internal whitespace when a number is removed!
    (converts each whitespace run to a single space). Returns the original
    line if it didn't end in a number.
    """
    pieces = line.split()
    try:
        int(pieces[-1])
        return ' '.join(pieces[:-1])
    except ValueError:  #no trailing numbers
        return line
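
# A quick hedged sketch (illustration only, not in the original module): it
# demonstrates the docstring's warning that internal whitespace runs collapse
# to single spaces whenever a trailing number is removed, plus the right-hand
# split performed by last_space. The sample strings are invented.
def _demo_delete_trailing_number():
    """Illustrative only."""
    # trailing number stripped, but the double space inside collapses
    assert delete_trailing_number('seq1  ACGU 120') == 'seq1 ACGU'
    # no trailing number, so the line comes back untouched
    assert delete_trailing_number('seq1  ACGU') == 'seq1  ACGU'
    # last_space splits once from the right on whitespace
    assert last_space('seq1 ACGU 120') == ['seq1 ACGU', '120']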