def test_tokenize(self): # all of these should tokenize (even with invalid individual terms) line0 = 'Booker T., Washington, 87360, 373 781 7380, yellow' line1 = 'Chandler, Kerri, (623)-668-9293, pink, 123123121' line2 = 'James Murphy, yellow, 83880, 018 154 6474' terms0 = rolodexer.tokenize(line0) terms1 = rolodexer.tokenize(line1) terms2 = rolodexer.tokenize(line2) self.assertEqual(len(terms0), 5) self.assertEqual(len(terms1), 5) self.assertEqual(len(terms2), 4) # first/last are single term
def test_tokenize_classify(self): # from pprint import pprint entries = [] errors = [] lines = sample_input.splitlines() for idx, line in enumerate(lines): terms = rolodexer.tokenize(line) try: cterms = rolodexer.classify(terms) except rolodexer.RolodexerError: errors.append(idx) else: keys = cterms.keys() self.assertTrue(u'phonenumber' in keys) self.assertTrue(u'firstname' in keys) self.assertTrue(u'lastname' in keys) self.assertTrue(u'color' in keys) self.assertTrue(u'zipcode' in keys) entries.append(cterms) output_dict = { u"entries": entries, u"errors": errors } # pprint(output_dict) sample_output_dict = json.loads(sample_output) self.assertItemsEqual( output_dict, sample_output_dict)
def test_file_read(self): from os.path import join, dirname from rolodexer.histogram import Histogram entries = [] errors = [] colors = Histogram() inpth = join(dirname(dirname(__file__)), 'data', 'data.in') with open(inpth, 'rb') as fh: idx = 0 while True: linen = fh.readline() if not linen: break line = linen.strip() tokens = rolodexer.tokenize(line) try: terms = rolodexer.classify(tokens) except rolodexer.RolodexerError: errors.append(idx) else: entries.append(terms) colors.inc(terms.get('color', 'CLEAR')) idx += 1 output_dict = { u"entries": entries, u"errors": errors } output_json = json.dumps(output_dict, indent=2, sort_keys=True) print(output_json) print(colors) # all classified lines have colors: self.assertEquals(colors.min(), 3) self.assertEquals(colors.max(), 10) self.assertEquals(colors.val('CLEAR'), 0)
def _test_bad_line_raises(self): """ assertRaises() is holding some sort of grudge against my entire bloodline, for some reason """ # from rolodexer import RDZipCodeError, RolodexerError lines = sample_input.splitlines() for idx, line in enumerate(lines): terms = rolodexer.tokenize(line) # with self.assertRaises(RDZipCodeError): self.assertRaises( Exception, rolodexer.classify, terms)