def test_tokenize_classify(self):
    """Tokenize and classify every line of ``sample_input``.

    Each line is passed through ``rolodexer.tokenize`` and then
    ``rolodexer.classify``. Lines for which ``classify`` raises
    ``rolodexer.RolodexerError`` are recorded by their zero-based line
    index; every line that classifies successfully must contain all of
    the required keys. The collected result is finally compared with the
    decoded ``sample_output`` JSON.
    """
    required_keys = (
        u'phonenumber',
        u'firstname',
        u'lastname',
        u'color',
        u'zipcode',
    )
    entries = []
    errors = []

    for idx, line in enumerate(sample_input.splitlines()):
        terms = rolodexer.tokenize(line)
        try:
            cterms = rolodexer.classify(terms)
        except rolodexer.RolodexerError:
            errors.append(idx)
        else:
            # assertIn reports which key is missing, unlike
            # assertTrue(key in ...), which only says "False is not true".
            for key in required_keys:
                self.assertIn(key, cterms)
            entries.append(cterms)

    output_dict = {
        u"entries": entries,
        u"errors": errors,
    }
    sample_output_dict = json.loads(sample_output)

    # NOTE(review): assertItemsEqual iterates its arguments, and iterating
    # a dict yields its keys -- so, assuming sample_output decodes to a
    # dict, this only verifies that both sides have the same top-level
    # keys, not the same entries/errors. Confirm whether a full
    # comparison was intended before tightening it.
    self.assertItemsEqual(output_dict, sample_output_dict)
def test_file_read(self):
    """Classify every line of the ``data/data.in`` fixture file.

    The fixture is resolved as ``data/data.in`` under the parent of the
    directory containing this test module. Each line is stripped,
    tokenized and classified; lines for which ``rolodexer.classify``
    raises ``rolodexer.RolodexerError`` are recorded by their zero-based
    line index, and the color of every successfully classified line is
    counted in a ``Histogram``. The result is printed as JSON and the
    histogram counts are checked against the values expected for the
    fixture.
    """
    from os.path import join, dirname
    from rolodexer.histogram import Histogram

    entries = []
    errors = []
    colors = Histogram()
    inpth = join(dirname(dirname(__file__)), 'data', 'data.in')

    with open(inpth, 'rb') as fh:
        # Iterating the file yields one line per step, and enumerate
        # keeps the line index in sync without a manual counter.
        for idx, linen in enumerate(fh):
            line = linen.strip()
            tokens = rolodexer.tokenize(line)
            try:
                terms = rolodexer.classify(tokens)
            except rolodexer.RolodexerError:
                errors.append(idx)
            else:
                entries.append(terms)
                # Entries without a color are counted under 'CLEAR'.
                colors.inc(terms.get('color', 'CLEAR'))

    output_dict = {
        u"entries": entries,
        u"errors": errors,
    }
    output_json = json.dumps(output_dict, indent=2, sort_keys=True)
    print(output_json)
    print(colors)

    # all classified lines have colors:
    # (assertEqual replaces the deprecated assertEquals alias)
    self.assertEqual(colors.min(), 3)
    self.assertEqual(colors.max(), 10)
    self.assertEqual(colors.val('CLEAR'), 0)
def test_classify(self):
    """Classify one hand-written token list and check each field.

    The tokens are given in the order color, phone number, zip code,
    last name, first name. The classified result must contain all five
    keys, and each value must equal the corresponding input token -- the
    phone number being compared after formatting with
    ``rolodexer.PhoneNumberField().format``.
    """
    terms = [
        u'yellow',
        u'373 781 7380',
        u'87360',
        u'Washington',
        u'Booker T.',
    ]
    out = rolodexer.classify(terms)

    # assertIn reports which key is missing, unlike
    # assertTrue(key in ...), which only says "False is not true".
    for key in (u'phonenumber', u'firstname', u'lastname',
                u'color', u'zipcode'):
        self.assertIn(key, out)

    phonefield = rolodexer.PhoneNumberField()
    self.assertEqual(out[u'color'], terms[0])
    self.assertEqual(out[u'phonenumber'], phonefield.format(terms[1]))
    self.assertEqual(out[u'zipcode'], terms[2])
    self.assertEqual(out[u'lastname'], terms[3])
    self.assertEqual(out[u'firstname'], terms[4])