def classify(token_list):
    """Sort the tokens of one input line into a dict of field values.

    Every token is tested against each field type in ``FIELD_DMV``.  A
    token that passes more than one check is ambiguous and aborts the
    whole classification.  Tokens that pass exactly one check are
    consumed by that field type; whatever is left over is treated as
    the pieces of a name.

    Args:
        token_list: the tokens to classify.  The caller's sequence is
            not modified; a copy is consumed instead.  Leftover tokens
            are split on whitespace, so they are presumably strings.

    Returns:
        A dict built by merging the result of ``value_for_token()``
        from every field that claimed a token, plus the first/last
        name fields when the leftover tokens allow it.

    Raises:
        RDAmbiguousTerms: a token passed more than one field check.
        RDAmbiguousNames: no tokens were left over for the name.
        Anything raised by a field's ``unfound()`` hook, which is
        called for each ``FIELD_DMV`` field missing from the result
        and for a single-word name.
    """
    out = {}
    tokens = copy(token_list)

    # First, sanity-check the tokens: if any one of them passes more
    # than one field check, the input is ambiguous and we bail.
    for token in tokens:
        h = Histogram()
        for FieldType in FIELD_DMV:
            if FieldType.check(token):
                h.inc(FieldType.json_name)
        if len(h) > 1:
            # NOTE(review): `iterkeys()` is Python 2 only -- confirm what
            # Histogram exposes before porting this to Python 3.
            raise RDAmbiguousTerms("Token '%s' parsed ambiguously\n"
                                   "Passed multiple field checks: %s" % (
                                       token, SEP_WS.join(h.iterkeys())))

    # Update `out` with the classified tokens.  Iterate over a copy,
    # because matched tokens are removed from `tokens` as we go.
    for token in copy(tokens):
        for FieldType in FIELD_DMV:
            if FieldType.check(token):
                out.update(FieldType().value_for_token(token))
                tokens.remove(token)
                # The token is consumed; the sanity check above already
                # ensured no other field type would accept it.
                break

    # Raise appropriate errors when we don't find what we need.
    for FieldType in FIELD_DMV:
        field = FieldType()
        if field.name not in out:
            field.unfound(token_list)  # this may raise

    # What is left "should" be the pieces of the name,
    # e.g. ['Washington', 'Booker T.'], ['James Murphy'], &c
    first_field = FirstNameField()
    last_field = LastNameField()

    if len(tokens) > 2:
        # More leftovers than a name can account for.  This case is
        # deliberately tolerated: no name values are added to `out`.
        pass
    elif len(tokens) == 2:
        # Two pieces: the first token is the last name, the final token
        # is the first name.
        out.update(first_field.value_for_token(tokens[-1]))
        out.update(last_field.value_for_token(tokens[0]))
    elif len(tokens) == 1:
        # One piece: split on whitespace; the first word is the first
        # name and the final word is the last name.
        names = tokens[0].split()
        if len(names) > 1:
            out.update(first_field.value_for_token(names[0]))
            out.update(last_field.value_for_token(names[-1]))
        else:
            NameField().unfound(token_list)
    else:
        # Nothing left over at all -- no names present.
        raise RDAmbiguousNames("No names present!\n"
                               "Reconstructed original line:\n"
                               "\t%s" % reconstruct(token_list))

    return out