Ejemplo n.º 1
0
def cli(argv=None):
    """Command-line entry point: classify an input file and emit JSON.

    Each line of INFILE is stripped, tokenized with ``rolodexer.tokenize``
    and classified with ``rolodexer.classify``. Successfully classified
    lines are collected under the ``entries`` key of the output; the
    zero-based index of every line that raises ``rolodexer.RolodexerError``
    is collected under ``errors``.

    The result is printed to standard output when ``--output`` is the
    literal string ``'stdout'``. Otherwise it is written to the ``--output``
    path, but only if that path does not exist yet and its parent directory
    does; an existing file is never overwritten.

    :param argv: full argument vector, program name first; falls back to
                 ``sys.argv`` when empty or ``None``.
    """
    if not argv:
        argv = sys.argv

    arguments = docopt(__doc__, argv=argv[1:],
                                help=True,
                                version='0.1.3')

    entries = []
    errors  = []
    colors  = Histogram()

    ipth = arguments.get('INFILE')
    opth = arguments.get('--output')
    verbose = bool(arguments.get('--verbose'))

    with open(ipth, 'rb') as fh:
        for idx, linen in enumerate(fh):
            tokens = rolodexer.tokenize(linen.strip())
            try:
                terms = rolodexer.classify(tokens)
            except rolodexer.RolodexerError:
                errors.append(idx)
            else:
                entries.append(terms)
                if 'color' in terms:
                    colors.inc(terms.get('color'))

    # The input file is closed at this point; everything below only needs
    # the collected results.
    output_dict = { u"entries": entries, u"errors": errors }

    if verbose:
        print("Entries parsed: %s" % len(entries), file=sys.stderr)
        print("Errors encountered: %s" % len(errors), file=sys.stderr)
        print_colors(colors)

    if opth == 'stdout':
        output_json = json.dumps(output_dict, **JSON_ARGS)
        print(output_json, file=sys.stdout)
    elif not exists(opth) and isdir(dirname(opth) or '.'):
        # dirname() yields '' for a bare filename such as "out.json";
        # fall back to the current directory so that case is accepted.
        if verbose:
            print("rolodexer: saving output to %s" % opth, file=sys.stderr)
        with open(opth, 'wb') as fp:
            json.dump(output_dict, fp, **JSON_ARGS)
    else:
        # Do not fail silently: tell the user why nothing was written.
        print("rolodexer: not saving output to %s "
              "(file already exists or its directory is missing)" % opth,
              file=sys.stderr)
Ejemplo n.º 2
0
 def test_file_read(self):
     """Classify every line of the ``data/data.in`` fixture and check the
     per-color tallies.

     Lines that raise ``rolodexer.RolodexerError`` are recorded by their
     zero-based index; classified lines without a ``color`` key are
     tallied under the placeholder ``'CLEAR'``.
     """
     from os.path import join, dirname
     from rolodexer.histogram import Histogram
     entries = []
     errors  = []
     colors  = Histogram()
     inpth = join(dirname(dirname(__file__)), 'data', 'data.in')
     with open(inpth, 'rb') as fh:
         for idx, linen in enumerate(fh):
             tokens = rolodexer.tokenize(linen.strip())
             try:
                 terms = rolodexer.classify(tokens)
             except rolodexer.RolodexerError:
                 errors.append(idx)
             else:
                 entries.append(terms)
                 colors.inc(terms.get('color', 'CLEAR'))
     # Dump the results so they are visible in the test output.
     output_dict = { u"entries": entries, u"errors": errors }
     output_json = json.dumps(output_dict, indent=2, sort_keys=True)
     print(output_json)
     print(colors)
     # all classified lines have colors, so the 'CLEAR' placeholder
     # bucket must never have been incremented:
     self.assertEqual(colors.min(), 3)
     self.assertEqual(colors.max(), 10)
     self.assertEqual(colors.val('CLEAR'), 0)
Ejemplo n.º 3
0
def classify(orig_terms):
    """Sort the terms of one tokenized line into named fields.

    Every term is tested with ``is_zip``, ``is_phone`` and ``is_color``.
    Terms that match are moved into the result under ``zipcode``,
    ``phonenumber`` (after ``phone_format``) or ``color``; whatever is
    left over is interpreted as the name.

    :param orig_terms: sequence of term strings; it is not modified.
    :returns: dict that always holds ``phonenumber`` and ``zipcode``,
              holds ``color`` when a color term was found, and holds
              ``firstname`` / ``lastname`` when one or two name terms
              remain.
    :raises RDAmbiguousTerms: a single term passes more than one test.
    :raises RDPhoneNumberError: no term is a phone number.
    :raises RDZipCodeError: no term is a zip code.
    :raises RDAmbiguousNames: no name term, or only a single name word,
                              is left.
    """
    out = dict()
    terms = copy(orig_terms)

    # first, sanity-check the terms --
    # if any one of them can pass for more than one of a phone number,
    # a color, or a zip code (that is to say, the input is ambiguous),
    # we bail:
    checks = (('zip', is_zip), ('phone', is_phone), ('color', is_color))
    for term in terms:
        matched = [label for label, test in checks if test(term)]
        if len(matched) > 1:
            # ERROR: couldn't distinguish one thing
            # from another... BAIL
            raise RDAmbiguousTerms("Term '%s' parsed ambiguously\n"
                                   "Passed multiple tests: %s" % (
                                       term, SEP_WS.join(matched)
                                   ))

    # next, pull out the phone number, color and zip code
    # ... they are the easiest to find. Iterate over a snapshot,
    # because matching terms are removed from `terms` as we go:
    for term in copy(terms):
        if is_phone(term):
            out[u'phonenumber'] = u"%s" % phone_format(term)
        elif is_color(term):
            out[u'color'] = u"%s" % term
        elif is_zip(term):
            out[u'zipcode'] = u"%s" % term
        else:
            continue
        terms.remove(term)

    if u'phonenumber' not in out:
        # ERROR: NO PHONE / BAD PHONE!
        raise RDPhoneNumberError("No valid phone number in %d-term list\n"
                                 "Reconstructed original line:\n"
                                 "\t%s" % (len(terms), reconstruct(orig_terms)))

    if u'zipcode' not in out:
        # ERROR: NO ZIPCODE / BAD ZIPCODE!
        raise RDZipCodeError("No valid zip code in %d-term list\n"
                             "Reconstructed original line:\n"
                             "\t%s" % (len(terms), reconstruct(orig_terms)))

    # A missing color is the less disconcerting case: it is tolerated,
    # and the result simply has no 'color' key.

    # what is left "should" be the pieces of the name,
    # e.g. ['Washington', 'Booker T.'], ['James Murphy'], &c
    if len(terms) > 2:
        # NOTE(review): more than two leftover terms is unexpected, but the
        # result is still returned, without name fields -- existing callers
        # may rely on that, so it is deliberately not turned into an error.
        pass
    elif len(terms) == 2:
        # two terms: last name first, then first name
        out.update({
            u'firstname':    u"%s" % terms[-1],
            u'lastname':     u"%s" % terms[0]
        })
    elif len(terms) == 1:
        names = terms[0].split()
        if len(names) > 1:
            # one "First [Middle ...] Last" term: keep the outer words
            out.update({
                u'firstname':    u"%s" % names[0],
                u'lastname':     u"%s" % names[-1]
            })
        else:
            # ERROR: only one name. A blank term splits into an empty
            # list, so fall back to the raw term for the message instead
            # of indexing into nothing.
            lone_name = names[0] if names else terms[0]
            raise RDAmbiguousNames("Only one name present: '%s'\n"
                                   "Reconstructed original line:\n"
                                   "\t%s" % (lone_name, reconstruct(orig_terms)))
    else:
        # No names at all
        raise RDAmbiguousNames("No names present!\n"
                               "Reconstructed original line:\n"
                               "\t%s" % reconstruct(orig_terms))

    return out