def get_pp_names(fn_field):
    """
    Tag a vCard 'fn' value with probablepeople and pick out the name parts.

    probablepeople is imported lazily inside the function. If the import
    fails, or tagging raises one of the handled errors, the error is printed
    and whatever has not been extracted stays ``None``.

    :param fn_field: the input vCard 'fn' field.
    :return: ``NAMES(first_name, surname)``; a part is ``None`` when
        probablepeople did not label it.
    """
    given = family = None

    try:
        import probablepeople as pp  # not python 2.6 compatible

        # tag() returns (labelled parts, name type); only the parts are used.
        labelled = pp.tag(fn_field)[0]

        # A label probablepeople did not produce leaves the part as None.
        given = labelled.get('GivenName')
        family = labelled.get('Surname')
    except (ImportError, SyntaxError, TypeError) as error:
        print(error)

    return NAMES(given, family)
Example #2
0
def read(fn_in):
    """
    Read the CSV file *fn_in* and tag each row's ``name`` with probablepeople.

    Every row is appended to the module-level ``tagged_rows`` and each of its
    column names is added to the module-level ``tags``. Tagged components are
    stored on the row under their label prefixed with ``t_`` and the detected
    name type under ``tag_type``. Rows that cannot be tagged get the name of
    the exception in ``tag_type`` instead. Finally ``initial_keys`` is rebound
    to the CSV header so the original column order is preserved.

    :param fn_in: path of the CSV file to read.
    """
    global tags, tagged_rows, initial_keys
    print('Reading and parsing: %s' % fn_in)
    rows_read = 0

    with open(fn_in, 'rb') as incsvfile:
        reader = csv.DictReader(incsvfile, delimiter=',')

        for row in reader:
            try:
                row['name'] = row['name'].decode('utf8')
                tagged = pp.tag(row['name'])
                # add t_ prefix to distinguish from original columns
                prefixed = OrderedDict()
                for label, value in tagged[0].viewitems():
                    prefixed['t_' + label] = value
                # add type
                prefixed['tag_type'] = tagged[1]
                # add to dictionary of input values
                row.update(prefixed)
            except pp.RepeatedLabelError:
                row['tag_type'] = 'RepeatedLabelError'
            except UnicodeEncodeError:
                # https://github.com/datamade/probablepeople/issues/54
                row['tag_type'] = 'UnicodeEncodeError'

            # Record every column seen, including the ones added above.
            for column in row.keys():
                tags.add(column)
            tagged_rows.append(row)
            rows_read += 1

    print('Read {:,} rows'.format(rows_read))

    # preserve the original order of field names
    initial_keys = reader.fieldnames
def get_pp_names(fn_field):
    """
    Extract first name and surname from a vCard 'fn' value via probablepeople.

    The library is imported on demand. When the import or the tagging raises
    ``ImportError``, ``SyntaxError`` or ``TypeError`` the error is printed and
    the parts not yet extracted are returned as ``None``.

    :param fn_field: the input vCard 'fn' field.
    :return: ``NAMES(first_name, surname)``, with ``None`` for any part that
        probablepeople did not label.
    """
    extracted = {'GivenName': None, 'Surname': None}

    try:
        import probablepeople as pp  # not python 2.6 compatible

        # Use probablepeople to tag the parts of the name; the first element
        # of the result maps labels to the matching text.
        name_parts = pp.tag(fn_field)[0]

        for label in ('GivenName', 'Surname'):
            if label in name_parts:
                extracted[label] = name_parts[label]
    except (ImportError, SyntaxError, TypeError) as error:
        print(error)

    return NAMES(extracted['GivenName'], extracted['Surname'])
Example #4
0
def parse_name(string):
    """Split a personal name into ``(given, middle, surname)``.

    The pretrained probablepeople CRF model is tried first. If it raises, or
    classifies the input as anything other than ``'Person'``, the rule-based
    ``HumanName`` parser is used instead.

    Args:
        string: The raw name to parse.

    Returns:
        A 3-tuple ``(given_name, middle_name, surname)``. On the CRF path a
        component that was not tagged is ``""``, and the matching initial is
        used when only an initial was tagged.
    """
    def first_tagged(tagged, *labels):
        # Return the value of the first label present, or "" if none is.
        for label in labels:
            if label in tagged:
                return tagged[label]
        return ""

    # First try the pretrained CRF model
    try:
        results, class_type = probablepeople.tag(string)
        if class_type != 'Person':
            # Not a person according to the CRF model: hand the string over
            # to the rule-based parser through the except branch below.
            raise ValueError("Skipping ...")
        # Prefer the full component and fall back to the initial.
        given_name = first_tagged(results, 'GivenName', 'FirstInitial')
        surname = first_tagged(results, 'Surname', 'LastInitial')
        middle_name = first_tagged(results, 'MiddleName', 'MiddleInitial')
        full_name = (given_name, middle_name, surname)
    except Exception:
        # If there are errors, try some rule-based models. The broad catch is
        # deliberate: any tagging failure should end up in the fallback.
        print('CRF models cannot process this name {}'.format(string))
        results = HumanName(string)
        full_name = (results.first, results.middle, results.last)
    return full_name
Example #5
0
def parse_business_name(row, name_cols, strict=False, type='generic'):
    """Parse a company name with the probablepeople library.

    The company-name columns are joined into one string, a few "no value"
    placeholders are removed, and the result is tagged. Only the tagged
    components whose label is in ``KEEPERS`` are joined back together, which
    drops notes and overly generic parts (e.g. "llc") from dirty data.

    Args:
        row (pd.Series): A record
        name_cols (list): A list of column names in the record, in order, that when concatenated
            comprise a company/business name
        strict (boolean, optional): Whether or not to re-raise a RepeatedLabelError when parsing;
            if False, the parse produced by ``find_repeated_label`` is used instead
        type (str): Which probableparser to use: 'generic', 'person' or 'company'

    Returns:
        biz_name (str or np.nan): The kept components joined by single spaces;
            ``np.nan`` if pandas considers that string null.

    Example:
        Add a new column to a pd.DataFrame with a clean company name
    >>> from mergepurge import clean
    >>> df['clean_company'] = \
    ...     df.apply(clean.parse_business_name, axis=1, name_cols=['Account Name'], strict=False)
    """
    filled = row.fillna('')
    joined = ' '.join([str(filled.get(col, '')) for col in name_cols])

    # pre-processing (no category for non-names to train parserator with)
    placeholder_pattern = r'(not\s*available|not\s*provided|n/a)'
    cleaned = re.sub(placeholder_pattern, '', joined, flags=re.IGNORECASE)

    try:
        # type='company'? general parser works better
        parsed = probablepeople.tag(cleaned, type)
    except probablepeople.RepeatedLabelError as e:
        if strict:
            raise e

        _, _, relabelled = find_repeated_label(cleaned)
        # Mimic the (components, name type) shape returned by tag().
        parsed = (relabelled, '')

    # filter out other name components that are bad for matching (e.g. too generic: llc)
    biz_name = ' '.join(
        [value for label, value in parsed[0].items() if label in KEEPERS])

    return np.nan if pd.isnull(biz_name) else biz_name
Example #6
0
def clean_name(full_name):
    """
    Tag a name with probablepeople and return its components as a dict.

    The name is upper-cased and each pair of consecutive spaces is replaced
    with ``', '`` before tagging. The returned dict holds the tagged
    components plus ``name_type`` (the type reported by probablepeople) and
    ``original_name`` (the pre-processed input). Leading and trailing periods
    are stripped from the fields listed in ``people_fields_to_strip_periods``.

    If anything raises, the error is printed and the exception object itself
    is returned instead of a dict.
    """
    try:
        original_name = full_name.upper().replace('  ', ', ')

        # Make edits to original_name before calling probablepeople
        tagged = probablepeople.tag(original_name)

        name_dict = dict(tagged[0])
        name_dict['name_type'] = tagged[1]
        name_dict['original_name'] = original_name

        # Remove surrounding '.' from any abbreviations
        for field in people_fields_to_strip_periods:
            if field not in name_dict:
                continue
            name_dict[field] = name_dict[field].strip('.')

        return name_dict
    except Exception as e:
        print('probablepeople ERROR:')
        print(e)

        return e
Example #7
0
def parse_contact_name(row, name_cols, strict=False, type='person'):
    """Parses a person's name with probablepeople library

    Concatenates all the contact name columns into a single string and then attempts to parse it
    into standardized name components and return a subset of the name parts that are useful for
    comparing contacts. This process eliminates notes and other non-name text from dirty data.

    Args:
        row (pd.Series): A record
        name_cols (list): A list of column names in the record, in order, that when concatenated
            comprise a person's name
        strict (boolean, optional): Whether or not to re-raise a RepeatedLabelError when parsing;
            if False, the parse produced by ``find_repeated_label`` is used instead
        type (str): Which probableparser to use: 'generic', 'person' or 'company'

    Returns:
        A subset (tuple of str, or np.nan) of the standardized name components, namely:
            (title, first, last, full_name)
        A component that was not tagged is ``np.nan``; ``full_name`` is ``np.nan`` whenever
        ``first`` or ``last`` is.

    Raises:
        probablepeople.RepeatedLabelError: Only when ``strict`` is true.
    """

    row = row.fillna('')
    # str() keeps the join from raising TypeError on non-string cells
    # (numbers, dates), matching what parse_business_name does.
    concat = ' '.join(str(row.get(col, '')) for col in name_cols)

    # Drop "no value" placeholders before tagging.
    cleaned = re.sub(r'(not\s*available|not\s*provided|n/a)',
                     '',
                     concat,
                     flags=re.IGNORECASE)

    try:
        parsed = probablepeople.tag(cleaned, type)
    except probablepeople.RepeatedLabelError:
        if strict:
            raise

        problem_key, problem_vals, parsed = find_repeated_label(cleaned)
        # Mimic the (components, name type) shape returned by tag().
        parsed = (parsed, '')

    components = parsed[0]
    title = components.get('PrefixOther', np.nan)
    first = components.get('GivenName', np.nan)
    last = components.get('Surname', np.nan)
    try:
        full_name = first + ' ' + last
    except TypeError:
        # first and/or last is np.nan (a float), so no full name can be built.
        full_name = np.nan

    return title, first, last, full_name
Example #8
0
def get_names(fn):
    """
    Split a vCard 'fn' field into a first name and a surname.

    probablepeople is tried first. Whatever it does not provide (including
    when it cannot be imported) is filled in by splitting the string on
    single spaces: the first piece is the first name, the second piece, if
    there is one, is the surname.

    :param fn: the input vCard 'fn' field.
    :return: a ``Names`` value containing the first name and surname.

    >>> get_names('John Smith')
    Names(first_name='John', surname='Smith')
    """
    first_name = surname = None

    try:
        import probablepeople as pp  # not python 2.6 compatible

        # Use probablepeople to tag the parts of the name; a label it did not
        # produce leaves the corresponding part as None.
        tagged = pp.tag(fn)[0]
        first_name = tagged.get('GivenName')
        surname = tagged.get('Surname')
    except (ImportError, SyntaxError):
        # Best effort: fall through to the plain split below.
        pass

    pieces = fn.split(" ")

    if first_name is None:
        # If we can't get first name from probablepeople, assume it's the
        # first part of the string.
        first_name = pieces[0]

    if surname is None:
        # If we can't get surname from probablepeople, assume it's the
        # second part of the string, if that exists.
        surname = pieces[1] if len(pieces) > 1 else ""

    return Names(first_name, surname)
Example #9
0
 def _lookup_rsvps(self, rsvp):
     """Yield a header row followed by one data row per RSVP.

     When ``self.private`` is true the rows also carry the user id, username
     and email. Names come from the RSVP's own ``name`` (split with
     probablepeople) when it is set, otherwise from the linked user record if
     there is one. Every data row is passed through ``unicode_convert`` cell
     by cell before being yielded.
     """
     header = ["Full Name", "First Name", "Last Name", "Guests"]
     if self.private:
         header = [
             "User Id", "Username", "Full Name", "First Name", "Last Name",
             "Email", "Guests",
         ]
     yield header

     for item in rsvp:
         first_name = last_name = full_name = ""
         if item.name:
             full_name = item.name
             try:
                 # try to parse out the user first/last name; a part that is
                 # not tagged comes back as None
                 parsed = probablepeople.tag(full_name)
                 first_name = parsed[0].get("GivenName")
                 last_name = parsed[0].get("Surname")
             except Exception:
                 logger.exception("unable to parse person %s", full_name)
         elif item.user:
             # no name on the RSVP, but there is a db record for this user:
             # lookup the user's name from the db
             first_name = item.user.first_name
             last_name = item.user.last_name
             try:
                 full_name = item.user.profile.display_name
             except Exception:
                 logger.exception("Unable to access user profile.")

         if self.private:
             account = [
                 item.user.id if item.user else "",
                 item.user.username if item.user else "",
             ]
             row = account + [
                 full_name, first_name, last_name, item.email, item.guests,
             ]
         else:
             row = [full_name, first_name, last_name, item.guests]
         yield [unicode_convert(cell) for cell in row]
Example #10
0
def display_name_to_rsvps(apps, schema_editor):
    """Copy lower-cased first/last names from profile display names to RSVPs.

    Does nothing (returns ``None``) when probablepeople is unavailable. For
    every ``UserProfile``, the display name is tagged, and the lower-cased
    given name and surname (``''`` when not tagged) are saved on each ``RSVP``
    whose ``user`` matches the profile's primary key.

    :param apps: registry used to look up the models via ``get_model``.
    :param schema_editor: unused here.
    """
    if not has_probablepeople:
        return None

    UserProfile = apps.get_model('profiles', 'UserProfile')
    RSVP = apps.get_model('meetings', 'RSVP')

    for profile in UserProfile.objects.all():
        name_parts = probablepeople.tag(profile.display_name)[0]
        first_name = name_parts.get('GivenName', '').lower()
        last_name = name_parts.get('Surname', '').lower()

        for rsvp in RSVP.objects.filter(user=profile.pk):
            rsvp.first_name = first_name
            rsvp.last_name = last_name
            rsvp.save()
Example #11
0
 def post(self):
     """Classify the submitted party name and return the result as JSON.

     The name type reported by probablepeople is returned with 'High'
     confidence when it is 'Person' or 'Corporation'; any other type is
     reported as 'Person' with 'Low' confidence.
     """
     args = partyTypeParser.parse_args()
     party_name_text = args['partyName']
     # The last element of the tag() result is the detected name type.
     detected = pp.tag(party_name_text)[-1]

     if detected in ('Person', 'Corporation'):
         confidence, party_type = 'High', detected
     else:
         confidence, party_type = 'Low', 'Person'

     return jsonify(partyName=party_name_text,
                    partyTypeConfidence=confidence,
                    partyType=party_type)
Example #12
0
def get_names(name_str: str) -> Tuple[List[Name], Optional[str]]:
    """Parse *name_str* into a list of ``Name`` objects plus its category.

    Whitespace-separated components whose lower-cased form is in
    ``remove_list`` are dropped and the rest are title-cased before tagging.
    A corporation yields a single ``Name`` carrying the cleaned string as
    ``entity_name``; a person is built from the tagged parts; anything else
    is built from the token-level parse. When tagging fails with
    ``RepeatedLabelError`` the names are built from the error's
    ``parsed_string`` and the category is ``None``.

    An empty or ``None`` input returns ``([], None)``.
    """
    if not name_str:
        return [], None

    kept_components = []
    for component in name_str.split():
        if component.lower() not in remove_list:
            kept_components.append(component.title())
    name_str = " ".join(kept_components)

    try:
        tagged, category = tag(name_str)
        if category == CORP:
            return [Name(entity_name=name_str)], category
        if category == PERSON:
            return tagged_name_retrieve(tagged), PERSON
        # Household (ie: multiple names) should be the only other option
        return parsed_name_retrieve(parse(name_str)), category
    except RepeatedLabelError as e:
        # this is actually the same as parse(<name>)
        return parsed_name_retrieve(e.parsed_string), None
Example #13
0
 def _lookup_rsvps(self, rsvp):
     """Generate export rows for the given RSVPs, header row first.

     In private mode each row additionally contains the user id, username
     and email. An RSVP with its own ``name`` is split into first/last name
     with probablepeople; one without a name falls back to the linked user
     record, if any. Each data row is converted cell by cell with
     ``unicode_convert`` before it is yielded.
     """
     name_columns = ["Full Name", "First Name", "Last Name"]
     if self.private:
         yield ["User Id", "Username"] + name_columns + ["Email", "Guests"]
     else:
         yield name_columns + ["Guests"]

     for item in rsvp:
         first_name = last_name = full_name = ""
         if item.name:
             full_name = item.name
             try:
                 # try to parse out the user first/last name; .get() gives
                 # None for a part that was not tagged
                 name_parts = probablepeople.tag(full_name)[0]
                 first_name = name_parts.get("GivenName")
                 last_name = name_parts.get("Surname")
             except Exception:
                 logger.exception("unable to parse person %s", full_name)
         elif item.user:
             # if no name is defined and there is a db record for this user,
             # lookup the user's name from the db
             first_name = item.user.first_name
             last_name = item.user.last_name
             try:
                 full_name = item.user.profile.display_name
             except Exception:
                 logger.exception("Unable to access user profile.")

         if self.private:
             row = [
                 item.user.id if item.user else "",
                 item.user.username if item.user else "",
                 full_name, first_name, last_name,
                 item.email, item.guests,
             ]
         else:
             row = [full_name, first_name, last_name, item.guests]

         converted = [unicode_convert(cell) for cell in row]
         yield converted
Example #14
0
    def get_candidate_fields(self, raw_result):
        """Build the candidate field dict for *raw_result*.

        A full name of 'yes'/'no' is delegated to
        ``get_judge_candidate_fields``. 'no candidate' and
        'candidate withdrew' produce a ``None`` full name. Otherwise the
        stripped name is tagged with probablepeople: for a person the given
        name, family name and, when tagged, the generational suffix and
        nickname are filled in. Names that are not a person, or that cannot
        be tagged, are reported on stdout and only ``full_name`` is set.
        """
        fields = self._get_fields(raw_result, candidate_fields)
        full_name = raw_result.full_name.strip()

        if full_name.lower() in ['yes', 'no']:
            return self.get_judge_candidate_fields(raw_result)

        if full_name.lower() in ['no candidate', 'candidate withdrew']:
            fields['full_name'] = None
            return fields

        banner = "***************************"
        try:
            name_parts, name_type = pp.tag(full_name)
        except pp.RepeatedLabelError:
            print(banner)
            print("UNABLE TO TAG: %s" % (full_name,))
            print(banner)
        else:
            if name_type != 'Person':
                # Reported before full_name is overwritten below, so the
                # value printed is the one that came from _get_fields.
                print(banner)
                print("NOT A PERSON: %s" % (fields['full_name'],))
                print("fields: %s" % (fields,))
                print("tagged name: %s" % (name_parts,))
                print(banner)
            else:
                fields['given_name'] = name_parts.get('GivenName')
                fields['family_name'] = name_parts.get('Surname')
                if 'SuffixGenerational' in name_parts:
                    fields['suffix'] = name_parts['SuffixGenerational']
                if 'Nickname' in name_parts:
                    fields['additional_name'] = name_parts['Nickname']

        # Every path that reaches this point stores the stripped name.
        fields['full_name'] = full_name
        return fields
Example #15
0
    def _valid_name(self, possible_name):
        """Check with probablepeople whether possible_name is a person name.

        Returns ``(True, name)`` when the input is tagged as a 'Person' with
        both a given name and a surname; ``name`` is every tagged fragment
        capitalized and joined by single spaces. Returns ``(False, '')``
        otherwise, including when tagging raises ``RepeatedLabelError`` or
        ``UnicodeEncodeError``.
        """
        rejected = (False, '')
        try:
            name_dict, name_type = pp.tag(possible_name)

            if name_type != 'Person':
                return rejected

            if 'GivenName' not in name_dict or 'Surname' not in name_dict:
                return rejected

            capitalized = [part.capitalize() for part in name_dict.values()]
            return True, ' '.join(capitalized)

        except (pp.RepeatedLabelError, UnicodeEncodeError):
            # Just ignore the name on errors
            return rejected
Example #16
0
 def post(self):
     """Tag the submitted party name and return parse and type as JSON.

     The response contains the submitted name, the tagged components as a
     plain dict, and a ``partyType`` dict. 'Person' and 'Corporation' are
     reported with 'High' confidence, 'Household' with 'Medium'; any other
     detected type is reported as 'Person' with 'Low' confidence.
     """
     args = partyName.parse_args()
     party_name_text = args['partyName']
     tagged = pp.tag(party_name_text)
     # The last element of the tag() result is the detected name type.
     detected = tagged[-1]
     parsed_party = dict(tagged[0])

     confidence_by_type = {
         'Person': 'High',
         'Corporation': 'High',
         'Household': 'Medium',
     }
     if detected in confidence_by_type:
         party_type = {
             'partyTypeConfidence': confidence_by_type[detected],
             'partyType': detected,
         }
     else:
         party_type = {
             'partyTypeConfidence': 'Low',
             'partyType': 'Person',
         }

     return jsonify(submittedPartyName=party_name_text,
                    parsedParty=parsed_party,
                    partyType=party_type)
Example #17
0
def normalize_name(name):
    """
    Normalize a name for sorting.

    Two libraries are combined: `probablepeople` only decides whether the
    name is a company or a person, and `nameparser` does the actual splitting
    of anything that is not a company.

    Company names, and names `probablepeople` cannot tag, are returned as the
    stripped utf-8 encoding of the input. Everything else is rebuilt as
    ``Last [suffix], [title] [First] [Middle]`` with the last, first and
    middle names capitalized.
    """
    sname = name.encode('utf-8').strip()  # remove spaces
    try:
        # probablepeople doesn't understand utf-8 encoding. Hand it pure unicode.
        _, name_type = probablepeople.tag(name)  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, punt
        return sname

    if name_type == 'Corporation':
        # do not parse and reorder company names
        return sname

    # treat anything else as a human name
    nameparts = HumanName(sname)

    sort_key = nameparts.last.capitalize()
    if nameparts.suffix:
        sort_key = sort_key + ' ' + nameparts.suffix

    # Space-separated segments that follow "Last [suffix],".
    segments = [sort_key + ',']
    if nameparts.title:
        segments.append(nameparts.title)
    if nameparts.first:
        segments.append(nameparts.first.capitalize())
    if nameparts.middle:
        segments.append(nameparts.middle.capitalize())

    return ' '.join(segments).strip()
Example #18
0
def normalize_name(name):
    """
    Normalize a name for sorting.

    `probablepeople` is used purely as a company-vs-person discriminator;
    names it does not classify as a company are split by `nameparser`
    instead of by `probablepeople`'s own parser.

    Company names, and names `probablepeople` cannot tag, are returned
    stripped but otherwise unchanged. Everything else is rebuilt as
    ``Last [suffix], [title] [First] [Middle]`` with the last, first and
    middle names capitalized.
    """
    sname = name.strip()  # remove spaces
    try:
        name_type = probablepeople.tag(sname)[1]  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, punt
        return sname

    if name_type == 'Corporation':
        # do not parse and reorder company names
        return sname

    # treat anything else as a human name
    nameparts = HumanName(sname)

    surname_part = nameparts.last.capitalize()
    if nameparts.suffix:
        surname_part += ' ' + nameparts.suffix
    pieces = [surname_part + ',']

    if nameparts.title:
        pieces.append(nameparts.title)
    if nameparts.first:
        pieces.append(nameparts.first.capitalize())
    if nameparts.middle:
        pieces.append(nameparts.middle.capitalize())

    return ' '.join(pieces).strip()
def extract_entity(text):
    """Tag *text* with probablepeople and map the result onto a fixed schema.

    Returns ``{'type': ..., 'data': {...}}``. For a corporation the type is
    'Organization' and data may hold ``name`` and ``legal_type``; for a
    person the type is 'Person' and data may hold ``prefix``, ``first_name``,
    ``last_name`` and ``nick_name``. ``type`` stays ``''`` and ``data`` stays
    empty when the detected type is neither, or when none of the mapped
    labels were tagged.
    """
    tags, entity_type = pp.tag(text)

    # Output field name for each probablepeople label, per entity type.
    corporate_fields = {
        'CorporationName': 'name',
        'CorporationLegalType': 'legal_type',
    }
    person_fields = {
        'PrefixMarital': 'prefix',
        'GivenName': 'first_name',
        'Surname': 'last_name',
        'Nickname': 'nick_name',
    }

    # fix the schema output
    output = {'type': '', 'data': {}}

    if entity_type == 'Corporation':
        schema_type, field_for = 'Organization', corporate_fields
    elif entity_type == 'Person':
        schema_type, field_for = 'Person', person_fields
    else:
        return output

    for key in tags:
        if key not in field_for:
            continue
        # The type is only set once at least one mapped label is present.
        output['type'] = schema_type
        output['data'][field_for[key]] = tags[key]

    return output
Example #20
0
    def export(self, property_state):
        """
        Export HPXML file from an existing HPXML file (from import) merging in the data from property_state
        :param property_state:  object, PropertyState to merge into HPXMLs
        :return: string, as XML
        """
        if not property_state:
            f = BytesIO()
            self.tree.write(f,
                            encoding='utf-8',
                            pretty_print=True,
                            xml_declaration=True)
            return f.getvalue()

        if self.tree is None:
            tree = objectify.parse(os.path.join(here, 'schemas', 'blank.xml'),
                                   parser=hpxml_parser)
            root = tree.getroot()
        else:
            root = deepcopy(self.root)

        bldg = self._get_building(
            property_state.extra_data.get('hpxml_building_id'),
            start_from=root)

        for pskey, xml_loc in self.HPXML_STRUCT.items():
            value = getattr(property_state, pskey)
            el = self.xpath(xml_loc['path'], start_from=bldg, only_one=True)
            if pskey == 'energy_score':
                continue
            if value is None and self.tree is None:
                el.getparent().remove(el)
            if value is None or el is None:
                continue

            # set the value to magnitude if it is a quantity
            if isinstance(value, ureg.Quantity):
                value = value.magnitude
            setattr(el.getparent(), el.tag[el.tag.index('}') + 1:],
                    str(value) if not isinstance(value, basestring) else value)

        E = objectify.ElementMaker(annotate=False,
                                   namespace=self.NS,
                                   nsmap={None: self.NS})

        # Owner Information
        owner = self.xpath((
            '//h:Customer/h:CustomerDetails/h:Person'
            '[not(h:IndividualType) or h:IndividualType = "owner-occupant" or h:IndividualType = "owner-non-occupant"]'
        ),
                           start_from=root)

        if len(owner) > 0:
            owner = owner[0]
        else:
            customer = E.Customer(
                E.CustomerDetails(
                    E.Person(E.SystemIdentifier(id='person1'), E.Name())))
            root.Building.addprevious(customer)
            owner = customer.CustomerDetails.Person

        # Owner Name
        if property_state.owner is not None:
            try:
                owner_name, name_type = pp.tag(property_state.owner,
                                               type='person')
            except pp.RepeatedLabelError:
                pass
            else:
                if name_type.lower() == 'person':
                    owner.Name.clear()
                    if 'PrefixMarital' in owner_name or 'PrefixOther' in owner_name:
                        owner.Name.append(
                            E.PrefixName(' '.join([
                                owner_name.get('Prefix' + x, '')
                                for x in ('Marital', 'Other')
                            ]).strip()))
                    if 'GivenName' in owner_name:
                        owner.Name.append(E.FirstName(owner_name['GivenName']))
                    elif 'FirstInitial' in owner_name:
                        owner.Name.append(
                            E.FirstName(owner_name['FirstInitial']))
                    else:
                        owner.Name.append(E.FirstName())
                    if 'MiddleName' in owner_name:
                        owner.Name.append(
                            E.MiddleName(owner_name['MiddleName']))
                    elif 'MiddleInitial' in owner_name:
                        owner.Name.append(
                            E.MiddleName(owner_name['MiddleInitial']))
                    if 'Surname' in owner_name:
                        owner.Name.append(E.LastName(owner_name['Surname']))
                    elif 'LastInitial' in owner_name:
                        owner.Name.append(E.LastName(
                            owner_name['LastInitial']))
                    else:
                        owner.Name.append(E.LastName())
                    if 'SuffixGenerational' in owner_name or 'SuffixOther' in owner_name:
                        owner.Name.append(
                            E.SuffixName(' '.join([
                                owner_name.get('Suffix' + x, '')
                                for x in ('Generational', 'Other')
                            ]).strip()))

        # Owner Email
        if property_state.owner_email is not None:
            new_email = E.Email(E.EmailAddress(property_state.owner_email),
                                E.PreferredContactMethod(True))
            if hasattr(owner, 'Email'):
                if property_state.owner_email not in owner.Email:
                    owner.append(new_email)
            else:
                owner.append(new_email)

        # Owner Telephone
        if property_state.owner_telephone is not None:
            insert_phone_number = False
            if hasattr(owner, 'Telephone'):
                if property_state.owner_telephone not in owner.Telephone:
                    insert_phone_number = True
            else:
                insert_phone_number = True
            if insert_phone_number:
                new_phone = E.Telephone(
                    E.TelephoneNumber(property_state.owner_telephone),
                    E.PreferredContactMethod(True))
                inserted_phone_number = False
                for elname in ('Email', 'extension'):
                    if hasattr(owner, elname):
                        getattr(owner, elname).addprevious(new_phone)
                        inserted_phone_number = True
                        break
                if not inserted_phone_number:
                    owner.append(new_phone)

        # Owner Address
        try:
            address = owner.getparent().MailingAddress
        except AttributeError:
            owner.getparent().Person[-1].addnext(E.MailingAddress())
            address = owner.getparent().MailingAddress
        address.clear()
        if property_state.owner_address is not None:
            address.append(E.Address1(property_state.owner_address))
        if property_state.owner_city_state is not None:
            city_state, _ = usadd.tag(property_state.owner_city_state)
            address.append(E.CityMunicipality(city_state.get('PlaceName', '')))
            address.append(E.StateCode(city_state.get('StateName', '')))
        if property_state.owner_postal_code is not None:
            address.append(E.ZipCode(property_state.owner_postal_code))

        # Building Certification / Program Certificate
        program_certificate_options = [
            'Home Performance with Energy Star', 'LEED Certified',
            'LEED Silver', 'LEED Gold', 'LEED Platinum', 'other'
        ]
        if property_state.building_certification is not None:
            try:
                root.Project
            except AttributeError:
                root.Building[-1].addnext(
                    E.Project(
                        E.BuildingID(id=bldg.BuildingID.get('id')),
                        E.ProjectDetails(
                            E.ProjectSystemIdentifiers(
                                id=bldg.BuildingID.get('id')))))
            new_prog_cert = E.ProgramCertificate(
                property_state.building_certification if property_state.
                building_certification in
                program_certificate_options else 'other')
            try:
                root.Project.ProjectDetails.ProgramCertificate
            except AttributeError:
                for elname in ('YearCertified', 'CertifyingOrganizationURL',
                               'CertifyingOrganization', 'ProgramSponsor',
                               'ContractorSystemIdentifiers', 'ProgramName',
                               'ProjectSystemIdentifiers'):
                    if hasattr(root.Project.ProjectDetails, elname):
                        getattr(root.Project.ProjectDetails,
                                elname).addnext(new_prog_cert)
                        break
            else:
                if property_state.building_certification not in root.Project.ProjectDetails.ProgramCertificate:
                    root.Project.ProjectDetails.ProgramCertificate[-1].addnext(
                        new_prog_cert)

        # Energy Score
        energy_score_type_options = ['US DOE Home Energy Score', 'RESNET HERS']
        bldg_const = bldg.BuildingDetails.BuildingSummary.BuildingConstruction
        if property_state.energy_score:
            energy_score_type = property_state.extra_data.get(
                'energy_score_type')
            try:
                found_energy_score = False
                for energy_score_el in bldg_const.EnergyScore:
                    if energy_score_type in (energy_score_el.ScoreType,
                                             getattr(energy_score_el,
                                                     'OtherScoreType', None)):
                        found_energy_score = True
                        break
                if not found_energy_score:
                    energy_score_el = E.EnergyScore()
                    bldg_const.EnergyScore[-1].addnext(energy_score_el)
            except AttributeError:
                energy_score_el = E.EnergyScore()
                try:
                    bldg_const.extension.addprevious(energy_score_el)
                except AttributeError:
                    bldg_const.append(energy_score_el)
            if energy_score_type in energy_score_type_options:
                energy_score_el.ScoreType = energy_score_type
            else:
                energy_score_el.ScoreType = 'other'
                energy_score_el.OtherScoreType = energy_score_type
            energy_score_el.Score = property_state.energy_score

        # Serialize
        tree = etree.ElementTree(root)
        objectify.deannotate(tree, cleanup_namespaces=True)
        f = BytesIO()
        tree.write(f,
                   encoding='utf-8',
                   pretty_print=True,
                   xml_declaration=True)
        return f.getvalue()
 def test_basic(self):
     # Smoke test: a plain two-token name should be typed as a person and
     # split into given name and surname.
     parsed, kind = tag("Bob Belcher")
     assert kind == "Person"
     assert parsed["GivenName"] == "Bob"
     assert parsed["Surname"] == "Belcher"
Example #22
0
	def get_contest_args(self, chicago_args, position, seen_ballot_measure):
		"""
		Classify a contest from its position title.

		The lower-cased title is always stored in ``chicago_args['office']``.
		The title is then checked, in order, against a skip list, a list of
		known offices, and finally (only while ``seen_ballot_measure`` is
		falsy) the probablepeople tagger.

		:param chicago_args: dict of contest attributes; mutated in place.
		:param position: the contest / office title.
		:param seen_ballot_measure: when truthy, any title that matches
			neither list is treated as a ballot measure without being tagged.
		:return: ``(None, None)`` when the contest should be skipped or the
			tagger raises ``RepeatedLabelError``; otherwise
			``(is_ballot_measure, chicago_args)``.
		"""
		# Substrings are matched against the lower-cased title. Misspelled
		# entries are kept on purpose so titles spelled that way still match.
		known_offices = (
			# national
			'president of the united states',
			'president and vice president of the united states',
			'pres and vice pres',
			'president, u.s.',
			'senator, u.s.',
			'united states senator',
			'u.s. senator',
			'u.s. representative',
			'representative in congress',
			'rep. in congress',

			# state
			'governor',
			'lieutenant governor',
			'governor & lieutenant governor',
			'governor and lieutenant governor',
			'secretary of state',
			'attorney general',
			'state\'s attorney',
			'comptroller',
			'treasurer',
			'state senator',
			'state representative',
			'rep. in general assembly',
			'rep. in gen. assembly',

			# county
			'commissioner',
			'board president',
			'president cook county board comm',
			'clerk',
			'sheriff',
			'treasurer',
			'assessor',
			'commissioner, county board',
			'board of review',
			'recorder of deeds',

			'supreme court',
			'appellate court',
			'apellate court',
			'judge, cook county circuit',
			'circuit court',
			'circuit couut',
			'subcircuit',

			# city
			'mayor',
			'alderman',
			'committeeman',
		)

		offices_to_skip = (
			'ballots cast',
			'registered voters',
			'amendment',
			'national convention',
			'natl. convention',
			'delegate natl',
			'delegates natl',
			'state central committeeman',
			'state central',
		)

		# Lower-case once instead of on every comparison.
		office = position.lower()
		chicago_args['office'] = office

		# The skip list wins over the known-office list.
		if any(fragment in office for fragment in offices_to_skip):
			return None, None

		if any(fragment in office for fragment in known_offices):
			if 'retain' in office:
				chicago_args['is_retention'] = True
			return False, chicago_args

		if not seen_ballot_measure:
			# At this point the title is none of the above: a title that
			# parses as a person's name is treated as a retention race.
			try:
				_, name_type = pp.tag(office)
			except pp.RepeatedLabelError:
				# Parenthesised form prints the same text on Python 2 and 3.
				print("REPEATED LABEL ERROR")
				return None, None

			if name_type == 'Person':
				chicago_args['is_retention'] = True
				return False, chicago_args

		# Everything else is a ballot measure.
		chicago_args['is_ballot_measure'] = True
		return True, chicago_args
Example #23
0
def normalize_name(name):
    """
    Return a sortable/indexable form of ``name``.

    `probablepeople` is used only as a classifier: when it labels the input
    a 'Corporation' the stripped input is returned unchanged.  The same
    happens when the module-level ``adjacent_caps`` pattern matches.

    Everything else is parsed with `nameparser`'s ``HumanName`` and
    reassembled as ``"<last> <suffix>, <title> <first> <middle>"``, with
    missing parts omitted.
    """
    stripped = name.strip()  # remove leading and trailing spaces

    # The recognizer tends to mistake concatenated initials for a corporation
    # name, so insert a space after any "X." that is directly followed by
    # another capital letter.  (?=[A-Z]) is a lookahead: it requires the
    # capital to be there without consuming it.
    spaced = re.sub("(?P<thing>[A-Z]\\.)(?=[A-Z])", "\\g<thing> ", stripped)

    try:
        # Only the detected type is needed; the parse itself is discarded.
        _, kind = probablepeople.tag(spaced)
    except probablepeople.RepeatedLabelError:
        # The tagger could not make sense of the name.
        kind = 'Unknown'

    # Company names (by classifier or by the capitalization special case)
    # are neither parsed nor reordered.
    if kind == 'Corporation' or adjacent_caps.match(stripped):
        return stripped

    # Treat anything else as a human name.
    person = HumanName(spaced)

    # "<last> <suffix>," forms the sort key; the comma is only added when at
    # least one of the two is present.
    sort_key = ' '.join(part for part in (person.last, person.suffix) if part)
    if sort_key:
        sort_key += ','

    remainder = (person.title, person.first, person.middle)
    pieces = [sort_key] + [part for part in remainder if part]
    return ' '.join(piece for piece in pieces if piece).strip()
    if row_dict[contributor_name_field] == 'Total of Contributions not exceeding $100':
        is_annonymous = 1

    else:

        full_name = contributor_name = row_dict[contributor_name_field]

        bad_name = False
        bad_addy = False

        contributor_name = row_dict[contributor_name_field]
        #print('contributor_name_field:', contributor_name)

        try:
            probablepeople_name = probablepeople.tag(contributor_name)
            #print('probablepeople_name:', probablepeople_name)

            name_type = probablepeople_name[1]

            name_dict = dict(probablepeople_name[0])
            #print('name_dict:', name_dict)

        except Exception as e:
            name_type = 'Unknown'
            name_dict = {'CorporationName': contributor_name}

        if type(name_dict) is not dict:

            print('name ERROR:')
            bad_name = True
Example #25
0
    def parse_name(self, nameString):
        """
        Split a name string into primary / secondary / reference parts and
        classify it as an organization or a person.

        Notes: name cleaning is expected to have been done already, and the
        name is forced to upper case here.

        :param nameString: the name to parse.
        :return: a list of single-key dicts, in this order:
            IS_ORGANIZATION, PRIMARY_NAME_ORG, SECONDARY_NAME_ORG,
            PRIMARY_NAME_FULL, SECONDARY_NAME_FULL, REFERENCE_NAME.
        """
        #--remove garbage expressions from the string
        nameString = nameString.upper()
        for garbageValue in self.variantData['GARBAGE_VALUES']:
            if garbageValue in nameString:
                self.updateStat('GARBAGE_IN_NAMES', garbageValue, nameString)
                nameString = nameString.replace(garbageValue, '').strip()
        newString = nameString

        primaryNameTokens = []
        secondaryNameTokens = []
        referenceNameTokens = []

        #--move parenthesised groups to the reference tokens
        #--(raw string: '\(' is not a valid str escape)
        for groupedString in re.findall(r'\(.*?\)', newString):
            self.updateStat('GROUPED_STRINGS', '()',
                            newString + ' | ' + groupedString)
            newString = newString.replace(groupedString, '')
            referenceNameTokens.append(groupedString)

        #--split the name; separators are padded so they become their own tokens
        #--(newString is already upper case, so no second upper() is needed)
        theToken = None
        split = 0
        spacedString = (newString.replace('.', ' ').replace(',', ' ')
                        .replace('-', ' - ').replace('/', ' / ')
                        .replace(';', ' ; '))
        for token in spacedString.split():
            if split == 1:
                #--everything after a split token is the secondary name
                secondaryNameTokens.append(token)
            elif split == 2:
                #--everything after an ender token is reference data
                referenceNameTokens.append(token)
            elif token in self.variantData['NAME_SPLIT_TOKENS']:
                #--token is skipped
                split = 1
                theToken = token
            elif token in self.variantData['NAME_ENDER_TOKENS']:
                #--the ender token itself stays in the primary name
                primaryNameTokens.append(token)
                split = 2
                theToken = token
            else:
                primaryNameTokens.append(token)

        primaryNameStr = ' '.join(primaryNameTokens)
        secondaryNameStr = ' '.join(secondaryNameTokens)
        referenceNameStr = ' '.join(referenceNameTokens)

        if secondaryNameStr:
            self.updateStat(
                'NAME_SPLITERS', theToken, nameString + ' -> ' +
                primaryNameStr + ' | ' + secondaryNameStr)
        if referenceNameStr and split == 2:
            self.updateStat(
                'NAME_ENDERS', theToken, nameString + ' -> ' + primaryNameStr +
                ' | ' + referenceNameStr)

        #--probable people parser
        if pp:
            #--pp.tag(name_str) # expected output: (OrderedDict([('PrefixMarital', 'Mr'), ('GivenName', 'George'), ('Nickname', '"Gob"'), ('Surname', 'Bluth'), ('SuffixGenerational', 'II')]), 'Person')
            #--pp.tag(corp_str) # expected output: (OrderedDict([('CorporationName', 'Sitwell Housing'), ('CorporationLegalType', 'Inc')]), 'Corporation')
            #--only the returned type is used: anything that is not 'Person'
            #--is treated as an organization
            try:
                _, nameType = pp.tag(primaryNameStr)
                isOrganization = nameType != 'Person'
                self.updateStat('ProbablePeople', nameType, primaryNameStr)
            except Exception:
                #--best effort: fall back to the home grown parser on any
                #--tagger failure, without trapping KeyboardInterrupt/SystemExit
                isOrganization = self.is_organization_name(primaryNameStr)

        #--home grown parser
        else:
            isOrganization = self.is_organization_name(primaryNameStr)

        if isOrganization:
            primaryNameOrg = primaryNameStr
            secondaryNameOrg = secondaryNameStr
            primaryNameFull = ''
            secondaryNameFull = ''
        else:
            primaryNameOrg = ''
            secondaryNameOrg = ''
            primaryNameFull = primaryNameStr
            secondaryNameFull = secondaryNameStr

        #--report the computed classification (it used to be hard-coded True)
        nameList = [
            {'IS_ORGANIZATION': isOrganization},
            {'PRIMARY_NAME_ORG': primaryNameOrg},
            {'SECONDARY_NAME_ORG': secondaryNameOrg},
            {'PRIMARY_NAME_FULL': primaryNameFull},
            {'SECONDARY_NAME_FULL': secondaryNameFull},
            {'REFERENCE_NAME': referenceNameStr},
        ]

        return nameList
Example #26
0
 def tagger(self, field):
     # Thin adapter: hand the raw field straight to probablepeople and pass
     # its result back unchanged.
     tagged = probablepeople.tag(field)
     return tagged
Example #27
0
from sqlalchemy import create_engine
import csv
import probablepeople as pp
# Export every distinct party name, with its probablepeople type and the
# given name / surname when they were recognised, to out.csv.
# NOTE: the dialect is spelled "postgresql"; SQLAlchemy 1.4+ no longer
# accepts the old "postgres://" alias.
conn = create_engine("postgresql://brian@localhost:5432/acris")
c = conn.raw_connection().cursor()
c.execute("select distinct name from parties")

keys_of_interest = ['GivenName', 'Surname']
all_keys = set()  # every label the tagger produced, reported at the end

# setup output; the context manager guarantees the file is flushed and
# closed even if the loop fails part-way through
with open("out.csv", "w+", newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for one in c:
        name, = one
        out = [name]
        try:
            parsed, t = pp.tag(name)
        except Exception:
            # Deliberate best effort: a name the tagger cannot handle is
            # still written, with only the name column filled in.
            pass
        else:
            out.append(t)
            out.extend(parsed.get(key, '') for key in keys_of_interest)
            # track what else we might get out
            all_keys.update(parsed)
        csvwriter.writerow(out)

print("Done! Here's what else we could have extracted")
print(all_keys)
Example #28
0
out_fn = '../data/cms-physician-permutation-tagged.csv'

tagged_rows = []  # every input row, augmented with its tagging result
tags = set()      # union of all column names seen across the rows

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Reading and parsing')
# NOTE: 'rb' is the mode the Python 2 csv module expects; this script is
# still Python 2 in that respect.
with open(in_fn, 'rb') as incsvfile:
    reader = csv.DictReader(incsvfile, delimiter=',')
    counter = 0
    for row in reader:
        try:
            tagged = pp.tag(row['name'])
        except pp.RepeatedLabelError:
            row['tag_type'] = 'RepeatedLabelError'
        else:
            # add t_ prefix to distinguish tagger output from input columns
            tagged0 = OrderedDict(
                ('t_' + k, v) for k, v in tagged[0].items())
            # add type
            tagged0['tag_type'] = tagged[1]
            # add to dictionary of input values
            row.update(tagged0)
        # set.update replaces a list comprehension that was run only for
        # its side effects
        tags.update(row.keys())
        tagged_rows.append(row)
        counter += 1

print('Read {:,} rows'.format(counter))
Example #29
0
 def tag_ppl(x):
     """
     Return the probablepeople type label for ``x``, or NaN on failure.

     The value is lower-cased before tagging.  Any ordinary error (for
     example a non-string value, or a name the tagger rejects) yields
     ``np.nan`` instead of raising.
     """
     try:
         return probablepeople.tag(x.lower())[1]
     except Exception:
         # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
         # and SystemExit still propagate.
         return np.nan
Example #30
0
# Tokens whose presence (as a whole space-separated word) marks a name as a
# corporation "by our definition", so it is tagged without reordering.
_CORPORATE_TOKENS = frozenset([
    "CO", "LLC", "TRUST", "LL", "LP", "DEPARTMENT", "PLAN", "OF", "INC",
    "FAMILY", "PROPERTIES", "REVOCABLE", "ESTATES", "&", "INVESTMENTS",
])


def _first_present(parsed, keys):
    """Return the value of the first of ``keys`` found in ``parsed``, else ''."""
    for key in keys:
        if key in parsed:
            return parsed[key]
    return ''


def _reorder_surname_first(raw_name, tokens):
    """Move the leading token of ``raw_name`` to the end before tagging."""
    if len(tokens) < 2:
        return raw_name
    if len(tokens[1]) == 1:
        # Second token is a single character: swap first and last tokens.
        reordered = [tokens[-1]] + tokens[1:-1] + [tokens[0]]
    else:
        # Otherwise rotate the first token to the end.
        reordered = tokens[1:] + tokens[:1]
    return " ".join(reordered)


def _classify_one(raw_name):
    """Tag one name and return the dict recorded for it in the output."""
    tokens = raw_name.split(' ')
    is_corporate = not _CORPORATE_TOKENS.isdisjoint(tokens)
    candidate = raw_name if is_corporate else _reorder_surname_first(
        raw_name, tokens)

    try:
        parsed_name, parsed_type = probablepeople.tag(candidate)
    except probablepeople.RepeatedLabelError as e:
        return {'original name': e.original_string,
                'parsed name': e.parsed_string}

    # Corporate by our definition: keep whatever the tagger produced.
    if is_corporate:
        return parsed_name

    if parsed_type == "Person":
        return {
            'LastName': _first_present(
                parsed_name, ('LastName', 'Surname', 'LastInitial')),
            'MiddleName': _first_present(
                parsed_name, ('MiddleName', 'MiddleInitial')),
            'FirstName': _first_present(
                parsed_name, ('GivenName', 'FirstInitial')),
        }

    if parsed_type == "Household":
        return parsed_name

    # The name is categorized as "Corporation" after shifting, so record
    # TBD (To Be Decided) with the original, un-shifted name.
    return {"TBD": raw_name}


def _classify_column(cells, delimiter):
    """Classify every cell of a column; each cell may hold several names."""
    results = []
    for cell in cells:
        if not cell:
            # Empty cell: keep a placeholder so rows stay aligned.
            results.append({
                'Last Name': '',
                'Middle Name': '',
                'First Name': ''
            })
        else:
            results.append(
                [_classify_one(part) for part in cell.split(delimiter)])
    return results


def name_classifier(input_file, output_file):
    """
    Classify the owner and buyer names of a spreadsheet and dump them as JSON.

    Reads ``input_file`` with ``pandas.read_excel``, replaces missing values
    with '', and processes the 'Owners Names' column (names separated by
    ', ') and the 'Last Buyers Names' column (names separated by '/ ').

    The JSON written to ``output_file`` has the keys 'Owners' and 'Buyers'.
    Each holds one entry per row: a list with one dict per name, or a
    placeholder dict of empty name parts when the cell was empty.

    Fixes relative to the previous version:
    * the placeholder for an empty buyer cell now goes to 'Buyers' (it was
      appended to 'Owners');
    * the buyer "TBD" fallback is now ``{"TBD": name}``, the same shape the
      owner column already used, instead of the string ``"TBD: name"``.

    :param input_file: path (or file-like) accepted by ``pandas.read_excel``.
    :param output_file: path of the JSON file to write.
    """
    data = pd.read_excel(input_file)
    data.fillna('', inplace=True)
    # Look both columns up first so a missing column fails before any tagging.
    owner_names = data['Owners Names']
    buyer_names = data['Last Buyers Names']

    output = {
        'Owners': _classify_column(owner_names, ', '),
        'Buyers': _classify_column(buyer_names, '/ '),
    }

    with open(output_file, 'w') as outfile:
        json.dump(output, outfile)
 def test_basic(self):
     # Smoke test: a plain two-token name should be typed as a person and
     # split into given name and surname.
     parsed, kind = tag("Bob Belcher")
     assert kind == 'Person'
     given, family = parsed['GivenName'], parsed['Surname']
     self.assertEqual("Bob", given)
     self.assertEqual("Belcher", family)
Example #32
0
def parse_lines_rules_based_version(lines):
    """
    Extract structured "basics" data from the lines of a résumé section.

    Each line is split on ``regexes.RE_LINE_DELIM`` and every non-empty chunk
    is run through a fixed sequence of extractors.  A field is only filled
    once; later chunks never overwrite it.

    1. Regex fields (email, phone, website, profile handle): the matched
       text is recorded and cut out of the chunk.  If the match was the
       whole chunk, processing moves on to the next chunk.
    2. Location, via ``usaddress.tag``: kept only when the result type is
       "Street Address"; a "recipient" component is moved to ``name``.
    3. Name, via ``probablepeople.tag``: kept only when typed as "Person".

    A ``RepeatedLabelError`` from either tagger is logged at debug level and
    abandons the current chunk.

    Args:
        lines (List[str])

    Returns:
        Dict[str, obj]
    """
    # (output key, name of the pattern on ``regexes``), in extraction order.
    # Patterns are looked up lazily, only when their field is still missing.
    regex_fields = (
        ("email", "RE_EMAIL"),
        ("phone", "RE_PHONE_NUMBER"),
        ("website", "RE_URL"),
        ("profiles", "RE_USER_HANDLE"),
    )

    data = {}
    for line in lines:
        if not line:
            continue
        for line_chunk in regexes.RE_LINE_DELIM.split(line):
            if not line_chunk:
                continue

            fully_consumed = False
            for field, regex_name in regex_fields:
                if field in data:
                    continue
                match = getattr(regexes, regex_name).search(line_chunk)
                if not match:
                    continue
                text = match.group()
                # Profiles are stored as a list of handle records.
                data[field] = [{"username": text}] if field == "profiles" else text
                start, end = match.span()
                if start == 0 and end == len(line_chunk):
                    # Nothing left of this chunk to inspect.
                    fully_consumed = True
                    break
                line_chunk = line_chunk[:start] + line_chunk[end:]
            if fully_consumed:
                continue

            if "location" not in data:
                try:
                    location, location_type = usaddress.tag(
                        line_chunk,
                        tag_mapping=basics.constants.LOCATION_TAG_MAPPING)
                except usaddress.RepeatedLabelError as e:
                    LOGGER.debug("'location' parsing error:\n%s", e)
                    continue
                if location_type == "Street Address":
                    location = dict(location)
                    if "recipient" in location:
                        data["name"] = location.pop("recipient")
                    data["location"] = location

            if "name" not in data:
                try:
                    name, name_type = probablepeople.tag(line_chunk)
                except probablepeople.RepeatedLabelError as e:
                    LOGGER.debug("'name' parsing error:\n%s", e)
                else:
                    if name_type == "Person":
                        data["name"] = " ".join(name.values())
    return data
Example #33
0
 def test_basic(self):
     # Smoke test: a plain two-token name should be typed as a person and
     # split into given name and surname.
     parsed, kind = tag("Bob Belcher")
     assert kind == 'Person'
     assert parsed['GivenName'] == "Bob"
     assert parsed['Surname'] == "Belcher"