def sort_name_to_display_name(sort_name):
    """
    Convert a "Last Name, First Name"-formatted sort_name into a
    "First Name Last Name" form suitable for display in a catalog listing.

    Name recognition is only approximate once there is more than one
    plain-format first name and one plain-format last name; this serves as
    a first line of approximation.  Librarian-checked sort and display
    names from the Metadata Wrangler, when available later, take precedence.

    :param sort_name: e.g. "Doe, Jane"
    :return: display name, e.g. "Jane Doe", or None for empty input
    """
    if not sort_name:
        return None

    parsed = HumanName(sort_name)
    # HumanName exposes title, first, middle, last, suffix, nickname.
    if parsed.nickname:
        parsed.nickname = '(' + parsed.nickname + ')'

    parts = [parsed.title, parsed.first, parsed.nickname,
             parsed.middle, parsed.last, parsed.suffix]
    return name_tidy(u' '.join(parts))
def clean_authors(authors):
    """
    Normalize a comma-separated author string into a list of
    "Last, First"-style names (via HumanName's string_format).

    :param authors: raw comma-separated author string
    :return: list of cleaned author strings
    :raises ValueError: when a parsed author lacks a first or last name
    """
    authors = authors.lower()

    # Drop the comma before suffixes (Jr., III) so they are not split on,
    # strip degree noise, and patch known special cases.
    for old, new in ((", jr.", " jr."),
                     (", iii", " iii"),
                     (", ph.d", ""),
                     ("organizer:", ""),
                     ("roel m,", "roel m.")):
        authors = authors.replace(old, new)

    # This exact string contains a comma inside one author's name.
    if authors == 'kozue miyashiro, etsuko harada, t.':
        author_list = ['kozue miyashiro', 'etsuko harada, t.']
    else:
        author_list = authors.split(",")

    cleaned_authors = []
    for raw in author_list:
        parsed = HumanName(raw.lower())

        if parsed.first == '' or parsed.last == '':
            raise ValueError("invalid author name: {}".format(parsed))

        parsed.capitalize()
        parsed.string_format = u"{last}, {title} {first} {middle}, {suffix}"
        cleaned_authors.append(unicode(parsed))

    return cleaned_authors
def course_event_title_and_contact(course):
    """
    Look up a course's SWS section and return its long title plus the
    primary instructor's display name, uwnetid and contact email.

    A 404 from SWS is treated as "no such section" and yields empty
    values; any other DataFailureException is re-raised.

    :param course: course identifier accepted by get_sws_section
    :return: dict with 'title_long', 'name', 'uwnetid' and 'email' keys
    """
    try:
        section = get_sws_section(course)
        # Defensively drill down: section -> first meeting -> first instructor.
        meeting = section.meetings[0] if hasattr(
            section, 'meetings') and len(section.meetings) else None
        instructor = meeting.instructors[0] if hasattr(
            meeting, 'instructors') and len(meeting.instructors) else None
        first_name = instructor.first_name if hasattr(
            instructor, 'first_name') else ''
        surname = instructor.surname if hasattr(instructor, 'surname') else ''
        uwnetid = instructor.uwnetid if hasattr(instructor, 'uwnetid') else ''
        email = instructor.email1 if hasattr(instructor, 'email1') else ''
        name = HumanName(' '.join([first_name, surname]))
        name.capitalize()
    except DataFailureException as err:
        if err.status == 404:
            section = None
            name = None
            email = None
            uwnetid = None
        else:
            raise

    return {
        'title_long': section.course_title_long if section else '',
        'name': '%s %s' % (name.first, name.last) if name else '',
        'uwnetid': uwnetid if uwnetid else '',
        # BUG FIX: the original applied "%" to a literal with no conversion
        # specifier ("*****@*****.**" % uwnetid), which raises TypeError
        # whenever email is empty but uwnetid is set.  The address template
        # looks redacted; fall back to the placeholder literal itself.
        # TODO(review): restore the real "<uwnetid>@domain" template.
        'email': email if email and len(email)
                 else ("*****@*****.**" if uwnetid else '')
    }
Exemple #4
0
    def set_parsed_name(self):
        """Parse self.name with HumanName and store the resulting parts
        dict on self.parsed_name (None when no name is set)."""
        if self.name:
            self.parsed_name = HumanName(self.name).as_dict()
        else:
            self.parsed_name = None
def normalize(name):
    """
    Normalize a personal name for comparison: collapse whitespace,
    lowercase, transliterate to ASCII, strip HTML tags, reparse as
    "first middle last", then drop everything but letters and spaces.
    """
    collapsed = " ".join(name.strip().lower().split())
    ascii_form = unidecode(collapsed).replace("-", " ")
    no_html = re.sub(r'<[^>]+>', r'', ascii_form)  # remove html tags
    parsed = HumanName(no_html)
    parsed.string_format = '{first} {middle} {last}'
    return re.sub(r'[^a-z\s]', r'', str(parsed))
def parse_name(item):
    """
    Parse a raw author string into an Author object.

    :param item: string containing an author name and possibly an email
    :return: Author, or None when no usable name could be parsed
    """
    # Pull out the first email address, if any, before cleaning the string.
    match = emailRegex.search(item)
    email = match.group(0) if match else None

    # Strip the email, honorific prefixes and suffixes from the string.
    for regex in (emailWithBracketsRegex, prefixRegex, suffixRegex):
        item = regex.sub("", item)

    # Drop parenthesised words, unless the whole string is parenthesised.
    if not bracketRegex.fullmatch(item):
        item = bracketRegex.sub(" ", item)

    # Trim leftover punctuation and hand the rest to the name parser.
    item = item.strip(strip_chars)
    name = HumanName(item)

    if not name.full_name:
        app.logger.warning(
            "Unable to parse name string %s: no full_name returned in name %s",
            item,
            name,
        )
        return None

    # Only force capitalization of names whose mixed capitalization
    # exceeds the configured threshold.
    name.capitalize(force=force_capitalization(name.full_name))

    author = Author()
    if name.first:
        author.givenname = name.first
    if name.last:
        author.familyname = name.last
    if name.middle:
        author.middlename = name.middle
    if email:
        author.email = email

    if name.first or name.last:
        author.create_full_name()
    else:
        author.name = name.full_name

    return author
Exemple #7
0
 def test_assignment_to_full_name(self):
     """Parsing a full name, then reassigning full_name, reparses parts."""
     hn = HumanName("John A. Kenneth Doe, Jr.")
     for attr, expected in (("first", "John"), ("last", "Doe"),
                            ("middle", "A. Kenneth"), ("suffix", "Jr.")):
         self.m(getattr(hn, attr), expected, hn)
     hn.full_name = "Juan Velasquez y Garcia III"
     for attr, expected in (("first", "Juan"),
                            ("last", "Velasquez y Garcia"),
                            ("suffix", "III")):
         self.m(getattr(hn, attr), expected, hn)
Exemple #8
0
def display_full_name_with_correct_capitalization(full_name):
    """
    Return the given full name with nameparser's capitalization applied.

    See documentation here: https://github.com/derek73/python-nameparser
    :param full_name: raw full-name string
    :return: capitalized full-name string
    """
    # BUG FIX: str.strip() returns a new string; the original discarded
    # the result, leaving surrounding whitespace in place.
    full_name = full_name.strip()
    full_name_parsed = HumanName(full_name)
    full_name_parsed.capitalize()
    full_name_capitalized = str(full_name_parsed)
    return full_name_capitalized
    def get_display_name(self):
        """Return the explicit display_name when set; otherwise build a
        capitalized "First Last" (or last-name-only) string."""
        if self.has_display_name():
            return self.display_name

        if self.has_first_name():
            raw = "%s %s" % (self.first_name, self.last_name)
        else:
            raw = self.last_name

        parsed = HumanName(raw)
        parsed.capitalize()
        parsed.string_format = "{first} {last}"
        return str(parsed)
def user_fullname(user):
    """
    Best-effort display name for a user object.

    Prefers display_name unless it is missing, empty, or all upper-case
    (in which case a capitalized "First Last" is rebuilt from the name
    parts); falls back to the local part of the email for CanvasUser.

    :raises UserPolicyException: for objects with neither attribute
    """
    if hasattr(user, 'display_name'):
        display = user.display_name
        needs_rebuild = (display is None or not len(display) or
                         display.isupper())
        if needs_rebuild and hasattr(user, 'first_name'):
            rebuilt = HumanName('%s %s' % (user.first_name, user.surname))
            rebuilt.capitalize()
            rebuilt.string_format = '{first} {last}'
            return str(rebuilt)
        return display
    if hasattr(user, 'email'):
        return user.email.split('@')[0]  # CanvasUser
    raise UserPolicyException('Invalid user')
Exemple #11
0
def extractFirstName(name, order):
    """
    Heuristically extract a first name from a free-form name string.

    The string is split on dots, dashes and underscores, digits and
    question marks are removed, and the result is run through the
    HumanName parser with split-name and CamelCase fallbacks.

    :param name: raw name string
    :param order: name-order hint passed through to the helpers
    :return: lower-cased first name, or '' when nothing usable is found
    """
    # Treat dots and dashes as word separators.
    name = ' '.join(name.split('.'))
    name = ' '.join(name.split('-'))

    # Remove digits; if that empties the string, keep them as separators.
    # (FIX: regex patterns are now raw strings -- "\d" in a plain string
    # relies on an invalid-escape fallback that is deprecated.)
    oldname = name
    name = re.sub(r"\d+", "", name)
    if not len(name):
        name = re.sub(r"\d+", "_", oldname)

    # Same treatment for question marks.
    oldname = name
    name = re.sub(r"\?", "", name)
    if not len(name):
        name = re.sub(r"\?", "_", oldname)

    name = ' '.join(name.split('_'))

    # Primary attempt: the nameparser library.
    try:
        firstName = getFirstNameFromHumanName(HumanName(name), order)
    except Exception:  # FIX: narrowed from bare except (don't trap SystemExit)
        firstName = getFirstNameFromSplitName(name.split(), order)

    # Fallback heuristics when parsing returned the whole input unchanged.
    if firstName.strip() == name.strip():
        # e.g. firstName('Ben Voigt') == 'Ben Voigt'
        if len(name.split()) == 2:
            firstName = getFirstNameFromSplitName(name.split(), order)
        else:
            # Try breaking CamelCase words apart.
            uncamel = ' '.join(splitCamelCase(name).split('_'))
            if uncamel != name:
                try:
                    firstName = HumanName(uncamel).first
                    if len(firstName.split()) == 2:
                        firstName = getFirstNameFromSplitName(firstName.split(), order)
                except Exception:  # FIX: narrowed from bare except
                    firstName = getFirstNameFromSplitName(uncamel.split(), order)

    # Reject obvious non-names: a bare 'Mc' prefix or a single letter.
    if firstName == 'Mc':
        firstName = ''
    if len(firstName) == 1:
        firstName = ''
    return firstName.lower()
def catogorize_by_instructor(data):
    """
    Group FCE records by instructor.

    :param data: iterable of FCE dicts
    :return: dict mapping "First Last" strings to Instructor objects
             whose 'courses' list collects the matching Course records
    """
    instructorsDict = {}
    for fce in data:
        # Skip records without a plausible instructor field.
        if 'instructor' not in fce or len(fce['instructor']) <= 2:
            continue
        parsed = HumanName(fce['instructor'])
        parsed.capitalize()
        key = "{} {}".format(parsed.first, parsed.last).strip()
        if len(key) <= 2:
            continue
        course = Course(fce)
        if key not in instructorsDict:
            instructorsDict[key] = Instructor(str(parsed))
        instructorsDict[key]['courses'].append(course)
    return instructorsDict
def display_name_to_sort_name(display_name):
    """
    Convert a "First Name Last Name"-formatted display_name into the
    "Last Name, First Name" form used for searching and sorting.

    Names that look like corporate entity business names are returned
    whole, without re-conversion.  Otherwise the name is tidied, parsed
    with HumanName, and its parts are reassembled in sort order.
    """
    if not display_name:
        return None

    # TODO: to humanname: PhD, Ph.D. Sister, Queen are titles and suffixes

    # Corporate names are used whole.
    if is_corporate_name(display_name):
        return display_name

    # Clean up the common PhD and MD suffixes so HumanName recognizes
    # them better.
    tidied = name_tidy(display_name)

    # Parsed parts: title, first, middle, last, suffix, nickname.
    name = HumanName(tidied)

    if name.nickname:
        name.nickname = '(' + name.nickname + ')'

    # When first and middle names are initials that came in with a space
    # between them, keep that space, consistent with period-free initials:
    # 'Classy, A. B.' / 'Classy Abe B.' / 'Classy A. Barney' / 'Classy, A B'.
    if name.last:
        tail = u' '.join([name.first, name.middle, name.suffix,
                          name.nickname, name.title])
        sort_name = u''.join([name.last, ", ", tail])
    else:
        # Single-part names, e.g. 'Pope Francis', 'Prince'.
        sort_name = u' '.join([name.first, name.middle, name.suffix,
                               name.nickname])
        if name.title:
            sort_name = u''.join([name.title, ", ", sort_name])

    return name_tidy(sort_name)
Exemple #14
0
def show_representatives():
    """
    Flask view: prompt for an address, look up elected officials for it
    via the election API, and render them with parsed first/last names.
    """
    form = AddressLookup()
    if form.validate_on_submit():
        # Post/Redirect/Get: stash the address and reload the page.
        session['address'] = form.address.data
        return redirect(url_for('main.show_representatives'))

    address = session.get('address')
    form.address.data = address
    representatives = {}
    if address:
        api_key = current_app.config['ELECTION_API_KEY']
        representatives = get_representativeinfo(session.get('address'),
                                                 api_key)
        for official in representatives['officials']:
            parsed = HumanName(official['name'])
            logging.debug(parsed.as_dict())
            official['first_name'] = parsed.first
            official['last_name'] = parsed.last
    return render_template('show_representativeinfo.html',
                           representatives=representatives,
                           lookupform=form)
def add():
    """Bottle endpoint: register a person (name, department, designation,
    photo) by saving the photo under ``folder_name``/images/ and appending
    a row to the ``folder_name`` CSV register.

    Returns a dict with "status" of "ok" or "photo_invalid".
    """
    global initialised
    if not initialised:
        initialise()

    # Split the posted free-form name into first/last with nameparser.
    name = HumanName(request.forms.get('name'))
    first_name = name.first.upper()
    last_name = name.last.upper()
    department = request.forms.get('department').upper()
    designation = request.forms.get('designation').upper()
    photo = request.files.get('photo')
    # NOTE: `name` is reused here for the filename stem, shadowing the
    # HumanName above (its parts were already copied out).
    name, ext = os.path.splitext(photo.filename)
    if ext not in ('.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'):
        rv = {"status": "photo_invalid"}
        return dict(rv)

    if not os.path.exists(folder_name + "/images/"):
        os.makedirs(folder_name + "/images/")

    # Rebuild the stored filename from the normalized fields.
    name = first_name + "_" + last_name + "_" + department + "_" + designation
    photo.filename = name.replace(" ", "_") + ext
    # NOTE(review): assumes bottle's FileUpload.save() honors the
    # reassigned .filename when handed a directory path -- confirm.
    photo.save(folder_name + "/images/")

    # NOTE(review): 'ab+' binary append suggests Python 2-style csv usage;
    # on Python 3, csv.DictWriter needs a text-mode file -- verify runtime.
    with open(folder_name + "/" + folder_name + ".csv", 'ab+') as csv_file:
        fieldnames = ['firstname', 'lastname', 'designation', 'department', 'photo']
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        csv_writer.writerow({
          'firstname': first_name,
          'lastname': last_name,
          'department': department,
          'designation': designation,
          'photo': "images/"+photo.filename
        })

    rv = {"status": "ok"}
    return dict(rv)
Exemple #16
0
 def test_assignment_to_attribute(self):
     """Assigning to individual HumanName attributes updates them."""
     hn = HumanName("John A. Kenneth Doe, Jr.")
     hn.last = "de la Vega"
     self.m(hn.last, "de la Vega", hn)
     for attr in ("title", "first", "middle", "suffix"):
         setattr(hn, attr, "test")
         self.m(getattr(hn, attr), "test", hn)
Exemple #17
0
 def standardize(res):
     """Normalize a scraped directory record in place: capitalize the
     name fields, lowercase the email, and strip the 'Faculty - '
     prefix from the affiliation.  Returns the same dict."""
     def _capitalized(value):
         parsed = HumanName(value)
         parsed.capitalize()
         return str(parsed)

     res['list_name'] = _capitalized(res['list_name'])
     if 'detail_name' in res:
         res['detail_name'] = _capitalized(res['detail_name'])
     if 'list_email' in res:
         res['list_email'] = res['list_email'].lower()
     res['list_affiliation'] = res['list_affiliation'].replace('Faculty - ', '')
     return res
Exemple #18
0
    def HumanNameFmXML(self, ell):
        """Build a HumanName from an XML element whose children carry the
        name parts (First, Middle, Last, Title, Suffix, NickName);
        unrecognized child tags are ignored."""
        tag_to_attr = {
            'First': 'first',
            'Middle': 'middle',
            'Last': 'last',
            'Title': 'title',
            'Suffix': 'suffix',
            'NickName': 'nickname',
        }
        hn = HumanName()
        for el in ell:
            attr = tag_to_attr.get(el.tag)
            if attr is not None:
                setattr(hn, attr, el.text)
        return hn
Exemple #19
0
    def person_name_from_xml(self, ell):
        '''Create a person name from an XML element: child tags First,
        Middle, Last, Title, Suffix and NickName map onto the matching
        HumanName attributes; anything else is ignored.'''
        hname = HumanName()
        for elm in ell:
            for tag, attr in (('First', 'first'), ('Middle', 'middle'),
                              ('Last', 'last'), ('Title', 'title'),
                              ('Suffix', 'suffix'), ('NickName', 'nickname')):
                if elm.tag == tag:
                    setattr(hname, attr, elm.text)
                    break
        return hname
Exemple #20
0
    def parse_rss(message):
        """
            Parse Feeds into the CMS Module

            Looks up the msg_rss record for this message, creates or
            updates the matching cms_post (matched by source URL, else by
            body), resolves/creates the author as a pr_person, and syncs
            imported tags.  Returns None (no reply is generated).
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        record = db(table.message_id == message.message_id).select(
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.tags,
            table.author,
            limitby=(0, 1)).first()
        if not record:
            return

        post_table = s3db.cms_post

        # Is this an Update or a Create?
        # Prefer matching on the source URL (via doc_document); fall back
        # to matching on the body text.
        body = record.body or record.title
        url = record.from_address
        if url:
            doc_table = s3db.doc_document
            exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                     limitby=(0, 1)).first()
            if exists:
                exists = db(post_table.doc_id == exists.doc_id).select(
                    post_table.id, limitby=(0, 1)).first()
        else:
            # Use Body
            exists = db(post_table.body == body).select(post_table.id,
                                                        limitby=(0,
                                                                 1)).first()

        channel_id = record.channel_id
        tags = record.tags

        # Resolve the feed author to a pr_person, creating one if needed.
        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, {"id": person_id})
        else:
            person_id = None

        if exists:
            # Update path: refresh the post and reconcile imported tags.
            post_id = exists.id
            db(post_table.id == post_id).update(
                title=record.title,
                body=body,
                # @ToDo: Remove created_on when we know not used in rendering
                created_on=record.date,
                date=record.date,
                location_id=record.location_id,
                person_id=person_id,
            )
            # Read existing Tags (which came from remote)
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            query = (ltable.post_id == post_id) & \
                    (ltable.mci == 1) & \
                    (ltable.tag_id == ttable.id)
            rows = db(query).select(ttable.name)
            # Compare these to tags in current version of post
            old_tags = [r.name for r in rows]
            new_tags = []
            delete_tags = []
            for tag in tags:
                if tag not in old_tags:
                    new_tags.append(tag)
            for tag in old_tags:
                if tag not in tags:
                    delete_tags.append(tag)
            if new_tags or delete_tags:
                lookup_tags = []
                lookup_tags.extend(new_tags)
                lookup_tags.extend(delete_tags)
                _tags = db(ttable.name.belongs(lookup_tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
            # _tags is only referenced when new_tags/delete_tags are
            # non-empty, in which case it was assigned above.
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(
                    post_id=post_id,
                    tag_id=tag_id,
                    mci=1,  # This is an imported record, not added natively
                )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()

        else:
            # Create path.
            # Default to 'News' series
            table = db.cms_series
            series_id = db(table.name == "News").select(table.id,
                                                        cache=s3db.cache,
                                                        limitby=(0,
                                                                 1)).first().id

            post_id = post_table.insert(
                title=record.title,
                body=body,
                # @ToDo: Remove created_on when we know not used in rendering
                created_on=record.date,
                date=record.date,
                location_id=record.location_id,
                person_id=person_id,
                series_id=series_id,
                mci=1,  # This is an imported record, not added natively
            )
            record = {"id": post_id}
            # NOTE(review): update_super appears to populate record["doc_id"],
            # which the doc_document insert below relies on -- confirm.
            s3db.update_super(post_table, record)

            # Source link
            if url:
                doc_table.insert(
                    doc_id=record["doc_id"],
                    url=url,
                )

            # Is this feed associated with an Org/Network?
            def lookup_pe(channel_id):
                # Resolve the channel URL to a pr_pentity via an RSS
                # contact record, returning (instance_type, record id).
                ctable = s3db.msg_rss_channel
                channel_url = db(ctable.channel_id == channel_id).select(
                    ctable.url, limitby=(0, 1)).first().url
                ctable = s3db.pr_contact
                ptable = s3db.pr_pentity
                query = (ctable.contact_method == "RSS") & \
                        (ctable.value == channel_url) & \
                        (ctable.pe_id == ptable.pe_id)
                pe = db(query).select(ptable.pe_id,
                                      ptable.instance_type,
                                      limitby=(0, 1)).first()
                if pe:
                    pe_type = pe.instance_type
                    otable = s3db[pe_type]
                    org_id = db(otable.pe_id == pe.pe_id).select(
                        otable.id,
                        limitby=(0, 1),
                    ).first().id
                    return pe_type, org_id
                else:
                    return None, None

            # Cache the channel->entity lookup for 2 minutes.
            pe_type, org_id = current.cache.ram("pe_channel_%s" % channel_id,
                                                lambda: lookup_pe(channel_id),
                                                time_expire=120)
            if pe_type == "org_organisation":
                s3db.cms_post_organisation.insert(
                    post_id=post_id,
                    organisation_id=org_id,
                )
            elif pe_type == "org_group":
                s3db.cms_post_organisation_group.insert(
                    post_id=post_id,
                    group_id=org_id,
                )

            # Link all imported tags to the new post.
            if tags:
                ttable = db.cms_tag
                ltable = db.cms_tag_post
                _tags = db(ttable.name.belongs(tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
                for t in tags:
                    tag = _tags.get(t, None)
                    if tag:
                        tag_id = tag["id"]
                    else:
                        tag_id = ttable.insert(name=t)
                    ltable.insert(
                        post_id=post_id,
                        tag_id=tag_id,
                        mci=1,  # This is an imported record, not added natively
                    )

        # No Reply
        return
Exemple #21
0
 def execute(self, obj):
     """Parse the given raw name string into a HumanName instance."""
     parsed = HumanName(obj)
     return parsed
Exemple #22
0
 def extract_last_name(self):
     """Extracts last name from the name feature using nameparser."""
     def surname_of(raw):
         # HumanName splits the free-form name; keep only the surname.
         return HumanName(raw).last

     self.Xy["last_name"] = self.Xy.name.apply(surname_of)
Exemple #23
0
    def parse_rss_2_cap(message):
        """
            Parse RSS Feeds into the CAP Module

            Looks up the msg_rss record for this message; if a cap_info
            row with the same description exists it is updated, otherwise
            the linked alert document is fetched and imported via XSLT.
            Returns None (no reply is generated).
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        record = db(table.message_id == message.message_id).select(
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.author,
            limitby=(0, 1)).first()
        if not record:
            return

        channel_id = record.channel_id

        alert_table = s3db.cap_alert
        info_table = s3db.cap_info

        # Is this an Update or a Create?
        # @ToDo: Use guid?
        # Use Body
        body = record.body or record.title
        query = (info_table.description == body)
        exists = db(query).select(info_table.id, limitby=(0, 1)).first()

        # Resolve the feed author to a pr_person, creating one if needed.
        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            # @ToDo: Use XSLT
            info_id = exists.id
            db(info_table.id == info_id).update(
                headline=record.title,
                description=body,
                created_on=record.date,
                #location_id = record.location_id,
                #person_id = person_id,
            )

        else:
            # Embedded link
            # NOTE(review): `fetch`, `response`, `output`, `urllib2` and
            # `StringIO` are not defined in this block -- presumably
            # provided by the enclosing web2py/Eden environment; confirm.
            url = record.from_address
            try:
                file = fetch(url)
            except urllib2.URLError:
                response.error = str(sys.exc_info()[1])
                return output
            except urllib2.HTTPError:
                response.error = str(sys.exc_info()[1])
                return output
            File = StringIO(file)

            # Import via XSLT
            resource = s3db.resource("cap_alert")
            stylesheet = os.path.join(current.request.folder, "static",
                                      "formats", "cap", "import.xsl")
            # NOTE(review): `success` is assigned but never checked.
            success = resource.import_xml(File, stylesheet=stylesheet)

        # No Reply
        return
Exemple #24
0
    def parse_rss_2_cms(message):
        """
            Parse Feeds into the CMS Module

            Looks up the msg_rss record for this message (skipping records
            with no body), creates or updates the matching cms_post
            (matched by source URL, else by body), resolves/creates the
            author as a pr_person, and syncs imported tags.  Returns None.
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        record = db(table.message_id == message.message_id).select(
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.tags,
            table.author,
            limitby=(0, 1)).first()
        if not record or not record.body:
            return

        post_table = s3db.cms_post

        # Is this an Update or a Create?
        # Prefer matching on the source URL (via doc_document); fall back
        # to matching on the body text.
        body = record.body or record.title
        url = record.from_address
        if url:
            doc_table = s3db.doc_document
            exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                     limitby=(0, 1)).first()
            if exists:
                exists = db(post_table.doc_id == exists.doc_id).select(
                    post_table.id, limitby=(0, 1)).first()
        else:
            # Use Body
            exists = db(post_table.body == body).select(post_table.id,
                                                        limitby=(0,
                                                                 1)).first()

        channel_id = record.channel_id
        tags = record.tags

        # Resolve the feed author to a pr_person, creating one if needed.
        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            # Update path: refresh the post and reconcile imported tags.
            post_id = exists.id
            db(post_table.id == post_id).update(
                title=record.title,
                body=body,
                created_on=record.date,
                location_id=record.location_id,
                person_id=person_id,
            )
            # Read existing Tags (which came from remote)
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            query = (ltable.post_id == post_id) & \
                    (ltable.mci == 1) & \
                    (ltable.tag_id == ttable.id)
            rows = db(query).select(ttable.name)
            # Compare these to tags in current version of post
            old_tags = [r.name for r in rows]
            new_tags = []
            delete_tags = []
            for tag in tags:
                if tag not in old_tags:
                    new_tags.append(tag)
            for tag in old_tags:
                if tag not in tags:
                    delete_tags.append(tag)
            if new_tags or delete_tags:
                lookup_tags = []
                lookup_tags.extend(new_tags)
                lookup_tags.extend(delete_tags)
                _tags = db(ttable.name.belongs(lookup_tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
            # _tags is only referenced when new_tags/delete_tags are
            # non-empty, in which case it was assigned above.
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(
                    post_id=post_id,
                    tag_id=tag_id,
                    mci=1,  # This is an imported record, not added natively
                )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()

        else:
            # Create path.
            # Default to 'News' series
            table = db.cms_series
            series = db(table.name == "News").select(table.id,
                                                     cache=s3db.cache,
                                                     limitby=(0, 1)).first()
            try:
                series_id = series.id
            except:
                raise KeyError("News Series not present in CMS module")

            post_id = post_table.insert(
                title=record.title,
                body=body,
                created_on=record.date,
                location_id=record.location_id,
                person_id=person_id,
                series_id=series_id,
                mci=1,  # This is an imported record, not added natively
            )
            record = dict(id=post_id)
            # NOTE(review): update_super appears to populate record["doc_id"],
            # which the doc_document insert below relies on -- confirm.
            s3db.update_super(post_table, record)

            # Source link
            if url:
                doc_table.insert(
                    doc_id=record["doc_id"],
                    url=url,
                )

            # Link all imported tags to the new post.
            if tags:
                ttable = db.cms_tag
                ltable = db.cms_tag_post
                _tags = db(ttable.name.belongs(tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
                for t in tags:
                    tag = _tags.get(t, None)
                    if tag:
                        tag_id = tag["id"]
                    else:
                        tag_id = ttable.insert(name=t)
                    ltable.insert(
                        post_id=post_id,
                        tag_id=tag_id,
                        mci=1,  # This is an imported record, not added natively
                    )

        # No Reply
        return
Exemple #25
0
    def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
        """
        Parses a (usually messy) raw name and returns
        first, middle, last names and a Counter of extracted positions

        extract_orgs tries to extract organizations from name. defaults to True. only set to False
        to be able to check if a name is valid (it prevents an infinite loop because by default,
        extracting organizations is part of the initialization of a person

        :param name_raw: str
        :param count: int, weight added to each extracted position in the Counter
        :param extract_orgs: bool
        :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
        """
        name_raw = Person.remove_privlog_info(name_raw)
        # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
        name_raw = Person.remove_jr_sr_iii(name_raw)

        # position is often attached with a dash,
        # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
        if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
            name_raw, extracted_position = name_raw.split(" - ")
            extracted_positions = [extracted_position.strip()]
        else:
            extracted_positions = []

        # extract positions in parens e.g. Henson, A (Chadbourne & Park)
        paren_positions = re.findall(r'\([^(]+\)', name_raw)
        for position in paren_positions:
            extracted_positions.append(position.strip(',#() '))
            name_raw = name_raw.replace(position, '')

        # Search for known raw_org strings in name_raw, extract them as positions if necessary
        if extract_orgs:
            name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
            extracted_positions += new_positions

        # delete any leftover hashtags
        name_raw = name_raw.strip(' #')

        # Delete dashes between last name and initials.
        # Length guards prevent an IndexError when the cleaned string is
        # shorter than the pattern being checked (e.g. empty after stripping).
        # DUNN-W -> Dunn W
        if len(name_raw) > 1 and name_raw[-2] == '-':
            name_raw = name_raw[:-2] + " " + name_raw[-1:]
        # DUNN-WL -> DUNN WL
        if len(name_raw) > 2 and name_raw[-3] == '-':
            name_raw = name_raw[:-3] + " " + name_raw[-2:]

        # Parse current string using HumanName
        name = HumanName(name_raw)

        # e.g. Dunn W -> parsed as last name W. -> switch first/last
        if len(name.last) <= 2 < len(name.first):
            name.first, name.last = name.last, name.first

        # remove periods from initials
        if len(name.first) == 2 and name.first[1] == '.':
            name.first = name.first[0]
        if len(name.middle) == 2 and name.middle[1] == '.':
            name.middle = name.middle[0]

        # If first name is length 2 (Teague, CE), the two letters are most likely initials.
        if len(name.middle) == 0 and len(name.first) == 2:
            name.middle = name.first[1].upper()
            name.first = name.first[0].upper()

        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.capitalize()
        name.middle = name.middle.capitalize()

        # if multiple names are passed, they often end up in the middle name
        # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
        if name.middle.count(',') > 1:
            name.middle = ''

        # A very long, dot-heavy "suffix" is almost certainly parser garbage, not a real suffix.
        if len(name.suffix) > 20 and name.suffix.count('.') > 2:
            name.suffix = ''

        if name.suffix:
            extracted_positions.append(name.suffix)

        # map organization names to clean official names (if they are in the dict) using
        # RAW_ORG_TO_CLEAN_ORG_DICT; '@skip@' entries are deliberately dropped.
        clean_orgs = []
        for raw_org in extracted_positions:
            if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
                clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
                if clean_org != '@skip@':
                    clean_orgs.append(clean_org)
            else:
                clean_orgs.append(raw_org)
        extracted_positions = clean_orgs

        # convert mapped positions into a counter, weighting each by `count`
        result_positions = Counter()
        for position in extracted_positions:
            cleaned = re.sub(r'\.', '', position)
            result_positions[cleaned.upper()] += count

        return name.first, name.middle, name.last, result_positions
Exemple #26
0
 # Backfill any missing name-part columns with empty strings so the
 # parsed-name fallbacks below can safely test "is this field empty?".
 # (Previously a mistyped check blanked row['salutation'] whenever
 # 'firstName' was absent, clobbering an imported salutation.)
 if 'salutation' not in row:
     row['salutation'] = ''
 if 'middleName' not in row:
     row['middleName'] = ''
 if 'lastName' not in row:
     row['lastName'] = ''
 if 'nameSuffix' not in row:
     row['nameSuffix'] = ''
 if 'firstName' not in row:
     row['firstName'] = ''
 if 'nickName' not in row:
     row['nickName'] = ''
 # Parse the full text name and full text address into their components and
 # add them to the row.  For each field, only fill the destination when it is
 # empty, as users may import component values instead of full text values.
 parsedName = HumanName(row['fullName'])
 ap = AddressParser()
 parsedAddress = ap.parse_address(row['fullTextAddress'])
 if not row['salutation']:
     row['salutation'] = parsedName.title
 if not row['firstName']:
     # was: People.Name.first = parsedName.first -- that set a class
     # attribute instead of filling this row's column
     row['firstName'] = parsedName.first
 if not row['middleName']:
     row['middleName'] = parsedName.middle
 if not row['lastName']:
     row['lastName'] = parsedName.last
 if not row['nameSuffix']:
     row['nameSuffix'] = parsedName.suffix
 if not row['nickName']:
     row['nickName'] = parsedName.nickname
Exemple #27
0
    def ProcessScan(self, evt):
        """Handle a completed license scan event (Python 2 / wxPython).

        Decodes AAMVA license data from ``evt.data``, opens a lookup URL
        (BASE_URL plus the person's details as a query string) in the browser,
        and populates the form's text fields from the parsed license.

        :param evt: scan event whose ``data`` attribute holds the raw scan bytes
        """
        print "Got a scan!"

        try:
            license = self.parser.decode(evt.data)
        except aamva.ReadError as e:
            #GUI interaction must be done in a thread-safe way
            wx.CallAfter(self.ErrorMessage, 'Invalid data.\n{0}'.format(e))
            return

        # License data is typically all-caps; HumanName.capitalize() restores
        # conventional name casing.
        name = HumanName("{} {}".format(xstr(license['first']).lower(), xstr(license['last']).lower()))
        name.capitalize()

        # Query parameters for the person-lookup page.  ZIP is reformatted
        # as the 5+4 "12345-6789" form.
        query = {
            'firstName' : name.first,
            'lastName' : name.last,
            'address1' : license['address'],
            'address2' : xstr(license['address2']),
            'city' : license['city'],
            'state' : license['state'],
            'postalCode' : xstr(license['ZIP'])[0:5]+"-"+xstr(license['ZIP'])[5:],
            'country' : license['country'],
            'birthdate' : license['dob']
        }

        params = urllib.urlencode(query)

        # For expired IDs, warn first and open the browser without raising it;
        # otherwise open and raise the lookup page.
        if license['expiry'] <= date.today():
            wx.CallAfter(self.InfoMessage, str('ID expired {}'.format(license['expiry'])))
            webbrowser.open(BASE_URL + "?" + params, new=0, autoraise=False)
        else:
            webbrowser.open(BASE_URL + "?" + params, new=0, autoraise=True)

        #clear form
        self.clearForm()

        #set the fields (optional/possibly-missing values go through xstr or
        # fall back to "???")
        self.NameText.SetValue(license['first'])
        if license['middle'] is not None:
            self.MiddleText.SetValue(license['middle'])
        self.SurnameText.SetValue(license['last'])
        self.DOBText.SetValue(xstr(license['dob']))
        self.AddressText.SetValue(license['address'])
        self.Address2Text.SetValue(xstr(license['address2']))
        self.CityText.SetValue(license['city'])
        self.StateText.SetValue(license['state'])
        self.ZIPText.SetValue(xstr(license['ZIP'])[0:5]+"-"+xstr(license['ZIP'])[5:])
        self.IINText.SetValue(license['IIN'])
        self.LicenseNoText.SetValue(license['license_number'])
        self.IssuedText.SetValue(xstr(license['issued']))
        self.ExpiresText.SetValue(xstr(license['expiry']))
        try:
            self.CountryText.SetValue(license['country'])
        except KeyError:
            self.CountryText.SetValue("???")
        if license['sex'] == aamva.MALE:
            self.MaleRadio.SetValue(True)
        elif license['sex'] == aamva.FEMALE:
            self.FemaleRadio.SetValue(True)
        self.HeightText.SetValue(xstr(license['height']))
        self.WeightText.SetValue(xstr(license['weight']))
        if license['hair'] is None:
            self.HairText.SetValue("???")
        else:
            self.HairText.SetValue(license['hair'])
        if license['eyes'] is None:
            self.EyesText.SetValue("???")
        else:
            self.EyesText.SetValue(license['eyes'])
        self.EndorsementsText.SetValue(license['endorsements'])
        self.RestrictionsText.SetValue(license['restrictions'])
Exemple #28
0
class ParsedName(object):
    """Class for representing a name.

    After construction, the instance exposes the fields exposed by `HumanName` instance, i.e.
    `title`, `first`, `middle`, `last`, `suffix`.
    """
    constants = _prepare_nameparser_constants()
    """The default constants configuration for `HumanName` to use for parsing all names."""
    def __init__(self, name, constants=None):
        """Create a ParsedName instance.

        Args:
            name (Union[str, HumanName]): The name to be parsed (must be non empty nor None).
            constants (:class:`nameparser.config.Constants`): Configuration for `HumanName` instantiation.
                (Can be None, if provided it overwrites the default one generated in
                :method:`prepare_nameparser_constants`.)
        """
        if not constants:
            constants = ParsedName.constants

        if isinstance(name, HumanName):
            self._parsed_name = name
        else:
            self._parsed_name = HumanName(name, constants=constants)
            self._parsed_name.capitalize()

    def __iter__(self):
        # Must return an *iterator*.  The HumanName instance is iterable but
        # is not itself an iterator (it has no __next__), so returning it
        # directly would make `iter(parsed_name)` raise TypeError.
        return iter(self._parsed_name)

    def __len__(self):
        return len(self._parsed_name)

    def __repr__(self):
        return repr(self._parsed_name)

    def __str__(self):
        return str(self._parsed_name)

    @property
    def title(self):
        return self._parsed_name.title

    @property
    def first(self):
        return self._parsed_name.first

    @property
    def first_list(self):
        return self._parsed_name.first_list

    @property
    def middle(self):
        return self._parsed_name.middle

    @property
    def middle_list(self):
        return self._parsed_name.middle_list

    @property
    def last(self):
        return self._parsed_name.last

    @property
    def last_list(self):
        return self._parsed_name.last_list

    @property
    def suffix(self):
        return self._parsed_name.suffix

    @property
    def suffix_list(self):
        return self._parsed_name.suffix_list

    @classmethod
    def loads(cls, name):
        """Load a parsed name from a string.

        Raises:
            TypeError: when name isn't a type of `six.string_types`.
            ValueError: when name is empty or None.
        """
        if not isinstance(name, six.string_types):
            raise TypeError(
                u'arguments to {classname} must be of type {string_types}'.
                format(classname=cls.__name__,
                       string_types=repr(six.string_types)))
        if not name or name.isspace():
            raise ValueError('name must not be empty')

        return cls(name)

    def dumps(self):
        """Dump the name to string, after normalizing it."""
        def _is_initial(author_name):
            # Single letter, or anything already containing a dot.
            return len(author_name) == 1 or u'.' in author_name

        def _ensure_dotted_initials(author_name):
            # Append a dot to bare one-letter initials ("J" -> "J.").
            if _is_initial(author_name) \
                    and u'.' not in author_name:
                seq = (author_name, u'.')
                author_name = u''.join(seq)
            return author_name

        def _ensure_dotted_suffixes(author_suffix):
            # Append a dot to undotted suffixes ("Jr" -> "Jr.").
            if u'.' not in author_suffix:
                seq = (author_suffix, u'.')
                author_suffix = u''.join(seq)
            return author_suffix

        def _is_roman_numeral(suffix):
            """Controls that the user's input only contains valid roman numerals"""
            valid_roman_numerals = [
                u'M', u'D', u'C', u'L', u'X', u'V', u'I', u'(', u')'
            ]
            return all(letters in valid_roman_numerals
                       for letters in suffix.upper())

        # Create first and middle
        first_name = _ensure_dotted_initials(self.first)
        middle_name = _ensure_dotted_initials(self.middle)

        # Two initials are joined without a space ("J.K."); otherwise keep one.
        if _is_initial(first_name) and _is_initial(middle_name):
            normalized_names = u'{first_name}{middle_name}'
        else:
            normalized_names = u'{first_name} {middle_name}'

        normalized_names = normalized_names.format(
            first_name=first_name,
            middle_name=middle_name,
        )

        if _is_roman_numeral(self.suffix):
            suffix = self.suffix.upper()
        else:
            suffix = _ensure_dotted_suffixes(self.suffix)

        final_name = u', '.join(part
                                for part in (self.last,
                                             normalized_names.strip(), suffix)
                                if part)

        # Replace unicode curly apostrophe to normal apostrophe.
        final_name = final_name.replace(u'’', '\'')

        return final_name

    @classmethod
    def from_parts(cls,
                   first=None,
                   last=None,
                   middle=None,
                   suffix=None,
                   title=None):
        """Build a ParsedName from individual name components."""
        name = HumanName()
        name.first = first
        name.middle = middle
        name.last = last
        name.suffix = suffix
        name.title = title
        return ParsedName(name)
Exemple #29
0
    def resolve(self, match_entity=None):
        """
        Associate each person name of self.person_name_list to an entity
        :param match_entity: scratch variable only; any value passed in is
            overwritten before first use -- TODO confirm no caller relies on it
        :return: dict idx_of_person_name -> entity,
        list of indexes that have to be discarded because matched entity is None
        """

        # PRE-PROCESSING STEP :
        # each person name is parsed using human name parser
        # each time we succeed to associate a human_name to an entity, we will remove it from this list
        human_name_list = [
            (idx, self.name_preprocessing(person_name))
            for idx, person_name in enumerate(self.person_name_list)
        ]

        # some name will contain just a title. For instance 'Sir' alone. It will be detected as a character name
        # by BERT NER but we won't try to associate it with an entity.
        # by default, we will associate such terms with a unique "NONE" entity
        remaining_list = []
        empty_entity = Entity(HumanName("NONE"))
        for idx, human_name in human_name_list:
            if human_name.first == "" and human_name.last == "":
                self.entities_match[idx] = empty_entity
            else:
                remaining_list.append((idx, human_name))
            # NOTE(review): names whose first part is the literal '``' get the
            # HumanName itself (not an Entity) stored in entities_match, and may
            # also have been appended to remaining_list above -- confirm intended
            if human_name.first == "``":
                human_name.first = ""
                self.entities_match[idx] = human_name
        human_name_list = remaining_list

        # STEP 1 :
        # for each human_name that are complets ie: that contains a title, a first name and last name
        #    -> for instance: Miss Elizabeth Bennet
        # if there already exists an entity which has this first and last name: associate the human_name to this entity
        # else : create a new entity
        print(
            "Co-ref step 1 : associate character name that have title, first name and last name to entity"
        )
        remaining_list = [
        ]  # to store the human name we have not succeed to bind to an entity
        for idx, human_name in tqdm(human_name_list):
            if human_name.title != "" and human_name.first != "" and human_name.last != "":
                # Take the first entity with matching first AND last name, if any.
                try:
                    match_entity = [
                        entity for entity in self.entity_set
                        if human_name.first == entity.human_name.first
                        and human_name.last == entity.human_name.last
                    ][0]
                except IndexError:
                    match_entity = None

                if match_entity is None:
                    self.create_entity(idx, human_name)
                else:
                    self.entities_match[idx] = match_entity
            else:
                remaining_list.append((idx, human_name))
        human_name_list = remaining_list

        # STEP 2 :
        # for each remaining human_names that contain at least first name and last name
        #   -> for instance : Elizabeth Bennet
        # if there already exists an entity which has this first and last name: associate the human_name to this entity
        # else : create a new entity
        print(
            "Co-ref step 2 : associate character name that have just first name and last name to entity"
        )
        remaining_list = []
        for idx, human_name in tqdm(human_name_list):
            if human_name.first != "" and human_name.last != "":
                try:
                    match_entity = [
                        entity for entity in self.entity_set
                        if human_name.first == entity.human_name.first
                        and human_name.last == entity.human_name.last
                    ][0]
                except IndexError:
                    match_entity = None

                if match_entity is None:
                    self.create_entity(idx, human_name)
                else:
                    self.entities_match[idx] = match_entity
            else:
                remaining_list.append((idx, human_name))
        human_name_list = remaining_list

        # STEP 3 :
        # for each remaining human_names that contain a title and first name
        #   -> for instance : Miss Bennet
        # if there already exists entities which contains this first name and has the same genre (ie: Elizabeth Bennet)
        #     associate the human_name to the most common entity among those entities
        # else : create a new entity
        print(
            "Co-ref step 3 : associate character name that have just title and first name to entity"
        )
        remaining_list = []
        for idx, human_name in tqdm(human_name_list):
            if human_name.title != "" and human_name.first != "":
                # Collect candidate entities with the same first name whose
                # genre matches, or where either genre is unknown.
                possible_entities = []
                for entity in self.entity_set:
                    if entity.human_name.first == human_name.first:
                        if self.genre_of(
                                human_name
                        ) == Genre.UKN or entity.genre == Genre.UKN:
                            possible_entities.append(entity)
                        else:
                            if entity.genre == self.genre_of(human_name):
                                possible_entities.append(entity)

                match_entity = self.most_frequent_entity(possible_entities)
                if match_entity is None:
                    self.create_entity(idx, human_name)
                else:
                    self.entities_match[idx] = match_entity
            else:
                remaining_list.append((idx, human_name))
        human_name_list = remaining_list

        # STEP 4 :
        # for each remaining human_names that contain a title and last name
        #   -> for instance : Mrs. Bennet
        # if there already exists entities which contains this last name and has the same genre (ie: Elizabeth Bennet)
        #     associate the human_name to the most common entity among those entities
        # else : create a new entity
        print(
            "Co-ref step 4 : associate character name that have just title and last name to entity"
        )
        remaining_list = []
        for idx, human_name in tqdm(human_name_list):
            if human_name.title != "" and human_name.last != "":
                possible_entities = []
                for entity in self.entity_set:
                    if entity.human_name.last == human_name.last:
                        if self.genre_of(
                                human_name
                        ) == Genre.UKN or entity.genre == Genre.UKN:
                            possible_entities.append(entity)
                        else:
                            if entity.genre == self.genre_of(human_name):
                                possible_entities.append(entity)
                match_entity = self.most_frequent_entity(possible_entities)

                if match_entity is None:
                    self.create_entity(idx, human_name)
                else:
                    self.entities_match[idx] = match_entity
            else:
                remaining_list.append((idx, human_name))
        human_name_list = remaining_list

        # STEP 5 :
        # At this step, the human_name_list only contain first name
        # Note that this first could also corresponding to last_name, indeed both Duval or Alexandre will be parsed as
        # HumanName(first='Duval') , HumanName(first='Alexandre') by the HumanParser
        #
        # so for each of this human_name we look in the list of entities for the most common entities which contain
        print(
            "Co-ref step 5 : associate character name that have just first name or last name to entity"
        )
        for idx, human_name in tqdm(human_name_list):
            # NOTE(review): if a name reaches here with BOTH first and last
            # non-empty (or carries a stale value from a prior iteration),
            # possible_entities is not reset -- steps 1-4 appear to make that
            # unreachable, but worth confirming.
            if human_name.first == "":
                possible_entities = [
                    entity for entity in self.entity_set
                    if entity.human_name.last == human_name.last
                    or entity.human_name.first == human_name.last
                ]
            if human_name.last == "":
                possible_entities = [
                    entity for entity in self.entity_set
                    if entity.human_name.first == human_name.first
                    or entity.human_name.last == human_name.first
                ]

            match_entity = self.most_frequent_entity(possible_entities)
            if match_entity is None:
                self.create_entity(idx, human_name)
            else:
                self.entities_match[idx] = match_entity

        return self.entities_match
Exemple #30
0
    def latex(self):
        """Render the grant-report text template.

        Validates that exactly one grant was specified, determines the
        reporting period (from rc.from_date/rc.to_date or the grant's own
        dates), gathers the grant's projecta, people, publications and
        collaborators, and renders them through the "grantreport.txt"
        template.

        Raises:
            RuntimeError: if no grant, or more than one grant, is specified.
        """
        rc = self.rc

        if not rc.grants:
            raise RuntimeError(
                "Error: no grant specified. Please rerun specifying a grant")
        if isinstance(rc.grants, str):
            rc.grants = [rc.grants]
        if len(rc.grants) > 1:
            # Trailing space added: adjacent literals are concatenated, so
            # without it the message read "...rerun withonly a single grant."
            raise RuntimeError(
                "Error: more than one grant specified. Please rerun with "
                "only a single grant.")
        grant_id = rc.grants[0]
        grant = fuzzy_retrieval(self.gtx['grants'], ['_id', "alias", "name"],
                                grant_id)
        grant_dates = get_dates(grant)

        # Convert Date Strings to Datetime Objects
        if rc.from_date:
            rp_start_date = date_parser.parse(rc.from_date).date()
        else:
            rp_start_date = grant_dates.get("begin_date")
            print(
                f"INFO: no begin-date specified.  running report from the beginning "
                f"of the grant period ({rp_start_date})")
        if rc.to_date:
            rp_end_date = date_parser.parse(rc.to_date).date()
        else:
            rp_end_date = min([date.today(), grant_dates.get("end_date")])
            print(
                "INFO: no end-date specified for the reporting period.  Running "
                "report up to the earlier of the end of the grant, or today "
                f"({rp_end_date}).")
        report_dates = {'begin_date': rp_start_date, 'end_date': rp_end_date}
        # Trailing space and dropped stray ')' fix the previously garbled
        # "...the periodfrom 2020-01-01 to 2020-12-31)" message.
        print(f"INFO: generating report for grant {grant_id} for the period "
              f"from {rp_start_date} to {rp_end_date}")

        # Get prum associated to grant and active during reporting period
        #        institutions_coll = [inst for inst in self.gtx["institutions"]]
        institutions_coll = self.gtx["institutions"]
        grant_prums = [
            prum for prum in self.gtx['projecta']
            if grant_id in prum.get('grants', [])
            and "checklist" not in prum.get("deliverable").get("scope")
        ]
        #        for prum in self.gtx['projecta']:
        #            if grant_name in prum['grants']:
        #                begin_date = get_dates(prum).get('begin_date')
        #                due_date = get_due_date(prum['deliverable'])
        #                # if projectum was finished during reporting period or is still current
        #                # some projectum don't have an "end date", but all projecta have a deliverable
        #                # due_date
        #                if (rp_start_date <= due_date <= rp_end_date and prum['status'] is "finished") or is_current(prum):
        #                   grant_prums.append(prum)
        # Get people associated with grant

        grant_prums_finished_this_period = [
            prum for prum in grant_prums
            if is_current(report_dates,
                          get_dates(prum).get('end_date'))
        ]
        grant_prum_leads = list(set([prum['lead'] for prum in grant_prums]))
        grant_prum_collaborators = list(
            set([
                collab for prum in grant_prums
                for collab in prum.get('collaborators', [])
            ]))
        grant_prum_group_members = list(
            set([
                grp_mbr for prum in grant_prums
                for grp_mbr in prum.get('group_members', [])
            ]))
        grant_people = grant_prum_leads
        # Accomplishments: unfinished prums are "major activities"; prums
        # finished during the reporting period are "significant results".
        major_activities = []
        significant_results = []
        for prum in grant_prums:
            if prum['status'] == "finished":
                continue
            else:
                major_activities.append(prum)
        for prum in grant_prums_finished_this_period:
            significant_results.append(prum)

        # Opportunities for Training and Professional Development
        training_and_professional_development = []
        # presentations
        for id in grant_people:
            training_and_professional_development.extend(
                filter_presentations(self.gtx["people"],
                                     self.gtx["presentations"],
                                     institutions_coll,
                                     id,
                                     types=["all"],
                                     since=rp_start_date,
                                     before=rp_end_date,
                                     statuses=["accepted"]))
        # thesis defendings
        # how do i access people.yml in rg-db-public vs the people.yml file in rg-db-group?
        #        defended_theses = []
        #        for id in grant_people:
        #            for prsn in self.gtx['people']:
        #                if prsn["_id"] != id:
        #                    continue
        #                else:
        #                    person = prsn
        #            for education in person['education']:
        #                edu_dates = get_dates(education)
        #                if 'phd' in education['degree'].lower() and 'columbia' in education['institution'].lower() and \
        #                        rp_start_date.year <= edu_dates.get('end_date', edu_dates['date']).year <= rp_end_date.year:
        #                    defended_theses.append(id)

        # Products
        # need rg-db-public's citation.yml
        #        publications = filter_publications(self.gtx["citations"],
        ##                                           set(grant_people),
        #                                           since=rp_start_date,
        #                                          before=rp_end_date)
        publications = [
            publ for publ in self.gtx["citations"]
            if grant_id in publ.get("grant", "")
        ]

        # Normalize author strings for the template via HumanName.
        for publ in publications:
            formatted_authors = [
                HumanName(name).full_name for name in publ.get("authors", [])
            ]
            publ["authors"] = formatted_authors
        # Participants/Organizations
        participants = []
        for person in self.gtx["people"]:
            months_on_grant, months_left = self.months_on(
                grant_id, person, rp_start_date, rp_end_date)
            if months_on_grant > 0:
                participants.append({
                    "name":
                    person.get("name"),
                    "email":
                    person.get("email"),
                    "position":
                    person.get('position'),
                    "months_on_grant":
                    int(round(months_on_grant, 0))
                })

        # Resolve collaborator contacts and their institutions; collect the
        # ids we could not resolve so they can be reported below.
        collaborators = {}
        missing_contacts = []
        for id in grant_prum_collaborators:
            for contact in self.gtx["contacts"]:
                if contact["_id"] == id:
                    name = contact.get("name")
                    aka = contact.get("aka")
                    institution_id = contact.get("institution")
                    institution = fuzzy_retrieval(institutions_coll,
                                                  ["name", "aka", "_id"],
                                                  institution_id)
                    if institution:
                        inst_name = institution.get("name")
                    else:
                        print(
                            f"WARNING: institution {institution_id} not found "
                            f"in institutions collection")
                        inst_name = institution_id
                    collaborators[id] = {
                        "aka": aka,
                        "name": name,
                        "institution": inst_name
                    }
        missing_contacts = [
            id for id in grant_prum_collaborators if not collaborators.get(id)
        ]
        missing_contacts = list(set(missing_contacts))
        for person_id in missing_contacts:
            print(
                f"WARNING: contact {person_id} not found in contacts collection"
            )

        # Impacts
        begin_date_str = rp_start_date.isoformat()
        end_date_str = rp_end_date.isoformat()
        self.render(
            "grantreport.txt",
            f"{grant_id}_report_{begin_date_str}_{end_date_str}.txt",
            begin_date=begin_date_str,
            end_date=end_date_str,
            majorActivities=major_activities,
            significantResults=significant_results,
            trainingAndProfessionalDevelopment=
            training_and_professional_development,
            #            defendedTheses=defended_theses,
            products=publications,
            grantPeople=grant_people,
            participants=participants,
            collaborators=collaborators,
            hline=
            "------------------------------------------------------------------------------"
        )
Exemple #31
0
def scrub_name(name):
    """Parse *name* into a HumanName after stripping commas and emoji."""
    without_commas = name.replace(',', '')
    without_emoji = demoji.replace(without_commas, '')
    return HumanName(without_emoji)
Exemple #32
0
def get_last_name(author):
    """Return the last name of the first author in an "A and B" author string.

    Splits on " and " (with surrounding spaces) rather than the bare
    substring "and", which would mangle names merely containing it
    (e.g. "Alexandra Anderson" -> "Alex").
    """
    first_author = HumanName(author.split(" and ")[0])
    return first_author.last
Exemple #33
0
 def last_first(self):
     """Return self.name as "Last, First Middle", or '' when name is unset."""
     if not self.name:
         return ''
     parsed = HumanName(self.name)
     return '{0.last}, {0.first} {0.middle}'.format(parsed).strip()
Exemple #34
0
 def _generate_lastName(self):
     """Set self.lastName from the strong/a link text of the first info section."""
     section = extract(RULES["info"], self.sec, multi=True)[0]
     link_text = extract("//strong/a/text()", str(etree.tostring(section)))
     self.lastName = HumanName(link_text).last
Exemple #35
0
 def test_capitalize_diacritics(self):
     # capitalize() must uppercase the leading letter even when the name
     # contains non-ASCII characters (here: 'ä' in Matthäus).
     hn = HumanName(u'matth\xe4us schmidt')
     hn.capitalize()
     self.m(unicode(hn), u'Matth\xe4us Schmidt', hn)
 def _generate_lastName(self):
     """Set self.lastName from parse_data["name"] when present and non-empty."""
     # dict.get collapses the original `"name" in ...keys()` membership test
     # plus the truthiness re-lookup into a single access.
     raw_name = self.parse_data.get("name")
     if raw_name:
         self.lastName = HumanName(raw_name).last
Exemple #37
0
 def test123(self):
     # An already-correctly-capitalized name should pass through
     # capitalize() unchanged.
     hn = HumanName('Shirley Maclaine')
     hn.capitalize()
     self.m(str(hn), 'Shirley Maclaine', hn)
Exemple #38
0
    def extract_raw_org_names_from_name(name_raw):
        """
        Finds raw org names like "B&W" in a name string, standardizes them (e.g. to
        "Brown & Williamson"), and returns the name without that raw org name + extracted positions


        :param name_raw: str
        :return: str (name_raw without the raw org name), list of str (extracted clean
        organization names)
        """
        extracted_positions = []

        for raw_org, clean_org in RAW_ORG_TO_CLEAN_ORG_DICT.items():

            while True:
                search_hit = None
                # this is a bit of an ugly hack to get the last (rather than the first) search hit
                # for a string: we iterate over all matches and the last one gets stored in
                # search_hit

                # NOTE(review): raw_org is interpolated unescaped into the pattern;
                # a key containing regex metacharacters would need re.escape —
                # presumably the dict holds only plain tokens; confirm.
                for search_hit in re.finditer(r'\b' + raw_org + r'\b', name_raw):
                    pass

                if not search_hit:
                    break

                # 3+ character abbreviations are unambiguous: always remove them.
                if len(raw_org) >= 3:
                    name_raw = name_raw[0:search_hit.start()] + name_raw[search_hit.end():]
                    # "@skip@" marks raw orgs that should be deleted but not reported.
                    if not clean_org == "@skip@":
                        extracted_positions.append(clean_org)

                elif len(raw_org) == 2:
                    name_raw_test = name_raw[0:search_hit.start()] + name_raw[search_hit.end():]

                    # test if deleted, there exists first & middle name
                    name = HumanName(name_raw_test)
                    # if first & middle name do not exist after deletion, the deleted org might
                    # actually be initials, so ignore the match
                    if not name.first and not name.middle:
                        break

                    # last names without middle names ("TEMKO") get interpreted as first names
                    # without last names. Skip those cases
                    if not name.last:
                        break

                    # if not, do extract raw_org
                    extracted_positions.append(clean_org)
                    name_raw = name_raw_test

        name_raw = name_raw.strip(', ')

        # more adventurous: try to extract organizations we don't have in the dictionary
        # do this only if a) the name is currently not valid (i.e. it has strange characters like
        # commas in the last name) and b) extracting an org makes it valid,
        # e.g. 'HOLMAN RT, DEUEL CONFERENCE ON LIPIDS'


        if len(name_raw) > 0:
            first, middle, last, _ = Person.parse_raw_name(name_raw, 0, extract_orgs=False)

            if not Person(last=last, middle=middle, first=first).check_if_this_person_looks_valid():
                # treat everything from the first comma onward as a candidate org name
                search_hit = re.search(',.+$', name_raw)
                if search_hit:
                    extracted_position = name_raw[search_hit.start():].strip(', ')
                    name_raw_without_org = name_raw[0:search_hit.start()] + name_raw[
                        search_hit.end():]

                    # if raw name becomes valid after extracting the org, then we add it to the orgs
                    # otherwise, we skip it
                    first, middle, last, _ = Person.parse_raw_name(name_raw_without_org,
                                                                   0, extract_orgs=False)
                    if Person(last=last, middle=middle,
                              first=first).check_if_this_person_looks_valid():
                        extracted_positions.append(extracted_position)
                        name_raw = name_raw_without_org




        name_raw = name_raw.strip(', ')
        return name_raw, extracted_positions
def get_names_and_genders_from_journals():
    """
    Creates a csv that identifies names that we need to clean by hand.

    Scans post-1950 English research articles in GENERAL_JOURNALS, counts
    each (first_name, last_name) author pair, guesses the gender with two
    independent heuristics, and writes data/ambiguous_author_gender.csv with
    a human_check_necessary flag wherever the guesses disagree or are not
    a definitive 'male'/'female'.

    :return:
    """

    authors_counter = Counter()
    journalc = Counter()

    db = sqlite3.connect(Path('data', 'JSTOR_full_cleaned.db'))
    cur = db.cursor()
    cur2 = db.cursor()
    cur.execute('''select journal, ID_doi, ID_jstor from article_pub_info 
                      where article_type="research article" and year > 1950 and language="eng";'''
                )

    rows = cur.fetchall()

    for article_id, (journal, ID_doi, ID_jstor) in enumerate(rows):
        # progress output: current index out of total rows
        print(article_id, len(rows))
        if journal in GENERAL_JOURNALS:
            journalc[journal] += 1

            # NOTE(review): IDs are interpolated into the SQL with f-strings;
            # fine for this trusted local db, but "?" placeholders would be safer.
            if ID_doi:
                cur2.execute(
                    f'SELECT name, surname, role FROM contributors WHERE ID_doi = "{ID_doi}"'
                )
            elif ID_jstor:
                cur2.execute(
                    f'SELECT name, surname, role FROM contributors WHERE ID_jstor = "{ID_jstor}"'
                )
            else:
                raise ValueError("NO id for ", journal)

            article_authors = cur2.fetchall()
            for first_name, last_name, role in article_authors:
                # some last names contain commas, which trip up the gender guesser
                last_name = last_name.strip(',')
                authors_counter[(first_name, last_name)] += 1

    authors = []

    for author in authors_counter:
        first_name, last_name = author
        # "Last, First" is the sort-name form HumanName parses unambiguously.
        human_name = HumanName(f'{last_name}, {first_name}')

        guess_census = guess_gender_census(human_name)
        guess_first_middle_name_international = guess_gender_with_middle_name_and_international_names(
            human_name)

        # Only trust the result when both heuristics agree on a definitive gender.
        human_check_necessary = True
        if (guess_census == guess_first_middle_name_international
                and (guess_census == 'male' or guess_census == 'female')):
            human_check_necessary = False

        authors.append({
            'first_name':
            first_name,
            'last_name':
            last_name,
            'count':
            authors_counter[author],
            'prob_male_census':
            guess_gender_census(human_name, return_type='probability_male'),
            'guess_census':
            guess_census,
            # 'guess_first_name_usa': guess_first_name_usa,
            'guess_first_middle_name_international':
            guess_first_middle_name_international,
            'human_check_necessary':
            human_check_necessary
        })

    df = pd.DataFrame(authors)
    df.to_csv(Path('data', 'ambiguous_author_gender.csv'), encoding='utf8')
Exemple #40
0
        'suffix',
    ]
    # Add each new name-part column to this date's raw contributions table.
    for field in new_field_list:
        sql_query = ''' ALTER TABLE `%s_raw_contribs` ADD %s TEXT  ''' % (
            date_stamp, field)
        c.execute(sql_query)
# NOTE(review): the bare except assumes ALTER TABLE only fails when the
# columns already exist — any other sqlite error is silently swallowed too.
except:
    print 'fields already exist'
sql_select_command = ''' SELECT rowid, name_of_contributor FROM `%s_raw_contribs` WHERE employer <> '' AND occupation <> '' ''' % date_stamp  # every sqlite table automatically has a unique row id you can use
execute_query = c.execute(sql_select_command)
rows = execute_query.fetchall()
for row in rows:
    row_id = row[0]
    name = row[
        1]  # python lists start counting at 0 and the full name of the contributor is the second column in the sql select statement
    # Split the contributor's full name into parts with nameparser.
    parsed_name = HumanName(name)
    # NOTE(review): the name parts are spliced into the SQL via %s, so any
    # name containing an apostrophe (e.g. O'Brien) breaks this statement;
    # parameterized queries ("?" placeholders) would be the robust fix.
    sql_command = '''
        UPDATE `%s_raw_contribs` SET title = '%s', fname = '%s', middle = '%s', lname = '%s', suffix = '%s' WHERE rowid = %s
    ''' % (
        date_stamp,
        parsed_name.title,
        parsed_name.first,
        parsed_name.middle,
        parsed_name.last,
        parsed_name.suffix,
        row_id,
    )
    c.execute(sql_command)

sql_command = "SELECT rowid, name_of_contributor, title, fname, middle, lname, suffix FROM `%s_raw_contribs`;" % (
    date_stamp)
Exemple #41
0
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON (dict mapping CSL field names to values)

    Note:
        If *name* is already a HumanName instance, it is modified in place
        by the suffix fix-up and middle-name merge below.

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True
    """
    # Optionally convert to nameparser.HumanName
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    # (raw string fixes the invalid '\.' escape in the original pattern)
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Initialize CSL data
    csl_data = {}

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Iterate over lookup fields
    for lookup in human_to_csl_map:

        # Get field and function (I is the identity fallback)
        field = human_to_csl_map[lookup]['field']
        fun = human_to_csl_map[lookup].get('fun', I)

        # Get field from name
        value = getattr(name, field)

        # Skip if empty
        if not value:
            continue

        # Apply function
        value = fun(value)

        # Save to CSL data
        csl_data[lookup] = value

    # Return CSL data
    return csl_data
Exemple #42
0
    def parse_rss_2_cap(message):
        """
            Parse RSS Feeds into the CAP Module

            Looks up the stored RSS record for *message*; bails out if any
            parsing-status row marks it already parsed. Then either updates
            the matching cap_info record (matched by description/body) or
            imports the CAP alert XML from the feed's embedded link —
            retrying with HTTP Basic auth on 401, and falling back to
            alternate application/cap+xml links on the same domain.
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        message_id = message.message_id
        record = db(table.message_id == message_id).select(
            table.id,
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.author,
            limitby=(0, 1)).first()
        if not record:
            return

        pstable = s3db.msg_parsing_status
        # not adding (pstable.channel_id == record.channel_id) to query
        # because two channels (http://host.domain/eden/cap/public.rss and
        # (http://host.domain/eden/cap/alert.rss) may contain common url
        # eg. http://host.domain/eden/cap/public/xx.cap
        pquery = (pstable.message_id == message_id)
        prows = db(pquery).select(pstable.id, pstable.is_parsed)
        for prow in prows:
            if prow.is_parsed:
                return

        alert_table = s3db.cap_alert
        info_table = s3db.cap_info

        # Is this an Update or a Create?
        # @ToDo: Use guid?
        # Use Body
        body = record.body or record.title
        query = (info_table.description == body)
        exists = db(query).select(info_table.id, limitby=(0, 1)).first()

        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            # Reuse an existing person record with the same parsed name parts.
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            # @ToDo: Use XSLT
            info_id = exists.id
            db(info_table.id == info_id).update(
                headline=record.title,
                description=body,
                created_on=record.date,
                #location_id = record.location_id,
                #person_id = person_id,
            )

        else:
            # Embedded link
            url = record.from_address
            import_xml = s3db.resource("cap_alert").import_xml
            stylesheet = os.path.join(current.request.folder, "static",
                                      "formats", "cap", "import.xsl")
            try:
                file = fetch(url)
            except HTTPError as e:
                import base64
                rss_table = s3db.msg_rss_channel
                query = (rss_table.channel_id == record.channel_id)
                channel = db(query).select(rss_table.date,
                                           rss_table.etag,
                                           rss_table.url,
                                           rss_table.username,
                                           rss_table.password,
                                           limitby=(0, 1)).first()
                username = channel.username
                password = channel.password
                if e.code == 401 and username and password:
                    request = urllib2.Request(url)
                    # NOTE(review): base64.encodestring appends a trailing
                    # newline, which some servers reject in an auth header;
                    # base64.b64encode would be cleaner — confirm before changing.
                    base64string = base64.encodestring("%s:%s" %
                                                       (username, password))
                    request.add_header("Authorization",
                                       "Basic %s" % base64string)
                else:
                    request = None

                try:
                    file = urlopen(request).read() if request else fetch(url)
                except HTTPError as e:
                    # Check if there are links to look into
                    ltable = s3db.msg_rss_link
                    query_ = (ltable.rss_id
                              == record.id) & (ltable.deleted != True)
                    rows_ = db(query_).select(ltable.type, ltable.url)
                    url_format = "{uri.scheme}://{uri.netloc}/".format
                    url_domain = url_format(uri=urlparse.urlparse(url))
                    for row_ in rows_:
                        url = row_.url
                        if url and row_.type == "application/cap+xml" and \
                           url_domain == url_format(uri=urlparse.urlparse(url)):
                            # Same domain, so okey to use same username/pwd combination
                            if e.code == 401 and username and password:
                                request = urllib2.Request(url)
                                request.add_header("Authorization",
                                                   "Basic %s" % base64string)
                            else:
                                request = None
                            try:
                                file = urlopen(
                                    request).read() if request else fetch(url)
                            except HTTPError as e:
                                current.log.error(
                                    "Getting content from link failed: %s" % e)
                            else:
                                # Import via XSLT
                                import_xml(StringIO(file),
                                           stylesheet=stylesheet,
                                           ignore_errors=True)
                else:
                    # Import via XSLT
                    import_xml(StringIO(file),
                               stylesheet=stylesheet,
                               ignore_errors=True)
            else:
                # Public Alerts
                # eg. http://host.domain/eden/cap/public/xx.cap
                # Import via XSLT
                import_xml(StringIO(file),
                           stylesheet=stylesheet,
                           ignore_errors=True)

        # No Reply
        return
Exemple #43
0
    def latex(self):
        """Render the current-and-pending-support LaTeX document per group.

        For each group: resolves the PI, merges proposals with grants,
        normalizes dates and team-member names, filters to current and
        pending grants for the PI, computes subaward totals, drops current
        grants without a cpp flag, then renders and builds the PDF.
        """
        for group in self.gtx["groups"]:
            grp = group["_id"]
            pi = fuzzy_retrieval(self.gtx["people"], ["aka", "name"],
                                 group["pi_name"])
            pinames = pi["name"].split()
            piinitialslist = [i[0] for i in pinames]
            pi['initials'] = "".join(piinitialslist).upper()

            grants = merge_collections(self.gtx["proposals"],
                                       self.gtx["grants"], "proposal_id")
            for g in grants:
                print(g["_id"])
                g['year'] = None
                g['month'] = None
                # begin_date gets a far-past default so undated grants sort first
                g['end_date'] = get_dates(g).get('end_date')
                g['begin_date'] = get_dates(g).get('begin_date',
                                                   dt.date(1900, 1, 2))
                for person in g["team"]:
                    rperson = fuzzy_retrieval(self.gtx["people"],
                                              ["aka", "name"], person["name"])
                    if rperson:
                        person["name"] = rperson["name"]
                if g.get('budget'):
                    amounts = [i.get('amount') for i in g.get('budget')]
                    g['subaward_amount'] = sum(amounts)

            current_grants = [dict(g) for g in grants if is_current(g)]
            current_grants, _, _ = filter_grants(current_grants, {pi["name"]},
                                                 pi=False,
                                                 multi_pi=True)
            for g in current_grants:
                if g.get('budget'):
                    amounts = [i.get('amount') for i in g.get('budget')]
                    g['subaward_amount'] = sum(amounts)

            pending_grants = [
                g for g in self.gtx["proposals"] if is_pending(g["status"])
            ]
            for g in pending_grants:
                for person in g["team"]:
                    rperson = fuzzy_retrieval(self.gtx["people"],
                                              ["aka", "name"], person["name"])
                    if rperson:
                        person["name"] = rperson["name"]
            pending_grants, _, _ = filter_grants(pending_grants, {pi["name"]},
                                                 pi=False,
                                                 multi_pi=True)
            print([g.get('begin_date') for g in pending_grants])
            grants = pending_grants + current_grants
            for grant in grants:
                grant.update(
                    award_start_date="{}/{}/{}".format(
                        grant.get("begin_date").month,
                        grant.get("begin_date").day,
                        grant.get("begin_date").year,
                    ),
                    award_end_date="{}/{}/{}".format(
                        grant.get("end_date").month,
                        grant.get("end_date").day,
                        grant.get("end_date").year,
                    ),
                )
            badids = [
                i["_id"] for i in current_grants
                if not i.get('cpp_info').get('cppflag', "")
            ]
            # Iterate over a snapshot so removing from current_grants is safe;
            # renamed from `iter`, which shadowed the builtin.
            snapshot = copy(current_grants)
            for grant in snapshot:
                if grant["_id"] in badids:
                    current_grants.remove(grant)
            piname = HumanName(pi["name"])
            outfile = "current-pending-{}-{}".format(grp, piname.last.lower())
            print([grant["_id"] for grant in current_grants])

            self.render(
                "current_pending.tex",
                outfile + ".tex",
                pi=pi,
                pending=pending_grants,
                current=current_grants,
                pi_upper=pi["name"].upper(),
                group=group,
            )
            self.pdf(outfile)
Exemple #44
0
 def sender_name(self):
     """Parsed human name of the sender, minus the angle-bracket address and quotes."""
     cleaned = ANGLE_BRACKETS_REGEX.sub('', self.sender)
     cleaned = cleaned.strip().replace('"', '')
     return unicode(HumanName(cleaned))
Exemple #45
0
 def test_formating(self):
     # string_format controls how unicode()/str() renders the parsed parts.
     hn = HumanName("Rev John A. Kenneth Doe III")
     hn.string_format = "{title} {first} {middle} {last} {suffix}"
     self.assertEqual(unicode(hn), "Rev John A. Kenneth Doe III")
     # Inverted ("Last, Title First Middle, Suffix") rendering of the same name.
     hn.string_format = "{last}, {title} {first} {middle}, {suffix}"
     self.assertEqual(unicode(hn), "Doe, Rev John A. Kenneth, III")
Exemple #46
0
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Contact: [email protected]

from nameparser import HumanName
import codecs
import textwrap

authors = []

# Read "name,address" pairs from authors.csv.
# maxsplit=1 keeps addresses that themselves contain commas intact, and
# blank lines (e.g. a trailing newline) are skipped instead of raising
# ValueError on tuple unpacking.
with codecs.open('authors.csv', 'r', encoding='utf-8') as namefile:
    for line in namefile:
        if not line.strip():
            continue
        name, address = line.split(',', 1)
        authors.append((HumanName(name), address))

# Sort by parsed last name, then pin the first and last authors explicitly.
authors = sorted(authors, key=lambda author: author[0].last)
authors.insert(0, (HumanName("Michael R. Crusoe"), "*****@*****.**"))
authors.append((HumanName("C. Titus Brown"), "*****@*****.**"))

# print(authors)

# Build the BibTeX author field: "Last, First Middle and Last, First ... "
bibtex = '   author = \"'

for tup in authors:
    name = tup[0]
    name.string_format = "{last}, {first} {middle} and"
    bibtex += str(name) + " "

bibtex = bibtex[:-5] + '"'  # remove the trailing " and " and close the quote
Exemple #47
0
# Demonstrates a nameparser edge case: "Van Conway" is a plain
# first name + last name, but the parser treats "Van" as a surname prefix,
# so we build the correct HumanName by hand and print both for comparison.
full_name_test = ""

# Store problematic name:
name = "Van Conway"

# In original article, this is a first name and a last name.

# let nameparser parse
parsed = HumanName( name )

# look at how that turned out:
print( "Parsed HumanName for " + name + ":" )
print( Person.HumanName_to_str( parsed ) )

# now, make a second HumanName instance.
manual = HumanName()

# look at how that turned out:
print( "Empty HumanName?:" )
print( Person.HumanName_to_str( manual ) )

# override parsed values with correct name parts
# (assigning parts directly bypasses the parser entirely)
manual.first = "Van"
manual.last = "Conway"

# look at how that turned out:
print( "after manual configuration:" )
print( Person.HumanName_to_str( manual ) )

# now, try some lookups
Exemple #48
0
 def extract_last_name(self):
     """Add a "last_name" column to self.Xy, parsed from its "name" column."""
     def parse_last(full_name):
         return HumanName(full_name).last

     self.Xy["last_name"] = self.Xy.name.apply(parse_last)
Exemple #49
0
print "Now let's take care of those", train.Age.isnull().sum(), "null values"

# In[30]:

print "One idea would be to take the median age:", train.Age.median(
), "or mean:", train.Age.mean(
), "but I think we can get a clue from people's titles (ex Mr., Mrs.)"

# First let's see what titles we have.

# In[31]:

# Collect the honorific (Mr., Mrs., ...) nameparser finds in each passenger name.
titles = []
for name in train.Name:
    titles.append(HumanName(name).title)
print set(titles)

# The titles look good, except there's an empty string, perhaps that's for the less common titles, but I feel pretty good about this range since it has covered the basics.
# Now let's make a new feature for these titles.

# In[32]:

# NOTE(review): attribute assignment (train.Title = ...) sets an attribute on
# the DataFrame object rather than creating a real column; train['Title'] = ...
# would add a proper column — the reads below still work, but confirm intent.
train.Title = train.Name.map(lambda x: HumanName(x).title)

# In[33]:

print train[train.Title == ''].Name
print train[train.Title == ''].Survived

# These are the people with the 'empty' titles. Since there are only seven of them, and many of their titles are unique, I don't mind grouping them together into an 'uncommon title' group. Plus, they seem to follow the typical pattern of women survived and men died, so I do not expect any issues to arise in the machine learning section.
Exemple #50
0
 def test_capitalization_with_Mac_as_hyphenated_names(self):
     # "Mc"-style prefixes must be re-capitalized inside hyphenated surnames too.
     hn = HumanName('donovan mcnabb-smith')
     hn.capitalize()
     self.m(str(hn), 'Donovan McNabb-Smith', hn)
    def extract_title(self):

        # NOTE(review): this reads and writes the module-level Xy and the
        # global TITLE_TRANSLATOR rather than self.Xy / self.title_translator,
        # unlike the sibling extract_title variant elsewhere in this file —
        # confirm which frame this method is meant to mutate.
        Xy['title'] = Xy.name.apply(lambda x: HumanName(x).title).replace(
            TITLE_TRANSLATOR).replace({'\.': ''}, regex=True)
Exemple #52
0
 def test_capitalize_title(self):
     # Titles ("lt. gen.") and roman-numeral suffixes ("iv" -> "IV") must be
     # capitalized along with the name parts.
     hn = HumanName('lt. gen. john a. kenneth doe iv')
     hn.capitalize()
     self.m(str(hn), 'Lt. Gen. John A. Kenneth Doe IV', hn)
Exemple #53
0
	def clean_name(self):
		"""Normalize the submitted name: lowercase, parse, then re-capitalize."""
		raw = self.cleaned_data['name'].lower()
		parsed = HumanName(raw)
		parsed.capitalize()
		return unicode(parsed)
Exemple #54
0
 def extract_title(self):
     """Add a "title" column to self.Xy: the honorific parsed from "name",
     normalized via self.title_translator, with periods stripped.
     """
     # Read from self.Xy (not the module-level Xy) so the source frame
     # matches the one being written, consistent with extract_last_name().
     self.Xy["title"] = (
         self.Xy.name.apply(lambda x: HumanName(x).title)
         .replace(self.title_translator)
         .replace({r"\.": ""}, regex=True)
     )
def parse_persname(persname, auth="", source=""):
    """Parse a raw personal-name string into a dict of name-record fields.

    Extracts birth/death dates from *persname*, parses the remainder with
    nameparser, normalizes titles/suffixes/roman numerals (plus several
    one-off data cleanups), and returns a dict with empty values removed.

    :param persname: raw personal-name string, possibly containing dates
    :param auth: authority id for the name record
    :param source: source vocabulary for the name record
    :return: dict of unicode name fields
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = ["sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    # (Python 2 generator expressions scope their loop variable, so the
    # `title` inside any() does not clobber the outer `title`.)
    if not title and any(suffix.lower().strip(". ") == title for title in titles):
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned. Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {u"title": unicode(title),
                   u"primary_name": unicode(primary_name),
                   u"rest_of_name": rest_of_name,
                   u"suffix": unicode(suffix),
                   u"fuller_form": unicode(name.nickname),
                   u"numbers": unicode(number),
                   u"birth_date": unicode(birth_date),
                   u"death_date": unicode(death_date),
                   u"date_string": unicode(dates_string),
                   u"authority_id": unicode(auth),
                   u"source": unicode(source),
                   u"name_order": u"inverted",
                   u"sort_name_auto_generate": True}

    # remove empty fields
    # (Python 2 only: items() returns a list, so deleting while iterating is
    # safe; under Python 3 this would need list(name_parsed.items()).)
    for key, value in name_parsed.items():
        if not value:
            del name_parsed[key]

    return name_parsed
Exemple #56
0
csv_writer.writeheader()
#read the extracted author-names from the clean data and convert data from list to string
with open("author-names_extracted_modified.csv", 'r', encoding="latin") as f:
    for lines in f:
        list_string = lines

        #converting author list in string
        # NOTE(review): "".join() on a string returns it unchanged, so this
        # helper (re-defined on every iteration) is effectively a no-op.
        def listExtractedString(list_string):
            str1 = ""
            return (str1.join(list_string))

        #saved the result of the author list to string convertion
        authorListtoString = listExtractedString(list_string)
        #print(authorListtoString)
        #CONSTANTS.string_format = "{first} {middle} {last} ({suffix})"
        name = HumanName(authorListtoString)
        first = name.as_dict()["first"]
        middle = name.as_dict()["middle"]
        last = name.as_dict()["last"]
        # strip() peels leftover list-serialization characters (brackets,
        # quotes) from names that were stored as stringified Python lists.
        data = [{
            'first_name1': first.strip("'['").strip("]"),
            'middle_name1': middle,
            'last_name1': last.strip("']")
        }]
        for row in data:
            #print(row)
            csv_writer.writerow(row)

csvfile.close()

# applied nameparser on the metadata (ground-truth) and saved the result in metadata_author.csv file
Exemple #57
0
def get_lname(somename):
    """Return the surname component parsed out of *somename*."""
    return HumanName(somename).last
Exemple #58
0
	def namer(field):
		"""Normalize a raw contributor-name field into a (first, last) tuple.

		Accepts a string or a tuple of strings; uppercases, strips tab/CR/LF
		characters, and applies a battery of regex cleanups (the #NA codes
		reference the project's cleanup-case list) before parsing with
		nameparser. ANONYMOUS entries are parsed without any cleanup.
		"""
		#pre
		if type(field) == tuple:
			w_name = re.sub('[\t\r\n]', '', ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
		else:
			w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()
		if 'ANONYMOUS' not in w_name:
			# keep the segment before the semicolon, unless a " FORMER " note
			# means the real name is in the second segment
			if ' FORMER ' not in w_name:
				w_name = re.split(";", w_name)[0]
			else:
				w_name = re.split(";", w_name)[1]

			w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name) #6A, 4A-C
			
			out = HumanName(w_name)
			# drop stray single-initial prefixes from middle/last/first parts
			out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
			if " " in out.last:
				out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
			if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
				out.first, out.middle = out.middle, ""
			else:
				out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)
			
			#post
			
			if out.middle.startswith("FOR ") or out.middle.startswith("- "): #7A, 1B, 3E
				out.middle = "" 

			if " FOR " in out.last:
				out.last = re.sub(" FOR .*", '', out.last)

			if len(out.last) == 0 and len(out.title) != 0: #9A
				if " " in out.first:
					out = HumanName(out.first)
				else:
					out.first, out.last = "", out.first

			# joint contributors ("A AND B"): keep only the first person
			if " AND " in out.middle or " & " in out.middle:
				out.last = re.split("( AND )|( & )", out.middle)[0]
				out.middle = ""
 			if "AND" in out.last or "&" in out.last:

				if out.last.startswith("AND ") or out.last.startswith("& "): #3F
					out.last = HumanName(out.last).last
				elif " AND " in out.last or " & " in out.last:
					out.last = re.sub("( AND ).*|( & ).*", '', out.last)
			out.first = re.split("( AND )|&|/|\+", out.first)[0]
			out.last = re.split("/", out.last)[0].strip()
			if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
				out.first = out.last.split(" ")[0]
				out.last = out.last.split(" ")[1]
			out.capitalize()
			first, last = out.first, out.last
			if len(out.middle) > 0:
				if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
					out.middle = ""
				elif first.endswith("-") or out.middle.startswith("-"):
					first += out.middle
				else:
					first += " %s" % out.middle #8A-B
			if len(out.suffix) > 0:
				last += " %s" % out.suffix #2A
			return (first, last)
		else:
			name = HumanName(w_name)
			return (name.first, name.last)
def get_author_info(ID_doi, ID_jstor):
    """
    Gets authors by doi or jstor id.
    Returns the names of all authors as a joined string as well as the overall author genders
    overall author gender will be male, female, mixed, unknown

    :param ID_doi: article DOI used as the lookup key (takes precedence if both ids given)
    :param ID_jstor: JSTOR id used as the lookup key when no DOI is given
    :return: (names joined by "; ", combined gender) — ('None', 'unknown') when no authors found
    :raises ValueError: if both ids are falsy, or if the per-author genders form an
                        unexpected combination

    >>> get_author_info('10.2307_1857439', None)
    ('Walter Goffart', 'male')

    """

    db = sqlite3.connect(str(Path('data', 'JSTOR_full_cleaned.db')))
    try:
        cur = db.cursor()

        # Parameterized queries ("?" placeholders) instead of f-string
        # interpolation: avoids SQL injection and breakage on ids
        # containing quote characters.
        if ID_doi:
            cur.execute(
                'SELECT name, surname, role FROM contributors WHERE ID_doi = ?',
                (ID_doi,)
            )
        elif ID_jstor:
            cur.execute(
                'SELECT name, surname, role FROM contributors WHERE ID_jstor = ?',
                (ID_jstor,)
            )
        else:
            raise ValueError(f"NO id for doi: {ID_doi}, jstor id: {ID_jstor}.")

        rows = cur.fetchall()
    finally:
        # Always release the connection, even on the error paths above.
        db.close()

    genders = set()
    names = []
    for first_name, last_name, role in rows:
        # Some surnames carry a trailing/leading comma in the DB; strip it
        # before building the "Last, First" sort form for HumanName.
        last_name = last_name.strip(',')

        human_name = HumanName(f'{last_name}, {first_name}')

        gen = get_hand_coded_gender(human_name)
        genders.add(gen)
        if gen == 'n/a':
            # Track names with no hand-coded gender in the module-level NAS list.
            NAS.append(f'{first_name} {last_name}')

        names.append(f'{first_name} {last_name}')

    # Collapse the per-author genders into one label. Any unknown/uncoded
    # author makes the whole article 'unknown'; a male+female mix is 'mixed'.
    if 'unknown' in genders:
        combined_gender = 'unknown'
    elif 'n/a' in genders:
        combined_gender = 'unknown'
    elif 'male' in genders and 'female' in genders:
        combined_gender = 'mixed'
    elif genders == {'male'}:
        combined_gender = 'male'
    elif genders == {'female'}:
        combined_gender = 'female'

    # if no authors, return None for author names and unknown for gender
    elif len(genders) == 0:
        return 'None', 'unknown'
    else:
        raise ValueError("How did you get here?", names, genders)

    combined_names = "; ".join(names)

    return combined_names, combined_gender