コード例 #1
0
def get_genderapi_gender(full_name):
    """Resolve the gender of a full name through the Gender-API web service.

    The name is cleaned with ``utils.clean_ne`` and lower-cased, sent to the
    ``split`` endpoint of gender-api.com, and the JSON reply is inserted into
    ``genderapi_cache_col`` with the queried name stored under the ``'q'`` key.

    Args:
        full_name: Full name to resolve.

    Returns:
        The ``gender`` value from the service reply, or ``'unknown'`` when the
        reply carries no gender or when any step (request, parsing, cache
        insert) raises.
    """
    try:
        full_name = utils.clean_ne(full_name).lower()
        url = "https://gender-api.com/get?split={}&key={}".format(
            quote(full_name), GENDERAPI_TOKEN)

        # The session serves a single request; the context manager closes it
        # so its pooled connections are released instead of leaked.
        with requests.Session() as session:
            response = session.get(url)
            cache_obj = response.json()

        gender = cache_obj["gender"]
        gender_logger.debug(
            'GenderAPI service call result for "{0}": "{1}"'.format(
                full_name, gender))

        # Handle unknowns: the cache stores 'unknown' rather than null.
        if gender is None:
            gender = 'unknown'
            cache_obj['gender'] = 'unknown'

        # Update cache. The 'q' attribute holds the queried full name and is
        # the key used to look the record up again.
        cache_obj['q'] = full_name
        genderapi_cache_col.insert_one(cache_obj)

    except Exception:
        # Best-effort lookup: any failure degrades to 'unknown'. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        # NOTE(review): a failing cache insert also discards a gender that the
        # service did return -- confirm this is intended.
        gender_logger.exception("message")
        gender = 'unknown'

    return gender
コード例 #2
0
def get_genderize_gender(full_name):
    """Resolve the gender of a name through the Genderize web service.

    Only the first name (as extracted by ``utils.extract_first_name`` from the
    cleaned, lower-cased full name) is sent to api.genderize.io. The decoded
    JSON reply is inserted into ``genderize_cache_col``.

    Args:
        full_name: Full name to resolve.

    Returns:
        The ``gender`` value from the service reply, or ``'unknown'`` when no
        first name can be extracted, the reply carries no gender, or any step
        (request, parsing, cache insert) raises.
    """
    full_name = utils.clean_ne(full_name).lower()
    first_name = utils.extract_first_name(full_name)

    # Nothing to ask the service about without a first name.
    if first_name is None:
        return "unknown"

    try:
        # The session serves a single request; the context manager closes it
        # so its pooled connections are released instead of leaked.
        with requests.Session() as session:
            gender_return = session.get("https://api.genderize.io/?",
                                        params={"name": first_name})
            cache_obj = json.loads(gender_return.text)

        gender = cache_obj['gender']
        gender_logger.debug(
            'Genderize service call result for "{0}" ("{1}"): "{2}"'.
            format(first_name, full_name, gender))

        # Handle unknowns: the cache stores 'unknown' rather than null.
        if gender is None:
            gender = 'unknown'
            cache_obj['gender'] = 'unknown'

        # Update Genderize cache
        genderize_cache_col.insert_one(cache_obj)
        return gender
    except Exception:
        # Best-effort lookup: any failure degrades to 'unknown'. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        gender_logger.exception("message")
        return 'unknown'
コード例 #3
0
def get_gender_from_service(full_name, ignore_list=None):
    """Ask the enabled gender services, in priority order, for a gender.

    Genderize is tried first (it needs an extractable first name), then
    GenderAPI on the full name. The first service that answers with anything
    other than ``'unknown'`` wins.

    Args:
        full_name: Name to resolve; it is cleaned with ``utils.clean_ne``.
        ignore_list: Optional collection of service names (``'Genderize'``,
            ``'GenderAPI_FullName'``) that must not be called. ``None`` (the
            default) means no service is ignored.

    Returns:
        A ``(gender, service_name)`` tuple. When no service produced a result
        the tuple is ``('unknown', 'Hardcode')``.
    """
    # A None default replaces the former mutable ``[]`` default, which would
    # have been shared across calls.
    if ignore_list is None:
        ignore_list = ()

    full_name = utils.clean_ne(full_name)
    first_name = utils.extract_first_name(full_name)

    svc_call_log_format = '"{0}" service result for "{1}" is: "{2}"'
    svc_ignore_log_format = 'Ignoring "{0}" service for "{1}"'

    # ---------- Genderize ----------
    service_name = 'Genderize'
    genderize_usable = (GENDERIZE_ENABLED and first_name is not None
                        and service_name not in ignore_list)
    if genderize_usable:
        gender = get_genderize_gender(full_name)
        gender_logger.debug(
            svc_call_log_format.format(service_name, full_name, gender))

        if gender != 'unknown':
            return gender, service_name
    else:
        gender_logger.debug(
            svc_ignore_log_format.format(service_name, full_name))

    # ---------- GenderAPI ----------
    service_name = 'GenderAPI_FullName'
    if GENDERAPI_ENABLED and service_name not in ignore_list:
        gender = get_genderapi_gender(full_name)
        gender_logger.info(
            svc_call_log_format.format(service_name, full_name, gender))

        if gender != 'unknown':
            return gender, service_name
    else:
        gender_logger.debug(
            svc_ignore_log_format.format(service_name, full_name))

    # No service was usable or none could identify the gender.
    return 'unknown', 'Hardcode'
コード例 #4
0
def merge_nes(doc_coref):
    """Merge PERSON named entities of a document with their coreference clusters.

    The work happens in three parts:

    A. Every usable PERSON entity is paired with the first coreference cluster
       that has a mention covering it (as decided by ``has_coverage`` on the
       character offsets), or with ``[]`` when there is none.
    B. Entities whose cleaned text (``utils.clean_ne``) is identical are merged
       into one record holding all their mentions and cluster ids.
    C. The result is passed through ``complex_merge``.

    Args:
        doc_coref: Parsed document exposing ``ents`` and
            ``_.coref_clusters`` (``None`` when no clusters were detected).

    Returns:
        The first element of the pair returned by ``complex_merge`` when given
        a dict that maps cleaned entity text to
        ``{'mentions': [...], 'cluster_id': [...]}``. A cluster id of ``-1``
        marks an entity without a cluster.
    """
    # It's highly recommended to clean nes before merging them. They usually
    # contain invalid characters.
    person_nes = [x for x in doc_coref.ents if x.label_ == 'PERSON']

    # ----- Part A: assign clusters to person named entities
    ne_dict = {}
    for ent in person_nes:
        # Sometimes we get noisy characters in name entities
        # TODO: Maybe it's better to check for other types of problems in NEs here too
        ent_cleaned = utils.clean_ne(str(ent))
        if (len(ent_cleaned) == 0) or utils.string_contains_digit(ent_cleaned):
            continue

        ent_set = set(range(ent.start_char, ent.end_char))
        clusters = doc_coref._.coref_clusters

        # [] stands for "no cluster": either the document has no clusters at
        # all or none of their mentions covers this entity.
        matched_cluster = []
        if clusters is not None:
            for cluster in clusters:
                covered = any(
                    has_coverage(
                        ent_set,
                        set(range(ment.start_char, ment.end_char)))
                    for ment in cluster.mentions)
                if covered:
                    # Only the first covering cluster is kept.
                    matched_cluster = cluster
                    break
        ne_dict[ent] = matched_cluster

    # ----- Part B: Merge clusters in ne_dict based on exact match of their
    # representative (PERSON named entities)
    merged_nes = {}
    for ne, cluster in ne_dict.items():
        ne_clean_text = utils.clean_ne(str(ne))

        if not cluster:
            cluster_id = [-1]
            mentions = []
        else:
            cluster_id = [cluster.i]
            mentions = cluster.mentions

        # Check if we already have a record with the same representative.
        existing = merged_nes.get(ne_clean_text)
        if existing is None:
            merged_nes[ne_clean_text] = {
                'mentions': [ne] + mentions,
                'cluster_id': cluster_id,
            }
        else:
            merged_nes[ne_clean_text] = {
                'mentions': existing['mentions'] + [ne] + mentions,
                'cluster_id': existing['cluster_id'] + cluster_id,
            }

    # ----- Part C: do a complex merge (the second returned value is unused)
    complex_merged_nes, _ = complex_merge(merged_nes)

    return complex_merged_nes
コード例 #5
0
def get_gender(name, disable_cache=False, disable_service=False):
    """Identify the gender for a name using hardcoded rules, caches and services.

    Resolution order: hardcoded trivial cases, then the caches
    (``get_gender_from_cache``), then the external services
    (``get_gender_from_service``). A ``'male'``/``'female'`` service result is
    also written to the first-name cache when a first name can be extracted.

    Args:
        name: Name to resolve; it is cleaned with ``utils.clean_ne``.
        disable_cache: When true, the cache lookup is skipped.
        disable_service: When true, no external service is called.

    Returns:
        A ``(gender, source_name, flag)`` tuple. ``flag`` is ``True`` for
        hardcoded and cache results and ``False`` for service results. When
        nothing resolved the name and services are disabled the tuple is
        ``('unknown', 'unknown', False)``.
    """
    name = utils.clean_ne(name)
    name_lower = name.lower()

    # ===== Checking for trivial cases =====
    # Check he/shes manually. This has to run before the word-count guard
    # below: both pronouns are single words, so the guard would otherwise
    # return 'unknown' first and these checks could never be reached.
    if name_lower == 'he':
        return 'male', 'Hardcode', True
    if name_lower == 'she':
        return 'female', 'Hardcode', True

    # More than 5 words is rejected to not resolve gender for long texts!
    # Length of 5 is chosen because Arabic names are commonly 5 words long.
    word_count = len(name.split(" "))
    if word_count <= 1 or word_count > 5:
        gender_logger.warning(
            'Skipping gender assignment due name length: "{0}"'.format(name))
        return 'unknown', 'Hardcode', True

    # Check editorial case
    if 'editorial' in name_lower:
        return 'editorial', 'Hardcode', True

    # ===== Checking cache for given name =====
    if not disable_cache:
        gender_logger.info('Checking cache for "{0}"'.format(name))
        gender, cache_name, ignore_list = get_gender_from_cache(name)

        # A Manual cache entry is authoritative even when it says 'unknown'.
        if cache_name == 'Manual' or gender != 'unknown':
            gender_logger.info(
                '"{0}" cache identified gender for "{1}:{2}"'.format(
                    cache_name, name, gender))
            return gender, cache_name, True
    else:
        ignore_list = []

    if disable_service:
        return 'unknown', 'unknown', False

    # ===== Checking services for given name =====
    gender_logger.info(
        'Cache missed gender for "{0}". Calling services to identify gender.'
        .format(name))
    gender, service_name = get_gender_from_service(name, ignore_list)
    gender_logger.info(
        'Services result for "{0}" is "{1}" with "{2}"'.format(
            name, gender, service_name))

    # Update FirstName cache if applicable (only using first name based
    # services).
    first_name = utils.extract_first_name(name_lower)
    if (gender in ('male', 'female') and first_name is not None
            and service_name != 'VIAF'):
        existing_item = firstname_cache_col.find_one({'name': first_name})
        if existing_item is None:
            firstname_cache_col.insert_one({
                'name': first_name,
                'gender': gender
            })
        elif existing_item['gender'] is None or existing_item[
                'gender'] == 'unknown':
            # update_one replaces the legacy Collection.update call; the
            # stored _id is used as-is so the filter matches the document
            # exactly as it was read.
            firstname_cache_col.update_one(
                {'_id': existing_item['_id']},
                {'$set': {
                    'gender': gender
                }})
            gender_logger.info(
                'Update Cache "{0}" as first name for "{1}" as "{2}"'.
                format(first_name, name, gender))
        elif existing_item['gender'] != gender:
            # The cached value is kept; the disagreement is only reported.
            gender_logger.warning(
                'Gender Mismatch for "{0}": FirstName Cache: "{1}"   {2}: "{3}"'
                .format(first_name, existing_item['gender'],
                        service_name, gender))

    # Return result
    if gender is None or gender == 'unknown':
        gender_logger.warning(
            'Unable to identify gender for "{0}"'.format(name))

    return gender, service_name, False
コード例 #6
0
_CACHE_LOG_FORMAT = '"{0}" cache result for First Name: "{1}" | Full Name: "{2}"\t: "{3}"'


def _lookup_gender_cache(collection, query, service_name, first_name,
                         full_name, ignore_list, none_as_unknown=False):
    """Check a single cache collection for a usable gender.

    A record whose gender is ``'unknown'`` is not a hit; instead its service
    name is appended to ``ignore_list`` (mutated in place).

    Args:
        collection: Cache collection queried with ``find_one``.
        query: Filter passed to ``find_one``.
        service_name: Cache name used in the log lines and the ignore list.
        first_name: First name, used for logging only.
        full_name: Full name, used for logging only.
        ignore_list: List that collects caches known to hold ``'unknown'``.
        none_as_unknown: When true, a hit whose stored gender is ``None`` is
            reported as ``'unknown'``.

    Returns:
        ``(True, gender)`` on a hit, ``(False, None)`` otherwise.
    """
    record = collection.find_one(query)

    if record is None:
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'Name Not Found'))
        return False, None

    if record['gender'] == 'unknown':
        ignore_list.append(service_name)
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'unknown'))
        return False, None

    gender = record['gender']
    if none_as_unknown and gender is None:
        gender = 'unknown'
    gender_logger.debug(
        _CACHE_LOG_FORMAT.format(service_name, first_name, full_name, gender))
    return True, gender


def get_gender_from_cache(full_name):
    """Look a name up in the gender caches, highest priority first.

    Order: Manual cache (full name), GenderAPI cache (full name), then, only
    when a first name can be extracted, the Genderize, GenderAPI (first name)
    and FirstName caches.

    Args:
        full_name: Name to look up; it is cleaned with ``utils.clean_ne`` and
            lower-cased.

    Returns:
        A ``(gender, cache_name, ignore_list)`` tuple. ``ignore_list`` names
        the caches that already hold ``'unknown'`` for this name (plus
        ``'Manual'`` on a Manual hit). When no cache has a usable value the
        tuple is ``('unknown', 'Hardcode', ignore_list)``.
    """
    full_name = utils.clean_ne(full_name).lower()
    first_name = utils.extract_first_name(full_name)
    ignore_list = []

    # ========== Checking Manual cache ==========
    # The manual cache has the highest priority; unlike the other caches an
    # 'unknown' entry is returned as a result.
    service_name = 'Manual'
    manual_record = manual_cache_col.find_one({'name': full_name})
    if manual_record is None:
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'Name Not Found'))
    elif manual_record['gender'] in ['female', 'male', 'unknown']:
        ignore_list.append(service_name)
        gender = manual_record['gender']
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     gender))
        return gender, service_name, ignore_list
    else:
        # The entry holds a value outside the accepted set; it is skipped.
        gender_logger.warning(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'unknown'))

    # ===== Checking GenderAPI cache on full name
    service_name = 'GenderAPI_FullName'
    hit, gender = _lookup_gender_cache(genderapi_cache_col, {'q': full_name},
                                       service_name, first_name, full_name,
                                       ignore_list)
    if hit:
        return gender, service_name, ignore_list

    # ========== Checking first name caches ==========
    if first_name is None:  # only check for valid first names
        gender_logger.warning(
            'Can not extract first name from "{0}". Skipping First Name Services.'
            .format(full_name))
        return 'unknown', 'Hardcode', ignore_list

    # (cache name, collection, report a stored None as 'unknown')
    # NOTE(review): only the Genderize cache maps None to 'unknown'; such a
    # record is still returned as a hit and 'Genderize' is not added to
    # ignore_list -- confirm this asymmetry is intended.
    first_name_caches = (
        ('Genderize', genderize_cache_col, True),
        ('GenderAPI_FirstName', genderapi_cache_col, False),
        ('FirstName', firstname_cache_col, False),
    )
    for service_name, collection, none_as_unknown in first_name_caches:
        hit, gender = _lookup_gender_cache(collection, {'name': first_name},
                                           service_name, first_name,
                                           full_name, ignore_list,
                                           none_as_unknown)
        if hit:
            return gender, service_name, ignore_list

    return 'unknown', 'Hardcode', ignore_list