def get_genderapi_gender(full_name):
    """Ask the GenderAPI web service for the gender of a full name.

    The name is cleaned with ``utils.clean_ne`` and lower-cased, then sent to
    ``gender-api.com`` through the ``split`` query parameter. The JSON reply
    is stored in ``genderapi_cache_col`` with the queried name under the
    ``'q'`` key, so later cache lookups can find it by full name.

    Args:
        full_name: The name to resolve.

    Returns:
        The ``"gender"`` value of the reply, or ``'unknown'`` when the service
        reports ``None`` or when any step (request, JSON decoding, missing
        ``"gender"`` key, cache insert) fails.
    """
    try:
        full_name = utils.clean_ne(full_name).lower()
        url = "https://gender-api.com/get?split={}&key={}".format(
            quote(full_name), GENDERAPI_TOKEN)

        # The context manager closes the session's connection pool even when
        # the request or the JSON decoding raises.
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # caller indefinitely - consider adding one.
        with requests.Session() as session:
            response = session.get(url)
            cache_obj = response.json()

        gender = cache_obj["gender"]
        gender_logger.debug(
            'GenderAPI service call result for "{0}": "{1}"'.format(
                full_name, gender))

        # Handle unknowns: store them as the string 'unknown' rather than None
        if gender is None:
            gender = 'unknown'
            cache_obj['gender'] = 'unknown'

        # Update cache. The 'q' attribute of cache_obj is the cache key.
        cache_obj['q'] = full_name
        genderapi_cache_col.insert_one(cache_obj)
    except Exception:
        # Best effort: a failed lookup must not break the caller. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        # The URL is deliberately not logged because it contains the API key.
        gender_logger.exception(
            'GenderAPI service call failed for "{0}"'.format(full_name))
        gender = 'unknown'
    return gender
def get_genderize_gender(full_name):
    """Ask the Genderize web service for the gender of a name's first name.

    The name is cleaned with ``utils.clean_ne`` and lower-cased, and its first
    name is extracted with ``utils.extract_first_name``. Only the first name
    is sent to ``api.genderize.io``. The decoded reply is stored in
    ``genderize_cache_col``.

    Args:
        full_name: The name to resolve.

    Returns:
        The ``'gender'`` value of the reply, or ``'unknown'`` when no first
        name can be extracted, when the service reports ``None``, or when the
        request, decoding or cache insert fails.
    """
    full_name = utils.clean_ne(full_name).lower()
    first_name = utils.extract_first_name(full_name)
    if first_name is None:
        return "unknown"

    try:
        gender_payload = {"name": first_name}
        # The context manager closes the session's connection pool even when
        # the request or the JSON decoding raises.
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # caller indefinitely - consider adding one.
        with requests.Session() as session:
            gender_return = session.get("https://api.genderize.io/?",
                                        params=gender_payload)
            cache_obj = json.loads(gender_return.text)

        gender = cache_obj['gender']
        gender_logger.debug(
            'Genderize service call result for "{0}" ("{1}"): "{2}"'.
            format(first_name, full_name, gender))

        # Handle unknowns: store them as the string 'unknown' rather than None
        if gender is None:
            gender = 'unknown'
            cache_obj['gender'] = 'unknown'

        # Update Genderize cache
        genderize_cache_col.insert_one(cache_obj)
        return gender
    except Exception:
        # Best effort: a failed lookup must not break the caller. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        gender_logger.exception(
            'Genderize service call failed for "{0}" ("{1}")'.format(
                first_name, full_name))
        return 'unknown'
def get_gender_from_service(full_name, ignore_list=None):
    """Resolve a gender by calling the enabled web services in priority order.

    Genderize is tried first. It needs an extractable first name, the
    ``GENDERIZE_ENABLED`` flag and must not be listed in ``ignore_list``.
    GenderAPI on the full name is tried second, under ``GENDERAPI_ENABLED``
    and the same ``ignore_list`` rule. The first service that returns
    something other than ``'unknown'`` wins.

    Args:
        full_name: The name to resolve; it is cleaned with ``utils.clean_ne``.
        ignore_list: Service names (``'Genderize'``, ``'GenderAPI_FullName'``)
            that must not be called. ``None`` (the default) means no service
            is ignored.

    Returns:
        A ``(gender, service_name)`` tuple. When no service yields a gender
        the result is ``('unknown', 'Hardcode')``.
    """
    # A fresh list per call avoids the shared-mutable-default pitfall and
    # lets callers pass None explicitly.
    if ignore_list is None:
        ignore_list = []

    full_name = utils.clean_ne(full_name)
    first_name = utils.extract_first_name(full_name)

    svc_call_log_format = '"{0}" service result for "{1}" is: "{2}"'
    svc_ignore_log_format = 'Ignoring "{0}" service for "{1}"'

    # ---------- Genderize ----------
    service_name = 'Genderize'
    if GENDERIZE_ENABLED and (first_name is not None) and (
            service_name not in ignore_list):
        gender = get_genderize_gender(full_name)
        gender_logger.debug(
            svc_call_log_format.format(service_name, full_name, gender))
        if gender != 'unknown':
            return gender, service_name
    else:
        gender_logger.debug(
            svc_ignore_log_format.format(service_name, full_name))

    # ---------- GenderAPI ----------
    service_name = 'GenderAPI_FullName'
    if GENDERAPI_ENABLED and service_name not in ignore_list:
        gender = get_genderapi_gender(full_name)
        gender_logger.info(
            svc_call_log_format.format(service_name, full_name, gender))
        if gender != 'unknown':
            return gender, service_name
    else:
        gender_logger.debug(
            svc_ignore_log_format.format(service_name, full_name))

    return 'unknown', 'Hardcode'
def merge_nes(doc_coref):
    """Group the PERSON entities of a coreference-annotated document.

    Part A pairs each usable PERSON entity with the first coreference cluster
    that has a mention ``has_coverage`` accepts for it. Part B merges entities
    whose cleaned text is identical, concatenating their mentions and cluster
    ids. Part C passes the result to ``complex_merge``.

    Args:
        doc_coref: Document exposing ``ents`` and ``_.coref_clusters``.

    Returns:
        The first element of the tuple returned by ``complex_merge``.
    """
    # entity -> matching cluster ([] when none) / cluster index (-1 when none)
    ne_dict = {}
    ne_clust = {}

    # It's highly recommended to clean NEs before merging them; they usually
    # contain invalid characters.
    person_ents = [ent for ent in doc_coref.ents if ent.label_ == 'PERSON']

    # ----- Part A: assign clusters to person named entities
    for ent in person_ents:
        # Sometimes we get noisy characters in named entities
        # TODO: Maybe it's better to check for other types of problems in NEs here too
        cleaned = utils.clean_ne(str(ent))
        if len(cleaned) == 0 or utils.string_contains_digit(cleaned):
            continue

        ent_chars = set(range(ent.start_char, ent.end_char))
        matched = None

        # coref_clusters is None when no coreference cluster was detected
        clusters = doc_coref._.coref_clusters
        if clusters is not None:
            for candidate in clusters:
                if any(
                        has_coverage(
                            ent_chars,
                            set(range(mention.start_char, mention.end_char)))
                        for mention in candidate.mentions):
                    matched = candidate
                    break

        if matched is None:
            ne_dict[ent] = []
            ne_clust[ent] = -1
        else:
            ne_dict[ent] = matched
            ne_clust[ent] = matched.i

    # ----- Part B: merge entries whose representative (the PERSON named
    # entity) has exactly the same cleaned text
    merged_nes = {}
    for ne, cluster in ne_dict.items():
        key = utils.clean_ne(str(ne))
        if cluster:
            cluster_ids = [cluster.i]
            mentions = cluster.mentions
        else:
            cluster_ids = [-1]
            mentions = []

        if key in merged_nes:
            # Same representative seen before: extend its mentions and ids
            previous = merged_nes[key]
            merged_nes[key] = {
                'mentions': previous['mentions'] + [ne] + mentions,
                'cluster_id': previous['cluster_id'] + cluster_ids,
            }
        else:
            merged_nes[key] = {
                'mentions': [ne] + mentions,
                'cluster_id': cluster_ids,
            }

    # ----- Part C: do a complex merge (its second return value is not used)
    complex_merged, _ = complex_merge(merged_nes)
    return complex_merged
def get_gender(name, disable_cache=False, disable_service=False):
    """Resolve the gender of a name via hard-coded rules, caches and services.

    Resolution order:
      1. Hard-coded rules: the pronouns ``he``/``she``, names of one word or
         more than five words, and names containing ``editorial``.
      2. ``get_gender_from_cache`` (skipped when ``disable_cache`` is true).
      3. ``get_gender_from_service`` (skipped when ``disable_service`` is
         true). A ``male``/``female`` result is also written to
         ``firstname_cache_col`` for the extracted first name.

    Args:
        name: The name to resolve; it is cleaned with ``utils.clean_ne``.
        disable_cache: When true, skip the cache lookup.
        disable_service: When true, skip the web-service calls.

    Returns:
        A ``(gender, source, flag)`` tuple. ``flag`` is ``True`` when the
        result comes from a hard-coded rule or a cache, and ``False`` when it
        comes from a service or when services are disabled, in which case the
        result is ``('unknown', 'unknown', False)``.
    """
    name = utils.clean_ne(name)
    lowered = name.lower()

    # ===== Checking for trivial cases =====
    # Check he/she manually. This must run before the word-count guard below:
    # a pronoun is a single word, so the guard would otherwise return
    # 'unknown' first and leave these checks unreachable.
    if lowered == 'he':
        return 'male', 'Hardcode', True
    if lowered == 'she':
        return 'female', 'Hardcode', True

    # More than 5 words is rejected to avoid resolving gender for long texts.
    # A length of 5 is chosen because Arabic names are commonly 5 words long.
    word_count = len(name.split(" "))
    if word_count <= 1 or word_count > 5:
        gender_logger.warning(
            'Skipping gender assignment due name length: "{0}"'.format(name))
        return 'unknown', 'Hardcode', True

    # Check editorial case
    if 'editorial' in lowered:
        return 'editorial', 'Hardcode', True

    # Checking cache for given name
    ignore_list = []
    if not disable_cache:
        gender_logger.info('Checking cache for "{0}"'.format(name))
        gender, cache_name, ignore_list = get_gender_from_cache(name)
        # A Manual cache entry is final even when it says 'unknown'
        if cache_name == 'Manual' or gender != 'unknown':
            gender_logger.info(
                '"{0}" cache identified gender for "{1}:{2}"'.format(
                    cache_name, name, gender))
            return gender, cache_name, True

    if disable_service:
        return 'unknown', 'unknown', False

    # Checking Service for given name
    gender_logger.info(
        'Cache missed gender for "{0}". Calling services to identify gender.'
        .format(name))
    gender, service_name = get_gender_from_service(name, ignore_list)
    gender_logger.info(
        'Services result for "{0}" is "{1}" with "{2}"'.format(
            name, gender, service_name))

    # Update FirstName cache if applicable (only using first name based services)
    first_name = utils.extract_first_name(lowered)
    if gender in [
            'male', 'female'
    ] and first_name is not None and service_name not in ['VIAF']:
        existing_item = firstname_cache_col.find_one({'name': first_name})
        if existing_item is None:
            firstname_cache_col.insert_one({
                'name': first_name,
                'gender': gender
            })
        else:
            curr_name_id = existing_item['_id']
            if existing_item['gender'] is None or existing_item[
                    'gender'] == 'unknown':
                # update_one replaces the legacy Collection.update(), which
                # no longer exists in PyMongo 4.
                firstname_cache_col.update_one(
                    {'_id': ObjectId(curr_name_id)},
                    {'$set': {
                        'gender': gender
                    }})
                gender_logger.info(
                    'Update Cache "{0}" as first name for "{1}" as "{2}"'.
                    format(first_name, name, gender))
            elif existing_item['gender'] != gender:
                # The cached value is kept; the disagreement is only reported
                gender_logger.warning(
                    'Gender Mismatch for "{0}": FirstName Cache: "{1}" {2}: "{3}"'
                    .format(first_name, existing_item['gender'],
                            service_name, gender))

    # Return result
    if gender is None or gender == 'unknown':
        gender_logger.warning(
            'Unable to identify gender for "{0}"'.format(name))
    return gender, service_name, False
_CACHE_LOG_FORMAT = '"{0}" cache result for First Name: "{1}" | Full Name: "{2}"\t: "{3}"'


def _lookup_cached_gender(collection, query, service_name, first_name,
                          full_name, ignore_list):
    """Look up one cache collection and return its usable gender, if any.

    Args:
        collection: Cache collection to query with ``find_one``.
        query: Filter document passed to ``find_one``.
        service_name: Label used in log lines and in ``ignore_list``.
        first_name: First name, used for logging only.
        full_name: Full name, used for logging only.
        ignore_list: List that receives ``service_name`` when the cache holds
            an unknown result for this name (mutated in place).

    Returns:
        The cached gender, or ``None`` when there is no entry or the entry is
        unknown. A cached ``None`` (or missing ``'gender'`` key) counts as
        ``'unknown'``.
    """
    cached = collection.find_one(query)
    if cached is None:
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'Name Not Found'))
        return None

    gender = cached.get('gender')
    if gender is None or gender == 'unknown':
        # The source already failed for this name: tell the caller not to
        # query it again.
        ignore_list.append(service_name)
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'unknown'))
        return None

    gender_logger.debug(
        _CACHE_LOG_FORMAT.format(service_name, first_name, full_name, gender))
    return gender


def get_gender_from_cache(full_name):
    """Resolve a gender from the cache collections, in priority order.

    Order: Manual (full name), GenderAPI (full name), then - only when a first
    name can be extracted - Genderize, GenderAPI (first name) and FirstName.

    Args:
        full_name: The name to resolve; it is cleaned with ``utils.clean_ne``
            and lower-cased before any lookup.

    Returns:
        A ``(gender, cache_name, ignore_list)`` tuple. ``ignore_list`` names
        the sources whose cache holds an unknown result for this name. A
        Manual entry of ``female``/``male``/``unknown`` is returned as is.
        When no cache has a usable gender the result is
        ``('unknown', 'Hardcode', ignore_list)``.
    """
    full_name = utils.clean_ne(full_name).lower()
    first_name = utils.extract_first_name(full_name)
    ignore_list = []

    # ========== Checking Manual cache ==========
    # Check the manual cache with highest priority
    service_name = 'Manual'
    manual_entry = manual_cache_col.find_one({'name': full_name})
    if manual_entry is None:
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'Name Not Found'))
    elif manual_entry.get('gender') in ('female', 'male', 'unknown'):
        ignore_list.append(service_name)
        gender = manual_entry['gender']
        gender_logger.debug(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     gender))
        return gender, service_name, ignore_list
    else:
        # Entry exists but holds an unexpected value: report it and move on
        gender_logger.warning(
            _CACHE_LOG_FORMAT.format(service_name, first_name, full_name,
                                     'unknown'))

    # ===== Checking GenderAPI cache on full name
    service_name = 'GenderAPI_FullName'
    gender = _lookup_cached_gender(genderapi_cache_col, {'q': full_name},
                                   service_name, first_name, full_name,
                                   ignore_list)
    if gender is not None:
        return gender, service_name, ignore_list

    # ========== Checking first name caches ==========
    if first_name is None:  # only check for valid first names
        gender_logger.warning(
            'Can not extract first name from "{0}". Skipping First Name Services.'
            .format(full_name))
        return 'unknown', 'Hardcode', ignore_list

    first_name_caches = (
        ('Genderize', genderize_cache_col),
        ('GenderAPI_FirstName', genderapi_cache_col),
        ('FirstName', firstname_cache_col),
    )
    for service_name, collection in first_name_caches:
        gender = _lookup_cached_gender(collection, {'name': first_name},
                                       service_name, first_name, full_name,
                                       ignore_list)
        if gender is not None:
            return gender, service_name, ignore_list

    return 'unknown', 'Hardcode', ignore_list