def get_gender(self, n):
    """Return a courtesy title guessed from a first name.

    The value is converted with ``str(n).capitalize()`` before the lookup.

    :param n: first name; any object, it is passed through ``str``.
    :return: ``'madame'`` when the detector answers ``'female'`` or
        ``'mostly_female'``, ``'Mister'`` for every other answer.
    """
    name_ = str(n).capitalize()
    # Build the detector and query it once; the original built two
    # detectors and looked the same name up twice.
    guess = gender.Detector().get_gender(name_)
    if guess in ('female', 'mostly_female'):
        return 'madame'
    return 'Mister'
def gender_breakdown(names, graph=True, text=True, plot_options=None):
    """Summarise the guessed gender of a collection of customer names.

    Each name goes through ``get_first_name``, the detector's
    ``get_gender`` and ``compress_gender``; the resulting labels are
    counted.

    :param names: iterable of customer names.
    :param graph: when true, draw a bar chart of the three categories
        ``'Female'``, ``'Male'`` and ``'Andro/Unknown'`` (presumably the
        labels produced by ``compress_gender`` -- confirm).
    :param text: when true, print the counts and percentages.
    :param plot_options: optional dict of extra keyword arguments
        forwarded to ``plt.bar``.
    """
    if plot_options is None:
        plot_options = {}

    d = gender_detector.Detector()
    first_names = map(get_first_name, names)
    genders = map(compress_gender, map(d.get_gender, first_names))
    gender_dict = Counter(genders)
    total_names = sum(gender_dict.values())

    if text:
        print("Out of {} total customers:".format(total_names))
        for key, count in gender_dict.items():
            print("\t{} ({:.2f}%) cusomters were {}".format(
                count, 100 * count / total_names, key))

    if graph:
        # Always plot the same three bars. Sizing the x positions from
        # len(gender_dict) breaks when a category is absent, because the
        # heights list has three entries regardless.
        categories = ['Female', 'Male', 'Andro/Unknown']
        positions = range(len(categories))
        plt.style.use('ggplot')
        plt.figure(dpi=200)
        plt.bar(positions,
                [gender_dict[category] for category in categories],
                align='center',
                **plot_options)
        plt.xticks(positions, categories)
        plt.title("Gender Breakdown of Customers",
                  fontsize=24,
                  fontname='Pier Sans')
        plt.xlabel("Gender")
def predict_sex(name):
    """Encode the guessed sex of each name as an integer.

    :param name: pandas Series of names (the ``.str`` accessor is used);
        the text before the first space is treated as the first name.
    :return: Series of ints: ``-2`` female, ``-1`` mostly female,
        ``0`` unknown or androgynous, ``1`` mostly male, ``2`` male.
    """
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = name.str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        # The detector also answers 'andy' for androgynous names; without
        # this entry the value maps to NaN and astype(int) raises.
        'andy': 0,
        'mostly_male': 1,
        'male': 2,
    }
    sex_code = sex.map(sex_dict).astype(int)
    return sex_code
def do_classify_in_threads(self) -> dict:
    """
    Fetch every URL in ``URLS_ES`` through ``self.connect`` using a pool
    of six threads, scrape the speakers of each page with
    ``self.scrapper_es`` and encode every speaker by gender.

    The result has the form {'2016': [1, 0, 0, 1], '2018': [0, 1, 0]},
    where 1 stands for a name the detector classifies as 'female' and 0
    for any other answer. A URL whose download or scraping raises is
    reported on stdout and left out of the result.

    :return: dict mapping the year (as a string) to a list of 0/1 flags.
    """
    speakers_in_year_es = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
        future_to_url = {}
        for url in URLS_ES:
            future_to_url[executor.submit(self.connect, url)] = url

        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                page = result[0]
                year = result[1]
                speakers_in_year_es[str(year)] = self.scrapper_es(page, year)
            except Exception as exc:
                print(f"{url} generated the following exception: \n{exc}")

    # At this point the values are lists of speaker names; each entry is
    # replaced in place by 1 ('female') or 0 (anything else), judged on
    # the first word of the name.
    d = gender.Detector()
    for speakers in speakers_in_year_es.values():
        for i, name in enumerate(speakers):
            first_name = name.split()[0]
            speakers[i] = 1 if d.get_gender(first_name) == 'female' else 0
    return speakers_in_year_es
def guess_sex(s: pd.Series):
    """Guess the sex of each student from their first name(s).

    Only the part of each name before the first space or hyphen is used,
    and it is converted to title case before the lookup. The detector's
    "mostly_" answers are folded into their plain counterparts, and
    "andy" (androgynous) is reported as "unknown".

    Arguments
    s -- a Series of strings holding the first name(s)

    Returns a Series with the values "male", "female" or "unknown".
    """
    gender_detector = gender.Detector()
    sex_mapping = {
        'male': 'male',
        'mostly_male': 'male',
        'female': 'female',
        'mostly_female': 'female',
        'andy': 'unknown',
        'unknown': 'unknown',
    }

    def leading_word(full_name):
        # Compound names cannot be detected, so keep the first part only.
        return re.split(' |-', full_name)[0].title()

    first_word = s.apply(leading_word)
    detected = first_word.apply(gender_detector.get_gender)
    return detected.map(sex_mapping)
def create_new_actor(self):
    """Create a ``Person`` for ``self.name`` and attach an actor to it.

    The actor is chosen as follows:

    * exactly one ``Actor`` whose name starts with the first name
      (case-insensitively): that actor;
    * several such actors: a random one among them;
    * none: a random actor of the gender guessed from the first name, or
      any random actor when the guess is neither male nor female.

    :return: the newly created ``Person``.
    """
    # We need just the first name to match against our actors
    first_name = self.get_first_name()

    # e.g. John would get ['John Wayne', 'Johnny Depp', 'John Goodman'];
    # istartswith is a case-insensitive startswith.
    name_matches = Actor.objects.filter(name__istartswith=first_name)
    count = len(name_matches)

    if count == 1:
        actor = name_matches.first()
    elif count > 1:
        # More than one candidate: pick a random one *from those*.
        actor = name_matches[randint(0, count - 1)]
    else:
        guessed_gender = gender_detector.Detector().get_gender(first_name)
        # Map every decisive answer explicitly. Previously anything other
        # than 'male' or 'unknown' (including 'mostly_male' and 'andy')
        # was treated as female.
        gender_codes = {
            'male': 'M',
            'mostly_male': 'M',
            'female': 'F',
            'mostly_female': 'F',
        }
        gender = gender_codes.get(guessed_gender)
        if gender is None:
            # No usable guess, just use any actor.
            actor = self.get_random_actor()
        else:
            actor = self.get_random_actor(gender)

    return Person.objects.create(name=self.name, actor=actor)
def __init__(self, connection):
    """Set up the Twitter client, the gender detector and the name lists.

    Champion and item names are fetched from the Riot static-data
    endpoints for region ``na1``, encoded to ASCII (unencodable
    characters are dropped), printed, and stored on the instance.

    Credentials are read from the environment variables
    ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_ACCESS_TOKEN``, ``TWITTER_ACCESS_TOKEN_SECRET`` and
    ``RIOT_API_KEY`` when they are set.

    :param connection: stored unchanged as ``self.connection``.
    """
    import os  # local import so the block does not depend on file-level imports

    # SECURITY: the literal fallbacks below are secrets committed to
    # source control. They are kept only so existing deployments keep
    # working; rotate them and supply the values via the environment.
    auth = tweepy.OAuthHandler(
        os.environ.get('TWITTER_CONSUMER_KEY', '2XAQTR3zNwrBtD2i6AGxZDeP6'),
        os.environ.get(
            'TWITTER_CONSUMER_SECRET',
            '7e9xsaOSvyyI0nY5ZDN1cYx0phndLtclhdxukaC1rSjztsH9Q2'))
    auth.set_access_token(
        os.environ.get(
            'TWITTER_ACCESS_TOKEN',
            '3333417351-X483ie45LpKqy6vw3LEUa84LN1bG6aMuZXRUlF7'),
        os.environ.get(
            'TWITTER_ACCESS_TOKEN_SECRET',
            'FQnPzJbLjvODZXiph48eb5bK2UJRHhjdhBw5thKDFvZer'))
    self.api = tweepy.API(auth)
    self.gen = gender.Detector()

    my_region = 'na1'
    watcher = RiotWatcher(
        os.environ.get('RIOT_API_KEY',
                       'RGAPI-11ab2328-9ef7-47f8-a4c5-e4afcf0baab0'))
    champion_list = watcher.static_data.champions(my_region)['data']
    item_list = watcher.static_data.items(my_region)['data']

    item_names = [
        entry['name'].encode('ascii', 'ignore')
        for entry in item_list.values()
    ]
    champion_names = [
        entry['name'].encode('ascii', 'ignore')
        for entry in champion_list.values()
    ]

    self.item_name = item_names
    self.champions_name = champion_names
    print(item_names)
    print(champion_names)

    # coordinates
    self.connection = connection
def name_to_gender(name, api_key=None, name_dict={}):
    """
    Guess the gender of a first name.

    This function uses the gender-guesser pip package
    (https://pypi.org/project/gender-guesser/) and optionally the gender
    API (https://gender-api.com/) or a dictionary to guess the gender of
    the given name.

    Note that a major limitation of this code and the original paper is
    that this is a guess at the gender of a person based only on a first
    name, and it does not reflect the chosen gender of the author.
    Further, there are greater limitations if the name is not a
    traditionally western name. Last, the gender API has limitations in
    that it doesn't include all types of experienced genders.

    Inputs
    ------
    name : string
        The first name whose gender you want to guess.
    api_key : string
        The API key for the gender API. You can sign up for a free
        account and get an API key on the gender-api.com website.
        Optional.
    name_dict : dict
        Dictionary containing gender guesses and accuracy, the dictionary
        is used if the gender-guesser package returns 'unknown'. It is
        updated (and written to NAME_DICT_PATH) if a gender-api request
        is made. The default dict is shared between calls.

    Outputs
    -------
    gender : string
        The guessed gender of the name.
    accuracy : int or None
        The accuracy of the gender guess, in percent. It is 0 for names
        shorter than two characters and None when the answer comes from
        gender-guesser, which reports no accuracy.
    """
    # use the _gender_detector as a global variable to avoid re-generating
    # it each time
    global _gender_detector

    # If the name is just an initial, return unknown
    if len(name) < 2:
        return "unknown", 0

    # create gender-guesser detector if it doesn't already exist
    if "_gender_detector" not in globals():
        # NOTE(review): the module is referenced as `gender_detecor`; the
        # spelling is kept because the import is not visible here --
        # confirm it matches the file's import alias.
        _gender_detector = gender_detecor.Detector(case_sensitive=False)

    gender = _gender_detector.get_gender(name)
    accuracy = None

    if gender == "unknown":
        if name in name_dict:
            gender = name_dict[name]["gender"]
            accuracy = name_dict[name]["accuracy"]
        elif api_key:
            # Let requests build the query string so that names with
            # spaces or reserved characters are encoded correctly.
            response = requests.get(
                "https://gender-api.com/get",
                params={"key": api_key, "name": name},
            ).json()
            gender = response["gender"]
            accuracy = response["accuracy"]
            name_dict[name] = {"gender": gender, "accuracy": accuracy}
            # save the updated names_dict
            with open(NAME_DICT_PATH, 'w') as name_dict_file:
                json.dump(name_dict, name_dict_file, indent=2)

    # if still unknown and there is a dash in the name, try on the first
    # part of the name; forward the caller's dictionary (the builtin
    # `dict` type was passed here before, which fails on lookup).
    if gender == "unknown" and "-" in name:
        return name_to_gender(name.split("-")[0], api_key, name_dict)

    return gender, accuracy
class Genderize():
    """Namespace around a single shared, case-insensitive detector."""

    # Built once when the class body runs and reused by every lookup.
    detector = gender.Detector(case_sensitive=False)

    @staticmethod
    def guess_from_name(firstname):
        """Return the detector's answer for ``firstname``.

        Declared static because it takes no ``self``; this keeps
        ``Genderize.guess_from_name(x)`` working and also allows calls on
        an instance.
        """
        return Genderize.detector.get_gender(firstname)
def split_data_by_gender(_file):
    """Split the rows of an Excel sheet by the guessed gender of the customer.

    The gender is guessed from the text before the first space of the
    ``CustomerName`` column. Rows answered ``'male'`` and ``'female'`` are
    returned; every other answer is only counted in the printed summary.

    :param _file: path or file object accepted by ``pd.read_excel``.
    :return: tuple ``(male_data, female_data)`` of DataFrames that keep
        the original columns and index labels.
    """
    data = pd.read_excel(_file)
    d = gender.Detector()
    print('all index: ', len(data))

    # Classify every row once, then select with boolean masks instead of
    # appending row by row (DataFrame.append no longer exists in
    # pandas 2.x and made the loop quadratic).
    genders = data['CustomerName'].map(
        lambda full_name: d.get_gender(full_name.split(' ', 1)[0]))
    is_male = genders == 'male'
    is_female = genders == 'female'

    male_data = data[is_male]
    female_data = data[is_female]
    other_data = data[~(is_male | is_female)]

    print('male_data.shape: ', male_data.shape)
    print('female_data.shape: ', female_data.shape)
    print('other_data.shape: ', other_data.shape)
    return male_data, female_data
def __init__(self):
    """Open the log, connect to the database and build the detectors.

    Side effects: appends a start message to the log file, opens a
    psycopg2 connection to ``dbname=eureka01``, registers a JSON adapter
    for ``dict`` and runs the SELECT that loads ``detailed_info``.
    """
    # Settings
    self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
    self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'

    # Write a log
    log_handle = open(self.data_log_file_name, 'a')
    self.log = log_handle
    log_handle.write("We have started analyzing data!" + "\n")
    log_handle.flush()

    # Connect to the database
    connection = psycopg2.connect("dbname=eureka01")
    self.conn = connection
    self.cur = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
    psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

    # Get detailed_info from workers in our database
    self.cur.execute(
        "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
    )

    # Initialize arrays for Causal Analysis
    self.user_count = 1

    # Initialize gender detectors
    self.d = gender.Detector()
    self.gc = GenderComputer('./nameLists')
    # One detector per country code, stored as <code>_detector.
    for country in ('us', 'ar', 'uk', 'uy'):
        setattr(self, country + '_detector', GenderDetector(country))
    self.gender_guesser = gender_guesser.Detector()
def create_auto_reply(self, original):
    """Build the automatic reply to ``original`` as a MIME message.

    Side effects: appends ``str(original)`` to ``cv.txt``, runs
    ``python3 categorize.py`` through the shell, reads
    ``coreCategory.txt`` and finally removes both files with ``rm -rf``.

    :param original: the received message; its ``Message-ID``,
        ``Subject``, ``Reply-To`` and ``From`` headers are read.
    :return: a ``MIMEMultipart('alternative')`` message with one HTML part.
    """
    mail = MIMEMultipart('alternative')
    mail['Message-ID'] = make_msgid()
    original_id = original['Message-ID']
    mail['References'] = original_id
    mail['In-Reply-To'] = original_id
    mail['Subject'] = 'Re: ' + str(original['Subject'])
    mail['From'] = self.from_address
    mail['To'] = original['Reply-To'] or original['From']

    # Dump the received message so categorize.py can read it.
    with open('cv.txt', 'a') as the_file:
        the_file.write(str(original))

    # Keep only the words of self.from_address that are not an e-mail
    # address; what remains is treated as the full name.
    fromName = self.from_address
    name_words = [item for item in fromName.split() if '@' not in item]
    fullName = ' '.join(name_words)
    print(fullName)

    # name splitting
    name = HumanName(fullName)
    salutation = "".join([
        name.last, ",", name.middle, " ", name.suffix, " ", name.nickname,
        ",", name.title, " ", name.first, ":\n"
    ])

    # gender detection
    typeGender = gender.Detector().get_gender(name.first)
    if typeGender in ("male", "mostly_male"):
        starts = "Mr. "
    elif typeGender in ("female", "mostly_female"):
        starts = "Miss. "
    else:
        starts = "Dear/Honorable "

    coreBody = starts + " " + salutation + self.body_html

    # Categorize the body of the e-mail via the external script.
    os.system('python3 categorize.py')

    # Each line of the result file (newline included) is followed by "/".
    with open('coreCategory.txt') as f:
        allCategory = "".join(line + "/" for line in f)
    print(allCategory)

    alphaBody = coreBody + "\n" + "#EmailClassifies as:/" + allCategory

    clearCommand1 = 'rm -rf cv.txt coreCategory.txt'
    os.system(clearCommand1)

    mail.attach(MIMEText(alphaBody, 'html'))
    return mail
def processNamesColumn(column):
    """Encode a column of names as gender codes.

    Every entry is converted to ``str``, non-letters are replaced by
    spaces and a space is inserted before each capital letter; the
    resulting words are then checked in order. A 'male' or 'female' answer
    settles the row at once; a 'mostly_*' answer sets the code but lets
    later words override it.

    :param column: indexable collection of names (``column[i]`` for
        ``i`` in ``range(len(column))``).
    :return: int array of shape ``(len(column), 1)``: 0 no match,
        1 male / mostly male, 2 female / mostly female.
    """
    new_columns = np.zeros((len(column), 1), dtype=int)
    gender_detector = gender.Detector()
    non_letters = re.compile('[^a-zA-Z]')
    # prediction -> (code, whether the answer ends the search for the row)
    outcomes = {
        'mostly_female': (2, False),
        'mostly_male': (1, False),
        'female': (2, True),
        'male': (1, True),
    }

    for i in range(len(column)):
        a_name = non_letters.sub(' ', str(column[i]))
        a_name = re.sub(r'([A-Z])', r' \1', a_name)
        for word in a_name.split():
            outcome = outcomes.get(gender_detector.get_gender(word))
            if outcome is None:
                continue
            code, decisive = outcome
            new_columns[i, 0] = code
            if decisive:
                break
    return new_columns
def get_percentage_of_female_speakers(first_names):
    """Return the share of female speakers as a percentage.

    Names answered 'female' or 'mostly_female' by gender_guesser count as
    female; the result is rounded to 2 decimal places. Raises
    ZeroDivisionError when no names are given.
    """
    detector = gender.Detector()
    tally = Counter(map(detector.get_gender, first_names))
    female_total = tally['female'] + tally['mostly_female']
    overall = sum(tally.values())
    return round(female_total / overall * 100, 2)
def create_minister(
        dump_dir="/home/flex_lev/Dev/Perso/canadian_ministry/dump",
        output_path="ministers.csv"):
    """Merge the per-ministry dump files into one pipe-separated CSV.

    Every file in ``dump_dir`` whose name does not contain "Parliament"
    is read as a ``|``-separated CSV and enriched with ``start``/``end``
    (from ``find_date`` applied to the ``date`` column),
    ``minister_number`` (the first run of digits in the file name) and
    ``sex`` (detector answer for the first word of ``name``). The merged
    rows are sorted by ``sex`` and written with ``sex`` as last column.

    :param dump_dir: directory holding the dump files; defaults to the
        location that used to be hard-coded.
    :param output_path: destination CSV; defaults to ``ministers.csv``.
    :raises IndexError: if a processed file name contains no digits.
    """
    d = gender.Detector()
    df_p = pd.DataFrame()

    for file in os.listdir(dump_dir):
        if "Parliament" in file:
            continue
        print(file)
        df = pd.read_csv(os.path.join(dump_dir, file), sep="|").rename(
            columns={'Unnamed: 0': 'index_rows'})
        index_number = re.findall(r'\d+', file)[0]
        df["start"] = df["date"].apply(lambda x: find_date(x, 0))
        df["end"] = df["date"].apply(lambda x: find_date(x, 1))
        df["minister_number"] = int(index_number)
        df["sex"] = df["name"].apply(lambda x: d.get_gender(x.split(" ")[0]))

        if df_p.shape[0] == 0:
            df_p = df
        else:
            df_p = pd.concat([df_p, df], ignore_index=True)
        print(df_p.shape)

    # Move "sex" to the end: every column whose name does not contain
    # "sex" first, then "sex" itself.
    ordered_columns = [col for col in df_p.columns if "sex" not in col]
    ordered_columns.append("sex")
    df_p[ordered_columns].sort_values(by=["sex"]).to_csv(
        output_path, sep="|", index=False)
def annotate(self, file_name):
    """Tag entity mentions in one text file and record extracted facts.

    For every entity of ``self.entities`` accepted by ``check_if_valid``,
    the results of the ``extract_*`` helpers are appended to
    ``self.knowledge`` and each mention (full name, last word of the
    name, or one of the entity's pronouns surrounded by spaces) is
    wrapped in ``<entity name="...">`` tags. The annotated text is
    written to ``./TD/Annotations/<file_name with .txt -> _ant.txt>``.

    :param file_name: name of the file inside ``self.path``.
    """
    # Read the input first so that a missing or unreadable source does
    # not leave an empty annotation file behind.
    with open(self.path + '/' + file_name) as fr:
        text = fr.read()

    for item in self.entities:
        if not check_if_valid(text, item):
            continue

        self.knowledge.append(self.extract_birth_day(text, item))
        self.knowledge.append(self.extract_type(text, item))
        self.knowledge.append(self.extract_pattern(text, item))
        self.knowledge.append(self.extract_marriage(text, item))
        self.knowledge.append(
            self.extract_pattern(text, item, pattern="appeared in"))

        last_word = item.split(' ')[-1].strip()
        pronouns = genders_person[item].split()
        pros = pronouns + [x.title() for x in pronouns]
        # Names and pronouns are literal text: escape them so characters
        # such as "." or "(" are not read as regex syntax.
        regex = ("(" + re.escape(item) + "|" + re.escape(last_word) + "| " +
                 " |".join(re.escape(pro) for pro in pros) + " )")
        text = re.sub(
            regex,
            r'<entity name="' + self.entities[item] + '">\\1</entity>',
            text)

    with open('./TD/Annotations/' + file_name.replace('.txt', '_ant.txt'),
              'w') as fw:
        fw.write(text)
def get_author_gender_guesser(author):
    """
    Guess an author's gender with the gender guesser module.

    The first word of ``format_author(author)`` is looked up. 'male' and
    'mostly_male' give 'male', 'female' and 'mostly_female' give
    'female'; any other answer ('andy', 'unknown') gives None.

    >>> from gender_novels.corpus_gen import get_author_gender_guesser
    >>> get_author_gender_guesser("Cuthbert, Michael")
    'male'
    >>> get_author_gender_guesser("Li, Michelle")
    'female'
    >>> get_author_gender_guesser("Duan, Mingfei") # should return None

    :param author: str
    :return: str or None
    """
    simplified = {
        'male': 'male',
        'mostly_male': 'male',
        'female': 'female',
        'mostly_female': 'female',
    }
    first_name = format_author(author).split()[0]
    gender_guess = gender_guesser.Detector().get_gender(first_name)
    # Answers missing from the table fall through to None.
    return simplified.get(gender_guess)
def get_male_female_words_count(nlp_doc):
    """Count male, female and neutral references in a parsed document.

    A token counts when its text or lemma appears in ``male_words``,
    ``female_words`` or ``neutral_words`` (checked in that order). Each
    name returned by ``get_named_persons`` is added according to the
    detector's answer: 'male'/'mostly_male', 'female'/'mostly_female',
    or 'andy'/'unknown' for neutral.

    :param nlp_doc: iterable of tokens exposing ``text`` and ``lemma_``.
    :return: dict with the keys ``male_count``, ``female_count`` and
        ``neutral_count``.
    """
    gender_detector = gender.Detector()
    male_count = 0
    female_count = 0
    neutral_count = 0

    for token in nlp_doc:
        if token.text in male_words or token.lemma_ in male_words:
            male_count += 1
        elif token.text in female_words or token.lemma_ in female_words:
            female_count += 1
        elif token.text in neutral_words or token.lemma_ in neutral_words:
            neutral_count += 1

    for name in get_named_persons(nlp_doc):
        guess = gender_detector.get_gender(name)
        # Membership tests: the female and neutral branches used to
        # compare the answer to a list with ``==`` and so never matched.
        if guess in ("male", "mostly_male"):
            male_count += 1
        elif guess in ("female", "mostly_female"):
            female_count += 1
        elif guess in ("andy", "unknown"):
            neutral_count += 1

    return {
        "male_count": male_count,
        "female_count": female_count,
        "neutral_count": neutral_count
    }
def get_percentage_of_female_speakers(first_names):
    """Return the percentage of female speakers among ``first_names``.

    gender_guesser is run on every name; 'female' and 'mostly_female'
    answers are counted and divided by ``len(first_names)``. The result
    is rounded to 2 decimal places. Raises ZeroDivisionError for an empty
    collection.
    """
    d = gender.Detector()
    total = len(first_names)
    genders = Counter(d.get_gender(x) for x in first_names)
    female_count = genders['female'] + genders['mostly_female']
    ratio = female_count / total
    return round(ratio * 100, 2)
def populate_genders(users):
    """Fill in a guessed gender for users that do not have one yet.

    A user is updated in place when it has no ``"gender"`` key but has a
    ``"twitter_profile"`` containing a ``"name"``; the text before the
    first space of that name is given to the detector and
    ``"gender_source"`` is set to ``"gender-guesser"``.

    :param users: dict mapping screen names to user dicts.
    """
    d = gender.Detector()
    for user in list(users.values()):
        if "gender" in user:
            continue
        if "twitter_profile" not in user:
            continue
        if "name" not in user["twitter_profile"]:
            continue
        first_word = user["twitter_profile"]["name"].split(" ")[0]
        user["gender"] = d.get_gender(first_word)
        user["gender_source"] = "gender-guesser"
def get_percentage_of_female_speakers(first_names: list) -> float:
    """Return the percentage of female speakers, rounded to 2 decimals.

    The lookup is case-insensitive. Names answered 'female' or
    'mostly_female' are counted and divided by ``len(first_names)``, so
    an empty list raises ZeroDivisionError.
    """
    detector = gender.Detector(case_sensitive=False)
    answers = Counter(map(detector.get_gender, first_names))
    female_total = answers["female"] + answers["mostly_female"]
    percentage = female_total / len(first_names) * 100
    return round(percentage, 2)
def get_percentage_of_female_speakers(first_names):
    """Return the percentage of female speakers, rounded to 2 decimals.

    gender_guesser is run on each name; 'female' and 'mostly_female'
    answers count as female. Raises ZeroDivisionError when
    ``first_names`` is empty.
    """
    detector = gender.Detector()
    answers = Counter(detector.get_gender(name) for name in first_names)
    female_total = answers['female'] + answers['mostly_female']
    perc = female_total * 100. / len(first_names)
    return round(perc, 2)
def select_en_base():
    # Reads every row of the nom_des_voies table, picks a first name out
    # of the street name (column 1) and stores the guessed gender back in
    # the row identified by column 0.
    #
    # NOTE(review): block nesting was reconstructed from a flattened
    # source. In particular, confirm whether the gender.Detector section
    # at the end belongs to the `except IndexError` branch, and whether
    # the function continues after the last statement shown here (the
    # final `requete` is built but never executed).
    # NOTE(review): the UPDATE statements are assembled by string
    # concatenation -- consider parameterized queries.
    debug = 2  # 1: trace the sc.check_genre path, 2: trace the gender.Detector path
    conn = database.create_connection()
    cur = database.query_create_select(conn, "Select * From nom_des_voies;")
    # Words that are skipped when looking for the first name.
    stopwords = [
        'la', 'le', 'des', 'de', 'Père', 'point', 'Saint', 'Place', 'Rue',
        'Avenue', 'Allée', 'Quai', 'Rond', 'Chemin', 'Passage', 'Cours',
        'Boulevard', 'Impasse', 'Général', 'Lieutenant', 'Route', 'Cour',
        'Galerie', 'Président', 'Prosper', 'ème', 'Régiment', 'Jardin',
        'Champ', 'La', 'Le', 'et', 'Lys', 'Docteur', 'ter', 'Capitaine',
        'Parc', 'Square', 'Stade', 'bis', 'Voie', 'Pont', 'Commandant',
        'Sainte', 'Colonel', 'Espace'
    ]
    for ligne in cur:
        try:
            if debug == 1:
                print(ligne[0], ligne[1])
            # Second word of the street name; move on to the third, then
            # the fourth word while the candidate is a stop word.
            prenom = ligne[1].split(" ")[1]
            if prenom in stopwords:
                prenom = ligne[1].split(" ")[2]
            if prenom in stopwords:
                prenom = ligne[1].split(" ")[3]
            # Raises IndexError when the street name has fewer than
            # three words, which sends control to the handler below.
            nom = ligne[1].split(" ")[2]
            if debug == 1:
                print(prenom, nom)
            d = sc.check_genre(prenom)
            # data = d.get_gender(prenom)
            if debug == 1:
                print(f'prenom :{prenom} genre:{d}')
            # Update the gender in the database
            requete = "update nom_des_voies set genre = '" + d + "' Where voie_id = " + str(
                ligne[0]) + ";"
            try:
                database.query_create_select(conn, requete)
            except:
                # NOTE(review): bare except hides the reason the update failed.
                print("Erreur")
            if debug == 1:
                print(requete)
        except IndexError:
            # The street name was too short for the indexing above.
            print("Juste un nom dans la rue")
            print(ligne[1])
            # NOTE(review): `prenom` is unbound here when the very first
            # split index fails on the first row, and stale otherwise.
            d = gender.Detector()
            data = d.get_gender(prenom)
            if debug == 2:
                print(f'prenom :{prenom} genre:{data}')
            # Update the gender in the database
            requete = "update nom_des_voies set genre = '" + data + "' Where voie_id = " + str(
                ligne[0]) + ";"
def guess_gender(author):
    """Guess an author's gender, falling back to a person search.

    The text before the first space of ``author`` is looked up with the
    detector. A plain 'male' or 'female' answer is returned as is; for
    any other answer the result of ``search_person_for_gender(author)``
    is returned instead.

    :param author: author name as a string.
    """
    d = gender.Detector()
    # str.split always yields at least one element, so index 0 is safe.
    first_name = author.split(" ")[0]
    gender_return = d.get_gender(first_name)
    if gender_return in ("male", "female"):
        return gender_return
    return search_person_for_gender(author)
def guess_gender(goodreads_data, gender_col="gender"):
    """Add a first-name column and a guessed-gender column to the frame.

    ``First.Name`` receives the text before the first space of
    ``Author`` (an empty string when the author is missing) and
    ``gender_col`` receives the detector's answer for that first name.
    The frame is modified in place and also returned.

    :param goodreads_data: DataFrame with an ``Author`` column.
    :param gender_col: name of the column that receives the guesses.
    :return: the same DataFrame.
    """
    d = gender.Detector()

    # Missing authors become "" after fillna; everything else is the
    # list produced by str.split.
    name_parts = goodreads_data["Author"].str.split(" ").fillna("")
    goodreads_data["First.Name"] = [
        "" if parts == "" else parts[0] for parts in name_parts
    ]

    guesses = list(map(d.get_gender, goodreads_data["First.Name"]))
    goodreads_data.loc[:, gender_col] = guesses
    return goodreads_data
def get_author_gender(self, name):
    """Guess the gender of ``self.author`` from the words of the name.

    ``self.author`` is split on non-word characters and each piece is
    looked up in turn; the first answer other than 'unknown' wins.

    :param name: not used; the lookup always reads ``self.author``.
    :return: the detector's answer for the first recognised word, or
        ``'unknown'`` when no word is recognised.
    """
    import re
    import gender_guesser.detector as gender

    detector = gender.Detector()
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    words = re.split(r'\W', self.author)
    guesses = (detector.get_gender(word) for word in words)
    return next((guess for guess in guesses if guess != 'unknown'),
                'unknown')
def get_percentage_of_female_speakers(first_names):
    """Run gender_guesser on the names returning a percentage of female
    speakers (female and mostly_female), rounded to 2 decimal places.

    Raises ZeroDivisionError when ``first_names`` is empty.
    """
    d = gender.Detector()
    # One lookup per name; the answer used to be fetched twice.
    femals = sum(
        1 for name in first_names
        if d.get_gender(name) in ('female', 'mostly_female'))
    return round((femals / len(first_names)) * 100, 2)
def show_gender_by_year(self, articles):
    """Count guessed author genders per publication year.

    Note that ``articles['pub_date']`` is overwritten in place with the
    publication year. Rows without an author are ignored, the text
    before the first space of ``author`` is used as the first name, and
    'unknown' answers are left out of the counts.

    :param articles: DataFrame with ``author`` and ``pub_date`` columns.
    :return: the result of ``value_counts`` on ``gender`` grouped by
        ``pub_date``.
    """
    d = gender.Detector()
    articles['pub_date'] = pd.to_datetime(articles['pub_date'])
    articles['pub_date'] = articles['pub_date'].dt.year

    has_author = articles.author.notnull()
    ay = articles.loc[has_author, ['author', 'pub_date']]
    ay['name'] = ay.author.str.split(' ', expand=True)[0]
    ay['gender'] = ay['name'].apply(d.get_gender)

    is_known = ay.gender != 'unknown'
    known = ay.loc[is_known, ['pub_date', 'gender']]
    return known.groupby('pub_date').gender.value_counts()
def __init__(self, text):
    """Prepare the title lists, the detector and the known names.

    :param text: text handed to ``spacy_names``; the names it returns
        are expanded by ``self.all_possible_names`` and stored in
        ``self.all_names``.
    """
    # Titles listed as male
    self.male_title = [
        'mr.', 'sir', 'monsieur', 'captain', 'chief', 'master', 'lord',
        'baron', 'mister', 'mr', 'prince', 'king',
    ]
    # Titles listed as female
    self.female_title = [
        'mrs.', 'ms.', 'miss', 'lady', 'mademoiselle', 'baroness',
        'mistress', 'mrs', 'ms', 'queen', 'princess', 'madam', 'madame',
    ]
    self.detector = gender.Detector()

    # base names for removing
    extracted_names = spacy_names(text)
    self.all_names = self.all_possible_names(extracted_names)
def processData(path='Dumitrescu_Gabriel_Horia_train.csv'):
    """Load the training CSV and prepare it for modelling.

    ``Owner Sex`` is computed by applying ``get_binary_gender`` (with a
    shared detector) to every ``Owner Name``; the name column is then
    dropped and missing values are filled with the column means.

    :param path: CSV file to read; defaults to the training file that
        used to be hard-coded.
    :return: the prepared DataFrame.
    """
    df = pd.read_csv(path)
    d = gender.Detector()
    df['Owner Sex'] = np.vectorize(get_binary_gender)(d, df['Owner Name'])
    df.drop(['Owner Name'], axis=1, inplace=True)
    # Restrict the means to numeric columns: pandas >= 2.0 raises on
    # non-numeric columns instead of skipping them silently.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    return df