def get_author_gender(name):
    """Look up the gender of an author from the first word of their name.

    Args:
        name: Author name; only its first whitespace-separated token is
            passed to ``getGenders``.

    Returns:
        Whatever ``getGenders`` returns for that first token.

    Raises:
        IndexError: If ``name`` is empty or contains only whitespace.
    """
    first_token = name.split()[0]
    return getGenders(first_token)
def save_name_genders():
    """Infer a gender for every known name and pickle the results.

    Names come from ``get_names()`` and are sent to ``gender.getGenders`` in
    chunks of 50. Each returned entry is indexed positionally as
    (gender, probability, count).

    Side effects:
        Writes ``genders.p`` (dict: name -> inferred gender) and
        ``badnames.p`` (list of low-confidence names) to the working directory.

    Returns:
        list: Names whose gender came back as the string ``'None'`` or whose
        probability is below 0.6.
    """
    name_pool = get_names()
    chunk_size = 50
    # Ceiling division: the previous floor division produced zero chunks for
    # pools smaller than ``chunk_size`` (so nothing was looked up at all) and
    # yielded a float under Python 3, which ``range`` rejects.
    total_chunks = (len(name_pool) + chunk_size - 1) // chunk_size

    result = {}
    bad_names = []
    for j in range(total_chunks):
        # A single pre-formatted string prints identically on Python 2 and 3.
        print('*******Chunk %s / %s' % (j, total_chunks))
        # Slicing clamps at the end of the list, so the final (possibly
        # shorter) chunk needs no special case.
        names = name_pool[j * chunk_size:(j + 1) * chunk_size]
        gender_list = gender.getGenders(names)  # gender, prob, count
        for name, info in zip(names, gender_list):
            infered_gender = info[0]
            prob = float(info[1])
            print('%s %s %s' % (name, infered_gender, prob))
            if infered_gender == 'None' or prob < 0.6:
                bad_names.append(name)
            result[name] = infered_gender

    with open("genders.p", "wb") as outfile:
        pickle.dump(result, outfile)
    with open("badnames.p", "wb") as outfile:
        pickle.dump(bad_names, outfile)
    return bad_names
def get_name_genders_genderize():
    """Look up genders for all known names and dump them to JSON.

    Names come from ``get_names()`` and are sent to ``gender.getGenders`` in
    chunks of 50.

    Side effects:
        Writes ``genders_genderize.json`` (object: name -> inferred gender)
        to the working directory.
    """
    name_pool = get_names()
    chunk_size = 50
    # Ceiling division so that a pool smaller than ``chunk_size`` is still
    # processed (floor division gave zero chunks) and so the value stays an
    # int under Python 3.
    total_chunks = (len(name_pool) + chunk_size - 1) // chunk_size

    genders = {}
    for j in range(total_chunks):
        # A single pre-formatted string prints identically on Python 2 and 3.
        print('*******Chunk %s / %s' % (j, total_chunks))
        # Slicing clamps at the end of the list, so the last chunk is simply
        # shorter.
        names = name_pool[j * chunk_size:(j + 1) * chunk_size]
        gender_list = gender.getGenders(names)
        # Entries are indexed positionally as (gender, probability, count),
        # the same way save_name_genders reads them. Key the mapping by the
        # queried name: keying by entry[0] (the gender label) collapsed the
        # whole output into a handful of entries.
        for name, info in zip(names, gender_list):
            genders[name] = info[0]

    # Context manager guarantees the file is flushed and closed.
    with open('genders_genderize.json', 'w') as outfile:
        json.dump(genders, outfile)
def get_genderize(name):
    """Assign a gender to a first name via the gender.py library (genderize.io API).

    Args:
        name (str): First name only, e.g. "Mark".

    Returns:
        The unmodified result of ``getGenders(name)``.
    """
    return getGenders(name)
def get_gender_from_cache_or_genderize(artist, cached_genders):
    """Return the gender for ``artist``, consulting the cache first.

    Args:
        artist: Name to resolve; used as the cache key.
        cached_genders: Mapping of name -> gender. It is updated in place and
            handed to ``update_cached_genders`` whenever a new lookup is made.

    Returns:
        The cached gender if a non-None entry exists, otherwise the gender
        reported by ``getGenders`` for ``artist``.
    """
    hit = cached_genders.get(artist)
    if hit is not None:
        return hit

    # not in cache -> use https://github.com/block8437/gender.py (https://genderize.io)
    first_result = getGenders([artist])[0]
    resolved, _, _ = first_result
    cached_genders[artist] = resolved
    update_cached_genders(cached_genders)
    return resolved
def _get_genders(df):
    """Infer a gender code and probability for every row of ``df["Name"]``.

    Each full name is reduced with ``get_firstname`` and the first entry of
    ``getGenders`` for that first name is used; entries are unpacked as
    3-tuples (gender, probability, third value).

    Args:
        df: Object exposing ``df["Name"].values`` as an iterable of full names.

    Returns:
        tuple: ``(genders, gender_probas)`` - two lists aligned with the input
        rows. ``genders`` holds the first character of each gender label (or
        "n" when the label is None); ``gender_probas`` holds the probabilities.
    """
    lookups = [
        getGenders(get_firstname(fullname))[0]
        for fullname in df["Name"].values
    ]

    # NOTE(review): other code in this file compares the gender label against
    # the *string* 'None'; if that is what getGenders returns here, the label
    # is never None and unknowns come out as "N" rather than "n" - confirm.
    genders = []
    for label, _proba, _extra in lookups:
        if label is not None:
            genders.append(label[0])
        else:
            genders.append("n")

    gender_probas = []
    for _label, proba, _extra in lookups:
        gender_probas.append(proba)

    return genders, gender_probas
def gender_voter(name):
    """Combine three gender detectors into a single vote for ``name``.

    Detectors, queried in this order:
        1. ``getGenders`` - its first result's gender label.
        2. ``gndr.Detector`` (case-insensitive) - may answer male, female,
           mostly_male, mostly_female, andy or unknown.
        3. ``genderator.Parser`` - Spanish-oriented guesser answering
           Male / Female.

    Args:
        name: Name to classify.

    Returns:
        The result of ``most_votes`` applied to the tally dict with keys
        "male", "female" and "unknown".
    """
    # --- collect the three opinions -------------------------------------
    api_gender, precision, n_docs = getGenders(name)[0]

    detector = gndr.Detector(case_sensitive=False)
    detector_gender = detector.get_gender(name)

    parsed = genderator.Parser().guess_gender(name)
    spanish_gender = parsed['gender'] if parsed else 'not found'

    # --- translate each opinion into the tally keys it supports ---------
    api_votes = {
        'male': ('male',),
        'female': ('female',),
    }
    detector_votes = {
        'male': ('male',),
        'female': ('female',),
        'mostly_male': ('male',),
        'mostly_female': ('female',),
        # androgynous names count once for each side
        'andy': ('male', 'female'),
    }
    spanish_votes = {
        'Male': ('male',),
        'Female': ('female',),
    }

    votes = {"male": 0, "female": 0, "unknown": 0}
    ballots = (
        (api_votes, api_gender),
        (detector_votes, detector_gender),
        (spanish_votes, spanish_gender),
    )
    for table, answer in ballots:
        # Answers not listed in a table (None, unknown, 'not found', ...)
        # contribute no vote.
        for side in table.get(answer, ()):
            votes[side] += 1

    return most_votes(votes)
def update_name_genders():
    """Infer a gender for every known name and store it in the ``person`` table.

    Names come from ``get_names()`` and are sent to ``gender.getGenders`` in
    chunks of 50; each returned entry is indexed positionally with the gender
    label at position 0. For every name, ``person.gender`` is set to that
    label on all rows whose ``first_name`` matches.

    The database connection is opened with the settings from
    ``load_config()`` and is always closed, even if a lookup or query fails.
    """
    name_pool = get_names()
    config = load_config()

    chunk_size = 50
    # Ceiling division so that a pool smaller than ``chunk_size`` is still
    # processed (floor division gave zero chunks) and so the value stays an
    # int under Python 3.
    total_chunks = (len(name_pool) + chunk_size - 1) // chunk_size

    # Placeholders let the driver quote the values: names such as O'Brien no
    # longer break the statement and input cannot inject SQL.
    query = "UPDATE person SET gender = %s WHERE first_name = %s"

    cnx = mysql.connector.connect(**config)
    try:
        cursor = cnx.cursor()
        try:
            for j in range(total_chunks):
                # A single pre-formatted string prints identically on
                # Python 2 and 3.
                print('*******Chunk %s / %s' % (j, total_chunks))
                # Slicing clamps at the end of the list, so the last chunk is
                # simply shorter.
                names = name_pool[j * chunk_size:(j + 1) * chunk_size]
                gender_list = gender.getGenders(names)  # gender, prob, count
                for name, info in zip(names, gender_list):
                    cursor.execute(query, (info[0], name))
                # Without an explicit commit the updates are discarded when
                # the connection closes (unless autocommit is enabled).
                # Committing per chunk keeps finished work if a later lookup
                # fails.
                cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()
import time

import pandas as pd
from gender import getGenders

# Load the name list: first column of a header-less CSV, coerced to strings.
df = pd.read_csv("user_names/name_sorted", header=None)
print(df)
nameList = [str(x) for x in df[0].values.tolist()]

# Hard-coded resume position: entries before this index are not processed.
index = 994
while index < len(nameList):
    try:
        name = nameList[index]
        print("name = ", name)
        gender = getGenders(name)
        print("gender = ", gender)
        # Append one "<name>,<first result>" line per processed name.
        with open("genderList.csv", "a") as op:
            op.write(",".join([name, str(gender[0])]) + "\n")
    except Exception as e:
        # Any failure: report it, wait an hour, then retry the same name.
        print("exept: " + str(e))
        time.sleep(3600)
    else:
        # Advance only after the name was looked up and written.
        index += 1
print("Done")
except: print("oops") print(row_index) print(out) ''' out = 'org' # coding for checking the left out record for row_index, row in data_all.iterrows(): if row["user_fname_count"] == 0 and row[ "user_predicted_fname_genderO"] != 'Org': usr_fname = row["user_fname"] time.sleep(0.1) #result = getGenders(usr_fname) try: print(row_index) out = getGenders(usr_fname) print(out) if (out[0][0] == 'male'): data_all.loc[row_index, "user_predicted_fname_genderM"] = out[0][1] data_all.loc[row_index, "user_fname_count"] = out[0][2] if (out[0][0] == 'female'): data_all.loc[row_index, "user_predicted_fname_genderF"] = out[0][1] data_all.loc[row_index, "user_fname_count"] = out[0][2] if (out[0][0] == 'None'): data_all.loc[row_index, "user_predicted_fname_genderO"] = 'Org' data_all.loc[row_index, "user_fname_count"] = out[0][2] except:
import glob
import os
import sys

from gender import getGenders

# Usage: <script> INDIR INFILE
#   INDIR  - directory whose entries are named "<name>_..." (the text before
#            the first underscore is taken as the name)
#   INFILE - text file whose first whitespace-separated token on each line is
#            a name to exclude from the lookup
indir = sys.argv[1]
infile = sys.argv[2]

cur_names = set()
with open(infile) as f:
    for line in f:
        fields = line.split()
        # Blank or whitespace-only lines have no first token; indexing them
        # unconditionally raised IndexError.
        if fields:
            cur_names.add(fields[0])

dirlist = glob.glob('%s/*' % indir)
# basename() handles the platform's path separator, unlike split('/').
namelist = [os.path.basename(s).split('_')[0] for s in dirlist]
names = list(set(namelist) - cur_names)

gender_dict = dict.fromkeys(names, None)
batch_size = 10  # names sent per getGenders call
for i in range(0, len(names), batch_size):
    batch = names[i:i + batch_size]
    g = getGenders(batch)
    for name, gender in zip(batch, g):
        gender_dict[name] = gender
        # A single pre-formatted string prints identically on Python 2 and 3.
        print('%s %s' % (name, gender))