Ejemplo n.º 1
0
def get_author_gender(name):
    """Look up the gender for an author using only the first name token.

    ``name`` is split on whitespace and its first token is handed to
    ``getGenders``; whatever that call returns is passed back unchanged.
    A name with no tokens (empty or whitespace-only) raises ``IndexError``.
    """
    first_token = name.split()[0]
    return getGenders(first_token)
Ejemplo n.º 2
0
def save_name_genders():
    name_pool = get_names()
        
    result = {}
    bad_names = []
    
    chunk_size = 50
    total_chunks = len(name_pool)/chunk_size
    rest = len(name_pool)%chunk_size    
    
    for j in range(0, total_chunks):
        print '*******Chunk', j, '/', total_chunks
        names = []
        if j == total_chunks - 1:
            names = name_pool[j * chunk_size:total_chunks*chunk_size+rest]
        else:
            names = name_pool[j * chunk_size:(j+1)*chunk_size]
        
        gender_list = gender.getGenders(names) #gender, prob, count        
        
        for i, name in enumerate(names):
            infered_gender = gender_list[i][0]
            prob = float(gender_list[i][1])
            print name, infered_gender, prob
            if infered_gender == 'None' or prob < 0.6: 
                bad_names.append(name)
            result[name] = infered_gender
        
    with open( "genders.p", "wb" ) as outfile:
        pickle.dump(result, outfile) 
        
    with open( "badnames.p", "wb" ) as outfile:
        pickle.dump(bad_names, outfile) 
    
    return bad_names
Ejemplo n.º 3
0
def get_name_genders_genderize():
    name_pool = get_names()

    chunk_size = 50
    total_chunks = len(name_pool)/chunk_size
    rest = len(name_pool)%chunk_size
    
    genders = {}

    for j in range(0, total_chunks):
        print '*******Chunk', j, '/', total_chunks
        names = []
        if j == total_chunks - 1:
            names = name_pool[j * chunk_size:total_chunks*chunk_size+rest]
        else:
            names = name_pool[j * chunk_size:(j+1)*chunk_size]


        gender_list = gender.getGenders(names) 
        
#        [
#          {"name":"peter","gender":"male","probability":"1.00","count":796},
#          {"name":"lois","gender":"female","probability":"0.94","count":70},
#          {"name":"stevie","gender":"male","probability":"0.63","count":39}
#        ]

        for g in gender_list:
            genders[g[0]] = g[1]

    json.dump(genders, open('genders_genderize.json', 'w'))
Ejemplo n.º 4
0
def get_genderize(name):
    """Assign a gender to a first name via the genderize.io API.

    The lookup goes through the gender.py library. Pass the first name
    only, e.g. "Mark".

    name (string)
    """
    return getGenders(name)
def get_gender_from_cache_or_genderize(artist, cached_genders):
    """Return the gender for ``artist``, consulting the cache first.

    A cached value that is not ``None`` is returned directly. Otherwise
    the name is looked up with ``getGenders``, the result is stored in
    ``cached_genders`` (mutated in place), the cache is persisted through
    ``update_cached_genders`` and the freshly fetched gender is returned.
    """
    known = cached_genders.get(artist)
    if known is not None:
        return known

    # Cache miss: fall back to gender.py
    # (https://github.com/block8437/gender.py, backed by https://genderize.io).
    fetched, _, _ = getGenders([artist])[0]
    cached_genders[artist] = fetched
    update_cached_genders(cached_genders)
    return fetched
Ejemplo n.º 6
0
def _get_genders(df):
    """Look up a gender and its probability for every row of ``df["Name"]``.

    Each full name is reduced with ``get_firstname`` and sent to
    ``getGenders`` one at a time; only the first result of each call is
    used and it is unpacked as a 3-item tuple.

    Returns a pair of parallel lists ``(genders, gender_probas)``. Each
    gender entry is element 0 of the returned gender value (its first
    character when that value is a string), or ``"n"`` when the value is
    ``None``.
    """
    lookups = [
        getGenders(get_firstname(fullname))[0]
        for fullname in df["Name"].values
    ]

    genders = []
    gender_probas = []
    for gen, genpro, _ in lookups:
        genders.append("n" if gen is None else gen[0])
        gender_probas.append(genpro)

    return genders, gender_probas
Ejemplo n.º 7
0
def gender_voter(name):
    """Poll three gender detectors for ``name`` and return the winning vote.

    Every detector contributes at most one vote per gender to a
    ``{"male", "female", "unknown"}`` tally, which is then handed to
    ``most_votes``; its result is returned as-is. The "unknown" counter
    is never incremented here.
    """
    votes = {"male": 0, "female": 0, "unknown": 0}

    # Detector 1 -- getGenders (male, female, None); only the label is used.
    found_gender, _, _ = getGenders(name)[0]

    # Detector 2 -- gndr.Detector
    # (male, female, mostly_male, mostly_female, andy, unknown).
    found_gender_2 = gndr.Detector(case_sensitive=False).get_gender(name)

    # Detector 3 -- genderator, better for Spanish names (Male, Female).
    answer = genderator.Parser().guess_gender(name)
    spanish_gender = answer['gender'] if answer else 'not found'

    # --- Voting ---
    # Detector 1: a clear label votes for itself.
    if found_gender in ('male', 'female'):
        votes[found_gender] += 1

    # Detector 2: "mostly_*" counts like the plain label, and "andy"
    # votes for both sides.
    if found_gender_2 in ('male', 'mostly_male', 'andy'):
        votes['male'] += 1
    if found_gender_2 in ('female', 'mostly_female', 'andy'):
        votes['female'] += 1

    # Detector 3: capitalised labels.
    if spanish_gender == 'Male':
        votes['male'] += 1
    elif spanish_gender == 'Female':
        votes['female'] += 1

    return most_votes(votes)
Ejemplo n.º 8
0
def update_name_genders():
    name_pool = get_names()
    bad_names = []
    
    config = load_config()
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()

    chunk_size = 50
    total_chunks = len(name_pool)/chunk_size
    rest = len(name_pool)%chunk_size

    for j in range(0, total_chunks):
        print '*******Chunk', j, '/', total_chunks
        names = []
        if j == total_chunks - 1:
            names = name_pool[j * chunk_size:total_chunks*chunk_size+rest]
        else:
            names = name_pool[j * chunk_size:(j+1)*chunk_size]


        gender_list = gender.getGenders(names) #gender, prob, count

        for i, name in enumerate(names):
            infered_gender = gender_list[i][0]
            prob = float(gender_list[i][1])
            #print name, infered_gender, prob
            if infered_gender == 'None' or prob < 0.6:
                bad_names.append(name)
                
            query = "UPDATE person SET gender = '%s' WHERE first_name = '%s'" % (infered_gender, name)
            cursor.execute(query)
            
    
    cursor.close()
    cnx.close()
Ejemplo n.º 9
0
import pandas as pd
from gender import getGenders
import time

# Load the header-less name file; the names live in its first column.
df = pd.read_csv("user_names/name_sorted", header=None)

print(df)
nameList = [str(x) for x in df[0].values.tolist()]

# Hard-coded starting offset into nameList: entries before it are skipped.
index = 994
while index < len(nameList):
    try:
        name = nameList[index]
        print("name = ", name)
        gender = getGenders(name)
        print("gender = ", gender)
        # Append after every lookup so progress survives an interruption.
        with open("genderList.csv", "a") as op:
            op.write("{},{}\n".format(name, str(gender[0])))
    except Exception as e:
        # Any failure: report it, wait an hour, then retry the same
        # name (index is deliberately left untouched).
        print("exept: " + str(e))
        time.sleep(3600)
    else:
        index += 1

print("Done")
Ejemplo n.º 10
0
        except:
            print("oops")
            print(row_index)
            print(out)
'''
out = 'org'
# Code for checking the records that were left out
for row_index, row in data_all.iterrows():
    if row["user_fname_count"] == 0 and row[
            "user_predicted_fname_genderO"] != 'Org':
        usr_fname = row["user_fname"]
        time.sleep(0.1)
        #result = getGenders(usr_fname)
        try:
            print(row_index)
            out = getGenders(usr_fname)
            print(out)
            if (out[0][0] == 'male'):
                data_all.loc[row_index,
                             "user_predicted_fname_genderM"] = out[0][1]
                data_all.loc[row_index, "user_fname_count"] = out[0][2]

            if (out[0][0] == 'female'):
                data_all.loc[row_index,
                             "user_predicted_fname_genderF"] = out[0][1]
                data_all.loc[row_index, "user_fname_count"] = out[0][2]

            if (out[0][0] == 'None'):
                data_all.loc[row_index, "user_predicted_fname_genderO"] = 'Org'
                data_all.loc[row_index, "user_fname_count"] = out[0][2]
        except:
Ejemplo n.º 11
0
import glob
import sys
from gender import getGenders

indir = sys.argv[1]
infile = sys.argv[2]
cur_names = set()
with open(infile) as f:
    for line in f:
        cur_names.add(line.split()[0].strip())

dirlist = glob.glob('%s/*' % indir)
namelist = [s.split('/')[-1].split('_')[0] for s in dirlist]
names = list(set(namelist) - cur_names)
gender_dict = dict.fromkeys(names, None)
for i in range(0, len(names), 10):
    g = getGenders(names[i:i + 10])
    for name, gender in zip(names[i:i + 10], g):
        gender_dict[name] = gender
        print name, gender