def run_pred_census_ln(subset_df, census_year):
    """
    Run the Pred Census Ln function on teacher last names and return the
    predicted race as a single-column dataframe.

    Rows whose ``teacher_last`` is null are excluded before prediction, so
    the result can have fewer rows than ``subset_df``.

    Input:
        - subset_df: a dataframe that is a subset of teacher information;
          it must have a ``teacher_last`` column
        - census_year: forwarded to ``pred_census_ln`` as its third
          positional argument
    Output:
        - df: a dataframe with one column, ``census_lastname``, holding the
          predicted race with the ``api`` label recoded to ``asian``
    """
    # Copy the filtered rows so the prediction step works on an independent
    # frame rather than on a slice of the caller's dataframe.
    has_last_name_df = subset_df[subset_df.teacher_last.notnull()].copy()
    df = pred_census_ln(has_last_name_df, 'teacher_last', census_year)

    # Recode one race label to the generalized race category.
    recode_dict = {'api': 'asian'}

    # Keep only the race column, recode it, and give it its output name.
    # The mapping is passed on its own: an explicit ``value=None`` next to a
    # dict is interpreted differently across pandas releases and can leave
    # the labels unrecoded. Chaining also avoids in-place edits on a slice.
    return (
        df[['race']]
        .replace(recode_dict)
        .rename(columns={'race': 'census_lastname'})
    )
def guessActorInfo(actor_name):
    """Guess an actor's gender and race from their name alone.

    Used when no NNDB bio page was found. The first space-separated token
    is treated as the first name and everything after the first space as
    the last name.

    Returns a ``(gender, race)`` tuple. Each guess is translated through
    ``gender_table`` / ``race_table``; a guess with no entry in its table
    becomes ``"Others"``.
    """
    print("\t\tNNDB bio page not found...Using Python modules to guess")

    # Split once on the first space: the head is the first name, the tail
    # (which may itself contain spaces) is the last name.
    first_name, _, last_name = actor_name.partition(" ")

    gender_guessed = gender_detector.get_gender(first_name)
    if gender_guessed == "andy":
        print("\t\tUsing getGenders utility")
        # The getGenders lookup is disabled; an "andy" result is simply
        # replaced by the hard-coded value below.
        # gender_guessed = getGenders(first_name)[0][0]
        gender_guessed = "Male"
    gender = gender_table.get(gender_guessed, "Others")

    # pred_census_ln works on a dataframe, so wrap the single last name.
    surname_frame = pd.DataFrame([{'name': last_name}])
    race_guessed = pred_census_ln(surname_frame, 'name')["race"][0]
    race = race_table.get(race_guessed, "Others")

    print("\t\tGuess:(" + gender_guessed + "," + race_guessed + ")")
    return (gender, race)
def predict_ethnicity_1(df: pd.DataFrame, name_attr: str) -> pd.DataFrame:
    """Return four ethnicity score columns predicted from a name column.

    Calls ``ec.pred_census_ln`` with ``year=2010`` on ``df`` and keeps the
    last four columns of its output, relabelled ``asian``, ``black``,
    ``hispanic`` and ``white`` and re-indexed to match ``df``.

    Args:
        df: Input frame containing the ``name_attr`` column. A ``race``
            column, if present after prediction, is dropped from it in
            place.
        name_attr: Name of the column in ``df`` holding the names.

    Returns:
        A frame with columns ``asian``, ``black``, ``hispanic``, ``white``
        that shares ``df``'s index.
    """
    # NOTE(review): assumes the last four output columns are the scores in
    # exactly this order -- confirm against the installed library version.
    df1 = ec.pred_census_ln(df, name_attr, year=2010).iloc[:, -4:]
    df1.columns = ['asian', 'black', 'hispanic', 'white']
    df1.index = df.index

    # Remove the `race` column from the caller's frame (presumably added as
    # a side effect of the prediction call -- TODO confirm). `errors` is set
    # to 'ignore' so a frame without that column does not raise KeyError
    # after the predictions have already been computed.
    df.drop(columns='race', inplace=True, errors='ignore')
    return df1
#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
from ethnicolr import census_ln, pred_census_ln

# Three sample surnames, one record per name, in a single `name` column.
names = [{'name': surname} for surname in ('smith', 'zhang', 'jackson')]
df = pd.DataFrame(names)

# Show the raw input first.
print(df)

# Census lookup on the `name` column: once with the default arguments and
# once with 2010 passed as the third argument.
print(census_ln(df, 'name'))
print(census_ln(df, 'name', 2010))

# Model-based prediction on the same column.
print(pred_census_ln(df, 'name'))
# Wrap up the username-collection phase.
logging.info("ending the get username program at " + str(datetime.datetime.now()))
logging.info("total: " + str(count))
# close() must actually be called: the bare attribute `myFile.close` is a
# no-op, leaving the file open. Presumably myFile is the handle that wrote
# "tmp_<filename>", in which case unflushed rows would be missing from the
# read below -- TODO confirm against the code that opens it.
myFile.close()

# adding race prediction
logging.info("adding race prediction")
print("adding race prediction")
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is
# needed before handing it to pred_census_ln.
df = pd.read_csv(
    "tmp_" + filename,
    usecols=[
        'username',
        'firstname',
        'lastname',
        'groupname',
        'recordingTime',
        'num_record',
        'num_groupuser',
    ],
)
# NOTE(review): shadows the builtin `dict` and is unused in the visible
# code; kept because later parts of the script may rely on it.
dict = {}
# Predict from the `lastname` column and write the result to `filename`.
result = pred_census_ln(df, 'lastname')
result.to_csv(filename, sep=',')
logging.info("finished race prediction")
print("finished race prediction")
print(filename)
#f.write(str(datetime.datetime.now())+","+str(count)+","+str(client.get_participants(channel,limit = 0).total)+","+channel[13:])
#f.close()

#adding to database
#filename = "dx_chain_user_first.csv"
#groupname = "dxchain"
#connect to database
print("connecting db")
import pandas as pd
import numpy as np
import csv
import sys
from ethnicolr import census_ln, pred_census_ln

# Column that holds the surname used for the prediction.
SURNAME_COLUMN = 'LAST NAME'
# Columns taken from the prediction output, and the headers they are
# written under (only `race` is renamed).
KEPT_COLUMNS = ['FIRST NAME', 'LAST NAME', 'race']
OUTPUT_COLUMNS = ['FIRST NAME', 'LAST NAME', 'ETHNICITY']
# Fixed name of the CSV file this script writes.
OUTPUT_PATH = 'Consumer_Data_10394_Sample2.csv'

# The input CSV path is the first command-line argument.
input_path = sys.argv[1]
people = pd.read_csv(input_path)

# Run the prediction on the surname column, with 2010 as the third argument.
predictions = pred_census_ln(people, SURNAME_COLUMN, 2010)

# Reduce to the name columns plus the predicted race, relabel, and save
# without the index column.
report = predictions[KEPT_COLUMNS]
report.columns = OUTPUT_COLUMNS
report.to_csv(OUTPUT_PATH, index=False)