def analyzeRace(names):
    """Predict a race label for each name and tally the predictions.

    Input:
    - names: records with 'first' and 'last' fields, convertible to a
      DataFrame (ethnicolr's pred_wiki_name reads those columns).

    Output:
    - dict mapping each predicted race label to its count.
    """
    df = pd.DataFrame(names)
    results = pred_wiki_name(df, 'last', 'first')
    pprint(results)
    races = {}
    for race in results['race']:
        # dict.get replaces the original `race in list(races.keys())`
        # membership test, which built a throwaway list on every iteration.
        races[race] = races.get(race, 0) + 1
    return races
def run_pred_wiki_name(subset_df):
    """Run ethnicolr's Wikipedia full-name model on teacher records.

    Input:
    - subset_df: a dataframe that is a subset of teacher information,
      with 'teacher_first' and 'teacher_last' columns.

    Output:
    - a one-column dataframe ('wiki_fullname') holding the generalized
      race imputation for each row that had both name parts.
    """
    # The model needs both name parts; drop rows missing either one.
    named_df = subset_df[
        subset_df.teacher_last.notnull()
        & subset_df.teacher_first.notnull()
    ].copy()
    df = pred_wiki_name(named_df, 'teacher_first', 'teacher_last')

    # Collapse the model's fine-grained labels into broad categories.
    recode_dict = {
        'GreaterEuropean,British': 'white',
        'GreaterEuropean,WestEuropean,Italian': 'white',
        'Asian,GreaterEastAsian,Japanese': 'asian',
        'GreaterEuropean,Jewish': 'white',
        'GreaterEuropean,EastEuropean': 'white',
        'GreaterEuropean,WestEuropean,Hispanic': 'hispanic',
        'GreaterAfrican,Africans': 'black',
        'GreaterEuropean,WestEuropean,Germanic': 'white',
        'GreaterEuropean,WestEuropean,Nordic': 'white',
        'Asian,IndianSubContinent': 'white',
        'GreaterAfrican,Muslim': 'black',
        'GreaterEuropean,WestEuropean,French': 'white',
        'Asian,GreaterEastAsian,EastAsian': 'asian'
    }
    df.replace(to_replace=recode_dict, value=None, inplace=True)

    # Keep only the recoded race column, renamed for downstream merges.
    df = df[['race']]
    df.rename(columns={'race': 'wiki_fullname'}, inplace=True)
    return df
# WIKIPEDIA LAST NAME model to predict the race and ethnicity of
# individual land owners. The generation path below is kept commented
# out because the predictions were cached to CSV and are re-read instead.
# wikilndf = ethnicolr.pred_wiki_ln(residindiv, 'party1_last1')
# wikilndf.replace('(S)', 0, inplace=True)
# wikilndf.columns = [c.split(',')[-1] for c in wikilndf.columns]  # keep only most precise group per race/ethnicity
# wikilndf.columns = list(wikilndf.columns[:6]) + [c[:5] + '_wln' for c in wikilndf.columns[6:]]
# wikilndf.loc[wikilndf['party1_last1'].isnull(), 6:wikilndf.shape[1]] = np.nan  # glitch in ethnicolr: models race even for NaN last names
# wikilndf.to_csv(os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_ln.csv'))
wikilndf = pd.read_csv(
    os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_ln.csv'),
    # Bug fix: `np.object` was removed in NumPy 1.24; the builtin `object`
    # is the supported spelling and reads these ID columns as strings.
    dtype={'PolyID': object, 'TaxRollID': object})

# WIKIPEDIA FULL NAME model to predict the race and ethnicity of
# individual land owners.
wikidf = ethnicolr.pred_wiki_name(residindiv_full, 'party1_last1',
                                  'party1_first1')
wikidf.replace('(S)', 0, inplace=True)
# Keep only the most precise group for race/ethnicity to remove commas
# and shorten column names.
wikidf.columns = [c.split(',')[-1] for c in wikidf.columns]
wikidf.columns = list(
    wikidf.columns[:6]) + [c[:5] + '_wf' for c in wikidf.columns[6:]]
wikidf.to_csv(
    os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_name.csv'))
# wikidf = pd.read_csv(os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_name.csv'),
#                      dtype={'PolyID': object, 'TaxRollID': object})

# FLORIDA REGISTRATION FULL NAME model to predict the race and ethnicity
# of individual land owners (cached / disabled).
# flregdf = ethnicolr.pred_fl_reg_name(residindiv_full, 'party1_last1', 'party1_first1')
# flregdf.replace('(S)', 0, inplace=True)
# flregdf.columns = list(flregdf.columns[:6]) + [c + 'fl' for c in flregdf.columns[6:]]
def main():
    """Add Wikipedia name-based race predictions to a pickled dataframe.

    Command line:
    - sys.argv[1]: path of the input pickle (must have 'First Name' and
      'Last Name' columns).
    - sys.argv[2]: path where the annotated pickle is written.
    """
    frame = pd.read_pickle(sys.argv[1])
    predicted = pred_wiki_name(frame,
                               lname_col='Last Name',
                               fname_col='First Name')
    predicted.to_pickle(sys.argv[2])
if la_fname == authors[0]: if la_lname == authors[1]: continue if la_fname == authors[2]: if la_lname == authors[3]: continue fa_fname = fa_fname.encode("ascii", errors="ignore").decode() fa_lname = fa_lname.encode("ascii", errors="ignore").decode() la_fname = la_fname.encode("ascii", errors="ignore").decode() la_lname = la_lname.encode("ascii", errors="ignore").decode() names = [{'lname': fa_lname, 'fname': fa_fname}] fa_df = pd.DataFrame(names, columns=['fname', 'lname']) fa_race = pred_wiki_name(fa_df, 'fname', 'lname').values[0][3:] fa_race = [ np.sum(fa_race[white]), np.sum(fa_race[asian]), np.sum(fa_race[hispanic]), np.sum(fa_race[black]) ] names = [{'lname': la_lname, 'fname': la_fname}] la_df = pd.DataFrame(names, columns=['fname', 'lname']) la_race = pred_wiki_name(la_df, 'fname', 'lname').values[0][3:] la_race = [ np.sum(la_race[white]), np.sum(la_race[asian]), np.sum(la_race[hispanic]), np.sum(la_race[black])
# Read scraped names from Muqing.csv, predict a race label for each with
# ethnicolr's Wikipedia full-name model, and write the annotated table
# to Abc.csv.
csv_path = 'Muqing.csv'  # renamed from `Path` to avoid shadowing pathlib.Path
df = pd.read_csv(csv_path)
count = 0  # bug fix: `count` was incremented without ever being initialized
for index, row in df.iterrows():
    count = count + 1
    dirty_name = row['name']
    print(dirty_name)
    # Strip emoji before parsing, then split into first/last components.
    parsed = HumanName(deEmojify(str(dirty_name)))
    first = parsed.first
    last = parsed.last
    # Skip rows where no usable name part could be parsed at all;
    # a single missing part is tolerated (the model gets an empty string).
    if not first and not last:
        continue
    print(count)
    name_df = pd.DataFrame([{'First': first, 'Last': last}])
    pred = pred_wiki_name(name_df, 'First', 'Last')
    df.loc[index, 'race'] = str(pred.loc[0, 'race'])
    print(df.loc[index, 'race'])
df.to_csv("Abc.csv")
def prepare_data():
    """Assemble org/job/people/degree data into one table with ethnicity
    predictions and write it to ../data/processed/ojpd_eu_v3.csv.

    Command line:
    - sys.argv[1]: path passed to read_data() for the raw tables.
    - sys.argv[2]: pickle holding the set of organization ids to keep.
    """
    orgs, cats, cat_groups, geo, degrees, jobs, people = read_data(sys.argv[1])
    with open(sys.argv[2], 'rb') as h:
        org_ids = pickle.load(h)
    # Restrict to the pre-selected organizations.
    orgs = orgs[(orgs.id.isin(org_ids))]
    # Attach geography; the merge suffixes are undone by the rename below.
    orgs = orgs.merge(geo, how='left', left_on='location_id', right_on='id')
    orgs.rename(index=str,
                inplace=True,
                columns={
                    'id_x': 'id',
                    'country_y': 'country',
                    'city_y': 'city'
                })
    # Orgs + jobs: one row per (org, job) pair.
    oj = orgs[[
        'id', 'funding_total_usd', 'founded_on', 'city', 'country',
        'employee_count', 'primary_role', 'country_alpha_2',
        'country_alpha_3', 'continent', 'latitude', 'longitude'
    ]].merge(jobs[['person_id', 'org_id', 'job_id', 'is_current', 'job_type']],
             how='left',
             left_on='id',
             right_on='org_id')
    # Attach category groups for each organization.
    categories = cats.merge(cat_groups,
                            left_on='category_name',
                            right_on='category_name')
    oj = oj.merge(categories[['organization_id', 'category_group_list']],
                  how='left',
                  left_on='id',
                  right_on='organization_id')
    # Attach the person on each job row.
    ojp = oj.merge(people[['id', 'first_name', 'last_name', 'gender']],
                   how='left',
                   left_on='person_id',
                   right_on='id')
    # Treat 'not_provided' gender as missing.
    ojp.gender = ojp.gender.apply(lambda x: x
                                  if x != 'not_provided' else np.nan)
    # Predict ethnicity given first and last name
    ojp = ethnicolr.pred_wiki_name(df=ojp,
                                   lname_col='last_name',
                                   fname_col='first_name')
    # Drop the per-category probability columns, keeping only the
    # predicted label. NOTE(review): `ethnicities` is a module-level
    # list defined elsewhere in this file — presumably the probability
    # column names; confirm it matches ethnicolr's output columns.
    ojp.drop(ethnicities, axis=1, inplace=True)
    # Attach degrees; 'id_y' is the person id after the previous merge.
    ojpd = ojp.merge(
        degrees[['person_id', 'degree_type', 'degree_id', 'institution_id']],
        how='left',
        left_on='id_y',
        right_on='person_id')
    # Drop redundant join keys left over from the merges.
    ojpd.drop(['person_id_x', 'person_id_y', 'organization_id', 'org_id'],
              axis=1,
              inplace=True)
    ojpd.rename(index=str,
                inplace=True,
                columns={
                    'id_x': 'org_id',
                    'id_y': 'person_id'
                })
    # Normalize free-text fields via helpers defined elsewhere in the file.
    ojpd.degree_type = ojpd.degree_type.apply(change_degree_type)
    ojpd.employee_count = ojpd.employee_count.apply(company_size)
    ojpd.to_csv('../data/processed/ojpd_eu_v3.csv', index=False)
    print(ojpd.shape)