def addEthnicityFields(df, namefield):
    """Add census surname-ethnicity percentage columns for a name column.

    The last whitespace-separated word of each ``namefield`` value is used
    as the lookup key for ``ethnicolr.census_ln``
    (https://pypi.org/project/ethnicolr/#description).

    Args:
        df: DataFrame holding the names. It is copied first, so the caller's
            frame is left untouched.
        namefield: Name of the column in ``df`` that contains the names.

    Returns:
        The DataFrame returned by ``census_ln`` with the temporary lookup
        column removed and the six percentage columns converted to float;
        entries that cannot be parsed as numbers become NaN.
    """
    import ethnicolr

    def last_word(value):
        # Missing, non-string, empty or whitespace-only values have no last
        # word; return None for them instead of raising.
        words = value.split() if isinstance(value, str) else []
        return words[-1] if words else None

    # Work on a copy: the temporary column below must not leak into the
    # DataFrame object owned by the caller.
    df = df.copy()

    # use only the last word of the field for analysis
    df['ethname'] = df[namefield].map(last_word)

    # convert using library function
    df = ethnicolr.census_ln(df, 'ethname')

    # drop the temporary column
    df = df.drop(columns=['ethname'])

    newfields = (
        'pctwhite',
        'pctblack',
        'pctapi',
        'pctaian',
        'pct2prace',
        'pcthispanic',
    )
    for fieldname in newfields:
        # errors='coerce' turns any non-numeric entry into NaN.
        df[fieldname] = pd.to_numeric(
            df[fieldname].astype(str), errors='coerce'
        ).astype(float)

    return df
def run_census_last(subset_df, census_year):
    """
    Run the Census Ln function on the rows that have a teacher last name.

    Rows whose ``teacher_last`` value is null are filtered out, the
    remaining rows are copied, and the copy is passed to ``census_ln``
    keyed on the ``teacher_last`` column.

    Input:
        - subset_df: a dataframe with a ``teacher_last`` column
        - census_year: forwarded to ``census_ln`` as its third argument

    Output:
        - the dataframe returned by ``census_ln`` for the filtered rows
    """
    last_name_present = subset_df.teacher_last.notnull()
    teachers_with_last_name = subset_df[last_name_present].copy()
    return census_ln(teachers_with_last_name, 'teacher_last', census_year)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Demo script: build a three-row frame of sample surnames and print the
# result of each ethnicolr lookup on its 'name' column.

import pandas as pd
from ethnicolr import census_ln, pred_census_ln

names = [{'name': surname} for surname in ('smith', 'zhang', 'jackson')]
df = pd.DataFrame(names)

# Input frame first, then one printout per lookup call.
print(df)
print(census_ln(df, 'name'))        # no year argument passed
print(census_ln(df, 'name', 2010))  # 2010 passed as the third argument
print(pred_census_ln(df, 'name'))
native_american = 0 two_race = 0 df = [] if not os.path.exists('ethnicity.pkl'): with open('full.json', 'r') as tweets_file: for idx, line in enumerate(tweets_file): try: if idx % 10000 == 0 and idx != 0: print(idx) df = pd.DataFrame(df) classed = census_ln(df, 'name') classed = classed.dropna() classed = classed.drop(['name'], axis=1) classed = classed.replace('(S)', 0) classed = classed.astype('float64') classed = classed.divide(100) white += float(classed['pctwhite'].sum()) black += float(classed['pctblack'].sum()) asian += float(classed['pctapi'].sum()) native_american += float(classed['pctaian'].sum()) two_race += float(classed['pct2prace'].sum()) hispanic += float(classed['pcthispanic'].sum()) df = [] tweet = json.loads(line) name = tweet['user']['name']
def run_census_ln(subset_df, census_year):
    """Run ``census_ln`` on the rows with a non-null ``contributor_lname``.

    The matching rows are copied before being handed to ``census_ln``
    together with ``census_year``; the result of that call is returned.
    """
    lname_present = subset_df.contributor_lname.notnull()
    contributors_with_lname = subset_df[lname_present].copy()
    return census_ln(contributors_with_lname, 'contributor_lname', census_year)