def analyzeRace(names):
    """Count predicted race categories for a list of names.

    Input:
        - names: records convertible to a DataFrame with 'last' and
          'first' columns (one row per person)
    Output:
        - dict mapping each predicted race label to its frequency
    """
    df = pd.DataFrame(names)
    results = pred_wiki_name(df, 'last', 'first')
    pprint(results)
    # Tally label frequencies. The original tested membership via
    # `race in list(races.keys())`, which rebuilds a list and scans it
    # on every iteration; dict.get is O(1) and equivalent.
    races = {}
    for race in results['race']:
        races[race] = races.get(race, 0) + 1
    return races
Beispiel #2
0
def run_pred_wiki_name(subset_df):
    """
    This function takes a dataframe of teacher information and
    runs the Pred Wiki Name Function.

    Input:
    	- subset_df: a dataframe that is a subset of teacher information
    Output:
    	- df: a dataframe with a predicted race imputation
    """
    # Only rows with both name components can be scored by the model;
    # filtering in one combined mask matches the original two-step filter.
    named_rows = subset_df[
        subset_df.teacher_last.notnull()
        & subset_df.teacher_first.notnull()].copy()
    # NOTE(review): the first column argument here is the *first*-name
    # column — confirm this matches pred_wiki_name's expected
    # (lname_col, fname_col) order.
    df = pred_wiki_name(named_rows, 'teacher_first', 'teacher_last')

    # Map the model's fine-grained categories onto broad race labels.
    recode_dict = {
        'GreaterEuropean,British': 'white',
        'GreaterEuropean,WestEuropean,Italian': 'white',
        'Asian,GreaterEastAsian,Japanese': 'asian',
        'GreaterEuropean,Jewish': 'white',
        'GreaterEuropean,EastEuropean': 'white',
        'GreaterEuropean,WestEuropean,Hispanic': 'hispanic',
        'GreaterAfrican,Africans': 'black',
        'GreaterEuropean,WestEuropean,Germanic': 'white',
        'GreaterEuropean,WestEuropean,Nordic': 'white',
        'Asian,IndianSubContinent': 'white',
        'GreaterAfrican,Muslim': 'black',
        'GreaterEuropean,WestEuropean,French': 'white',
        'Asian,GreaterEastAsian,EastAsian': 'asian'
    }

    # Apply the recoding, keep only the race column, and rename it so the
    # caller can tell which model produced the imputation.
    df = df.replace(recode_dict)
    df = df[['race']].rename(columns={'race': 'wiki_fullname'})

    return df
#WIKIPEDIA LAST NAME model to predict the race and ethnicity of individual land owners
# The commented block below documents how the cached CSV loaded further
# down was originally produced; it is kept for provenance.
# wikilndf = ethnicolr.pred_wiki_ln(residindiv, 'party1_last1')
# wikilndf.replace('(S)', 0, inplace=True)
# wikilndf.columns = [c.split(',')[-1] for c in wikilndf.columns] #Keep only most precise group for race/ethnicity to remove commas and shorten column names
# wikilndf.columns = list(wikilndf.columns[:6])+[c[:5]+'_wln' for c in wikilndf.columns[6:]]
# wikilndf.loc[wikilndf['party1_last1'].isnull(),6:wikilndf.shape[1]] = np.NaN #Glitch in ethnicolr that models race even if last name is NaN
# wikilndf.to_csv(os.path.join(resdir,'parcel_taxrollname_residindiv_pred_wiki_ln.csv'))
# Load the cached last-name predictions. `np.object` was deprecated in
# NumPy 1.20 and removed in 1.24; the builtin `object` is the drop-in
# replacement dtype.
wikilndf = pd.read_csv(
    os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_ln.csv'),
    dtype={
        'PolyID': object,
        'TaxRollID': object
    })

#WIKIPEDIA FULL NAME model to predict the race and ethnicity of individual land owners
wikidf = ethnicolr.pred_wiki_name(residindiv_full, 'party1_last1',
                                  'party1_first1')
# '(S)' is presumably a suppressed-value sentinel in the output —
# replaced with 0 so the probability columns stay numeric.
wikidf.replace('(S)', 0, inplace=True)
wikidf.columns = [
    c.split(',')[-1] for c in wikidf.columns
]  #Keep only most precise group for race/ethnicity to remove commas and shorten column names
wikidf.columns = list(
    wikidf.columns[:6]) + [c[:5] + '_wf' for c in wikidf.columns[6:]]
wikidf.to_csv(
    os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_name.csv'))
# wikidf = pd.read_csv(os.path.join(resdir, 'parcel_taxrollname_residindiv_pred_wiki_name.csv'),
#                          dtype = {'PolyID': object, 'TaxRollID': object})

#FLORIDA REGISTRATION FULL NAME model to predict the race and ethnicity of individual land owners
# flregdf = ethnicolr.pred_fl_reg_name(residindiv_full, 'party1_last1', 'party1_first1')
# flregdf.replace('(S)', 0, inplace=True)
# flregdf.columns = list(flregdf.columns[:6])+[c+'fl' for c in flregdf.columns[6:]]
Beispiel #4
0
def main():
    """Read a pickled dataframe, add Wiki-name race predictions, pickle it.

    Usage: script <input_pickle> <output_pickle>
    """
    frame = pd.read_pickle(sys.argv[1])
    frame = pred_wiki_name(frame, lname_col='Last Name', fname_col='First Name')
    frame.to_pickle(sys.argv[2])
    # NOTE(review): orphan fragment. This code uses names (la_fname,
    # la_lname, fa_fname, authors, white/asian/hispanic/black) that are
    # not defined anywhere in the visible file, and the `continue`
    # statements imply an enclosing loop whose header is not visible
    # here. Do not assume it belongs to main() above.
    # Skip entries that appear to match known author names — presumably
    # self-exclusion; confirm against the missing loop context.
    if la_fname == authors[0]:
        if la_lname == authors[1]:
            continue

    if la_fname == authors[2]:
        if la_lname == authors[3]:
            continue

    # Strip non-ASCII characters so the name model gets plain ASCII input.
    fa_fname = fa_fname.encode("ascii", errors="ignore").decode()
    fa_lname = fa_lname.encode("ascii", errors="ignore").decode()
    la_fname = la_fname.encode("ascii", errors="ignore").decode()
    la_lname = la_lname.encode("ascii", errors="ignore").decode()

    # First-author prediction: drop the first 3 columns of the model
    # output, then aggregate the remaining probabilities into four broad
    # groups via the index arrays white/asian/hispanic/black (defined
    # elsewhere — TODO confirm their contents).
    names = [{'lname': fa_lname, 'fname': fa_fname}]
    fa_df = pd.DataFrame(names, columns=['fname', 'lname'])
    fa_race = pred_wiki_name(fa_df, 'fname', 'lname').values[0][3:]
    fa_race = [
        np.sum(fa_race[white]),
        np.sum(fa_race[asian]),
        np.sum(fa_race[hispanic]),
        np.sum(fa_race[black])
    ]

    # Last-author prediction, same aggregation.
    # NOTE(review): this list literal is never closed in the visible
    # source — the fragment is truncated mid-expression.
    names = [{'lname': la_lname, 'fname': la_fname}]
    la_df = pd.DataFrame(names, columns=['fname', 'lname'])
    la_race = pred_wiki_name(la_df, 'fname', 'lname').values[0][3:]
    la_race = [
        np.sum(la_race[white]),
        np.sum(la_race[asian]),
        np.sum(la_race[hispanic]),
        np.sum(la_race[black])
Beispiel #6
0
# Annotate every name in Muqing.csv with a predicted race label and
# write the result to Abc.csv.
Path = 'Muqing.csv'
df = pd.read_csv(Path)
listx = []
# BUG FIX: `count` was incremented inside the loop but never initialized
# anywhere in the visible code, which raises NameError on the first row.
count = 0
for index, row in df.iterrows():
    count = count + 1
    Dirtyname = row['name']
    print(Dirtyname)
    # Strip emoji before handing the string to the name parser.
    Clean = deEmojify(str(Dirtyname))
    Testname = HumanName(Clean)

    # Nothing parseable — skip the row entirely.
    if len(Testname.last) == 0 and len(Testname.first) == 0:
        continue

    # (The original also pre-assigned First/Last when only one component
    # was empty, but those values were always overwritten below, so the
    # dead assignments were dropped.)
    First = Testname.first
    Last = Testname.last
    hmm = [{'First': First, 'Last': Last}]
    print(count)
    gg = pd.DataFrame(hmm)
    pred = (pred_wiki_name(gg, 'First', 'Last'))
    x = str(pred.loc[0, 'race'])
    df.loc[index, 'race'] = x
    print(df.loc[index, 'race'])

df.to_csv("Abc.csv")
Beispiel #7
0
def prepare_data():
    """Build a merged org/job/person/degree table and write it to CSV.

    Reads raw tables via read_data(sys.argv[1]), restricts orgs to the
    ids pickled in sys.argv[2], joins geography, jobs, categories,
    people, and degrees, adds a Wiki-name ethnicity prediction per
    person, and writes ../data/processed/ojpd_eu_v3.csv.
    """
    orgs, cats, cat_groups, geo, degrees, jobs, people = read_data(sys.argv[1])
    # sys.argv[2] holds a pickled collection of org ids to keep.
    with open(sys.argv[2], 'rb') as h:
        org_ids = pickle.load(h)

    orgs = orgs[(orgs.id.isin(org_ids))]
    # Left-join location attributes; the merge suffixes ('_x'/'_y') are
    # renamed back to their canonical names immediately after.
    orgs = orgs.merge(geo, how='left', left_on='location_id', right_on='id')
    orgs.rename(index=str,
                inplace=True,
                columns={
                    'id_x': 'id',
                    'country_y': 'country',
                    'city_y': 'city'
                })
    # Org attributes joined with job rows (one row per org-job pair).
    oj = orgs[[
        'id', 'funding_total_usd', 'founded_on', 'city', 'country',
        'employee_count', 'primary_role', 'country_alpha_2', 'country_alpha_3',
        'continent', 'latitude', 'longitude'
    ]].merge(jobs[['person_id', 'org_id', 'job_id', 'is_current', 'job_type']],
             how='left',
             left_on='id',
             right_on='org_id')

    # Attach the category group for each category name.
    categories = cats.merge(cat_groups,
                            left_on='category_name',
                            right_on='category_name')

    oj = oj.merge(categories[['organization_id', 'category_group_list']],
                  how='left',
                  left_on='id',
                  right_on='organization_id')

    # Join person attributes; 'id_y' below is the person id after this merge.
    ojp = oj.merge(people[['id', 'first_name', 'last_name', 'gender']],
                   how='left',
                   left_on='person_id',
                   right_on='id')

    # Normalize the 'not_provided' sentinel to NaN.
    ojp.gender = ojp.gender.apply(lambda x: x
                                  if x != 'not_provided' else np.nan)
    # Predict ethnicity given first and last name
    ojp = ethnicolr.pred_wiki_name(df=ojp,
                                   lname_col='last_name',
                                   fname_col='first_name')
    # `ethnicities` is defined elsewhere in the module — presumably the
    # list of per-category probability columns added by pred_wiki_name;
    # TODO confirm.
    ojp.drop(ethnicities, axis=1, inplace=True)
    ojpd = ojp.merge(
        degrees[['person_id', 'degree_type', 'degree_id', 'institution_id']],
        how='left',
        left_on='id_y',
        right_on='person_id')
    # Drop redundant join keys, then restore canonical id column names.
    ojpd.drop(['person_id_x', 'person_id_y', 'organization_id', 'org_id'],
              axis=1,
              inplace=True)
    ojpd.rename(index=str,
                inplace=True,
                columns={
                    'id_x': 'org_id',
                    'id_y': 'person_id'
                })

    # change_degree_type / company_size are module-level helpers not
    # visible here; they appear to bucket raw values into categories.
    ojpd.degree_type = ojpd.degree_type.apply(change_degree_type)
    ojpd.employee_count = ojpd.employee_count.apply(company_size)

    ojpd.to_csv('../data/processed/ojpd_eu_v3.csv', index=False)
    print(ojpd.shape)