Python remove_outliersの例、data_functions_albert.remove_outliers Pythonの例

コード例 #1

0

ファイルを表示

#%% +Wealth
# Disabled step: the string literal below holds a former wealth merge
# (totalWEALTH.dta joined onto `data` by 'hh'). As a bare string it is never
# executed.
"""
wealth = pd.read_stata('totalWEALTH.dta')
wealth = wealth[["HHID","totW"]]
wealth.rename(columns={'HHID':'hh'}, inplace=True)

data = pd.merge(data, wealth, on='hh', how='inner')
"""

#%% Income:
# Labor & business income: in US dollars
lab_inc = pd.read_csv('income_hhsec_2009.csv', header=0, na_values='nan')
# Trim only the three labor-income columns. lq/hq are presumably the lower and
# upper quantile cutoffs (0.1% per tail) -- confirm against remove_outliers.
lab_inc[["wage_total", "bs_profit", "other_inc"
         ]] = remove_outliers(lab_inc[["wage_total", "bs_profit",
                                       "other_inc"]],
                              lq=0.001,
                              hq=0.999)
# Agricultural income: in UG Shillings
ag_inc = pd.read_csv('income_agsec_09.csv', header=0, na_values='nan')
#ag_inc["hh"] = ag_inc["hh"].astype(int)

# Outer join keeps households that appear in only one of the two sources.
inc = pd.merge(lab_inc, ag_inc, on="hh", how="outer")
# Drops the columns at positions 0 and 5 of the merged frame.
# NOTE(review): presumably the index columns saved in each CSV -- this is
# position-dependent and breaks silently if either file's layout changes.
inc = inc.drop(inc.columns[[0, 5]], axis=1)

# Second trim, this time over every remaining column of the merged frame.
# NOTE(review): the labor columns were already trimmed above, and the 'hh'
# key is passed through as well unless it was dropped -- confirm intended.
inc = remove_outliers(inc, lq=0.001, hq=0.999)

# Row-wise totals: without and with the "other" component.
inc["inctotal"] = inc.loc[:, ["wage_total", "bs_profit", "total_agrls"]].sum(
    axis=1)
# NOTE(review): this selects "other", while the column trimmed above is named
# "other_inc" -- verify that an "other" column actually exists in `inc`.
inc["inctotal_trans"] = inc.loc[:, [
    "wage_total", "bs_profit", "other", "total_agrls"
]].sum(axis=1)

コード例 #2

0

ファイルを表示

ファイル: agric_data_UGA10.py プロジェクト: snowdj/DSGE-Agricultural-Choice

# Monetary columns that get deflated and converted. The list is defined once so
# the left- and right-hand side of the assignment cannot drift apart.
value_cols = [
    'animal_value_p_sell', 'chem_fert', 'cons_value_p_sell',
    'food_prod_value_p_sell', 'gift_value_p_sell', 'k', 'm', 'org_fert',
    'pesticides', 'seed_cost', 'seeds_value_p_sell', 'sell_value_p_sell',
    'stored_value_p_sell', 'total2_value_p_sell', 'trans_cost', 'y'
]
# Divide each row by its own `inflation` value, then by `dollars`.
data[value_cols] = data[value_cols].div(data.inflation, axis=0) / dollars

# Production variables that are trimmed; summaries are kept before and after.
prod_cols = ['y', 'k', 'A', 'm', 'l']
sumdata1 = data[prod_cols].describe()
# One value of 166000 dollars on y. Therefore, remove 1% on both tails.
data[prod_cols] = remove_outliers(data[prod_cols], lq=0.01, hq=0.99)
sumdata2 = data[prod_cols].describe()

variables = ['k', 'm', 'l', 'A', 'y', 'y_over_A']
for var in variables:
    # Add |column minimum| before taking logs. Any -inf produced by log(0)
    # is turned into NaN. Rows dropped by dropna() come back as NaN when the
    # result is aligned on assignment.
    shift = np.abs(np.min(data[var]))
    data['ln' + var] = np.log(data[var].dropna() + shift).replace(
        -np.inf, np.nan)

data.to_csv(
    'C:/Users/rodri/OneDrive/Documentos/IDEA/Phd tesi/data/agric_data10.csv',
    index=False)

コード例 #3

0

ファイルを表示

sumowncrops = ag5a[["cons_value","own_value"]].describe()/dollars
ag5a["cons_value2"] = ag5a["own_value"]/2
ag5a.drop(["own_value"], axis=1)
ag5a["total_value2"] =  ag5a.loc[:,["sells_value_2","gift_value","cons_value2","food_prod_value","animal_value", "seeds_value", "stored_value"]].sum(axis=1)
"""

# Merge datasets -------------------------------------------
# Outer joins on 'hh' keep every household that appears in at least one of
# the section frames.
agrica = pd.merge(ag2a, ag2b, on='hh', how='outer')
agrica = pd.merge(agrica, ag3a, on='hh', how='outer')
agrica = pd.merge(agrica, ag4a, on='hh', how='outer')
agrica = pd.merge(agrica, ag5a, on='hh', how='outer')

#### Trim the data: lq=0, hq=0.999
# NOTE(review): an earlier comment said "0.1% both sides", but lq=0 here.
# Assuming lq/hq are lower/upper quantile cutoffs, only the top 0.1% is
# trimmed -- confirm against remove_outliers.
# 'hh' is moved into the index first, so it is not among the columns handed
# to remove_outliers.
agrica.set_index("hh", inplace=True)

agrica = remove_outliers(agrica, lq=0, hq=0.999)

#Pass it to dollars to see if values make sense or not
summarya1 = agrica.describe() / dollars
agrica.reset_index(inplace=True)

# Free the section frames now that they are merged.
del ag2a, ag2b, ag3a, ag4a, ag5a, prices
#agrica = pd.merge(agrica, basic, on='hh', how='outer')

# Costs are stored negated so they can be summed straight into profit below.
agrica["cost_agra"] = -agrica.loc[:,
                                  ["fet_lab_c", "seeds_c", "trans_cost"]].sum(
                                      axis=1)
agrica["profit_agra"] = agrica.loc[:, [
    "total2_value_p_sell", "rent_owner", "rent_noowner", "cost_agra"
]].sum(axis=1)
# Every exact 0 in the whole frame becomes NaN, including any of the sums
# above that came out as 0.
agrica = agrica.replace(0, np.nan)

コード例 #4

0

ファイルを表示

ファイル: income_GGSB.py プロジェクト: gstockler/Uganda-2013.2014

# Collapse `other` to one row per household. reset_index() turns the group key
# back into a regular 'hh' column, so the frame does not carry 'hh' both as
# index name and as column (which makes merge(on="hh") ambiguous).
other = other.groupby(by="hh").sum().reset_index()
summaryo = other.describe() / dollars

# extra-expenditures ---------------------------------------
# NO QUESTIONARY IN EXTRA EXPENDITURES

#%% Merge datasets
# Outer joins keep households present in any of the three income sources.
income_gsec = pd.merge(lab99, bus12, on="hh", how="outer")
income_gsec = pd.merge(income_gsec, other, on="hh", how="outer")
# Drop the two wage component columns and release the intermediate objects.
del income_gsec["wage1"], income_gsec[
    "wage2"], bus12, dollars, other, lab99, summarybus, summaryo, summaryw

# `dollars` was deleted just above and is redefined here.
# NOTE(review): presumably the local-currency-per-USD rate -- confirm source.
dollars = 2586.89

income_cols = ["wage_total", "bs_profit", "other_inc"]
income_gsec[income_cols] = remove_outliers(income_gsec[income_cols],
                                           lq=0.005,
                                           hq=0.995)

# Convert all three income columns. Previously the converted "other_inc" was
# written to a column named "" and "other_inc" itself stayed unconverted, so
# the summary below mixed units.
income_gsec[income_cols] = income_gsec[income_cols] / dollars
sumlab = income_gsec[income_cols].describe()
print(sumlab.to_latex())

# Written once to the current working directory ...
income_gsec.to_csv('income_hhsec.csv')

#%% SAVING
# ... and once more after switching to the project folder.
os.chdir('/Users/gabi/Dropbox/2019.1/Development/PS1/UG_2013_14_GGSB/')

income_gsec.to_csv('income_hhsec.csv')

コード例 #5

0

ファイルを表示

# Merge datasets -------------------------------------------
# Plot-level frames are joined on (HHID, plotID); ag10 is joined on HHID only.
plot_keys = ['HHID', 'plotID']
agrica = pd.merge(ag3a, ag4a, on=plot_keys, how='outer')
agrica = pd.merge(agrica, ag5a, on=plot_keys, how='right')
agrica = pd.merge(agrica, ag10, on='HHID', how='right')
# Same index round trip as before, written as one chained expression.
agrica = agrica.set_index(plot_keys).reset_index()
# keep=False discards every row whose (HHID, plotID) pair occurs more than once.
agrica = agrica.drop_duplicates(subset=plot_keys, keep=False)

# Release the intermediates, in the same order as the original single `del`.
del ag3a, ag4a, ag5a, ag5acrop
del conversion_kg, count_bigger, count_equal, count_smaller
del crop_count, crop_sum
del p, prices, prices_usd, priceslist
del q, quant, values_ag5a

# Columns that are summarised, trimmed, and summarised again.
trim_cols = [
    'org_fert', 'chem_fert', 'pesticides', 'hhlabor', 'hired_labor',
    'area_planted', 'seed_cost', 'trans_cost', 'total_value_p_sell',
    'total2_value_p_sell', 'farm_capital',
]
sumagrica = agrica[trim_cols].describe()
#crop in production and planting coincide so we can eliminate one of them (in importing 2agsec4 do not import crop)

agrica[trim_cols] = remove_outliers(agrica[trim_cols], lq=0, hq=0.99)
sumagrica2 = agrica[trim_cols].describe()





#%% computing productivity levels
agrica['season'] = 1
agrica['k'] = agrica['farm_capital']
# Missing input components count as 0 so one gap does not blank the total.
agrica['m'] = (
    agrica['org_fert'].fillna(0)
    + agrica['chem_fert'].fillna(0)
    + agrica['pesticides'].fillna(0)
    + agrica['seed_cost'].fillna(0)
)
agrica['l'] = (
    agrica['hhlabor'].fillna(0)
    + agrica['hired_labor'].fillna(0)
)
agrica['A'] = agrica['area_planted']
agrica['y'] = agrica['total2_value_p_sell']

agrica['y_over_A'] = agrica['y'] / agrica['A']

コード例 #6

0

ファイルを表示

ファイル: consumption_GGSB.py プロジェクト: gstockler/Uganda-2013.2014

data["ctotal_dur_own"] = data.loc[:, ["ctotal_own", "cdur_own"]].sum(axis=1)

# Consumption aggregates that are trimmed and converted. The list is defined
# once instead of being repeated in every statement.
consumption_cols = [
    "ctotal", "ctotal_dur", "ctotal_gift", "ctotal_dur_gift", "ctotal_nogift",
    "ctotal_dur_nogift", "ctotal_own", "ctotal_dur_own", "cfood", "cnodur",
    "cdur"
]

# Explicit copy: the assignments below must write into an independent frame,
# not into a column slice of `data` (avoids SettingWithCopyWarning and any
# doubt about whether `data` is modified).
cdata_short = data[["hh"] + consumption_cols].copy()

cdata_short[consumption_cols] = remove_outliers(cdata_short[consumption_cols],
                                                lq=0.005,
                                                hq=0.995)

# Divide the trimmed aggregates by `dollars`; 'hh' is left untouched.
cdata_short[consumption_cols] = cdata_short[consumption_cols] / dollars
sumc = cdata_short.describe()

コード例 #7

0

ファイルを表示

# Farm wealth: household id plus farm assets only.
w_farm = ii[["hh", "farm_asset"]]

del ii, ag10

#%% LIVESTOCK ASSETS ???

#%% HH ASSETS
c14 = pd.read_stata('GSEC14A.dta')[["HHID", "h14q2", "h14q5"]]
# Sum h14q5 per household and expose it as 'hh_asset'.
hh_totals = c14.groupby(by="HHID")[["h14q5"]].sum().fillna(0)
hh_totals.columns = ["hh_asset"]
# The group key lives in the index; copy it out as the 'hh' merge column.
hh_totals["hh"] = np.array(hh_totals.index.values)
c14 = hh_totals
del hh_totals

#%% MERGING DATA
# Inner join: only households present in both sources are kept.
wealth = pd.merge(w_farm, c14, on="hh", how="inner")
wealth = wealth[["hh", "farm_asset", "hh_asset"]]
wealth["wtotal"] = wealth[["farm_asset", "hh_asset"]].sum(axis=1)

asset_cols = ["farm_asset", "hh_asset", "wtotal"]
wealth[asset_cols] = remove_outliers(wealth[asset_cols], lq=0.001, hq=0.999)
wealth[asset_cols] = wealth[asset_cols] / dollars
# First copy goes to the current working directory ...
wealth.to_csv("wealth.csv")

#%% SAVING
# ... second copy goes to the project folder.
os.chdir('/Users/gabi/Dropbox/2019.1/Development/PS1/UG_2013_14_GGSB/')
wealth.to_csv("wealth.csv")

コード例 #8

0

ファイルを表示

]] = agrica[[
    'org_fert', 'chem_fert', 'seed_cost', 'trans_cost', 'pesticides',
    'total2_value_p_sell', 'sell_value_p_sell', 'cons_value_p_sell',
    'gift_value_p_sell', 'food_prod_value_p_sell', 'animal_value_p_sell',
    'seeds_value_p_sell', 'stored_value_p_sell', 'farm_capital'
]] / dollars

# Remove outliers: lq=0, hq=0.975
# NOTE(review): an earlier comment said "top 0.5%". Assuming hq is the upper
# quantile cutoff, 0.975 trims the top 2.5% and lq=0 leaves the lower tail
# untouched -- confirm which threshold is intended.
agrica[[
    'org_fert', 'chem_fert', 'seed_cost', 'trans_cost', 'pesticides',
    'total2_value_p_sell', 'farm_capital', 'area_planted', 'hhlabor',
    'hired_labor'
]] = remove_outliers(agrica[[
    'org_fert', 'chem_fert', 'seed_cost', 'trans_cost', 'pesticides',
    'total2_value_p_sell', 'farm_capital', 'area_planted', 'hhlabor',
    'hired_labor'
]],
                     lq=0,
                     hq=0.975)

# computing productivity levels
agrica['season'] = 1
# k: capital, taken directly from farm_capital.
agrica['k'] = agrica['farm_capital']
# m: sum of the four input-cost columns; a missing component counts as 0.
agrica['m'] = agrica['org_fert'].fillna(0) + agrica['chem_fert'].fillna(
    0) + agrica['pesticides'].fillna(0) + agrica['seed_cost'].fillna(0)
# l: household plus hired labor; a missing component counts as 0.
agrica['l'] = agrica['hhlabor'].fillna(0) + agrica['hired_labor'].fillna(0)
# A: planted area.
agrica['A'] = agrica['area_planted']
# y: output value net of transport cost. There is no fillna here, so a missing
# value in either column leaves y missing.
agrica['y'] = agrica['total2_value_p_sell'] - agrica['trans_cost']

# Infinite ratios from the division are stored as NaN.
agrica['y_over_A'] = (agrica['y'] / agrica['A']).replace([-np.inf, np.inf],
                                                         np.nan)

コード例 #9

0

ファイルを表示

sumowncrops = ag5a[["cons_value","own_value"]].describe()/dollars
ag5a["cons_value2"] = ag5a["own_value"]/2
ag5a.drop(["own_value"], axis=1)
ag5a["total_value2"] =  ag5a.loc[:,["sells_value_2","gift_value","cons_value2","food_prod_value","animal_value", "seeds_value", "stored_value"]].sum(axis=1)
"""

# Merge datasets -------------------------------------------
# Outer joins on 'hh' keep every household that appears in at least one of
# the section frames.
agrica = pd.merge(ag2a, ag2b, on='hh', how='outer')
agrica = pd.merge(agrica, ag3a, on='hh', how='outer')
agrica = pd.merge(agrica, ag4a, on='hh', how='outer')
agrica = pd.merge(agrica, ag5a, on='hh', how='outer')

#### Trim the data: lq=0.005, hq=0.995
# NOTE(review): an earlier comment said "0.1% both sides". Assuming lq/hq are
# lower/upper quantile cutoffs, these values trim 0.5% on each side --
# confirm against remove_outliers.
# 'hh' is moved into the index first, so it is not among the columns handed
# to remove_outliers.
agrica.set_index("hh", inplace=True)

agrica = remove_outliers(agrica, lq=0.005, hq=0.995)

#Pass it to dollars to see if values make sense or not
summarya1 = agrica.describe() / dollars
agrica.reset_index(inplace=True)

# Free the section frames now that they are merged.
del ag2a, ag2b, ag3a, ag4a, ag5a, prices
#agrica = pd.merge(agrica, basic, on='hh', how='outer')

# Costs are stored negated so they can be summed straight into profit below.
agrica["cost_agra"] = -agrica.loc[:,
                                  ["fet_lab_c", "seeds_c", "trans_cost"]].sum(
                                      axis=1)
agrica["profit_agra"] = agrica.loc[:, [
    "total2_value_p_sell", "rent_owner", "rent_noowner", "cost_agra"
]].sum(axis=1)
# Every exact 0 in the whole frame becomes NaN, including any of the sums
# above that came out as 0.
agrica = agrica.replace(0, np.nan)

コード例 #10

0

ファイルを表示




# Merge datasets -------------------------------------------
# Outer joins on 'hh' keep every household that appears in at least one of
# the section frames.
agrica = pd.merge(ag2a, ag2b, on='hh', how='outer')
agrica = pd.merge(agrica, ag3a, on='hh', how='outer')
agrica = pd.merge(agrica, ag4a, on='hh', how='outer')
agrica = pd.merge(agrica, ag5a, on='hh', how='outer')



#### Trim the data: lq=0.01, hq=0.999
# NOTE(review): an earlier comment said "0.1% both sides". Assuming lq/hq are
# lower/upper quantile cutoffs, these values trim 1% at the bottom and 0.1%
# at the top -- confirm against remove_outliers.
# 'hh' is moved into the index first, so it is not among the columns handed
# to remove_outliers.
agrica.set_index("hh", inplace=True)

agrica = remove_outliers(agrica, lq=0.01, hq=0.999)

#Pass it to dollars to see if values make sense or not
summarya1 = agrica.describe()/dollars
agrica.reset_index(inplace=True)
  

# Free the section frames now that they are merged.
del ag2a, ag2b, ag3a, ag4a, ag5a, prices
#agrica = pd.merge(agrica, basic, on='hh', how='outer')


# Costs are stored negated so they can be summed straight into profit below.
agrica["cost_agra"] = -agrica.loc[:,["fet_lab_c","seeds_c","trans_cost"]].sum(axis=1)
agrica["profit_agra"] = agrica.loc[:,["total2_value_p_sell","rent_owner","rent_noowner","cost_agra"]].sum(axis=1)
# Every exact 0 in the whole frame becomes NaN, including any of the sums
# above that came out as 0.
agrica= agrica.replace(0, np.nan)
# Household id with agricultural profit only.
agA = agrica[["hh", "profit_agra"]]