def task_infected_with_compliance_binomial_regression(depends_on, produces):
    """Binomial regressions of infection on compliance, per month and pooled.

    Writes a formatted coefficient table and an odds-ratio table
    ("odds_radio" in the project's spelling) as CSV files.
    """
    # NOTE(review): the injected ``depends_on`` is immediately replaced by a
    # hard-coded path — confirm it matches the task's declared dependency.
    depends_on = BLD / "data" / "infected_merge_data.pickle"
    merge_data = pd.read_pickle(depends_on)
    merge_data["Month"] = pd.Categorical(merge_data["Month"])

    months = ["April", "May", "September"]
    # One shared specification for the monthly and the pooled fits; the
    # commented-out terms are disabled alternatives kept for reference.
    formula = (
        "infected ~ age_cut + male"
        # " + living_alone"
        "+ living_with_children"
        " + edu + employed + income_hh_cut + working_essential_worker"
        # " + avoid_cafe + avoid_theater + avoid_public_transport + avoid_gym"
        " + compliance_index"
        " + compliance_index*male"
        " + compliance_index*edu"
        # " + compliance_index*employed"
        " + compliance_index*living_with_children"
        # " + compliance_index*living_alone"  # ???
        # " + compliance_index*working_essential_worker"
        # " + compliance_index*income_hh_cut"
        # " + compliance_index*age_cut"
    )

    fits, odds_list, labels = [], [], []
    for month in months:
        # ``@month`` inside the query string refers to this loop variable.
        month_data = merge_data.query("Month == @month")
        fit, _, odds = _infected_binomial_regression_formula(month_data, formula)
        fits.append(fit)
        odds_list.append(odds)
        labels.append(month)

    fit, _, odds = _infected_binomial_regression_formula(merge_data, formula)
    labels.append("Pooled")
    fits.append(fit)
    odds_list.append(odds)

    # Order coefficient rows by the pooled model's parameter order.
    coef_table = sm_results_format(
        fits, labels, order=fits[-1].summary2().tables[1].index.tolist())
    coef_table = rename_index(pd.DataFrame(coef_table.tables[0]))
    coef_table.to_csv(produces["regression"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    odds_table = rename_index(odds_radio_format(odds_list, labels))
    odds_table.to_csv(produces["odds_radio"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)
def task_infected_panel_regression(depends_on, produces):
    """Pooled logit of infection on compliance and monthly policy stringency.

    Specification sketch: infected(i,s,t) ~ compliance(i,s) + X_i +
    policy(s,t) + temperature + fixed effects + interactions (several terms
    currently disabled). Also writes the merged estimation data to CSV.
    """
    infected_merge_data = pd.read_pickle(depends_on["infected"])
    # infected_merge_data = infected_merge_data.drop(columns=['Month'])

    policy = pd.read_pickle(depends_on["policy"])
    policy_month = policy.groupby("month").mean()
    # Stringency index: row-wise sum of the monthly policy means.
    policy_month["policy_stringency"] = policy_month.sum(axis=1)

    temperature = pd.read_pickle(depends_on["temperature"])
    temperature = temperature.reset_index("City").drop(columns=["City", "Month"])

    merge_data = (infected_merge_data
                  .join(policy_month, on="month", how="left")
                  .join(temperature, on="month", how="left"))
    merge_data.to_csv(produces["data"])

    formula = (
        "infected ~ age_cut + living_alone + living_with_children + male"
        " + edu + employed + income_hh_cut + working_essential_worker"
        " + compliance_index"
        " + policy_stringency"
        # " + LowTemp"
        # " + compliance_index*C(Month)"
        # " + compliance_index*edu"
        # " + compliance_index*living_with_children"
    )

    # Disabled alternatives kept from the original:
    # fit, _, odds = conditional_logit_regression_formula(
    #     merge_data, formula, "Month", method="bfgs")
    # fit, _, odds = ordinal_regression_formula(merge_data, formula, "logit")
    fit, _, odds = binomial_logit_regression_formula(
        merge_data, formula, method="bfgs")

    labels = ["Pooled"]
    coef_table = rename_index(
        pd.DataFrame(sm_results_format([fit], labels).tables[0]))
    coef_table.to_csv(produces["regression"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    odds_table = rename_index(odds_radio_format([odds], labels))
    odds_table.to_csv(produces["odds_radio"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)
def _infected_with_compliance_binomial_regression_with_interaction(
        merge_data, interactions, produces):
    """For each interaction term, fit monthly + pooled models and write a CSV.

    ``interactions`` is an iterable of interaction strings (``""`` means no
    interaction); each produces one CSV under the ``produces`` directory,
    with ``*`` and ``:`` in the term replaced by ``#`` for the file name.
    """
    months = (merge_data.index.get_level_values("month")
              .drop_duplicates().sort_values().month_name().tolist())

    for interaction in interactions:
        fits, odds_list, labels = [], [], []
        for month in months:
            # ``@month`` in the query refers to this loop variable.
            month_data = merge_data.query("Month == @month")
            (fit, _,
             odds) = _infected_ordinal_regression_formula_with_interaction(
                 month_data, interaction)
            fits.append(fit)
            odds_list.append(odds)
            labels.append(month)

        (fit, _,
         odds) = _infected_ordinal_regression_formula_with_interaction(
             merge_data, interaction)
        labels.append("Pooled")
        fits.append(fit)
        odds_list.append(odds)

        formatted = sm_results_format(fits, labels)
        stem = interaction if interaction != '' else 'no_interaction'
        file_name = f"{stem}.csv".replace("*", "#").replace(":", "#")
        pd.DataFrame(formatted.tables[0]).to_csv(
            produces / file_name,
            float_format="%.3f",
            quoting=csv.QUOTE_NONNUMERIC)
def task_repetitiously_infected_regression(depends_on, produces):
    """Binomial regression of repeated infection; export coefficient/odds CSVs."""
    compliance_data = pd.read_pickle(depends_on["compliance"])
    infection_data = pd.read_pickle(depends_on["infected"])
    work_data = pd.read_pickle(depends_on["work_status"])
    background_data = pd.read_pickle(depends_on["background"])
    essential_data = pd.read_pickle(depends_on["essential_worker"])

    merged = _merge_repetitiously_infected(
        compliance_data,
        infection_data,
        work_data,
        background_data,
        essential_data,
    )
    fit, _, odds = _repetitiously_infected_binomial_regression_formula(merged)

    labels = ["Repetitiously Infected"]
    coef_table = sm_results_format([fit], labels)
    odds_table = odds_radio_format([odds], labels)

    pd.DataFrame(coef_table.tables[0]).to_csv(
        produces["regression"],
        float_format="%.3f",
        quoting=csv.QUOTE_NONNUMERIC)
    # (A LaTeX export of the summary table existed but was left disabled.)
    odds_table.to_csv(produces["odds_radio"],
                      float_format="%.3f",
                      quoting=csv.QUOTE_NONNUMERIC)
def task_infected_regression(depends_on, produces):
    """Ordinal regressions of infection, per month and pooled; write CSV tables."""
    merge_data = pd.read_pickle(depends_on)
    months = (merge_data.index.get_level_values("month")
              .drop_duplicates().sort_values().month_name().tolist())

    fits, odds_list, labels = [], [], []
    for month in months:
        # ``@month`` in the query refers to this loop variable.
        month_data = merge_data.query("Month == @month")
        fit, _, odds = _infected_ordinal_regression_formula(month_data)
        fits.append(fit)
        odds_list.append(odds)
        labels.append(month)

    fit, _, odds = _infected_ordinal_regression_formula(merge_data)
    labels.append("Pooled")
    fits.append(fit)
    odds_list.append(odds)

    coef_table = rename_index(
        pd.DataFrame(sm_results_format(fits, labels).tables[0]))
    coef_table.to_csv(produces["regression"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)

    odds_table = rename_index(odds_radio_format(odds_list, labels))
    odds_table.to_csv(produces["odds_radio"],
                      float_format="%.3f",
                      index_label="",
                      quoting=csv.QUOTE_NONNUMERIC)
def _with_accuracy_row(table, accuracies):
    """Return *table* with a formatted ``accuracy`` row appended.

    Replaces the original ``DataFrame.append`` calls, which were deprecated
    in pandas 1.4 and removed in pandas 2.0; ``pd.concat`` is the supported
    equivalent and preserves the index exactly as ``append`` did.
    """
    row = pd.DataFrame([[f"{value * 100:.2f}%" for value in accuracies]],
                       index=["accuracy"],
                       columns=table.columns)
    return pd.concat([table, row])


def task_compliance_regression(depends_on, produces):
    """Estimate compliance regressions and export coefficient/odds tables.

    Four nested specifications (A)-(D) are fitted with three estimators
    (ordered logit, ordered probit, OLS). Each estimator writes its own CSV
    (plus odds-ratio CSVs for the ordinal models), and the three coefficient
    tables are merged side by side into one CSV.
    """
    merge_data = pd.read_pickle(depends_on)

    models = ["(A)", "(B)", "(C)", "(D)"]
    formula1 = (
        "compliance_index ~ age_by100 + female + living_alone + living_with_children + "
        "edu + employed + I(net_income_hh_eqv/1000)"
        " + working_essential_worker")
    # (B) replaces the linear age term with a cubic polynomial in age.
    formula2 = ("compliance_index ~" + add_poly_formula("age_by100", 3) +
                "+ female + living_alone + living_with_children + "
                "edu + employed + I(net_income_hh_eqv/1000)"
                " + working_essential_worker")
    # (C) adds Big Five personality traits; (D) adds ideology and trust.
    formula3 = (
        formula2 +
        " + extraversion + openness + conscientiousness + agreeableness + neuroticism"
    )
    formula4 = formula3 + " + ideology" " + I(ideology ** 2)" " + trust_gov"
    formulas = [formula1, formula2, formula3, formula4]

    # --- ordered logit ---
    logit_results, _, logit_odds = map(
        list,
        zip(*[
            ordinal_regression_formula(merge_data, formula, "logit")
            for formula in formulas
        ]),
    )
    logit_odds_table = rename_index(odds_radio_format(logit_odds, models))
    logit_table = pd.DataFrame(
        sm_results_format(logit_results, models, var_order).tables[0])
    logit_table = _with_accuracy_row(
        logit_table,
        [ordinal_regression_accuracy(result) for result in logit_results])
    logit_table = rename_index(logit_table)
    logit_table.to_csv(produces["regression_ordered_logit"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)
    logit_odds_table.to_csv(produces["odds_radio_logit"],
                            float_format="%.3f",
                            index_label="",
                            quoting=csv.QUOTE_NONNUMERIC)

    # --- ordered probit ---
    probit_results, _, probit_odds = map(
        list,
        zip(*[
            ordinal_regression_formula(merge_data, formula, "probit")
            for formula in formulas
        ]),
    )
    probit_odds_table = rename_index(odds_radio_format(probit_odds, models))
    probit_table = pd.DataFrame(
        sm_results_format(probit_results, models, var_order).tables[0])
    probit_table = _with_accuracy_row(
        probit_table,
        [ordinal_regression_accuracy(result) for result in probit_results])
    probit_table = rename_index(probit_table)
    probit_table.to_csv(produces["regression_ordered_probit"],
                        float_format="%.3f",
                        index_label="",
                        quoting=csv.QUOTE_NONNUMERIC)
    probit_odds_table.to_csv(produces["odds_radio_probit"],
                             float_format="%.3f",
                             index_label="",
                             quoting=csv.QUOTE_NONNUMERIC)

    # --- OLS ---
    ols_results, _ = map(
        list,
        zip(*[ols_regression_formula(merge_data, formula)
              for formula in formulas]),
    )
    ols_table = pd.DataFrame(
        sm_results_format(ols_results, models, var_order).tables[0])
    ols_table = _with_accuracy_row(
        ols_table,
        [ols_regression_accuracy(result) for result in ols_results])
    ols_table = rename_index(ols_table)
    ols_table.to_csv(produces["regression_ols"],
                     float_format="%.3f",
                     index_label="",
                     quoting=csv.QUOTE_NONNUMERIC)

    # --- merge the three estimators side by side ---
    ols_table.columns = pd.MultiIndex.from_product(
        [["OLS"], ols_table.columns])
    logit_table.columns = pd.MultiIndex.from_product(
        [["ORDERED LOGIT"], logit_table.columns])
    probit_table.columns = pd.MultiIndex.from_product(
        [["ORDERED PROBIT"], probit_table.columns])
    ols_table = ols_table.reset_index()
    # Shift the ordinal tables down by two positions before concatenating.
    # NOTE(review): the offset of 2 is inherited from the original code —
    # confirm it matches the actual row layout of the formatted tables.
    logit_table = logit_table.reset_index().set_index(
        pd.Index(range(2, logit_table.shape[0] + 2)))
    probit_table = probit_table.reset_index().set_index(
        pd.Index(range(2, probit_table.shape[0] + 2)))
    merge_result = pd.concat([ols_table, logit_table, probit_table], axis=1)
    merge_result.to_csv(produces["merge_regression_result"],
                        float_format="%.3f",
                        index_label="",
                        index=False,
                        quoting=csv.QUOTE_NONNUMERIC)
def task_compliance_regression_did(depends_on, produces):
    """OLS compliance regressions with and without ``post`` terms (DiD).

    Writes three CSVs: the difference-in-differences estimates, the baseline
    estimates without ``post`` interactions, and both merged side by side.
    """
    merge_data = pd.read_pickle(depends_on)
    models = ["(B)", "(D)"]

    # --- specification with post-period interactions ---
    base_did = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + living_with_children"
        " + age_cut"
        " + post*male"
        # " + Month*employed"
        # " + post*income_hh_cut"
        # " + post*working_essential_worker"
        " + post*living_with_children"
        " + post*edu"
        # " + post*age_cut"
        # " + post*living_alone"
    )
    full_did = (
        base_did +
        " + extraversion + openness + conscientiousness + agreeableness + neuroticism"
        " + ideology" " + I(ideology ** 2)" " + trust_gov"
        # " + post*trust_gov"
    )
    did_fits = [ols_regression_formula(merge_data, spec)[0]
                for spec in (base_did, full_did)]
    did_table = rename_index(pd.DataFrame(
        sm_results_format(did_fits, models, order=var_order).tables[0]))
    did_table.to_csv(produces["regression_ols_did"],
                     float_format="%.3f",
                     index_label="",
                     quoting=csv.QUOTE_NONNUMERIC)

    # --- same covariates without the post terms ---
    base_plain = (
        "compliance_index ~ male +"
        "edu + employed + income_hh_cut"
        " + working_essential_worker"
        " + age_cut"
        " + living_with_children"
        # " + living_alone"
    )
    full_plain = (
        base_plain +
        " + extraversion + openness + conscientiousness + agreeableness + neuroticism"
        " + ideology" " + I(ideology ** 2)" " + trust_gov"
    )
    plain_fits = [ols_regression_formula(merge_data, spec)[0]
                  for spec in (base_plain, full_plain)]
    plain_table = rename_index(pd.DataFrame(
        sm_results_format(plain_fits, models, order=var_order).tables[0]))
    plain_table.to_csv(produces["regression_ols_did_no_post"],
                       float_format="%.3f",
                       index_label="",
                       quoting=csv.QUOTE_NONNUMERIC)

    # --- merge both model families side by side ---
    plain_table.columns = pd.MultiIndex.from_product(
        [["Basic estimation"], plain_table.columns])
    did_table.columns = pd.MultiIndex.from_product(
        [["Difference in Difference"], did_table.columns])
    merge_result = pd.concat(
        [plain_table.reset_index(), did_table.reset_index()], axis=1)
    merge_result.to_csv(
        produces["regression_ols_did_merge"],
        float_format="%.3f",
        index_label="",
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
    )