if gender == "" or genderdata[userID]["gender"] == gender: x.append(cnt) y.append(weekdata[str(day)][userID]["RW"]) cnt += 1 if str(day) in weekdata and str(day) == str(changedate): change_cnt = cnt print() print( str(change_cnt) + ": " + str(day) + " (excluded week as cut point for treatment)") cnt += 1 data = pd.DataFrame({'y': y, 'x': x}) #bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=change_cnt) #logging.info("Optimal bandwidth:" + str(bandwidth_opt)) data_rdd = rdd.truncated_data(data, 'x', bandwidth, cut=change_cnt) print() print("Number of observations per week in this model: ") print(data_rdd["x"].value_counts()) print() print() model = rdd.rdd(data_rdd, 'x', 'y', cut=change_cnt) print() print(model.fit().summary()) print() log_endtime = datetime.datetime.now() log_runtime = (log_endtime - log_starttime) logging.info("Total runtime: " + str(log_runtime))
# TEST optimal_bandwidth(): the bandwidth computed for `data` with the cut at 1
# is compared, rounded to 5 decimals, against the expected reference value.
# flag_optimal_bandwidth stays 0 on success and becomes 1 on a mismatch.
flag_optimal_bandwidth = 0
h = rdd.optimal_bandwidth(data['y'], data['x'], 1)
if np.round(h, 5) == .75117:
    print("\tNo Failures")
else:
    print("\tFAIL: value of h is wrong")
    flag_optimal_bandwidth = 1

# TEST truncated_data()
# Restrict the data to the window of width h around the cut.
data_rdd = rdd.truncated_data(data, 'x', h, cut=1)

# TEST rdd()
# Fit the discontinuity model on the truncated data and print its summary.
model = rdd.rdd(data_rdd, 'x', 'y', cut=1)
print(model.fit().summary())

# TEST bin_data()
# The last argument (100) is presumably the number of bins -- TODO confirm
# against rdd.bin_data's signature.
data_binned = rdd.bin_data(data, 'y', 'x', 100)
plt.figure()
plt.scatter(
    data_binned['x'],
    data_binned['y'],
    s=data_binned['n_obs'],  # marker size taken from the 'n_obs' column
    facecolors='none',
    edgecolors='r',
)
plt.show()
plt.close()
# Simulation sweep: for each of `num_sims` repetitions, every noise level and
# every angle, generate a dataset with `datafun`, analyse it with the BNQD
# model and with a frequentist rdd fit, and collect the outcomes.
#
# Resulting layout: results[repetition][noise index][angle index] is a dict
# with keys 'bnpqed', 'freq' and 'gp_pval'.
results = []
# The repetition index gets its own name; previously the loop variable was
# called `run` and was immediately overwritten by the list created below.
for _sim in tqdm(range(num_sims)):
    run = []  # one entry per noise level for this repetition
    for noise in noise_levels:
        run_noise = []  # one entry per angle at this noise level
        for angle in angles:
            # Third value returned by datafun is not used here.
            x, y, _ = datafun(n=n, b=b, angle=angle, noise_sd=noise)

            # BNQD analysis; the return value of train() is not needed.
            qed = BNQD.BnpQedAnalysis(x, y, kernel_dict, labelFunc, b=b, opts=opts)
            qed.train()
            results_df = qed.pretty_print(verbose=False)

            # Frequentist comparison: rdd fit on the same data with the cut
            # at b; the p-value of the 'TREATED' term is recorded.
            data = pd.DataFrame({'y': y, 'x': x})
            rddmodel = rdd.rdd(data, 'x', 'y', cut=b, verbose=False)
            fit = rddmodel.fit()
            rddpval = fit.pvalues['TREATED']

            gp_pvals = qed.get_rdd_p_values()

            simulation = {
                'bnpqed': results_df,
                'freq': rddpval,
                'gp_pval': gp_pvals,
            }
            run_noise.append(simulation)

            # Drop the reference before the next iteration builds a new
            # analysis object (presumably to release memory early).
            del qed
        run.append(run_noise)
    results.append(run)
# --- Synthetic example -------------------------------------------------------
# Treatment indicator: 1 where the running variable x is at or above the
# threshold, 0 otherwise.
treatment = np.where(x >= threshold, 1, 0)

# Two extra normally distributed columns; only w1 enters the outcome below.
w1 = np.random.normal(0, 1, N)
w2 = np.random.normal(0, 4, N)

# Outcome with a jump of .5 at the threshold.
y = .5 * treatment + 2 * x - .2 * w1 + 1 + epsilon

data = pd.DataFrame({'y': y, 'x': x, 'w1': w1, 'w2': w2})
# NOTE(review): bare expression; its result is discarded when run as a script
# (looks like a notebook leftover).
data.head()

bandwidth_opt = rdd.optimal_bandwidth(data['y'], data['x'], cut=threshold)
print("Optimal bandwidth:", bandwidth_opt)

# Keep only observations within the chosen bandwidth around the threshold.
data_rdd = rdd.truncated_data(data, 'x', bandwidth_opt, cut=threshold)

# x = running variable
# y = outcome variable
model = rdd.rdd(data_rdd, 'x', 'y', cut=threshold)
print(model.fit().summary())

# --- RD dataset: convert the Stata file to CSV -------------------------------
# NOTE(review): both paths are absolute and machine-specific.
df_RD = pd.read_stata(
    r'/Users/gopaljuneja/Desktop/Microenterprise_Kenya/113714-V1/App2017-0042_data/datasets/RD_Dataset.dta'
)
df_RD.to_csv('/Users/gopaljuneja/Desktop/Reproduced_MEK/RD_Dataset.csv')

# NOTE(review): bw100/bw150/bw200 are not referenced in the visible code.
bw100 = 100
bw150 = 150
bw200 = 200
threshold = 1

# NOTE(review): a bare `rdd.optimal_bandwidth()` call stood here. Its result
# was discarded, and it passed no arguments although the call above shows the
# function takes the outcome and running-variable series, so it could only
# raise TypeError and abort the script. It has been removed.
# TODO: if a data-driven bandwidth is wanted for df_RD, call
# rdd.optimal_bandwidth(<outcome series>, df_RD['ce_std'], cut=threshold)
# and use the returned value.

# Sub-samples with 'ce_std' inside [-1, 1] and [-1.5, 1.5] (bounds inclusive).
band100 = df_RD.loc[(df_RD['ce_std'] <= 1) & (df_RD['ce_std'] >= -1)]
band150 = df_RD.loc[(df_RD['ce_std'] <= 1.5) & (df_RD['ce_std'] >= -1.5)]
kernels = [linear_kernel, std_periodic_kernel, RBF_kernel] kernel_dict = dict(zip(kernel_names, kernels)) opts = dict() opts['num_restarts'] = 50 opts['mode'] = 'BIC' opts['verbose'] = False qed = BNQD.BnpQedAnalysis(x, y, kernel_dict, labelFunc, b=bz, opts=opts) qed.train() results_df = qed.pretty_print() gp_pvals = qed.get_rdd_p_values() rdddata = pd.DataFrame({'y':y, 'x': x}) rddmodel = rdd.rdd(rdddata, 'x', 'y', cut=bz, verbose=False) fit = rddmodel.fit() rddpval = fit.pvalues['TREATED'] es_fig, es_axes = qed.plot_effect_sizes() mf_fig, mf_axes = qed.plot_model_fits(x_test) # do for bottom row for ax in mf_axes[len(kernel_dict)-1, :]: ax.set_xticks(zscorex(np.arange(1, n, step=12))) ax.set_xticklabels(np.unique(data['year'])) ax.set_xlabel('Time') # do for left column for ax in mf_axes.flatten():