def _train_and_predict_r_stage1(X, y, w, fit_mask, pred_mask,
                                n_layers_out: int = DEFAULT_LAYERS_OUT,
                                n_units_out: int = DEFAULT_UNITS_OUT,
                                n_layers_r: int = DEFAULT_LAYERS_R,
                                n_units_r: int = DEFAULT_UNITS_R,
                                penalty_l2: float = DEFAULT_PENALTY_L2,
                                step_size: float = DEFAULT_STEP_SIZE,
                                n_iter: int = DEFAULT_N_ITER,
                                batch_size: int = DEFAULT_BATCH_SIZE,
                                val_split_prop: float = DEFAULT_VAL_SPLIT,
                                early_stopping: bool = True,
                                patience: int = DEFAULT_PATIENCE,
                                n_iter_min: int = DEFAULT_N_ITER_MIN,
                                verbose: int = 1,
                                n_iter_print: int = DEFAULT_N_ITER_PRINT,
                                seed: int = DEFAULT_SEED,
                                nonlin: str = DEFAULT_NONLIN):
    """First R-learner stage: fit nuisance models on the fit-set.

    Trains an outcome net (on y) and a propensity net (on w, binary) using
    the rows selected by ``fit_mask`` and evaluates both on the rows
    selected by ``pred_mask``.

    Returns
    -------
    (mu_hat, pi_hat): outcome and propensity predictions on X[pred_mask].
    """
    # flatten the treatment indicator so boolean indexing yields 1-D labels
    if len(w.shape) > 1:
        w = w.reshape((len(w), ))

    # carve out fit and prediction subsets
    X_fit, y_fit, w_fit = X[fit_mask, :], y[fit_mask], w[fit_mask]
    X_pred = X[pred_mask, :]

    # hyper-parameters shared by both nuisance nets
    shared = dict(n_layers_out=n_layers_out, n_units_out=n_units_out,
                  n_layers_r=n_layers_r, n_units_r=n_units_r,
                  penalty_l2=penalty_l2, step_size=step_size,
                  n_iter=n_iter, batch_size=batch_size,
                  val_split_prop=val_split_prop,
                  early_stopping=early_stopping, patience=patience,
                  n_iter_min=n_iter_min, n_iter_print=n_iter_print,
                  verbose=verbose, seed=seed, nonlin=nonlin)

    if verbose > 0:
        print('Training output Net')
    params_out, predict_fun_out = train_output_net_only(X_fit, y_fit,
                                                        **shared)
    mu_hat = predict_fun_out(params_out, X_pred)

    if verbose > 0:
        print('Training propensity net')
    params_prop, predict_fun_prop = train_output_net_only(
        X_fit, w_fit, binary_y=True, **shared)
    pi_hat = predict_fun_prop(params_prop, X_pred)

    return mu_hat, pi_hat
def train_tnet(X, y, w, binary_y: bool = False,
               n_layers_out: int = DEFAULT_LAYERS_OUT,
               n_units_out: int = DEFAULT_UNITS_OUT,
               n_layers_r: int = DEFAULT_LAYERS_R,
               n_units_r: int = DEFAULT_UNITS_R,
               penalty_l2: float = DEFAULT_PENALTY_L2,
               step_size: float = DEFAULT_STEP_SIZE,
               n_iter: int = DEFAULT_N_ITER,
               batch_size: int = DEFAULT_BATCH_SIZE,
               val_split_prop: float = DEFAULT_VAL_SPLIT,
               early_stopping: bool = True,
               patience: int = DEFAULT_PATIENCE,
               n_iter_min: int = DEFAULT_N_ITER_MIN,
               verbose: int = 1,
               n_iter_print: int = DEFAULT_N_ITER_PRINT,
               seed: int = DEFAULT_SEED,
               return_val_loss: bool = False,
               train_separate: bool = True,
               penalty_diff: float = DEFAULT_PENALTY_L2,
               nonlin: str = DEFAULT_NONLIN,
               avg_objective: bool = DEFAULT_AVG_OBJECTIVE):
    """Train a T-learner: one potential-outcome net per treatment arm.

    With ``train_separate=True`` the two heads are fitted completely
    independently on the treated/control subsamples; otherwise they are
    trained jointly with a similarity regularizer (``penalty_diff``).

    Returns ((params_0, params_1), (predict_fun_0, predict_fun_1)), plus the
    summed validation loss when ``return_val_loss`` is set in the separate
    training mode.
    """
    # treatment indicator must be 1-D so it can serve as a boolean index
    if len(w.shape) > 1:
        w = w.reshape((len(w), ))

    if train_separate:
        # hyper-parameters shared by both independently trained heads
        head_kwargs = dict(binary_y=binary_y, n_layers_out=n_layers_out,
                           n_units_out=n_units_out, n_layers_r=n_layers_r,
                           n_units_r=n_units_r, penalty_l2=penalty_l2,
                           step_size=step_size, n_iter=n_iter,
                           batch_size=batch_size,
                           val_split_prop=val_split_prop,
                           early_stopping=early_stopping, patience=patience,
                           n_iter_min=n_iter_min, n_iter_print=n_iter_print,
                           verbose=verbose, seed=seed,
                           return_val_loss=return_val_loss, nonlin=nonlin,
                           avg_objective=avg_objective)

        if verbose > 0:
            print('Training PO_0 Net')
        out_0 = train_output_net_only(X[w == 0], y[w == 0], **head_kwargs)

        if verbose > 0:
            print('Training PO_1 Net')
        out_1 = train_output_net_only(X[w == 1], y[w == 1], **head_kwargs)

        if return_val_loss:
            params_0, predict_fun_0, loss_0 = out_0
            params_1, predict_fun_1, loss_1 = out_1
            return (params_0, params_1), (predict_fun_0, predict_fun_1), \
                loss_1 + loss_0

        params_0, predict_fun_0 = out_0
        params_1, predict_fun_1 = out_1
    else:
        # joint training: both heads share a loss with a difference penalty
        params, predict_fun = _train_tnet_jointly(
            X, y, w, binary_y=binary_y, n_layers_out=n_layers_out,
            n_units_out=n_units_out, n_layers_r=n_layers_r,
            n_units_r=n_units_r, penalty_l2=penalty_l2, step_size=step_size,
            n_iter=n_iter, batch_size=batch_size,
            val_split_prop=val_split_prop, early_stopping=early_stopping,
            patience=patience, n_iter_min=n_iter_min,
            n_iter_print=n_iter_print, verbose=verbose, seed=seed,
            return_val_loss=return_val_loss, penalty_diff=penalty_diff,
            nonlin=nonlin)
        params_0, params_1 = params[0], params[1]
        # a single predict function serves both heads
        predict_fun_0, predict_fun_1 = predict_fun, predict_fun

    return (params_0, params_1), (predict_fun_0, predict_fun_1)
def train_r_net(X, y, w, p=None,
                second_stage_strategy: str = R_STRATEGY_NAME,
                data_split: bool = False, cross_fit: bool = False,
                n_cf_folds: int = DEFAULT_CF_FOLDS,
                n_layers_out: int = DEFAULT_LAYERS_OUT,
                n_layers_r: int = DEFAULT_LAYERS_R,
                n_layers_r_t: int = DEFAULT_LAYERS_R_T,
                n_layers_out_t: int = DEFAULT_LAYERS_OUT_T,
                n_units_out: int = DEFAULT_UNITS_OUT,
                n_units_r: int = DEFAULT_UNITS_R,
                n_units_out_t: int = DEFAULT_UNITS_OUT_T,
                n_units_r_t: int = DEFAULT_UNITS_R_T,
                penalty_l2: float = DEFAULT_PENALTY_L2,
                penalty_l2_t: float = DEFAULT_PENALTY_L2,
                step_size: float = DEFAULT_STEP_SIZE,
                step_size_t: float = DEFAULT_STEP_SIZE_T,
                n_iter: int = DEFAULT_N_ITER,
                batch_size: int = DEFAULT_BATCH_SIZE,
                val_split_prop: float = DEFAULT_VAL_SPLIT,
                early_stopping: bool = True,
                patience: int = DEFAULT_PATIENCE,
                n_iter_min: int = DEFAULT_N_ITER_MIN,
                verbose: int = 1,
                n_iter_print: int = DEFAULT_N_ITER_PRINT,
                seed: int = DEFAULT_SEED,
                return_val_loss: bool = False,
                nonlin: str = DEFAULT_NONLIN):
    """Train an R-learner.

    First stage fits nuisance models mu(x)=E[Y|X] and pi(x)=P(W=1|X)
    (optionally with data splitting or cross-fitting); second stage
    regresses the orthogonalized outcome on the orthogonalized treatment
    using strategy 'R' (train_r_stage2) or 'U' (plain regression on the
    ratio of residuals).

    ``p`` may supply a known propensity score that overrides pi-hat.
    """
    # get shape of data
    n, d = X.shape

    if p is not None:
        p = check_shape_1d_data(p)

    # hyper-parameters shared by all first-stage fits
    stage1_kwargs = dict(n_layers_out=n_layers_out, n_layers_r=n_layers_r,
                         n_units_out=n_units_out, n_units_r=n_units_r,
                         penalty_l2=penalty_l2, step_size=step_size,
                         n_iter=n_iter, batch_size=batch_size,
                         val_split_prop=val_split_prop,
                         early_stopping=early_stopping, patience=patience,
                         n_iter_min=n_iter_min, verbose=verbose,
                         n_iter_print=n_iter_print, seed=seed, nonlin=nonlin)

    # split data as wanted
    if not cross_fit:
        if not data_split:
            if verbose > 0:
                print('Training first stage with all data (no data splitting)')
            # use all data for both
            fit_mask = onp.ones(n, dtype=bool)
            pred_mask = onp.ones(n, dtype=bool)
        else:
            if verbose > 0:
                print(
                    'Training first stage with half of the data (data splitting)'
                )
            # split data in half; BUG FIX: sample WITHOUT replacement so the
            # fit half really contains n/2 distinct rows (with replacement,
            # duplicates shrink the fit set and inflate the pred set)
            fit_idx = onp.random.choice(n, int(onp.round(n / 2)),
                                        replace=False)
            fit_mask = onp.zeros(n, dtype=bool)
            fit_mask[fit_idx] = 1
            pred_mask = ~fit_mask

        mu_hat, pi_hat = _train_and_predict_r_stage1(
            X, y, w, fit_mask, pred_mask, **stage1_kwargs)

        if data_split:
            # keep only prediction data; BUG FIX: index with a plain 1-D
            # boolean mask — `y[pred_mask, :]` raises when y/w are 1-D
            # (w is never reshaped in this function)
            X, y, w = X[pred_mask, :], y[pred_mask], w[pred_mask]
            if p is not None:
                p = p[pred_mask]
    else:
        if verbose > 0:
            print('Training first stage in {} folds (cross-fitting)'.format(
                n_cf_folds))
        # do cross fitting: each fold's predictions come from a model that
        # never saw that fold
        mu_hat, pi_hat = onp.zeros((n, 1)), onp.zeros((n, 1))
        splitter = StratifiedKFold(n_splits=n_cf_folds, shuffle=True,
                                   random_state=seed)
        fold_count = 1
        for train_idx, test_idx in splitter.split(X, w):
            if verbose > 0:
                print('Training fold {}.'.format(fold_count))
            fold_count = fold_count + 1

            pred_mask = onp.zeros(n, dtype=bool)
            pred_mask[test_idx] = 1
            fit_mask = ~pred_mask

            mu_hat[pred_mask], pi_hat[pred_mask] = \
                _train_and_predict_r_stage1(X, y, w, fit_mask, pred_mask,
                                            **stage1_kwargs)

    if verbose > 0:
        print('Training second stage.')

    if p is not None:
        # use known propensity score instead of the estimated one
        p = check_shape_1d_data(p)
        pi_hat = p

    # orthogonalize outcome and treatment (Robinson decomposition)
    y, w = check_shape_1d_data(y), check_shape_1d_data(w)
    w_ortho = w - pi_hat
    y_ortho = y - mu_hat

    # hyper-parameters for the second-stage net
    stage2_kwargs = dict(n_layers_out=n_layers_out_t,
                         n_units_out=n_units_out_t,
                         n_layers_r=n_layers_r_t, n_units_r=n_units_r_t,
                         penalty_l2=penalty_l2_t, step_size=step_size_t,
                         n_iter=n_iter, batch_size=batch_size,
                         val_split_prop=val_split_prop,
                         early_stopping=early_stopping, patience=patience,
                         n_iter_min=n_iter_min, verbose=verbose,
                         n_iter_print=n_iter_print, seed=seed,
                         return_val_loss=return_val_loss, nonlin=nonlin)

    if second_stage_strategy == R_STRATEGY_NAME:
        # weighted regression of residual ratio (R-loss)
        return train_r_stage2(X, y_ortho, w_ortho, **stage2_kwargs)
    elif second_stage_strategy == U_STRATEGY_NAME:
        # U-learner: plain regression on residual ratio
        return train_output_net_only(X, y_ortho / w_ortho, **stage2_kwargs)
    else:
        raise ValueError('R-learner only supports strategies R and U.')
def _train_and_predict_first_stage_s1(X, y, w, fit_mask, pred_mask,
                                      binary_y: bool = False,
                                      n_layers_out: int = DEFAULT_LAYERS_OUT,
                                      n_layers_r: int = DEFAULT_LAYERS_R,
                                      n_units_out: int = DEFAULT_UNITS_OUT,
                                      n_units_r: int = DEFAULT_UNITS_R,
                                      penalty_l2: float = DEFAULT_PENALTY_L2,
                                      step_size: float = DEFAULT_STEP_SIZE,
                                      n_iter: int = DEFAULT_N_ITER,
                                      batch_size: int = DEFAULT_BATCH_SIZE,
                                      val_split_prop: float = DEFAULT_VAL_SPLIT,
                                      early_stopping: bool = True,
                                      patience: int = DEFAULT_PATIENCE,
                                      n_iter_min: int = DEFAULT_N_ITER_MIN,
                                      verbose: int = 1,
                                      n_iter_print: int = DEFAULT_N_ITER_PRINT,
                                      seed: int = DEFAULT_SEED,
                                      nonlin: str = DEFAULT_NONLIN,
                                      avg_objective: bool = False,
                                      transformation: str = AIPW_TRANSFORMATION):
    """Train and predict first stage estimators using SNet1/TARNet.

    Fits a shared-representation outcome net (SNet1, no discrepancy
    penalty) on the fit-set and — unless the regression-adjustment (RA)
    transformation is requested, which needs no propensity — a propensity
    net as well.

    Returns (mu_0_hat, mu_1_hat, pi_hat) on X[pred_mask]; pi_hat is NaN
    when no propensity model is trained.
    """
    # split the data
    X_fit, y_fit, w_fit = X[fit_mask, :], y[fit_mask], w[fit_mask]
    X_pred = X[pred_mask, :]

    if verbose > 0:
        print('Training SNet1')
    params_cfr, predict_funs_cfr = train_snet1(
        X_fit, y_fit, w_fit, binary_y=binary_y, n_layers_r=n_layers_r,
        n_units_r=n_units_r, n_layers_out=n_layers_out,
        n_units_out=n_units_out, penalty_l2=penalty_l2, penalty_disc=0,
        step_size=step_size, n_iter=n_iter, batch_size=batch_size,
        val_split_prop=val_split_prop, early_stopping=early_stopping,
        patience=patience, n_iter_min=n_iter_min, verbose=verbose,
        n_iter_print=n_iter_print, seed=seed, nonlin=nonlin,
        avg_objective=avg_objective)
    _, mu_0_hat, mu_1_hat = predict_snet1(X_pred, params_cfr,
                                          predict_funs_cfr, return_po=True)

    # BUG FIX: compare strings by value ('!='), not identity ('is not') —
    # identity only works by the accident of CPython string interning
    if transformation != RA_TRANSFORMATION:
        if verbose > 0:
            print('Training propensity net')
        params_prop, predict_fun_prop = train_output_net_only(
            X_fit, w_fit, binary_y=True, n_layers_out=n_layers_out,
            n_units_out=n_units_out, n_layers_r=n_layers_r,
            n_units_r=n_units_r, penalty_l2=penalty_l2, step_size=step_size,
            n_iter=n_iter, batch_size=batch_size,
            val_split_prop=val_split_prop, early_stopping=early_stopping,
            patience=patience, n_iter_min=n_iter_min,
            n_iter_print=n_iter_print, verbose=verbose, seed=seed,
            nonlin=nonlin, avg_objective=avg_objective)
        pi_hat = predict_fun_prop(params_prop, X_pred)
    else:
        pi_hat = onp.nan

    return mu_0_hat, mu_1_hat, pi_hat
def _train_and_predict_first_stage_t(X, y, w, fit_mask, pred_mask,
                                     binary_y: bool = False,
                                     n_layers_out: int = DEFAULT_LAYERS_OUT,
                                     n_units_out: int = DEFAULT_UNITS_OUT,
                                     n_layers_r: int = DEFAULT_LAYERS_R,
                                     n_units_r: int = DEFAULT_UNITS_R,
                                     penalty_l2: float = DEFAULT_PENALTY_L2,
                                     step_size: float = DEFAULT_STEP_SIZE,
                                     n_iter: int = DEFAULT_N_ITER,
                                     batch_size: int = DEFAULT_BATCH_SIZE,
                                     val_split_prop: float = DEFAULT_VAL_SPLIT,
                                     early_stopping: bool = True,
                                     patience: int = DEFAULT_PATIENCE,
                                     n_iter_min: int = DEFAULT_N_ITER_MIN,
                                     verbose: int = 1,
                                     n_iter_print: int = DEFAULT_N_ITER_PRINT,
                                     seed: int = DEFAULT_SEED,
                                     nonlin: str = DEFAULT_NONLIN,
                                     avg_objective: bool = False,
                                     transformation: str = AIPW_TRANSFORMATION):
    """Train and predict first stage estimators using TNet.

    Fits two separate potential-outcome nets (skipped for the
    Horvitz-Thompson transformation, which uses none) and a propensity net
    (skipped for the regression-adjustment transformation, which uses
    none).

    Returns (mu_0, mu_1, pi_hat) on X[pred_mask]; unneeded estimates are
    NaN.
    """
    if len(w.shape) > 1:
        w = w.reshape((len(w),))

    # split the data
    X_fit, y_fit, w_fit = X[fit_mask, :], y[fit_mask], w[fit_mask]
    X_pred = X[pred_mask, :]

    # hyper-parameters shared by all three nets
    net_kwargs = dict(n_layers_out=n_layers_out, n_units_out=n_units_out,
                      n_layers_r=n_layers_r, n_units_r=n_units_r,
                      penalty_l2=penalty_l2, step_size=step_size,
                      n_iter=n_iter, batch_size=batch_size,
                      val_split_prop=val_split_prop,
                      early_stopping=early_stopping, patience=patience,
                      n_iter_min=n_iter_min, n_iter_print=n_iter_print,
                      verbose=verbose, seed=seed, nonlin=nonlin,
                      avg_objective=avg_objective)

    # BUG FIX: compare strings by value ('!='), not identity ('is not')
    if transformation != HT_TRANSFORMATION:
        if verbose > 0:
            print('Training PO_0 Net')
        params_0, predict_fun_0 = train_output_net_only(
            X_fit[w_fit == 0], y_fit[w_fit == 0], binary_y=binary_y,
            **net_kwargs)
        mu_0 = predict_fun_0(params_0, X_pred)

        if verbose > 0:
            print('Training PO_1 Net')
        params_1, predict_fun_1 = train_output_net_only(
            X_fit[w_fit == 1], y_fit[w_fit == 1], binary_y=binary_y,
            **net_kwargs)
        mu_1 = predict_fun_1(params_1, X_pred)
    else:
        # Horvitz-Thompson transformation uses no outcome regressions
        mu_0, mu_1 = onp.nan, onp.nan

    if transformation != RA_TRANSFORMATION:
        if verbose > 0:
            print('Training propensity net')
        params_prop, predict_fun_prop = train_output_net_only(
            X_fit, w_fit, binary_y=True, **net_kwargs)
        pi_hat = predict_fun_prop(params_prop, X_pred)
    else:
        # regression-adjustment transformation uses no propensity score
        pi_hat = onp.nan

    return mu_0, mu_1, pi_hat
def train_twostep_net(X, y, w, p=None,
                      first_stage_strategy: str = T_STRATEGY,
                      data_split: bool = False, cross_fit: bool = False,
                      n_cf_folds: int = DEFAULT_CF_FOLDS,
                      transformation: str = AIPW_TRANSFORMATION,
                      binary_y: bool = False,
                      n_layers_out: int = DEFAULT_LAYERS_OUT,
                      n_layers_r: int = DEFAULT_LAYERS_R,
                      n_layers_r_t: int = DEFAULT_LAYERS_R_T,
                      n_layers_out_t: int = DEFAULT_LAYERS_OUT_T,
                      n_units_out: int = DEFAULT_UNITS_OUT,
                      n_units_r: int = DEFAULT_UNITS_R,
                      n_units_out_t: int = DEFAULT_UNITS_OUT_T,
                      n_units_r_t: int = DEFAULT_UNITS_R_T,
                      penalty_l2: float = DEFAULT_PENALTY_L2,
                      penalty_l2_t: float = DEFAULT_PENALTY_L2,
                      step_size: float = DEFAULT_STEP_SIZE,
                      step_size_t: float = DEFAULT_STEP_SIZE_T,
                      n_iter: int = DEFAULT_N_ITER,
                      batch_size: int = DEFAULT_BATCH_SIZE,
                      val_split_prop: float = DEFAULT_VAL_SPLIT,
                      early_stopping: bool = True,
                      patience: int = DEFAULT_PATIENCE,
                      n_iter_min: int = DEFAULT_N_ITER_MIN,
                      verbose: int = 1,
                      n_iter_print: int = DEFAULT_N_ITER_PRINT,
                      seed: int = DEFAULT_SEED,
                      rescale_transformation: bool = False,
                      return_val_loss: bool = False,
                      penalty_orthogonal: float = DEFAULT_PENALTY_ORTHOGONAL,
                      n_units_r_small: int = DEFAULT_UNITS_R_SMALL_S,
                      nonlin: str = DEFAULT_NONLIN,
                      avg_objective: bool = DEFAULT_AVG_OBJECTIVE):
    """Train a two-step pseudo-outcome CATE learner.

    First stage fits nuisance models (strategy given by
    ``first_stage_strategy``); the chosen ``transformation`` (e.g. AIPW,
    HT, RA) turns them into a pseudo-outcome that the second-stage net
    regresses on X. A known propensity score ``p`` overrides the estimate.

    Returns (params, predict_funs) from the second-stage net, plus the
    scale factor when ``rescale_transformation`` is set.
    """
    # get shape of data
    n, d = X.shape

    if p is not None:
        p = check_shape_1d_data(p)

    # get transformation function
    transformation_function = _get_transformation_function(transformation)

    # get strategy name
    if first_stage_strategy not in ALL_STRATEGIES:
        raise ValueError('Parameter first stage should be in '
                         'catenets.models.twostep_nets.ALL_STRATEGIES. '
                         'You passed {}'.format(first_stage_strategy))

    # hyper-parameters shared by all first-stage fits
    first_stage_kwargs = dict(first_stage_strategy=first_stage_strategy,
                              binary_y=binary_y, n_layers_out=n_layers_out,
                              n_layers_r=n_layers_r, n_units_out=n_units_out,
                              n_units_r=n_units_r, penalty_l2=penalty_l2,
                              step_size=step_size, n_iter=n_iter,
                              batch_size=batch_size,
                              val_split_prop=val_split_prop,
                              early_stopping=early_stopping,
                              patience=patience, n_iter_min=n_iter_min,
                              verbose=verbose, n_iter_print=n_iter_print,
                              seed=seed,
                              penalty_orthogonal=penalty_orthogonal,
                              n_units_r_small=n_units_r_small,
                              nonlin=nonlin, avg_objective=avg_objective,
                              transformation=transformation)

    # first stage is only skipped when a known propensity score is supplied
    # AND the HT transformation (which needs no outcome regressions) is used.
    # BUG FIX: compare strings by value ('!='), not identity ('is not')
    if p is None or transformation != HT_TRANSFORMATION:
        if not cross_fit:
            if not data_split:
                if verbose > 0:
                    print('Training first stage with all data (no data splitting)')
                # use all data for both
                fit_mask = onp.ones(n, dtype=bool)
                pred_mask = onp.ones(n, dtype=bool)
            else:
                if verbose > 0:
                    print('Training first stage with half of the data (data splitting)')
                # split data in half; BUG FIX: sample WITHOUT replacement so
                # the two halves are disjoint and of the intended size
                fit_idx = onp.random.choice(n, int(onp.round(n / 2)),
                                            replace=False)
                fit_mask = onp.zeros(n, dtype=bool)
                fit_mask[fit_idx] = 1
                pred_mask = ~fit_mask

            mu_0, mu_1, pi_hat = _train_and_predict_first_stage(
                X, y, w, fit_mask, pred_mask, **first_stage_kwargs)

            if data_split:
                # keep only prediction data; BUG FIX: 1-D boolean mask —
                # `y[pred_mask, :]` raises when y/w are 1-D
                X, y, w = X[pred_mask, :], y[pred_mask], w[pred_mask]
                if p is not None:
                    p = p[pred_mask]
        else:
            if verbose > 0:
                print('Training first stage in {} folds (cross-fitting)'.format(n_cf_folds))
            # do cross fitting
            mu_0, mu_1, pi_hat = onp.zeros((n, 1)), onp.zeros((n, 1)), \
                onp.zeros((n, 1))
            splitter = StratifiedKFold(n_splits=n_cf_folds, shuffle=True,
                                       random_state=seed)
            fold_count = 1
            for train_idx, test_idx in splitter.split(X, w):
                if verbose > 0:
                    print('Training fold {}.'.format(fold_count))
                fold_count = fold_count + 1

                pred_mask = onp.zeros(n, dtype=bool)
                pred_mask[test_idx] = 1
                fit_mask = ~pred_mask

                mu_0[pred_mask], mu_1[pred_mask], pi_hat[pred_mask] = \
                    _train_and_predict_first_stage(X, y, w, fit_mask,
                                                   pred_mask,
                                                   **first_stage_kwargs)

    if verbose > 0:
        print('Training second stage.')

    if p is not None:
        # use known propensity score instead of the estimated one
        p = check_shape_1d_data(p)
        pi_hat = p

    # second stage
    y, w = check_shape_1d_data(y), check_shape_1d_data(w)

    # transform data and fit on transformed data
    # BUG FIX: '==' instead of 'is' for string comparison
    if transformation == HT_TRANSFORMATION:
        # HT transformation uses only (y, w, p)
        mu_0 = None
        mu_1 = None

    pseudo_outcome = transformation_function(y=y, w=w, p=pi_hat,
                                             mu_0=mu_0, mu_1=mu_1)

    # hyper-parameters for the second-stage net
    second_stage_kwargs = dict(n_layers_out=n_layers_out_t,
                               n_units_out=n_units_out_t,
                               n_layers_r=n_layers_r_t,
                               n_units_r=n_units_r_t,
                               penalty_l2=penalty_l2_t,
                               step_size=step_size_t, n_iter=n_iter,
                               batch_size=batch_size,
                               val_split_prop=val_split_prop,
                               early_stopping=early_stopping,
                               patience=patience, n_iter_min=n_iter_min,
                               n_iter_print=n_iter_print, verbose=verbose,
                               seed=seed, return_val_loss=return_val_loss,
                               nonlin=nonlin, avg_objective=avg_objective)

    if rescale_transformation:
        # shrink pseudo-outcomes to the scale of y (never inflate them)
        scale_factor = onp.std(y) / onp.std(pseudo_outcome)
        if scale_factor > 1:
            scale_factor = 1
        else:
            pseudo_outcome = scale_factor * pseudo_outcome
        params, predict_funs = train_output_net_only(
            X, pseudo_outcome, binary_y=False, **second_stage_kwargs)
        return params, predict_funs, scale_factor
    else:
        return train_output_net_only(X, pseudo_outcome, binary_y=False,
                                     **second_stage_kwargs)
def train_x_net(X, y, w, weight_strategy: int = None,
                binary_y: bool = False,
                n_layers_out: int = DEFAULT_LAYERS_OUT,
                n_layers_r: int = DEFAULT_LAYERS_R,
                n_layers_out_t: int = DEFAULT_LAYERS_OUT_T,
                n_layers_r_t: int = DEFAULT_LAYERS_R_T,
                n_units_out: int = DEFAULT_UNITS_OUT,
                n_units_r: int = DEFAULT_UNITS_R,
                n_units_out_t: int = DEFAULT_UNITS_OUT_T,
                n_units_r_t: int = DEFAULT_UNITS_R_T,
                penalty_l2: float = DEFAULT_PENALTY_L2,
                penalty_l2_t: float = DEFAULT_PENALTY_L2,
                step_size: float = DEFAULT_STEP_SIZE,
                step_size_t: float = DEFAULT_STEP_SIZE_T,
                n_iter: int = DEFAULT_N_ITER,
                batch_size: int = DEFAULT_BATCH_SIZE,
                n_iter_min: int = DEFAULT_N_ITER_MIN,
                val_split_prop: float = DEFAULT_VAL_SPLIT,
                early_stopping: bool = True,
                patience: int = DEFAULT_PATIENCE,
                verbose: int = 1,
                n_iter_print: int = DEFAULT_N_ITER_PRINT,
                seed: int = DEFAULT_SEED,
                nonlin: str = DEFAULT_NONLIN,
                return_val_loss: bool = False,
                avg_objective: bool = DEFAULT_AVG_OBJECTIVE):
    """X-learner with neural-network base learners.

    ``weight_strategy`` selects g(x) in
    tau(x) = g(x) tau_0(x) + (1 - g(x)) tau_1(x) [eq 9, Kuenzel et al (2019)]:
    0 sets g(x)=0, 1 sets g(x)=1, None sets g(x)=pi(x) [propensity score],
    and -1 sets g(x)=(1-pi(x)). Components not needed by the chosen
    strategy are skipped and returned as None.
    """
    y = check_shape_1d_data(y)
    if len(w.shape) > 1:
        w = w.reshape((len(w), ))

    if weight_strategy not in [0, 1, -1, None]:
        raise ValueError(
            'XNet only implements weight_strategy in [0, 1, -1, None]')

    # boolean row masks for the two treatment arms
    control, treated = w == 0, w == 1

    # hyper-parameters for the first-stage (PO / propensity) nets
    first_kwargs = dict(n_layers_out=n_layers_out, n_units_out=n_units_out,
                        n_layers_r=n_layers_r, n_units_r=n_units_r,
                        penalty_l2=penalty_l2, step_size=step_size,
                        n_iter=n_iter, batch_size=batch_size,
                        val_split_prop=val_split_prop,
                        early_stopping=early_stopping, patience=patience,
                        n_iter_min=n_iter_min, n_iter_print=n_iter_print,
                        verbose=verbose, seed=seed, nonlin=nonlin,
                        avg_objective=avg_objective)

    # hyper-parameters for the second-stage (CATE) nets
    second_kwargs = dict(n_layers_out=n_layers_out_t,
                         n_units_out=n_units_out_t,
                         n_layers_r=n_layers_r_t, n_units_r=n_units_r_t,
                         penalty_l2=penalty_l2_t, step_size=step_size_t,
                         n_iter=n_iter, batch_size=batch_size,
                         val_split_prop=val_split_prop,
                         early_stopping=early_stopping, patience=patience,
                         n_iter_min=n_iter_min, n_iter_print=n_iter_print,
                         verbose=verbose, seed=seed,
                         return_val_loss=return_val_loss, nonlin=nonlin,
                         avg_objective=avg_objective)

    # first stage: get estimates of PO regression
    if verbose > 0:
        print("Training first stage")

    mu_hat_0 = None
    if weight_strategy != 1:
        if verbose > 0:
            print('Training PO_0 Net')
        params_0, predict_fun_0 = train_output_net_only(
            X[control], y[control], binary_y=binary_y, **first_kwargs)
        # control-outcome predictions on the treated arm
        mu_hat_0 = predict_fun_0(params_0, X[treated])

    mu_hat_1 = None
    if weight_strategy != 0:
        if verbose > 0:
            print('Training PO_1 Net')
        params_1, predict_fun_1 = train_output_net_only(
            X[treated], y[treated], binary_y=binary_y, **first_kwargs)
        # treated-outcome predictions on the control arm
        mu_hat_1 = predict_fun_1(params_1, X[control])

    params_prop, predict_fun_prop = None, None
    if weight_strategy is None or weight_strategy == -1:
        # also fit propensity estimator (needed to weight the two CATEs)
        if verbose > 0:
            print('Training propensity net')
        params_prop, predict_fun_prop = train_output_net_only(
            X, w, binary_y=True, **first_kwargs)

    # second stage
    if verbose > 0:
        print("Training second stage")

    params_tau0, predict_fun_tau0 = None, None
    if weight_strategy != 0:
        # fit tau_0 on imputed treatment effects for the control arm
        if verbose > 0:
            print("Fitting tau_0")
        pseudo_outcome0 = mu_hat_1 - y[control]
        params_tau0, predict_fun_tau0 = train_output_net_only(
            X[control], pseudo_outcome0, binary_y=False, **second_kwargs)

    params_tau1, predict_fun_tau1 = None, None
    if weight_strategy != 1:
        # fit tau_1 on imputed treatment effects for the treated arm
        if verbose > 0:
            print("Fitting tau_1")
        pseudo_outcome1 = y[treated] - mu_hat_0
        params_tau1, predict_fun_tau1 = train_output_net_only(
            X[treated], pseudo_outcome1, binary_y=False, **second_kwargs)

    params = params_tau0, params_tau1, params_prop
    predict_funs = predict_fun_tau0, predict_fun_tau1, predict_fun_prop
    return params, predict_funs