def local_stats_sqsum(args):
    """Return this site's per-feature sums of squared deviations as JSON.

    ``args`` is unpacked by ``readin`` (defined elsewhere). When
    ``cache["train_split"] > 0`` the function loads ``X_train.npy`` from the
    cache directory and sums ``(X_train - input["mean_X_train"]) ** 2`` over
    axis 0. When ``cache["use_CV"]`` is truthy it repeats the computation for
    each fold, pairing ``cache["fold_indices"]`` with
    ``input["mean_X_folds"]``. A site without training data reports zeros
    sized by ``cache["n_features"]`` (and ``cache["n_folds"]`` for CV).

    Returns:
        JSON string ``{"output": {...}}`` with ``sqsum_X_train_local``,
        ``sqsum_X_folds_local`` (CV only) and, on the owner site only,
        ``msg`` set to ``"to_agg_scale"``.
    """
    inputs, _state, cache, cache_dir, site_id, owner = readin(args)
    with_cv = cache["use_CV"]
    n_cols = cache["n_features"]
    has_train = cache["train_split"] > 0
    output = {}

    if not has_train:
        # No local training rows: contribute neutral (all-zero) sums.
        output["sqsum_X_train_local"] = np.zeros(n_cols).tolist()
        if with_cv:
            output["sqsum_X_folds_local"] = np.zeros(
                (cache["n_folds"], n_cols)).tolist()
    else:
        with open(os.path.join(cache_dir, "X_train.npy"), "rb") as fp:
            X_train = np.load(fp)

        squared_dev = np.square(X_train - inputs["mean_X_train"])
        output["sqsum_X_train_local"] = np.sum(squared_dev, axis=0).tolist()

        if with_cv:
            # One entry per fold, each centred on that fold's own mean.
            output["sqsum_X_folds_local"] = [
                np.sum(np.square(X_train[rows] - fold_mean), axis=0).tolist()
                for rows, fold_mean in zip(cache["fold_indices"],
                                           inputs["mean_X_folds"])
            ]

    if site_id == owner:
        output["msg"] = "to_agg_scale"

    return json.dumps({"output": output})
def local_stats_sum(args):
    """Split the local data, record run settings in the cache and emit stats.

    Steps, in order:

    1. ``readin(args)`` unpacks the call arguments and ``parse_csv`` returns
       ``(X, y, name_features)``.
    2. ``split_save_train_test`` produces the train/test split; when
       ``input["use_CV"]`` is truthy, ``split_folds_save_valid`` produces
       ``fold_indices`` and ``valid_indices``.  Both helpers receive
       ``output`` and ``cache`` and are defined elsewhere.
       NOTE(review): they presumably populate ``cache["train_split"]`` and
       ``cache["fold_indices"]``, which later steps read -- confirm.
    3. ``preprocess_method``, ``use_CV``, ``n_features`` (``X.shape[1]``) and,
       for CV, ``n_folds`` are stored in ``cache``.
    4. ``Stats`` (and ``Stats_CV`` for CV) compute their statistics and add
       them to ``output``.
    5. On the owner site only, the run configuration read from ``input`` is
       copied to ``output`` together with ``msg == "to_agg_mean"``.

    The previously unused ``state["baseDirectory"]`` lookup has been dropped.

    Returns:
        JSON string ``{"output": {...}, "cache": {...}}``.
    """
    inputs, _state, cache, cache_dir, site_id, owner = readin(args)
    preprocess_method = inputs["preprocess_method"]
    use_CV = inputs["use_CV"]
    output = {}

    X, y, name_features = parse_csv(inputs)

    # split and store train / test dataset
    X_train, _X_test, y_train, y_test = split_save_train_test(
        inputs, output, cache, site_id, owner, cache_dir, X, y)

    # split folds for CV
    if use_CV:
        fold_indices, valid_indices = split_folds_save_valid(
            inputs, output, cache, cache_dir, X_train, y_train)

    # cache dict
    cache["preprocess_method"] = preprocess_method
    cache["use_CV"] = use_CV
    cache["n_features"] = X.shape[1]
    if use_CV:
        cache["n_folds"] = inputs["n_folds"]

    # calculate stats: n_samples, sum
    stats = Stats(cache, preprocess_method)
    stats.cal_stats(X_train=X_train, y_train=y_train, y_test=y_test)
    stats.add_output(output)
    if use_CV:
        stats_CV = Stats_CV(cache, preprocess_method)
        stats_CV.cal_stats(
            X_train=X_train,
            y_train=y_train,
            fold_indices=fold_indices,
            valid_indices=valid_indices,
        )
        stats_CV.add_output(output)

    # output dict: only the owner forwards the run configuration.
    # Key insertion order is kept identical to the original so the JSON
    # text is unchanged.
    if site_id == owner:
        output["msg"] = "to_agg_mean"
        for key in ("label", "train_split_local", "train_split_owner"):
            output[key] = inputs[key]
        output["preprocess_method"] = preprocess_method
        for key in ("max_iter", "tol", "positive", "selection",
                    "lambdas", "eps", "n_lambdas"):
            output[key] = inputs[key]
        output["use_CV"] = use_CV
        if use_CV:
            output["n_folds"] = inputs["n_folds"]
        output["name_features"] = name_features

    return json.dumps({"output": output, "cache": cache})
def local_train(args):
    """Compute this site's ``c_jj`` term for coefficient ``jj`` as JSON.

    ``args`` is unpacked by ``readin`` (defined elsewhere). When
    ``cache["train_split"] > 0`` the function loads an ``X``/``y`` pair from
    the cache directory: ``X_fold_<i>.npy``/``y_fold_<i>.npy`` if
    ``list_recursive(input, "i_fold")`` yields anything (its first item is
    used as ``<i>``), otherwise ``X_train.npy``/``y_train.npy``. It then
    evaluates::

        c_jj = X[:, jj] . (y - X @ w + w[jj] * X[:, jj])

    with ``w`` taken from ``input["w"]`` as float64 and ``jj`` from
    ``input["jj"]``. A site without training data reports ``0.0``.

    Returns:
        JSON string ``{"output": {"msg": "to_agg_train", "c_jj_local": ...}}``.
    """
    inputs, _state, cache, cache_dir, _site_id, _owner = readin(args)

    c_jj = 0.0  # value reported when the site holds no training data
    if cache["train_split"] > 0:
        w = np.array(inputs["w"], dtype="float64")
        jj = inputs["jj"]

        fold_hits = list(list_recursive(inputs, "i_fold"))
        if fold_hits:
            # CV ongoing: use the data stored for that fold
            suffix = "fold_" + str(fold_hits[0])
        else:
            # non-CV: use the full training split
            suffix = "train"

        with open(os.path.join(cache_dir, "X_" + suffix + ".npy"), "rb") as fp:
            X = np.load(fp)
        with open(os.path.join(cache_dir, "y_" + suffix + ".npy"), "rb") as fp:
            y = np.load(fp)

        x_jj = X[:, jj]
        # Residual with coefficient jj's own contribution added back in.
        c_jj = np.dot(x_jj, (y - np.matmul(X, w) + w[jj] * x_jj))

    return json.dumps(
        {"output": {"msg": "to_agg_train", "c_jj_local": float(c_jj)}})
def local_preprocess(args):
    """Preprocess the cached training data and report ``Xy`` as JSON.

    ``args`` is unpacked by ``readin`` (defined elsewhere). When
    ``cache["train_split"] > 0`` the function loads ``X_train.npy`` and
    ``y_train.npy`` from the cache directory and stores the return value of
    ``preprocess_save_calXy`` under ``output["Xy"]``. When
    ``cache["use_CV"]`` is truthy it also calls ``preprocess_save_CV`` with
    ``cache["fold_indices"]``. Both helpers are defined elsewhere.
    A site without training data reports ``cache["n_features"]`` zeros.

    Returns:
        JSON string ``{"output": {...}}`` with ``Xy`` and, on the owner site
        only, ``msg`` set to ``"to_init_train"``.
    """
    inputs, _state, cache, cache_dir, site_id, owner = readin(args)
    with_cv = cache["use_CV"]
    n_cols = cache["n_features"]
    output = {}

    def _read(file_name):
        # Load one array previously saved in the cache directory.
        with open(os.path.join(cache_dir, file_name), "rb") as fp:
            return np.load(fp)

    if cache["train_split"] > 0:
        X_train = _read("X_train.npy")
        y_train = _read("y_train.npy")
        # Xy is computed from the preprocessed X/y_train (for lambda_max).
        output["Xy"] = preprocess_save_calXy(
            inputs, cache_dir, X_train, y_train)
        if with_cv:
            preprocess_save_CV(
                inputs, cache_dir, X_train, y_train, cache["fold_indices"])
    else:
        output["Xy"] = np.zeros(n_cols).tolist()

    if site_id == owner:
        output["msg"] = "to_init_train"

    return json.dumps({"output": output})
def local_test(args):
    """Evaluate the received model on local held-out data; return JSON.

    ``args`` is unpacked by ``readin`` (defined elsewhere). ``input["w"]``
    (as float64) and ``input["intercept"]`` are passed to ``predict``.

    * CV round (``list_recursive(input, "i_fold")`` yields something): if
      ``cache["train_split"] > 0`` the fold's ``X_valid_<i>.npy`` /
      ``y_valid_<i>.npy`` are loaded and ``se_local`` is
      ``squared_error(y, y_pred)``; otherwise ``se_local`` is ``0.0``.
    * Non-CV: if ``cache["train_split"] < 1`` the ``X_test.npy`` /
      ``y_test.npy`` pair is loaded; ``se_local`` is computed the same way
      and ``se_denominator_local`` is
      ``squared_error(y, input["mean_y_test"])`` (for the R2 score).
      Otherwise both are ``0.0``.

    ``se_local`` is now cast with ``float()`` in the CV branch as well,
    matching the non-CV branch. NOTE(review): this assumes ``squared_error``
    returns a scalar, as the existing ``float(se)`` call already does.

    Returns:
        JSON string ``{"output": {...}}``; ``msg == "to_agg_test"`` is added
        on the owner site only.
    """
    inputs, _state, cache, cache_dir, site_id, owner = readin(args)
    w = np.array(inputs["w"], dtype="float64")
    intercept = inputs["intercept"]
    output = {}

    def _load_xy(suffix):
        # Load the cached X_<suffix>.npy / y_<suffix>.npy pair.
        with open(os.path.join(cache_dir, "X_" + suffix + ".npy"), "rb") as fp:
            X_part = np.load(fp)
        with open(os.path.join(cache_dir, "y_" + suffix + ".npy"), "rb") as fp:
            y_part = np.load(fp)
        return X_part, y_part

    fold_hits = list(list_recursive(inputs, "i_fold"))
    if fold_hits:  # CV ongoing
        if cache["train_split"] > 0:  # has train data
            X_test, y_test = _load_xy("valid_" + str(fold_hits[0]))
            y_pred = predict(X_test, w, intercept)
            se = squared_error(y_test, y_pred)  # for MSE
            # Cast like the non-CV branch so a NumPy scalar cannot break
            # json.dumps.
            output["se_local"] = float(se)
        else:
            output["se_local"] = 0.0
    else:  # non-CV
        if cache["train_split"] < 1:  # has test data
            X_test, y_test = _load_xy("test")
            y_pred = predict(X_test, w, intercept)
            se = squared_error(y_test, y_pred)
            se_denominator = squared_error(
                y_test, inputs["mean_y_test"])  # for R2 score
            output["se_local"] = float(se)
            output["se_denominator_local"] = float(se_denominator)
        else:
            output["se_local"] = 0.0
            output["se_denominator_local"] = 0.0

    if site_id == owner:
        output["msg"] = "to_agg_test"

    return json.dumps({"output": output})