def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """Train the requested model, persist it and return its held-out score.

    Args:
        model_dir: Forwarded to ``get_model_path`` to resolve the output
            directory.
        hour_path: Forwarded to ``read_data`` to locate the input data.
        model: Name of the model to train: ``"xgboost"``, ``"ridge"`` or
            ``"lasso"``.

    Returns:
        The value of ``score(test_X, test_y)`` of the trained model, i.e. the
        score on the test part of ``split_train_test``.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported = ("xgboost", "ridge", "lasso")
    # Fail fast: previously an unknown name ran the whole pipeline and then
    # silently returned None without persisting anything.
    if model not in supported:
        raise ValueError(
            "model must be one of {}; got {!r}".format(supported, model)
        )

    hour = read_data(hour_path)
    hour = preprocess(hour)
    hour = dummify(hour)
    hour = postprocess(hour)
    # Only the test part of the split is needed for the returned score.
    _, test_X, _, test_y = split_train_test(hour)

    # TODO: Implement other models?
    trainers = {
        "xgboost": train_xgboost,
        "ridge": train_ridge,
        "lasso": train_lasso,
    }
    fitted = trainers[model](hour)
    model_path = get_model_path(model_dir)
    test_score = fitted.score(test_X, test_y)
    # One pickle per model name inside the model directory.
    joblib.dump(fitted, model_path + "/" + model + ".pkl")
    return test_score
def score(model_dir=None, hour_path=None, model_name="xgboost"):
    """Score a persisted model on the train and test splits of the data.

    The model file is resolved through ``get_model_path``; when it does not
    exist yet, ``train_and_persist`` is called first to create it.

    Args:
        model_dir: Forwarded to ``get_model_path`` / ``train_and_persist``.
        hour_path: Forwarded to ``read_data``.
        model_name: Name of the model to load (and train if missing).

    Returns:
        A ``(train_score, test_score)`` tuple holding the results of the
        loaded model's ``score`` method on each split (presumably R^2 --
        the metric depends on the estimator).
    """
    model_path = get_model_path(model_dir, model=model_name)
    if not os.path.exists(model_path):
        train_and_persist(model_dir, model=model_name)
    estimator = joblib.load(model_path)

    prepared = postprocess(dummify(preprocess(read_data(hour_path))))
    encoded = pd.get_dummies(prepared)

    # Replace the characters "[", "]" and "<" in column names with "_".
    # Only names whose string form contains one of them are rewritten, so
    # other labels are passed through untouched.
    pattern = re.compile(r"\[|\]|<", re.IGNORECASE)
    special_chars = ("[", "]", "<")
    cleaned_columns = []
    for column in encoded.columns.values:
        label = str(column)
        if any(char in label for char in special_chars):
            cleaned_columns.append(pattern.sub("_", column))
        else:
            cleaned_columns.append(column)
    encoded.columns = cleaned_columns

    encoded = encoded.select_dtypes(exclude="category")
    train_x, test_x, train_y, test_y = split_train_test(encoded)
    return estimator.score(train_x, train_y), estimator.score(test_x, test_y)
def predict(parameters, model_dir=None, model=None):
    """Return the model prediction for ``parameters`` as an integer.

    Args:
        parameters: Raw input forwarded to ``get_input_dict``.
        model_dir: Forwarded to ``get_model_path`` / ``train_and_persist``.
        model: ``"ridge"`` selects the ridge model; any other value
            (including the default ``None``) selects xgboost.

    Returns:
        The squared model output truncated to ``int``.
    """
    # Local import keeps this block self-contained.
    import os

    model_name = "ridge" if model == "ridge" else "xgboost"
    model_path = get_model_path(model_dir, model_al=model_name)
    # Train only when no persisted model exists; retraining on every single
    # prediction was needlessly expensive.
    if not os.path.exists(model_path):
        train_and_persist(model_dir=model_dir, model=model_name)
    estimator = joblib.load(model_path)

    input_dict = get_input_dict(parameters)
    X_input = pd.DataFrame([pd.Series(input_dict)])
    result = estimator.predict(X_input)
    # Undo np.sqrt(hour["cnt"])
    # ``.item()`` extracts the single value explicitly; calling ``int()`` on
    # a size-1 array with ndim > 0 is deprecated in NumPy >= 1.25.
    return int((result ** 2).item())
def train_and_persist(model_dir=None, hour_path=None):
    """Run the data pipeline, train a model and dump it to disk.

    Args:
        model_dir: Forwarded to ``get_model_path`` to resolve the output path.
        hour_path: Forwarded to ``read_data`` to locate the input data.
    """
    # read -> preprocess -> dummify -> postprocess, applied in that order.
    prepared = postprocess(dummify(preprocess(read_data(hour_path))))
    # TODO: Implement other models?
    fitted = train_xgboost(prepared)
    joblib.dump(fitted, get_model_path(model_dir))
def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """Train the requested model and persist it with ``joblib``.

    Args:
        model_dir: Forwarded to ``get_model_path``.
        hour_path: Forwarded to ``read_data``.
        model: Name of the model to train: ``"xgboost"`` or ``"ridge"``.

    Raises:
        ValueError: If ``model`` is not a supported name.
    """
    # Validate before doing any work: an unsupported name used to run the
    # whole pipeline and then fail with an UnboundLocalError at dump time.
    if model not in ("xgboost", "ridge"):
        raise ValueError(
            "model must be 'xgboost' or 'ridge'; got {!r}".format(model)
        )

    hour = read_data(hour_path)
    hour = preprocess(hour)
    hour = dummify(hour)
    hour = postprocess(hour)

    # TODO: Implement other models?
    trainer = train_xgboost if model == "xgboost" else train_ridge
    trained = trainer(hour)
    model_path = get_model_path(model_dir, model)
    joblib.dump(trained, model_path)
def predict(parameters, model_dir=None, model="xgboost"):
    """Return the model prediction for ``parameters`` as an integer.

    Args:
        parameters: Raw input forwarded to ``get_input_dict``.
        model_dir: Forwarded to ``get_model_path`` / ``train_and_persist``.
        model: Model name passed to ``train_and_persist`` when no persisted
            model is found.

    Returns:
        The squared model output truncated to ``int``.
    """
    # NOTE(review): the path is resolved without ``model`` while training
    # receives it -- confirm get_model_path's default points at the file
    # train_and_persist writes for non-default models.
    model_path = get_model_path(model_dir)
    if not os.path.exists(model_path):
        train_and_persist(model_dir=model_dir, model=model)
    # Keep the loaded estimator in its own name instead of rebinding the
    # ``model`` parameter (which holds the model *name*).
    estimator = joblib.load(model_path)

    input_dict = get_input_dict(parameters)
    X_input = pd.DataFrame([pd.Series(input_dict)])
    result = estimator.predict(X_input)
    # Undo np.sqrt(hour["cnt"])
    # ``.item()`` extracts the single value explicitly; calling ``int()`` on
    # a size-1 array with ndim > 0 is deprecated in NumPy >= 1.25.
    return int((result ** 2).item())
def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """
    Train a model on the processed data and dump it with joblib.

    ``model="ridge"`` selects ``train_ridge``; every other value falls back
    to ``train_xgboost`` (the default).
    """
    prepared = read_data(hour_path)
    # Apply the remaining pipeline stages in their fixed order.
    for stage in (preprocess, dummify, postprocess):
        prepared = stage(prepared)

    trainer = train_ridge if model == "ridge" else train_xgboost
    fitted = trainer(prepared)
    joblib.dump(fitted, get_model_path(model_dir, model))
def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """Train the requested model, persist it and return its training score.

    Args:
        model_dir: Forwarded to ``get_model_path``.
        hour_path: Forwarded to ``read_data``.
        model: Name of the model to train: ``"xgboost"`` or ``"ridge"``.

    Returns:
        The second element of the ``(model, score)`` pair returned by the
        selected training function.

    Raises:
        ValueError: If ``model`` is not a supported name.
    """
    # Validate first: an unsupported name used to run the whole pipeline and
    # then fail with an UnboundLocalError when dumping the model.
    if model not in ("xgboost", "ridge"):
        raise ValueError(
            "model must be 'xgboost' or 'ridge'; got {!r}".format(model)
        )

    hour = read_data(hour_path)
    hour = preprocess(hour)
    hour = dummify(hour)
    hour = postprocess(hour)

    model_path = get_model_path(model_dir, model)
    trainer = train_xgboost if model == "xgboost" else train_ridge
    scikit_model, score = trainer(hour)
    joblib.dump(scikit_model, model_path)
    return score
def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """Validate the model name, then train and persist that model.

    Args:
        model_dir: Passed to ``get_model_path``.
        hour_path: Passed to ``read_data``.
        model: Either ``"xgboost"`` or ``"ridge"``.

    Raises:
        ValueError: If ``model`` is neither of the two supported names.
    """
    valid = {"xgboost", "ridge"}
    # Guard clause: reject unsupported names before touching any data.
    if model not in valid:
        raise ValueError("results: model must be one of %s." % valid)

    prepared = postprocess(dummify(preprocess(read_data(hour_path))))
    trainer = train_xgboost if model == "xgboost" else train_ridge
    fitted = trainer(prepared)
    # NOTE(review): arguments are passed as (model, model_dir) -- confirm
    # this matches get_model_path's parameter order.
    joblib.dump(fitted, get_model_path(model, model_dir))
def train_and_persist(model_dir=None, hour_path=None, model="xgboost"):
    """Train a model, dump it with joblib and return its training score.

    ``model="ridge"`` uses ``train_ridge``; any other value uses
    ``train_xgboost``. Both are expected to return a ``(model, score)`` pair.

    Args:
        model_dir: Passed to ``get_model_path``.
        hour_path: Passed to ``read_data``.
        model: Name of the model to train.

    Returns:
        The score element of the pair returned by the training function.
    """
    prepared = postprocess(dummify(preprocess(read_data(hour_path))))
    destination = get_model_path(model_dir, model)
    # Implement other models?
    trainer = train_ridge if model == "ridge" else train_xgboost
    fitted, train_score = trainer(prepared)
    joblib.dump(fitted, destination)
    return train_score
def train_and_persist(model_dir=None, hour_path=None, model_name="xgboost"):
    """Train the model named ``model_name`` and persist it with ``joblib``.

    Args:
        model_dir: Forwarded to ``get_model_path``.
        hour_path: Forwarded to ``read_data``.
        model_name: ``"xgboost"``, ``"ridge"`` or ``"lasso"``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Raise instead of printing: the old code printed this message and then
    # crashed with an UnboundLocalError when it tried to dump the model.
    if model_name not in ("xgboost", "ridge", "lasso"):
        raise ValueError(
            "model should be equal to 'xgboost' or 'ridge' or 'lasso'"
        )

    hour = read_data(hour_path)
    hour = preprocess(hour)
    hour = dummify(hour)
    hour = postprocess(hour)

    # TODO: Implement other models?
    if model_name == "xgboost":
        model = train_xgboost(hour)
    elif model_name == "ridge":
        model = train_ridge(hour)
    else:
        model = train_lasso(hour)

    model_path = get_model_path(model_dir, model_name)
    joblib.dump(model, model_path)