def df_ny_box():
    """Return a cleaned 10,000-row sample restricted to a lat/lon bounding box.

    The data is fetched with ``get_data(nrows=10000)`` and passed through
    ``clean_df``. Only rows whose pickup AND dropoff coordinates both fall
    inside the box (bounds inclusive) are kept:

    * latitude in  [40, 42]
    * longitude in [-74.3, -72.9]

    Returns:
        The filtered frame.
    """
    df = clean_df(get_data(nrows=10000))

    # Setting boundaries. Pickup and dropoff share the same box: previously
    # the dropoff longitude used -74 as its lower bound while the pickup used
    # -74.3, which silently discarded rides ending between -74.3 and -74.
    lat_min, lat_max = 40, 42
    lon_min, lon_max = -74.3, -72.9

    in_box = (
        df["pickup_latitude"].between(left=lat_min, right=lat_max)
        & df["pickup_longitude"].between(left=lon_min, right=lon_max)
        & df["dropoff_latitude"].between(left=lat_min, right=lat_max)
        & df["dropoff_longitude"].between(left=lon_min, right=lon_max)
    )
    return df[in_box]
def load_data(params):
    """Fetch and clean the dataset, then split it into features and target.

    Args:
        params: mapping of keyword arguments forwarded as-is to ``get_data``.

    Returns:
        A ``(X, y)`` tuple where ``y`` is the ``"fare_amount"`` column and
        ``X`` is every remaining column of the cleaned frame.

    The shape and memory footprint (in Mb) of ``X`` are printed to stdout.
    """
    print("############ Loading Data ############")
    cleaned = clean_df(get_data(**params))

    target = cleaned["fare_amount"]
    features = cleaned.drop(columns="fare_amount")

    print("shape: {}".format(features.shape))
    # memory_usage() reports bytes per column; sum them and convert to Mb.
    size_mb = features.memory_usage().sum() / 1e6
    print("size: {} Mb".format(size_mb))
    return features, target
if __name__ == "__main__":
    # Script entry point: load the data, then train, evaluate and save a model.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Run configuration. The same dict is forwarded to both the data loader
    # and the Trainer.
    experiment = "taxifare_test_jean"
    params = dict(
        nrows=100000,
        upload=True,
        local=False,  # set to False to get data from GCP (Storage or BigQuery)
        gridsearch=False,
        optimize=False,
        estimator="xgboost",
        mlflow=True,  # set to True to log params to mlflow
        experiment_name=experiment,
    )

    # Get and clean data. load_data() performs exactly the steps that used to
    # be duplicated inline here (fetch, clean, split off "fare_amount", print
    # shape and size), so reuse it instead of keeping two copies in sync.
    X_train, y_train = load_data(params)

    # Train, evaluate and save the model.
    t = Trainer(X=X_train, y=y_train, **params)
    # Drop the local references once the Trainer has been built with the data.
    del X_train, y_train

    print(colored("############ Training model ############", "red"))
    t.train()
    print(colored("############ Evaluating model ############", "blue"))
    t.evaluate()
    print(colored("############ Saving model ############", "green"))
    t.save_model()