Example #1
0
def df_ny_box():
    """Load a cleaned sample and keep only trips inside a fixed lat/lon box.

    Requests data via ``get_data(nrows=10000)``, passes it through
    ``clean_df`` and then filters the rows so that both the pickup and the
    dropoff coordinates lie within the bounds listed below (presumably the
    New York area, going by the function name).

    Returns:
        The filtered frame returned by the last ``between`` selection.
    """
    trips = clean_df(get_data(nrows=10000))

    # Bounding box as (column, lower bound, upper bound); ``between`` is
    # called with its default settings for each pair of bounds.
    # NOTE(review): the dropoff longitude lower bound (-74) differs from the
    # pickup one (-74.3) -- confirm whether this asymmetry is intentional.
    box_limits = (
        ("pickup_latitude", 40, 42),
        ("pickup_longitude", -74.3, -72.9),
        ("dropoff_latitude", 40, 42),
        ("dropoff_longitude", -74, -72.9),
    )

    # Apply the filters one after another, in the order listed above.
    for column, lower, upper in box_limits:
        trips = trips[trips[column].between(left=lower, right=upper)]
    return trips
Example #2
0
def load_data(params):
    """Fetch and clean the dataset, then split it into features and target.

    Args:
        params: Mapping of keyword arguments forwarded as-is to ``get_data``.

    Returns:
        A ``(X, y)`` tuple where ``y`` is the ``"fare_amount"`` column of the
        cleaned data and ``X`` is the cleaned data without that column.
    """
    print("############   Loading Data   ############")
    cleaned = clean_df(get_data(**params))

    # Separate the target column from the remaining columns.
    target = cleaned["fare_amount"]
    features = cleaned.drop(columns="fare_amount")

    # Report the feature matrix dimensions and its memory usage sum / 1e6.
    print("shape: {}".format(features.shape))
    megabytes = features.memory_usage().sum() / 1e6
    print("size: {} Mb".format(megabytes))
    return features, target
Example #3
0
if __name__ == "__main__":
    # Suppress FutureWarning messages for the rest of the run.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Run configuration; the same mapping is forwarded to both get_data and
    # Trainer as keyword arguments.
    experiment = "taxifare_test_jean"
    params = {
        "nrows": 100000,
        "upload": True,
        # set to False to get data from GCP (Storage or BigQuery)
        "local": False,
        "gridsearch": False,
        "optimize": False,
        "estimator": "xgboost",
        # set to True to log params to mlflow
        "mlflow": True,
        "experiment_name": experiment,
    }

    # Load and clean the data, then split off the "fare_amount" target.
    print("############   Loading Data   ############")
    df = clean_df(get_data(**params))
    y_train = df["fare_amount"]
    X_train = df.drop(columns="fare_amount")
    # Drop the script-level reference to the full frame once it is split.
    del df
    print("shape: {}".format(X_train.shape))
    print("size: {} Mb".format(X_train.memory_usage().sum() / 1e6))

    # Build the trainer, then train, evaluate and save the model.
    t = Trainer(X=X_train, y=y_train, **params)
    # The trainer received the data above; release the script-level names.
    del X_train, y_train

    print(colored("############  Training model   ############", "red"))
    t.train()

    print(colored("############  Evaluating model ############", "blue"))
    t.evaluate()

    print(colored("############   Saving model    ############", "green"))
    t.save_model()