Beispiel #1
0
    def train(self, model_name, hyperparams):
        """Run one full training cycle for the estimator called ``model_name``.

        The steps run in a fixed order: ``mlflow_create_run``, data loading
        and cleaning, logging of the estimator name, pipeline construction,
        hold-out split, ``fit_model`` and finally ``evaluate``.

        Args:
            model_name: handed to ``create_estimator`` and ``fit_model``, and
                logged as the ``estimator`` MLflow param.
            hyperparams: forwarded unchanged to ``fit_model``.

        Returns:
            This instance.
        """
        self.mlflow_create_run()

        # Load the data and clean it in a single expression.
        cleaned = clean_data(get_data())

        self.mlflow_log_param("estimator", model_name)

        # Build the pipeline wrapper first, then the estimator it will wrap.
        self.pipeline = MyPipeline(self)
        estimator = self.create_estimator(model_name)
        self.model = self.pipeline.create_pipeline(estimator)

        # holdout() yields four parts, all stored on the instance.
        splits = holdout(cleaned)
        self.X_train, self.X_val, self.y_train, self.y_val = splits

        self.fit_model(model_name, hyperparams)
        self.evaluate()
        return self
Beispiel #2
0
def generate_submission_csv(nrows, kaggle_upload=False):
    """Predict fares for the test set and write them to a Kaggle-format CSV.

    The test data is loaded with ``get_test_data(nrows)``, cleaned, and run
    through the model returned by ``download_model()``. The ``key`` and
    ``fare_amount`` columns are written to ``predictions_test_ex_10k.csv``.

    Args:
        nrows: forwarded to ``get_test_data``.
        kaggle_upload: when true, submit the CSV with the ``kaggle`` command
            line tool. Leave it False unless that tool is installed.
    """
    import subprocess  # local import: only the optional upload needs it

    df_test = clean_data(get_test_data(nrows))
    pipeline = download_model()

    # If the loaded object exposes ``best_estimator_`` predict with that,
    # otherwise predict with the object itself.
    model = getattr(pipeline, "best_estimator_", pipeline)
    df_test["fare_amount"] = model.predict(df_test)

    name = "predictions_test_ex_10k.csv"
    df_test[["key", "fare_amount"]].to_csv(name, index=False)
    print("prediction saved under kaggle format")

    if kaggle_upload:
        # The submission message is the file name without its ".csv" suffix.
        kaggle_message_submission = name[:-4]
        # An argument list avoids shell parsing and quoting of the message.
        command = [
            "kaggle", "competitions", "submit",
            "-c", "new-york-city-taxi-fare-prediction",
            "-f", name,
            "-m", kaggle_message_submission,
        ]
        try:
            # check=False: like os.system, a failed submission does not raise.
            subprocess.run(command, check=False)
        except FileNotFoundError:
            # Keep the upload best-effort when the CLI is not installed.
            print("kaggle cli not found, submission skipped")
Beispiel #3
0
 def run_grid_search(self):
     """Fit a cross-validated grid search over the trainer pipeline.

     Data is loaded, cleaned, split into X/y and held out; the pipeline
     returned by ``Trainer.set_pipeline`` is stored on ``self.pipeline`` and
     searched with an empty parameter grid, 5-fold CV,
     ``neg_mean_squared_error`` scoring and ``n_jobs=-1``.

     Returns:
         The ``GridSearchCV`` object after ``fit`` on the training split.
     """
     cleaned = clean_data(get_data())
     X, y = get_Xy(cleaned)

     # Only the training half of the hold-out split is used here.
     X_train, _, y_train, _ = hold_out(X, y)

     self.pipeline = Trainer(X_train, y_train).set_pipeline()

     # No hyper-parameters are listed, so the grid is empty.
     param_grid = {}
     search = GridSearchCV(
         self.pipeline,
         param_grid,
         scoring='neg_mean_squared_error',
         cv=5,
         n_jobs=-1,
     )
     search.fit(X_train, y_train)
     return search
Beispiel #4
0
    def mlflow_log_metric(self, key, value):
        """Log metric ``key`` with ``value`` against this object's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_metric(run_id, key, value)


# Candidate regressors, each created with its constructor defaults.
knn, rf, svr, lasso, ridge = (
    KNeighborsRegressor(),
    RandomForestRegressor(),
    SVR(),
    Lasso(),
    Ridge(),
)

# Estimators that are actually used; currently only the random forest.
models = [rf]

if __name__ == "__main__":
    # Load the dataset and clean it in one step.
    df = clean_data(get_data())

    # Target column first, then every other column as features.
    y = df['fare_amount']
    X = df.drop(columns=['fare_amount'])

    # Keep 30% of the rows aside as a test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    for estimator in models:
        # A fresh Trainer per estimator: run it, then save it.
        trainer = Trainer(X_train, y_train)
        trainer.run(estimator)
        # Evaluation on the test split is currently disabled.
        trainer.save_model(estimator)
Beispiel #5
0
            ('distance', dist_pipe, ["pickup_latitude", "pickup_longitude", 'dropoff_latitude', 'dropoff_longitude']),
            ('time', time_pipe, ['pickup_datetime'])
        ], remainder="drop")
        pipe = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])
        self.pipeline = pipe

    def run(self):
        """Build the pipeline, fit it on ``self.X`` / ``self.y`` and return it.

        The docstring always promised a trained pipeline, but the method used
        to return None; it now returns the fitted ``self.pipeline``. Callers
        that ignore the return value are unaffected.

        Returns:
            ``self.pipeline`` after ``fit`` has been called on it.
        """
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        return self.pipeline

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``compute_rmse`` against ``y_test``."""
        predictions = self.pipeline.predict(X_test)
        return compute_rmse(predictions, y_test)


if __name__ == "__main__":
    # Work on a 10,000-row sample, cleaned in training mode (test=False).
    df = clean_data(get_data(nrows=10_000), test=False)

    # Features are every column except the target.
    X = df.drop(columns="fare_amount")
    y = df["fare_amount"]

    # Hold out 15% of the rows for validation.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15)

    trainer = Trainer(X_train, y_train)
    trainer.run()

    # Print the validation score returned by evaluate().
    rmse = trainer.evaluate(X_val, y_val)
    print(rmse)
Beispiel #6
0
        joblib.dump(reg, 'model.joblib')
        # Implement here
        print("saved model.joblib locally")
        client = storage.Client()
        bucket = client.bucket(BUCKET_NAME)
        STORAGE_LOCATION = f'{MODEL_NAME}_{MODEL_VERSION}'
        blob = bucket.blob(STORAGE_LOCATION)
        blob.upload_from_filename('model.joblib')
        # Implement here
        print(
            f"uploaded model.joblib to gcp cloud storage under \n => {STORAGE_LOCATION}"
        )


if __name__ == "__main__":
    # Load the data, then clean it under a separate name.
    df = get_data()
    df_cleaned = clean_data(df)

    # Target first, then the remaining columns as features.
    y = df_cleaned['fare_amount']
    X = df_cleaned.drop(columns=['fare_amount'])

    # Keep 30% of the rows aside for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Train on the training split.
    trainer = Trainer(X_train, y_train)
    trainer.run()

    # Print the evaluation result for the test split.
    test_score = trainer.evaluate(X_test, y_test)
    print(test_score)
    print('TODO')
def test_cleaned_data():
    """Check that cleaning 100 loaded rows never yields more rows than before."""
    raw = get_data(nrows=100)
    cleaned = clean_data(raw)
    assert len(cleaned) <= len(raw)
Beispiel #8
0
                                  (model["name"], model["_class"]())])

        return self.pipeline

    def run(self):
        """Fit ``self.pipeline`` on ``self.X`` / ``self.y``.

        The pipeline is not built here; it must already be set on the
        instance.

        Returns:
            Whatever ``self.pipeline.fit`` returns.
        """
        fitted = self.pipeline.fit(self.X, self.y)
        return fitted

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return the RMSE against ``y_test``.

        This used to return ``self.pipeline.score(X_test, y_test)``. For
        scikit-learn regressors ``score`` is R^2, not the root mean squared
        error this method's contract promises, so the RMSE is now computed
        from the predictions directly.

        Args:
            X_test: features passed to ``self.pipeline.predict``.
            y_test: true target values, one per prediction.

        Returns:
            The root mean squared error as a float.
        """
        import numpy as np  # local import keeps the block self-contained

        predictions = np.asarray(self.pipeline.predict(X_test), dtype=float)
        # asarray drops any pandas index so the subtraction is positional.
        residuals = predictions - np.asarray(y_test, dtype=float)
        return float(np.sqrt(np.mean(residuals ** 2)))


if __name__ == "__main__":
    # Load the raw data and clean it through the ``data`` module.
    raw_data = data.get_data()
    df = data.clean_data(raw_data)

    # Split the frame into the target and the remaining feature columns.
    target = "fare_amount"
    y = df[target]
    X = df.drop(columns=[target])

    # Keep 30% of the rows aside for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Build the pipeline explicitly before fitting it.
    trainer = Trainer(X_train, y_train)
    model_ready = trainer.set_pipeline()
    model_trained = trainer.run()

    # Evaluate on the held-out rows.
    model_eval = trainer.evaluate(X_test, y_test)
Beispiel #9
0
        joblib.dump(model, f"model.joblib")
        #joblib.dump(model,'model.joblib')
        print("saved model.joblib locally")

        # Implement here
        self.upload_model_to_gcp()
        print(
            f"uploaded model.joblib to gcp cloud storage under \n => {self.STORAGE_LOCATION}"
        )
        return self


if __name__ == "__main__":
    # Load and clean the data in one step.
    data = clean_data(get_data())

    # Separate features from the "fare_amount" target, then hold out.
    X, y = getXy(data, col_target="fare_amount")
    X_train, X_val, y_train, y_val = getholdout(X, y)

    # Train on the training split and evaluate on the validation split.
    trainer = Trainer(X_train, y_train)
    trainer.run()
    score = trainer.evaluate(X_val, y_val)

    # Keep a handle on the best model, then save, passing the score along.
    model = trainer.best_model
    trainer.save_model(score)

    print(trainer.best_params)
Beispiel #10
0
class Trainer():
    """Fit and evaluate a linear-regression pipeline, logging to MLflow.

    The pipeline scales distance features derived from the pickup/dropoff
    coordinates, one-hot encodes time features derived from
    ``pickup_datetime`` and feeds both into a ``LinearRegression``.
    """

    def __init__(self, X, y):
        """
            X: pandas DataFrame
            y: pandas Series
        """
        self.pipeline = None
        self.X = X
        self.y = y
        self.experiment_name = EXPERIMENT_NAME

    # NOTE: this decorator used to sit inside __init__ at the wrong indent,
    # which made the whole class an IndentationError.
    @memoized_property
    def mlflow_client(self):
        """MLflow client bound to ``MLFLOW_URI``."""
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        """Id of the experiment named ``self.experiment_name``.

        Creation is tried first; if it fails (presumably because the
        experiment already exists) the id is looked up by name instead.
        """
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except Exception:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit are no longer swallowed by the fallback.
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        """MLflow run created in this trainer's experiment."""
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        """Log param ``key`` = ``value`` on this trainer's MLflow run."""
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log metric ``key`` = ``value`` on this trainer's MLflow run."""
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

    def set_pipeline(self):
        """Build the preprocessing + regression pipeline on ``self.pipeline``.

        Returns:
            This instance.
        """
        distance_pipe = make_pipeline(DistanceTransformer(), StandardScaler())

        time_pipe = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )

        preprocessor = ColumnTransformer([
            ('distance_trans', distance_pipe,
             ['pickup_latitude', 'pickup_longitude',
              'dropoff_latitude', 'dropoff_longitude']),
            ('time_trans', time_pipe, ['pickup_datetime']),
        ])

        self.pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('regressor', LinearRegression()),
        ])

        return self

    def run(self):
        """Build the pipeline and fit it on ``self.X`` / ``self.y``.

        Returns:
            This instance.
        """
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        return self

    def evaluate(self, X_test, y_test):
        """Score the fitted pipeline on a test set and log the result.

        The regressor's class name is logged as the ``model`` param and the
        score as the ``rmse`` metric; the experiment id is printed.

        Args:
            X_test: features passed to ``self.pipeline.predict``.
            y_test: true target values passed to ``compute_rmse``.

        Returns:
            The value returned by ``compute_rmse``.
        """
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        # Use self, not the module-level ``trainer`` the script happens to
        # define; that global does not exist when the class is imported.
        print(f'ID:{self.mlflow_experiment_id}')
        # The final step is registered as 'regressor' in set_pipeline();
        # the old 'model' lookup raised KeyError.
        regressor = self.pipeline.named_steps['regressor']
        self.mlflow_log_param('model', type(regressor).__name__)
        self.mlflow_log_metric('rmse', rmse)
        return rmse


if __name__ == "__main__":
    # Load and clean the data.
    data = clean_data(get_data())

    # Split the frame into features and the fare_amount target.
    X = data.drop(columns=['fare_amount'])
    y = data['fare_amount']

    # Keep 20% of the rows aside for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    trainer = Trainer(X_train, y_train)
    trainer.run()

    # Evaluate exactly once: every evaluate() call logs the param and the
    # metric to MLflow, so the former second call recorded them twice.
    print(trainer.evaluate(X_test, y_test))
Beispiel #11
0
    def run(self):
        """Call ``set_pipeline``, then fit ``self.pipeline`` on the stored data."""
        self.set_pipeline()
        features, target = self.X, self.y
        self.pipeline.fit(features, target)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test``, print the RMSE and return it.

        The docstring always promised a return value, but the method used to
        print only and return None. The printed line is unchanged.

        Args:
            X_test: features passed to ``self.pipeline.predict``.
            y_test: true target values passed to ``compute_rmse``.

        Returns:
            The value returned by ``compute_rmse``.
        """
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        print(f"rmse = {rmse}")
        return rmse


if __name__ == "__main__":
    # Get and clean data. The result of clean_data() must be kept: the
    # original discarded it, so training ran on the uncleaned frame.
    df = get_data()
    df = clean_data(df)

    # Get features: pop() removes the target column from df in place.
    y = df.pop("fare_amount")
    X = df

    # Holdout: keep 20% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train
    trainer = Trainer(X_train, y_train)
    trainer.run()

    # Evaluate
    trainer.evaluate(X_test, y_test)
Beispiel #12
0
        model_pipe = Pipeline([('preprocessing', prepro_pipe),
                               ('model', LinearRegression())])

        self.pipeline = model_pipe

    def run(self):
        """Fit ``self.pipeline`` on ``self.X_train`` / ``self.y_train``.

        The pipeline is not built here; it must already be set on the
        instance.
        """
        features, target = self.X_train, self.y_train
        self.pipeline.fit(features, target)

    def evaluate(self):
        """Print and return the RMSE of the pipeline on the stored test data.

        The docstring always promised a return value, but the method used to
        print only and return None. The printed output is unchanged.

        Returns:
            The value returned by ``compute_rmse`` for the predictions on
            ``self.X_test`` against ``self.y_test``.
        """
        rmse = compute_rmse(self.pipeline.predict(self.X_test), self.y_test)
        print(rmse)
        return rmse


if __name__ == "__main__":
    # Load the data and clean it in one expression.
    df = clean_data(get_data())

    # Columns used as model inputs; fare_amount is the target.
    features = [
        'key',
        'pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'passenger_count',
    ]
    X = df[features]
    y = df['fare_amount']

    # Build the pipeline, fit it, then evaluate.
    trainer = Trainer(X, y)
    trainer.set_pipeline()
    trainer.run()
    trainer.evaluate()
Beispiel #13
0
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        """MLflow run created under ``self.mlflow_experiment_id``."""
        client = self.mlflow_client
        experiment_id = self.mlflow_experiment_id
        return client.create_run(experiment_id)

    def mlflow_log_param(self, key, value):
        """Log param ``key`` with ``value`` against this object's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_param(run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log metric ``key`` with ``value`` against this object's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_metric(run_id, key, value)


if __name__ == "__main__":
    # get data
    df = get_data()
    # clean data
    df_clean = clean_data(df)
    # set X and y from the CLEANED frame; the original built them from the
    # raw ``df``, so the cleaning step had no effect on training.
    y = df_clean['fare_amount']
    X = df_clean.drop(columns='fare_amount')
    # hold out 20% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # train
    model = Trainer(X_train, y_train)
    model.run()
    print(f"RMSE is {model.evaluate(X_test, y_test)}")