コード例 #1
0
def test_sklearn_estimator_pipeline():
    """Check that ``Pipeline.predict`` and ``Pipeline.transform`` agree.

    The pipeline chains the state transfer taken from the training frame
    (two virtual columns followed by a PCA transform) with a
    linear-regression ``Predictor`` and is then applied to the test split.
    """
    iris = vaex.ml.datasets.load_iris()
    train, test = iris.ml.train_test_split(verbose=False)

    # Virtual columns that serve as inputs to both the PCA and the regressor.
    train['sepal_virtual'] = np.sqrt(train.sepal_length**2 + train.sepal_width**2)
    train['petal_scaled'] = train.petal_length * 0.2
    feature_names = ['sepal_virtual', 'petal_scaled']

    # Fit a 2-component PCA on those columns and apply it to the training frame.
    train = train.ml.pca(n_components=2, features=feature_names).transform(train)

    # Capture the state of the training frame so it can be replayed on `test`.
    state_transfer = train.ml.state_transfer()

    # Fit the regressor on the (transformed) training frame.
    regressor = Predictor(model=LinearRegression(),
                          features=feature_names,
                          prediction_name='pred')
    regressor.fit(train, train.petal_width)

    # Chain state transfer and regressor, then run both entry points on `test`.
    pipeline = vaex.ml.Pipeline([state_transfer, regressor])
    predicted = pipeline.predict(test)
    transformed = pipeline.transform(test)

    # WARNING: on Windows/AppVeyor this gives slightly different results and
    # it is not fully understood why. The same results were also seen on an
    # OSX laptop with sklearn 0.21.1 (scikit-learn 0.21.2 is installed on
    # Windows), so it might be version related.
    np.testing.assert_array_almost_equal(predicted, transformed.pred.values)
コード例 #2
0
def test_sklearn_estimator_regression_validation():
    """Compare ``Predictor`` output with direct scikit-learn predictions.

    Runs once for every estimator in ``models_regression`` and requires the
    two prediction arrays to match to 5 decimals.
    """
    iris = vaex.ml.datasets.load_iris()
    train, test = iris.ml.train_test_split(verbose=False)
    feature_names = ['sepal_length', 'sepal_width', 'petal_length']

    # Materialise plain arrays for the scikit-learn side before the loop
    # starts attaching prediction columns to `test`.
    x_train = train[feature_names].values
    x_test = test[feature_names].values
    y_train = train.petal_width.values

    for estimator in models_regression:
        # vaex side: wrap the estimator, fit on the frame, attach 'pred'.
        wrapped = Predictor(model=estimator,
                            features=feature_names,
                            prediction_name='pred')
        wrapped.fit(train, train.petal_width)
        test = wrapped.transform(test)

        # scikit-learn side: fit the estimator object directly on the arrays.
        estimator.fit(x_train, y_train)
        expected = estimator.predict(x_test)

        np.testing.assert_array_almost_equal(test.pred.values, expected, decimal=5)
コード例 #3
0
def test_sklearn_estimator_classification_validation():
    """Compare ``Predictor`` class predictions with direct scikit-learn ones.

    Runs once for every estimator in ``models_classification`` on the titanic
    dataset and requires the predictions to be element-wise equal.
    """
    titanic = vaex.ml.datasets.load_titanic()
    train, test = titanic.ml.train_test_split(verbose=False)
    feature_names = ['pclass', 'parch', 'sibsp']

    # Materialise plain arrays for the scikit-learn side before the loop
    # starts attaching prediction columns to `test`.
    x_train = train[feature_names].values
    x_test = test[feature_names].values
    y_train = train.survived.values

    for estimator in models_classification:
        # vaex side: wrap the estimator, fit on the frame, attach 'pred'.
        wrapped = Predictor(model=estimator,
                            features=feature_names,
                            prediction_name='pred')
        wrapped.fit(train, train.survived)
        test = wrapped.transform(test)

        # scikit-learn side: fit the estimator object directly on the arrays.
        estimator.fit(x_train, y_train)
        expected = estimator.predict(x_test)

        assert np.all(expected == test.pred.values)
コード例 #4
0
ファイル: sklearn_test.py プロジェクト: yaotianzhang/vaex
def test_sklearn_estimator_classification_validation(prediction_type,
                                                     df_titanic):
    """Compare ``Predictor`` output with scikit-learn for a prediction type.

    ``prediction_type`` selects the scikit-learn method used as reference:
    ``'predict'`` uses ``predict``; any other value uses ``predict_proba``.
    ``df_titanic`` is the frame to train/evaluate on; its ``survived``
    column is replaced in place by an int32 cast.
    """
    frame = df_titanic
    frame['survived'] = frame.survived.astype('int32')

    train, test = frame.ml.train_test_split(verbose=False)
    feature_names = ['pclass', 'parch', 'sibsp']

    # Materialise plain arrays for the scikit-learn side before the loop
    # starts attaching prediction columns to `test`.
    x_train = train[feature_names].values
    x_test = test[feature_names].values
    y_train = train.survived.values

    for estimator in models_classification:
        # vaex side: the target is given by name, so fit() only needs the frame.
        wrapped = Predictor(model=estimator,
                            features=feature_names,
                            target='survived',
                            prediction_name='pred',
                            prediction_type=prediction_type)
        wrapped.fit(train)
        test = wrapped.transform(test)

        # scikit-learn side: fit directly, then call the matching method.
        estimator.fit(x_train, y_train)
        expected = (estimator.predict(x_test)
                    if prediction_type == 'predict'
                    else estimator.predict_proba(x_test))

        assert np.all(expected == test.pred.values)
コード例 #5
0
ファイル: sklearn_test.py プロジェクト: terra-analitika/vaex
def test_sklearn_estimator_virtual_columns():
    """Fit and apply a ``Predictor`` whose features and target are virtual columns."""
    iris = vaex.ml.datasets.load_iris()

    # Mirror each real column as a virtual column (value * 1).
    aliases = (
        ('x', 'sepal_length'),
        ('y', 'sepal_width'),
        ('w', 'petal_length'),
        ('z', 'petal_width'),
    )
    for alias, source in aliases:
        iris[alias] = getattr(iris, source) * 1

    # The split result is not used below; the call is kept so the split still
    # runs on a frame that carries the virtual columns.
    _train, _test = iris.ml.train_test_split(test_size=0.2, verbose=False)

    regressor = Predictor(model=LinearRegression(),
                          features=['x', 'y', 'z'],
                          prediction_name='pred')
    regressor.fit(iris, iris.w)
    iris = regressor.transform(iris)

    # One prediction per row of the full dataset.
    assert iris.pred.values.shape == (150,)
コード例 #6
0
def train_gtfsr(df):
    """Train the GTFS-R arrival model(s) on ``df`` and write the frame state.

    Feature columns are collected by regex from ``df`` plus three fixed
    names, each ``Predictor`` in the model list is fitted on ``df``, its
    prediction column is attached via ``transform``, and finally the state of
    the resulting frame is written to ``gtfsr_model_out_path``.

    Args:
        df: frame providing ``get_column_names``, column access by name and
            ``state_write`` (presumably a vaex DataFrame; confirm with caller).

    Returns:
        None.

    Raises:
        AssertionError: if any selected feature column has a dtype other
            than ``float64`` or ``int64``.
    """
    print("*** gtfsr model training ***")

    # Raw string for the first pattern: "\d" inside a normal literal is an
    # invalid escape sequence. The pattern text itself is unchanged.
    feats = (df.get_column_names(regex=r"pca[\d]") +
             df.get_column_names(regex=".*_[xy]") +
             df.get_column_names(regex="standard_scaled_*") +
             df.get_column_names(regex="label_encode_*") +
             df.get_column_names(regex="minmax_scaled_*") +
             ["stop_sequence", "is_delayed", "direction"])

    # Explicit raise instead of `assert` so the check also runs under
    # `python -O`; the exception type and message are kept as before.
    non_numeric = [
        df[feat].dtype for feat in feats
        if df[feat].dtype not in ["float64", "int64"]
    ]
    if non_numeric:
        raise AssertionError(
            f"All training feature must be number types, not {df[feats].dtypes}"
        )

    target = "arrival"
    prediction_name = "p_arrival"

    # NOTE(review): "n_estimators" and "num_iterations" are both set to 300;
    # LightGBM documents them as aliases, so one may be redundant. Confirm
    # before removing either.
    lgbm_params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.3,
        "n_estimators": 300,
        "max_depth": 50,
        "num_leaves": 50,
        "num_iterations": 300,
    }

    models = [
        # lightGBM Regressor
        Predictor(
            features=feats,
            target=target,
            prediction_name=prediction_name + "_lgbm",
            model=lightgbm.LGBMRegressor(**lgbm_params, n_jobs=-1),
        ),
        # Disabled: XGBoost Regressor
        # Predictor(
        #     features=feats,
        #     target=target,
        #     prediction_name=prediction_name + "_xgb",
        #     model=xgboost.XGBRegressor(max_depth=50, min_child_weight=1, n_estimators=200, n_jobs=-1, learning_rate=0.3),
        # ),
    ]

    # Fit every model on the current frame, then attach its prediction column
    # so later models (and the written state) see it.
    for i, model in enumerate(models):
        model.fit(df)
        print(f"\n\nmodel {i} trained, time taken: {duration()}s")

        df = model.transform(df)

    # Disabled: average of the LightGBM and XGBoost predictions.
    # df[prediction_name + "_final"] = (
    #     df["p_arrival_lgbm"].astype("float64") * 0.5 + df["p_arrival_xgb"].astype("float64") * 0.5
    # )

    df.state_write(gtfsr_model_out_path)
    print("exported model")
コード例 #7
0
def create_scats_ml_model():
    """Build, fit and dump the SCATS average-volume model.

    Opens ``finalScatsPath + ".hdf5"`` (first calling ``vaex.from_csv`` with
    ``convert=True`` on ``finalScatsPath`` when that file does not exist),
    adds PCA and cyclic features, fits a random-forest ``Predictor`` for
    ``avg_vol`` and dumps the fitted predictor to ``model_out``.

    Returns:
        None.
    """
    print("starting scats ml modeling")

    hdf5_path = finalScatsPath + ".hdf5"

    # Convert the CSV only when the hdf5 file is not there yet.
    if not os.path.exists(hdf5_path):
        vaex.from_csv(finalScatsPath, convert=True, copy_index=False, chunk_size=1_000_000)

    df = vaex.open(hdf5_path, shuffle=True)

    # Transform the features into more machine-learning friendly variables.
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    print(f"dataWrangling done, ready to create model, time: {duration()}s")

    # Raw string for the first pattern: "\d" inside a normal literal is an
    # invalid escape sequence. The pattern text itself is unchanged.
    # NOTE(review): ".*_[xy]" presumably picks up the CycleTransformer
    # outputs; confirm the generated column names.
    feature_names = (df.get_column_names(regex=r"pca[\d]") +
                     df.get_column_names(regex=".*_[xy]"))

    # Random-forest regressor wrapped in a Predictor.
    vaex_model = Predictor(
        features=feature_names,
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )

    # Fit under the "threading" backend with 8 jobs, then dump the predictor.
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
        print(f"\n\nmodel created, time: {duration()}s")

        dump(value=vaex_model, filename=model_out, compress=3)

    print(f"model written to output, time: {duration()}s")
コード例 #8
0
ファイル: sklearn_test.py プロジェクト: terra-analitika/vaex
def test_sklearn_estimator():
    """Check ``Predictor.predict``/``transform`` agreement and state transfer."""
    iris = vaex.ml.datasets.load_iris()
    feature_names = ['sepal_length', 'sepal_width', 'petal_length']

    train, test = iris.ml.train_test_split(verbose=False)

    regressor = Predictor(model=LinearRegression(),
                          features=feature_names,
                          prediction_name='pred')
    regressor.fit(train, train.petal_width)

    # Predictions returned directly must match the attached 'pred' column.
    predicted = regressor.predict(test)
    test = regressor.transform(test)
    np.testing.assert_array_almost_equal(test.pred.values, predicted, decimal=5)

    # Transfer the state of the transformed training frame to the full frame,
    # which should then expose one prediction per row.
    iris.state_set(regressor.transform(train).state_get())
    assert iris.pred.values.shape == (150,)
コード例 #9
0
def test_sklearn_estimator_serialize(tmpdir):
    """Save and load a pipeline holding a fitted ``Predictor``.

    Done twice: once with a freshly fitted predictor, and once with a
    predictor whose state was first passed through ``state_get``/``state_set``.
    """
    iris = vaex.ml.datasets.load_iris()
    feature_names = ['sepal_length', 'sepal_width', 'petal_length']
    json_path = str(tmpdir.join('test.json'))

    def fitted_predictor():
        # Build a linear-regression Predictor and fit it on the full frame.
        predictor = Predictor(model=LinearRegression(),
                              features=feature_names,
                              prediction_name='pred')
        predictor.fit(iris, iris.petal_width)
        return predictor

    def save_and_load(predictor):
        # Wrap the predictor in a pipeline, write it out and read it back.
        pipeline = vaex.ml.Pipeline([predictor])
        pipeline.save(json_path)
        pipeline.load(json_path)

    # Freshly fitted predictor.
    save_and_load(fitted_predictor())

    # Predictor whose own state has been fed back into it first.
    restored = fitted_predictor()
    restored.state_set(restored.state_get())
    save_and_load(restored)