Example #1
    def random_forest(df, columns, input_col, **kwargs):
        """
        Runs a random forest classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted random forest model.
        """

        columns = parse_columns(df, columns)

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        # Index the target strings and assemble the feature vector
        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = RandomForestClassifier(**kwargs)
        df.table()  # preview the assembled DataFrame (debugging aid)
        # Spark ML expects the target column to be named "label"
        df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

        rf_model = model.fit(df)
        df_model = rf_model.transform(df)
        return df_model, rf_model
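
A minimal usage sketch for the helper above. The DataFrame and column names are illustrative assumptions, as is the extra keyword: numTrees is a standard RandomForestClassifier parameter forwarded through **kwargs. RandomForestClassifier itself comes from pyspark.ml.classification, which the snippet assumes has been imported along with the library's feature-engineering helpers.

    # Hypothetical call: predict "label_col" from the other selected columns
    df_scored, rf_fitted = random_forest(
        df,
        columns=["feat_a", "feat_b", "label_col"],  # assumed column names
        input_col="label_col",
        numTrees=20,  # forwarded to RandomForestClassifier via **kwargs
    )
    df_scored.select("label", "prediction").show()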
Example #2
    def gbt(df, columns, input_col, **kwargs):
        """
        Runs a gradient-boosted tree classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted gradient-boosted tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = GBTClassifier(**kwargs)

        df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
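
Since **kwargs is passed straight through to GBTClassifier, any of its constructor parameters can be tuned at the call site. A sketch with the target column name assumed; maxIter and maxDepth are standard GBTClassifier parameters:

    # "*" assumes parse_columns supports the usual select-all wildcard
    df_scored, gbt_fitted = gbt(
        df,
        columns="*",
        input_col="label_col",  # assumed target column
        maxIter=20,
        maxDepth=5,
    )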
Example #3
    def decision_tree(df, columns, input_col, **kargs):
        """
        Runs a decision tree classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted decision tree model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = DecisionTreeClassifier(**kargs)

        df = df.cols.rename(name_col(input_col, "index"), "label")

        dt_model = model.fit(df)
        df_model = dt_model.transform(df)
        return df_model, dt_model
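
The returned DataFrame carries the standard Spark ML "label" and "prediction" columns, so it can be scored with the built-in evaluators. A sketch with assumed column names:

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    df_scored, dt_fitted = decision_tree(df, ["feat_a", "feat_b", "label_col"],
                                         "label_col")
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    print("accuracy:", evaluator.evaluate(df_scored))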
Example #4
    def random_forest(df, columns, input_col, **kargs):
        """
        Runs a random forest classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of the DataFrame with predictions and the fitted random forest model.
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        assert isinstance(input_col,
                          str), "Error, input column must be a string"

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = RandomForestClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        rf_model = model.fit(df)
        df_model = rf_model.transform(df)
        return df_model, rf_model
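
Because the fitted model is returned alongside the scored DataFrame, model-level attributes stay accessible. RandomForestClassificationModel, for instance, exposes per-feature importances; the column names below are assumptions:

    df_scored, rf_fitted = random_forest(df, ["feat_a", "feat_b", "label_col"],
                                         "label_col")
    # featureImportances is a SparseVector aligned with the assembled
    # features vector (i.e. with feats above)
    print(rf_fitted.featureImportances)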
Example #5
    def h2o_xgboost(df, label, columns, **kargs):
        """
        Runs an H2O XGBoost classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param label: Column to predict.
        :param columns: List of feature columns.
        :return: Tuple of the DataFrame with predictions and the fitted model.
        """

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                                 featuresCols=columns,
                                 labelCol=label,
                                 **kargs)
        model = h2o_xgboost.fit(df_va)
        df_raw = model.transform(df_va)

        # Binarize the positive-class probability at 0.5
        df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
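
The H2O helpers assume a Sparkling Water session. A minimal setup sketch; the import paths follow pysparkling and can vary across Sparkling Water versions, and the column names are assumptions:

    # Assumed imports for the H2O examples
    from pysparkling import H2OContext
    from pysparkling.ml import H2OXGBoost
    from pyspark.sql.functions import when

    df_pred, xgb_model = h2o_xgboost(df, label="label_col",
                                     columns=["feat_a", "feat_b"])
    df_pred.select("prediction").show()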
Example #6
    def h2o_gbm(df, label, columns, **kargs):
        """
        Runs an H2O gradient boosting machine (GBM) classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param label: Column to predict.
        :param columns: List of feature columns.
        :return: Tuple of the DataFrame with predictions and the fitted model.
        """

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_gbm = H2OGBM(ratio=0.8,
                         seed=1,
                         featuresCols=columns,
                         labelCol=label,
                         **kargs)
        model = h2o_gbm.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
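
Here ratio=0.8 asks Sparkling Water to hold out part of the rows for validation (in the versions that support that parameter) and seed=1 pins the split; both are hardcoded, so they cannot be re-passed through **kargs without a duplicate-keyword TypeError. Other H2OGBM parameters still go through, e.g. an assumed tree count:

    # ntrees is a standard H2O GBM parameter, forwarded via **kargs
    df_pred, gbm_model = h2o_gbm(df, label="label_col",
                                 columns=["feat_a", "feat_b"], ntrees=100)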
Example #7
    def h2o_automl(df, label, columns, **kargs):
        """
        Runs H2O AutoML on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param label: Column to predict.
        :param columns: List of feature columns.
        :return: Tuple of the DataFrame with predictions and the fitted model.
        """

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                           maxRuntimeSecs=60,  # 1 minute
                           seed=1,
                           maxModels=3,
                           labelCol=label + "_index",
                           **kargs)

        model = automl.fit(df_va)
        df_raw = model.transform(df_va)

        # AutoML exposes its raw output under "value" rather than "p1"
        df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
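
The same duplicate-keyword caveat applies here: maxRuntimeSecs, seed, maxModels and the label column are fixed above and cannot be overridden through **kargs. A plain call, with assumed names:

    df_pred, aml_model = h2o_automl(df, label="label_col",
                                    columns=["feat_a", "feat_b"])
    # "prediction" is the 0/1 label derived from the raw AutoML output
    df_pred.groupBy("prediction").count().show()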
Example #8
    def h2o_deeplearning(df, label, columns, **kargs):
        """
        Runs an H2O deep learning classifier on the input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param label: Column to predict.
        :param columns: List of feature columns.
        :return: Tuple of the DataFrame with predictions and the fitted model.
        """

        H2OContext.getOrCreate(Spark.instance.spark)

        df_sti = string_to_index(df, input_cols=label)
        df_va = vector_assembler(df_sti, input_cols=columns)
        h2o_deeplearning = H2ODeepLearning(epochs=10,
                                           seed=1,
                                           l1=0.001,
                                           l2=0.0,
                                           hidden=[200, 200],
                                           featuresCols=columns,
                                           labelCol=label,
                                           **kargs)
        model = h2o_deeplearning.fit(df_va)
        df_raw = model.transform(df_va)

        df_pred = df_raw.withColumn("prediction", when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))

        return df_pred, model
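
The deep-learning variant trains two hidden layers of 200 units (hidden=[200, 200]) with light L1 regularization. As in the other wrappers, the raw H2O output struct survives the thresholding, so the probability can be kept next to the hard label; names below are assumptions:

    df_pred, dl_model = h2o_deeplearning(df, label="label_col",
                                         columns=["feat_a", "feat_b"])
    df_pred.select(df_pred.prediction_output["p1"].alias("p1"),
                   "prediction").show()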
Example #9
def test_string_to_index_kargs():
    df = op.spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                   (4, "a"), (5, "c")], ["id", "category"])

    df_indexed = fe.string_to_index(df,
                                    "category",
                                    stringOrderType="frequencyAsc")

    assert_spark_df(df_indexed)

    # With frequencyAsc the least frequent label gets the lowest index:
    # b (1 occurrence) -> 0.0, c (2 occurrences) -> 1.0, a (3) -> 2.0
    expected_collect = op.sc.parallelize([
        Row(id=0, category='a', category_index=2.0),
        Row(id=1, category='b', category_index=0.0),
        Row(id=2, category='c', category_index=1.0),
        Row(id=3, category='a', category_index=2.0),
        Row(id=4, category='a', category_index=2.0),
        Row(id=5, category='c', category_index=1.0)
    ]).toDF()

    assert_equal(
        df_indexed.select("category", "category_index", "id").collect(),
        expected_collect.collect())
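
A complementary check under Spark's default ordering, frequencyDesc, would invert the assignment; a sketch against the same fixture:

    df_desc = fe.string_to_index(df, "category",
                                 stringOrderType="frequencyDesc")
    # Expected: a -> 0.0, c -> 1.0, b -> 2.0 (most frequent label first)
    df_desc.select("category", "category_index").show()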