def dataset_iris_binomial(spark_session):
    """Return a cached Spark DataFrame of iris rows whose label is below 2.

    The scikit-learn iris frame is loaded, its ``target`` column is renamed
    to ``label``, and the result is converted to a Spark DataFrame. All
    columns except the last are packed into a single ``features`` vector
    column; only ``features`` and ``label`` are kept in the output.

    Args:
        spark_session: Session used to create the Spark DataFrame.

    Returns:
        The filtered DataFrame with ``features`` and ``label`` columns, on
        which ``cache()`` has been called.
    """
    from sklearn.datasets import load_iris

    iris_pdf = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
    iris_sdf = spark_session.createDataFrame(iris_pdf)

    # Everything but the final column is treated as a feature
    # (presumably the renamed label column is last -- confirm against sklearn).
    assembler = VectorAssembler(
        inputCols=iris_sdf.columns[:-1], outputCol="features"
    )
    assembled = assembler.transform(iris_sdf)

    binomial = assembled.filter(assembled.label < 2).select("features", "label")
    binomial.cache()
    return binomial
Exemple #2
0
    def split_datasets(self):
        """Assemble the feature vector and split ``self._datasets`` by timestamp.

        A ``features`` vector column is built from the listed feature columns.
        Rows whose ``timestamp`` is at or before the cutoff are stored in
        ``self._train_datasets``; rows after it go to ``self._test_datasets``.
        """
        # After one-hot encoding the feature fields are no longer the original
        # ones, so the feature columns are redefined here.
        feature_cols = [
            # Feature values
            "price",
            "cms_group_id_value",
            "final_gender_code_value",
            "age_level_value",
            "shopping_level_value",
            "occupation_value",
            "pid_value",
            "pl_onehot_value",
            "nucl_onehot_value",
        ]

        # Split point: 24 * 60 * 60 below the reference value 1494691186
        # (presumably Unix seconds, i.e. one day earlier -- confirm).
        cutoff = 1494691186 - 24 * 60 * 60

        assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol(
            "features")
        assembled = assembler.transform(self._datasets)

        self._train_datasets = assembled.filter(assembled.timestamp <= cutoff)
        self._test_datasets = assembled.filter(assembled.timestamp > cutoff)