def dataset_iris_binomial(spark_session):
    """Build a cached two-class Iris Spark DataFrame with assembled features.

    The Iris data is loaded through scikit-learn as a pandas frame, its
    ``target`` column is renamed to ``label``, and the frame is converted
    to a Spark DataFrame via ``spark_session.createDataFrame``.

    Args:
        spark_session: Object exposing ``createDataFrame`` (presumably a
            ``pyspark.sql.SparkSession`` -- confirm against the caller).

    Returns:
        A DataFrame holding only the ``features`` and ``label`` columns,
        restricted to rows whose ``label`` is below 2, with ``cache()``
        already requested on it.
    """
    # Imported locally, as in the original, so scikit-learn is only needed
    # when this function actually runs.
    from sklearn.datasets import load_iris

    iris_pdf = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
    iris_sdf = spark_session.createDataFrame(iris_pdf)

    # Every column except the last one feeds the feature vector; the last
    # column is expected to be the renamed label.
    assembler = VectorAssembler(
        inputCols=iris_sdf.columns[:-1],
        outputCol="features",
    )
    assembled = assembler.transform(iris_sdf)

    # Keep only labels below 2 so the result is a binomial problem.
    binomial = assembled.filter(assembled.label < 2).select("features", "label")
    binomial.cache()
    return binomial
def split_datasets(self):
    """Assemble the feature vector and split the data by timestamp.

    Reads ``self._datasets``, adds a ``features`` vector column built from
    the fixed list of columns below, and stores the result in two parts:

    * ``self._train_datasets``: rows with ``timestamp`` at or before the
      cutoff.
    * ``self._test_datasets``: rows with ``timestamp`` after the cutoff.

    The cutoff is ``1494691186 - 24 * 60 * 60``, i.e. 86400 below the
    reference value. NOTE(review): this looks like a Unix epoch in seconds
    minus one day -- confirm the unit of the ``timestamp`` column.
    """
    # After one-hot encoding the feature columns are no longer the original
    # ones, so the list of feature columns is redefined here.
    feature_cols = [
        "price",
        "cms_group_id_value",
        "final_gender_code_value",
        "age_level_value",
        "shopping_level_value",
        "occupation_value",
        "pid_value",
        "pl_onehot_value",
        "nucl_onehot_value",
    ]

    assembler = VectorAssembler()
    assembler = assembler.setInputCols(feature_cols)
    assembler = assembler.setOutputCol("features")
    assembled = assembler.transform(self._datasets)

    # Single boundary shared by both filters so the two parts are disjoint
    # and together cover every row with a non-null timestamp.
    cutoff = 1494691186 - 24 * 60 * 60

    self._train_datasets = assembled.filter(assembled.timestamp <= cutoff)
    self._test_datasets = assembled.filter(assembled.timestamp > cutoff)