# MAGIC %md Split the dataset into train and test

# COMMAND ----------

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = triazines.randomSplit([0.85, 0.15], seed=1)

# COMMAND ----------

# MAGIC %md Train the quantile regressor on the training data.

# COMMAND ----------

from mmlspark.lightgbm import LightGBMRegressor

# Quantile objective with alpha=0.2 fits the conditional 20th percentile.
quantile_regressor = LightGBMRegressor(
    objective='quantile',
    alpha=0.2,
    learningRate=0.3,
    numLeaves=31,
)
model = quantile_regressor.fit(train)

# COMMAND ----------

# MAGIC %md We can save and load LightGBM to a file using the LightGBM native representation

# COMMAND ----------

from mmlspark.lightgbm import LightGBMRegressionModel

# Round-trip the fitted model through LightGBM's native on-disk format.
model.saveNativeModel("mymodel")
model = LightGBMRegressionModel.loadNativeModelFromFile("mymodel")

# COMMAND ----------
# NOTE(review): this chunk begins mid-expression — the opening
# "spark = (SparkSession" of this builder chain lies outside the visible
# source; the leading "." calls and the final ")" close that expression.
.builder
        .appName("news")
        .enableHiveSupport()
        .getOrCreate())

# Read raw data
# Load the CSV with inferred column types, silently dropping malformed rows
# (mode="DROPMALFORMED"); the "Area" column is discarded immediately.
df = spark.read.csv('/home/worker/data/Data7602.csv',
                    header=True,
                    inferSchema=True,
                    mode="DROPMALFORMED",
                    encoding='UTF-8').drop("Area")

# Each self-union doubles the row count, so three unions inflate the data 8x —
# presumably to enlarge the training set for benchmarking; confirm intent.
df = df.union(df)
df = df.union(df)
df = df.union(df)

print("==== 生データ ====")  # "raw data"
df.show(truncate=False)

# Pack every column except the first into a single feature-vector column
# named "変量" ("variates"); the label column "geo_count" is used below.
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()

print("==== LightGBMの学習 ====")  # "LightGBM training"
model = LightGBMRegressor(alpha=0.3,
                          learningRate=0.3,
                          numIterations=100,
                          numLeaves=31,
                          featuresCol='変量',
                          labelCol='geo_count').fit(feature_vectors)

print("==== 元のデータフレーム行数 ====")  # "row/column count of the original dataframe"
print((df.count(), len(df.columns)))
# Load the dataset from a local CSV and train a LightGBM quantile regressor,
# then score the held-out rows.
path = "E:\\test\\mmlspark"
df = spark.read.csv(path, header=True, inferSchema=True)

# Every column after the first two is a predictor.
feature_cols = df.columns[2:]

# 75/25 split with a fixed seed for reproducibility; cache both halves
# because each is consumed more than once below.
train_data, test_data = df.randomSplit([0.75, 0.25], seed=42)
train_data.cache()
test_data.cache()

# Assemble the predictors into one 'features' vector column and keep only
# the label and the assembled vector.
featurizer = VectorAssembler(inputCols=feature_cols, outputCol='features')
lr_train_data = featurizer.transform(train_data).select('target', 'features')
lr_test_data = featurizer.transform(test_data).select('target', 'features')

# Quantile objective with alpha=0.2 fits the conditional 20th percentile.
lgr = LightGBMRegressor(
    objective='quantile',
    alpha=0.2,
    learningRate=0.3,
    numLeaves=31,
    labelCol='target',
    numIterations=100,
)

# Collapse the training data to a single cached partition before fitting,
# then transform the test split and display the predictions.
repartitioned_data = lr_train_data.repartition(1).cache()
lg_model = lgr.fit(repartitioned_data)
lg_predictions = lg_model.transform(lr_test_data)
lg_predictions.show()