Example #1
# MAGIC %md Split the dataset into training and test sets.

# COMMAND ----------

train, test = triazines.randomSplit([0.85, 0.15], seed=1)

# COMMAND ----------

# MAGIC %md Train the quantile regressor on the training data.

# COMMAND ----------

from mmlspark.lightgbm import LightGBMRegressor

model = LightGBMRegressor(objective='quantile',
                          alpha=0.2,
                          learningRate=0.3,
                          numLeaves=31).fit(train)

# COMMAND ----------

# MAGIC %md We can save the LightGBM model to a file and load it back using LightGBM's native representation.

# COMMAND ----------

from mmlspark.lightgbm import LightGBMRegressionModel

model.saveNativeModel("mymodel")
model = LightGBMRegressionModel.loadNativeModelFromFile("mymodel")

# COMMAND ----------
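# MAGIC %md A hedged extra step, not part of the original snippet: score the held-out `test` split with the reloaded model, assuming `test` still carries the `features` column the model was trained on.

# COMMAND ----------

# Generate predictions with the reloaded native model and inspect them
scored = model.transform(test)
scored.show()

# COMMAND ----------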
Example #2
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from mmlspark.lightgbm import LightGBMRegressor

# Create a Spark session with Hive support
spark = (SparkSession
         .builder
         .appName("news")
         .enableHiveSupport()
         .getOrCreate())

# Read raw data
df = spark.read.csv('/home/worker/data/Data7602.csv', header=True, inferSchema=True, mode="DROPMALFORMED", encoding='UTF-8').drop("Area")
# Duplicate the rows three times (8x the data) to enlarge the sample
df = df.union(df)
df = df.union(df)
df = df.union(df)

print("==== 生データ ====")
df.show(truncate=False)

assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()


print("==== LightGBMの学習 ====")
model = LightGBMRegressor(alpha=0.3,
                          learningRate=0.3,
                          numIterations=100,
                          numLeaves=31,
                          featuresCol='変量',
                          labelCol='geo_count').fit(feature_vectors)


print("==== 元のデータフレーム行数 ====")
print((df.count(), len(df.columns)))
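A hedged addition, not in the original example: once the regressor is fitted, scoring the same assembled feature vectors shows the predicted values next to the `geo_count` label. `model` and `feature_vectors` are the names defined above; `prediction` is the default Spark ML output column name.

print("==== Predictions ====")
predictions = model.transform(feature_vectors)
predictions.select('geo_count', 'prediction').show()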
Example #3
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from mmlspark.lightgbm import LightGBMRegressor

# Create or reuse a Spark session
spark = SparkSession.builder.getOrCreate()

# df = spark.createDataFrame(pd_df).repartition(1)

# path="/home/zhangzy/dataset/boston.csv"
path = "E:\\test\\mmlspark"
# pd_df.to_csv(path,index=False)
df = spark.read.csv(path, header=True, inferSchema=True)
feature_cols = df.columns[2:]

train_data, test_data = df.randomSplit([0.75, 0.25], seed=42)
train_data.cache()
test_data.cache()

featurizer = VectorAssembler(inputCols=feature_cols, outputCol='features')
lr_train_data = featurizer.transform(train_data).select('target', 'features')
lr_test_data = featurizer.transform(test_data).select('target', 'features')

lgr = LightGBMRegressor(
    objective='quantile',
    alpha=0.2,
    learningRate=0.3,
    numLeaves=31,
    labelCol='target',
    numIterations=100,
)

repartitioned_data = lr_train_data.repartition(1).cache()

lg_model = lgr.fit(repartitioned_data)
lg_predictions = lg_model.transform(lr_test_data)

lg_predictions.show()
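A hedged follow-up sketch, not part of the original snippet: the held-out predictions can be scored with Spark ML's built-in RegressionEvaluator. `target` is the label column used above, and `prediction` is the default output column of the fitted model.

from pyspark.ml.evaluation import RegressionEvaluator

# Compute RMSE on the held-out split (columns as defined above)
evaluator = RegressionEvaluator(labelCol='target', predictionCol='prediction', metricName='rmse')
print("Test RMSE: %.4f" % evaluator.evaluate(lg_predictions))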