def train(self, train_datasets):
    # Build a logistic regression estimator and fit it on the training set
    lr = LogisticRegression()
    model = lr.setLabelCol("clk").setFeaturesCol("features").fit(train_datasets)
    model.save(self.MODEL_PATH)
    return model
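For completeness, a model saved this way can be loaded back for scoring. A minimal sketch, assuming the same self.MODEL_PATH and a hypothetical test_datasets DataFrame that already has a "features" column:

from pyspark.ml.classification import LogisticRegressionModel

# Load the persisted model and score new data (test_datasets is hypothetical)
model = LogisticRegressionModel.load(self.MODEL_PATH)
predictions = model.transform(test_datasets)
predictions.select("clk", "probability", "prediction").show()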
    except Exception as e:
        # Fall back to zero weights when no user weights are available
        weights = [0.0] * 10
    return row.article_id, row.user_id, row.channel_id, \
           Vectors.dense(row.articlevector), Vectors.dense(weights), \
           Vectors.dense(row.article_weights), int(row.clicked)

train_vector = train_data.rdd.map(get_user_weights).toDF(columns)
# train_vector.show()

# Prepare the data and train the model
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel, LogisticRegression

train = VectorAssembler().setInputCols(columns[2:6]).setOutputCol("features").transform(train_vector)
# print(train['features'])
lr = LogisticRegression()
model = lr.setLabelCol("clicked").setFeaturesCol("features").fit(train)
model.save("D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/LR")

# Load the model back and continue processing
model = LogisticRegressionModel.load("D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/LR")
res_transform = model.transform(train)
res_transform.select(["clicked", "probability", "prediction"]).show()

def vector_to_double(row):
    # Keep the true label and the predicted probability of the positive class
    return float(row.clicked), float(row.probability[1])

score_label = res_transform.select(["clicked", "probability"]).rdd.map(vector_to_double)

from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np
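The sklearn imports above are presumably meant to evaluate score_label offline; a minimal sketch of how the (label, probability) pairs are typically turned into AUC and accuracy numbers (the 0.5 decision threshold is an assumption):

arr = np.array(score_label.collect())
# Column 0: true click label; column 1: predicted probability of a click
print("AUC:", roc_auc_score(arr[:, 0], arr[:, 1]))
print("accuracy:", accuracy_score(arr[:, 0], arr[:, 1] > 0.5))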
    print(item)

# Build the ML pipeline.
# Index the label column and the feature column separately, renaming the outputs.
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)

# Randomly split the dataset into a training set (70%) and a test set (30%).
trainingData, testData = df.randomSplit([0.7, 0.3])

# Configure the logistic regression parameters. Here we use setters throughout;
# a ParamMap works as well (see the Spark MLlib docs for details).
# We set 10 iterations, a regularization parameter of 0.3, and so on. All available
# parameters, along with the values already set, can be listed via explainParams().
lr = LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol(
    "indexedFeatures").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print("LogisticRegression parameters:\n" + lr.explainParams())

# Set up a labelConverter to map the predicted indices back to string labels.
labelConverter = IndexToString().setInputCol("prediction").setOutputCol(
    "predictedLabel").setLabels(labelIndexer.labels)

# Build the pipeline, set the stages, then call fit() to train the model.
lrPipeline = Pipeline().setStages([labelIndexer, featureIndexer, lr, labelConverter])
lrPipelineModel = lrPipeline.fit(trainingData)

# A pipeline is essentially an Estimator; calling fit() on it produces a PipelineModel,
# which is essentially a Transformer. The PipelineModel can then call transform() to
# predict, producing a new DataFrame, i.e. validating the trained model on the test set.
lrPredictions = lrPipelineModel.transform(testData)

# Finally we can print the predictions: select picks the columns to output,
# collect fetches all rows, and a loop prints each row.
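The printing step described by the last comment is missing from the snippet; a minimal sketch of it (the column choice just follows the stages defined above):

rows = lrPredictions.select("predictedLabel", "label", "features").collect()
for row in rows:
    print(row)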
student.show()
a = student.returnValue()

# test 2
java_import(gateway.jvm, "org.apache.spark.ml.feature.Abs")
dd = gateway.jvm.Abs()
gateway.help(dd)

# test 3
bin_ = Binarizer(threshold=0, inputCol='random', outputCol='bin_feature')
abs_ = Abs(inputCol='random', outputCol='abs_feature')
vc = VectorAssembler(inputCols=['random', 'abs_feature'], outputCol="features")
lr = LogisticRegression()
lr.setLabelCol("bin_feature")

pipeline = Pipeline(stages=[bin_, abs_, vc, lr])
model = pipeline.fit(df)
bin_df = model.transform(df)
bin_df.show()

print('load model and save model')
print("---*-***--" * 20)
model.write().overwrite().save("./abs.model")  # save

# load the pipeline model
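The load step the trailing comment refers to is cut off; a minimal sketch using the path saved above. PipelineModel.load is the standard PySpark call, though loading will only succeed if the custom Abs stage implements the MLReadable/MLWritable machinery:

from pyspark.ml import PipelineModel

# Load the persisted pipeline model and re-apply it to the same DataFrame
pip_model = PipelineModel.load("./abs.model")
pip_model.transform(df).show()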