def train(self, train_datasets):
    # Create a logistic regression estimator, train it, and persist the model
    lr = LogisticRegression()
    model = lr.setLabelCol("clk").setFeaturesCol("features").fit(train_datasets)
    model.save(self.MODEL_PATH)
    return model

def get_user_weights(row):
    try:
        weights = list(row.user_weights)  # assumed field name; the original weight lookup is not shown in this fragment
    except Exception:
        weights = [0.0] * 10
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(row.articlevector), Vectors.dense(weights), Vectors.dense(row.article_weights), int(row.clicked)

train_vector = train_data.rdd.map(get_user_weights).toDF(columns)
# train_vector.show()

# prepare data and train model
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel, LogisticRegression

train = VectorAssembler().setInputCols(columns[2:6]).setOutputCol("features").transform(train_vector)
# print(train['features'])

lr = LogisticRegression()
model = lr.setLabelCol("clicked").setFeaturesCol("features").fit(train)
model.save("D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/LR")

# load the saved model and continue processing
model = LogisticRegressionModel.load("D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/LR")
res_transform = model.transform(train)
res_transform.select(["clicked", "probability", "prediction"]).show()

def vector_to_double(row):
    return float(row.clicked), float(row.probability[1])

score_label = res_transform.select(["clicked", "probability"]).rdd.map(vector_to_double)

from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np
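# A minimal sketch of the evaluation step, assuming a 0.5 decision threshold for accuracy;
# score_label holds (label, P(clicked=1)) pairs produced above.
scores = np.array(score_label.collect())
print("AUC:", roc_auc_score(scores[:, 0], scores[:, 1]))
print("accuracy:", accuracy_score(scores[:, 0], (scores[:, 1] > 0.5).astype(float)))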

    # print each assembled row (loop reconstructed; `df` is assumed to be the DataFrame built above)
    for item in df.collect():
        print(item)

    # Build the ML pipeline.
    # Index the label column and the feature column separately, renaming the outputs.
    labelIndexer = StringIndexer().setInputCol("label").setOutputCol(
        "indexedLabel").fit(df)
    featureIndexer = VectorIndexer().setInputCol("features").setOutputCol(
        "indexedFeatures").fit(df)

    # Randomly split the dataset into a training set (70%) and a test set (30%).
    trainingData, testData = df.randomSplit([0.7, 0.3])
    # Configure the logistic regression parameters. Here we use the setter methods;
    # a ParamMap could be used instead (see the Spark MLlib documentation).
    # We set the number of iterations to 10 and the regularization parameter to 0.3;
    # explainParams() lists every configurable parameter along with the values already set.
    lr = LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol(
        "indexedFeatures").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
    print("LogisticRegression parameters:\n" + lr.explainParams())
    # Add a labelConverter that maps the numeric predictions back to the original string labels.
    labelConverter = IndexToString().setInputCol("prediction").setOutputCol(
        "predictedLabel").setLabels(labelIndexer.labels)
    # Build the pipeline: set the stages, then call fit() to train the model.
    lrPipeline = Pipeline().setStages(
        [labelIndexer, featureIndexer, lr, labelConverter])
    lrPipelineModel = lrPipeline.fit(trainingData)

    # A pipeline is essentially an Estimator: calling fit() on it produces a PipelineModel,
    # which is essentially a Transformer. The PipelineModel can then call transform() to make
    # predictions on a new DataFrame, i.e. we validate the trained model on the test set.
    lrPredictions = lrPipelineModel.transform(testData)
    # Finally we can print the prediction results: select picks the columns to output,
    # collect fetches all rows, and a loop prints each row (sketched below).
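    # A minimal sketch of the printing step described above; the selected columns
    # follow the pipeline stages defined earlier in this example.
    for row in lrPredictions.select("predictedLabel", "label", "features", "probability").collect():
        print(row)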

        # test 1: call methods on a custom JVM object exposed through the py4j gateway
        # (`student` is assumed to have been created earlier via the gateway)
        student.show()
        a = student.returnValue()

        # test 2
        java_import(gateway.jvm, "org.apache.spark.ml.feature.Abs")
        dd = gateway.jvm.Abs()
        gateway.help(dd)

        # test 3

        bin_ = Binarizer(threshold=0, inputCol='random', outputCol='bin_feature')
        abs_ = Abs(inputCol='random', outputCol='abs_feature')
        vc = VectorAssembler(inputCols=['random', 'abs_feature'], outputCol="features")
        lr = LogisticRegression()
        lr.setLabelCol("bin_feature")

        # chain the stages into a Pipeline and fit it on the DataFrame
        pipeline = Pipeline(stages=[bin_, abs_, vc, lr])
        model = pipeline.fit(df)
        bin_df = model.transform(df)
        bin_df.show()

        print('load model and save model')
        print("---*-***--" * 20)
        model.write().overwrite().save("./abs.model")


        # load the saved pipeline model
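        # A minimal sketch, assuming the "./abs.model" path used above; note that the
        # custom Abs stage must be importable on the driver for loading to succeed.
        from pyspark.ml import PipelineModel
        loaded_model = PipelineModel.load("./abs.model")
        loaded_model.transform(df).show()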