Example #1
def func2():
    """
    Machine-learning Pipeline workflow
    :return:
    """
    row_df = sqlContext.read.format("csv").option("header", True).option("delimiter", "\t").load(Path + "train.tsv")
    df = row_df.select(["url", "alchemy_category"]  # columns that need no conversion
                       + [replace_question(col(column)).cast("double").alias(column) for column in
                          row_df.columns[4:]])  # columns to convert to double
    train_df, test_df = df.randomSplit([0.7, 0.3])
    ### Build the ML Pipeline
    stringIndexer = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")  # create the indexer to encode the category string as a numeric index
    encoder = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_indexVec")
    assemblerInputs = ["alchemy_category_indexVec"] + row_df.columns[4:-1]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])
    print(pipeline.getStages())

    ### Run the Pipeline to preprocess the data and train
    pipelineModel = pipeline.fit(train_df)  # train
    print(pipelineModel.stages[3])  # stage index 3 (the decision tree) yields the trained model; inspect it here
    print(pipelineModel.stages[3].toDebugString)

    #### Predict with the fitted pipeline
    predicted = pipelineModel.transform(test_df)
    print(predicted.columns)
    predicted.select("url", "features", "rawPrediction", "probability", "label", "prediction").show(5)
    predicted.select("probability", "prediction").take(5)

    #### Evaluate the model (AUC)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    auc = evaluator.evaluate(predicted)
    print("auc:", auc)

    # grid of candidate hyperparameters; search for the best combination
    paramGrid = ParamGridBuilder().addGrid(dt.impurity, ["gini", "entropy"]).addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)  # trainRatio=0.8 splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(train_df)
    bestModel = tvs_pipelineModel.stages[3].bestModel
    print("bestModel", bestModel)
    predictions = tvs_pipelineModel.transform(test_df)
    auc2 = evaluator.evaluate(predictions)
    print("auc2:", auc2)
Example #2
# Note that the underlying storage is a SparseVector
print(df3.select('features').take(1))

# 5. Binary classification with DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            impurity='gini',
                            maxDepth=10,
                            maxBins=14)
dt_model = dt.fit(df3)
print(dt_model)
dt4 = dt_model.transform(df3)

# 6. Build the pipeline
pipeline = Pipeline(stages=[categoryIndexer, encoder, assembler, dt])
print(pipeline.getStages())

# 7. Preprocess the data and train with the pipeline
# Training runs the full pipeline over the training data, so it takes a while; the result is a pipelineModel
pipelineModel = pipeline.fit(train_df)
print(pipelineModel.stages[3])

# We can go further and inspect the decision tree's rules with toDebugString
print(pipelineModel.stages[3].toDebugString)

# 8. Predict with the pipelineModel
predicted = pipelineModel.transform(test_df)
# Inspect the columns after prediction; three new fields appear
print(predicted.columns)
predicted.select('url', 'features', 'rawPrediction', 'probability', 'label',
                 'prediction').show(10)
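
Note: Example #2 relies on categoryIndexer, encoder, and assembler defined
earlier in its source. A minimal sketch of what those stages might look like,
assuming the same StumbleUpon-style schema as Example #1 (numeric_cols, the
list of numeric feature columns, is an assumption):

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

categoryIndexer = StringIndexer(inputCol="alchemy_category",
                                outputCol="alchemy_category_index")
encoder = OneHotEncoder(dropLast=False,
                        inputCol="alchemy_category_index",
                        outputCol="alchemy_category_indexVec")
# numeric_cols: hypothetical list of the numeric feature column names
assembler = VectorAssembler(inputCols=["alchemy_category_indexVec"] + numeric_cols,
                            outputCol="features")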
Example #3
                  cat_dist[idx].labels[i])

    print("===========SplitData====================")
    train_df, test_df = df.randomSplit(env.split_prop)

    print("===========VectorAssembler====================")
    feature = df.columns[1:len(df.columns) - 1]
    assembler = VectorAssembler(inputCols=feature, outputCol="features")

    print("=============pipeline==================")
    model = LinearSVC(maxIter=5,
                      regParam=0.01,
                      labelCol=lable_name[0],
                      featuresCol="features")
    pipeline = Pipeline(stages=[assembler, model])
    pipeline.getStages()

    print("===========TaintingAndTesting====================")
    pipelineModel = pipeline.fit(train_df)
    predicted = pipelineModel.transform(test_df)

    print("===========PredictedAUC====================")
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol=lable_name[0],
                                              metricName="areaUnderROC")
    auc = evaluator.evaluate(predicted)
    print(auc)

    print("===========PredictedScore====================")
    Multi_evaluator = MulticlassClassificationEvaluator(labelCol=lable_name[0])
    Accuracy = Multi_evaluator.evaluate(predicted,
                                        {Multi_evaluator.metricName: "accuracy"})
Example #4
def magic_loop3(pipelines, grid, train, test, cvfolds=3):
    best_score = 0.0  # starting floor for a metric we maximize
    best_grid = None  # initialize the result
    # This loop tries the pipelines one by one: it is not limited to 2,
    # it walks through every pipeline present in the list
    for pipe in pipelines:
    for pipe in pipelines:
        try:
            # copy so the original is not modified (uses little RAM anyway)
            pipe = pipe.copy()
            # pipeline stages
            stages = pipe.getStages()
            # find the predictor (the ML engine)
            predictor = [
                stage for stage in stages
                if "pyspark.ml.classification" in str(type(stage))
                or "pyspark.ml.regression" in str(type(stage))
            ][0]
            predictor_i = stages.index(predictor)
            stringer = [
                stage for stage in pipe.getStages()
                if "pyspark.ml.feature.StringIndexer" in str(type(stage))
            ][0]
            if DEBUG: print("pipeline:\n%s\n\n" % stages)
            if DEBUG:
                print("predictor=%s (index %s, type %s), stringer=%s (%s)\n" %
                      (predictor, stages.index(predictor), type(predictor),
                       stringer, type(stringer)))
            # the stages before the predictor are not affected by the CV's parameter changes
            prepipe = Pipeline(stages=stages[0:(predictor_i)])
            if DEBUG: print("pre pipeline:\n%s\n\n" % prepipe.getStages())
            print("Starting fit on prepipe...")
            # fit the pre-pipeline
            prepipem = prepipe.fit(train)
            print("Starting transform on prepipe...")
            train2 = prepipem.transform(train)
            test2 = prepipem.transform(test)
            # the ML engine itself is what cross-validation varies
            postpipe = Pipeline(stages=stages[(predictor_i):len(stages)])
            if DEBUG: print("post pipeline:\n%s\n\n" % postpipe.getStages())
            # build the Cross Validator
            gridcv = CrossValidator(
                estimator=postpipe,
                estimatorParamMaps=grid,
                evaluator=MulticlassClassificationEvaluator(
                    labelCol="DEPARTURE_DELAY"),
                numFolds=cvfolds)
            # extract the name and strip the noise that type() returns
            predictr = [
                str(type(stage)) for stage in pipe.getStages()
                if "pyspark.ml.classification" in str(type(stage))
                or "pyspark.ml.regression" in str(type(stage))
            ][0]
            predictr = re.sub(
                r"^<class 'pyspark\.ml\.(classification|regression)\.([a-zA-Z0-9]+)'>",
                r"\2", predictr)
            # fit the model
            print("Starting fit on %s..." % predictr)
            gridcvm = gridcv.fit(train2)
            # apply the model
            print("Starting transform on %s..." % predictr)
            preds = gridcvm.transform(test2)
            # grab the evaluator to use for error measurement
            ev = gridcvm.getEvaluator()
            # get the error metric name
            metric = ev.getMetricName()
            print("Starting error calculation on %s..." % predictr)
            # get the error value
            error = ev.evaluate(preds)
            print("Error %s: %f" % (metric, error))
            # if it beats the previous best, keep it; the last one kept
            # is what this function returns
            if error > best_score:
                print("%s is the best model so far: %f (%s)" %
                      (predictr, error, metric))
                best_score = error  # was missing: without it every model overwrote the "best"
                best_grid = gridcvm
        # error (and horror) handling
        except Exception as e:
            print('Error during Magic Loop:', e)
    return best_grid
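
Note: magic_loop3 returns a fitted CrossValidatorModel. A small hedged sketch
for pulling out the grid point that won (avgMetrics and getEstimatorParamMaps
are standard pyspark attributes; the variables mirror Example #5):

best = magic_loop3(pipelines, paramGrid, train, test, cvfolds=3)
if best is not None:
    avg = best.avgMetrics  # one averaged metric per grid point
    winning = best.getEstimatorParamMaps()[avg.index(max(avg))]
    print({p.name: v for p, v in winning.items()})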
Example #5
# parameters for the magic loop
paramGrid = ParamGridBuilder() \
                .addGrid(glr.family, ["Gaussian", "Poisson", "Tweedie"]) \
                .addGrid(glr.maxIter, [1, 2, 3]) \
                .addGrid(lr.maxIter, [1, 2, 3]) \
                .addGrid(lr.elasticNetParam, [0.1,0.2,0.3]) \
                .build()
magic = magic_loop3(pipelines, paramGrid, train, test, 3)
# get the estimator pipeline that the magic loop returned
best_model = magic.getEstimator()
# get the classifier stage
best_estimator = best_model.getStages()[0]
# keep the best parameters in a variable
best_estimator_params = best_estimator.extractParamMap()
# get the evaluator for measuring errors
ev0 = magic.getEvaluator()
###################
# error measurement
# print the parameters and the best model
print(
    "%s (best model) Parameters: maxIter=%d, elasticNetParam=%f" %
    (re.sub(
        r"^<class 'pyspark\.ml\.(classification|regression)\.([a-zA-Z0-9]+)'>",
        r"\2", str(type(best_estimator))),
     best_estimator_params[best_estimator.maxIter],
     best_estimator_params[best_estimator.elasticNetParam]))
# here I checked what the errors looked like
preds = magic.transform(
    Pipeline(stages=pipeline1.getStages()[0:6]).fit(test).transform(test))
print("Error %s: %f" % (ev0.getMetricName(), ev0.evaluate(preds)))
Example #6
class pipemodeler:
    def __init__(self, DF, featurestages, classifier, classifiergrid=None):
        self.DF = DF
        self.featurestages = featurestages
        self.classifier = classifier
        self.classifiergrid = classifiergrid

    def train(self):
        print('Building stages...')
        stages = []
        if (type(self.featurestages) != list):
            self.featurestages = [self.featurestages]
        stages += self.featurestages

        #In case there is word2vec which has negative features, scale the features
        #to nonnegative values because naive bayes requires that
        if (('Word2Vec' in str(stages))
                and ('NaiveBayes' in str(self.classifier))):
            print(
                'Word2Vec and NaiveBayes detected, scaling to nonnegative [0.0,1.0]'
            )
            stages[-1].setOutputCol('prefeatures')
            scaler = MinMaxScaler(inputCol='prefeatures', outputCol='features')
            stages = stages + [scaler]

        stages += [self.classifier]
        self.pipeline = Pipeline(stages=stages)

        print('Using the following stages: ' + str(self.pipeline.getStages()))
        print('Training model...')
        if self.classifiergrid is None:
            print('Training without a Parameter Grid...')
            dftrain, dftest = self.DF.randomSplit([0.80, 0.20])
            model = self.pipeline.fit(dftrain)
            self.predictions = model.transform(dftest)
            self.model = model
        else:
            # print('Training with a Parameter Grid...')
            # tvs = TrainValidationSplit(estimator=self.pipeline,
            #                            estimatorParamMaps=self.classifiergrid,
            #                            evaluator=BinaryClassificationEvaluator(),
            #                            parallelism=4,
            #                            trainRatio=0.7)
            # dftrain, dftest = self.DF.randomSplit([0.70, 0.30])
            # model = tvs.fit(dftrain)
            print('Cross Validation Hyperparameter Tuning...')
            cv = CrossValidator(estimator=self.pipeline,
                                estimatorParamMaps=self.classifiergrid,
                                evaluator=BinaryClassificationEvaluator(),
                                parallelism=4,
                                numFolds=5)
            dftrain, dftest = self.DF.randomSplit([0.70, 0.30])
            model = cv.fit(dftrain)
            self.predictions = model.transform(dftest)
            self.model = model

    def performancerdd(self):
        self.calculator = 'RDDs'
        print('Calculating performance metrics using RDDs...')
        predictionRDD = self.predictions.select(
            ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))

        binmetrics = BinaryClassificationMetrics(predictionRDD)
        metrics = MulticlassMetrics(predictionRDD)

        self.areaUnderROC = binmetrics.areaUnderROC
        self.areaUnderPR = binmetrics.areaUnderPR
        self.confusionMatrix = metrics.confusionMatrix().toArray()
        self.accuracy = metrics.accuracy
        self.precision = metrics.precision()
        self.recall = metrics.recall()
        self.f1measure = metrics.fMeasure()
        self.falsePositive = metrics.falsePositiveRate(1.0)
        # in the binary case, the false-positive rate of class 0 equals the false-negative rate of class 1
        self.falseNegative = metrics.falsePositiveRate(0.0)

    def performance(self):
        self.calculator = 'DataFrame operations'
        print('Calculating performance metrics using DataFrame operations...')
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction")
        self.areaUnderROC = evaluator.evaluate(self.predictions)
        preds = self.predictions
        fp = preds.filter(preds.label < preds.prediction).count()
        fn = preds.filter(preds.label > preds.prediction).count()
        tp = preds.filter(preds.label == 1.0).filter(
            preds.prediction == 1.0).count()
        tn = preds.filter(preds.label == 0.0).filter(
            preds.prediction == 0.0).count()
        total = fp + fn + tp + tn
        self.confusionMatrix = [[tn, fn], [fp, tp]]
        self.accuracy = (tp + tn) / total
        if (tp + fp):
            self.precision = tp / (tp + fp)
        else:
            self.precision = 0
        if (tp + fn):
            self.recall = tp / (tp + fn)
        else:
            self.recall = 0
        if (self.precision + self.recall):
            self.f1measure = 2 * self.precision * self.recall / (
                self.precision + self.recall)
        else:
            self.f1measure = 0
        if (fp + tn):
            self.falsePositive = fp / (fp + tn)
        else:
            self.falsePositive = 0
        if (fn + tp):
            self.falseNegative = fn / (fn + tp)
        else:
            self.falseNegative = 0

    def printperformance(self):
        print('Stages: ' + str(self.pipeline.getStages()))
        print('Performance calculated using ' + self.calculator)
        print('areaUnderROC = ' + str(self.areaUnderROC))
        #        print('areaUnderPR = ' + str(self.areaUnderPR))
        print('confusionMatrix:')
        print(self.confusionMatrix)
        print('accuracy = ' + str(self.accuracy))
        print('precision = ' + str(self.precision))
        print('recall = ' + str(self.recall))
        print('f1measure = ' + str(self.f1measure))
        print('falsePositive = ' + str(self.falsePositive))
        print('falseNegative = ' + str(self.falseNegative))
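
Note: a hypothetical usage sketch of pipemodeler, assuming a DataFrame df
with "text" and "label" columns (the column names and stages are assumptions,
not part of the class above):

from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(labelCol="label", featuresCol="features")

pm = pipemodeler(df, featurestages=[tokenizer, tf], classifier=lr)
pm.train()
pm.performance()
pm.printperformance()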
Example #7
def func1():
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    print("count", hour_df.count())
    print("columns:", hour_df.columns)
    # drop unneeded columns
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    hour_df.printSchema()  # inspect the schema
    # cast every column to double
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    hour_df.printSchema()  # schema after the cast
    hour_df.show(3)  # first 3 rows
    # split the data into train_df and test_df at a 0.7:0.3 ratio
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # build the list of feature columns
    featureCols = hour_df.columns[:-1]
    print("featureCols:", featureCols)
    # build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    print("查看pipeline流程:", dt_pipeline.getStages())
    # 训练
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("查看训练完成后的模型:", dt_pipelineModel.stages[2].toDebugString[:500])
    # 使用transform预测
    predicted = dt_pipelineModel.transform(test_df)
    print("查看新增的字段:", predicted.columns)
    print("查看预测的结果:", predicted.show(2))
    ###评估模型
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    predicted_df = dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    print("rmse:", rmse)
    ## find the best model with TrainValidationSplit
    # DecisionTreeRegressor only supports impurity="variance",
    # so only maxDepth and maxBins are searched here
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)  # trainRatio=0.8 splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = tvs_pipelineModel.stages[2].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    ## predict with the best (fitted) pipeline model
    predictions = tvs_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
Example #8
en_coeffs_df.query('weight == 0.0').shape[0]/en_coeffs_df.shape[0]


# In[49]:

en_coeffs_df.query('weight == 0.0').head(15)


# In[50]:

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


# In[51]:

en_lr_estimator.getStages()


# In[52]:

grid = ParamGridBuilder() \
    .addGrid(en_lr.regParam, [0., 0.01, 0.02]) \
    .addGrid(en_lr.elasticNetParam, [0., 0.2, 0.4]) \
    .build()


# In[53]:

grid


# In[54]:

all_models = []
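
Note: the cells above import TrainValidationSplit but then start a manual
all_models loop. A minimal alternative sketch, assuming en_lr_estimator is a
Pipeline whose last stage is the en_lr LogisticRegression and that a `train`
DataFrame exists in an earlier cell:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

tvs = TrainValidationSplit(estimator=en_lr_estimator,
                           estimatorParamMaps=grid,
                           evaluator=BinaryClassificationEvaluator(),
                           trainRatio=0.8)
# tvs_model = tvs.fit(train)  # `train` is an assumed DataFrame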
Example #9
def func1():
    rawData = sc.textFile(Path + "covtype.data", minPartitions=40)
    lines = rawData.map(lambda x: x.split(","))
    print("lines.count()::", lines.count())
    fieldNum = len(lines.first())
    print("字段数:", fieldNum)
    fields = [
        StructField(name="f" + str(i), dataType=StringType(), nullable=True)
        for i in range(fieldNum)
    ]
    schema = StructType(fields)
    covtype_df = sqlContext.createDataFrame(data=lines, schema=schema)
    print("covtype_df.columns::", covtype_df.columns)
    print("covtype_df.printSchema()::", covtype_df.printSchema())
    # cast the columns to double
    covtype_df = covtype_df.select([
        col(column).cast("double").alias(column)
        for column in covtype_df.columns
    ])
    print("装换后:covtype_df.printSchema():", covtype_df.printSchema())
    #创建特征字段list
    featureCols = covtype_df.columns[:54]
    print("featureCols:", featureCols)
    #设置label字段,第54个字段是label,值范围是1-7,但是训练需从0开始,所以covtype_df["f54"]-1,表示将值都范围转到0-6了
    covtype_df = covtype_df.withColumn(colName="label",
                                       col=covtype_df["f54"] - 1).drop("f54")
    print("第一项数据:", covtype_df.show(1))
    #将数据分为train_df和test_df,比例为0.7:0.3
    train_df, test_df = covtype_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label",
                                featuresCol="features",
                                maxDepth=5,
                                maxBins=20)
    dt_pipeline = Pipeline(stages=[vectorAssembler, dt])
    print("查看pipeline流程:", dt_pipeline.getStages())
    #训练
    pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("查看训练完成后的模型:", pipelineModel.stages[1].toDebugString[:500])
    #使用transform预测
    predicted = pipelineModel.transform(test_df)
    print("查看新增的字段:", predicted.columns)
    print("查看预测的结果:", predicted.show(2))
    ###评估模型
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predicted)
    print("accuracy::", accuracy)
    ## find the best model with TrainValidationSplit
    paramGrid = ParamGridBuilder().addGrid(
        dt.impurity,
        ["gini", "entropy"]).addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
            dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)  # trainRatio=0.8 splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[vectorAssembler, tvs])
    pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = pipelineModel.stages[1].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    ## predict with the best (fitted) pipeline model
    predictions = pipelineModel.transform(test_df)
    result = predictions.withColumnRenamed("f0", "Elevation").withColumnRenamed("f1", "Aspect").withColumnRenamed("f2", "Slope")\
        .withColumnRenamed("f3", "VerticalDistance").withColumnRenamed("f4", "HorizontalDistance").withColumnRenamed("f5", "Shade")
    result.select("Elevation", "Aspect", "Slope", "VerticalDistance",
                  "HorizontalDistance", "Shade", "label", "prediction").show(10)
    accuracy2 = evaluator.evaluate(predictions)
    print("accuracy2:", accuracy2)
Example #10
    preprocessed_data.cache()


    # keep the 80% portion of an 80/20 split for training (CV supplies the validation folds)
    train_preprocessed_data = preprocessed_data.randomSplit([.8, .2])[0]
    train_preprocessed_data.cache()

    # model
    gbmodel = GBTRegressor(featuresCol="features", labelCol=target)

    # model tuning process
    evaluator = RegressionEvaluator(labelCol=target)
    paramGrid = (ParamGridBuilder()
             .addGrid(gbmodel.maxDepth, [2, 4, 6])
             .addGrid(gbmodel.maxBins, [20, 60])
             .addGrid(gbmodel.maxIter, [10, 20])
             .addGrid(gbmodel.minInfoGain, [0.0, 0.05])
             .build())
    cv = CrossValidator(estimator=gbmodel, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

    pipeline_cv = cv.fit(train_preprocessed_data)


    # final pipeline to deploy is the preprocessing steps + best model (best hyperparameters)
    final_pipeline = Pipeline(stages=[*preprocessing_pipeline.getStages(), pipeline_cv.bestModel])

    # fit the preprocessing stages on all data (the GBT stage is already a fitted model) and save to disk
    final_model = final_pipeline.fit(scorecard_data_cleaned)
    final_model.write().overwrite().save(web_app_model_path)

    sc.stop()
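
Note: a minimal sketch of loading the persisted pipeline back for serving,
reusing the same web_app_model_path (new_data is a hypothetical input
DataFrame):

from pyspark.ml import PipelineModel

served_model = PipelineModel.load(web_app_model_path)
# predictions = served_model.transform(new_data)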