Example #1
 def test_int_to_float(self):
     from pyspark.mllib.linalg import Vectors
     df = self.sc.parallelize([
         Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
     lr = LogisticRegression(elasticNetParam=0)
     lr.fit(df)
     lr.setElasticNetParam(0)
     lr.fit(df)
Example #2
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #3
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example #4
    def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(regParam=0.01,
                                 lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
 def test_binary_logistic_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     self.assertAlmostEqual(s.accuracy, 1.0, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary))
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #6
def main():
    df = spark.read.json(katkam_in_directory)
    schema_file = open('schema')
    schema_lines = [i.strip() for i in schema_file.readlines()]
    schema = types.StructType([types.StructField(i, types.StringType(), False) for i in schema_lines])
    schema_file.close()
    weather = spark.read.csv(weather_in_directory, schema=schema)#.withColumn('filename', functions.input_file_name())

    df = df.join(weather, 'Date/Time')

    # https://stackoverflow.com/questions/39025707/how-to-convert-arraytype-to-densevector-in-pyspark-dataframe
    to_vec = functions.UserDefinedFunction(lambda vs: Vectors.dense(vs), VectorUDT())
    get_rid_of_rain = functions.UserDefinedFunction(lambda vs: rain_gone(vs), types.LongType())

    df = df.select(get_rid_of_rain(df['Weather']).alias('label'), to_vec(df['image']).alias('features'))

    # Do machine learning
    splits = df.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # Naive Bayes Model
    #nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Logistic Regression Model
    lr = LogisticRegression()

    model = lr.fit(train)
    predictions = model.transform(test)

    # Compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # Write the final predictions dataframe to a JSON output directory
    predictions.write.json(out_directory, mode='overwrite')

    # Write the final accuracy score to a text file, tide analysis will write to the same file
    with open(out_directory + '/final-results.txt', 'w+') as fp:
        fp.write('Test set accuracy for weather analysis: ' + str(accuracy))
Example #7
def downstream_ml_func(features_df, results_dict, layer_index):
    """
        Sample implementation fo the downstream ML function
    :param features_df: Merged (struct+cnn) feature DataFrame
    :param results_dict: Dictionary object which is used to store downstream ML model performance details such as accuracy.
    :param layer_index: Layer index of the CNN of which the current features_df correspond to. The layer index is negative
                        pointing the index from the top of the CNN layers
    :return: Dictionary
    """
    lr = LogisticRegression(labelCol="label",
                            featuresCol="features",
                            maxIter=10,
                            regParam=0.5)
    model = lr.fit(features_df)
    predictions = model.transform(features_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    results_dict[layer_index] = evaluator.evaluate(predictions)
    return results_dict
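A minimal usage sketch for downstream_ml_func, assuming an active SparkSession; the toy DataFrame, values, and layer index below are assumptions standing in for the real merged (struct + CNN) features.

# Hypothetical call of downstream_ml_func; the toy data is illustrative only.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
features_df = spark.createDataFrame(
    [(1.0, Vectors.dense(0.1, 0.9)),
     (0.0, Vectors.dense(0.8, 0.2))],
    ["label", "features"])
results = downstream_ml_func(features_df, {}, layer_index=-1)
print(results)  # e.g. {-1: <accuracy on the toy data>}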
def logistic_classifier(df, conf):
    max_iter = conf["params"].get("maxIter")
    reg_param = conf["params"].get("regParam")
    elasticNetParam = conf["params"].get("elasticNetParam")
    family = conf["params"].get("family")
    weight = conf["params"].get("weightCol")
    lr = LogisticRegression(maxIter=max_iter,
                            regParam=reg_param,
                            weightCol=weight)
    if conf["tuning"].get("crossval"):
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr,
                            estimatorParamMaps=grid,
                            evaluator=evaluator)
        model = cv.fit(df)
    else:
        mlor = LogisticRegression(regParam=reg_param, weightCol=weight)
        model = mlor.fit(df)
    return model
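A hedged sketch of the conf dictionary this function reads; only the "params" and "tuning" keys used above are shown, and the concrete values are assumptions rather than settings from the original project.

# Illustrative conf for logistic_classifier (values are assumptions).
conf = {
    "params": {
        "maxIter": 50,
        "regParam": 0.01,
        "elasticNetParam": 0.0,
        "family": "binomial",
        "weightCol": "weight",
    },
    "tuning": {"crossval": False},  # True runs the CrossValidator branch instead
}
# model = logistic_classifier(df, conf)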
def Logistic():
    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    lrModel = lr.fit(train)
    lrModel.write().overwrite().save("save/bert_logistic")

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    predictions = lrModel.transform(test)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = %g " % accuracy)
 def test_model_logistic_regression_binary_class(self):
     import inspect
     import os
     this_script_dir = os.path.dirname(
         os.path.abspath(inspect.getfile(inspect.currentframe())))
     input_path = os.path.join(this_script_dir, "data",
                               "sample_libsvm_data.txt")
     original_data = self.spark.read.format("libsvm").load(input_path)
     #
     # truncate the features
     #
     self.spark.udf.register(
         "truncateFeatures", lambda x: SparseVector(5, range(0, 5),
                                                    x.toArray()[125:130]),
         VectorUDT())
     data = original_data.selectExpr(
         "label", "truncateFeatures(features) as features")
     lr = LogisticRegression(maxIter=100, tol=0.0001)
     model = lr.fit(data)
     # the name of the input for Logistic Regression is 'features'
     model_onnx = convert_sparkml(
         model, 'sparkml logistic regression',
         [('features', FloatTensorType([1, model.numFeatures]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     import pandas
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     dump_data_and_sparkml_model(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlLogisticRegression")
Example #11
def run(data, headers_feature):
    print('-*-*-*- Starting logistic regression -*-*-*-')
    start_time = datetime.datetime.now()
    print('Start time', start_time)

    # Get the training and test data
    train_data, test_data = prepare_dataset(data, headers_feature)

    # Configure the multinomial logistic regression
    lr = LogisticRegression(maxIter=200,
                            regParam=0.3,
                            elasticNetParam=0.8,
                            labelCol='genre',
                            family='multinomial')

    # Fit the classification model
    lr_model = lr.fit(train_data)

    print("Coeficientes: " + str(lr_model.coefficientMatrix))
    print("Intercepto: " + str(lr_model.interceptVector))

    data_to_validate = lr_model.transform(test_data)

    evaluator1 = BinaryClassificationEvaluator(
        labelCol='genre',
        metricName='areaUnderROC',
        rawPredictionCol='rawPrediction')
    ROC = evaluator1.evaluate(data_to_validate)
    print("{}:{}".format("ROC", ROC))

    evaluator2 = BinaryClassificationEvaluator(
        labelCol='genre',
        metricName='areaUnderPR',
        rawPredictionCol='rawPrediction')
    PR = evaluator2.evaluate(data_to_validate)
    print("{}:{}".format("PR", PR))

    end_time = datetime.datetime.now()
    print('End time', end_time)
    print('Elapsed time for logistic regression',
          end_time - start_time)
Example #12
def do_machine_learning(enterteinment_num, politics_num):
    _fp = os.path.join(os.getcwd() +
                       '/result_articles.txt')  # Load the training data obtained by crawling.
    _f = open(_fp, 'r')

    frame = []
    while True:
        line = _f.readline()
        if not line:
            break
        temp = []
        for node in line.split():
            temp.append(node)
        frame.append(temp)
        print(temp)
    _f.close()
    ###############
    df = spark.createDataFrame(frame, ['cls', 'entertainment', 'politics'])
    clsIndexer = StringIndexer(inputCol="cls", outputCol="label")
    i1Indexer = StringIndexer(inputCol="entertainment", outputCol="i1")
    i2Indexer = StringIndexer(inputCol="politics", outputCol="i2")

    va = VectorAssembler(inputCols=["i1", "i2"], outputCol="features")
    pipeline = Pipeline(stages=[clsIndexer, i1Indexer, i2Indexer, va])

    model = pipeline.fit(df)
    df2 = model.transform(df)
    df2.printSchema()
    df2.show()

    trainDf = df2.select('label', 'features')

    lr = LogisticRegression(maxIter=10, regParam=0.01)
    lrModel = lr.fit(trainDf)  # Fit the model on the training data

    #print Vectors.dense(10,10)
    test0 = spark.sparkContext.parallelize([
        Row(features=Vectors.dense([enterteinment_num, politics_num]))
    ]).toDF()  # Build the test DataFrame
    result = lrModel.transform(test0).head()
    print "Irregularity ? : ", result.prediction
Example #13
    def exec_logistic_regression(self,
                                 featuresCol1="features",
                                 labelCol1="label",
                                 predictionCol1="prediction",
                                 maxIter1=30,
                                 regParam1=0.3,
                                 elasticNetParam1=0,
                                 numClass1=2):
        '''
        Creates the Logistic Regression model pipeline.
        Input: featuresCol1: feature column name, labelCol1: label column name, predictionCol1: prediction column name,
               model parameters: {max iterations, regularization parameter, elastic net parameter},
               numClass1: number of class labels
        Output: None
        '''
        #Initialize Logistic Regression Model with parameters passed
        lr = LogisticRegression(featuresCol=featuresCol1,
                                labelCol=labelCol1,
                                predictionCol=predictionCol1,
                                maxIter=maxIter1,
                                regParam=regParam1,
                                elasticNetParam=elasticNetParam1)

        #Fit lr model with training data
        lrModel = lr.fit(self.trainingData)

        #Make lr model predictions on testData
        predictions = lrModel.transform(self.testData)

        #Evaluate the results generated by the model prediction
        self.model_evaluator(predictions,
                             modelType="Logistic Regression Model",
                             modelParams=str({
                                 'maxIter':
                                 maxIter1,
                                 'regParam':
                                 regParam1,
                                 'elasticNetParam':
                                 elasticNetParam1
                             }),
                             numClass=numClass1)
Example #14
 def train(self, rdd):
     """
     :return:  Trained model to be passed to test.
     """
     options = self.options
     if options.reg_type == "elastic-net":  # use spark.ml
         lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                   elasticNetParam=options.elastic_net_param)
         # TODO: Do not include time for conversion to DataFrame (but this currently matches
         #       the Scala tests)
         df = rdd.toDF()
         lrModel = lr.fit(df)
         numFeatures = len(lrModel.weights)
         numClasses = 2
         return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                        numFeatures, numClasses)
     else:
         if options.loss == "logistic":
             if options.optimizer == "sgd":
                 return LogisticRegressionWithSGD.train(data=rdd,
                                                        iterations=options.num_iterations,
                                                        step=options.step_size,
                                                        miniBatchFraction=1.0,
                                                        regParam=options.reg_param,
                                                        regType=options.reg_type)
             elif options.optimizer == "l-bfgs":
                 return LogisticRegressionWithLBFGS.train(data=rdd,
                                                          iterations=options.num_iterations,
                                                          regParam=options.reg_param,
                                                          regType=options.reg_type,
                                                          tolerance=0.0)
             else:
                 raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                 " optimizer = %s" % (options.loss, options.optimizer))
         elif options.loss == "hinge":
             if options.optimizer == "sgd":
                 return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                         step=options.step_size, regParam=options.reg_param,
                                         miniBatchFraction=1.0, regType=options.reg_type)
         else:
             raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)
def modelTraining(trainSetWoeDF, weightBalance, fn):
    # Pre-transform the data to meet the Spark ML logistic regression input format requirements
    trainSetVecAse = vecAseembler.transform(trainSetWoeDF)
    strInd = stringIndexer.fit(trainSetVecAse)
    trainSetVecAseStrInd = strInd.transform(trainSetVecAse)
    trainSetVecAseStrIndWet = trainSetVecAseStrInd.withColumn(
        'weight', trainSetVecAseStrInd.target * weightBalance + 1)
    # Train the model
    lrm = LogisticRegression(regParam=0.01, weightCol="weight")
    lrModel = lrm.fit(trainSetVecAseStrIndWet)
    trainSetWithProba = lrModel.transform(trainSetVecAseStrIndWet)
    # Save the model and related parameters
    vecAseembler.write().overwrite().save(
        savePath + '{}/{}/vecAseembler'.format(curDate, fn))
    strInd.write().overwrite().save(savePath +
                                    '{}/{}/strInd'.format(curDate, fn))
    lrModel.write().overwrite().save(savePath +
                                     '{}/{}/lrModel'.format(curDate, fn))
    # joblib.dump([lr_model.intercept, lr_model.coefficients], localPath + 'params/lrFinalCoef_{}.pkl'.format(fn))
    coefNotNegtive = np.where(lrModel.coefficients.toArray() > 0)[0]
    return (trainSetWithProba, coefNotNegtive)
Example #17
    def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(
            regParam=0.01,
            lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
            upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904], [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i],
                            expected[i],
                            atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(),
                        [-0.9057, -1.1392, -0.0033],
                        atol=1E-4))
Example #18
def modeloLogistico(data,
                    labelCol="label",
                    featuresCol="features",
                    weightCol="classWeights"):
    """
    Función que se encarga de ajustar un modelo logístico
    a partir de un dataframe de spark con el esquema ya procesado
    a partir de la función dataProcessing().

    :param data: spark dataframe.
    :param labelCol: string nombre de la columna con la variable respuesta.
    :param featuresCol: string nombre de la columna con los vectores de las
        covariables.

    :returns modelo ajustado:
    """

    model = LogisticRegression(featuresCol=featuresCol,
                               labelCol=labelCol,
                               weightCol=weightCol)
    return model.fit(data)
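A minimal usage sketch for modeloLogistico, assuming dataProcessing() has already produced a DataFrame with label, features, and classWeights columns; the toy rows below are only a stand-in for that output.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
# Toy stand-in for the processed DataFrame: label, feature vector, class weight.
toy = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 1.1), 1.0),
     (0.0, Vectors.dense(2.0, 1.0), 0.5)],
    ["label", "features", "classWeights"])
fitted = modeloLogistico(toy)
print(fitted.coefficients, fitted.intercept)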
Example #19
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [
                (1.0, 1.0, Vectors.dense(0.0, 5.0)),
                (0.0, 2.0, Vectors.dense(1.0, 2.0)),
                (1.0, 3.0, Vectors.dense(2.0, 1.0)),
                (0.0, 4.0, Vectors.dense(3.0, 3.0)),
            ],
            ["label", "weight", "features"],
        )

        lor = LogisticRegression(
            regParam=0.01,
            weightCol="weight",
            lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
            upperBoundsOnIntercepts=Vectors.dense(0.0),
        )
        model = lor.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1e-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1e-4))
Example #20
def retrain_full_model(data, model_type, paramMap):
    '''
    This function takes the whole dataset and retrains the given model with the best parameters.

    Arguments:
        data {PySpark DataFrame} -- A PySpark DataFrame containing feature vectors and labels
        paramMap {dict} -- A dictionary of the best parameter values
        model_type {str} -- The type of model to train

    Returns:
        model -- The model retrained on the full dataset
    '''

    if model_type == 'logistic':
        lr = LogisticRegression()
        model = lr.fit(data, paramMap)
    elif model_type == 'decisiontree':
        dt = DecisionTreeClassifier()
        model = dt.fit(data, paramMap)
    else:
        raise ValueError('Unsupported model_type: ' + model_type)

    return model
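For reference, a sketch of the paramMap shape this helper expects, following the Spark ML programming-guide pattern of a dict keyed by Param objects; the toy data and parameter values are assumptions, and whether Params created on one estimator instance carry over to the LogisticRegression built inside retrain_full_model is left open here.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()
data = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 1.0)),
     (0.0, Vectors.dense(1.0, 0.5))],
    ["label", "features"])

# A paramMap is a plain dict keyed by Param objects.
lr = LogisticRegression()
param_map = {lr.maxIter: 20, lr.regParam: 0.1, lr.elasticNetParam: 0.0}
model = lr.fit(data, param_map)
# retrain_full_model(data, 'logistic', param_map)  # hypothetical call with the same dict shape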
Example #21
def lr(ss, data, label_index, feature_indexs, project_url):
    # 1. Build the training dataset
    def func(x):
        features_data = []
        for feature in feature_indexs:
            if (is_number(x[feature])):
                features_data.append(float(x[feature]))
            else:
                features_data.append(0.0)
        label_data = 0.0
        if (is_number(x[label_index])):
            label_data = float(x[label_index])
        return Row(label=label_data, features=Vectors.dense(features_data))

    training_set = data.rdd.map(list).map(lambda x: func(x)).toDF()

    # 2. Train the model
    lr_param = LogisticRegression(regParam=0.01, family='multinomial')
    lr_model = lr_param.fit(training_set)
    print(lr_model.coefficientMatrix)  # coefficients
    print(lr_model.interceptVector)  # intercepts
    # print(lr_model.explainParams())  # parameters and their documentation

    # 3. Save the model
    # model_path = project_url + '/model/multipleClassification/lr'
    # lr_model.write().overwrite().save(model_path)
    #
    # # 4. Load the model back
    # lr2 = lr_model.load(model_path)

    # 5. Predict
    result = lr_model.transform(training_set).head()
    print(result.prediction)

    # Training summary (a LogisticRegressionTrainingSummary)
    training_summary = lr_model.summary

    # 6. Evaluate on the training set
    summary = lr_model.evaluate(training_set)
    summary.predictions.show()
def buil_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print "count = ", df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :", evaluator.evaluate(prediction)

    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1,10,50,150,200,500,1000])\
                            .addGrid(lr.regParam, [0.01, 0.05, 0.1,]).build()
    cv = CrossValidator(estimator=lr,
                        estimatorParamMaps=grid,
                        evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :", evaluator.evaluate(prediction)

    return cvModel, avg_age
Example #24
def build_model(df):
    """
    this function implements three models: logistic regression, decision trees and random forest
    :df: processed dataframe
    :result: None
    """
    
    # Split the data into train and test sets
    train_data, test_data = df.randomSplit([.8,.2],seed=7)
    
    print("Training Dataset Count: {0}".format(train_data.count()))
    print("Test Dataset Count: {0}".format(test_data.count()))
    print('')
    
    print('training Logistic Regression')
    lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
    lr_model = lr.fit(train_data)

    training_summary = lr_model.summary

    lr_predictions = lr_model.transform(test_data)

    evaluator = BinaryClassificationEvaluator()
    print('Logistic Regression Test Area Under ROC = {0}'.format(evaluator.evaluate(lr_predictions)))
    
    print('\ntraining Decision Tree')
    dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
    df_model = dt.fit(train_data)
    df_predictions = df_model.transform(test_data)

    evaluator = BinaryClassificationEvaluator()
    print('Decision Tree Test Area Under ROC = {0}'.format(evaluator.evaluate(df_predictions)))
    
    print('\ntraining Random Forest')
    rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
    rf_model = rf.fit(train_data)
    rf_predictions = rf_model.transform(test_data)

    evaluator = BinaryClassificationEvaluator()
    print('Random Forest Test Area Under ROC = {0}'.format(evaluator.evaluate(rf_predictions)))
def binomial_logistic_regression(trainingDataFrame,
                                 maxIter=100,
                                 regParam=0.0,
                                 elasticNetParam=0.0,
                                 tol=1e-6,
                                 fitIntercept=True,
                                 standardization=True,
                                 aggregationDepth=2):
    lr = LogisticRegression(maxIter=maxIter,
                            regParam=regParam,
                            elasticNetParam=elasticNetParam,
                            tol=tol,
                            fitIntercept=fitIntercept,
                            standardization=standardization,
                            aggregationDepth=aggregationDepth)
    lrModel = lr.fit(trainingDataFrame)
    result = {}
    result["model"] = lrModel
    result["summary"] = lrModel.summary  # https://goo.gl/i5UFA6
    result["intercept"] = lrModel.intercept
    result["coefficients"] = lrModel.coefficients
    return result
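A small usage sketch for the helper above, assuming a toy binary-labelled DataFrame; the data and parameter values are assumptions, and the summary fields accessed at the end come from the binary training summary.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
toy_train = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 1.2)),
     (0.0, Vectors.dense(1.5, 0.3)),
     (1.0, Vectors.dense(0.2, 1.0)),
     (0.0, Vectors.dense(1.1, 0.4))],
    ["label", "features"])
result = binomial_logistic_regression(toy_train, maxIter=20, regParam=0.01)
print(result["intercept"], result["coefficients"])
print(result["summary"].areaUnderROC)  # training summary metric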
def linear_classifier_run(df_training, df_test, whichModel, isSmallSet = False):
    # gather train and test sets, if small set include Sex for accuracy testing
    train = gather_features(df_training).select("Scaled_features", "Sex")
    if isSmallSet == True:
        test = gather_features(df_test).select("Scaled_features", "Sex")
    else:
        test = gather_features(df_test, isTestSet = True).select("Scaled_features")

    # select classifier
    if whichModel == 'logisticRegression':    
        classifier = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter = 10)
    elif whichModel == 'onevsall':
        lr = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
        classifier = OneVsRest(classifier=lr, labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'decisionTree':
        classifier = DecisionTreeClassifier(labelCol="Sex", featuresCol="Scaled_features", maxDepth = 3)
    elif whichModel == 'randomForest':
        classifier = DecisionTreeClassifier(labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'gbt':
        classifier = GBTClassifier(labelCol="Sex", featuresCol="Scaled_features", maxIter = 10)
    elif whichModel == 'nb':
        classifier = NaiveBayes(labelCol="Sex", featuresCol="Scaled_features", smoothing=1.0, modelType="multinomial")
    else: 
        raise NameError("Model must be one of the following: logisticRegression, onevsall, decisionTree, randomForest, gbt or nb")
        
    # train the model with selected classifier
    model = classifier.fit(train)
        
    # predict test set
    print('Predicting with ', whichModel)
    predict_test = model.transform(test)
    # write to a text file
    predict_test.select('prediction').rdd.map(lambda x : str(int(x[0]))).saveAsTextFile(output_file)
    print('Output has been written to txt file')
    
    # test accuracy if small set
    if isSmallSet == True:
        results = predict_test.select("Sex","prediction").withColumn('Success', (predict_test['Sex'] == predict_test['prediction']))
        print('Accuracy of', whichModel, '= ', results.select("Success").where("Success == true").count() / results.count())
Example #27
def logstic_regression_usecase():
    """
        maxIter:最大迭代次数
        regPram:正则化强度
        elasticNetParam:用于指定L1和L2正则影响的权重
    """

    spark = getSparkSession()
    training = spark.read.format("libsvm").load("../data/lib_svm.txt")

    # use the multinomial family for binary classification
    mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

    # Fit the model
    mlrModel = mlr.fit(training)

    # Print the coefficients and intercepts for logistic regression with multinomial family
    print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
    print("Multinomial intercepts: " + str(mlrModel.interceptVector))

    trainingSummary = mlrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
    # Get the ROC curve data (false positive rate and recall)
    trainingSummary.roc.show()
    # Get the area under the ROC curve
    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

    # Set the model threshold to maximize F-Measure
    # Get the F-measure at each candidate threshold
    fMeasure = trainingSummary.fMeasureByThreshold
    fMeasure.show()
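    # Sketch of the threshold-selection step the comment above refers to, following the pattern in
    # the Spark ML docs: fMeasureByThreshold yields 'threshold' and 'F-Measure' columns
    # (maxFMeasure and bestThreshold are new names introduced for this sketch).
    maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()['max(F-Measure)']
    bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure) \
        .select('threshold').head()['threshold']
    mlr.setThreshold(bestThreshold)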
def anom_with_lr():
  try:
    prepared_data = split_data()
    train = prepared_data['train']
    test = prepared_data['test']
    for_finding_more = prepared_data['for_finding_more']
    lr = LogisticRegression(maxIter = 10, regParam = 0.0, elasticNetParam = 0.0)
    # We set regParam = 0 to make it comparable with LogisticRegressionWithSGD that we used before,
    # which does not do any regularization by default. With regParam = 0, the value of elasticNetParam
    # should not matter: elasticNetParam = 0 is ridge regression (L2), which keeps all features;
    # elasticNetParam = 1 is LASSO (L1), which performs feature selection.
    # With regParam = 0, test accuracy is 0.9454, fpr is 0.0713, fnr is 0.0375, on a sample of 50K test data points.
    t0 = time()
    model = lr.fit(train)
    tt = time() - t0
    print "Classifier trained in {0} seconds".format(round(tt,3)) 
    
    t0 = time()
    predictions = model.transform(test) #Feed the test DataFrame as-is, do not need to feed the features only
    tt = time() - t0
    print "Prediction made in {0} seconds".format(round(tt,3))
 
    #Adding probability to test data set for calibration
    labelsAndPreds = predictions.map(lambda p: (p.label, p.prediction, round(p.probability[1], 5)))   
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')   
 
    test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data_size)        
    fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 0).count() 
    fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 1).count()
    print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))
    
    for_finding_more = model.transform(for_finding_more).map(lambda p: (p.label, round(p.probability[1], 5))) #toDF() in next line did not work without round(): some issue with float
    for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
    for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
    for_finding_more.select('predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark') #Top one has 
    #probability of 0.9999, last one has probability 0.05159, 75 of them above 0.99
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return 
def run_logistic_regression(tn_data, ts_data):
    lr = LogisticRegression(elasticNetParam=0.5,
                            regParam=0.01,
                            featuresCol="scaled_features",
                            labelCol="output",
                            weightCol="classWeights",
                            predictionCol="prediction")
    # Fit the model
    lrModel = lr.fit(tn_data)

    predict_train = lrModel.transform(tn_data)
    predict_test = lrModel.transform(ts_data)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                              labelCol='output')

    print_to_output_file("The area under ROC for train set is " +
                         str(evaluator.evaluate(predict_train)))
    print_to_output_file("The area under ROC for test set is " +
                         str(evaluator.evaluate(predict_test)))

    trainingSummary = lrModel.summary
    print_perf_summary(trainingSummary)
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial,
                                                      dataset_multinomial):
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    ova1_model = ova1.fit(dataset_multinomial)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.*",
            "pyspark.ml.classification.LogisticRegressionModel",
            "pyspark.ml.feature.*",
        },
    ):
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert _should_log_model(lor_model)
        assert not _should_log_model(ova1_model)
 def test_multiclass_logistic_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], [])),
                                      (2.0, 2.0, Vectors.dense(2.0)),
                                      (2.0, 2.0, Vectors.dense(1.9))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertAlmostEqual(s.accuracy, 0.75, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
     self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
     self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
    def logregelastic(self, cor, ip):
        spark = SparkSession.\
        builder.\
        appName("LogisticRegressionWithElasticNet").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()
        #print(len(sys.argv))
        #if len(sys.argv) > 2:
        sample_frac = float(ip)
        #print("Input frac is : ",sample_frac)
        num_parts = int(cor)
        #print("Cores are : ",num_parts)
        
        #print(num_parts)

        #sc = SparkContext(appName="LogisticRegressionWithElasticNet")
        sc = spark.sparkContext
        sc.setLogLevel("WARN")
        sqlContext = SQLContext(sc)

        # Load training data
        training = sqlContext.read.format("libsvm").load("/data/rcv1_train.binary/rcv1_train")
        
        training = training.sample(False, sample_frac).coalesce(num_parts)
        lr = LogisticRegression(maxIter=10, elasticNetParam=0.8)
        #training = np.array(training)
        #training = np.fromstring(training, dtype=int, sep=',')
        start = time.time()
        self.start_t = start
        # Fit the model
        #lb = preprocessing.LabelBinarizer()
        #op = lb.fit_transform(training)
        lrModel = lr.fit(training)
        end = time.time()
        self.end_t = end
        print("Cores ",num_parts, "LR sample: ", sample_frac, " took ", (end-start))
Example #33
    def run_logistic(self):
        '''
        Method to run logistic regression on our transformed data.

        Input:
        -------
        None

        Output:
        -------
        Dictionary of confusion matrix scores for this particular model.

        '''

        # Instantiate model, fit, then transform.
        lr = LogisticRegression(maxIter=30,
                                regParam=0.3,
                                elasticNetParam=0)
        lr_model = lr.fit(self.trainingData)
        predictions = lr_model.transform(self.testData)

        # Write type of model to filename.
        with open(self.filename,'a') as f:
            f.write("\n\nLogistic Regression:")

        # Create confusion matrix to see how well the model performed
        confusion_matrix = self.create_confusion_matrix(predictions)

        # Evaluate model's AUC.
        auc = self.evaluator.evaluate(predictions)
        print("AUC Score: ",str(auc))

        # Write result of model to filename.
        with open(self.filename,'a') as f:
            f.write("\nAUC Score: " + str(auc))

        return confusion_matrix
def main():
    sc = init()
    spark = SparkSession(sc)
    data = load_dataset(spark)

    train_data, test_data = prepare_dataset(data)
    #train_data.show()
    #test_data.show()

    print("Encontrando h ....")

    lr = LogisticRegression(
        maxIter=100, regParam=0.3, elasticNetParam=0.8,
        labelCol='CLASS', family='binomial')
    
    lr_model = lr.fit(train_data)

    print("Coeficientes: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))

    print("Testing model...")

    data_to_validate = lr_model.transform(test_data)
    
    evaluator1 = BinaryClassificationEvaluator(
        labelCol='CLASS', metricName='areaUnderROC', 
        rawPredictionCol='rawPrediction'
    )
    print("{}:{}".format(
        "areaUnderROC",evaluator1.evaluate(data_to_validate)))

    evaluator2 = BinaryClassificationEvaluator(
        labelCol='CLASS', metricName='areaUnderPR', 
        rawPredictionCol='rawPrediction'
    )
    print("{}:{}".format(
        "areaUnderPR",evaluator2.evaluate(data_to_validate)))
Example #35
    def get_logistic_regression(self, train_data, test_data, col_to_check):

        print "----------------------------------------------------------------"
        print ""
        info = {}
        lr_churn = LogisticRegression(labelCol=col_to_check)

        print "this is  lr_churn: {}".format(lr_churn)
        print "this is lr_churn type: {}".format(type(lr_churn))

        fitted_churn_model = lr_churn.fit(train_data)
        print "this is fitted_churn_model: {}".format(fitted_churn_model)
        print "this is fitted_churn_model type: {}".format(
            type(fitted_churn_model))

        print "what is here:{}".format(dir(fitted_churn_model))

        training_sum = fitted_churn_model.summary
        print "training summary: {}".format(training_sum)
        print "training summary type: {}".format(type(training_sum))

        training_sum.predictions.describe().show()

        print "should show the predictions above"

        predictions_and_labels = fitted_churn_model.evaluate(test_data)

        print "this is fitted_churn_model: {}".format(predictions_and_labels)
        print "this si fitted_chur_model after evaluate on train data type: {}".format(
            type(predictions_and_labels))

        print ""
        print "showing predictions  and labels count: "
        print predictions_and_labels.predictions.show(351)
        print "----------------------------------------------------------------"
        print ""
        return predictions_and_labels
Example #36
def train(train_path, model_name, elasticNetParam=0):
    if model_name is None:
        model_name = 'model'
    model_path = os.path.join(dirname(os.getcwd()), 'models', model_name)
    if os.path.isdir(model_path):
        shutil.rmtree(model_path)

    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('Logistic App') \
        .getOrCreate()

    # todo Delete the next line
    spark.sparkContext.setLogLevel('OFF')

    raw_data = spark.read.csv(train_path, header=True)

    dataset = mature_data(raw_data)

    lr = LogisticRegression(maxIter=10, elasticNetParam=elasticNetParam)
    lrModel = lr.fit(dataset)

    lrModel.save(path=model_path)
def LogisticRegression(trainingData, testData, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    import numpy as np
    import time

    lr = LogisticRegression(featuresCol='features',
                            labelCol='label',
                            regParam=0.1,
                            maxIter=7)
    start = time.time()
    model = lr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60

    prediction = model.transform(testData)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    w_r = model.coefficients
    w_r = w_r.tolist()
    feat = []
    for i in (w_r)[-3:][::-1]:
        feat.append(schemaNames[(w_r.index(i))])
    return feat, accuracy, areaUC, timer
def logistic_regression_generator(training_data, deal_id):
    ####In:
    #A training data set, as generated by data_prep()
    #The deal_id you want to generate a model for

    ####Out
    #The fitted model is saved to S3
    #A status message is returned

    training_data = training_data.withColumnRenamed(deal_id, 'label')
    model = LogisticRegression(maxIter=100,
                               regParam=0.0001,
                               elasticNetParam=1,
                               family="binomial")
    model = model.fit(training_data)
    model.write().overwrite().save(
        f"s3://rtl-databricks-datascience/lpater/logistic_regression/{deal_id}/"
    )
    output_message = "Saved a Logistic Regression model for " + deal_id + "."

    #see also: https://spark.apache.org/docs/latest/ml-classification-regression.html

    #note: this currently uses LASSO to select parameters
    return output_message
Example #39
    (161.6, 61.2, 28)]).toDF("height", "weight", "age")

training.show(truncate=False)

assembler = VectorAssembler(inputCols=["height", "weight", "age"], outputCol="features")

# Add a features column to the training data
assembled_training = assembler.transform(training)

assembled_training.show(truncate=False)

# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Build the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Build the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # for multiclass, we can inspect metrics on a per-label basis
    print("False positive rate by label:")

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")


# COMMAND ----------

print lr.explainParams()


# COMMAND ----------

fittedLR = lr.fit(train)


# COMMAND ----------

train, test = df.randomSplit([0.7, 0.3])


# COMMAND ----------

rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")


# COMMAND ----------
Example #42
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res=evaluator.evaluate(df_valid_pred)
print res
dfhot = oneHotEncodeColumns(dfnumeric, ["workclass", "education", "marital_status", "occupation", "relationship", "race", "native_country"])

from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfhot.columns[0:-1])
lpoints = va.transform(dfhot).select("features", "income").withColumnRenamed("income", "label")

#section 8.2.3
splits = lpoints.randomSplit([0.8, 0.2])
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()


from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)

lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)
print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

df_test_pred = lr_model.transform(testIndexed)

res=evaluator.evaluate(df_test_pred)

print res

#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
Exemple #45
0
	elastic_net_param = 0.1
	"""
	for reg_param in RP:
		
		lr = LogisticRegression(maxIter = max_iter, regParam=reg_param,elasticNetParam = elastic_net_param,standardization = stand)
		lr = lr.fit(trainDF)
		validateDF_prob = add_probability(validateDF,lr,sc)
		print "======================"
		print "averaged log_loss: ",
		temp = log_loss(validateDF_prob)
		print temp 
		if temp < Opt:
			Opt = temp
			reg_param_opt = reg_param
			elastic_net_param_opt = elastic_net_param  
	"""
	elastic_net_param_opt = 5e-3
	reg_param_opt = 1e-6
	lr = LogisticRegression(maxIter = max_iter, regParam=reg_param_opt,elasticNetParam = elastic_net_param_opt,standardization = stand)
	lr = lr.fit(trainDF)
	predictions = add_probability(testDF,lr,sc).select("activity_id","outcome")
	predictions = predictions.join(leakageTest,"activity_id","left_outer").withColumnRenamed("outcome","p")
	# use the leaked outcome where it is available, otherwise fall back to the model probability
	predictions = predictions.withColumn("outcome", when(predictions.leak.isNull(), predictions.p).otherwise(predictions.leak))
	predictions.show(5)
	predictions = predictions.select("activity_id","outcome")
	predictions.toPandas().to_csv(datapath+"lr.csv",index = False)
	
	#predictions = predictions.select(predictions.probability.values)
	#predictions.show(3)
	#predictions = predictions.select("activity_id",predictions.outcome.getItem(1).alias("outcome"))
Exemple #46
0
USE_SVM = True
USE_LR = False
USE_DT = False

# Read Data
sqlContext = SQLContext(sc)
trainData = sqlContext.read.format('com.databricks.spark.csv').options(header='true',inferschema='true',nullValue='NA').load('flight/*.csv')
testData = sqlContext.read.format('com.databricks.spark.csv').options(header='true',inferschema='true',nullValue='NA').load('test/*.csv')

#Preprocess Data
trainData = preprocess(trainData)
testData = preprocess(testData)

#Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(trainData)
lrprediction = lrModel.transform(testData)
# probability[0] is the class-0 probability of the first test row, not an accuracy figure
lrselected = lrprediction.select("probability").first().probability[0]
result = "Logistic Regression class-0 probability (first test row): " + str(lrselected) + '\n'

#Decision Tree Classification
dataset = trainData.unionAll(testData)
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dataset)
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dataset)
# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Train model.  This also runs the indexers.
dcmodel = pipeline.fit(trainData)
    test = test.select('Pclass', 'Sex', 'SibSp', 'Parch')

    train = sex_to_bin(train)
    test = sex_to_bin(test)

    print "number of men in train and test resp. : %d, %d" \
        %(train.select('Sex').map(lambda x: x.Sex).sum() \
        ,test.select('Sex').map(lambda x: x.Sex).sum())

    # format train for Logistic Regression as (label, features)
    ntrain = train.map(lambda x: Row(label = float(x[0]) \
         ,features = Vectors.dense(x[1:]))).toDF().cache() # Logistic Regression is iterative, need caching
    ntest = test.map(lambda x: Row(features = Vectors.dense(x[0:]))).toDF()
    
    lr = LogisticRegression(maxIter = 100, regParam = 0.1)
    model = lr.fit(ntrain)
    pred = model.transform(ntest).select('prediction').map(lambda x: x.prediction)
    
    # configure the submission format as follows
    submit = sqlCtx.createDataFrame(testPassengerId.zip(pred), ["PassengerId", "Survived"])
    """
    NOTE: rdd1.zip(rdd2) works provided that both RDDs have the same number of partitions and the
    same number of elements per partition; otherwise you should either repartition or do:
    submit = sqlCtx.createDataFrame(pred.zipWithIndex().map(lambda x: (x[1]+892L, x[0])), ["PassengerId", "Survived"])
    where 892 is the first PassengerId in the test set (there are 891 training samples)
    """
    os.chdir(DATADIR)
    # the file is small, so we can save it as a CSV via pandas
    submit.toPandas().to_csv("prediction.csv", index = False)
    # otherwise, use saveAsTextFile:
    # submit.rdd.saveAsTextFile("/home/ehsan/Python/PySpark/Titanic/data/prediction")
#choose estimator and grid
#estima = NaiveBayes()
#grid = ParamGridBuilder().addGrid(5, [0, 2]).build()
lr = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction",maxIter=20)	#choose the model
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()	
#the grid is built to find the best 'alpha' parameter for the model's regularization term: it is an elastic net
#maxIter (set to 20 above) could also be changed
#so we will try 30 values of alpha between 0 and 1
#alpha=0 is L2 regularization,
#alpha=1 is L1 regularization
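# A sketch (not in the original script) of the alpha grid the comments above describe:
# 30 values of elasticNetParam between 0 and 1, kept under a separate name so the grid
# built above is left untouched.
import numpy as np
from pyspark.ml.tuning import ParamGridBuilder
elastic_net_grid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [float(a) for a in np.linspace(0.0, 1.0, 30)]) \
    .build()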
print "Cross validation debut"

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label",metricName='precision')	#choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) #perform the cross validation and keep the best value of maxIter
#cvModel = cv.fit(dfTrain)	#train the model on the whole training set
model = lr.fit(dfTrain)
resultat=evaluator.evaluate(model.transform(dfTest))	#compute the percentage of success on test set
print "Pourcentage de bonne classification(0-1): ",resultat

##Train NaiveBayes
#model=NaiveBayes.train(labeledRDD)
##broadcast the model
#mb=sc.broadcast(model)
#
#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
##return the name and the class
#predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()
#
def train_logistic(df):
    lr = LogisticRegression(maxIter=LR_MAX_ITER, regParam=LR_REG_PARAM)
    return lr, lr.fit(df)
        Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
        Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
        Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
        Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))])

    # Create a LogisticRegression instance with maxIter = 10.
    # This instance is an Estimator.
    lr = LogisticRegression(maxIter=10)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # We may also set parameters using setter methods.
    lr.setRegParam(0.01)

    # Learn a LogisticRegression model.  This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("Model 1 was fit using parameters:\n")
    pprint.pprint(model1.extractParamMap())

    # We may alternatively specify parameters using a parameter map.
    # paramMap overrides all lr parameters set earlier.
    paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}

    # Now learn a new model using the new parameters.
    model2 = lr.fit(training, paramMap)
    print("Model 2 was fit using parameters:\n")
Exemple #51
0
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.

# You can combine paramMaps, which are python dictionaries.
paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
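
# A sketch of how the two maps could be combined and used (an assumption; the original
# example is truncated here):
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)
modelCombined = lr.fit(training, paramMapCombined)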
Exemple #52
0
from logreg import collect_one


with SparkController() as sc:
    data_path, npar = './data/a9a', 5
    dataset = MLUtils.loadLibSVMFile(sc, data_path, minPartitions=npar).cache()

    local_data = Worker.from_rows(dataset.collect(), dense=False)
    n, d = local_data.n_samples, local_data.n_features
    print '#samples: {n}; #features: {d}'.format(n=n, d=d)

    print 'Baseline: training in single node mode...'
    prob = Executor(local_data, n, d, collect_one,
                    logreg_local, cached=True, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print 'Spark ({} partitions): training using peregrine...'.format(npar)
    prob = logistic_regression(dataset, dense=False, l2_reg=0.01)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

    print 'Spark ({} partitions): training using mllib...'.format(npar)
    sqlContext = SQLContext(sc)
    lr = LogisticRegression(maxIter=300, regParam=0.02,
                            elasticNetParam=0.5, fitIntercept=False)
    lr.fit(dataset.toDF().replace(-1, 0, 'label').cache())

    print 'Spark/Tensorflow ({} partitions): training using peregrine...'.format(npar)
    prob = logistic_regression(dataset, l2_reg=0.01, tensorflow=True)
    descend(prob, verbose=1, max_iter=30, l1_reg=0.005, precision='f')

# MAGIC 
# MAGIC You can read more about Logistic Regression in the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-linear-methods.html#logistic-regression). In the new Pipelines API, we are now able to perform elastic net regularization with Logistic Regression, as well as with other linear methods.
# MAGIC 
# MAGIC 
# MAGIC Note: As of Spark 1.5.0, the Python API does not yet support multiclass classification for Logistic Regression, but it will be available in a future release.
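
# COMMAND ----------

# A minimal sketch (not part of the original notebook) of the elastic-net option mentioned
# above: elasticNetParam=0.0 is pure L2, 1.0 is pure L1, and values in between mix the two.
from pyspark.ml.classification import LogisticRegression
elasticLr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10, regParam=0.3, elasticNetParam=0.5)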

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Train model with Training Data
lrModel = lr.fit(trainingData)

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)

# COMMAND ----------

predictions.printSchema()

# COMMAND ----------

# View model's predictions and probabilities of each prediction class
# You can select any of the columns in the above schema to view as well. For example, we will choose age & occupation
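# A sketch of such a selection (an assumption; the original cell is truncated here, and the
# age and occupation columns are taken from the schema printed above):
selected = predictions.select("prediction", "probability", "age", "occupation")
selected.show(5)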
Exemple #54
0
from __future__ import print_function
# $example on$
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("LogisticRegressionWithElasticNet")\
        .getOrCreate()
    # $example on$
    # Load training data
    training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    # Fit the model
    lrModel = lr.fit(training)
    # Print the coefficients and intercept for logistic regression
    print("Coefficients: " + str(lrModel.coefficients))
    print("Intercept: " + str(lrModel.intercept))
    # $example off$
    spark.stop()