Example #1
 def test_glr_summary(self):
     from pyspark.ml.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertTrue(isinstance(s.solver, str))
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example #2
def model():

    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')
    sample = sql.sql('''
        select
            hash_number_A
            ,interest_1
            ,phone_price_category
            ,sum(cost) as label
        from data
        group by hash_number_A, interest_1, phone_price_category''')

    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='interest_1', outputCol='interest'),
        StringIndexer(inputCol='phone_price_category',
                      outputCol='phone_price'),
        VectorAssembler(inputCols=['interest', 'phone_price'],
                        outputCol='features'),
    ])
    model_data = pipeline.fit(sample)

    sample = model_data.transform(sample)

    # Supported family values: 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'

    regression = GeneralizedLinearRegression(family='gaussian',
                                             labelCol='label',
                                             featuresCol='features',
                                             maxIter=10,
                                             regParam=0.3)
    model = regression.fit(sample)
    return model
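The comment in model() above lists the family strings GeneralizedLinearRegression accepts. As a quick reference (added here, not part of the original snippet), these are the canonical links per family according to the Spark ML documentation:

# Canonical family -> link pairings for GeneralizedLinearRegression (Spark ML docs).
# Each family also supports the alternatives noted in the trailing comments.
CANONICAL_LINKS = {
    "gaussian": "identity",  # also: log, inverse
    "binomial": "logit",     # also: probit, cloglog
    "poisson":  "log",       # also: identity, sqrt
    "gamma":    "inverse",   # also: identity, log
    "tweedie":  None,        # link is chosen via linkPower instead of link
}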
Example #3
def generalizeRegression(df, label, features, adjust):
    """ This function returns the rmse and the predictions form the applied generalized 
        regression model on the dataframe with the speficied feature columns """
    ## Columns with non numerical values are adjusted
    for col in adjust:
        indexer = StringIndexer(inputCol=col, outputCol="{}_num".format(col))
        features.append("{}_num".format(col))
        df = indexer.fit(df).transform(df)
    ## Features vector configured from dataframe for model processing
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    assembled = assembler.transform(df)
    gr = GeneralizedLinearRegression(featuresCol='features',
                                     labelCol=label,
                                     regParam=0.3,
                                     family="poisson")
    grModel = gr.fit(assembled)
    predictions = grModel.transform(assembled)
    ## Evaluator required for rmse estimation
    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    result = {
        "RMSE": rmse,
        "predictions": [r["prediction"] for r in predictions.select("prediction").collect()]
    }
    return result
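A hedged usage sketch for generalizeRegression; the DataFrame df and the column names below are illustrative, not from the original snippet:

# Assumes a DataFrame `df` with numeric columns "age" and "income", a string
# column "region", and a count-valued label column "claims".
result = generalizeRegression(df, label="claims",
                              features=["age", "income"], adjust=["region"])
print(result["RMSE"])
print(result["predictions"][:5])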
Example #4
def logisT(value):
  # Despite its name, this fits a gaussian/identity GLM, i.e. ordinary linear regression.
  glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)
  # Fit the model
  model = glr.fit(value)
  # Print the coefficients and intercept for generalized linear regression model
  print("Coefficients: " + str(model.coefficients))
  print("Intercept: " + str(model.intercept))
  return (model.coefficients,1)
Example #5
def model_dev_glm(df_train, df_test, max_iter, fit_intercept, reg_param):

    glm_start_time = time()

    # Create an Initial Model Instance
    mod_glm = GeneralizedLinearRegression(labelCol='label',
                                          featuresCol='features',
                                          family="gaussian",
                                          link="identity",
                                          fitIntercept=fit_intercept,
                                          maxIter=max_iter,
                                          regParam=reg_param)

    # Training The Model
    glm_final_model = mod_glm.fit(df_train)

    # Scoring The Model On Test Sample
    glm_transformed = glm_final_model.transform(df_test)
    glm_test_results = glm_transformed.select(['prediction', 'label'])

    # Collecting The Model Statistics
    glm_evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label")
    glm_r2 = round(
        glm_evaluator.evaluate(glm_test_results,
                               {glm_evaluator.metricName: "r2"}), 3)
    glm_mse = round(
        glm_evaluator.evaluate(glm_test_results,
                               {glm_evaluator.metricName: "mse"}), 3)
    glm_rmse = round(
        glm_evaluator.evaluate(glm_test_results,
                               {glm_evaluator.metricName: "rmse"}), 3)
    glm_mae = round(
        glm_evaluator.evaluate(glm_test_results,
                               {glm_evaluator.metricName: "mae"}), 3)

    # Printing The Model Statistics
    print("\n++++++ Printing Generalized Linear Model Accuracy ++++++\n")
    print("R Square: " + str(glm_r2 * 100) + "%")
    print("Mean Squared Error: " + str(glm_mse))
    print("Root Mean Squared Error: " + str(glm_rmse))
    print("Mean Absolute Error: " + str(glm_mae))

    glm_end_time = time()
    glm_elapsed_time = (glm_end_time - glm_start_time) / 60
    glm_model_stat = pd.DataFrame({
        "Model Name": ["Generalized Linear Model"],
        "R Square": glm_r2,
        "Mean Squared Error": glm_mse,
        "Root Mean Squared Error": glm_rmse,
        "Mean Absolute Error": glm_mae,
        "Time (Min.)": round(glm_elapsed_time, 3)
    })
    glm_output = (glm_final_model, glm_model_stat)

    return glm_output
Example #6
def generalized_linear_regression(trainingDataFrame, family="gaussian", link="identity",
                                  maxIter=10, regParam=0.3):
    glr = GeneralizedLinearRegression(family=family, link=link, maxIter=maxIter, regParam=regParam)
    glrModel = glr.fit(trainingDataFrame)
    result = {}
    result["model"] = glrModel
    result["summary"] = glrModel.summary
    result["intercept"] = glrModel.intercept
    result["coefficients"] = glrModel.coefficients
    return result
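A possible invocation, sketched under the assumption that trainingDataFrame already carries the default "label" and "features" columns (this usage example is not part of the original snippet):

result = generalized_linear_regression(trainingDataFrame, family="poisson", link="log")
print(result["intercept"])
print(result["coefficients"])
print(result["summary"].aic)  # training summary exposes AIC, deviance, etc.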
Example #7
    def main(self, sc, *args):
        points_rdd = self.requires().get_points_rdd(sc)

        model = GeneralizedLinearRegression(family='poisson',
                                            link=self.link,
                                            maxIter=self.iterations)

        spark_sql = SparkSession.builder.getOrCreate()
        model = model.fit(spark_sql.createDataFrame(points_rdd))
        model.save(self.output().path)
Example #8
def linear_regression(ticker, writer):
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    dataset1 = spark.read.format("libsvm")\
        .load("../data/newlr/" + ticker + "_no_today.csv")

    glr1 = GeneralizedLinearRegression(family="gaussian",
                                       link="identity",
                                       maxIter=10,
                                       regParam=0.3)

    # Fit the model
    model1 = glr1.fit(dataset1)

    with open("../data/tickers/" + ticker + ".csv") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        count = 0
        for row in reader:
            if count == 1:
                today_volume = row[5]
                count = count + 1
            else:
                count = count + 1

    # Print the coefficients and intercept for generalized linear regression model
    predict_close_value = -1 * float(str(model1.coefficients[0])) + float(
        str(today_volume)) * float(str(model1.coefficients[1])) + float(
            str(model1.intercept))
    print(predict_close_value)

    today_close_value = 0
    yesterday_close_value = 0
    with open("../data/newlr/" + ticker + ".csv") as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in reader:
            if count == 0:
                today_close_value = float(row[0])
                count += 1
            elif count == 1:
                yesterday_close_value = float(row[0])
                break

    spark.stop()
    if predict_close_value >= yesterday_close_value and today_close_value >= yesterday_close_value:
        return True
    elif predict_close_value <= yesterday_close_value and today_close_value <= yesterday_close_value:
        return True
    else:
        return False
Example #9
    def test_offset(self):

        df = self.spark.createDataFrame(
            [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
             (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
             (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
             (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"])

        glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                    atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
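For context on offsetCol: the offset enters the linear predictor with its coefficient fixed at 1, which in a log-link Poisson model is how log-exposure turns a count model into a rate model. A minimal sketch (assuming an active SparkSession named spark; the data is made up):

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GeneralizedLinearRegression

# Counts observed under different exposures; offset = log(exposure).
df = spark.createDataFrame(
    [(2.0, 0.0, Vectors.dense(0.0)),      # exposure 1, offset log(1) = 0
     (4.0, 0.6931, Vectors.dense(1.0))],  # exposure 2, offset log(2)
    ["label", "offset", "features"])
glr = GeneralizedLinearRegression(family="poisson", link="log", offsetCol="offset")
model = glr.fit(df)  # coefficients now describe rates, not raw counts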
Example #10
def regression(train_set, test_set, featuresColumn, labelColumn):

    regressor = GeneralizedLinearRegression(featuresCol=featuresColumn,
                                            labelCol=labelColumn,
                                            family="gaussian",
                                            link="log",
                                            maxIter=10,
                                            regParam=0.3)
    regressor = regressor.fit(train_set)

    predict_results = regressor.evaluate(test_set)
    result = predict_results.predictions

    return result
Example #11
def best_subset_selection_GLM(df,
                              labelCol,
                              Cols,
                              label_is_categorical=False,
                              family='gaussian',
                              link='identity'):

    print('Total number of iterations: {}'.format(2**len(Cols) - 1))

    AIC_values, feature_list, num_features = [], [], []

    for k in np.arange(1, len(Cols) + 1):

        for i, combo in enumerate(itertools.combinations(Cols, k)):

            continuousCols, categoricalCols = [], []

            for col in list(combo):
                data_type = str(df.schema[col].dataType)
                if data_type == 'StringType':
                    categoricalCols.append(col)
                else:
                    continuousCols.append(col)

            data = prepare_data(df=df,
                                labelCol=labelCol,
                                label_is_categorical=label_is_categorical,
                                categoricalCols=categoricalCols,
                                continuousCols=continuousCols)

            model = GeneralizedLinearRegression(family=family,
                                                link=link,
                                                featuresCol='features',
                                                labelCol='label')

            AIC = model.fit(data).summary.aic
            AIC_values.append(AIC)

            feature_list.append(combo)
            num_features.append(len(combo))

            print('Feature/s: {}, AIC={:.3f}'.format(combo, AIC))

    return pd.DataFrame({
        'num_features': num_features,
        'AIC': AIC_values,
        'features': feature_list
    }).rename_axis('Model ID').sort_values('AIC')  # ascending: lower AIC is better
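A usage sketch, assuming prepare_data is available in scope and df holds the candidate predictor columns (the column names are illustrative):

ranking = best_subset_selection_GLM(df, labelCol="y",
                                    Cols=["x1", "x2", "region"],
                                    family="gaussian", link="identity")
print(ranking.head())  # lowest-AIC (best) feature subsets appear first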
Example #12
    def test_tweedie_distribution(self):

        df = self.spark.createDataFrame(
            [(1.0, Vectors.dense(0.0, 0.0)),
             (1.0, Vectors.dense(1.0, 2.0)),
             (2.0, Vectors.dense(0.0, 0.0)),
             (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

        model2 = glr.setLinkPower(-1.0).fit(df)
        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
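Outside the test above, the same two tweedie knobs can be set directly at construction time; a small hedged sketch on the test's toy data (spark is assumed to be an active SparkSession):

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GeneralizedLinearRegression

df = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 0.0)),
     (1.0, Vectors.dense(1.0, 2.0)),
     (2.0, Vectors.dense(0.0, 0.0)),
     (2.0, Vectors.dense(1.0, 1.0))],
    ["label", "features"])
# variancePower=1.6 sits between Poisson (1.0) and gamma (2.0);
# linkPower=0.0 selects the log link for a tweedie family.
glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6, linkPower=0.0)
model = glr.fit(df)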
Example #13
def train_fit_glmm(window, date_label: str):
    poisson_regression = GeneralizedLinearRegression(family="poisson",
                                                     link="log",
                                                     maxIter=10,
                                                     regParam=0.3)

    columns = [
        denoise("train").alias("features"),
        F.expr(f"{date_label} as label")
    ]
    model = poisson_regression.fit(window.select(*columns))
    # TODO: may want to persist the fitted model
    observations = model.transform(
        window.withColumn("features", denoise("retrain")))

    columns = ["page_id", "train", "validate", "retrain", "test", "prediction"]
    return observations.select(*columns)
Example #14
    def test_glr_summary(self):
        from pyspark.ml.linalg import Vectors

        df = self.spark.createDataFrame(
            [(1.0, 2.0, Vectors.dense(1.0)),
             (0.0, 2.0, Vectors.sparse(1, [], []))],
            ["label", "weight", "features"],
        )
        glr = GeneralizedLinearRegression(family="gaussian",
                                          link="identity",
                                          weightCol="weight",
                                          fitIntercept=False)
        model = glr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertEqual(s.numIterations,
                         1)  # this should default to a single iteration of WLS
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.numInstances, 2)
        self.assertTrue(isinstance(s.residuals(), DataFrame))
        self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
        coefStdErr = s.coefficientStandardErrors
        self.assertTrue(
            isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
        tValues = s.tValues
        self.assertTrue(
            isinstance(tValues, list) and isinstance(tValues[0], float))
        pValues = s.pValues
        self.assertTrue(
            isinstance(pValues, list) and isinstance(pValues[0], float))
        self.assertEqual(s.degreesOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedomNull, 2)
        self.assertEqual(s.rank, 1)
        self.assertTrue(isinstance(s.solver, str))
        self.assertTrue(isinstance(s.aic, float))
        self.assertTrue(isinstance(s.deviance, float))
        self.assertTrue(isinstance(s.nullDeviance, float))
        self.assertTrue(isinstance(s.dispersion, float))
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned
        # The child class GeneralizedLinearRegressionTrainingSummary runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example #15
 def test_glr_load(self):
     df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                                      (1.0, Vectors.dense(1.0, 2.0)),
                                      (2.0, Vectors.dense(0.0, 0.0)),
                                      (2.0, Vectors.dense(1.0, 1.0))],
                                     ["label",  "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p")
     model = glr.fit(df)
     self.assertEqual(model.getSolver(), "irls")
     transformed1 = model.transform(df)
     path = tempfile.mkdtemp()
     model_path = path + "/glr"
     model.save(model_path)
     model2 = GeneralizedLinearRegressionModel.load(model_path)
     self.assertEqual(model2.getSolver(), "irls")
     transformed2 = model2.transform(df)
     self.assertEqual(transformed1.take(4), transformed2.take(4))
Example #16
def linear_regression(ticker, writer):
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()
    # Load training data
    dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=1, regParam=0.8)

    # Fit the model
    model = glr.fit(dataset)
    data = [ticker, 'coefficient:', model.coefficients[0], 'intercept:', model.intercept]
    writer.writerow(data)
    print(data)
    # predict
    today_close_value = 0
    yesterday_close_value = 0
    with open("../data/lr/" + ticker + ".csv") as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in reader:
            if count == 0:
                today_close_value = float(row[0])
                count += 1
            elif count == 1:
                yesterday_close_value = float(row[0])
                break

    # # print(today_close_value)
    # # print(yesterday_close_value)

    predict_close_value = -1 * float(str(model.coefficients[0])) + float(str(model.intercept))
    # print(predict_close_value)
    spark.stop()
    if predict_close_value >= yesterday_close_value and today_close_value >= yesterday_close_value:
        return True
    elif predict_close_value <= yesterday_close_value and today_close_value <= yesterday_close_value:
        return True
    else:
        return False
Example #17
def generalized_linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])
    # linkPredictionCol is restored here because transformed.head().p is read below
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        linkPredictionCol="p",
    )
    model = glr.fit(df)
    transformed = model.transform(df)
    assert abs(transformed.head().prediction - 1.5) < 0.001
    assert abs(transformed.head().p - 1.5) < 0.001
    print(model.coefficients)
    print(model.numFeatures)  # 2
    assert abs(model.intercept - 1.5) < 0.001
    temp_path = "./"
    glr_path = temp_path + "/glr"
    glr.save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    assert glr.getFamily() == glr2.getFamily()
    model_path = temp_path + "/glr_model"
    model.save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    assert model.intercept == model2.intercept
    assert model.coefficients[0] == model2.coefficients[0]
Example #18
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    dataset = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")

    glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

    # Fit the model
    model = glr.fit(dataset)

    # Print the coefficients and intercept for generalized linear regression model
    print("Coefficients: " + str(model.coefficients))
    print("Intercept: " + str(model.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model.summary
    print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
    print("T Values: " + str(summary.tValues))
    print("P Values: " + str(summary.pValues))
    print("Dispersion: " + str(summary.dispersion))
    print("Null Deviance: " + str(summary.nullDeviance))
    print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
    print("Deviance: " + str(summary.deviance))
    print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
Example #19
    def Train(self):
        st_global = time.time()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print "model_path",model_path
        pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")

        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

        # print indexed.select([result_column,"features"]).show(5)
        MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
        glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features',predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
            paramGrid = ParamGridBuilder()\
                .addGrid(glinr.regParam, [0.1, 0.01]) \
                .addGrid(glinr.fitIntercept, [False, True])\
                .build()
            crossval = CrossValidator(estimator=glinr,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                          numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time()-st
            print "cvModel training takes",trainingTime
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
            st = time.time()
            fit = glinr.fit(trainingData)
            trainingTime = time.time()-st
            print "time to train",trainingTime
            bestModel = fit
        print(bestModel.explainParams())
        print(bestModel.extractParamMap())
        print(bestModel.params)
        print('Best Param (regParam): ', bestModel._java_obj.getRegParam())
        print('Best Param (MaxIter): ', bestModel._java_obj.getMaxIter())

        # modelPmmlPipeline = PMMLPipeline([
        #   ("pretrained-estimator", objs["trained_model"])
        # ])
        # try:
        #     modelPmmlPipeline.target_field = result_column
        #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
        #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
        #     pmmlfile = open(pmml_filepath,"r")
        #     pmmlText = pmmlfile.read()
        #     pmmlfile.close()
        #     self._result_setter.update_pmml_object({self._slug:pmmlText})
        # except:
        #     pass

        coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
        MLUtils.save_pipeline_or_model(bestModel,model_filepath)
        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
        transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
        transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
        sampleData = None
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100)/nrows, seed=420)
        else:
            sampleData = transformed
        sampleData.show()
        evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global),2)
        # print transformed.count()
        mapeDf = transformed.select("mape")
        # print mapeDf.show()
        mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
        mapeStatsArr = mapeStats.items()
        mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
        # print mapeStatsArr
        quantileDf = transformed.select("prediction")
        # print quantileDf.show()
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
        quantileSummaryArr = quantileSummaryDict.items()
        quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
        # print quantileSummaryArr
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Generalized Linear Regression")
        self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestModel.extractParamMap())  # `bestEstimator` was undefined; assuming the fitted model's params were intended
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_coefficinets_array(coefficientsArray)
        self._model_summary.set_feature_list([name for idx, name in featureMapping])  # `x_train` was undefined; assuming the mapped feature names were intended

        # print CommonUtils.convert_python_object_to_json(self._model_summary)
        modelSummaryJson = {
            "dropdown":{
                        "name":self._model_summary.get_algorithm_name(),
                        "accuracy":CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
                        "slug":self._model_summary.get_slug()
                        },
            "levelcount":self._model_summary.get_level_counts(),
            "modelFeatureList":self._model_summary.get_feature_list(),
            "levelMapping":self._model_summary.get_level_map_dict()
        }

        glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

        for card in glinrCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"generalizedlinearregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
        self._result_setter.set_glinr_cards(glinrCards)
Example #20
accuracy = accuracyEval.evaluate(rfc_pred)

print("accuracy of classifier is: ", accuracy)


#####3b: GLM

#training data filtered to just the rows with nonzero claims
trainFilt = trainingData.filter(col("binaryclaim")==1)


from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(featuresCol='finalvector', labelCol='Claim_Amount', regParam=0.01, family='gaussian', predictionCol = "combo_prediction")

glrmodel = glr.fit(trainFilt)

###only perform GLR on rows that have been predicted to have nonzero claims
predFilt = rfc_pred.filter(col("prediction")==1)


combo_pred = glrmodel.transform(predFilt)



#looking at a few rows:

combo_pred.select("finalvector",'Claim_Amount', "binaryclaim", "prediction", "combo_prediction" ).show(100)
#Now we get the MSE AND MAE

from pyspark.ml.evaluation import RegressionEvaluator
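The closing comment promises MSE and MAE, but the snippet is cut off right after the RegressionEvaluator import; a hedged completion using the combo_pred columns shown above:

evaluator = RegressionEvaluator(labelCol="Claim_Amount",
                                predictionCol="combo_prediction")
mse = evaluator.evaluate(combo_pred, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(combo_pred, {evaluator.metricName: "mae"})
print("MSE:", mse, "MAE:", mae)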
Example #21
# Build a features vector column and keep it together with the label column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (a gaussian/identity GLM, i.e. linear rather than logistic regression)
model_regresion = GeneralizedLinearRegression(family="gaussian",
                                              link="identity",
                                              maxIter=50,
                                              regParam=0.05)

# Fit the model
model_regresion = model_regresion.fit(train_LP)

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model_regresion.coefficients))
print("Intercept: %s" % str(model_regresion.intercept))

# COMMAND ----------

# Summarize the model over the training set and print out some metrics

trainingSummary = model_regresion.summary

print("Coefficient Standard Errors: " +
Example #22
# (The snippet begins mid-statement; the surviving tail suggests an AUC computation,
#  e.g. something ending in ...['label', 'prediction'].rdd).areaUnderROC)

## gamma regression with predictions

gam = predictions.filter(predictions.prediction > 0).filter(
    predictions.label > 0)

glr = GeneralizedLinearRegression(labelCol="label",
                                  featuresCol="pcaFeatures",
                                  predictionCol="gammaprediction",
                                  family="gamma",
                                  link="Inverse",
                                  maxIter=10)

## Fit the model

model = glr.fit(gam)
gammapred = model.transform(gam)

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="gammaprediction",
                                metricName="r2")
r2 = evaluator.evaluate(gammapred)
print("Evaluating gamma prediction :")
print("R2 = %g " % r2)

end = time.time()
print('elapsed time', end - begin)

sc.stop()
Example #23
# Let's see how many numerical features we have:
num_cols = [item[0] for item in df.dtypes if item[1].startswith('int') | item[1].startswith('double')][1:]
print(str(len(num_cols)) + '  numerical features')

Data = df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
Data.show()
pd.DataFrame(Data.take(5), columns=Data.columns)

testset,trainset = Data.randomSplit([0.3,0.7], seed=25)
print("Training Dataset Count: " + str(trainset.count()))
print("Test Dataset Count: " + str(testset.count()))

### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features",family="binomial", link="logit", maxIter=10,regParam=0.01)
model = glr.fit(Data)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("P Values: " + str(summary.pValues))

#Removing all the columns that had a p-value above 0.05
vs = VectorSlicer(inputCol="features", outputCol="selected_features", indices=[0,2,9,18,21,23,24,26,27,28,31,32,37,41])
Training_set= vs.transform(trainset)
Test_set = vs.transform(testset)

#### LOGISTIC REGRESSION
logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features", maxIter=20,regParam=0.01, elasticNetParam=0.8, family="binomial")
logReg_model = logReg.fit(Training_set)
trainingSummary = logReg_model.summary
roc = trainingSummary.roc.toPandas()
print('Training set ROC: ' + str(trainingSummary.areaUnderROC))
Example #24
    def binomialSparkGLF(self):
        # family defaults to "gaussian"; binomial/logit matches the method's name
        regr = GeneralizedLinearRegression(family="binomial", link="logit")
        # Spark estimators fit a single DataFrame of (label, features) columns, not
        # separate X/y inputs; self.train (as in scalarSparkGLR below) is assumed
        model = regr.fit(self.train)
        return model
Example #25
    def scalarSparkGLR(self):
        regr = GeneralizedLinearRegression()
        model = regr.fit(self.train)
        return model
Example #26
# Random Spliting
training, testing = modelprep2.randomSplit([0.8, 0.2])

#modelprep2.count()
#training.count()
#testing.count()


#######################################################################################
#
#   Modeling - GLM (Regression)
#
#######################################################################################

glm = GeneralizedLinearRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.3)
glmmodel = glm.fit(training)

summary = glmmodel.summary

# Show Coefficients and Intercept
print("\nFeatures: " + str(features) + "\n")
print("\nCoefficients: " + str(glmmodel.coefficients) + "\n")
print("\nIntercept: " + str(glmmodel.intercept) + "\n")
print("\nPValues: " + str(summary.pValues) + "\n")

# Summarize the model over the training set and print out some metrics
#print("\nCoefficient Standard Errors: " + str(summary.coefficientStandardErrors))
#print("T Values: " + str(summary.tValues))
#print("P Values: " + str(summary.pValues))
#print("Dispersion: " + str(summary.dispersion))
#print("Null Deviance: " + str(summary.nullDeviance))
Example #27
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(logisticReg_prediction)
end = time.time()
print('Logistic Regression Execution time:', end - start)
print("auc = %g" % auc)

train_notzero = trainingData.filter('not_zero != 0')
test_notzero = testData.filter('not_zero != 0')

#training the GLM (gamma family) on the nonzero-claim rows
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
glm_gamma = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=10, regParam=0.01,
                                        family='gamma', link='identity')
start = time.time()
glm_model = glm_gamma.fit(train_notzero)

#select zero sample
pred_zero = logisticReg_prediction.filter('prediction == 0')
pred_zero = pred_zero.withColumn('claim_prediction',
                                 pred_zero['not_zero'] * 0).select(
                                     'Claim_Amount', 'claim_prediction')

#extract non zero value
pred_nonzero = logisticReg_prediction.filter('prediction != 0')
pred_nonzero = pred_nonzero.select('features', 'Claim_Amount')

#compare model with non zero value
pred_amount = glm_model.transform(pred_nonzero)
pred_amount = pred_amount.select('Claim_Amount', 'prediction')
pred_amount = pred_amount.withColumnRenamed('prediction', 'claim_prediction')
Example #28
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
Example #29
# Split into training and test sets
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
splits = vdata.randomSplit([0.7, 0.3])
train_data = splits[0]
test_data = splits[1]

# Training
glr = GeneralizedLinearRegression(family="gaussian",
                                  link="identity",
                                  labelCol='medv',
                                  featuresCol='features',
                                  maxIter=1000,
                                  regParam=0.3)
# Fit the model
GlModel = glr.fit(train_data)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(GlModel.coefficients))
print("Intercept: " + str(GlModel.intercept))

# Summarize the model over the training set and print out some metrics
summary = GlModel.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " +
      str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
Example #30
# Convert the label of any row with a non-zero claim amount to 1

from pyspark.sql.functions import when
train_set1 = training_set.withColumn('Claim_Amount',when(dataVectorised.Claim_Amount!=0, 1).otherwise(0))
test_set1 = test_set.withColumn('Claim_Amount',when(dataVectorised.Claim_Amount!=0, 1).otherwise(0))

# The binary classifier
model_start = time.time()
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(featuresCol='features', labelCol='Claim_Amount', maxDepth=5, numTrees=3, seed=myseed)
RFC_model = rfc.fit(train_set1)

# Gamma Regressor
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', link='identity')
GLR_model = glr.fit(data_Claim_else)

# Combine the two model
predict_RFC = RFC_model.transform(test_set)
# select the results which predicted as 1
RFC_result = predict_RFC[predict_RFC['prediction']==1].select('features','Claim_Amount')
GLR_result = GLR_model.transform(RFC_result)
model_end = time.time()

mse = evaluatorMSE.evaluate(GLR_result)
mae = evaluatorMAE.evaluate(GLR_result)
print('mse :', mse)
print('mae :', mae)
print('Time:', model_end-model_start)

spark.stop()
print("test vector assembled")
test_df.show(5)

# Split `train_df` into train and test sets (30% held out for testing)
#Split train and test
seed(0)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])

# ## Logistic Regression
#Fit logistic regression
glr = GeneralizedLinearRegression(family="binomial",
                                  link="logit",
                                  featuresCol="features",
                                  labelCol="is_duplicate")
trainLogitModel = glr.fit(trainingData)

#Logistic model predictions
LogitPredictions = trainLogitModel.transform(testData)

# Calculate AUC
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate",
                                          rawPredictionCol="prediction",
                                          metricName="areaUnderROC")
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees
#Fit decision tree model
#Train a DecisionTree model and make predictions
dt = DecisionTreeClassifier(maxDepth=15, labelCol="is_duplicate")
Example #32
cuse_df.show(5)


# In[3]:
# ## Split data into training and test datasets
training, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)

# In[4]:
# ## Build Logistic Regression model

from pyspark.ml.regression import GeneralizedLinearRegression
logr = GeneralizedLinearRegression(family="binomial", link="logit", regParam=0.0)

# Fit the model to the data and call this model logr_Model
logr_Model = logr.fit(training)

# Print the coefficients and intercept for linear regression
summary = logr_Model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))


# #### Prediction on training data
pred_training_cv = logr_Model.transform(training)
pred_training_cv.show(5, truncate=False)

# #### Prediction on test data
pred_test_cv = logr_Model.transform(test)
pred_test_cv.show(5, truncate=False)
Example #33
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Load training data
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")

glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
Example #34
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            predictionCol='prediction_c',
                            maxBins=800)
binarizer = Binarizer(threshold=0.0001,
                      inputCol='Claim_Amount',
                      outputCol='label')

pipeline = Pipeline(stages=[binarizer, dt])
dtModel = pipeline.fit(traindata)
# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testdata)

non_zero_train = traindata.filter(traindata['Claim_Amount'] > 0.0)
non_zero_test = predictions.filter(predictions['prediction_c'] > 0.0)

print("Generalized Linear Regression with gamma family")

from pyspark.ml.regression import GeneralizedLinearRegression
glm_gamma = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=50,\
                                          family='gamma', link='log')
glm_model = glm_gamma.fit(non_zero_train)
predictions = glm_model.transform(non_zero_test)

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator\
      (labelCol="Claim_Amount", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE = %g " % rmse)

end = time.time()