Code Example #1
def bestGeneralizedLR(trainDf, metricDF, metricToCompare):
    regParam = [1.0, 0.6, 0.2]
    tol = [1.0, 0.6, 0.2, 0.0]
    family = ["poisson", "gaussian"]
    link = {"poisson": ["identity", "sqrt", "log"], "gaussian": ["identity"]}
    models = []

    for r in regParam:
        for f in family:
            for l in link.get(f):
                for t in tol:
                    models.append(
                        GeneralizedLinearRegression(maxIter=10,
                                                    regParam=r,
                                                    family=f,
                                                    link=l,
                                                    tol=t).fit(trainDf))

    return getBestModel(models, metricDF, metricToCompare)
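
getBestModel is not defined in the excerpt; a minimal sketch under the assumption that metricDF is a held-out validation DataFrame with 'label' and 'features' columns and metricToCompare is an error metric name (e.g. "rmse") where lower is better:

from pyspark.ml.evaluation import RegressionEvaluator

def getBestModel(models, metricDF, metricToCompare):
    # hypothetical helper: score each fitted model on the validation
    # frame and keep the one with the lowest error
    evaluator = RegressionEvaluator(metricName=metricToCompare)
    return min(models, key=lambda m: evaluator.evaluate(m.transform(metricDF)))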
Code Example #2
 def test_glr_load(self):
     df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                                      (1.0, Vectors.dense(1.0, 2.0)),
                                      (2.0, Vectors.dense(0.0, 0.0)),
                                      (2.0, Vectors.dense(1.0, 1.0))],
                                     ["label", "features"])
     glr = GeneralizedLinearRegression(family="gaussian",
                                       link="identity",
                                       linkPredictionCol="p")
     model = glr.fit(df)
     self.assertEqual(model.getSolver(), "irls")
     transformed1 = model.transform(df)
     path = tempfile.mkdtemp()
     model_path = path + "/glr"
     model.save(model_path)
     model2 = GeneralizedLinearRegressionModel.load(model_path)
     self.assertEqual(model2.getSolver(), "irls")
     transformed2 = model2.transform(df)
     self.assertEqual(transformed1.take(4), transformed2.take(4))
Code Example #3
def selectRegressionMethod(regressionMethodName, featureName, test=False):
    # `test` toggles a cheap single-tree configuration for smoke tests

    if regressionMethodName == "rf":
        if test:
            nt = 1
        else:
            nt = 100
        modelParameters = {
            'featuresCol': featureName,
            'numTrees': nt,
            'subsamplingRate': 1,
            'maxDepth': 10
        }
        regressionMethod = RandomForestRegressor(
            featuresCol=modelParameters['featuresCol'],
            numTrees=modelParameters['numTrees'],
            subsamplingRate=modelParameters['subsamplingRate'],
            maxDepth=modelParameters['maxDepth'])

    elif regressionMethodName == "gbt":
        modelParameters = {'featuresCol': featureName, 'maxIter': 10}
        regressionMethod = GBTRegressor(
            featuresCol=modelParameters['featuresCol'],
            maxIter=modelParameters['maxIter'])

    elif regressionMethodName == "glr":
        modelParameters = {
            'featuresCol': featureName,
            'family': "poisson",
            'link': 'log',
            'maxIter': 10,
            'regParam': 0.3
        }
        regressionMethod = GeneralizedLinearRegression(
            family=modelParameters['family'],
            link=modelParameters['link'],
            maxIter=modelParameters['maxIter'],
            regParam=modelParameters['regParam'])
    else:
        print('Invalid regression method')
        return ()
    #print('Regression method selected')
    return (regressionMethod, modelParameters)
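
A hypothetical call, assuming a training DataFrame trainDf with a vector column matching featureName:

# hypothetical usage of the selector above
method, params = selectRegressionMethod("glr", "features")
model = method.fit(trainDf)  # trainDf is an assumed training DataFrame
print(params)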
Code Example #4
 def test_glr_summary(self):
     from pyspark.ml.linalg import Vectors  # DataFrame-based estimators require pyspark.ml vectors
     df = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian",
                                       link="identity",
                                       weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations,
                      1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(
         isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(
         isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(
         isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertTrue(isinstance(s.solver, str))  # basestring is Python 2-only
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Code Example #5
File: test_algorithms.py Project: sujithjay/spark
    def test_offset(self):

        df = self.spark.createDataFrame(
            [
                (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
                (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
                (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
                (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
            ],
            ["label", "weight", "offset", "features"],
        )

        glr = GeneralizedLinearRegression(family="poisson",
                                          weightCol="weight",
                                          offsetCol="offset")
        model = glr.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                        atol=1e-4))
        self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1e-4))
Code Example #6
File: test_algorithms.py Project: zhengruifeng/spark
    def test_tweedie_distribution(self):

        df = self.spark.createDataFrame(
            [
                (1.0, Vectors.dense(0.0, 0.0)),
                (1.0, Vectors.dense(1.0, 2.0)),
                (2.0, Vectors.dense(0.0, 0.0)),
                (2.0, Vectors.dense(1.0, 1.0)),
            ],
            ["label", "features"],
        )

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1e-4))
        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1e-4))

        model2 = glr.setLinkPower(-1.0).fit(df)
        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1e-4))
        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1e-4))
Code Example #7
def linear_regression(ticker,writer):
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()
    # Load training data
    dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=1, regParam=0.8)

    # Fit the model
    model = glr.fit(dataset)
    data=[ticker, 'coefficient:', model.coefficients[0],'intercept:',model.intercept]
    writer.writerow(data)
    print(data)
    # predict
    today_close_value = 0
    yesterday_close_value = 0
    with open("../data/lr/" + ticker + ".csv") as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in reader:
            if count == 0:
                today_close_value = float(row[0])
                count += 1
            elif count == 1:
                yesterday_close_value = float(row[0])
                break

    # # print(today_close_value)
    # # print(yesterday_close_value)

    predict_close_value = -float(model.coefficients[0]) + float(model.intercept)
    # print(predict_close_value)
    spark.stop()
    if predict_close_value >= yesterday_close_value and today_close_value >= yesterday_close_value:
        return True
    elif predict_close_value <= yesterday_close_value and today_close_value <= yesterday_close_value:
        return True
    else:
        return False
Code Example #8
File: regression.py Project: canisn/pyspark
def generalized_linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        linkPredictionCol="p",  # needed below when reading transformed.head().p
    )
    model = glr.fit(df)
    transformed = model.transform(df)
    # assert the doctest-style expectations so they actually take effect
    assert abs(transformed.head().prediction - 1.5) < 0.001
    assert abs(transformed.head().p - 1.5) < 0.001
    print(model.coefficients)
    print(model.numFeatures)  # 2
    assert abs(model.intercept - 1.5) < 0.001
    temp_path = "./"
    glr_path = temp_path + "glr"
    glr.save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    assert glr.getFamily() == glr2.getFamily()
    model_path = temp_path + "glr_model"
    model.save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    assert model.intercept == model2.intercept
    assert model.coefficients[0] == model2.coefficients[0]
Code Example #9
def generalizeRegression(df, label, features, adjust):
    """ This function returns the rmse and the predictions form the applied generalized 
        regression model on the dataframe with the speficied feature columns """
    ## Columns with non numerical values are adjusted
    for col in adjust:
        indexer=StringIndexer(inputCol=col,outputCol="{}_num".format(col)) 
        features.append("{}_num".format(col))
        df=indexer.fit(df).transform(df)
    ## Features vector configured from dataframe for model processing
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    assembled = assembler.transform(df)
    gr = GeneralizedLinearRegression(featuresCol ='features', labelCol=label, regParam=0.3, family="poisson")
    grModel=gr.fit(assembled)
    predictions = grModel.transform(assembled)
    ## Evaluator required for rmse estimation
    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    result = {
        "RMSE": rmse,
        "predictions": [r["prediction"] for r in predictions.select("prediction").collect()]
    }
    return result
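
A hypothetical call, assuming df holds numeric columns "age" and "income", a string column "city" to index, and a count-valued label column "visits" (the Poisson family expects non-negative labels):

# hypothetical usage of generalizeRegression
result = generalizeRegression(df, label="visits",
                              features=["age", "income"],
                              adjust=["city"])
print(result["RMSE"])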
Code Example #10
File: main.py Project: helgi22/bigdata19.case04
def model():

    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')
    # f-string: interpolate the group-by column positions below
    sample = sql.sql(f'''
        select
            hash_number_A
            ,interest_1
            ,interest_2
            ,interest_3
            ,interest_4
            ,interest_5
            ,device_type
            ,phone_price_category
            ,sum(cost) as label
        from data
        group by {", ".join(str(n) for n in range(1, 8+1))}''')
    breakpoint()

    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='interest_1', outputCol='interest'),
        StringIndexer(inputCol='phone_price_category',
                      outputCol='phone_price'),
        VectorAssembler(inputCols=['interest', 'phone_price'],
                        outputCol='features'),
    ])
    model_data = pipeline.fit(sample)

    sample = model_data.transform(sample)

    # 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'

    regression = GeneralizedLinearRegression(family='gaussian',
                                             labelCol='label',
                                             featuresCol='features',
                                             maxIter=10,
                                             regParam=0.3)
    model = regression.fit(sample)
    breakpoint()
Code Example #11
File: glm_spark.py Project: SweeRoty/fintell_device
	assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
	dataset = assembler.transform(dataset)
	scaler_model = None
	if args.mode == 'train':
		scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
		scaler_model = scaler.fit(dataset)
		scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	else:
		scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	dataset = scaler_model.transform(dataset)
	polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
	dataset = polyExpansion.transform(dataset)
	dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
	glr = None
	if args.mode == 'train':
		glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures', family='binomial', linkPredictionCol='link_pred')
		paramGrid = ParamGridBuilder() \
					.addGrid(glr.link, ['logit']) \
					.addGrid(glr.regParam, [1e-5]) \
					.build()
		tvs = TrainValidationSplit(estimator=glr, \
									estimatorParamMaps=paramGrid, \
									evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'), \
									trainRatio=0.7)
		tvs_model = tvs.fit(dataset)
		print('----> {}'.format(tvs_model.validationMetrics))
		if args.save_model:
			tvs_model.write().save('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
	else:
		#glr_model = GeneralizedLinearRegressionModel.load('/user/ronghui_safe/hgy/nid/models/glm_binomial_model')
		glr_model = TrainValidationSplitModel.load('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
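
The load branch ends before the restored model is applied; a minimal sketch of scoring with it, assuming dataset still carries the 'key', 'duration' and 'polyFeatures' columns selected above:

		# hedged sketch: score with the restored TrainValidationSplitModel
		scored = glr_model.transform(dataset)
		scored.select('key', 'duration', 'prediction', 'link_pred').show(5)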
Code Example #12
df_train_label = df_train_label.withColumn(
    'realdate', udf_strpTime_trainlabel(df_train_label['date'])).drop('date')

df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)

train, validation = df_new.randomSplit([0.80, 0.20])

assembler = VectorAssembler(inputCols=[
    'realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10',
    'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20',
    'e21', 'e22', 'e23', 'e24', 'e25', 'e26'
],
                            outputCol='features')

gr = GeneralizedLinearRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[assembler, gr])

model = pipeline.fit(train)
prediction = model.transform(validation)
evaluator = RegressionEvaluator(predictionCol='prediction')
res = evaluator.evaluate(prediction, {evaluator.metricName: 'mse'})
print("generalized linear regression mse is :%f " % res)

gbt = GBTRegressor(featuresCol='features', labelCol='label')

gbt_pipeline = Pipeline(stages=[assembler, gbt])
gbt_model = gbt_pipeline.fit(train)
gbt_prediction = gbt_model.transform(validation)
gbt_res = evaluator.evaluate(gbt_prediction, {evaluator.metricName: 'mse'})
print("gbt regression mse is: %f" % gbt_res)
Code Example #13
predictions = dtModel.transform(test_bin)
accuracy = evaluator.evaluate(predictions)
print("LR Accuracy = %g " % accuracy)
# BinaryClassificationMetrics expects (score, label) pairs
print(
    'AUC:',
    BinaryClassificationMetrics(predictions['prediction',
                                            'label'].rdd).areaUnderROC)

## gamma regression with predictions

gam = predictions.filter(predictions.prediction > 0).filter(
    predictions.label > 0)

glr = GeneralizedLinearRegression(labelCol="label",
                                  featuresCol="pcaFeatures",
                                  predictionCol="gammaprediction",
                                  family="gamma",
                                  link="Inverse",
                                  maxIter=10)

## Fit the model

model = glr.fit(gam)
gammapred = model.transform(gam)

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="gammaprediction",
                                metricName="r2")
r2 = evaluator.evaluate(gammapred)
print("Evaluating gamma prediction :")
print("R2 = %g " % r2)
Code Example #14
# Let's see how many numerical features we have:
num_cols = [item[0] for item in df.dtypes if item[1].startswith('int') or item[1].startswith('double')][1:]
print(str(len(num_cols)) + '  numerical features')

Data = df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
Data.show()
pd.DataFrame(Data.take(5), columns=Data.columns)

testset,trainset = Data.randomSplit([0.3,0.7], seed=25)
print("Training Dataset Count: " + str(trainset.count()))
print("Test Dataset Count: " + str(testset.count()))

### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features", family="binomial", link="logit", maxIter=10, regParam=0.01)
model = glr.fit(Data)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("P Values: " + str(summary.pValues))

#Removing all the columns that had a p-value above 0.05
vs = VectorSlicer(inputCol="features", outputCol="selected_features", indices=[0,2,9,18,21,23,24,26,27,28,31,32,37,41])
Training_set= vs.transform(trainset)
Test_set = vs.transform(testset)

#### LOGISTIC REGRESSION
logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="selected_features", maxIter=20, regParam=0.01, elasticNetParam=0.8, family="binomial")
logReg_model = logReg.fit(Training_set)
trainingSummary = logReg_model.summary
roc = trainingSummary.roc.toPandas()
Code Example #15
 def binomialSparkGLF(self):
     # Spark ML estimators fit a single DataFrame; self.train (as in
     # scalarSparkGLR below) is assumed to carry 'features' and 'label'
     regr = GeneralizedLinearRegression(family="binomial", link="logit")
     model = regr.fit(self.train)
     return model
Code Example #16
 def scalarSparkGLR(self):
     regr = GeneralizedLinearRegression()
     model = regr.fit(self.train)
     return model
Code Example #17
],
                                  outputCol='features')
v_data = vectorAssembler.transform(data)
v_data.show(10)

# Split into training and test sets
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
splits = vdata.randomSplit([0.7, 0.3])
train_data = splits[0]
test_data = splits[1]

# Train the model
glr = GeneralizedLinearRegression(family="gaussian",
                                  link="identity",
                                  labelCol='medv',
                                  featuresCol='features',
                                  maxIter=1000,
                                  regParam=0.3)
# Fit the model
GlModel = glr.fit(train_data)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(GlModel.coefficients))
print("Intercept: " + str(GlModel.intercept))

# Summarize the model over the training set and print out some metrics
summary = GlModel.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " +
      str(summary.residualDegreeOfFreedomNull))
Code Example #18
# Convert the label of rows with a non-zero claim to 1

from pyspark.sql.functions import when
train_set1 = training_set.withColumn('Claim_Amount', when(training_set.Claim_Amount != 0, 1).otherwise(0))
test_set1 = test_set.withColumn('Claim_Amount', when(test_set.Claim_Amount != 0, 1).otherwise(0))

# The binary classifier
model_start = time.time()
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(featuresCol='features', labelCol='Claim_Amount', maxDepth=5, numTrees=3, seed=myseed)
RFC_model = rfc.fit(train_set1)

# Gamma regressor
from pyspark.ml.regression import GeneralizedLinearRegression
# family='gamma' is assumed here to match the "Gamma regressor" label; without it the family defaults to gaussian
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', family='gamma', link='identity')
GLR_model = glr.fit(data_Claim_else)

# Combine the two model
predict_RFC = RFC_model.transform(test_set)
# select the results which predicted as 1
RFC_result = predict_RFC[predict_RFC['prediction']==1].select('features','Claim_Amount')
GLR_result = GLR_model.transform(RFC_result)
model_end = time.time()

mse = evaluatorMSE.evaluate(GLR_result)
mae = evaluatorMAE.evaluate(GLR_result)
print('mse :', mse)
print('mae :', mae)
print('Time:', model_end-model_start)
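
evaluatorMSE and evaluatorMAE are used above but not defined in the excerpt; a minimal sketch of plausible definitions:

from pyspark.ml.evaluation import RegressionEvaluator

# assumed evaluators matching the names used above
evaluatorMSE = RegressionEvaluator(labelCol='Claim_Amount', predictionCol='prediction', metricName='mse')
evaluatorMAE = RegressionEvaluator(labelCol='Claim_Amount', predictionCol='prediction', metricName='mae')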
Code Example #19
    def Train(self):
        st_global = time.time()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"]][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print "model_path",model_path
        pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")

        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

        # print indexed.select([result_column,"features"]).show(5)
        MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
        glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features',predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
            paramGrid = ParamGridBuilder()\
                .addGrid(glinr.regParam, [0.1, 0.01]) \
                .addGrid(glinr.fitIntercept, [False, True])\
                .build()
            crossval = CrossValidator(estimator=glinr,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                          numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time()-st
            print "cvModel training takes",trainingTime
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
            st = time.time()
            fit = glinr.fit(trainingData)
            trainingTime = time.time()-st
            print "time to train",trainingTime
            bestModel = fit
        print(bestModel.explainParams())
        print(bestModel.extractParamMap())
        print(bestModel.params)
        print('Best Param (regParam): ', bestModel._java_obj.getRegParam())
        print('Best Param (MaxIter): ', bestModel._java_obj.getMaxIter())

        # modelPmmlPipeline = PMMLPipeline([
        #   ("pretrained-estimator", objs["trained_model"])
        # ])
        # try:
        #     modelPmmlPipeline.target_field = result_column
        #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
        #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
        #     pmmlfile = open(pmml_filepath,"r")
        #     pmmlText = pmmlfile.read()
        #     pmmlfile.close()
        #     self._result_setter.update_pmml_object({self._slug:pmmlText})
        # except:
        #     pass

        coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
        MLUtils.save_pipeline_or_model(bestModel,model_filepath)
        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
        transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
        transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
        sampleData = None
        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100)/nrows, seed=420)
        else:
            sampleData = transformed
        sampleData.show()  # show() already prints; it returns None
        evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global),2)
        # print transformed.count()
        mapeDf = transformed.select("mape")
        # print mapeDf.show()
        mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
        mapeStatsArr = mapeStats.items()
        mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
        # print mapeStatsArr
        quantileDf = transformed.select("prediction")
        # print quantileDf.show()
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
        quantileSummaryArr = quantileSummaryDict.items()
        quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
        # print quantileSummaryArr
        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Generalized Linear Regression")
        self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestModel.extractParamMap())  # bestEstimator is undefined here; the best model's param map is assumed
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_coefficinets_array(coefficientsArray)
        self._model_summary.set_feature_list([name for idx, name in featureMapping])  # x_train is undefined here; feature names recovered from featureMapping

        # print CommonUtils.convert_python_object_to_json(self._model_summary)
        modelSummaryJson = {
            "dropdown":{
                        "name":self._model_summary.get_algorithm_name(),
                        "accuracy":CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
                        "slug":self._model_summary.get_slug()
                        },
            "levelcount":self._model_summary.get_level_counts(),
            "modelFeatureList":self._model_summary.get_feature_list(),
            "levelMapping":self._model_summary.get_level_map_dict()
        }

        glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

        for card in glinrCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"generalizedlinearregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
        self._result_setter.set_glinr_cards(glinrCards)
Code Example #20
# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

#Build a vector with the label column and the features array column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

#Define the model algorithm (a Gaussian GLM, i.e. linear regression)
model_regresion = GeneralizedLinearRegression(family="gaussian",
                                              link="identity",
                                              maxIter=50,
                                              regParam=0.05)

# Fit the model
model_regresion = model_regresion.fit(train_LP)

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model_regresion.coefficients))
print("Intercept: %s" % str(model_regresion.intercept))

# COMMAND ----------

# Summarize the model over the training set and print out some metrics
Code Example #21
pipeline = Pipeline(stages=[])  # Must initialize with empty list!

# base pipeline (the processing here should be reused across pipelines)
basePipeline = [rformula]

#############################################################
# Specify Linear Regression model
lr = LinearRegression()
pl_lr = basePipeline + [lr]
pg_lr = ParamGridBuilder()\
          .baseOn({pipeline.stages: pl_lr})\
          .addGrid(lr.regParam,[0.01, .04])\
          .build()
#############################################################
# Specify Generalized Linear Regression model
glr = GeneralizedLinearRegression()
pl_glr = basePipeline + [glr]
pg_glr = ParamGridBuilder()\
      .baseOn({pipeline.stages: pl_glr})\
      .build()

#############################################################
# Specify Decision Tree model
dt = DecisionTreeRegressor()
pl_dt = basePipeline + [dt]
pg_dt = ParamGridBuilder()\
      .baseOn({pipeline.stages: pl_dt})\
      .build()

# One grid from the individual grids
paramGrid = pg_lr + pg_glr + pg_dt
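
The combined grid is meant to be consumed by a single tuning pass so all three model families compete; a minimal sketch, assuming a train DataFrame compatible with the rformula stage:

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

# hedged sketch: one cross-validated search over all three pipelines;
# `train` is an assumed input DataFrame for the rformula stage
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(),
                    numFolds=3)
cvModel = cv.fit(train)
print(cvModel.bestModel.stages)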
Code Example #22
cuse_df = assembler.transform(df)
cuse_df = cuse_df.withColumn('label', F.col('y'))
cuse_df.select("features", "label").show()

cuse_df.show(5)


# In[3]:
# ## Split data into training and test datasets
training, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)

# In[4]:
# ## Build Logistic Regression model

from pyspark.ml.regression import GeneralizedLinearRegression
logr = GeneralizedLinearRegression(family="binomial", link="logit", regParam=0.0)

# Fit the model to the data and call this model logr_Model
logr_Model = logr.fit(training)

# Print the model summary statistics
summary = logr_Model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))


# #### Prediction on training data
pred_training_cv = logr_Model.transform(training)
pred_training_cv.show(5, truncate=False)
Code Example #23
rfc_pred.select( "finalvector", "Claim_Amount", "binaryclaim", "prediction").show(100)

accuracy = accuracyEval.evaluate(rfc_pred)

print("accuracy of classifier is: ", accuracy)


#####3b: GLM

#training data filtered to just the rows with nonzero claims
trainFilt = trainingData.filter(col("binaryclaim")==1)


from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(featuresCol='finalvector', labelCol='Claim_Amount', regParam=0.01, family='gaussian', predictionCol = "combo_prediction")

glrmodel = glr.fit(trainFilt)

###only perform GLR on rows that have been predicted to have nonzero claims
predFilt = rfc_pred.filter(col("prediction")==1)


combo_pred = glrmodel.transform(predFilt)



#looking at a few rows:

combo_pred.select("finalvector",'Claim_Amount', "binaryclaim", "prediction", "combo_prediction" ).show(100)
#Now we get the MSE AND MAE
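
The closing comment promises MSE and MAE; a minimal sketch of computing them over the combined predictions, using the column names from the excerpt:

from pyspark.ml.evaluation import RegressionEvaluator

# hedged sketch: evaluate the combined classifier + GLR predictions
mse_eval = RegressionEvaluator(labelCol='Claim_Amount', predictionCol='combo_prediction', metricName='mse')
mae_eval = RegressionEvaluator(labelCol='Claim_Amount', predictionCol='combo_prediction', metricName='mae')
print('MSE:', mse_eval.evaluate(combo_pred))
print('MAE:', mae_eval.evaluate(combo_pred))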
Code Example #24
                   labels=li.labels)
######################
# models
# logistic regression
lr = LogisticRegression(
    featuresCol="features_pca",
    labelCol="delay_indexer",
    #maxIter=10, elasticNetParam=0.8,
    regParam=0.3,
    family="multinomial",
    predictionCol="prediction")
#pipeline creation; this one will not be used in the Magic Loop
pipeline1 = Pipeline(stages=[li0, li1, li2, va0, pca, li, lr, lc])
# linear regression
glr = GeneralizedLinearRegression(featuresCol="features_pca",
                                  labelCol="delay_indexer",
                                  family="gaussian")
#pipeline creation; this one will not be used in the Magic Loop
pipeline2 = Pipeline(stages=[li0, li1, li2, va0, pca, li, glr, lc])


#####################
# Magic Loop
def magic_loop3(pipelines, grid, train, test, cvfolds=3):
    best_score = 0.0  # starting score; any real metric value beats it
    best_grid = None  #inicializar la variable
    #este loop inicia las pruebas secuenciales de los pipelines:
    #es relevante que no sólo soporta 2, sino se va en cada uno
    #de los que estén presentes en la lista
    for pipe in pipelines:
        try:
Code Example #25
destino_indexada = StringIndexer(
    inputCol="DESTINATION_AIRPORT",
    outputCol="DESTINATION_AIRPORT_NUM").setHandleInvalid("skip")
vectorAssembler_features = VectorAssembler(inputCols=[
    "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE_NUM", "ORIGIN_AIRPORT_NUM",
    "DESTINATION_AIRPORT_NUM", "TAXI_OUT", "SCHEDULED_TIME", "ELAPSED_TIME",
    "AIR_TIME", "DISTANCE", "TAXI_IN", "ARRIVAL_DELAY", "DIVERTED"
],
                                           outputCol="features")

metod = {
    'gbt':
    GBTRegressor(labelCol="label", featuresCol="features", maxBins=640),
    'mlg':
    GeneralizedLinearRegression(labelCol="label",
                                featuresCol="features",
                                family="gaussian",
                                link="identity")
}

grid = {
    'gbt': ParamGridBuilder() \
        .addGrid(metod['gbt'].maxDepth, [2, 5, 7])\
        .addGrid(metod['gbt'].maxIter, [3, 5, 9])\
        .build(),

    'mlg': ParamGridBuilder() \
        .addGrid(metod['mlg'].regParam, [0.1, 0.3, 0.5])\
        .addGrid(metod['mlg'].maxIter, [3, 5, 9])\
        .build()
}
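
The metod/grid dictionaries pair each estimator with its search space; a minimal sketch of consuming them, assuming a train DataFrame with 'features' and 'label' columns:

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

# hedged sketch: cross-validate each method with its own grid
for name in metod:
    cv = CrossValidator(estimator=metod[name],
                        estimatorParamMaps=grid[name],
                        evaluator=RegressionEvaluator(metricName='rmse'),
                        numFolds=3)
    best = cv.fit(train).bestModel  # `train` is an assumed DataFrame
    print(name, best)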
Code Example #26
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            predictionCol='prediction_c',
                            maxBins=800)
binarizer = Binarizer(threshold=0.0001,
                      inputCol='Claim_Amount',
                      outputCol='label')

pipeline = Pipeline(stages=[binarizer, dt])
dtModel = pipeline.fit(traindata)
# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testdata)

non_zero_train = traindata.filter(traindata['Claim_Amount'] > 0.0)
non_zero_test = predictions.filter(predictions['prediction_c'] > 0.0)

print("Generalized Linear Regression with gamma family")

from pyspark.ml.regression import GeneralizedLinearRegression
glm_gamma = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=50,\
                                          family='gamma', link='log')
glm_model = glm_gamma.fit(non_zero_train)
predictions = glm_model.transform(non_zero_test)

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator\
      (labelCol="Claim_Amount", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE = %g " % rmse)

end = time.time()
Code Example #27
assembler = VectorAssembler(inputCols=featureNames, outputCol="features")
test_df = assembler.transform(test_df)
test_df = test_df.select("id", "features")

print("test vector assembled")
test_df.show(5)

# Split `train_df` into train and test sets (30% held out for testing)
#Split train and test (randomSplit takes its own seed; random.seed has no effect on it)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3], seed=0)

# ## Logistic Regression
#Fit logistic regression
glr = GeneralizedLinearRegression(family="binomial",
                                  link="logit",
                                  featuresCol="features",
                                  labelCol="is_duplicate")
trainLogitModel = glr.fit(trainingData)

#Logistic model predictions
LogitPredictions = trainLogitModel.transform(testData)

# Calculate AUC
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate",
                                          rawPredictionCol="prediction",
                                          metricName="areaUnderROC")
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees
#Fit decision tree model
Code Example #28
display(testingData2)

# COMMAND ----------

#START HERE

# COMMAND ----------

#Random Forests Documentation/Example
#https://spark.apache.org/docs/2.2.0/mllib-ensembles.html#random-forests

# COMMAND ----------

#Generalized Linear Regression Documentation/Example
#https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#generalized-linear-regression
glm = GeneralizedLinearRegression(family="poisson", link="sqrt")
rfr = RandomForestRegressor(impurity="variance", numTrees=50)

# COMMAND ----------

#trainingDataDF, testingDataDF = trainingData2.randomSplit([0.8, 0.2], seed=0L)

# COMMAND ----------

pipeline = Pipeline(stages=[glm])
pipeline2 = Pipeline(stages=[rfr])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(glm.maxIter, [8, 10, 12]).addGrid(
    glm.regParam, [0.4, 0.6, 0.8]).build()
Code Example #29
# COMMAND ----------

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Load training data
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")

glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
Code Example #30
logisticReg_prediction = logisticReg_model2.transform(testData)

evaluator = BinaryClassificationEvaluator(labelCol="not_zero",
                                          metricName="areaUnderROC")
auc = evaluator.evaluate(logisticReg_prediction)
end = time.time()
print('Logistic Regression Execution time:', end - start)
print("auc = %g" % auc)

train_notzero = trainingData.filter('not_zero != 0')
test_notzero = testData.filter('not_zero != 0')

#training glm model
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
glm_gamma = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=10, regParam=0.01,\
                                          family='gamma', link='identity')
start = time.time()
glm_model = glm_gamma.fit(train_notzero)

#select zero sample
pred_zero = logisticReg_prediction.filter('prediction == 0')
pred_zero = pred_zero.withColumn('claim_prediction',
                                 pred_zero['not_zero'] * 0).select(
                                     'Claim_Amount', 'claim_prediction')

#extract non zero value
pred_nonzero = logisticReg_prediction.filter('prediction != 0')
pred_nonzero = pred_nonzero.select('features', 'Claim_Amount')

#compare model with non zero value
pred_amount = glm_model.transform(pred_nonzero)
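
The excerpt stops after scoring the non-zero rows; a minimal sketch of stacking both branches and evaluating, assuming the column names above:

# hedged sketch: combine zero predictions with gamma predictions, then score MAE
pred_amount = pred_amount.withColumnRenamed('prediction', 'claim_prediction')
combined = pred_zero.union(pred_amount.select('Claim_Amount', 'claim_prediction'))
evaluator = RegressionEvaluator(labelCol='Claim_Amount', predictionCol='claim_prediction', metricName='mae')
print('mae = %g' % evaluator.evaluate(combined))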