def bestGeneralizedLR(trainDf, metricDF, metricToCompare):
    regParam = [1.0, 0.6, 0.2]
    tol = [1.0, 0.6, 0.2, 0.0]
    family = ["poisson", "gaussian"]
    link = {"poisson": ["identity", "sqrt", "log"], "gaussian": ["identity"]}
    models = []
    for r in regParam:
        for f in family:
            for l in link.get(f):
                for t in tol:
                    models.append(
                        GeneralizedLinearRegression(
                            maxIter=10, regParam=r, family=f, link=l, tol=t
                        ).fit(trainDf))
    return getBestModel(models, metricDF, metricToCompare)
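# getBestModel is referenced above but not shown. A minimal sketch of what it might
# look like, assuming metricDF is a held-out DataFrame with "label"/"features" columns
# and metricToCompare is a RegressionEvaluator metric name such as "rmse" (both assumptions):
from pyspark.ml.evaluation import RegressionEvaluator

def getBestModel(models, metricDF, metricToCompare):
    evaluator = RegressionEvaluator(metricName=metricToCompare)
    # score every fitted model on the held-out DataFrame
    scored = [(m, evaluator.evaluate(m.transform(metricDF))) for m in models]
    # pick the winner according to whether the metric should be maximized or minimized
    if evaluator.isLargerBetter():
        return max(scored, key=lambda pair: pair[1])[0]
    return min(scored, key=lambda pair: pair[1])[0]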
def test_glr_load(self):
    df = self.spark.createDataFrame(
        [(1.0, Vectors.dense(0.0, 0.0)),
         (1.0, Vectors.dense(1.0, 2.0)),
         (2.0, Vectors.dense(0.0, 0.0)),
         (2.0, Vectors.dense(1.0, 1.0))],
        ["label", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p")
    model = glr.fit(df)
    self.assertEqual(model.getSolver(), "irls")
    transformed1 = model.transform(df)
    path = tempfile.mkdtemp()
    model_path = path + "/glr"
    model.save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    self.assertEqual(model2.getSolver(), "irls")
    transformed2 = model2.transform(df)
    self.assertEqual(transformed1.take(4), transformed2.take(4))
def selectRegressionMethod(regressionMethodName, featureName):
    if regressionMethodName == "rf":
        # `test` is expected to be a module-level flag; use a single tree when testing
        if test:
            nt = 1
        else:
            nt = 100
        modelParameters = {
            'featuresCol': featureName,
            'numTrees': nt,
            'subsamplingRate': 1,
            'maxDepth': 10
        }
        regressionMethod = RandomForestRegressor(
            featuresCol=modelParameters['featuresCol'],
            numTrees=modelParameters['numTrees'],
            subsamplingRate=modelParameters['subsamplingRate'],
            maxDepth=modelParameters['maxDepth'])
    elif regressionMethodName == "gbt":
        modelParameters = {'featuresCol': featureName, 'maxIter': 10}
        regressionMethod = GBTRegressor(
            featuresCol=modelParameters['featuresCol'],
            maxIter=modelParameters['maxIter'])
    elif regressionMethodName == "glr":
        modelParameters = {
            'featuresCol': featureName,
            'family': "poisson",
            'link': 'log',
            'maxIter': 10,
            'regParam': 0.3
        }
        regressionMethod = GeneralizedLinearRegression(
            featuresCol=modelParameters['featuresCol'],
            family=modelParameters['family'],
            link=modelParameters['link'],
            maxIter=modelParameters['maxIter'],
            regParam=modelParameters['regParam'])
    else:
        print('Invalid regression method')
        return ()
    #print('Regression method selected')
    return (regressionMethod, modelParameters)
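# A minimal usage sketch; the DataFrame name `trainDf` and its column layout
# ("features" vector plus default "label" column) are assumptions:
result = selectRegressionMethod("glr", "features")
if result:
    method, params = result
    fitted = method.fit(trainDf)  # trainDf is assumed to contain `features` and `label`
    print(fitted.explainParams())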
def test_glr_summary(self):
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def test_offset(self):
    df = self.spark.createDataFrame(
        [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
         (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
         (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
         (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))],
        ["label", "weight", "offset", "features"])
    glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581], atol=1e-4))
    self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1e-4))
def test_tweedie_distribution(self):
    df = self.spark.createDataFrame(
        [(1.0, Vectors.dense(0.0, 0.0)),
         (1.0, Vectors.dense(1.0, 2.0)),
         (2.0, Vectors.dense(0.0, 0.0)),
         (2.0, Vectors.dense(1.0, 1.0))],
        ["label", "features"])
    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
    model = glr.fit(df)
    self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1e-4))
    self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1e-4))
    model2 = glr.setLinkPower(-1.0).fit(df)
    self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1e-4))
    self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1e-4))
def linear_regression(ticker, writer):
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()
    # Load training data
    dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=1, regParam=0.8)
    # Fit the model
    model = glr.fit(dataset)
    data = [ticker, 'coefficient:', model.coefficients[0], 'intercept:', model.intercept]
    writer.writerow(data)
    print(data)
    # predict
    today_close_value = 0.0
    yesterday_close_value = 0.0
    with open("../data/lr/" + ticker + ".csv") as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in reader:
            if count == 0:  # compare values with ==, not `is`
                today_close_value = float(row[0])
                count += 1
            elif count == 1:
                yesterday_close_value = float(row[0])
                break
    # print(today_close_value)
    # print(yesterday_close_value)
    predict_close_value = -1 * float(model.coefficients[0]) + float(model.intercept)
    # print(predict_close_value)
    spark.stop()
    if predict_close_value >= yesterday_close_value and today_close_value >= yesterday_close_value:
        return True
    elif predict_close_value <= yesterday_close_value and today_close_value <= yesterday_close_value:
        return True
    else:
        return False
def generalized_linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        linkPredictionCol="p")  # needed below, where transformed.head().p is read
    model = glr.fit(df)
    transformed = model.transform(df)
    abs(transformed.head().prediction - 1.5) < 0.001  # True
    abs(transformed.head().p - 1.5) < 0.001  # True
    model.coefficients
    model.numFeatures  # 2
    abs(model.intercept - 1.5) < 0.001  # True
    temp_path = "./"
    glr_path = temp_path + "/glr"
    glr.save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    glr.getFamily() == glr2.getFamily()  # True
    model_path = temp_path + "/glr_model"
    model.save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    model.intercept == model2.intercept  # True
    model.coefficients[0] == model2.coefficients[0]
def generalizeRegression(df, label, features, adjust):
    """
    Returns the RMSE and the predictions from the generalized linear regression
    model applied to the dataframe with the specified feature columns.
    """
    ## Columns with non-numerical values are adjusted
    for col in adjust:
        indexer = StringIndexer(inputCol=col, outputCol="{}_num".format(col))
        features.append("{}_num".format(col))
        df = indexer.fit(df).transform(df)
    ## Features vector assembled from the dataframe for model processing
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    assembled = assembler.transform(df)
    gr = GeneralizedLinearRegression(featuresCol='features', labelCol=label,
                                     regParam=0.3, family="poisson")
    grModel = gr.fit(assembled)
    predictions = grModel.transform(assembled)
    ## Evaluator required for RMSE estimation
    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    result = {
        "RMSE": rmse,
        "predictions": [r["prediction"] for r in predictions.select("prediction").collect()]
    }
    return result
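# A minimal usage sketch; the DataFrame `rides` and its column names are assumptions:
out = generalizeRegression(rides, label="trip_count",
                           features=["hour", "temperature"], adjust=["weekday"])
print(out["RMSE"])
print(out["predictions"][:5])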
def model():
    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')
    # f-string so the "group by 1, ..., 8" clause is actually interpolated
    sample = sql.sql(f'''
        select
            hash_number_A
            ,interest_1
            ,interest_2
            ,interest_3
            ,interest_4
            ,interest_5
            ,device_type
            ,phone_price_category
            ,sum(cost) as label
        from data
        group by {", ".join(str(n) for n in range(1, 8 + 1))}''')
    breakpoint()
    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='interest_1', outputCol='interest'),
        StringIndexer(inputCol='phone_price_category', outputCol='phone_price'),
        VectorAssembler(inputCols=['interest', 'phone_price'], outputCol='features'),
    ])
    model_data = pipeline.fit(sample)
    sample = model_data.transform(sample)
    # supported families: 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'
    regression = GeneralizedLinearRegression(family='gaussian', labelCol='label',
                                             featuresCol='features', maxIter=10, regParam=0.3)
    model = regression.fit(sample)
    breakpoint()
assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
dataset = assembler.transform(dataset)
scaler_model = None
if args.mode == 'train':
    scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec',
                            withStd=True, withMean=True)
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)
polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
dataset = polyExpansion.transform(dataset)
dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
glr = None
if args.mode == 'train':
    glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures',
                                      family='binomial', linkPredictionCol='link_pred')
    paramGrid = ParamGridBuilder() \
        .addGrid(glr.link, ['logit']) \
        .addGrid(glr.regParam, [1e-5]) \
        .build()
    tvs = TrainValidationSplit(estimator=glr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'),
                               trainRatio=0.7)
    tvs_model = tvs.fit(dataset)
    print('----> {}'.format(tvs_model.validationMetrics))
    if args.save_model:
        tvs_model.write().save('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
else:
    #glr_model = GeneralizedLinearRegressionModel.load('/user/ronghui_safe/hgy/nid/models/glm_binomial_model')
    glr_model = TrainValidationSplitModel.load('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
df_train_label = df_train_label.withColumn(
    'realdate', udf_strpTime_trainlabel(df_train_label['date'])).drop('date')
df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)
train, validation = df_new.randomSplit([0.80, 0.20])
assembler = VectorAssembler(inputCols=[
    'realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10',
    'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20',
    'e21', 'e22', 'e23', 'e24', 'e25', 'e26'
], outputCol='features')
gr = GeneralizedLinearRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[assembler, gr])
model = pipeline.fit(train)
prediction = model.transform(validation)
evaluator = RegressionEvaluator(predictionCol='prediction')
res = evaluator.evaluate(prediction, {evaluator.metricName: 'mse'})
print("generalized linear regression mse is: %f" % res)
gbt = GBTRegressor(featuresCol='features', labelCol='label')
gbt_pipeline = Pipeline(stages=[assembler, gbt])
gbt_model = gbt_pipeline.fit(train)
gbt_prediction = gbt_model.transform(validation)
gbt_res = evaluator.evaluate(gbt_prediction, {evaluator.metricName: 'mse'})
print("gbt regression mse is: %f" % gbt_res)
predictions = dtModel.transform(test_bin)
accuracy = evaluator.evaluate(predictions)
print("LR Accuracy = %g " % accuracy)
print('AUC:',
      BinaryClassificationMetrics(predictions['label', 'prediction'].rdd).areaUnderROC)

## gamma regression with predictions
gam = predictions.filter(predictions.prediction > 0).filter(predictions.label > 0)
glr = GeneralizedLinearRegression(labelCol="label",
                                  featuresCol="pcaFeatures",
                                  predictionCol="gammaprediction",
                                  family="gamma",
                                  link="inverse",
                                  maxIter=10)
## Fit the model
model = glr.fit(gam)
gammapred = model.transform(gam)
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="gammaprediction",
                                metricName="r2")
r2 = evaluator.evaluate(gammapred)
print("Evaluating gamma prediction:")
print("R2 = %g " % r2)
# Let's see how many numerical features we have:
num_cols = [item[0] for item in df.dtypes
            if item[1].startswith('int') | item[1].startswith('double')][1:]
print(str(len(num_cols)) + ' numerical features')
Data = df.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
Data.show()
pd.DataFrame(Data.take(5), columns=Data.columns)
testset, trainset = Data.randomSplit([0.3, 0.7], seed=25)
print("Training Dataset Count: " + str(trainset.count()))
print("Test Dataset Count: " + str(testset.count()))

### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label",
                                  featuresCol="features", family="binomial", link="logit",
                                  maxIter=10, regParam=0.01)
model = glr.fit(Data)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("P Values: " + str(summary.pValues))

# Removing all the columns that had a p-value above 0.05
vs = VectorSlicer(inputCol="features", outputCol="selected_features",
                  indices=[0, 2, 9, 18, 21, 23, 24, 26, 27, 28, 31, 32, 37, 41])
Training_set = vs.transform(trainset)
Test_set = vs.transform(testset)

#### LOGISTIC REGRESSION
# fit on the sliced feature vector produced above
logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label",
                            featuresCol="selected_features", maxIter=20, regParam=0.01,
                            elasticNetParam=0.8, family="binomial")
logReg_model = logReg.fit(Training_set)
trainingSummary = logReg_model.summary
roc = trainingSummary.roc.toPandas()
def binomialSparkGLF(self):
    # Spark's Estimator.fit takes a single DataFrame containing both the label and the
    # assembled features column, not separate X/y objects; this assumes the class exposes
    # such a training DataFrame as self.train (as in scalarSparkGLR below).
    regr = GeneralizedLinearRegression(family="binomial")
    model = regr.fit(self.train)
    return model
def scalarSparkGLR(self):
    regr = GeneralizedLinearRegression()
    model = regr.fit(self.train)
    return model
], outputCol='features')
v_data = vectorAssembler.transform(data)
v_data.show(10)

# Split into training and test sets
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
splits = vdata.randomSplit([0.7, 0.3])
train_data = splits[0]
test_data = splits[1]

# Train the model
glr = GeneralizedLinearRegression(family="gaussian", link="identity", labelCol='medv',
                                  featuresCol='features', maxIter=1000, regParam=0.3)
# Fit the model
GlModel = glr.fit(train_data)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(GlModel.coefficients))
print("Intercept: " + str(GlModel.intercept))

# Summarize the model over the training set and print out some metrics
summary = GlModel.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
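# The snippet above only summarizes the training fit; a minimal sketch of scoring the
# held-out split with RegressionEvaluator (the RMSE metric choice is an assumption):
from pyspark.ml.evaluation import RegressionEvaluator

test_pred = GlModel.transform(test_data)
rmse = RegressionEvaluator(labelCol='medv', predictionCol='prediction',
                           metricName='rmse').evaluate(test_pred)
print("Test RMSE: " + str(rmse))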
# Convert the label of data which has a non-zero label to 1
from pyspark.sql.functions import when
train_set1 = training_set.withColumn('Claim_Amount', when(training_set.Claim_Amount != 0, 1).otherwise(0))
test_set1 = test_set.withColumn('Claim_Amount', when(test_set.Claim_Amount != 0, 1).otherwise(0))

# The binary classifier
model_start = time.time()
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(featuresCol='features', labelCol='Claim_Amount',
                             maxDepth=5, numTrees=3, seed=myseed)
RFC_model = rfc.fit(train_set1)

# Gamma Regressor
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount',
                                  family='gamma', link='identity')
GLR_model = glr.fit(data_Claim_else)

# Combine the two models
predict_RFC = RFC_model.transform(test_set)
# select the rows which were predicted as 1
RFC_result = predict_RFC[predict_RFC['prediction'] == 1].select('features', 'Claim_Amount')
GLR_result = GLR_model.transform(RFC_result)
model_end = time.time()

mse = evaluatorMSE.evaluate(GLR_result)
mae = evaluatorMAE.evaluate(GLR_result)
print('mse :', mse)
print('mae :', mae)
print('Time:', model_end - model_start)
def Train(self):
    st_global = time.time()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = filter(lambda x: x["algorithmSlug"] == GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"], algosToRun)[0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print categorical_columns
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print "model_path", model_path
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"

    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column, algoType="regression")
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))
    # print indexed.select([result_column,"features"]).show(5)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)

    glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features', predictionCol="prediction")
    if validationDict["name"] == "kFold":
        defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
        numFold = int(validationDict["value"])
        if numFold == 0:
            numFold = 3
        trainingData, validationData = indexed.randomSplit([defaultSplit, 1 - defaultSplit], seed=12345)
        paramGrid = ParamGridBuilder()\
            .addGrid(glinr.regParam, [0.1, 0.01]) \
            .addGrid(glinr.fitIntercept, [False, True])\
            .build()
        crossval = CrossValidator(estimator=glinr,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                                  numFolds=numFold)
        st = time.time()
        cvModel = crossval.fit(indexed)
        trainingTime = time.time() - st
        print "cvModel training takes", trainingTime
        bestModel = cvModel.bestModel
    elif validationDict["name"] == "trainAndtest":
        trainingData, validationData = indexed.randomSplit([float(validationDict["value"]), 1 - float(validationDict["value"])], seed=12345)
        st = time.time()
        fit = glinr.fit(trainingData)
        trainingTime = time.time() - st
        print "time to train", trainingTime
        bestModel = fit

    print bestModel.explainParams()
    print bestModel.extractParamMap()
    print bestModel.params
    print 'Best Param (regParam): ', bestModel._java_obj.getRegParam()
    print 'Best Param (MaxIter): ', bestModel._java_obj.getMaxIter()
    # modelPmmlPipeline = PMMLPipeline([
    #     ("pretrained-estimator", objs["trained_model"])
    # ])
    # try:
    #     modelPmmlPipeline.target_field = result_column
    #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
    #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
    #     pmmlfile = open(pmml_filepath,"r")
    #     pmmlText = pmmlfile.read()
    #     pmmlfile.close()
    #     self._result_setter.update_pmml_object({self._slug: pmmlText})
    # except:
    #     pass

    coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
    MLUtils.save_pipeline_or_model(bestModel, model_filepath)
    transformed = bestModel.transform(validationData)
    transformed = transformed.withColumn(result_column, transformed[result_column].cast(DoubleType()))
    transformed = transformed.select([result_column, "prediction", transformed[result_column] - transformed["prediction"]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
    transformed = transformed.select([result_column, "prediction", "difference", FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")
    sampleData = None
    nrows = transformed.count()
    if nrows > 100:
        sampleData = transformed.sample(False, float(100) / nrows, seed=420)
    else:
        sampleData = transformed
    print sampleData.show()

    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
    metrics = {}
    metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
    metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
    metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
    metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
    runtime = round((time.time() - st_global), 2)
    # print transformed.count()

    mapeDf = transformed.select("mape")
    # print mapeDf.show()
    mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
    mapeStatsArr = mapeStats.items()
    mapeStatsArr = sorted(mapeStatsArr, key=lambda x: int(x[0]))
    # print mapeStatsArr
    quantileDf = transformed.select("prediction")
    # print quantileDf.show()
    quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
    quantileSummaryArr = quantileSummaryDict.items()
    quantileSummaryArr = sorted(quantileSummaryArr, key=lambda x: int(x[0]))
    # print quantileSummaryArr

    self._model_summary.set_model_type("regression")
    self._model_summary.set_algorithm_name("Generalized Linear Regression")
    self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
    self._model_summary.set_slug(self._slug)
    self._model_summary.set_training_time(runtime)
    self._model_summary.set_training_time(trainingTime)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_validation_method(validationDict["displayName"])
    self._model_summary.set_model_evaluation_metrics(metrics)
    self._model_summary.set_model_params(bestModel.extractParamMap())  # params of the best fitted model
    self._model_summary.set_quantile_summary(quantileSummaryArr)
    self._model_summary.set_mape_stats(mapeStatsArr)
    self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
    self._model_summary.set_coefficinets_array(coefficientsArray)
    self._model_summary.set_feature_list([name for idx, name in featureMapping])  # feature names from the assembled vector
    # print CommonUtils.convert_python_object_to_json(self._model_summary)

    modelSummaryJson = {
        "dropdown": {
            "name": self._model_summary.get_algorithm_name(),
            "accuracy": CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
            "slug": self._model_summary.get_slug()
        },
        "levelcount": self._model_summary.get_level_counts(),
        "modelFeatureList": self._model_summary.get_feature_list(),
        "levelMapping": self._model_summary.get_level_map_dict()
    }
    glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
    for card in glinrCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"generalizedlinearregression": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
    self._result_setter.set_glinr_cards(glinrCards)
# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Build a vector with the label column and the features array column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (generalized linear regression, Gaussian family with identity link)
model_regresion = GeneralizedLinearRegression(family="gaussian", link="identity",
                                              maxIter=50, regParam=0.05)
# Fit the model
model_regresion = model_regresion.fit(train_LP)

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model_regresion.coefficients))
print("Intercept: %s" % str(model_regresion.intercept))

# COMMAND ----------

# Summarize the model over the training set and print out some metrics
pipeline = Pipeline(stages=[])  # Must initialize with empty list!

# base pipeline (the processing here should be reused across pipelines)
basePipeline = [rformula]

#############################################################
# Specify Linear Regression model
lr = LinearRegression()
pl_lr = basePipeline + [lr]
pg_lr = ParamGridBuilder()\
    .baseOn({pipeline.stages: pl_lr})\
    .addGrid(lr.regParam, [0.01, 0.04])\
    .build()

#############################################################
# Specify Generalized Linear Regression model
rf = GeneralizedLinearRegression()
pl_rf = basePipeline + [rf]
pg_rf = ParamGridBuilder()\
    .baseOn({pipeline.stages: pl_rf})\
    .build()

#############################################################
# Specify Decision Tree model
dt = DecisionTreeRegressor()
pl_dt = basePipeline + [dt]
pg_dt = ParamGridBuilder()\
    .baseOn({pipeline.stages: pl_dt})\
    .build()

# One grid from the individual grids
paramGrid = pg_lr + pg_rf + pg_dt
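# A minimal sketch of how the combined grid could be searched: because each grid point
# sets pipeline.stages via baseOn, a single CrossValidator over the empty pipeline tries
# every model family. The training DataFrame `train_df` and the default "label"/"rmse"
# evaluator settings are assumptions:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(metricName="rmse"),
                    numFolds=3)
cvModel = cv.fit(train_df)
print(cvModel.bestModel.stages)  # stages of the winning pipeline variant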
cuse_df = assembler.transform(df)
cuse_df = cuse_df.withColumn('label', F.col('y'))
cuse_df.select("features", "label").show()
cuse_df.show(5)

# In[3]:
# ## Split data into training and test datasets
training, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)

# In[4]:
# ## Build Logistic Regression model (a binomial GLM with logit link)
from pyspark.ml.regression import GeneralizedLinearRegression
logr = GeneralizedLinearRegression(family="binomial", link="logit", regParam=0.0)

# Fit the model to the data and call this model logr_Model
logr_Model = logr.fit(training)

# Print the summary statistics
summary = logr_Model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))

# #### Prediction on training data
pred_training_cv = logr_Model.transform(training)
pred_training_cv.show(5, truncate=False)
rfc_pred.select( "finalvector", "Claim_Amount", "binaryclaim", "prediction").show(100) accuracy = accuracyEval.evaluate(rfc_pred) print("accuracy of classifier is: ", accuracy) #####3b: GLM #training data filtered to just the rows with nonzero claims trainFilt = trainingData.filter(col("binaryclaim")==1) from pyspark.ml.regression import GeneralizedLinearRegression glr = GeneralizedLinearRegression(featuresCol='finalvector', labelCol='Claim_Amount', regParam=0.01, family='gaussian', predictionCol = "combo_prediction") glrmodel = glr.fit(trainFilt) ###only perform GLR on rows that have been predicted to have nonzero claims predFilt = rfc_pred.filter(col("prediction")==1) combo_pred = glrmodel.transform(predFilt) #looking at a few rows: combo_pred.select("finalvector",'Claim_Amount', "binaryclaim", "prediction", "combo_prediction" ).show(100) #Now we get the MSE AND MAE
    labels=li.labels)

######################
# models

# logistic regression
lr = LogisticRegression(
    featuresCol="features_pca",
    labelCol="delay_indexer",
    #maxIter=10,
    elasticNetParam=0.8,
    regParam=0.3,
    family="multinomial",
    predictionCol="prediction")

# build the pipeline; this one will not be used in the Magic Loop
pipeline1 = Pipeline(stages=[li0, li1, li2, va0, pca, li, lr, lc])

# linear regression
glr = GeneralizedLinearRegression(featuresCol="features_pca",
                                  labelCol="delay_indexer",
                                  family="gaussian")

# build the pipeline; this one will not be used in the Magic Loop
pipeline2 = Pipeline(stages=[li0, li1, li2, va0, pca, li, glr, lc])

#####################
# Magic Loop
def magic_loop3(pipelines, grid, train, test, cvfolds=3):
    best_score = 0.0  # symbolic high value :-)
    best_grid = None  # initialize the variable
    # this loop starts the sequential trials of the pipelines:
    # note that it supports not just 2 pipelines but every one
    # present in the list
    for pipe in pipelines:
        try:
destino_indexada = StringIndexer(
    inputCol="DESTINATION_AIRPORT",
    outputCol="DESTINATION_AIRPORT_NUM").setHandleInvalid("skip")

vectorAssembler_features = VectorAssembler(inputCols=[
    "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE_NUM", "ORIGIN_AIRPORT_NUM",
    "DESTINATION_AIRPORT_NUM", "TAXI_OUT", "SCHEDULED_TIME", "ELAPSED_TIME",
    "AIR_TIME", "DISTANCE", "TAXI_IN", "ARRIVAL_DELAY", "DIVERTED"
], outputCol="features")

metod = {
    'gbt': GBTRegressor(labelCol="label", featuresCol="features", maxBins=640),
    'mlg': GeneralizedLinearRegression(labelCol="label", featuresCol="features",
                                       family="gaussian", link="identity")
}

grid = {
    'gbt': ParamGridBuilder()
        .addGrid(metod['gbt'].maxDepth, [2, 5, 7])
        .addGrid(metod['gbt'].maxIter, [3, 5, 9])
        .build(),
    'mlg': ParamGridBuilder()
        .addGrid(metod['mlg'].regParam, [0.1, 0.3, 0.5])
        .addGrid(metod['mlg'].maxIter, [3, 5, 9])
        .build()
}
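# A minimal sketch of searching each method with its grid; the training DataFrame
# `train_df` and the 3-fold / RMSE choices are assumptions:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

for name, estimator in metod.items():
    cv = CrossValidator(estimator=estimator,
                        estimatorParamMaps=grid[name],
                        evaluator=RegressionEvaluator(labelCol="label", metricName="rmse"),
                        numFolds=3)
    cv_model = cv.fit(train_df)  # train_df must already contain "features" and "label"
    print(name, min(cv_model.avgMetrics))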
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", predictionCol='prediction_c', maxBins=800) binarizer = Binarizer(threshold=0.0001, inputCol='Claim_Amount', outputCol='label') pipeline = Pipeline(stages=[binarizer, dt]) dtModel = pipeline.fit(traindata) # Make predictions on test data using the Transformer.transform() method. predictions = dtModel.transform(testdata) non_zero_train = traindata.filter(traindata['Claim_Amount'] > 0.0) non_zero_test = predictions.filter(predictions['prediction_c'] > 0.0) print("Generalized Linear Regression with gamma family") from pyspark.ml.regression import GeneralizedLinearRegression glm_gamma = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=50,\ family='gamma', link='log') glm_model = glm_gamma.fit(non_zero_train) predictions = glm_model.transform(non_zero_test) from pyspark.ml.evaluation import RegressionEvaluator evaluator = RegressionEvaluator\ (labelCol="Claim_Amount", predictionCol="prediction", metricName="rmse") rmse = evaluator.evaluate(predictions) print("RMSE = %g " % rmse) end = time.time()
assembler = VectorAssembler(inputCols=featureNames, outputCol="features")
test_df = assembler.transform(test_df)
test_df = test_df.select("id", "features")
print("test vector assembled")
test_df.show(5)

# Split `train_df` into train and test sets (30% held out for testing)
# the seed is passed to randomSplit so the split is reproducible
(trainingData, testData) = train_df.randomSplit([0.7, 0.3], seed=0)

# ## Logistic Regression
# Fit logistic regression (binomial GLM with logit link)
glr = GeneralizedLinearRegression(family="binomial", link="logit",
                                  featuresCol="features", labelCol="is_duplicate")
trainLogitModel = glr.fit(trainingData)

# Logistic model predictions
LogitPredictions = trainLogitModel.transform(testData)

# Calculate AUC
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate",
                                          rawPredictionCol="prediction",
                                          metricName="areaUnderROC")
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees
# Fit decision tree model
display(testingData2)

# COMMAND ----------

# START HERE

# COMMAND ----------

# Random Forests Documentation/Example
# https://spark.apache.org/docs/2.2.0/mllib-ensembles.html#random-forests

# COMMAND ----------

# Generalized Linear Regression Documentation/Example
# https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#generalized-linear-regression

glm = GeneralizedLinearRegression(family="poisson", link="sqrt")
rfr = RandomForestRegressor(impurity="variance", numTrees=50)

# COMMAND ----------

#trainingDataDF, testingDataDF = trainingData2.randomSplit([0.8, 0.2], seed=0L)

# COMMAND ----------

pipeline = Pipeline(stages=[glm])
pipeline2 = Pipeline(stages=[rfr])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(glm.maxIter, [8, 10, 12]).addGrid(
    glm.regParam, [0.4, 0.6, 0.8]).build()
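# COMMAND ----------

# A minimal sketch of fitting the grid above with cross-validation; the label column
# name, the metric, and the use of trainingData2 as the training frame are assumptions:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(labelCol="label", metricName="rmse"),
                    numFolds=3)
cvModel = cv.fit(trainingData2)  # trainingData2 must contain "features" and "label"
print(cvModel.avgMetrics)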
# COMMAND ----------

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Load training data
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")

glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
logisticReg_prediction = logisticReg_model2.transform(testData)
evaluator = BinaryClassificationEvaluator(labelCol="not_zero", metricName="areaUnderROC")
auc = evaluator.evaluate(logisticReg_prediction)
end = time.time()
print('Logistic Regression Execution time:', end - start)
print("auc = %g" % auc)

train_notzero = trainingData.filter('not_zero != 0')
test_notzero = testData.filter('not_zero != 0')

# training glm model
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
glm_poisson = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount',
                                          maxIter=10, regParam=0.01,
                                          family='gamma', link='identity')
start = time.time()
glm_model = glm_poisson.fit(train_notzero)

# select zero sample
pred_zero = logisticReg_prediction.filter('prediction == 0')
pred_zero = pred_zero.withColumn('claim_prediction', pred_zero['not_zero'] * 0).select(
    'Claim_Amount', 'claim_prediction')

# extract non zero value
pred_nonzero = logisticReg_prediction.filter('prediction != 0')
pred_nonzero = pred_nonzero.select('features', 'Claim_Amount')

# compare model with non zero value
pred_amount = glm_model.transform(pred_nonzero)