def test_rformula_force_index_label(self): df = self.spark.createDataFrame([(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]) # Does not index label by default since it's numeric type. rf = RFormula(formula="y ~ x + s") model = rf.fit(df) transformedDF = model.transform(df) self.assertEqual(transformedDF.head().label, 1.0) # Force to index label. rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True) model2 = rf2.fit(df) transformedDF2 = model2.transform(df) self.assertEqual(transformedDF2.head().label, 0.0)
def test_rformula_force_index_label(self): df = self.spark.createDataFrame([ (1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]) # Does not index label by default since it's numeric type. rf = RFormula(formula="y ~ x + s") model = rf.fit(df) transformedDF = model.transform(df) self.assertEqual(transformedDF.head().label, 1.0) # Force to index label. rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True) model2 = rf2.fit(df) transformedDF2 = model2.transform(df) self.assertEqual(transformedDF2.head().label, 0.0)
def spark_ml():
    """End-to-end purchase-prediction pipeline (StringIndexer -> RFormula ->
    RandomForestRegressor) that writes predictions to submission.csv.

    NOTE: relies on module-level DataFrames `train` and `test`.
    """
    # Products present in test but never seen in train (potential unseen labels).
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    diff_cat_in_train_test.distinct().count()

    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()

    from pyspark.ml.feature import RFormula
    # String columns are one-hot encoded, numeric columns cast to double.
    formula = RFormula(
        formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",
        featuresCol="features", labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()

    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    # Hold-out evaluation on a 70/30 split of the training data.
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)

    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    # BUG FIX: the original `np.sqrt(mse), mse` built a tuple and discarded it
    # (dead code outside a notebook); report the metrics explicitly.
    rmse = np.sqrt(mse)
    print("RMSE: %s  MSE: %s" % (rmse, mse))

    # Retrain on the full training set and predict on the real test set.
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID", "Product_ID as Product_ID",
                                 'prediction as Purchase')
    # BUG FIX: index=False keeps the pandas row index out of the submission file.
    df.toPandas().to_csv('submission.csv', index=False)
def Chi_sqr(dataset_add, feature_colm, label_colm):
    """Run a chi-square relevance test of `feature_colm` against the response.

    :param dataset_add: path of the CSV file to load
    :param feature_colm: list of feature column names
    :param label_colm: list of label column names (only the last one is used,
                       matching the original loop's behavior)
    :return: dict {'pvalues': str} with the test's p-values
    Relies on module-level `spark`, `RFormula` and `ChiSquareTest`.
    """
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
    dataset.show()

    # The last entry of label_colm becomes the response variable.
    label = ''
    for y in label_colm:
        label = y
    print(label)

    # Build "label ~ f1+f2+..." — join() replaces the original
    # append-then-strip-trailing-"+" loop and the no-op `f = (f)`.
    f = label + " ~ " + "+".join(feature_colm)
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    length = len(feature_colm)  # len() instead of feature_colm.__len__()
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()

    # Chi-square selector; numTopFeatures == feature count keeps them all.
    from pyspark.ml.feature import ChiSqSelector
    selector = ChiSqSelector(numTopFeatures=length, featuresCol="features",
                             outputCol="selected_features", labelCol="label")
    result = selector.fit(output).transform(output)
    print("chi2 output with top %d features selected " % selector.getNumTopFeatures())
    result.show()

    # Run the chi-square test on the selected features.
    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))
    print("statistics: " + str(r.statistics))
    json_response = {'pvalues': p_values}
    return json_response
# Chi_sqr(dataset_add, features_colm, label_colm)
def feature_vector(df, idcol, colname, regressors):
    """Assemble an R-style formula `colname ~ r1+r2+...` and return a frame
    with the id column plus the resulting 'features' vector and 'label'.

    :param df: input DataFrame
    :param idcol: identifier column to carry through
    :param colname: response column name (left-hand side of the formula)
    :param regressors: iterable of predictor column names
    :return: DataFrame with columns (idcol, 'features', 'label')
    """
    rhs = '+'.join(regressors)
    transformer = RFormula(formula=colname + ' ~ ' + rhs,
                           labelCol='label',
                           featuresCol='features')
    fitted = transformer.fit(df)
    # Keep only the id, the dense feature vector, and the label.
    return fitted.transform(df).select(idcol, 'features', 'label')
def main():
    """Titanic survival classification: RFormula feature engineering,
    logistic regression, and confusion-matrix metrics printed to stdout."""
    spark = SparkSession.builder.appName("titanic").getOrCreate()
    # Gather the data.
    df = spark.read.format("csv").option("inferschema", "true").option(
        "header", "true").load("titanic.csv")
    df.printSchema()
    # Drop rows containing any null; feature engineering errors otherwise.
    df = df.na.drop("any")
    # Feature engineering — interaction terms Sex:Age and Pclass:Cabin.
    supervised = RFormula(
        formula="Survived ~ Sex:Age + Pclass : Cabin + SibSp+Embarked ")
    fittedRF = supervised.fit(df)
    preparedDF = fittedRF.transform(df)
    preparedDF.show()
    # Split into train and validation sets.
    train, test = preparedDF.randomSplit([0.7, 0.3])
    # Configure and train the classifier.
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    fittedLR = lr.fit(train)
    # Score the held-out data.
    result = fittedLR.transform(test)
    print("Coefficients:" + str(fittedLR.coefficients))
    result.show(100)
    # Confusion-matrix counts.
    truePositive = float(
        result.filter("prediction =1.0 and label =1.0").count())
    falsePositive = float(
        result.filter("prediction =1.0 and label = 0.0").count())
    falseNegative = float(
        result.filter("prediction =0.0 and label = 1.0").count())
    trueNegative = float(
        result.filter("prediction=0.0 and label =0.0 ").count())
    print("True Positive :" + str(truePositive))
    print("True Negative :" + str(trueNegative))
    print("False Positive :" + str(falsePositive))
    print("False Negative :" + str(falseNegative))
    sensitivityOrRecall = truePositive / (truePositive + falseNegative)
    # BUG FIX: specificity is TN / (TN + FP); the original duplicated the
    # precision formula TP / (TP + FP).
    specificity = trueNegative / (trueNegative + falsePositive)
    precision = truePositive / (truePositive + falsePositive)
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative +
                                                falsePositive + falseNegative)
    print("sensitivityOrRecall :" + str(sensitivityOrRecall))
    print("specificity :" + str(specificity))
    print("precision :" + str(precision))
    print("accuracy :" + str(accuracy))
    spark.stop()
def test_rformula_string_indexer_order_type(self):
    """stringIndexerOrderType="alphabetDesc" controls how the string column s
    is indexed before one-hot encoding: "b" gets index 0, "a" index 1."""
    df = self.spark.createDataFrame(
        [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]
    )
    rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
    self.assertEqual(rf.getStringIndexerOrderType(), "alphabetDesc")
    transformedDF = rf.fit(df).transform(df)
    observed = transformedDF.select("features").collect()
    # Second component is the encoded s: "b" -> 1.0, "a" -> 0.0.
    expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
    for i in range(0, len(expected)):
        self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
def test_rformula_string_indexer_order_type(self):
    """Check that RFormula honors stringIndexerOrderType="alphabetDesc" when
    encoding the string feature s (so "b" indexes before "a")."""
    df = self.spark.createDataFrame([
        (1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"])
    rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
    self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc')
    transformedDF = rf.fit(df).transform(df)
    observed = transformedDF.select("features").collect()
    # Feature vectors are [x, encoded(s)]; "b" encodes to 1.0 under alphabetDesc.
    expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
    for i in range(0, len(expected)):
        self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):
    """Impute missing ages and vectorize the Titanic frame via RFormula.

    :param df: input DataFrame of passengers
    :param avg_age: value substituted for missing 'Age' entries
    :param feat_name: output column name for the feature vector
    :param lab_name: output column name for the label
    :return: transformed DataFrame with feature/label columns appended
    """
    filled = df.fillna(avg_age, subset=['Age'])
    # RFormula automatically string-indexes/encodes categorical columns,
    # so no manual replace/cast of Sex is needed.
    rf = RFormula(
        formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)
    prepared = rf.fit(filled).transform(filled)
    prepared.show(truncate=False)
    return prepared
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):
    """Fill missing ages and vectorize the Titanic data with RFormula.

    :param df: input DataFrame
    :param avg_age: value used to impute missing 'Age' entries
    :param feat_name: name for the output feature-vector column
    :param lab_name: name for the output label column
    :return: DataFrame with features/label columns added
    """
    df = df.fillna(avg_age, subset=['Age'])
    """ ## unnecessary when using Rformula
    df = df.replace(['male','female'],['-1','1'],'Sex')
    df = df.withColumn('Sex',df.Sex.cast('int'))
    df = df.replace(['S','Q','C'],['-1','0','1'],'Embarked')
    df = df.withColumn('Embarked',df.Embarked.cast('int'))
    df.printSchema()
    """
    # RFormula automatically encodes categorical columns (Sex here — note
    # Embarked is NOT part of this formula) into numerical features.
    formula = RFormula(
        formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)
    df = formula.fit(df).transform(df)
    df.show(truncate=False)
    return df
adDF = spark.read.csv("dataset/Advertising.csv", inferSchema=True, header=True) #데이터 위에서 5개 출력 해보자 adDF.show(5) #데이터 총 갯수는? adDF.count() adDF.printSchema() from pyspark.ml.feature import RFormula from pyspark.ml.regression import LinearRegression from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.linalg import Vectors #transformer 라이브러리를 이용해서 벡터화 하는 방법 dataModel = RFormula().setFormula("Sales ~.").setFeaturesCol("features").setLabelCol("label") model_fit = dataModel.fit(adDF).transform(adDF) model_fit.show() model_fit.printSchema() model_fit_select = model_fit.select(["features","label"]) model_fit_select.show() model_fit_select.printSchema() #Vectors 함수를 이용해서 벡터화 하기 adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label']) adV.show() adV.printSchema()
# Create a Window partion by Id order by Date w = Window.partitionBy('Target').orderBy('Date') # Tracking the TargetValue of the previous day data = data.withColumn('PreviousDay', func.lag(data.TargetValue).over(w)) # Handle null values data = data.na.fill('na') # Vectorize the feature with the RFormula assemblerFormula = RFormula( formula= 'TargetValue ~ Date + Country_Region + Population + Target + Weight + PreviousDay ' ) assemblerFormula.setHandleInvalid('keep') trainingTF = assemblerFormula.fit(data) dataR = trainingTF.transform(data).select('Id', 'Date', 'Country_Region', 'Target', 'Weight', 'features', 'label') # Split the training and test dataset train = dataR.where(data.Date < '2020-04-27') test = dataR.where(data.Date >= '2020-04-27') # Init the Decision Tree Regressor #dt_model = DecisionTreeRegressor(featuresCol="features", weightCol='Weight', maxDepth=18) dt_model = GBTRegressor(featuresCol="features", maxIter=10) # Train the chosen model trained_model = dt_model.fit(train)
# COMMAND ---------- df = spark.read.json("/data/simple-ml") df.orderBy("value2").show() # COMMAND ---------- from pyspark.ml.feature import RFormula supervised = RFormula(formula="lab ~ . + color:value1 + color:value2") # COMMAND ---------- fittedRF = supervised.fit(df) preparedDF = fittedRF.transform(df) preparedDF.show() # COMMAND ---------- train, test = preparedDF.randomSplit([0.7, 0.3]) # COMMAND ---------- from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol="label",featuresCol="features")
# categorical values) # . all columns except target # RFormula produces a vector column of features and a double or string column # of label. Like when formulas are used in R for linear regression, string # input columns will be one-hot encoded, and numeric columns will be cast to # doubles. If the label column is of type string, it will be first transformed # to double with StringIndexer. If the label column does not exist in the # DataFrame, the output label column will be created from the specified # response variable in the formula. spark = SparkSession.builder.appName("RFormula").getOrCreate() dataset = spark.createDataFrame( [(7, "US", 18, 1.0), (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)], ["id", "country", "hour", "clicked"]) formula = RFormula( formula="clicked ~ country + hour", featuresCol="features", labelCol="label") model = formula.fit(dataset) output = model.transform(dataset) output.select("features", "label").show() spark.stop()
# 构建 SparkSession spark = SparkSession \ .builder \ .appName(" GBDT TEST ") \ .enableHiveSupport() \ .getOrCreate() sc = spark.sparkContext # 从 HDFS 上读取数据 path = '/home/mnist-test/data/train' df = spark.read.csv(path, header=True, inferSchema=True) df = df.dropna() # 删除空值 # 将数据转换为 features labels rf = RFormula(formula="label ~ .", featuresCol="features", labelCol="labels") rf_model = rf.fit(df) df = rf_model.transform(df).select(["features", "labels"]) # 数据集切分 train_df, test_df = df.randomSplit([0.8, 0.2]) # 构造 GBDT 模型 gbdt = GBTClassifier(maxIter=10, maxDepth=3, labelCol="labels", featuresCol="features") # 构造 One Vs Rest Classifier. ovr = OneVsRest(classifier=gbdt) ovr_model = ovr.fit(train_df) predict_res = ovr_model.transform(test_df)
from __future__ import print_function # $example on$ from pyspark.ml.feature import RFormula # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("RFormulaExample")\ .getOrCreate() # $example on$ dataset = spark.createDataFrame( [(7, "US", 18, 1.0), (8, "CA", 12, 0.0), (9, "NZ", 15, 0.0)], ["id", "country", "hour", "clicked"]) formula = RFormula( formula="clicked ~ country + hour", featuresCol="features", labelCol="label") output = formula.fit(dataset).transform(dataset) output.select("features", "label").show() # $example off$ spark.stop()
def Logistic_regression(dataset_add, features, label):
    """Fit logistic-regression models on a ';'-separated CSV and print/plot
    training summaries (ROC, PR, per-label rates) plus test predictions.

    :param dataset_add: path of the CSV file
    :param features: list of feature column names
    :param label: response column name
    Relies on module-level `spark`, `RFormula`, `LogisticRegression`.
    """
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()
    dataset.groupBy("y").count().show()
    # using the rformula for indexing, encoding and vectorising
    f = ""
    f = label + " ~ "
    for x in features:
        f = f + x + "+"
    f = f[:-1]  # strip the trailing "+"
    f = (f)
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    output_2 = output.select("features", "label")
    output_2.show()
    # splitting the dataset into train and test
    train_data, test_data = output_2.randomSplit([0.75, 0.25], seed = 40)
    # implementing the logistic regression
    lr1 =LogisticRegression()
    # Metric accumulators, one entry appended per loop iteration.
    Accuracy_list = []
    # Accuracy_list.append(accuracy)
    FPR_list = []
    # FPR_list.append(falsePositiveRate)
    TPR_list = []
    precision_list = []
    recall_list = []
    y= 0.1
    # x=[]
    for i in range(0,3):
        y=round(y+0.1,2)
        # NOTE(review): `y` is updated every pass but never fed into the
        # model, so all three iterations train identical models — presumably
        # an unfinished hyperparameter sweep. Confirm with the author.
        lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)
        # fit the model
        lrModel = lr.fit(train_data)
        lrModel
        # print the coefficients and the intercept for the logistic regression
        print ("coefficients:" + str(lrModel.coefficientMatrix))
        # mat = (lrModel.coefficientMatrix)
        # print mat
        print("intercept: " + str(lrModel.interceptVector))
        # getting the summary of the model
        # f-measure calculation
        from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
        training_summary = lrModel.summary
        BinaryLogisticRegressionTrainingSummary.accuracy
        print (" area under roc : " , training_summary.areaUnderROC)
        print (" roc : " , training_summary.roc)
        roc = training_summary.roc
        roc.show()
        print (" pr value : " , training_summary.pr)
        pr = training_summary.pr
        pr.show()
        print (" precision by threshold : " , training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print (" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print (accuracy_d)
        # Pick the threshold that maximizes the F-measure.
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)
        # obtain the objective per iteration
        objectiveHistory = training_summary.objectiveHistory
        print ("objectiveHistory")
        for objective in objectiveHistory:
            print (objective)
        # for a multiclass we can inspect a matrix on a per label basis
        print ("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print ("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print ("label %d : %s" % (i, rate))
        # # print("True Negative rate")
        # for i, rate in enumerate(training_summary)
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        # Weighted (overall) metrics for this iteration.
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
              % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
        # Accuracy_list = []
        Accuracy_list.append(accuracy)
        # FPR_list = []
        FPR_list.append(falsePositiveRate)
        # TPR_list=[]
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
    print (Accuracy_list)
    print (FPR_list)
    print (TPR_list)
    print (precision_list)
    print (recall_list)

    import matplotlib.pyplot as plt
    # # plt.plot(recall_list, FPR_list)
    # plt.show()
    # # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
    # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395 , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
    # data visualization
    # ROC graph — NOTE(review): `roc`, `pr` and `lrModel` below come from the
    # LAST loop iteration only.
    fpr = roc.select("FPR").toPandas()
    tpr = roc.select("TPR").toPandas()
    plt.plot(fpr, tpr)
    plt.show()
    # PR graph
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision,pr_recall)
    plt.show()
    # now applying the fit on the test data
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy("label", "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
data.show() ## 可產生另一個檔案.transform(data)不一定要在(data)檔案裡 #labelIndexer ===> data # RFormula from pyspark.ml.feature import RFormula ## RFormula: string input colums will be one-hot encoded, and numeric columns will be cast to doubles. ##特徵值要被修正formula" " formula = RFormula( formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type", #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636 #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7 featuresCol="features", labelCol="label") formula_data = formula.fit(data).transform(data) formula_data.select("features","label").show() # Split the data into training and test sets (30% held out for testing) #已經有了! # Split training and test data. (training, test) = formula_data.randomSplit([0.7, 0.3], seed = 12345) #what's seed training.show() from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import Row from pyspark.ml import Pipeline
# Tokenize the product descriptions.
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StandardScaler

# Scale the "features" column.
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import RFormula

# Predict lab from all columns plus the color/value interaction terms.
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

# Aggregate sales per customer; __THIS__ is replaced by the input DataFrame.
basicTransformation = SQLTransformer()\
  .setStatement(""" SELECT sum(Quantity), count(*), CustomerID FROM __THIS__ GROUP BY CustomerID """)
basicTransformation.transform(sales).show()

# COMMAND ----------
# _import zoo data to a spark dataframe zoo_df = spark.read.option("inferschema", "true").option("header", "true").csv("zoo.csv") zoo_df.show(5) zoo_df.printSchema() # _add new column Is_Mammal zoo_df = zoo_df.withColumn("Is_Mammal", expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END")) # _preprocess data pre_process_data = RFormula( formula= "Is_Mammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + Tail + Domestic + Catsize" ) pre_process_data = pre_process_data.fit(zoo_df) pre_process_data = pre_process_data.transform(zoo_df) pre_process_data.show(5) # _split dataset into test and train datasets train, test = pre_process_data.randomSplit([0.7, 0.3]) # _initialize logistic regression classifier lr = LogisticRegression(labelCol="label", featuresCol="features") # _train logistic regression model with train data available fittedLr = lr.fit(train) # _classify test data result = fittedLr.transform(test)
.option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
# Example datasets used by the transformers below.
fakeIntDF = spark.read.parquet("/data/simple-ml-integers")
simpleDF = spark.read.json("/data/simple-ml")
scaleDF = spark.read.parquet("/data/simple-ml-scaling")

# COMMAND ----------

from pyspark.ml.feature import RFormula

# Predict lab from all columns plus the color/value interaction terms.
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

# Aggregate sales per customer; __THIS__ is replaced by the input DataFrame.
basicTransformation = SQLTransformer()\
  .setStatement(""" SELECT sum(Quantity), count(*), CustomerID FROM __THIS__ GROUP BY CustomerID """)
basicTransformation.transform(sales).show()
data.show() ## 可產生另一個檔案.transform(data)不一定要在(data)檔案裡 #labelIndexer ===> data # RFormula from pyspark.ml.feature import RFormula ## RFormula: string input colums will be one-hot encoded, and numeric columns will be cast to doubles. ##特徵值要被修正formula" " formula = RFormula( formula= "label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type", #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636 #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7 featuresCol="features", labelCol="label") formula_data = formula.fit(data).transform(data) formula_data.select("features", "label").show() # Split the data into training and test sets (30% held out for testing) #已經有了! # Split training and test data. (training, test) = formula_data.randomSplit([0.7, 0.3], seed=12345) #what's seed training.show() from pyspark.ml.classification import LogisticRegression from pyspark.ml.param import Param, Params from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import Row from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier
# String-index Product_ID into a numeric product_ID1 column.
plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID1')
labeller = plan_indexer.fit(train)
#%%
Train1 = labeller.transform(train)
Test1 = labeller.transform(test)
Train1.show()
#%%
from pyspark.ml.feature import RFormula
# Build features/label for purchase prediction; string columns are one-hot
# encoded by RFormula.
formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")
t1 = formula.fit(Train1)
#%%
train1 = t1.transform(Train1)
test1 = t1.transform(Test1)
train1.show()
train1.select('features').show()
train1.select('label').show()
#%%
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor()
print(categorical)

# Add an interaction term between C14 and C15 on top of all main effects.
cat_inter = ['C14', 'C15']
concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = "label ~ " + concat + '+' + interaction
print(formula)

from pyspark.ml.feature import RFormula

# handleInvalid="keep" retains rows whose categories were unseen during fit.
interactor = RFormula(
    formula=formula,
    featuresCol="features",
    labelCol="label").setHandleInvalid("keep")
interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

# Unregularized logistic regression (regParam and elasticNetParam both 0).
classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df_train)
# 特征矩阵 features = pandas.DataFrame(iris.data, columns=iris.feature_names) # 目标矩阵 targets = pandas.DataFrame(iris.target, columns=['Species']) # 合并矩阵 merged = pandas.concat([features, targets], axis=1) # 创建SparkSession sess = SparkSession(sc) # 创建spark DataFrame raw_df = sess.createDataFrame(merged) # 提取特征与目标 fomula = RFormula(formula='Species ~ .') raw_df = fomula.fit(raw_df).transform(raw_df) # 拆分训练集和测试集 train_df, test_df = raw_df.randomSplit([0.8, 0.2]) # 创建LR分类器 lr = LogisticRegression() # 训练 train_df.show() model = lr.fit(train_df) # 预测test集合 predict_df = model.transform(test_df)
def main():
    """Train or apply a distributed GBT classifier (Python 2 script driven by
    a JSON config file whose path is given as argv[1]; `options` selects the
    'train' or 'predict' branch). Relies on helper modules `too`, `mlp`, `dmp`."""
    # Silence deprecated-API warnings from sklearn.
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict,'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag
    dir_of_storePara = train_result_dir + '/%s_Parameters.json'%(str(task_id)+'_'+str(job_id)+'_'+model_name)
    dir_of_storeModel = train_result_dir + '/%s_model'%(str(task_id)+'_'+str(job_id)+'_'+model_name)
    # Configure the Spark client.
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc=sess.sparkContext
    sc.setLogLevel("ERROR")
    if options == 'train':
        time_start = time()
        # Load the data.
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing only:
        #dataset = dataset[0:1000]
        # Optionally cap the majority class:
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # Print the count of each label.
        print 'Counter:original y',Counter(Y_datavec)
        print'----------------------------------------------'
        # Split out string vs numeric fields, then merge them back together.
        X_datavec,X_columns,vocabset,datavec_show_list= too.Merge_form(dataset,names_str,names_num,names_show,'vocabset','open')
        # Normalize the data.
        X_datavec = too.Data_process(X_datavec,normalized_type)
        # Class-imbalance handling (currently disabled):
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X,Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # Optional PCA dimensionality reduction.
        if open_pca == 'open_pca':
            pca_num,ret = mlp.GS_PCA(X)
            print 'PCA Information:',pca_num,ret
            print'----------------------------------------------'
            ret_num = ret['99%']
            X = mlp.Model_PCA(X,ret_num)
        # Persist vocabset and ret_num for later prediction runs.
        too.StorePara(dir_of_storePara,vocabset,ret_num)
        print'--------------Train data shape----------------'
        print 'X.shape:',X.shape
        print'----------------------------------------------'
        print 'Y.shape:',Y.shape
        print'----------------------------------------------'
        print'--------------Start %s model------------------'%model_name
        features = pd.DataFrame(X,)
        targets = pd.DataFrame(Y, columns = ['Y'])
        # Merge the matrices.
        merged = pd.concat([features, targets], axis = 1)
        # Create the Spark DataFrame.
        raw_df = sess.createDataFrame(merged)
        # Extract features and label via RFormula ('Y ~ .' = all other columns).
        fomula = RFormula(formula = 'Y ~ .', featuresCol="features",labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Train/test split.
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],seed=666)
        # Fit the model.
        clf_model = dmp.Distr_GBTClassifier(xy_train,xy_test)
        # Save the trained model.
        clf_model.write().overwrite().save(dir_of_storeModel)
        print'----------------------------------------------'
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print 'Total run time: %s'%duration
    if options == 'predict':
        time_start = time()
        # Restore the parameters saved during training.
        with open(dir_of_storePara,'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data.
        dataset = pd.read_csv(dir_of_inputdata)
        # Split out string vs numeric fields, then merge them back together.
        X_datavec,datavec_show_list = too.Merge_form(dataset,names_str,names_num,names_show,vocabset,'close')
        # Normalize the data.
        X = too.Data_process(X_datavec,normalized_type)
        # Apply the stored PCA reduction.
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X,ret_num)
        print'-------------Pdedict data shape---------------'
        print 'X.shape:',X.shape
        print'----------------------------------------------'
        print'--------------Start %s model------------------'%model_name
        features = pd.DataFrame(X,)
        # Create the Spark DataFrame and assemble the features vector.
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print 'Total run time: %s'%duration
import numpy
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import BinaryLogisticRegressionSummary, LogisticRegression
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)

# <br>
# <font size=4,font style=arial>
# Run a logistic regression with dependent variable Y and independent
# variables U1,U2,U3,N1,N2,N3,N4,C1,C2. A Spark model needs a features
# vector (sparse matrix built from the predictors) and a label column
# (the response); RFormula constructs both.
# </font>

# In[59]:

formula = RFormula(formula="Y ~ U1+U2+U3+N1+N2+N3+N4+C1+C2")
output = formula.fit(df).transform(df)

# <font size=4,font style=arial>
# The features column (a compact sparse representation) and the label column
# needed by the model are now present; sample output below.
# </font>

# In[60]:

output.show(5, truncate=False)

# <font size=4,font style=arial>
# Only the features and label columns will be kept for the model.
# </font>

# In[61]:
spark = SparkSession(sc) # _import zoo data to a spark dataframe mushroom_df = spark.read.option("inferschema", "true").option("header", "true").csv("mushrooms.csv") mushroom_df.show(5) mushroom_df.printSchema() mushroom_df = mushroom_df.na.drop() # _No need to create extra column as Lab column is already binary classifiable with either EDIBLE or POISONOUS values mushroom_df = mushroom_df.drop("VeilType") # _preprocess data pre_process_data = RFormula(formula="Lab ~ .") pre_process_data = pre_process_data.fit(mushroom_df) pre_process_data = pre_process_data.transform(mushroom_df) pre_process_data.show(5) # _split dataset into test and train datasets train, test = pre_process_data.randomSplit([0.7, 0.3]) # _initialize logistic regression classifier lr = LogisticRegression(labelCol="label", featuresCol="features") # _train logistic regression model with train data available fittedLr = lr.fit(train) # _classify test data result = fittedLr.transform(test)
def Logistic_regression(dataset_add, features, label):
    """Fit a multinomial logistic regression on a ';'-separated CSV.

    Args:
        dataset_add: path to the CSV file (header row, ';' separator).
        features: iterable of predictor column names.
        label: name of the response column.

    Side effects: prints the transformed data, per-label training metrics
    and the test-set area under ROC.
    """
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()

    # RFormula handles indexing, encoding and vector assembly in one step;
    # build the "label ~ f1+f2+..." formula string from the column names.
    f = label + " ~ " + "+".join(features)
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    output_2 = output.select("features", "label")
    output_2.show()

    # Multinomial logistic regression with elastic-net regularisation.
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.6,
                            family="multinomial")

    # 75/25 split with a fixed seed for reproducibility.
    train_data, test_data = output_2.randomSplit([0.75, 0.25], seed=40)
    lrModel = lr.fit(train_data)

    prediction = lrModel.transform(test_data)
    prediction.groupBy("label", "prediction").count().show()
    prediction.show()

    # Training summary: objective value per iteration.
    training_summary = lrModel.summary
    objectiveHistory = training_summary.objectiveHistory
    print("objectiveHistory")
    for objective in objectiveHistory:
        # BUG FIX: was Python 2 `print objective` — a SyntaxError on Python 3.
        print(objective)

    # Per-label diagnostics (multiclass summary).
    print("false positive rate by label:")
    for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("True positive rate")
    for i, rate in enumerate(training_summary.truePositiveRateByLabel):
        print("label %d : %s" % (i, rate))
    print("Precision by label:")
    for i, prec in enumerate(training_summary.precisionByLabel):
        print("label %d: %s" % (i, prec))
    print("Recall by label:")
    for i, rec in enumerate(training_summary.recallByLabel):
        print("label %d: %s" % (i, rec))
    print("F-measure by label:")
    for i, f_val in enumerate(training_summary.fMeasureByLabel()):
        print("label %d: %s" % (i, f_val))

    # Weighted (overall) metrics.
    accuracy = training_summary.accuracy
    falsePositiveRate = training_summary.weightedFalsePositiveRate
    truePositiveRate = training_summary.weightedTruePositiveRate
    fMeasure = training_summary.weightedFMeasure()
    precision = training_summary.weightedPrecision
    recall = training_summary.weightedRecall
    print(
        "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
        % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

    # Evaluate on the held-out split.  NOTE(review): BinaryClassificationEvaluator
    # assumes a binary label; for a genuinely multinomial target prefer
    # MulticlassClassificationEvaluator — confirm which applies to this data.
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    evaluator = BinaryClassificationEvaluator()
    print('test area under roc : ', evaluator.evaluate(prediction))
# Build a sparse vector from the prepared size/indices/values and show it.
sparseVec = Vectors.sparse(size, idx, values)
print(sparseVec)

# COMMAND ----------

# Load the simple-ml sample dataset and inspect it ordered by value2.
df = spark.read.json("/databricks-datasets/definitive-guide/data/simple-ml")
df.orderBy("value2").show()

# COMMAND ----------

from pyspark.ml.feature import RFormula

# Formula keeps every column and adds the color:value1 and color:value2
# interaction terms.
supervised = RFormula(formula="lab ~ . +color:value1 + color:value2")

# COMMAND ----------

# Fit the formula transformer, then materialise the features/label columns.
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

# COMMAND ----------

# Hold out 30% of the prepared rows for testing.
train, test = preparedDF.randomSplit([0.7, 0.3])

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features")

# COMMAND ----------
def Logistic_regression(dataset_add, feature_colm, label_colm):
    """Train and evaluate a logistic regression on a ';'-separated CSV.

    Args:
        dataset_add: path to the CSV file (header row, ';' separator).
        feature_colm: iterable of predictor column names.
        label_colm: iterable holding the response column name (the last
            entry wins, preserving the original behaviour).

    Side effects: prints diagnostics, writes the ROC and PR curves to HDFS
    parquet files, and displays matplotlib plots.
    """
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()
    dataset.groupBy("y").count().show()

    # The label arrives as a (single-element) list; keep the last entry.
    label = ''
    for y in label_colm:
        label = y

    # Build the R-style formula "label ~ f1+f2+..." from the column lists.
    f = label + " ~ " + "+".join(feature_colm)

    # RFormula performs indexing, encoding and vector assembly in one step.
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    finalized_data = output.select("features", "label")
    finalized_data.show()

    # 75/25 split with a fixed seed for reproducibility.
    train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)

    Accuracy_list = []
    FPR_list = []
    TPR_list = []
    precision_list = []
    recall_list = []

    lr = LogisticRegression(maxIter=5)
    lrModel = lr.fit(train_data)
    print("coefficients:" + str(lrModel.coefficientMatrix))
    print("intercept: " + str(lrModel.interceptVector))

    training_summary = lrModel.summary
    # BUG FIX: removed the stray no-op statement
    # `BinaryLogisticRegressionTrainingSummary.accuracy` — a bare attribute
    # access on an unbound class name with no effect (NameError if the class
    # is not imported).

    print(" area under roc : ", training_summary.areaUnderROC)
    print(" roc : ", training_summary.roc)
    roc = training_summary.roc
    roc.show()
    roc.write.parquet(
        'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
        mode='overwrite')

    print(" pr value : ", training_summary.pr)
    pr = training_summary.pr
    pr.show()
    pr.write.parquet(
        'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
        mode='overwrite')

    print(" precision by threshold : ", training_summary.precisionByThreshold)
    prec_by_threshold = training_summary.precisionByThreshold
    prec_by_threshold.show()

    print(" accuracy : ", training_summary.accuracy)
    accuracy_d = training_summary.accuracy
    print(accuracy_d)

    # Pick the decision threshold that maximises F-measure on training data.
    fMeasure = training_summary.fMeasureByThreshold
    fMeasure.show()
    maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
        'max(F-Measure)').head()
    bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
        .select('threshold').head()['threshold']
    # NOTE(review): this updates the estimator only, not the already-fitted
    # lrModel; refit (or call lrModel.setThreshold) for it to take effect.
    lr.setThreshold(bestThreshold)

    # Objective value per training iteration.
    objectiveHistory = training_summary.objectiveHistory
    print("objectiveHistory")
    for objective in objectiveHistory:
        print(objective)

    # Per-label diagnostics.
    print("false positive rate by label:")
    for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("True positive rate")
    for i, rate in enumerate(training_summary.truePositiveRateByLabel):
        print("label %d : %s" % (i, rate))
    print("Precision by label:")
    for i, prec in enumerate(training_summary.precisionByLabel):
        print("label %d: %s" % (i, prec))
    print("Recall by label:")
    for i, rec in enumerate(training_summary.recallByLabel):
        print("label %d: %s" % (i, rec))
    print("F-measure by label:")
    for i, f_val in enumerate(training_summary.fMeasureByLabel()):
        print("label %d: %s" % (i, f_val))

    # Weighted (overall) metrics.
    accuracy = training_summary.accuracy
    falsePositiveRate = training_summary.weightedFalsePositiveRate
    truePositiveRate = training_summary.weightedTruePositiveRate
    fMeasure = training_summary.weightedFMeasure()
    precision = training_summary.weightedPrecision
    recall = training_summary.weightedRecall
    print(
        "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
        % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

    Accuracy_list.append(accuracy)
    FPR_list.append(falsePositiveRate)
    TPR_list.append(truePositiveRate)
    precision_list.append(precision)
    recall_list.append(recall)
    print(Accuracy_list)
    print(FPR_list)
    print(TPR_list)
    print(precision_list)
    print(recall_list)

    # Plot the ROC and precision/recall curves via pandas + matplotlib.
    fpr = roc.select("FPR").toPandas()
    tpr = roc.select("TPR").toPandas()
    plt.plot(fpr, tpr)
    plt.show()
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision, pr_recall)
    plt.show()

    # Score the held-out split and summarise the predictions.
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy("label", "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import RFormula
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # One SparkSession per example run.
    spark = SparkSession.builder.appName("RFormulaExample").getOrCreate()

    # $example on$
    # Three click records: id, country, hour and the binary click label.
    rows = [(7, "US", 18, 1.0),
            (8, "CA", 12, 0.0),
            (9, "NZ", 15, 0.0)]
    dataset = spark.createDataFrame(rows, ["id", "country", "hour", "clicked"])

    # RFormula one-hot encodes country, keeps hour numeric, and copies
    # clicked into the label column.
    formula = RFormula(formula="clicked ~ country + hour",
                       featuresCol="features",
                       labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    # $example off$

    spark.stop()
# Predict the fraction of installments that will be paid (0-1); anything
# less than 1 implies early repayment of the loan.
# Candidate columns: loan_amnt, int_rate, installment, grade, emp_length,
# home_ownership, annual_inc, issue_d, dti, revol_util, total_pymnt,
# last_pymnt_d, last_pymnt_amnt, mnth_start2last, fracNumPmts, pred_KM

# RFormula assembles the chosen predictors into a single feature vector.
formula = RFormula(
    formula="fracNumPmts ~ installment + annual_inc + dti + int_rate + revol_util + home_ownership + grade + emp_length + pred_KM",
    featuresCol="features",
    labelCol="label")

# Transformed data frame with the vectors assembled.
regFormulaFit = formula.fit(df).transform(df)

# Keep only what the regressor needs.
training = regFormulaFit.select(["label", "features"])

# Plain linear regression (regularisation left disabled for now).
lr = LinearRegression(labelCol="label", featuresCol="features", maxIter=10)
lrModel = lr.fit(training)
trainingSummary = lrModel.summary

# Distribution of the target, for reference:
df.select('fracNumPmts').describe().show()
# +-------+------------------+
# |summary|       fracNumPmts|
# +-------+------------------+
# |  count|             28227|
# |   mean|0.5334839555374444|
# | stddev|0.2962701727734131|