Example #1
def test_rformula_force_index_label(self):
    df = self.spark.createDataFrame([(1.0, 1.0, "a"), (0.0, 2.0, "b"),
                                     (1.0, 0.0, "a")], ["y", "x", "s"])
    # Does not index the label by default since it is of numeric type.
    rf = RFormula(formula="y ~ x + s")
    model = rf.fit(df)
    transformedDF = model.transform(df)
    self.assertEqual(transformedDF.head().label, 1.0)
    # Force indexing of the label.
    rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True)
    model2 = rf2.fit(df)
    transformedDF2 = model2.transform(df)
    self.assertEqual(transformedDF2.head().label, 0.0)
Example #3
def spark_ml():
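    # NOTE: assumes `train` and `test` DataFrames are already loaded in the
    # enclosing scope.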
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    print(diff_cat_in_train_test.distinct().count())  # Product_IDs that appear only in test
    
    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()
    from pyspark.ml.feature import RFormula
    formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()
    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)
    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)  # RMSE and MSE
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID", "Product_ID as Product_ID", 'prediction as Purchase')
    df.toPandas().to_csv('submission.csv')
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)

    dataset.show()

    # using the rformula for indexing, encoding and vectorising

    # use the last entry of label_colm as the label column
    label = label_colm[-1]

    print(label)

    f = ""
    f = label + " ~ "

    for x in feature_colm:
        f = f + x + "+"
    f = f[:-1]
    f = (f)

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    length = len(feature_colm)

    output = formula.fit(dataset).transform(dataset)

    output.select("features", "label").show()

    # chi selector
    from pyspark.ml.feature import ChiSqSelector

    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")

    result = selector.fit(output).transform(output)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square test on the selected features
    from pyspark.ml.stat import ChiSquareTest

    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))

    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}

    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
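# Hypothetical call (file path and column names invented for illustration):
# Chi_sqr("bank.csv", ["age", "balance", "duration"], ["y"])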
Example #5
def feature_vector(df, idcol, colname, regressors):
    formula = RFormula(formula=colname + ' ~ ' + '+'.join(regressors),
                       labelCol='label',
                       featuresCol='features')

    # to dense feature vector
    df_features = formula.fit(df).transform(df).select(idcol, 'features',
                                                       'label')

    return df_features
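
# Hypothetical usage (DataFrame and column names are placeholders, not from
# the original): build an (id, features, label) frame for modeling.
# df_features = feature_vector(df, idcol='id', colname='y', regressors=['x1', 'x2'])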
def main():

    spork = SparkSession.builder.appName("titanic").getOrCreate()

    #Gathering data
    df = spork.read.format("csv").option("inferschema", "true").option(
        "header", "true").load("titanic.csv")
    # df.show()
    df.printSchema()
    df = df.na.drop(
        "any"
    )  # drop rows with any null value; otherwise feature engineering below will fail

    #feature Engineering
    #Change the formula and check the result
    supervised = RFormula(
        formula="Survived ~ Sex:Age + Pclass:Cabin + SibSp + Embarked")
    fittedRF = supervised.fit(df)
    preparedDF = fittedRF.transform(df)
    preparedDF.show()
    #splitting data into train and validation sets
    train, test = preparedDF.randomSplit([0.7, 0.3])
    #classification
    #configure classifier
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    #train classifier
    fittedLR = lr.fit(train)

    #check result
    result = fittedLR.transform(test)
    print("Coefficients:" + str(fittedLR.coefficients))
    result.show(100)
    truePositive = float(
        result.filter("prediction =1.0 and label =1.0").count())
    falsePositive = float(
        result.filter("prediction =1.0 and  label = 0.0").count())
    falseNegative = float(
        result.filter("prediction =0.0 and label = 1.0").count())
    trueNegative = float(
        result.filter("prediction=0.0 and label =0.0 ").count())
    print("True Positive :" + str(truePositive))
    print("True Negative :" + str(trueNegative))
    print("False Positive :" + str(falsePositive))
    print("False Negative :" + str(falseNegative))
    sensitivityOrRecall = truePositive / (truePositive + falseNegative)
    specificity = trueNegative / (trueNegative + falsePositive)
    precision = truePositive / (truePositive + falsePositive)
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative +
                                                falsePositive + falseNegative)
    print("sensitivityOrRecall :" + str(sensitivityOrRecall))
    print("specificity :" + str(specificity))
    print("precision :" + str(precision))
    print("accuracy :" + str(accuracy))

    spork.stop()
Example #7
def test_rformula_string_indexer_order_type(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]
    )
    rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
    self.assertEqual(rf.getStringIndexerOrderType(), "alphabetDesc")
    transformedDF = rf.fit(df).transform(df)
    observed = transformedDF.select("features").collect()
    expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
    for i in range(len(expected)):
        self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
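
    # With stringIndexerOrderType="alphabetDesc" the categories sort as b, a:
    # "b" gets index 0 and is one-hot encoded as [1.0] (the last category is
    # dropped), "a" as [0.0], which yields the expected vectors above.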
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):

    df = df.fillna(avg_age, subset=['Age'])

    """
    ## unnecessary when using Rformula
    df = df.replace(['male','female'],['-1','1'],'Sex')
    df = df.withColumn('Sex',df.Sex.cast('int'))

    df = df.replace(['S','Q','C'],['-1','0','1'],'Embarked')
    df = df.withColumn('Embarked',df.Embarked.cast('int'))
    df.printSchema()
    """

    # RFormula automatically encodes the categorical columns (Sex and Embarked) as numeric features
    formula = RFormula(formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)

    df = formula.fit(df).transform(df)
    df.show(truncate=False)

    return df
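
# Hypothetical usage (assumes a Titanic-style DataFrame and a precomputed mean age):
# prepared = data_preparation(titanic_df, avg_age=29.7)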
Example #11
adDF = spark.read.csv("dataset/Advertising.csv", inferSchema=True, header=True)
#print the first 5 rows of the data
adDF.show(5)
#how many rows in total?
adDF.count()

adDF.printSchema()

from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors

#vectorizing with a transformer (RFormula)
dataModel = RFormula().setFormula("Sales ~.").setFeaturesCol("features").setLabelCol("label")
model_fit = dataModel.fit(adDF).transform(adDF)

model_fit.show()
model_fit.printSchema()

model_fit_select = model_fit.select(["features","label"])

model_fit_select.show()
model_fit_select.printSchema()

#vectorizing with the Vectors class
adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])

adV.show()
adV.printSchema()
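
# An equivalent sketch (assuming the standard Advertising columns TV, Radio,
# Newspaper): VectorAssembler builds the same features column without the RDD API.
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["TV", "Radio", "Newspaper"], outputCol="features")
adV2 = assembler.transform(adDF).select("features", "Sales")
adV2.show()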
# imports needed by this fragment (not shown in the original snippet)
from pyspark.sql import functions as func
from pyspark.sql.window import Window
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import GBTRegressor

# Create a Window partitioned by Target, ordered by Date
w = Window.partitionBy('Target').orderBy('Date')

# Tracking the TargetValue of the previous day
data = data.withColumn('PreviousDay', func.lag(data.TargetValue).over(w))

# Handle null values
data = data.na.fill('na')

# Vectorize the feature with the RFormula
assemblerFormula = RFormula(
    formula=
    'TargetValue ~ Date + Country_Region + Population + Target + Weight + PreviousDay '
)
assemblerFormula.setHandleInvalid('keep')
trainingTF = assemblerFormula.fit(data)

dataR = trainingTF.transform(data).select('Id', 'Date', 'Country_Region',
                                          'Target', 'Weight', 'features',
                                          'label')

# Split the training and test dataset
train = dataR.where(dataR.Date < '2020-04-27')
test = dataR.where(dataR.Date >= '2020-04-27')

# Init the Decision Tree Regressor
#dt_model = DecisionTreeRegressor(featuresCol="features", weightCol='Weight', maxDepth=18)
dt_model = GBTRegressor(featuresCol="features", maxIter=10)

# Train the chosen model
trained_model = dt_model.fit(train)
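
# Sketch (not in the original): score the held-out dates with the trained model.
predictions = trained_model.transform(test)
predictions.select('Id', 'Date', 'prediction').show(5)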
# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")

Example #14
# RFormula supports a limited subset of R's formula operators:
#    ~ separate target and terms
#    + concat terms ("+ 0" removes the intercept)
#    - remove a term ("- 1" removes the intercept)
#    : interaction (multiplication for numeric values, or binarized
#      categorical values)
#    . all columns except target

# RFormula produces a vector column of features and a double or string column
# of label. Like when formulas are used in R for linear regression, string
# input columns will be one-hot encoded, and numeric columns will be cast to
# doubles. If the label column is of type string, it will be first transformed
# to double with StringIndexer. If the label column does not exist in the
# DataFrame, the output label column will be created from the specified
# response variable in the formula.

spark = SparkSession.builder.appName("RFormula").getOrCreate()

dataset = spark.createDataFrame(
    [(7, "US", 18, 1.0),
     (8, "CA", 12, 0.0),
     (9, "NZ", 15, 0.0)],
    ["id", "country", "hour", "clicked"])

formula = RFormula(
    formula="clicked ~ country + hour",
    featuresCol="features",
    labelCol="label")

model = formula.fit(dataset)

output = model.transform(dataset)
output.select("features", "label").show()
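
# A minimal sketch (not in the original example): with a string label column,
# RFormula first applies a StringIndexer to produce the double label.
str_ds = spark.createDataFrame(
    [("yes", "US", 18), ("no", "CA", 12), ("no", "NZ", 15)],
    ["clicked", "country", "hour"])
RFormula(formula="clicked ~ country + hour").fit(str_ds) \
    .transform(str_ds).select("features", "label").show()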

spark.stop()
# imports needed by this fragment (not shown in the original snippet)
from pyspark.sql import SparkSession
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import GBTClassifier, OneVsRest

# Build the SparkSession
spark = SparkSession \
    .builder \
    .appName(" GBDT TEST ") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext

# Read the data from HDFS
path = '/home/mnist-test/data/train'
df = spark.read.csv(path, header=True, inferSchema=True)
df = df.dropna()  # drop rows with null values

# Convert the data into features and labels
rf = RFormula(formula="label ~ .", featuresCol="features", labelCol="labels")
rf_model = rf.fit(df)
df = rf_model.transform(df).select(["features", "labels"])

# Split the dataset
train_df, test_df = df.randomSplit([0.8, 0.2])

# Build the GBDT model
gbdt = GBTClassifier(maxIter=10,
                     maxDepth=3,
                     labelCol="labels",
                     featuresCol="features")

# Build the One-vs-Rest classifier.
ovr = OneVsRest(classifier=gbdt)
ovr_model = ovr.fit(train_df)
predict_res = ovr_model.transform(test_df)
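
# Sketch (assumption, not in the original): evaluate One-vs-Rest accuracy.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="labels", predictionCol="prediction", metricName="accuracy")
print("accuracy:", evaluator.evaluate(predict_res))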
Example #16
from __future__ import print_function

# $example on$
from pyspark.ml.feature import RFormula
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("RFormulaExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0),
         (8, "CA", 12, 0.0),
         (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])

    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")

    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    # $example off$

    spark.stop()
Example #17
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # using the rformula for indexing, encoding and vectorising

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f,
                           featuresCol="features",
                           labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed = 40)

        # implementing the logistic regression
        lr1 = LogisticRegression()  # NOTE: unused; the loop below builds its own lr

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        for i in range(0, 3):
            y = round(y + 0.1, 2)
            # NOTE: y is incremented but never used below, so each of the
            # three iterations fits an identically configured model
            lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)

            # fit the model
            lrModel = lr.fit(train_data)

            # print the coefficients and the intercept for the logistic regression

            print ("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))





            # getting the summary of the model
            training_summary = lrModel.summary

            print (" area under roc : " , training_summary.areaUnderROC)
            print ("  roc : " , training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print (" pr value : " , training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print (" precision by threshold : " , training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print (" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print (accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print ("objectiveHistory")
            for objective in objectiveHistory:
                print (objective)


            # for a multiclass we can inspect  a matrix on a per label basis

            print ("false positive rate by label:")
            for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
                print ("label %d: %s" % (i, rate))


            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print ("label %d : %s" % (i, rate))

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                  % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print (Accuracy_list)
        print (FPR_list)
        print (TPR_list)
        print (precision_list)
        print (recall_list)

        import matplotlib.pyplot as plt
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()


        plt.plot(fpr, tpr)
        plt.show()


        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision,pr_recall)
        plt.show()


        # now applying the trained model to the test data


        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #19
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StandardScaler

sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()
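
# SQLTransformer substitutes __THIS__ with a temporary view of the input
# DataFrame, so the statement above runs the aggregation over the `sales`
# data passed to transform().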

# COMMAND ----------
Example #20
    # _import zoo data to a spark dataframe
    zoo_df = spark.read.option("inferschema",
                               "true").option("header", "true").csv("zoo.csv")
    zoo_df.show(5)
    zoo_df.printSchema()

    # _add new column Is_Mammal
    zoo_df = zoo_df.withColumn("Is_Mammal",
                               expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END"))

    # _preprocess data
    pre_process_data = RFormula(
        formula=
        "Is_Mammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + Tail + Domestic + Catsize"
    )
    pre_process_data = pre_process_data.fit(zoo_df)
    pre_process_data = pre_process_data.transform(zoo_df)

    pre_process_data.show(5)

    # _split dataset into test and train datasets
    train, test = pre_process_data.randomSplit([0.7, 0.3])

    # _initialize logistic regression classifier
    lr = LogisticRegression(labelCol="label", featuresCol="features")

    # _train logistic regression model with train data available
    fittedLr = lr.fit(train)

    # _classify test data
    result = fittedLr.transform(test)
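
    # Sketch (not in the original): quick accuracy check on the test split.
    correct = result.filter("label = prediction").count()
    print("accuracy:", correct / result.count())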
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
fakeIntDF = spark.read.parquet("/data/simple-ml-integers")
simpleDF = spark.read.json("/data/simple-ml")
scaleDF = spark.read.parquet("/data/simple-ml-scaling")


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()
Example #22
data.show()
## .transform(data) can produce a separate DataFrame; it does not have to overwrite data itself
#labelIndexer  ===> data

# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## the feature columns to use are set in the formula string
formula = RFormula(
    formula=
    "label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
formula_data.select("features", "label").show()

# Split the data into training and test sets (30% held out for testing)
# (already have this!)
# Split training and test data.
(training, test) = formula_data.randomSplit([0.7, 0.3],
                                            seed=12345)  # seed makes the split reproducible
training.show()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_ID1')
labeller = plan_indexer.fit(train)

#%%

Train1 = labeller.transform(train)
Test1 = labeller.transform(test)

Train1.show()

#%%

from pyspark.ml.feature import RFormula
formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")

t1 = formula.fit(Train1)
#%%

train1 = t1.transform(Train1)
test1 = t1.transform(Test1)

train1.show()

train1.select('features').show()
train1.select('label').show()

#%%

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor()
Example #24
print(categorical)

cat_inter = ['C14', 'C15']

concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = "label ~ " + concat + '+' + interaction

print(formula)

from pyspark.ml.feature import RFormula
interactor = RFormula(formula=formula,
                      featuresCol="features",
                      labelCol="label").setHandleInvalid("keep")

interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)
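
# Sketch (assumption): score a held-out DataFrame with the fitted pipeline;
# df_test is hypothetical and must share df_train's schema.
# predictions = model.transform(df_test)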
# feature matrix
features = pandas.DataFrame(iris.data, columns=iris.feature_names)
# target matrix
targets = pandas.DataFrame(iris.target, columns=['Species'])
# merge the two
merged = pandas.concat([features, targets], axis=1)

# create the SparkSession
sess = SparkSession(sc)

# create a Spark DataFrame
raw_df = sess.createDataFrame(merged)

# extract features and target
formula = RFormula(formula='Species ~ .')
raw_df = formula.fit(raw_df).transform(raw_df)

# split into training and test sets
train_df, test_df = raw_df.randomSplit([0.8, 0.2])

# create the LR classifier
lr = LogisticRegression()

# train
train_df.show()
model = lr.fit(train_df)

# predict on the test set
predict_df = model.transform(test_df)
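
# Sketch (not in the original): accuracy of the classifier on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
print('accuracy:', MulticlassClassificationEvaluator(metricName='accuracy').evaluate(predict_df))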

Example #26
def main():
    # silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict,'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json'%(str(task_id)+'_'+str(job_id)+'_'+model_name)
    dir_of_storeModel = train_result_dir + '/%s_model'%(str(task_id)+'_'+str(job_id)+'_'+model_name)

    # configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc=sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # for quick testing:
        #dataset = dataset[0:1000]
        # cap the size of the majority class:
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # print the count of each label
        print('Counter: original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # fetch the string and numeric fields separately, then merge them
        X_datavec,X_columns,vocabset,datavec_show_list = too.Merge_form(dataset,names_str,names_num,names_show,'vocabset','open')
        # normalize the data
        X_datavec = too.Data_process(X_datavec,normalized_type)
        # handle class imbalance:
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X,Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num,ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X,ret_num)
        # store the vocabset list and ret_num
        too.StorePara(dir_of_storePara,vocabset,ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        targets = pd.DataFrame(Y, columns=['Y'])
        # merge the matrices
        merged = pd.concat([features, targets], axis=1)
        # create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # extract features and target
        formula = RFormula(formula='Y ~ .', featuresCol="features", labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size], seed=666)
        # run the model
        clf_model = dmp.Distr_GBTClassifier(xy_train, xy_test)
        # save the model parameters
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # fetch the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str, names_num, names_show, vocabset, 'close')
        # normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        # create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns, outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
Example #27
import numpy
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import BinaryLogisticRegressionSummary, LogisticRegression
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)

# Let's run a logistic regression with Y as the dependent variable and
# U1,U2,U3,N1,N2,N3,N4,C1,C2 as the independent variables. A Spark model needs
# a features vector (a sparse matrix built from the independent variables) and
# a label column (the dependent variable); RFormula builds both.

# In[59]:

formula = RFormula(formula="Y ~ U1+U2+U3+N1+N2+N3+N4+C1+C2")
output = formula.fit(df).transform(df)

# The features column (a sparse representation, to save space and simplify
# processing) and the label column required by the model have been created;
# the output is shown below.

# In[60]:

output.show(5, truncate=False)

# For the model we will keep only the features and label columns.

# In[61]:
Example #28
    spark = SparkSession(sc)

    # _import mushroom data to a spark dataframe
    mushroom_df = spark.read.option("inferschema",
                                    "true").option("header",
                                                   "true").csv("mushrooms.csv")
    mushroom_df.show(5)
    mushroom_df.printSchema()

    mushroom_df = mushroom_df.na.drop()
    # _No need to create extra column as Lab column is already binary classifiable with either EDIBLE or POISONOUS values
    mushroom_df = mushroom_df.drop("VeilType")

    # _preprocess data
    pre_process_data = RFormula(formula="Lab ~ .")
    pre_process_data = pre_process_data.fit(mushroom_df)
    pre_process_data = pre_process_data.transform(mushroom_df)

    pre_process_data.show(5)

    # _split dataset into test and train datasets
    train, test = pre_process_data.randomSplit([0.7, 0.3])

    # _initialize logistic regression classifier
    lr = LogisticRegression(labelCol="label", featuresCol="features")

    # _train logistic regression model with train data available
    fittedLr = lr.fit(train)

    # _classify test data
    result = fittedLr.transform(test)
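
    # Sketch (not part of the original): inspect the confusion counts.
    result.groupBy("label", "prediction").count().show()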
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        # using the rformula for indexing, encoding and vectorising

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # implementing the logistic regression
        lr1 = LogisticRegression()

        lr = LogisticRegression(maxIter=10,
                                regParam=0.3,
                                elasticNetParam=0.6,
                                family="multinomial")

        # splitting the dataset

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed=40)

        # fit the model

        lrModel = lr.fit(train_data)

        # import matplotlib.pyplot as plt
        # import numpy as np
        #
        # beta = np.sort(lrModel.coefficientMatrix)
        #
        # plt.plot(beta)
        # plt.ylabel("beta coefficients")
        # plt.show()

        prediction = lrModel.transform(test_data)
        prediction.groupBy("label", "prediction").count().show()
        prediction.show()

        # print the coefficients and the intercept for the logistic regression
        #
        # print ("coefficients:" + str(lrModel.coefficientMatrix))
        # # mat = (lrModel.coefficientMatrix)
        # # print mat
        # print("intercept: " + str(lrModel.interceptVector))

        # getting the summary of the model

        training_summary = lrModel.summary

        # obtain the objective per iteration

        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)

        # for a multiclass we can inspect  a matrix on a per label basis

        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))

        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))

        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))

        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))

        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall

        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))

        # evaluating the model on test dataset

        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        # from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
        #
        #
        # training_sum = BinaryLogisticRegressionTrainingSummary(lrModel)
        # print training_sum.areaUnderROC()

        evaluator = BinaryClassificationEvaluator()
        print('test area under roc : ', evaluator.evaluate(prediction))
sparseVec = Vectors.sparse(size, idx, values)
print(sparseVec)

# COMMAND ----------

df = spark.read.json("/databricks-datasets/definitive-guide/data/simple-ml")
df.orderBy("value2").show()

# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . +color:value1 + color:value2")

# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")


# COMMAND ----------
Example #31
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # use the last entry of label_colm as the label column
        label = label_colm[-1]

        # build the R formula string, e.g. "y ~ x1+x2+x3"
        f = label + " ~ " + "+".join(feature_colm)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        finalized_data = output.select("features", "label")

        finalized_data.show()

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        Accuracy_list = []

        FPR_list = []
        TPR_list = []
        precision_list = []
        recall_list = []
        lr = LogisticRegression(maxIter=5)
        lrModel = lr.fit(train_data)

        print("coefficients:" + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))
        training_summary = lrModel.summary
        BinaryLogisticRegressionTrainingSummary.accuracy
        print(" area under roc : ", training_summary.areaUnderROC)
        print("  roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        roc.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
            mode='overwrite')
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        pr.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
            mode='overwrite')
        print(" precision by threshold : ",
              training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
            'max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)
        fpr = roc.select("FPR").toPandas()
        tpr = roc.select("TPR").toPandas()
        plt.plot(fpr, tpr)
        plt.show()
        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()
        plt.plot(pr_precision, pr_recall)
        plt.show()
        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()
        prediction_val.groupBy("prediction").count().show()
        prediction_val.groupBy("prediction", "probability").count().show()
#predict the fraction of installments that will be paid (0-1), with anything
#less than 1 implying early repayment of the loan

#which cols?
#cols:['loan_amnt', 'int_rate', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'issue_d', 'dti',
# 'revol_util', 'total_pymnt', 'last_pymnt_d', 'last_pymnt_amnt', 'mnth_start2last', 
#'fracNumPmts', 'pred_KM']


formula = RFormula(
    formula="fracNumPmts ~ installment + annual_inc + dti + int_rate + revol_util + home_ownership + grade + emp_length + pred_KM",
    featuresCol="features",
    labelCol="label")

#transformed data frame with vectors assembled
regFormulaFit = formula.fit(df).transform(df)

#training data frame
training = regFormulaFit.select(["label","features"])
lr = LinearRegression(labelCol="label", featuresCol="features", maxIter=10)  # , regParam=0.3)
lrModel = lr.fit(training)
trainingSummary = lrModel.summary
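
# Sketch (assumption, not in the original): fit diagnostics from the summary.
print("RMSE:", trainingSummary.rootMeanSquaredError)
print("r2:", trainingSummary.r2)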


df.select('fracNumPmts').describe().show()
# +-------+------------------+                                                    
# |summary|       fracNumPmts|
# +-------+------------------+
# |  count|             28227|
# |   mean|0.5334839555374444|
# | stddev|0.2962701727734131|