Example 1
 def test_rformula_force_index_label(self):
     df = self.spark.createDataFrame([(1.0, 1.0, "a"), (0.0, 2.0, "b"),
                                      (1.0, 0.0, "a")], ["y", "x", "s"])
     # Does not index label by default since it's numeric type.
     rf = RFormula(formula="y ~ x + s")
     model = rf.fit(df)
     transformedDF = model.transform(df)
     self.assertEqual(transformedDF.head().label, 1.0)
     # Force label indexing: labels are indexed by frequency, so the most common value (1.0) maps to index 0.0.
     rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True)
     model2 = rf2.fit(df)
     transformedDF2 = model2.transform(df)
     self.assertEqual(transformedDF2.head().label, 0.0)
Example 3
def spark_ml():
    # Product_IDs present in test but not in train (count shown when run interactively)
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    diff_cat_in_train_test.distinct().count()
    
    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()
    from pyspark.ml.feature import RFormula
    formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()
    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)
    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)  # RMSE and MSE
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID", "Product_ID as Product_ID", 'prediction as Purchase')
    df.toPandas().to_csv('submission.csv')
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)

    dataset.show()

    # using RFormula for indexing, encoding and vectorizing

    # label_colm is a list; its last entry becomes the label name
    label = ''
    for y in label_colm:
        label = y

    print(label)

    # build the R formula string, e.g. "y ~ x1+x2+x3"
    f = label + " ~ " + "+".join(feature_colm)

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    length = len(feature_colm)

    output = formula.fit(dataset).transform(dataset)

    output.select("features", "label").show()

    # chi selector
    from pyspark.ml.feature import ChiSqSelector

    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")

    result = selector.fit(output).transform(output)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square test

    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))

    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}

    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
Example 5
def feature_vector(df, idcol, colname, regressors):
    formula = RFormula(formula=colname + ' ~ ' + '+'.join(regressors),
                       labelCol='label',
                       featuresCol='features')

    # to dense feature vector
    df_features = formula.fit(df).transform(df).select(idcol, 'features',
                                                       'label')

    return df_features
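
# hypothetical usage (names are illustrative, not from the original):
# feature_vector(df, idcol='id', colname='price', regressors=['sqft', 'rooms'])
# returns a DataFrame with columns: id, features (vector), label (double)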
def main():

    spork = SparkSession.builder.appName("titanic").getOrCreate()

    #Gathering data
    df = spork.read.format("csv").option("inferschema", "true").option(
        "header", "true").load("titanic.csv")
    # df.show()
    df.printSchema()
    df = df.na.drop(
        "any"
    )  # drop rows containing any null; otherwise feature engineering below fails

    #feature Engineering
    #Change the formula and check the result
    supervised = RFormula(
        formula="Survived ~ Sex:Age + Pclass : Cabin + SibSp+Embarked ")
    fittedRF = supervised.fit(df)
    preparedDF = fittedRF.transform(df)
    preparedDF.show()
    #spliting data in train and validation data
    train, test = preparedDF.randomSplit([0.7, 0.3])
    #classification
    #configure classifier
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    #train classifier
    fittedLR = lr.fit(train)

    #check result
    result = fittedLR.transform(test)
    print("Coefficients:" + str(fittedLR.coefficients))
    result.show(100)
    truePositive = float(
        result.filter("prediction =1.0 and label =1.0").count())
    falsePositive = float(
        result.filter("prediction =1.0 and  label = 0.0").count())
    falseNegative = float(
        result.filter("prediction =0.0 and label = 1.0").count())
    trueNegative = float(
        result.filter("prediction=0.0 and label =0.0 ").count())
    print("True Positive :" + str(truePositive))
    print("True Negative :" + str(trueNegative))
    print("False Positive :" + str(falsePositive))
    print("False Negative :" + str(falseNegative))
    sensitivityOrRecall = truePositive / (truePositive + falseNegative)
    specificity = trueNegative / (trueNegative + falsePositive)
    precision = truePositive / (truePositive + falsePositive)
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative +
                                                falsePositive + falseNegative)
    print("sensitivityOrRecall :" + str(sensitivityOrRecall))
    print("specificity :" + str(specificity))
    print("precision :" + str(precision))
    print("accuracy :" + str(accuracy))

    spork.stop()
Example 7
 def test_rformula_string_indexer_order_type(self):
     df = self.spark.createDataFrame(
         [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]
     )
     rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
     self.assertEqual(rf.getStringIndexerOrderType(), "alphabetDesc")
     transformedDF = rf.fit(df).transform(df)
     observed = transformedDF.select("features").collect()
     expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
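     # with stringIndexerOrderType="alphabetDesc", "b" is indexed 0 and "a" 1;
     # one-hot encoding drops the last category, so s="b" -> 1.0 and s="a" -> 0.0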
     for i in range(0, len(expected)):
         self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):

    df = df.fillna(avg_age, subset=['Age'])

    """
    ## unnecessary when using Rformula
    df = df.replace(['male','female'],['-1','1'],'Sex')
    df = df.withColumn('Sex',df.Sex.cast('int'))

    df = df.replace(['S','Q','C'],['-1','0','1'],'Embarked')
    df = df.withColumn('Embarked',df.Embarked.cast('int'))
    df.printSchema()
    """

    # Rformula automatically formats categorical data (Sex and Embarked) into numerical data
    formula = RFormula(formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)

    df = formula.fit(df).transform(df)
    df.show(truncate=False)

    return df
Example 11
adDF = spark.read.csv("dataset/Advertising.csv", inferSchema=True, header=True)
# print the top 5 rows of the data
adDF.show(5)
# how many rows in total?
adDF.count()

adDF.printSchema()

from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors

# vectorizing with the RFormula transformer
dataModel = RFormula().setFormula("Sales ~.").setFeaturesCol("features").setLabelCol("label")
model_fit = dataModel.fit(adDF).transform(adDF)

model_fit.show()
model_fit.printSchema()

model_fit_select = model_fit.select(["features","label"])

model_fit_select.show()
model_fit_select.printSchema()

# vectorizing manually with the Vectors helper
adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])

adV.show()
adV.printSchema()
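
# A hedged continuation (not in the original): fit the imported LinearRegression
# on the RFormula output and report the training RMSE via RegressionEvaluator
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(model_fit_select)
evaluator = RegressionEvaluator(metricName="rmse")
print(evaluator.evaluate(lr_model.transform(model_fit_select)))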
# Create a Window partitioned by Target, ordered by Date
w = Window.partitionBy('Target').orderBy('Date')

# Tracking the TargetValue of the previous day
data = data.withColumn('PreviousDay', func.lag(data.TargetValue).over(w))

# Handle null values
data = data.na.fill('na')

# Vectorize the feature with the RFormula
assemblerFormula = RFormula(
    formula=
    'TargetValue ~ Date + Country_Region + Population + Target + Weight + PreviousDay '
)
assemblerFormula.setHandleInvalid('keep')  # keep unseen/invalid category levels rather than erroring
trainingTF = assemblerFormula.fit(data)

dataR = trainingTF.transform(data).select('Id', 'Date', 'Country_Region',
                                          'Target', 'Weight', 'features',
                                          'label')

# Split the training and test dataset
train = dataR.where(data.Date < '2020-04-27')
test = dataR.where(data.Date >= '2020-04-27')

# Init the Decision Tree Regressor
#dt_model = DecisionTreeRegressor(featuresCol="features", weightCol='Weight', maxDepth=18)
dt_model = GBTRegressor(featuresCol="features", maxIter=10)

# Train the chosen model
trained_model = dt_model.fit(train)
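
# A hedged continuation (not in the original): score the held-out dates
predictions = trained_model.transform(test)
predictions.select('Id', 'Date', 'prediction').show(5)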
# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")
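
# A hedged continuation (not in the original capture): train the classifier
fittedLR = lr.fit(train)
print(fittedLR.coefficients)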

Example 14
#      categorical values)
#    . all columns except target

# RFormula produces a vector column of features and a double or string column
# of label. Like when formulas are used in R for linear regression, string
# input columns will be one-hot encoded, and numeric columns will be cast to
# doubles. If the label column is of type string, it will be first transformed
# to double with StringIndexer. If the label column does not exist in the
# DataFrame, the output label column will be created from the specified
# response variable in the formula.

spark = SparkSession.builder.appName("RFormula").getOrCreate()

dataset = spark.createDataFrame(
    [(7, "US", 18, 1.0),
     (8, "CA", 12, 0.0),
     (9, "NZ", 15, 0.0)],
    ["id", "country", "hour", "clicked"])

formula = RFormula(
    formula="clicked ~ country + hour",
    featuresCol="features",
    labelCol="label")

model = formula.fit(dataset)

output = model.transform(dataset)
output.select("features", "label").show()
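
# A hedged aside (not in the original example): when the response column is a
# string, as described in the comments above, RFormula first runs it through
# StringIndexer, so the produced label is an indexed double
str_dataset = spark.createDataFrame(
    [(18, "yes"), (12, "no"), (15, "yes")], ["hour", "clicked_str"])
RFormula(formula="clicked_str ~ hour").fit(str_dataset) \
    .transform(str_dataset).select("clicked_str", "label").show()
# "yes" (most frequent) -> 0.0, "no" -> 1.0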

spark.stop()
Example 15
# build the SparkSession
spark = SparkSession \
    .builder \
    .appName(" GBDT TEST ") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext

# read the data from HDFS
path = '/home/mnist-test/data/train'
df = spark.read.csv(path, header=True, inferSchema=True)
df = df.dropna()  # drop null values

# convert the data into features / labels ("label ~ ." uses every other column as a feature)
rf = RFormula(formula="label ~ .", featuresCol="features", labelCol="labels")
rf_model = rf.fit(df)
df = rf_model.transform(df).select(["features", "labels"])

# split the dataset
train_df, test_df = df.randomSplit([0.8, 0.2])

# build the GBDT model
gbdt = GBTClassifier(maxIter=10,
                     maxDepth=3,
                     labelCol="labels",
                     featuresCol="features")

# build the One-vs-Rest classifier (GBTClassifier itself is binary-only)
ovr = OneVsRest(classifier=gbdt)
ovr_model = ovr.fit(train_df)
predict_res = ovr_model.transform(test_df)
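
# A hedged continuation (not in the original): confusion counts for the OvR model
predict_res.groupBy("labels", "prediction").count().show()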
Example 16
from __future__ import print_function

# $example on$
from pyspark.ml.feature import RFormula
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("RFormulaExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0),
         (8, "CA", 12, 0.0),
         (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])

    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")

    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
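    # expected output (as rendered in the Spark docs for this example):
    #   US -> features [0.0, 0.0, 18.0], CA -> [1.0, 0.0, 12.0], NZ -> [0.0, 1.0, 15.0]
    # the label column is the clicked column cast to double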
    # $example off$

    spark.stop()
Example 17
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # using RFormula for indexing, encoding and vectorizing

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f,
                           featuresCol="features",
                           labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed = 40)

        # implementing the logistic regression
        lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        # three runs with identical hyper-parameters (y is incremented but never used)
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)



            # fit the model
            lrModel = lr.fit(train_data)

            # print the coefficients and the intercept for the logistic regression

            print ("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))





            # getting the summary of the model

            # f-measure calculation (the summary exposes accuracy, ROC, PR, etc.)
            training_summary = lrModel.summary

            print (" area under roc : " , training_summary.areaUnderROC)
            print ("  roc : " , training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print (" pr value : " , training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print (" precision by threshold : " , training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print (" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print (accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print ("objectiveHistory")
            for objective in objectiveHistory:
                print (objective)


            # for a multiclass we can inspect  a matrix on a per label basis

            print ("false positive rate by label:")
            for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
                print ("label %d: %s" % (i, rate))


            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print ("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                  % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print (Accuracy_list)
        print (FPR_list)
        print (TPR_list)
        print (precision_list)
        print (recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()


        plt.plot(fpr, tpr)
        plt.show()


        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision,pr_recall)
        plt.show()


        # now applying the fit on the test data


        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
data.show()
## .transform(data) can produce a separate DataFrame; it does not have to overwrite data
#labelIndexer  ===> data


# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## the feature set is specified in the formula string below
formula = RFormula(
    formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
formula_data.select("features","label").show()


# Split the data into training and test sets (30% held out for testing)
(training, test) = formula_data.randomSplit([0.7, 0.3], seed=12345)  # seed makes the split reproducible
training.show()


from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.ml import Pipeline
Example 19
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StandardScaler

sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()

# COMMAND ----------
Example 20
    # _import zoo data to a spark dataframe
    zoo_df = spark.read.option("inferschema",
                               "true").option("header", "true").csv("zoo.csv")
    zoo_df.show(5)
    zoo_df.printSchema()

    # _add new column Is_Mammal
    zoo_df = zoo_df.withColumn("Is_Mammal",
                               expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END"))

    # _preprocess data
    pre_process_data = RFormula(
        formula=
        "Is_Mammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + Tail + Domestic + Catsize"
    )
    pre_process_data = pre_process_data.fit(zoo_df)
    pre_process_data = pre_process_data.transform(zoo_df)

    pre_process_data.show(5)

    # _split dataset into test and train datasets
    train, test = pre_process_data.randomSplit([0.7, 0.3])

    # _initialize logistic regression classifier
    lr = LogisticRegression(labelCol="label", featuresCol="features")

    # _train logistic regression model with train data available
    fittedLr = lr.fit(train)

    # _classify test data
    result = fittedLr.transform(test)
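
    # A hedged continuation (not in the original): inspect the classifier output
    result.select("label", "prediction", "probability").show(5)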
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
fakeIntDF = spark.read.parquet("/data/simple-ml-integers")
simpleDF = spark.read.json("/data/simple-ml")
scaleDF = spark.read.parquet("/data/simple-ml-scaling")


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()
Example 22
data.show()
## .transform(data) can produce a separate DataFrame; it does not have to overwrite data
#labelIndexer  ===> data

# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## the feature set is specified in the formula string below
formula = RFormula(
    formula=
    "label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
formula_data.select("features", "label").show()

# Split the data into training and test sets (30% held out for testing)
(training, test) = formula_data.randomSplit([0.7, 0.3],
                                            seed=12345)  # seed makes the split reproducible
training.show()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
Example 23
plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID1')
labeller = plan_indexer.fit(train)

#%%

Train1 = labeller.transform(train)
Test1 = labeller.transform(test)

Train1.show()

#%%

from pyspark.ml.feature import RFormula
formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")

t1 = formula.fit(Train1)
#%%

train1 = t1.transform(Train1)
test1 = t1.transform(Test1)

train1.show()

train1.select('features').show()
train1.select('label').show()

#%%

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor()
Example 24
print(categorical)

cat_inter = ['C14', 'C15']

concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = "label ~ " + concat + '+' + interaction

print(formula)

from pyspark.ml.feature import RFormula
interactor = RFormula(formula=formula,
                      featuresCol="features",
                      labelCol="label").setHandleInvalid("keep")

interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)
Example 25
# assumes the usual setup (not shown in the original): pandas, sklearn's iris
# dataset (iris = sklearn.datasets.load_iris()), a SparkContext sc, and the
# RFormula / LogisticRegression imports from pyspark.ml

# feature matrix
features = pandas.DataFrame(iris.data, columns=iris.feature_names)
# target matrix
targets = pandas.DataFrame(iris.target, columns=['Species'])
# merge the two
merged = pandas.concat([features, targets], axis=1)

# create the SparkSession
sess = SparkSession(sc)

# create a Spark DataFrame
raw_df = sess.createDataFrame(merged)

# extract features and target
fomula = RFormula(formula='Species ~ .')
raw_df = fomula.fit(raw_df).transform(raw_df)

# split into train and test sets
train_df, test_df = raw_df.randomSplit([0.8, 0.2])

# create the LR classifier
lr = LogisticRegression()

# train
train_df.show()
model = lr.fit(train_df)

# predict on the test set
predict_df = model.transform(test_df)
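
# A hedged continuation (not in the original): accuracy on the test split
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(evaluator.evaluate(predict_df))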

Example 26
def main():
    # silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict,'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json'%(str(task_id)+'_'+str(job_id)+'_'+model_name)
    dir_of_storeModel = train_result_dir + '/%s_model'%(str(task_id)+'_'+str(job_id)+'_'+model_name)

    # configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc=sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # for quick tests:
        #dataset = dataset[0:1000]
        # cap the size of the majority class:
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # print the count of each label
        print('Counter:original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # fetch the string and numeric fields separately, then merge them
        X_datavec,X_columns,vocabset,datavec_show_list= too.Merge_form(dataset,names_str,names_num,names_show,'vocabset','open')
        # normalize the data
        X_datavec = too.Data_process(X_datavec,normalized_type)
        # handle class imbalance (alternatives left commented out)
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X,Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num,ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X,ret_num)
        # persist the vocabset list and ret_num
        too.StorePara(dir_of_storePara,vocabset,ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        targets = pd.DataFrame(Y, columns = ['Y'])
        # merge the matrices
        merged = pd.concat([features, targets], axis = 1)
        # create the Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # extract features and target
        fomula = RFormula(formula = 'Y ~ .', featuresCol="features",labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # split into train and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],seed=666)
        # fit the chosen model
        clf_model = dmp.Distr_GBTClassifier(xy_train,xy_test)
        # save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara,'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # fetch the string and numeric fields separately, then merge them
        X_datavec,datavec_show_list = too.Merge_form(dataset,names_str,names_num,names_show,vocabset,'close')
        # normalize the data
        X = too.Data_process(X_datavec,normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X,ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        # create the Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)
Example 27
import numpy
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import BinaryLogisticRegressionSummary, LogisticRegression
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)

# Let's run a logistic regression with Y as the dependent variable and
# U1,U2,U3,N1,N2,N3,N4,C1,C2 as the independent variables. A Spark model needs
# a features column (a sparse vector built from the independent variables) and
# a label column (the dependent variable); RFormula builds both.

# In[59]:

formula = RFormula(formula="Y ~ U1+U2+U3+N1+N2+N3+N4+C1+C2")
output = formula.fit(df).transform(df)

# The features column (a sparse vector, compact and cheap to process) and the
# label column required by the model have been created; output shown below.

# In[60]:

output.show(5, truncate=False)

# For the model we keep only the features and label columns.

# In[61]:
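
# (the body of this cell was not captured; a selection like the following
# matches the text above)
model_df = output.select("features", "label")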
Example 28
    spark = SparkSession(sc)

    # _import zoo data to a spark dataframe
    mushroom_df = spark.read.option("inferschema",
                                    "true").option("header",
                                                   "true").csv("mushrooms.csv")
    mushroom_df.show(5)
    mushroom_df.printSchema()

    mushroom_df = mushroom_df.na.drop()
    # _No need to create extra column as Lab column is already binary classifiable with either EDIBLE or POISONOUS values
    mushroom_df = mushroom_df.drop("VeilType")

    # _preprocess data: "Lab ~ ." uses every column except Lab as a feature
    pre_process_data = RFormula(formula="Lab ~ .")
    pre_process_data = pre_process_data.fit(mushroom_df)
    pre_process_data = pre_process_data.transform(mushroom_df)

    pre_process_data.show(5)

    # _split dataset into test and train datasets
    train, test = pre_process_data.randomSplit([0.7, 0.3])

    # _initialize logistic regression classifier
    lr = LogisticRegression(labelCol="label", featuresCol="features")

    # _train logistic regression model with train data available
    fittedLr = lr.fit(train)

    # _classify test data
    result = fittedLr.transform(test)
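
    # A hedged continuation (not in the original): area under ROC on the test set
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    print(BinaryClassificationEvaluator().evaluate(result))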
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        # using RFormula for indexing, encoding and vectorizing

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # implementing the logistic regression
        lr1 = LogisticRegression()

        lr = LogisticRegression(maxIter=10,
                                regParam=0.3,
                                elasticNetParam=0.6,
                                family="multinomial")

        # splitting the dataset

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed=40)

        # fit the model

        lrModel = lr.fit(train_data)

        # import matplotlib.pyplot as plt
        # import numpy as np
        #
        # beta = np.sort(lrModel.coefficientMatrix)
        #
        # plt.plot(beta)
        # plt.ylabel("beta coefficients")
        # plt.show()

        prediction = lrModel.transform(test_data)
        prediction.groupBy("label", "prediction").count().show()
        prediction.show()

        # print the coefficients and the intercept for the logistic regression
        #
        # print ("coefficients:" + str(lrModel.coefficientMatrix))
        # # mat = (lrModel.coefficientMatrix)
        # # print mat
        # print("intercept: " + str(lrModel.interceptVector))

        # getting the summary of the model

        training_summary = lrModel.summary

        # obtain the objective per iteration

        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)

        # for a multiclass we can inspect  a matrix on a per label basis

        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))

        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))

        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))

        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))

        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall

        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))

        # evaluating the model on test dataset

        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        # from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
        #
        #
        # training_sum = BinaryLogisticRegressionTrainingSummary(lrModel)
        # print training_sum.areaUnderROC()

        evaluator = BinaryClassificationEvaluator()
        print('test area under roc : ', evaluator.evaluate(prediction))
from pyspark.ml.linalg import Vectors

# size, idx and values come from an earlier cell that was not captured; the
# values below are assumed purely so the line runs standalone
size = 3            # length of the vector
idx = [1, 2]        # positions of the non-zero entries
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)
print(sparseVec)

# COMMAND ----------

df = spark.read.json("/databricks-datasets/definitive-guide/data/simple-ml")
df.orderBy("value2").show()

# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . +color:value1 + color:value2")

# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")


# COMMAND ----------
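
# A hedged continuation (not in the original capture): fit and apply the model
fittedLR = lr.fit(train)
fittedLR.transform(test).select("label", "prediction").show(5)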
Example 31
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # label_colm is a list; its last entry becomes the label name
        label = ''
        for y in label_colm:
            label = y

        # build the R formula string, e.g. "y ~ x1+x2+x3"
        f = label + " ~ " + "+".join(feature_colm)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        finalized_data = output.select("features", "label")

        finalized_data.show()

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        Accuracy_list = []

        FPR_list = []
        TPR_list = []
        precision_list = []
        recall_list = []
        lr = LogisticRegression(maxIter=5)
        lrModel = lr.fit(train_data)

        print("coefficients:" + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))
        training_summary = lrModel.summary
        print(" area under roc : ", training_summary.areaUnderROC)
        print("  roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        roc.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
            mode='overwrite')
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        pr.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
            mode='overwrite')
        print(" precision by threshold : ",
              training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
            'max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)
        fpr = roc.select("FPR").toPandas()
        tpr = roc.select("TPR").toPandas()
        plt.plot(fpr, tpr)
        plt.show()
        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()
        plt.plot(pr_precision, pr_recall)
        plt.show()
        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()
        prediction_val.groupBy("prediction").count().show()
        prediction_val.groupBy("prediction", "probability").count().show()
Example 33
#predict the fraction of installments that will be paid (0-1), with anything
#less than 1 implying early repayment of the loan

#which cols?
#cols:['loan_amnt', 'int_rate', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'issue_d', 'dti',
# 'revol_util', 'total_pymnt', 'last_pymnt_d', 'last_pymnt_amnt', 'mnth_start2last', 
#'fracNumPmts', 'pred_KM']


formula = RFormula(
    formula="fracNumPmts ~ installment + annual_inc + dti + int_rate + revol_util + home_ownership + grade + emp_length + pred_KM",
    featuresCol="features",
    labelCol="label")

#transformed data frame with vectors assembled
regFormulaFit = formula.fit(df).transform(df)

#training data frame
training = regFormulaFit.select(["label","features"])
lr = LinearRegression(labelCol="label", featuresCol="features", maxIter=10)  #, regParam=0.3)
lrModel = lr.fit(training)
trainingSummary = lrModel.summary
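
# A hedged aside (not in the original): typical diagnostics from the summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)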


df.select('fracNumPmts').describe().show()
# +-------+------------------+                                                    
# |summary|       fracNumPmts|
# +-------+------------------+
# |  count|             28227|
# |   mean|0.5334839555374444|
# | stddev|0.2962701727734131|