Example #1
def spark_ml():
    diff_cat_in_train_test=test.select('Product_ID').subtract(train.select('Product_ID'))
    diff_cat_in_train_test.distinct().count()
    
    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()
    from pyspark.ml.feature import RFormula
    formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()
    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)
    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)
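    # Note: RegressionEvaluator can also report RMSE directly, e.g.
    # evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})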
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID", "Product_ID as Product_ID", 'prediction as Purchase')
    df.toPandas().to_csv('submission.csv')
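    # Note: to_csv writes the pandas index as an extra column by default;
    # index=False would keep the submission to just User_ID, Product_ID and Purchase.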
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)

    dataset.show()

    # using the rformula for indexing, encoding and vectorising

    # take the last entry of label_colm as the label column
    label = label_colm[-1]

    print(label)

    # build an R formula string of the form "<label> ~ <feat1>+<feat2>+..."
    f = label + " ~ " + "+".join(feature_colm)

    from pyspark.ml.feature import RFormula
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    length = len(feature_colm)

    output = formula.fit(dataset).transform(dataset)

    output.select("features", "label").show()

    # chi selector
    from pyspark.ml.feature import ChiSqSelector

    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="features",
                             outputCol="selected_features",
                             labelCol="label")

    result = selector.fit(output).transform(output)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # running the chi-square test
    from pyspark.ml.stat import ChiSquareTest

    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
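    # Note: r.pValues is a DenseVector; list(r.pValues) would give a plain Python list
    # that serializes to JSON more cleanly than the str() form used above.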
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))

    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}

    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
Example #3
def feature_vector(df, idcol, colname, regressors):
    formula = RFormula(formula=colname + ' ~ ' + '+'.join(regressors),
                       labelCol='label',
                       featuresCol='features')

    # to dense feature vector
    df_features = formula.fit(df).transform(df).select(idcol, 'features',
                                                       'label')

    return df_features
def main():

    spork = SparkSession.builder.appName("titanic").getOrCreate()

    #Gathering data
    df = spork.read.format("csv").option("inferschema", "true").option(
        "header", "true").load("titanic.csv")
    # df.show()
    df.printSchema()
    df = df.na.drop(
        "any"
    )  # drop rows containing any null value; otherwise feature engineering below can fail

    #feature Engineering
    #Change the formula and check the result
    supervised = RFormula(
        formula="Survived ~ Sex:Age + Pclass : Cabin + SibSp+Embarked ")
    fittedRF = supervised.fit(df)
    preparedDF = fittedRF.transform(df)
    preparedDF.show()
    #splitting the data into train and validation sets
    train, test = preparedDF.randomSplit([0.7, 0.3])
    #classification
    #configure classifier
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    #train classifier
    fittedLR = lr.fit(train)

    #check result
    result = fittedLR.transform(test)
    print("Coefficients:" + str(fittedLR.coefficients))
    result.show(100)
    truePositive = float(
        result.filter("prediction =1.0 and label =1.0").count())
    falsePositive = float(
        result.filter("prediction =1.0 and  label = 0.0").count())
    falseNegative = float(
        result.filter("prediction =0.0 and label = 1.0").count())
    trueNegative = float(
        result.filter("prediction=0.0 and label =0.0 ").count())
    print("True Positive :" + str(truePositive))
    print("True Negative :" + str(trueNegative))
    print("False Positive :" + str(falsePositive))
    print("False Negative :" + str(falseNegative))
    sensitivityOrRecall = truePositive / (truePositive + falseNegative)
    specificity = trueNegative / (trueNegative + falsePositive)
    precision = truePositive / (truePositive + falsePositive)
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative +
                                                falsePositive + falseNegative)
    print("sensitivityOrRecall :" + str(sensitivityOrRecall))
    print("specificity :" + str(specificity))
    print("precision :" + str(precision))
    print("accuracy :" + str(accuracy))

    spork.stop()
Example #5
 def test_rformula_string_indexer_order_type(self):
     df = self.spark.createDataFrame(
         [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (1.0, 0.0, "a")], ["y", "x", "s"]
     )
     rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc")
     self.assertEqual(rf.getStringIndexerOrderType(), "alphabetDesc")
     transformedDF = rf.fit(df).transform(df)
     observed = transformedDF.select("features").collect()
     expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]]
     for i in range(0, len(expected)):
         self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
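     # With stringIndexerOrderType="alphabetDesc" the labels are ordered ["b", "a"], so
     # "b" is indexed 0 and "a" 1; RFormula one-hot encodes the string column dropping
     # the last category, hence "a" -> 0.0 and "b" -> 1.0 in the expected vectors above.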
Example #7
 def test_rformula_force_index_label(self):
     df = self.spark.createDataFrame([
         (1.0, 1.0, "a"),
         (0.0, 2.0, "b"),
         (1.0, 0.0, "a")], ["y", "x", "s"])
     # Does not index label by default since it's numeric type.
     rf = RFormula(formula="y ~ x + s")
     model = rf.fit(df)
     transformedDF = model.transform(df)
     self.assertEqual(transformedDF.head().label, 1.0)
     # Force to index label.
     rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True)
     model2 = rf2.fit(df)
     transformedDF2 = model2.transform(df)
     self.assertEqual(transformedDF2.head().label, 0.0)
Example #8
	def testWorkflow(self):
		df = self.sqlContext.read.csv(os.path.join(os.path.dirname(__file__), "resources/Iris.csv"), header = True, inferSchema = True)
		
		formula = RFormula(formula = "Species ~ .")
		classifier = DecisionTreeClassifier()
		pipeline = Pipeline(stages = [formula, classifier])
		pipelineModel = pipeline.fit(df)
		
		pmmlBuilder = PMMLBuilder(self.sc, df, pipelineModel) \
			.verify(df.sample(False, 0.1))

		pmml = pmmlBuilder.build()
		self.assertIsInstance(pmml, JavaObject)

		pmmlByteArray = pmmlBuilder.buildByteArray()
		self.assertIsInstance(pmmlByteArray, bytes)
		
		pmmlString = pmmlByteArray.decode("UTF-8")
		self.assertTrue("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" xmlns:data=\"http://jpmml.org/jpmml-model/InlineTable\" version=\"4.3\">" in pmmlString)
		self.assertTrue("<VerificationFields>" in pmmlString)

		pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", False)
		nonCompactFile = tempfile.NamedTemporaryFile(prefix = "pyspark2pmml-", suffix = ".pmml")
		nonCompactPmmlPath = pmmlBuilder.buildFile(nonCompactFile.name)

		pmmlBuilder = pmmlBuilder.putOption(classifier, "compact", True)
		compactFile = tempfile.NamedTemporaryFile(prefix = "pyspark2pmml-", suffix = ".pmml")
		compactPmmlPath = pmmlBuilder.buildFile(compactFile.name)

		self.assertGreater(os.path.getsize(nonCompactPmmlPath), os.path.getsize(compactPmmlPath) + 100)
Example #9
def rFormula():
    rFormula = RFormula(formula="price ~ .",
                        featuresCol="features",
                        labelCol="price",
                        handleInvalid="skip")
    lr = LinearRegression(labelCol="price", featuresCol="features")
    return Pipeline(stages=[rFormula, lr])
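# A minimal usage sketch (not part of the original snippet); trainDF/testDF are assumed
# DataFrames with a numeric "price" column, as in the Airbnb example further below:
# pipelineModel = rFormula().fit(trainDF)
# predDF = pipelineModel.transform(testDF)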
Example #10
def dsi_regression(df: DataFrame, dsi: str, trt: str, ps: str, cov_list: list, regParam: float = 1e-2):
    
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import RFormula
    from pyspark.sql.functions import col, count, lit, mean
    
    if ps:
        rhs_ls = [trt, ps] + cov_list
    else:
        rhs_ls = [trt] + cov_list
    
    dsi_formula = RFormula(
        formula = '%s ~ %s' % (dsi, ' + '.join(rhs_ls)),
        featuresCol="features",
        labelCol="label"
    )
    
    dsi_df = dsi_formula\
        .fit(df)\
        .transform(df.select(['customer_id', dsi] + rhs_ls))\
        .select('customer_id', 'features', 'label')
    
    df_stats = df.filter(col(trt) > 0).select(
        mean(col('Treated_F1M')).alias('mean_dosage'),
        count(lit(1)).alias('total_treated')
    ).collect()[0].asDict()
    
    lr = LinearRegression(
        featuresCol='features',
        labelCol = 'label',
        tol=1e-4, regParam=regParam, elasticNetParam=0.5
    )
    lrm = lr.fit(dsi_df)
    
    return lrm.coefficients, df_stats
Example #11
	def testWorkflow(self):
		df = self.sqlContext.read.csv(irisCsvFile, header = True, inferSchema = True)
		
		formula = RFormula(formula = "Species ~ .")
		classifier = DecisionTreeClassifier()
		pipeline = Pipeline(stages = [formula, classifier])
		pipelineModel = pipeline.fit(df)
		
		pmmlBytes = toPMMLBytes(self.sc, df, pipelineModel)
		pmmlString = pmmlBytes.decode("UTF-8")
		self.assertTrue(pmmlString.find("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">") > -1)
def data_preparation(df, avg_age, feat_name="features", lab_name='label'):

    df = df.fillna(avg_age, subset=['Age'])
    """
    ## unnecessary when using Rformula
    df = df.replace(['male','female'],['-1','1'],'Sex')
    df = df.withColumn('Sex',df.Sex.cast('int'))

    df = df.replace(['S','Q','C'],['-1','0','1'],'Embarked')
    df = df.withColumn('Embarked',df.Embarked.cast('int'))
    df.printSchema()
    """

    # Rformula automatically formats categorical data (Sex and Embarked) into numerical data
    formula = RFormula(
        formula="Survived ~ Sex + Age + Pclass + Fare + SibSp + Parch",
        featuresCol=feat_name,
        labelCol=lab_name)

    df = formula.fit(df).transform(df)
    df.show(truncate=False)

    return df
Example #14
    def __init__(self,
                 formula="tip_amount ~ passenger_count + \
                        fare_amount + vendor_index + ratecode_index \
                        + trip_duration_m + store_and_fwd_flag_index + \
                        trip_type + pu_location_id + do_location_id + \
                        trip_distance"):
        self.reg_formula = RFormula(formula=formula)

        self.feature_indexer = VectorIndexer(inputCol="features",
                                             outputCol="indexed_features",
                                             handleInvalid="keep",
                                             maxCategories=270)
        self.indexers = [self.reg_formula, self.feature_indexer]
        self.form_encoder_model = Pipeline(stages=self.indexers)
Example #16
    def testWorkflow(self):
        df = self.sqlContext.read.csv(os.path.join(os.path.dirname(__file__),
                                                   "resources/Iris.csv"),
                                      header=True,
                                      inferSchema=True)

        formula = RFormula(formula="Species ~ .")
        classifier = DecisionTreeClassifier()
        pipeline = Pipeline(stages=[formula, classifier])
        pipelineModel = pipeline.fit(df)

        pmmlBuilder = PMMLBuilder(self.sc, df, pipelineModel) \
         .putOption(classifier, "compact", True)
        pmmlBytes = pmmlBuilder.buildByteArray()
        pmmlString = pmmlBytes.decode("UTF-8")
        self.assertTrue(
            pmmlString.find(
                "<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">")
            > -1)
Example #17
def add_propensity(df: DataFrame, lhs: str, rhs: list, ps_col: str, regParam: float = 1e-2):
    
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import RFormula
    
    PS_formula = RFormula(
        formula = '%s ~ %s' % (lhs, ' + '.join(rhs)),
        featuresCol="features",
        labelCol="label"
    )
    
    PS_df = PS_formula\
        .fit(df)\
        .transform(df.select(['customer_id', lhs] + rhs))\
        .select('customer_id', 'features', 'label')
    
    lr = LogisticRegression(
        featuresCol='features',
        labelCol = 'label',
        tol=1e-4, regParam=regParam, elasticNetParam=0.3
    )
    
    preds = lr\
                .fit(PS_df).transform(PS_df)\
                .select(['customer_id', 'probability', 'label'])\
                .withColumn(ps_col, split2_udf('probability'))\
                .drop('probability')
    mean_PS = preds.rollup('label').mean(ps_col).alias('mean_ps').collect()
    mean_PS = [x.asDict() for x in mean_PS]
    
    mean_trt = preds.rollup('label').count().collect()
    mean_trt = [x.asDict() for x in mean_trt]
    
    df = df.join(preds.drop('label'), on=['customer_id'], how='inner')
    
    return df, mean_PS, mean_trt
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)


# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])


# COMMAND ----------
Example #19
from __future__ import print_function

# $example on$
from pyspark.ml.feature import RFormula
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("RFormulaExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(7, "US", 18, 1.0),
         (8, "CA", 12, 0.0),
         (9, "NZ", 15, 0.0)],
        ["id", "country", "hour", "clicked"])

    formula = RFormula(
        formula="clicked ~ country + hour",
        featuresCol="features",
        labelCol="label")

    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()
    # $example off$

    spark.stop()
Example #20
def main():
    # silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTClassifier'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict,'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json'%(str(task_id)+'_'+str(job_id)+'_'+model_name)
    dir_of_storeModel = train_result_dir + '/%s_model'%(str(task_id)+'_'+str(job_id)+'_'+model_name)

    # configure the Spark session
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTClassifier_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc=sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # for testing
        #dataset = dataset[0:1000]
        # cap the size of the majority class
        #dataset = too.CalcMostLabel(dataset,Y_names)
        Y_datavec = dataset[Y_names].values
        # print the count of each label
        print('Counter:original y', Counter(Y_datavec))
        print('----------------------------------------------')
        # get the string and numeric columns separately, then merge them
        X_datavec,X_columns,vocabset,datavec_show_list= too.Merge_form(dataset,names_str,names_num,names_show,'vocabset','open')
        # normalize the data
        X_datavec = too.Data_process(X_datavec,normalized_type)
        # handle class imbalance
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X,Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num,ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X,ret_num)
        # store the vocabset list and ret_num
        too.StorePara(dir_of_storePara,vocabset,ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        targets = pd.DataFrame(Y, columns = ['Y'])
        # concatenate the feature and target matrices
        merged = pd.concat([features, targets], axis = 1)
        # create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # extract features and label
        formula = RFormula(formula = 'Y ~ .', featuresCol="features",labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],seed=666)
        # train the model
        clf_model = dmp.Distr_GBTClassifier(xy_train,xy_test)
        # save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara,'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # get the string and numeric columns separately, then merge them
        X_datavec,datavec_show_list = too.Merge_form(dataset,names_str,names_num,names_show,vocabset,'close')
        # normalize the data
        X = too.Data_process(X_datavec,normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X,ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X,)
        # create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,outputCol='features').transform(raw_features)
        clf_model = GBTClassificationModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model, dir_of_outputdata)
        duration = too.Duration(time()-time_start)
        print('Total run time: %s' % duration)
Example #21
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True)
])
rawTraining = spark.read.csv(trainingFilePart,
                             header=True,
                             schema=customSchema)
selectedTraining = rawTraining.select(
    col('Survived').alias('label'), 'PClass', 'Sex', 'Age', 'Fare')
addingColTraining = selectedTraining.withColumn(
    'Missing_Age', selectedTraining['Age'].isNull()).withColumn(
        'Missing_Fare', selectedTraining['Fare'].isNull())
'''build pipeline'''
imputer = Imputer(inputCols=['Age', 'Fare'],
                  outputCols=['Out_Age', 'Out_Fare'])
rformula = RFormula(
    formula='~ Sex + Out_Age + Missing_Age + Out_Fare + Missing_Fare',
    featuresCol='features')
lr = LogisticRegression(family='binomial')
pipeline = Pipeline(stages=[imputer, rformula, lr])
'''build validation'''
evaluator = BinaryClassificationEvaluator()
grid = ParamGridBuilder().addGrid(lr.maxIter, [10, 50, 100])\
       .addGrid(lr.regParam, [0.0, 0.01, 0.03, 0.1, 0.3])\
       .addGrid(lr.elasticNetParam, [0.0, 0.01, 0.03])\
       .build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)

model = cv.fit(addingColTraining)
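# Follow-up note (not in the original snippet): the fitted CrossValidatorModel exposes
# the best pipeline as model.bestModel and the mean metric for each grid point as
# model.avgMetrics; model.bestModel.stages[-1] is the selected LogisticRegressionModel.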
Example #22
filePath = "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

# COMMAND ----------

from pyspark.sql.functions import col, log
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression

logTrainDF = trainDF.withColumn("log_price", log(col("price")))
logTestDF = testDF.withColumn("log_price", log(col("price")))

rFormula = RFormula(formula="log_price ~ . - price",
                    featuresCol="features",
                    labelCol="log_price",
                    handleInvalid="skip")

lr = LinearRegression(labelCol="log_price", predictionCol="log_pred")
pipeline = Pipeline(stages=[rFormula, lr])
pipelineModel = pipeline.fit(logTrainDF)
predDF = pipelineModel.transform(logTestDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Exponentiate
# MAGIC
# MAGIC In order to interpret our RMSE, we need to convert our predictions back from logarithmic scale.

# COMMAND ----------
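# A minimal sketch of the missing exponentiation step, assuming the predDF, "log_pred"
# and "price" names from the cells above: undo the log with exp() so the RMSE is
# reported on the original price scale.
from pyspark.sql.functions import col, exp
from pyspark.ml.evaluation import RegressionEvaluator

expDF = predDF.withColumn("prediction", exp(col("log_pred")))
rmse = RegressionEvaluator(labelCol="price", predictionCol="prediction",
                           metricName="rmse").evaluate(expDF)
print("RMSE on the original price scale:", rmse)

# COMMAND ----------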
Example #23
from pyspark.ml.feature import StringIndexer
plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID1')
labeller = plan_indexer.fit(train)

#%%

Train1 = labeller.transform(train)
Test1 = labeller.transform(test)

Train1.show()

#%%

from pyspark.ml.feature import RFormula
formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")

t1 = formula.fit(Train1)
#%%

train1 = t1.transform(Train1)
test1 = t1.transform(Test1)

train1.show()

train1.select('features').show()
train1.select('label').show()

#%%

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import StringIndexer
## Index labels, adding metadata to the label column.
## Fit on whole dataset to include all labels in index.
data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data)
data.show()
## transform() can produce a separate DataFrame; .transform(data) does not have to be applied to the same data the indexer was fit on
#labelIndexer  ===> data


# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## adjust the feature set by editing the formula "..." string below
formula = RFormula(
    formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
formula_data.select("features","label").show()


# Split the data into training and test sets (30% held out for testing)
# already done above!
# Split training and test data.
(training, test) = formula_data.randomSplit([0.7, 0.3], seed = 12345)  # seed makes the split reproducible
training.show()


from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
Example #25
# load the CSV data into a DataFrame
adDF = spark.read.csv("dataset/Advertising.csv", inferSchema=True, header=True)
# show the first 5 rows
adDF.show(5)
# how many rows in total?
adDF.count()
adDF.count()

adDF.printSchema()

from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors

# how to vectorize using the transformer (RFormula) API
dataModel = RFormula().setFormula("Sales ~.").setFeaturesCol("features").setLabelCol("label")
model_fit = dataModel.fit(adDF).transform(adDF)

model_fit.show()
model_fit.printSchema()

model_fit_select = model_fit.select(["features","label"])

model_fit_select.show()
model_fit_select.printSchema()

# vectorize using the Vectors helper on the RDD
adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])

adV.show()
adV.printSchema()
                             inferSchema=True)

    dataset.show()

    abc = dataset.schema.fields

    featuresCol = []

    for x in abc:
        # print(type(x.dataType))
        if (isinstance(x.dataType, StringType)):
            print(x.name + "   " + str(x.dataType))
            # dataset.select(x.name)
            featuresCol.append(x.name)

    f = ""
    f = "ACCELERATION" + " ~ "

    for x in featuresCol:
        f = f + x + "+"
    f = f[:-1]
    f = (f)

    print(f)

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")

    output = formula.fit(dataset).transform(dataset)

    output.show(truncate=False)
Example #27
# Drop the label (avgof14D_lag0D) and other non-feature columns before building the formula
columns.remove('date')
columns.remove('open')
columns.remove('high')
columns.remove('low')
columns.remove('close')
columns.remove('Name')
#columns.remove('market_value')
columns.remove('avgof7D_lag3D')
columns.remove('avgof14D_lag0D')
columns.remove('avgof28D_lag14D')
columns.remove('id')

formula = "{} ~ {}".format("avgof14D_lag0D", " + ".join(columns))
print("Formula : {}".format(formula))
rformula = RFormula(formula=formula)
lr = LinearRegression()
pipeline = Pipeline(stages=[rformula, lr])
# Parameter grid
paramGrid = ParamGridBuilder()\
          .addGrid(lr.regParam,[0.01, .04])\
          .build()
cv = CrossValidator()\
      .setEstimator(pipeline)\
      .setEvaluator(RegressionEvaluator()\
                       .setMetricName("r2"))\
      .setEstimatorParamMaps(paramGrid)\
      .setNumFolds(3)

cvModel = cv.fit(train_data)
Example #28
def dr_regression(df: DataFrame, dsi: str, trt: str, ps: str, cov_list: list, 
                  regParam: float = 1e-2, max_iptw: float = 1e-4):
    
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import RFormula
    from pyspark.sql.functions import col, count, expr, mean, stddev
    
    if not dsi or not trt or not ps:
        return
#     if df.count() < 1000:
#         return
    
    _, df = df.custom.drop_const_cols(cov_list[0], cov_list[1])
    
    if cov_list:
        flat_cov_list = [x for sublist in cov_list for x in sublist]
        flat_cov_list = [c for c in flat_cov_list if c in df.schema.names]
    
    df = (
        df
        .withColumn('iptw', 1./((col(trt)*col(ps)+(1-col(trt))*(1-col(ps)))+1e-4))
        .withColumn('ipt0', 1./(1-col(ps)+1e-4))
        .cache()
    )
    rhs_ls = [trt, 'iptw'] + flat_cov_list
    
    transformer = RFormula(
        formula='%s ~ %s' % (dsi, ' + '.join(rhs_ls)),
        featuresCol="features",
        labelCol="label"
    ).fit(df)
    
    dsi_df = (
        transformer
        .transform(df.select(['customer_id', dsi] + rhs_ls))\
        .select('customer_id', 'features', 'label')
    )
    
    lr = LinearRegression(
        featuresCol='features',
        labelCol = 'label',
        tol=1e-4, regParam=regParam, elasticNetParam=0.5
    )
    lrm = lr.fit(dsi_df)
    
    dsi_1 = lrm.transform(
        transformer
        .transform(
            df
            .filter(col(trt) > 0.)
            .select(['customer_id', dsi] + rhs_ls)
        ).select('customer_id', 'features', 'label')
    ).select('customer_id', col('prediction').alias('dsi_1'))
    
    dsi_0 = lrm.transform(
        transformer
        .transform(
            df
            .filter(col(trt) > 0.)
            .withColumn(trt, 1.-col(trt))
            .drop('iptw').withColumnRenamed('ipt0', 'iptw')
            .select(['customer_id', dsi] + rhs_ls)
        ).select('customer_id', 'features', 'label')
    ).select('customer_id', col('prediction').alias('dsi_0'))
    
    estimates = (
        dsi_1
        .join(dsi_0, on=['customer_id'], how='inner')
        .withColumn('effect', col('dsi_1') - col('dsi_0'))
        .agg(
            mean(col('effect')).alias('att_mean'),
            expr('percentile(effect, array(0.5))')[0].alias('att_median'),
            stddev(col('effect')).alias('att_std'),
            count(col('customer_id')).alias('total_trt')
        ).collect()
    )
    
    return estimates
Example #29
      StructField("C18", DoubleType(), False),
      StructField("C19", DoubleType(), False),
      StructField("C20", DoubleType(), False),
      StructField("C21", DoubleType(), False)])
# Get file
df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv")
# Displays the content of the DataFrame to stdout
df.show()

from pyspark.ml.feature import StringIndexer
data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df)
data.show()

# RFormula
from pyspark.ml.feature import RFormula
formula = RFormula(formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_model + C14 + C17 + C18 + C19 + C21 ", featuresCol="features", labelCol="label")
output = formula.fit(data).transform(data)
data1 = output.select("label", "features")
data1.show()

# Split training and test data.
#(training, test) = data1.randomSplit([0.7, 0.3], seed = 12345)
training, test = data1.randomSplit([0.7, 0.3], seed = 12345)
training.show()

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and rf (random forest).
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.ml import Pipeline
Example #30
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StandardScaler

sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()

# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()
Example #31
categorical = df_train.columns
categorical.remove('label')
print(categorical)

cat_inter = ['C14', 'C15']

concat = '+'.join(categorical)
interaction = ':'.join(cat_inter)
formula = "label ~ " + concat + '+' + interaction

print(formula)

from pyspark.ml.feature import RFormula
interactor = RFormula(formula=formula,
                      featuresCol="features",
                      labelCol="label").setHandleInvalid("keep")

interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
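# A minimal follow-up sketch (not part of the original snippet), assuming df_train from above:
# pipelineModel = pipeline.fit(df_train)
# pipelineModel.transform(df_train).select("label", "prediction").show(5)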
denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)
print(sparseVec)

# COMMAND ----------

df = spark.read.json("/databricks-datasets/definitive-guide/data/simple-ml")
df.orderBy("value2").show()

# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . +color:value1 + color:value2")

# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")
# We have more work to do!


# ## Exercises

# (1) Import the
# [RFormula](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.RFormula)
# class from the `pyspark.ml.feature` module.

from pyspark.ml.feature import RFormula

# (2) Create an instance of the `RFormula` class with the R formula
# `star_rating ~ reviewed + vehicle_year + vehicle_color`.

rformula = RFormula(formula = "star_rating ~ reviewed + vehicle_year + vehicle_color")

# (3) Specify a pipeline consisting of the `filterer`, `extractor`, and the
# RFormula instance specified above.

pipeline = Pipeline(stages=[filterer, extractor, rformula])

# (4) Fit the pipeline on the `train` DataFrame.

pipeline_model = pipeline.fit(train)

# (5) Use the `save` method to save the pipeline model to the
# `models/pipeline_model` directory in HDFS.

pipeline_model.write().overwrite().save("models/pipeline_model")
sales = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
fakeIntDF = spark.read.parquet("/data/simple-ml-integers")
simpleDF = spark.read.json("/data/simple-ml")
scaleDF = spark.read.parquet("/data/simple-ml-scaling")


# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()
Example #35
      StructField("device_model", StringType(), True),
      StructField("device_type", DoubleType(), False),
      StructField("device_conn_type", DoubleType(), False),
      StructField("C14", DoubleType(), False),
      StructField("C15", DoubleType(), False),
      StructField("C16", DoubleType(), False),
      StructField("C17", DoubleType(), False),
      StructField("C18", DoubleType(), False),
      StructField("C19", DoubleType(), False),
      StructField("C20", DoubleType(), False),
      StructField("C21", DoubleType(), False)])
df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv")

data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df)

formula = RFormula(formula="label ~ C1 + banner_pos + site_category + app_category +device_type + device_conn_type + C15 + C16 + C18 + C19", featuresCol="features", labelCol="label")
output = formula.fit(data).transform(data)
data1 = output.select("label", "features")
(training, test) = data1.randomSplit([0.8, 0.2], seed = 12345)


#gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64)
gbt = GBTClassifier(maxIter = 30, maxDepth = 2)  # GBTClassifier has no gini/impurityType parameter

#gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64)
pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)

testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)
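# A hedged follow-up sketch (not in the original): overall accuracy of the test predictions,
# assuming Spark >= 2.0 for the "accuracy" metric.
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# print(MulticlassClassificationEvaluator(metricName="accuracy").evaluate(testPredictions))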
Example #37
        .getOrCreate()

data_simple_ml = "C:\\PySpark\\data\\simple-ml"
data_simple_ml_persist = "C:\\PySpark\\data\\simple-ml\\persisted-models"

df = spark.read.json(data_simple_ml)

df.printSchema()
df.orderBy("value2").show()

train, test = df.randomSplit([0.7, 0.3])

from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

from pyspark.ml import Pipeline

stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

from pyspark.ml.tuning import ParamGridBuilder

params = ParamGridBuilder()\
  .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"])\
  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
  .addGrid(lr.regParam, [0.1, 2.0])\