Exemple #1
0
 def test_logistic_regression(self):
     """Check that a saved LogisticRegression estimator reloads consistently.

     Saves an estimator into a fresh temporary directory, loads it back and
     asserts that (a) the loaded instance's uid equals the ``parent`` of its
     ``maxIter`` param and (b) the default value recorded for ``maxIter`` is
     the same on the original and the loaded instance.
     """
     lr = LogisticRegression(maxIter=1)
     path = tempfile.mkdtemp()
     # Everything that can fail runs inside ``try`` so the temporary
     # directory is removed even when save/load raises or an assertion fails.
     try:
         lr_path = path + "/logreg"
         lr.save(lr_path)
         lr2 = LogisticRegression.load(lr_path)
         self.assertEqual(lr2.uid, lr2.maxIter.parent,
                          "Loaded LogisticRegression instance uid (%s) "
                          "did not match Param's uid (%s)"
                          % (lr2.uid, lr2.maxIter.parent))
         self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                          "Loaded LogisticRegression instance default params did not match " +
                          "original defaults")
     finally:
         try:
             rmtree(path)
         except OSError:
             # Best-effort cleanup: a failure to delete the directory must
             # not mask the actual test outcome.
             pass
Exemple #2
0
 def test_logistic_regression(self):
     """Round-trip a LogisticRegression estimator through save/load.

     The estimator is written below a newly created temporary directory and
     read back. The test then asserts that the loaded instance's uid equals
     the ``parent`` of its ``maxIter`` param, and that the default stored
     for ``maxIter`` matches between the original and the loaded instance.
     """
     lr = LogisticRegression(maxIter=1)
     path = tempfile.mkdtemp()
     # Guard the whole body with ``finally`` so a failing assertion or a
     # save/load error cannot leave the temporary directory on disk.
     try:
         lr_path = path + "/logreg"
         lr.save(lr_path)
         lr2 = LogisticRegression.load(lr_path)
         self.assertEqual(
             lr2.uid, lr2.maxIter.parent,
             "Loaded LogisticRegression instance uid (%s) "
             "did not match Param's uid (%s)" % (lr2.uid, lr2.maxIter.parent))
         self.assertEqual(
             lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
             "Loaded LogisticRegression instance default params did not match "
             + "original defaults")
     finally:
         try:
             rmtree(path)
         except OSError:
             # Cleanup is best-effort; do not let it hide the test result.
             pass
Exemple #3
0
# ----------------------------------------------------------------------
## Model training and prediction
# Fits a logistic regression on `train_2`, reports wall-clock training time,
# persists the fitted model, then scores `test_2` with ROC-AUC and PR-AUC.
start_time = time.time()
print('model training start at: ', datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
lr = LogisticRegression(labelCol="OUTCOME", featuresCol="features", maxIter=100)

### Fit the model on training data.
trained_model_lr = lr.fit(train_2)

print('model training completed at: ', datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
# Break the elapsed seconds into hours / minutes / seconds for display.
m, s = divmod(time.time() - start_time, 60)
h, m = divmod(m, 60)
print('model training run time: %d:%02d:%02d' % (h, m, s))

# Persist the *fitted* model rather than the unfitted estimator `lr`:
# saving `lr` would store only the hyper-parameters and discard the result
# of the training run above. Reload with `LogisticRegressionModel.load`.
trained_model_lr.save("Benchmark/trained_model/")


# V. Make predictions on test data


pred_test = trained_model_lr.transform(test_2)

# One evaluator is reused for both metrics; the metric is selected per call
# through the param-map override passed to `evaluate`.
evaluator = BinaryClassificationEvaluator(labelCol="OUTCOME", rawPredictionCol="rawPrediction")
auroc = evaluator.evaluate(pred_test, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(pred_test, {evaluator.metricName: "areaUnderPR"})
print("The ROC_AUC is %.4f and the PR_AUC is %.4f" % (auroc, aupr))

# Recorded output of a previous run:
# The ROC_AUC is 0.8365 and the PR_AUC is 0.3634

    'header',
    'true').csv('C:/Users/mrupv/bits/spa/Assignment2/paysim1/train.csv')

# Cast the raw columns to explicit types. The casts are applied one after
# another in the listed order, each reading the source column from `data`.
df = data
for _column, _spark_type in (
        ("oldbalanceOrg", "double"),
        ("newbalanceOrig", "double"),
        ("oldbalanceDest", "double"),
        ("newbalanceDest", "double"),
        ("step", "int"),
        ("amount", "double"),
        ("isFraud", "int"),
):
    df = df.withColumn(_column, data[_column].cast(_spark_type))

# String columns are turned into index columns before vector assembly.
type_indexer = StringIndexer(inputCol='type', outputCol='type_index')
orig_indexer = StringIndexer(inputCol='nameOrig', outputCol='nameOrig_index')
dest_indexer = StringIndexer(inputCol='nameDest', outputCol='nameDest_index')

# Columns combined into the single 'features' vector column.
feature_columns = [
    'step', 'type_index', 'amount', 'nameOrig_index', 'oldbalanceOrg',
    'newbalanceOrig', 'nameDest_index', 'oldbalanceDest', 'newbalanceDest',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Final stage: logistic regression predicting the 'isFraud' label.
classifier = LogisticRegression(featuresCol='features', labelCol='isFraud')

pipeline_stages = [
    type_indexer,
    orig_indexer,
    dest_indexer,
    assembler,
    classifier,
]
pipeline = Pipeline(stages=pipeline_stages)

# `model` is the fitted pipeline (indexers + assembler + classifier).
model = pipeline.fit(df)

# Scoring the training frame is intentionally left disabled:
#output_df = model.transform(df)

model.save('model/')
Exemple #5
0
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

# Fit a logistic regression on `bdf`, using the "weight" column as weightCol.
blor = LogisticRegression(weightCol="weight", regParam=0.01)
blorModel = blor.fit(bdf)
# Bare attribute reads: the values are discarded when run as a script.
blorModel.coefficients
blorModel.intercept

# Score a single sparse feature vector and read the predicted label.
sparse_row = Row(features=Vectors.sparse(2, [0], [1.0]))
test1 = sc.parallelize([sparse_row]).toDF()
first_scored_row = blorModel.transform(test1).head()
first_scored_row.prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"

# Round-trip the (unfitted) estimator through disk.
estimator_path = "".join([save_path, "lr"])
blor.save(estimator_path)
lr2 = LogisticRegression.load(estimator_path)
lr2.getRegParam()

# Round-trip the fitted model through disk.
model_path = "".join([save_path, "lr_model"])
blorModel.save(model_path)

from pyspark.ml.classification import LogisticRegressionModel
model2 = LogisticRegressionModel.load(model_path)

# The reloaded model should carry the same first coefficient and intercept.
first_coefficient_matches = blorModel.coefficients[0] == model2.coefficients[0]
print(first_coefficient_matches)
intercept_matches = blorModel.intercept == model2.intercept
print(intercept_matches)
print(model2, blorModel)

spark.stop()