from pyspark.ml.classification import LogisticRegression


def logistic_regression_generator(training_data, deal_id):
    ####In:
    #A training data set, as generated by data_prep()
    #The deal_id you want to generate a model for

    ####Out:
    #The fitted model is saved to S3
    #An update message is returned

    training_data = training_data.withColumnRenamed(deal_id, 'label')
    lr = LogisticRegression(maxIter=100,
                            regParam=0.0001,
                            elasticNetParam=1,  #pure L1 penalty, i.e. LASSO
                            family="binomial")
    model = lr.fit(training_data)
    model.write().overwrite().save(
        f"s3://rtl-databricks-datascience/lpater/logistic_regression/{deal_id}/"
    )
    output_message = f"Saved a Logistic Regression model for {deal_id}."

    #see also: https://spark.apache.org/docs/latest/ml-classification-regression.html

    #note: with elasticNetParam=1 this is a LASSO fit, so the regularisation
    #also performs feature selection
    return output_message
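# A minimal sketch of reading the persisted model back, assuming the same
# deal_id and S3 prefix as above (deal_id must be in scope here);
# LogisticRegressionModel.load() is the pyspark.ml reader counterpart of
# model.write().save(). `scoring_data` is a hypothetical DataFrame with the
# same feature columns as the training set.
from pyspark.ml.classification import LogisticRegressionModel

model = LogisticRegressionModel.load(
    f"s3://rtl-databricks-datascience/lpater/logistic_regression/{deal_id}/"
)
predictions = model.transform(scoring_data)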
Example #2
                print("points labeled as 'A' and 'N' ")
                #spark.stop()
                print("in logistic regression...")
                #Logistic Regression
                df = spark.read.format("csv").option(
                    "header",
                    "false").schema(struct).load("output/labeledN.csv")
                ###df.registerTempTable("census")
                ###tmp = sqlContext.sql("SELECT * from census WHERE Label = 'N'")
                #tmp = tmp.select(LR_FEATURES)
                vecAssembler = VectorAssembler(inputCols=LR_FEATURES,
                                               outputCol="features")
                lr_df = vecAssembler.transform(df).select('Class', 'features')
                lr = LogisticRegression(featuresCol='features',
                                        labelCol='Class',
                                        maxIter=10)
                lrModel = lr.fit(lr_df)
                lr_path = os.getcwd() + "/lr"
                lrModel_path = os.getcwd() + "/lr_model"
                lr.write().overwrite().save(lr_path)
                lrModel.write().overwrite().save(lrModel_path)
            print("pipelined training process completed")
        else:  # if fileSET == tmpSET, there is no change in stream files.
            print("No incoming stream...")
        tmpSET = set(fileSET)
        print("Number of current clusters->", len(centerPoints))
        sleep(10)

    spark.stop()
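# A minimal sketch of restoring what Example #2 saved: the estimator keeps
# only hyperparameters, while the model keeps the fitted coefficients. Both
# classes expose .load() through pyspark.ml's MLReadable interface.
import os
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

lr_loaded = LogisticRegression.load(os.getcwd() + "/lr")
lrModel_loaded = LogisticRegressionModel.load(os.getcwd() + "/lr_model")
print(lrModel_loaded.coefficients, lrModel_loaded.intercept)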
Example #3

# `lr` below is the fitted LogisticRegressionModel (i.e. the result of a
# .fit() call, not the bare estimator); .summary is only defined on the
# trained model.
summary = lr.summary

print("Labels")
print(summary.labels)

print("Accuracy")
print(summary.accuracy)

print("Precision by Label")
print(summary.precisionByLabel)

print("Recall by Label")
print(summary.recallByLabel)

print("False Positve Rate")
print(summary.falsePositiveRateByLabel)

print("True Positive Rate by Label")
print(summary.truePositiveRateByLabel)

print("Area Under ROC")
print(summary.areaUnderROC)

roc = summary.roc.toPandas()
roc.to_csv("./roc.csv", index=False)
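
# A minimal sketch of plotting the exported curve, assuming matplotlib is
# available; summary.roc has exactly two columns, FPR and TPR, so the pandas
# DataFrame built above can be plotted directly.
import matplotlib.pyplot as plt

plt.plot(roc["FPR"], roc["TPR"])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve")
plt.savefig("./roc_curve.png")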

# model_out_path is defined elsewhere in the original script
lr.write() \
  .overwrite() \
  .save(model_out_path)
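
# A minimal sketch of scoring a held-out set, assuming a hypothetical test
# DataFrame `test_df` with the same schema as the training data, and a label
# column named "label" (the evaluator's default); BinaryClassificationEvaluator
# is pyspark.ml's standard binary metric helper.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = lr.transform(test_df)  #adds rawPrediction/probability/prediction columns
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print("Test AUC:", evaluator.evaluate(predictions))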