def logistic_regression_generator(training_data, deal_id):
    """Fit a binomial logistic regression for one deal and save the model.

    Args:
        training_data: A training data set, as generated by data_prep().
        deal_id: The deal to generate a model for. The column of
            ``training_data`` with this name is renamed to ``label`` before
            fitting, and the value is also embedded in the save path.

    Returns:
        An update message saying that the model for ``deal_id`` was saved.

    See also:
        https://spark.apache.org/docs/latest/ml-classification-regression.html
    """
    # The target column is exposed under the name 'label' for fitting.
    labelled_data = training_data.withColumnRenamed(deal_id, 'label')

    # Note: this currently uses LASSO to select parameters
    # (elasticNetParam=1 together with a small regParam).
    estimator = LogisticRegression(
        maxIter=100,
        regParam=0.0001,
        elasticNetParam=1,
        family="binomial",
    )
    fitted_model = estimator.fit(labelled_data)

    # Any model previously stored for this deal is overwritten.
    writer = fitted_model.write().overwrite()
    writer.save(
        f"s3://rtl-databricks-datascience/lpater/logistic_regression/{deal_id}/"
    )

    return "Saved a Logistic Regression model for " + deal_id + "."
print("points labeled as 'A' and 'N' ") #spark.stop() print("in logistic regression...") #Logistic Regression df = spark.read.format("csv").option( "header", "false").schema(struct).load("output/labeledN.csv") ###df.registerTempTable("census") ###tmp = sqlContext.sql("SELECT * from census WHERE Label = 'N'") #tmp = tmp.select(LR_FEATURES) vecAssembler = VectorAssembler(inputCols=LR_FEATURES, outputCol="features") lr_df = vecAssembler.transform(df).select('Class', 'features') lr = LogisticRegression(featuresCol='features', labelCol='Class', maxIter=10) lrModel = lr.fit(lr_df) lr_path = os.getcwd() + "/lr" lrModel_path = os.getcwd() + "/lr_model" lr.write().overwrite().save(lr_path) lrModel.write().overwrite().save(lrModel_path) print("pipelined training process completed") else: # if fileSET == tmpSET, there is no change in stream files. print("No incoming stream...") tmpSET = set(fileSET) print("Number of current clusters->", len(centerPoints)) sleep(10) spark.stop()
# Print the metrics exposed by the model summary, export the ROC curve to CSV,
# then save the model.
# NOTE(review): `lr` is presumably a fitted LogisticRegressionModel (it is read
# for `.summary`), and `areaUnderROC` / `roc` presumably require a binary
# model -- confirm against the code that creates `lr`.
summary = lr.summary

# (printed heading, summary attribute) pairs, reported in this order. Each
# attribute is looked up only when its turn comes, so the headings and values
# are emitted strictly one after another.
metric_report = (
    ("Labels", "labels"),
    ("Accuracy", "accuracy"),
    ("Precision by Label", "precisionByLabel"),
    ("Recall by Label", "recallByLabel"),
    # Heading fixed: it was misspelled ("Positve") and did not say "by Label"
    # like its siblings, although the value printed is per label.
    ("False Positive Rate by Label", "falsePositiveRateByLabel"),
    ("True Positive Rate by Label", "truePositiveRateByLabel"),
    ("Area Under ROC", "areaUnderROC"),
)
for heading, attribute_name in metric_report:
    print(heading)
    print(getattr(summary, attribute_name))

# Convert the ROC curve to pandas and write it next to the working directory.
roc = summary.roc.toPandas()
roc.to_csv("./roc.csv", index=False)

# Save the model, replacing anything already stored at model_out_path.
lr.write().overwrite().save(model_out_path)