Example #1
def evaluate_model(pipeline_model, data):
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="label")
    results = pipeline_model.transform(data)
    AUC = evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'})
    AP = evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'})
    return AUC, AP
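A minimal usage sketch, assuming pyspark.ml.evaluation is imported as ev (as the function requires) and that hypothetical pipeline_model and test_df objects come from an earlier fit and split:

import pyspark.ml.evaluation as ev

auc, ap = evaluate_model(pipeline_model, test_df)
print('AUC = %.4f, AP = %.4f' % (auc, ap))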
Example #2
def GBDTclf(trainingData, testData):
    # `GBDT` (a pyspark.ml gradient-boosted-trees classifier), `evaluate`, and
    # numpy as `np` are assumed to be defined at module level.

    max_depth = [1, 5, 10]
    grid = tune.ParamGridBuilder() \
        .addGrid(GBDT.maxDepth, max_depth) \
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='label')

    # 3-fold validation
    cv = tune.CrossValidator(estimator=GBDT,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    # pipelineDtCV = Pipeline(stages=[cv])
    cvModel = cv.fit(trainingData)
    results = cvModel.transform(testData)

    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    # print(evaluate(label,predict))
    return evaluate(label, predict)
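The evaluate helper called above is not shown in this snippet; a minimal sketch, assuming it returns the [accuracy, precision, recall, f1] list announced by the print and that scikit-learn is available:

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(label, predict):
    # Flatten the (n, 1) arrays produced by toPandas().values.
    y_true, y_pred = np.ravel(label), np.ravel(predict)
    return [accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred)]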
Example #3
def _calc_auc_auprc(df: DataFrame, prob_col: str,
                    label_col: str) -> Tuple[float, float]:
    r"""
    Given a df, labels, and probabilities, return auc and auprc (area under
    precision recall curve)

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_col : str
        colname w/ raw probabilities of being in class 1
    label_col : str

    Returns
    -------
    auc : float
    auprc : float
        area under precision recall curve

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    pyspark.ml.evaluation.BinaryClassificationEvaluator
    """

    auc_eval = mle.BinaryClassificationEvaluator(rawPredictionCol=prob_col,
                                                 labelCol=label_col,
                                                 metricName='areaUnderROC')
    auc = auc_eval.evaluate(df)

    auprc_eval = mle.BinaryClassificationEvaluator(rawPredictionCol=prob_col,
                                                   labelCol=label_col,
                                                   metricName='areaUnderPR')
    auprc = auprc_eval.evaluate(df)

    return auc, auprc
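A usage sketch, assuming pyspark.ml.evaluation is imported as mle (as the function requires) and a hypothetical predictions_df carrying a probability column next to a binary label column:

auc, auprc = _calc_auc_auprc(predictions_df, prob_col='probability', label_col='label')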
Example #4
def train_evaluate(train_data, test_data):
    # Convert the textual categorical feature into a numeric index
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")

    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                             featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder()\
        .addGrid(rf.impurity, ["gini", "entropy"])\
        .addGrid(rf.maxDepth, [5, 10, 15])\
        .addGrid(rf.maxBins, [10, 15, 20])\
        .addGrid(rf.numTrees, [10, 20, 30])\
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)

    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]  # the fitted CrossValidatorModel
    best_param = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_param, cv_pipeline_model
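get_best_param is not defined in this snippet; a minimal sketch, assuming it returns the best-scoring parameter map from the fitted CrossValidatorModel (the same getEstimatorParamMaps/avgMetrics pairing used in Example #13 below):

def get_best_param(cv_model):
    # Pair each candidate parameter map with its mean cross-validation metric.
    scored = zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics)
    best_params, _ = max(scored, key=lambda pair: pair[1])
    return {param.name: value for param, value in best_params.items()}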
Example #5
# Import the evaluation submodule
import pyspark.ml.evaluation as evals

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")
Example #6
def main(path_data, path_parameters, dir_models):
    logger = logging.getLogger(__name__)
    spark = (
        pyspark.sql.SparkSession
            .builder
            .appName("Python Spark Random Forest model training")
            .enableHiveSupport()
            .getOrCreate()
    )

    logger.info("Reading parquet data and splitting into test and train datasets")
    data_df = spark.read.parquet(path_data)
    splits = data_df.randomSplit([0.7, 0.3])
    training_df = splits[0]
    validation_df = splits[1]

    logger.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']
    assembler = feature.VectorAssembler(
        inputCols=feature_columns,
        outputCol="features")

    rf = classification.RandomForestClassifier(
        labelCol="churn", **rf_params)

    rf_pipeline = pipeline.Pipeline(stages=[assembler, rf])
    logger.info("Training prediction model")
    pipeline_model = rf_pipeline.fit(training_df)

    logger.info("Calculating model metrics")
    train_predictions_df = pipeline_model.transform(training_df)
    validation_predictions_df = pipeline_model.transform(validation_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")

    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")

    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")

    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    auroc_evaluator = evaluation.BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol="churn")

    logger.info("Saving model and metrics data")
    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }
    validation_metrics = {
        "accuracy": accuracy_evaluator.evaluate(validation_predictions_df),
        "precision": precision_evaluator.evaluate(validation_predictions_df),
        "recall": recall_evaluator.evaluate(validation_predictions_df),
        "f1": f1_evaluator.evaluate(validation_predictions_df),
        "auroc": auroc_evaluator.evaluate(validation_predictions_df)
    }

    rf_model = pipeline_model.stages[-1]
    model_params = rf_model.extractParamMap()
    model_description = {
        "name": "Random Forest",
        "params": {param.name: value for param, value in model_params.items()},
    }

    dir_model = pathlib.Path(dir_models)
    dir_model.mkdir(parents=True, exist_ok=True)

    path_pipeline_model = dir_model.joinpath("pipeline_model")
    path_train_metrics = dir_model.joinpath("metrics_train.json")
    path_validation_metrics = dir_model.joinpath("metrics_validation.json")
    path_model_description = dir_model.joinpath("model_description.json")

    pipeline_model.save(str(path_pipeline_model))
    with open(path_train_metrics, "w") as f:
        json.dump(train_metrics, f)
    with open(path_validation_metrics, "w") as f:
        json.dump(validation_metrics, f)
    with open(path_model_description, "w") as f:
        json.dump(model_description, f)
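main reads its feature list and classifier settings from the path_parameters JSON file; a hypothetical example of that file (the keys come from the code above, the column names and values are invented for illustration):

{
    "feature_columns": ["tenure", "monthly_charges", "total_charges"],
    "rf_params": {"numTrees": 50, "maxDepth": 10, "seed": 42}
}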
Example #7
    train, test = data.randomSplit([0.7, 0.3], seed=1234)
    lr_model = cl.LogisticRegression(
        # maxIter=10,
        # regParam=0.01,
        elasticNetParam=0,
        family='binomial',
        threshold=0.5,
        weightCol='weight',
        labelCol='y')

    grid = tune.ParamGridBuilder()\
        .addGrid(lr_model.maxIter, [200, 300, 500, 800])\
        .addGrid(lr_model.regParam, [0.001, 0.002])\
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='y')

    cv = tune.CrossValidator(estimator=lr_model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    pipeline = Pipeline(stages=[featuerCreator])
    train_transformer = pipeline.fit(train)

    cv_model = cv.fit(train_transformer.transform(train))
    test = train_transformer.transform(test)
    results = cv_model.transform(test)
    print('predict_results_type:', type(results))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
Example #8
# Import the tuning submodule (plus the other imports this snippet relies on)
import pyspark.ml.tuning as tune
import pyspark.ml.evaluation as evals
import numpy as np

from pyspark.sql.functions import udf, col
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Create a LogisticRegression Estimator
lr = LogisticRegression(maxIter=300)

# Create Pipeline (need to drop the label)
assembler = VectorAssembler(
    inputCols=df.select(modeling_vars).drop('label').columns,
    outputCol="features")
pipeline = Pipeline(stages=[assembler, lr])

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName='areaUnderPR')

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
# (np.arange(0.00001, 1, 50) would yield a single value because the step
# exceeds the range; np.linspace gives a 50-point sweep instead)
grid = grid.addGrid(lr.regParam, np.linspace(0.00001, 1, 50))
# This does both l1 and l2 - list of 0 and 1
# NOTE - 1 = LASSO, 0 = Ridge regression
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator
cv = tune.CrossValidator(estimator=pipeline,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=3)  # continuation assumed: the original snippet is cut off here
Example #9
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

model = pipeline.fit(births_train)
test_model = model.transform(births_test)

test_model.take(1)

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')

print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)

loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)

Example #10
def loadData4Validation(K=5):
    trainDataFile = '/user/mydata/tianchi/repeat_buyers_format2/train_format2.csv'
    #trainDataFile = '/user/mydata/tianchi/repeat_buyers_format2/train_format2_sub.csv'

    log("start to validate.\n")
    aucTotal = 0
    for i in range(K):
        t1 = time.time()
        # Read the data and split it into training and test sets.
        df = sqlContext.read.csv(trainDataFile,
                                 header='true',
                                 inferSchema='true',
                                 sep=',').repartition(1000)
        df = df.withColumn('random', rand())
        # Disjoint 80/20 split.
        train_df, test_df = df.filter("random<=0.8").repartition(
            1000), df.filter("random>0.8").repartition(1000)

        # Training phase
        trainData = dataProcess(train_df, mode='validation')
        #         trainData = trainData.withColumn('random', rand())
        #         trainData = trainData.where("(label==0 and random>0.5) or label=1")
        log("data preprocessing costs " + str(time.time() - t1) + ".\n")
        #clf = LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction')
        log('training it\n')
        #clf = GBTRegressor(maxIter=200, maxDepth=6, seed=42,subsamplingRate=0.7)
        clf = RandomForestRegressor(subsamplingRate=0.7,
                                    numTrees=50,
                                    featureSubsetStrategy='0.5')
        model = clf.fit(trainData)
        log('training cost ' + str(time.time() - t1) + 's\n')

        # Testing phase
        testData = dataProcess(test_df, mode='validation')
        log('transforming them\n')
        train_prediction = model.transform(trainData)
        test_prediction = model.transform(testData)
        # process01 (defined elsewhere) presumably maps raw predictions into [0, 1].
        udfProcessFloat201 = udf(process01, DoubleType())
        #log(str(train_prediction.rdd.take(2)) + '\n')
        train_prediction = train_prediction.withColumn(
            'prediction_final',
            udfProcessFloat201(train_prediction.prediction))

        test_prediction = test_prediction.withColumn(
            'prediction_final', udfProcessFloat201(test_prediction.prediction))

        #log(str(train_prediction.rdd.take(20)) + '\n')
        #"""
        #use spark to evaluate model
        print("#####evaluating#######\n\n\n\n\n\n\n\n\n\n\n")
        evaluator = ev.BinaryClassificationEvaluator(
            rawPredictionCol='prediction_final', labelCol='label')
        print("############\n\n\n\n\n\n\n\n\n\n\n")
        train_auc = evaluator.evaluate(train_prediction,
                                       {evaluator.metricName: 'areaUnderROC'})
        test_auc = evaluator.evaluate(test_prediction,
                                      {evaluator.metricName: 'areaUnderROC'})
        log(
            str(i) + " epoch auc is " + str(test_auc) + ', training auc is ' +
            str(train_auc) + '\n')
        t2 = time.time()
        print("############time cost is " + str(t2 - t1) +
              "\n\n\n\n\n\n\n\n\n\n\n")
        #use spark to evaluate model
        #"""
        '''
        #use sklearn to evaluate model
        print("############\n\n\n\n\n\n\n\n\n\n\n")
        from sklearn.metrics import auc
        from sklearn.metrics import roc_auc_score
        print("############\n\n\n\n\n\n\n\n\n\n\n")
        train_prediction, test_prediction = \
                train_prediction.select('label', 'prediction_final').toPandas(), \
                test_prediction.select('label', 'prediction_final').toPandas()
        trainLabels, trainPredictions = \
                 train_prediction['label'].values, train_prediction['prediction_final'].values
        testLabels, testPredictions = \
                 test_prediction['label'].values, test_prediction['prediction_final'].values
        train_auc, test_auc = roc_auc_score(trainLabels, trainPredictions), roc_auc_score(testLabels, testPredictions)
        log(str(i) + " epoch auc is " + str(test_auc) + ', training auc is ' +str(train_auc) +  '\n')
        print("############\n\n\n\n\n\n\n\n\n\n\n")
        #use sklearn to evaluate model
        '''
        aucTotal += test_auc
    print("av auc is ", aucTotal / 5)
    log(str(aucTotal / 5) + '\n')
    print("############\n\n\n\n\n\n\n\n\n\n\n")
Example #11
# Bring everything together
validator = smt.CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=eval_f1,
                               numFolds=3)

# Fit the model to the data #######################################################################'
model = validator.fit(train)

train_predictions = model.transform(train)
val_predictions = model.transform(val)

# Evaluate model performance

# Note: rawPredictionCol is normally 'rawPrediction' or 'probability'; this
# snippet points it at the 'predictedLabel' column produced upstream.
eval_roc = sme.BinaryClassificationEvaluator(labelCol='label',
                                             rawPredictionCol='predictedLabel',
                                             metricName='areaUnderROC')

eval_accuracy = sme.MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='predictedLabel', metricName='accuracy')

eval_precision = sme.MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='predictedLabel',
    metricName='weightedPrecision')

eval_recall = sme.MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='predictedLabel',
    metricName='weightedRecall')
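
A usage sketch, scoring the validation predictions from above with each evaluator:

val_metrics = {
    'auroc': eval_roc.evaluate(val_predictions),
    'accuracy': eval_accuracy.evaluate(val_predictions),
    'precision': eval_precision.evaluate(val_predictions),
    'recall': eval_recall.evaluate(val_predictions),
}
print(val_metrics)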
Example #12
    # assembler and classific (the classifier stage) are defined in earlier, unshown code.
    dt_pipe_md = pipeline.Pipeline(stages=[assembler, classific])
    dt_pipe_md_model = dt_pipe_md.fit(training_df)
    train_predictions_df = dt_pipe_md_model.transform(training_df)
    validation_predictions_df = dt_pipe_md_model.transform(validation_df)
    test_prediction_df = dt_pipe_md_model.transform(test_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")

    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")
    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall",
        labelCol="churn",
        predictionCol="prediction")
    auroc_evaluator = evaluation.BinaryClassificationEvaluator(
        metricName='areaUnderROC', labelCol="churn")

    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }

    test_metrics = {
        "accuracy": accuracy_evaluator.evaluate(test_prediction_df),
        "precision": precision_evaluator.evaluate(test_prediction_df),
        "recall": recall_evaluator.evaluate(test_prediction_df),
        "f1": f1_evaluator.evaluate(test_prediction_df),
        "auroc": auroc_evaluator.evaluate(test_prediction_df)
    }  # continuation assumed: the original snippet breaks off mid-dict
Example #13
def hyper_parameter_optimization_ml():
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator])
	data_transformer = pipeline.fit(births_train)

	# Specify our model and the list of parameters we want to loop through.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a logic that will do the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	cvModel = cv.fit(data_transformer.transform(births_train))

	# See if cvModel performed better than our previous model
	data_test = data_transformer.transform(births_test)
	results = cvModel.transform(data_test)

	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

	# Parameters which the best model has.
	results = [
		([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
		for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(results, key=lambda el: el[1], reverse=True)[0])
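	# (sketch) The best fitted estimator is also available directly on the CrossValidatorModel:
	best_model = cvModel.bestModel
	print(best_model.extractParamMap())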
Example #14
def train_validation_splitting_ml():
	spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Select only the top five features.
	selector = ml_feature.ChiSqSelector(
		numTopFeatures=5,
		featuresCol=featuresCreator.getOutputCol(),
		outputCol='selectedFeatures',
		labelCol='INFANT_ALIVE_AT_REPORT'
	)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
	data_transformer = pipeline.fit(births_train)

	# Create LogisticRegression and Pipeline.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a TrainValidationSplit object.
	tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	# Fit our data to the model.
	tvsModel = tvs.fit(data_transformer.transform(births_train))
	data_test = data_transformer.transform(births_test)

	# Calculate results.
	results = tvsModel.transform(data_test)
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
Example #15
def infant_survival_ml():
	spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(births_train)
	test_model = model.transform(births_test)

	print(test_model.take(1))

	# Evaluate the performance of the model.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

	# Save the Pipeline definition.
	pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
	pipeline.write().overwrite().save(pipelinePath)

	# Load the Pipeline definition.
	loadedPipeline = Pipeline.load(pipelinePath)
	loadedPipeline.fit(births_train).transform(births_test).take(1)

	# Save the PipelineModel.
	modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
	model.write().overwrite().save(modelPath)

	# Load the PipelineModel.
	loadedPipelineModel = PipelineModel.load(modelPath)
	test_reloadedModel = loadedPipelineModel.transform(births_test)

	print(test_reloadedModel.take(1))
Example #16
data_piped = pipe.fit(data).transform(data)
data_piped.show()
data_piped = data_piped.select('features', 'Survived')

# splitting into train, test set
tr, te = data_piped.randomSplit([.7, .3])

# fitting models
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='Survived')

model = lr.fit(tr)
pred = model.transform(te)

import pyspark.ml.evaluation as evals
# Use the probability column rather than the hard 'prediction' column, so the
# ROC curve is computed across thresholds instead of collapsing to one point.
evaluator = evals.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='Survived')
AUC = evaluator.evaluate(pred)
print(AUC)


############# model tuning
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np
params = ParamGridBuilder()
params = params.addGrid(lr.regParam, np.arange(0, .1, .01))
params = params.addGrid(lr.elasticNetParam, [0, .5, 1])
params = params.build()
print("Number of models to be tested:", len(params))

# create the CrossValidator
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=3)  # continuation assumed: the original snippet is cut off here
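
A usage sketch continuing the snippet: fit the cross-validator on the training split and score the held-out split with the evaluator defined above:

cv_model = cv.fit(tr)
print(evaluator.evaluate(cv_model.transform(te)))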