from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics


def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get an RDD of (prediction, label) pairs for the RDD-based metric classes.
    # Note: using the hard 0/1 prediction as the "score" makes the binary ROC/PR
    # areas below degenerate; pass the positive-class probability for a real curve.
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
Example #2
    def calculate_metrics(self, df):
        """

    define your own metrics to evaluate cross validation

    :params:

    df: dataframe containing {aprediction} and {label} columns

    :returns:

    confusion matrix

    """

        # cast the ground-truth label to float so it matches the prediction type
        preds_and_labels = df.select('prediction',
                                     f.col('label').cast(t.FloatType()))
        metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

        # per-class metrics for the positive class (label 1.0)
        metrics_dict = dict(
            # unweighted measures
            tpr=metrics.truePositiveRate(label=1.0),
            fpr=metrics.falsePositiveRate(label=1.0),
            precision=metrics.precision(label=1.0),
            recall=metrics.recall(label=1.0),
            fMeasure=metrics.fMeasure(label=1.0))

        # round scalar metrics to 3 decimal places (the "confusion" guard only
        # matters if a confusion matrix is also added to the dict)
        metrics_dict = {
            k: round(v, 3) if k != "confusion" else v
            for k, v in metrics_dict.items()
        }

        return metrics_dict
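A hypothetical call site for this method (the evaluator object and scored DataFrame names are assumptions for illustration):

scored = model.transform(test_df)              # must contain 'prediction' and 'label'
metrics = cv_evaluator.calculate_metrics(scored)
# -> dict with keys 'tpr', 'fpr', 'precision', 'recall', 'fMeasure'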
Example #3
 def predict(self):
     #print self.predictingData.show()
     predictions = self.model.transform(self.predictingData)
     #print predictions.show()
     #df= predictions.select('prediction').collect()
     #return df[0].asDict()["prediction"]
     predictions.select("URL", "prediction", "indexedLabel",
                        "label").show(200)
     predictionAndLabels = predictions.select("prediction",
                                              "indexedLabel").rdd
     metrics = MulticlassMetrics(predictionAndLabels)
     print("TPR: {:.3%} \tFPR: {:.3%}".format(
         metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
     print("TNR: {:.3%} \tFNR: {:.3%}".format(
         metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
     print("Confusion Matrix:")
     for line in metrics.confusionMatrix().toArray():
         print(line)
Example #4
def printStatistics(labelsAndPredictions, data):
    metrics = MulticlassMetrics(labelsAndPredictions)
    labels = data.map(lambda lp: lp.label).distinct().collect()
    print("confusion metrics:")
    cm = metrics.confusionMatrix()
    print(cm)
    print('')
    print('accuracy: ' + str(metrics.accuracy))
    for label in labels:
        print('label: ' + str(label))
        print('fp: ' + str(metrics.falsePositiveRate(label)))
        print('tp: ' + str(metrics.truePositiveRate(label)))
    # NOTE: the label-free precision()/recall() forms are deprecated (newer Spark
    # releases require an explicit label); prefer metrics.accuracy or the
    # weighted* variants.
    recall = metrics.recall()
    precision = metrics.precision()
    print("Recall = %s" % recall)
    print("Precision = %s" % precision)
Example #5
 def performancerdd(self):
     self.calculator = 'RDDs'
     print('Calculating performance metrics using RDDs...')
     # reorder into (prediction, label) tuples, as expected by the metrics classes
     predictionRDD = self.predictions.select(['label','prediction']).rdd.map(lambda line: (line[1],line[0]))
     
     binmetrics = BinaryClassificationMetrics(predictionRDD)
     metrics = MulticlassMetrics(predictionRDD)
     
     self.areaUnderROC = binmetrics.areaUnderROC
     self.areaUnderPR = binmetrics.areaUnderPR
     self.confusionMatrix = metrics.confusionMatrix().toArray()
     self.accuracy = metrics.accuracy
     # NOTE: label-free precision()/recall()/fMeasure() are deprecated; newer Spark
     # releases require an explicit label (or use accuracy / the weighted* variants).
     self.precision = metrics.precision()
     self.recall = metrics.recall()
     self.f1measure = metrics.fMeasure()
     self.falsePositive = metrics.falsePositiveRate(1.0)
     # the FPR of class 0.0 is the false-negative rate of the positive class
     self.falseNegative = metrics.falsePositiveRate(0.0)
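Because predictionRDD carries the hard 0/1 prediction, the areaUnderROC and areaUnderPR above describe a single operating point rather than a full curve. A sketch of the score-based variant, assuming self.predictions also has the usual probability column produced by a pyspark.ml classifier:

     scoreAndLabels = self.predictions.select('probability', 'label').rdd \
         .map(lambda row: (float(row['probability'][1]), float(row['label'])))
     binmetrics = BinaryClassificationMetrics(scoreAndLabels)
     print(binmetrics.areaUnderROC, binmetrics.areaUnderPR)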
Example #6
 def evaluate_model_simple(self, test):
     '''
     Generate summary classification metrics for the fitted model
     --------
     Parameters:
     test: spark.df post vectorization
     --------
     Returns:
     dict containing the area under the PR curve, the area under the ROC curve,
     and the TPR/FPR of the positive class
     '''
     score_model = {}
     predictionAndLabels = test.rdd.map(
         lambda lp: (float(self.model.predict(lp.features)), lp.label))
     # Instantiate metrics object
     metrics = BinaryClassificationMetrics(predictionAndLabels)
     metrics2 = MulticlassMetrics(predictionAndLabels)
     # Area under precision-recall curve
     score_model['precision_recall'] = metrics.areaUnderPR
     # Area under ROC curve
     score_model["ROC_area"] = metrics.areaUnderROC
     score_model['tpr'] = metrics2.truePositiveRate(1.0)
     score_model['fpr'] = metrics2.falsePositiveRate(1.0)
     return score_model
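The original docstring hints at per-threshold rates. A sketch of that idea, assuming self.model is a pyspark.mllib linear classifier (for example the output of LogisticRegressionWithLBFGS.train) whose predict() returns a raw score once clearThreshold() has been called; the method name and threshold grid are illustrative:

 def roc_points(self, test, thresholds=None):
     '''Confusion counts (tp, fp, fn, tn) for a grid of decision thresholds.'''
     thresholds = thresholds or [i / 100.0 for i in range(1, 100)]
     model = self.model
     model.clearThreshold()  # make predict() return scores instead of hard labels
     scored = test.rdd.map(
         lambda lp: (float(model.predict(lp.features)), lp.label)).cache()
     points = []
     for t in thresholds:
         tp = scored.filter(lambda sl: sl[0] >= t and sl[1] == 1.0).count()
         fp = scored.filter(lambda sl: sl[0] >= t and sl[1] == 0.0).count()
         fn = scored.filter(lambda sl: sl[0] < t and sl[1] == 1.0).count()
         tn = scored.filter(lambda sl: sl[0] < t and sl[1] == 0.0).count()
         points.append({'pthres': t, 'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn})
     return points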
Example #7
    def evaluate(self, model=None, trainingData=None, testingData=None):
        """ Ham kiem thu model, in ra man hinh do do chinh xac va thoi gian tinh toan
        """
        time_train = 0
        time_test = 0

        if (not trainingData):
            trainingData = self.trainingData
        if (not testingData):
            testingData = self.testingData

        if (not model):
            # Train model
            print("Training...")
            start_train = datetime.now()
            model = self.trainModel(trainingData)
            time_train = datetime.now() - start_train

        #print("Num nodes: ", model.stages[2].totalNumNodes, "\n", model.stages[2].toDebugString, file=open("modelDebug.txt","w"))
        # Make predictions
        print("Testing...")
        start_test = datetime.now()
        predictions = model.transform(testingData)
        time_test = datetime.now() - start_test

        # Evaluation for flow
        print("{:*^100}".format(""))
        print("Training time: ", time_train)
        print("Testing time: ", time_test)

        featureImportances = {}
        fi = model.stages[2].featureImportances
        features = loadcols(self.dataset)
        for index, value in enumerate(fi):
            featureImportances[features[index]] = value
        fiSorted = sorted(featureImportances.items(),
                          key=lambda x: x[1],
                          reverse=True)
        print("{:*^100}".format(" Feature Importances "))
        f = open("features_importance.txt", "w")
        for feature in fiSorted:
            if feature[1] > 0.000:
                print("{!s} : {:.4%}".format(feature[0].strip(), feature[1]))
            f.write("{!s}\n".format(feature[0].strip()))
        f.close()

        print("{:*^100}".format(" Evaluate for Flow "))

        print("Total predictions:", predictions.count())
        predictions.select("prediction", "indexedLabel",
                           "label").groupBy("label").count().show()

        predictionAndLabels = predictions.select("prediction",
                                                 "indexedLabel").rdd
        metrics = MulticlassMetrics(predictionAndLabels)

        print("Confusion Matrix:")
        for line in metrics.confusionMatrix().toArray():
            print(line)

        print("TPR: {:.3%} \tFPR: {:.3%}".format(
            metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
        print("TNR: {:.3%} \tFNR: {:.3%}".format(
            metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))

        print("Precision: {:.3%} \tRecall: {:.3%} \tAccuracy: {:.3%}".format(
            metrics.precision(1.0), metrics.recall(1.0), metrics.accuracy))

        print(metrics.accuracy)

        print("{:*^100}".format(""))
Example #8
predictions.show(10)

# Metrics

from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

# x[17] must be the column holding the prediction - column 43 in the full dataset, 17 in the integer-only one
x = predictions.rdd.map(lambda x: (x[17], float(x[0]))).collect()
predictionAndLabels = sc.parallelize(x)

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
# NOTE: label-free precision()/recall()/fMeasure() are deprecated; newer Spark
# releases require an explicit label (or use accuracy / the weighted* variants).
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
fpr0 = metrics.falsePositiveRate(0.0)
fpr1 = metrics.falsePositiveRate(1.0)
accuracy = metrics.accuracy
print("Summary Statistics")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 score = %s" % f1Score)
print("False positive rate 0 = %s" % fpr0)
print("False positive rate 1 = %s" % fpr1)
print("Accuracy = %s" % accuracy, '\n')
print("Confusion Matrix")
print(metrics.confusionMatrix().toArray(), '\n')
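Collecting to the driver and re-parallelizing is unnecessary; a sketch that keeps the data distributed and selects columns by name rather than by position (the column names 'prediction' and 'label' are assumptions about this DataFrame):

predictionAndLabels = predictions.rdd.map(
    lambda row: (float(row['prediction']), float(row['label'])))
metrics = MulticlassMetrics(predictionAndLabels)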
Example #9
	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# Weighted stats
	print("Weighted recall = %s" % metrics.weightedRecall)
	print("Weighted precision = %s" % metrics.weightedPrecision)
	print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	#return model parameters
	res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
		   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
		   ('3','Yes','Precision', metrics.precision(0.0)),
		   ('4','Yes','Recall', metrics.recall(0.0)),
	       ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	       ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
		   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	       ('3','Yes','Precision', metrics.precision(1.0)),
		   ('4','Yes','Recall', metrics.recall(1.0)),
	       ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	       ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
		   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	       ('3','Yes','Precision', metrics.precision(2.0)),
	       ('4','Yes','Recall', metrics.recall(2.0)),
	       ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]	

	# Save the model-parameter rows above (res) as JSON to dumpFilePath
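A sketch of that JSON dump, assuming dumpFilePath is a local path defined elsewhere (the key names are illustrative):

	import json
	with open(dumpFilePath, "w") as fh:
		json.dump([{"order": r[0], "flag": r[1], "metric": r[2], "value": r[3]}
		           for r in res], fh, indent=2)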
Example #10
# COMMAND ----------

# DBTITLE 1,Visualizing the data prediction results
display(predAnalysis)

# COMMAND ----------

# DBTITLE 1,Confusion Matrix for the patient model
from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(confusionMatrix.rdd)
cm = metrics.confusionMatrix().toArray()

# COMMAND ----------

# DBTITLE 1,Performance Metrics of the model
print(metrics.falsePositiveRate(0.0))
print(metrics.accuracy)

# COMMAND ----------

# DBTITLE 1,confusion matrix in matplotlib
# MAGIC %python
# MAGIC import matplotlib.pyplot as plt
# MAGIC import numpy as np
# MAGIC import itertools
# MAGIC plt.figure(figsize=(2,2))
# MAGIC classes=list([0,1])
# MAGIC plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# MAGIC plt.title('Confusion matrix')
# MAGIC plt.colorbar()
# MAGIC tick_marks = np.arange(len(classes))
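The notebook cell above ends abruptly after computing tick_marks. A sketch of the plotting steps that typically follow (labels and formatting are illustrative, not recovered from the original notebook):

# MAGIC plt.xticks(tick_marks, classes)
# MAGIC plt.yticks(tick_marks, classes)
# MAGIC for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
# MAGIC     plt.text(j, i, format(int(cm[i, j]), 'd'), horizontalalignment="center",
# MAGIC              color="white" if cm[i, j] > cm.max() / 2. else "black")
# MAGIC plt.ylabel('True label')
# MAGIC plt.xlabel('Predicted label')
# MAGIC plt.show()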
Example #11
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): If true, write probability images for the
            complete image catalog rather than a sample
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility
        config_filename (str): Name of the YAML configuration object in
            ``s3_bucket``

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml
            under ``learner`` key:
                    prefix: The S3 prefix under which CSVs can be read and written
                    pool: Name of CSV file under s3_bucket/prefix giving the
                        comprehensive list of active grid cells
                    incoming_names: Name of CSV file under s3_bucket/prefix giving
                        list of cells used for training/validation
                    image_catalog: Name of CSV file under s3_bucket giving catalog
                        of imagery
                    image_output_pattern: URI pattern used for output of probability
                        images.  Must contain two '{}' tokens to be replaced by the
                        column and row for the relevant cell
                    outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool:
            A CSV of ``name``, ``col``, ``row`` for each grid cell under
            consideration.  Identified by ``pool`` parameter above.

        incoming names:
            CSV containing (at least) ``name``, ``iteration``, and ``usage``
            columns.  Every name in this file must also be contained in the image
            pool.  Location of this file given in YAML file.

        image catalog:
            A CSV minimally containing ``col``, ``row``, ``season``, and ``uri``
            columns.  Season is either 'GS' or 'OS'.  Every grid cell in the
            location pool must be contained here, and must have an entry for both
            seasons.  URI points to TIFF that completely covers listed cell with
            valid image data (no NODATA values).

    Note:

        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows).
        Each cell is then given a pixel resolution (in this case 200x200, but
        whatever is chosen must match the resolution of the label images
        provided in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif``
        files identified by the incoming names CSV).  When we refer to tiles,
        we mean image chips of the stated resolution, indexed by
        ``gps.SpatialKey`` objects.  The key is a col/row pair where row=0,
        col=0 corresponds to the chip in the upper left corner of the bounding
        extent.

    Note:

        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another
        tool.  However, see the `coarse_layout` definition below for specific
        details of the layout.

    """
    params = parse_yaml_from_s3(s3_bucket, config_filename)['learner']
    label_path = parse_yaml_from_s3(
        s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1]
    s3_prefix = params['prefix']
    s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix

    catalog_prefix = params['image_catalog']
    catalog_prefix_fix = params['image_catalog_fix']

    feature_names = functools.reduce(lambda a, b: a + b, [[
        "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n),
        "{}_std_{}".format(season, n)
    ] for season in ["GS", "OS"] for n in range(1, 5)])

    master_layout = gps.LayoutDefinition(
        gps.Extent(-17.541, -35.46, 51.459, 37.54),
        gps.TileLayout(13800, 14600, 200, 200))
    master_metadata = gps.Metadata(
        gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)),
        "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8,
        master_layout.extent, master_layout)

    ####################################
    logger.warn("Reading source tables")

    checkpoint = time.time()
    f_pool = spark\
         .read\
         .option('inferSchema', True)\
         .option('header', True)\
         .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\
         .repartition('col', 'row')

    qs_in = spark \
        .read \
        .option('inferSchema', True) \
        .option('header', True) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \
        .repartition('col', 'row')

    incoming = spark.read\
                    .option('header', True)\
                    .schema(StructType([
                        StructField('name', StringType()),
                        StructField('run', IntegerType()),
                        StructField('iteration', IntegerType()),
                        StructField('processed', BooleanType()),
                        StructField('usage', StringType()),
                        StructField('label', StringType())
                    ]))\
                    .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names']))

    # merge incoming_names and incoming_names_static
    incoming = incoming.union(spark.read \
        .option('header', True) \
        .schema(StructType([
        StructField('name', StringType()),
        StructField('run', IntegerType()),
        StructField('iteration', IntegerType()),
        StructField('processed', BooleanType()),
        StructField('usage', StringType()),
        StructField('label', StringType())
    ])) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static'])))

    incoming = incoming.filter(incoming['run'] == params['runid']).filter(
        incoming['label'] == True)
    test_names = f_pool.join(incoming.select('name'), 'name',
                             'left_anti').withColumn("usage", lit("test"))
    all_names = f_pool.join(incoming.select('name', 'usage'),
                            f_pool.name == incoming.name,
                            how='left')\
                      .select(f_pool.name.alias('name'), 'col', 'row', 'usage')
    num_test_images = test_names.count()

    image_catalog = spark.read\
                          .option('inferSchema', True)\
                          .option('header', True)\
                          .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\
                          .repartition('col', 'row')
    all_image_uris = image_catalog\
                     .filter(image_catalog['season'] == 'GS')\
                     .alias('gs')\
                     .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'),
                           (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                     .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))
    logger.warn(
        "Elapsed time for reading source tables: {}s".format(time.time() -
                                                             checkpoint))
    ####################################
    logger.warn("Reading training labels & building training features")

    checkpoint = time.time()
    training_data = gather_data(all_image_uris,
                                all_names.filter(all_names.usage == 'train'),
                                master_metadata,
                                feature_names,
                                s3_bucket,
                                label_path,
                                include_masks=True)
    training_data.show()
    logger.warn(
        "Elapsed time for reading training labels and feature building: {}s".
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")

    checkpoint = time.time()
    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()
    logger.warn("Elapsed time for balancing data: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    logger.warn("Training model")

    checkpoint = time.time()
    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)
    logger.warn("Elapsed time for training the model: {}s".format(time.time() -
                                                                  checkpoint))

    ####################################
    logger.warn("Validating model results")

    checkpoint = time.time()
    validation_data = gather_data(
        all_image_uris,
        all_names.filter(all_names.usage == 'validate'),
        master_metadata,
        feature_names,
        s3_bucket,
        label_path,
        include_masks=True)

    valid_fit = model.transform(validation_data).select(
        'prediction', 'probability', 'mask')

    metrics = MulticlassMetrics(
        valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    # confusionMatrix() rows are actual labels (0, 1) and columns are predictions,
    # so the flattened row-major array is [TN, FP, FN, TP]
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist()
    # True Skill Statistic = sensitivity + specificity - 1
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]
    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })
    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained different iteration number (10)
    # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv
    # print("############Old Iteration Metrics  to overwrite###########")
    # incoming_previous = pd.read_csv(os.path.join("s3://",s3_bucket,s3_prefix,params['metrics']))
    # print(incoming_previous.to_string())
    # print("############New Iteration Metrics to use to overwrite###########")
    # print(report.to_string())
    pd_df_to_s3_csv(report, s3_bucket,
                    os.path.join(s3_prefix, params['metrics']))
    logger.warn(
        "Elapsed time for validating and saving metrics to s3: {}s".format(
            time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")

    checkpoint = time.time()
    filtered_names = test_names.filter(test_names.usage == "test")
    # filtered_names.cache()
    # filtered_names.show()
    test_features = gather_data(all_image_uris, filtered_names,
                                master_metadata, feature_names, s3_bucket)

    test_features_sample = test_features.sample(True, 0.1)

    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability',
        'prediction')
    # fitted.cache()
    # fitted.show()
    grouped = fitted.groupBy('spatial_key')

    # don't want to use following UDF, but indication is that there is a bug in pyspark preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())

    logger.warn(
        "Elapsed time for classifying test grids: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(
            probability_images))
        checkpoint = time.time()

        if complete_catalog:

            # new catalog
            image_catalog_fix = spark.read \
                .option('inferSchema', True) \
                .option('header', True) \
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix)) \
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix \
                .filter(image_catalog_fix['season'] == 'GS') \
                .alias('gs') \
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row'))) \
                .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            #recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                 .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()
            # TODO: Determine which images to take
            certainty = grouped \
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0)).alias('certainty')).cache()
            certainty.show()

            worst_keys_rdd = certainty \
                .sort('certainty') \
                .select('spatial_key') \
                .limit(round(certainty.count() * 0.05)) \
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys \
                .join(f_pool, (col('spatial_key.col') == col('col')) & (col('spatial_key.row') == col('row'))) \
                .select('name') \
                .withColumn('run', lit(run_id)) \
                .withColumn('iteration', lit(last_iteration + 1)) \
                .withColumn('processed', lit(False)) \
                .withColumn('usage', lit('train')) \
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # sampling testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col','row'])\
                .select('name', 'col', 'row', 'name_col_row')

            #re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            #reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                    .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample, (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # we multiply by 100 to select digits that will be kept after converting from float to int.
        # int8 tops out at 127, so we can only preserve 2 sig figs
        output_tiles = (layer*100).convert_data_type(gps.CellType.INT8)\
                            .tile_to_layout(coarse_layout)\
                            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)

        cog_location = '/tmp/image_{}_{}.tif' if 'image_output_pattern' not in params else params[
            'image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)), pair[1]))
        logger.warn(
            "Elapsed time for writing catalog of probability images: {}s".
            format(time.time() - checkpoint))
Example #12
    print("Confusion Matrix")
    print(confMat)

    print("Summary Stats")
    print("Accuracy = %s" % acc)
    print("Precision = %s" % prec)
    print("Recall = %s" % reca)
    print("F1-score = %s" % f1Score)

    print()

    #stats by class
    labels = loadedDataDF_test.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure= %s" % (label, metrics.fMeasure(label)))
        print("Class %s FPR = %s" % (label, metrics.falsePositiveRate(label)))
        print("Class %s TPR = %s" % (label, metrics.truePositiveRate(label)))

    print()

    #weighted stats
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F(1) score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted FPR = %s" % metrics.weightedFalsePositiveRate)
    print("Weighted TPR = %s" % metrics.weightedTruePositiveRate)
Example #13
    def train_model (conf):
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        # Python 3: tuple-unpacking lambdas are gone; also divide by the number of
        # predictions actually scored (the test split), not the training count.
        testSplitErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count () / float (test.count())
        print("Error rate on test split => {0}".format (testSplitErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))