# Random Forest Classifier
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

features = [
    'app', 'device', 'os', 'channel', 'hour', 'dow', 'doy', 'ip_clicks',
    'app_clicks', 'os_clicks', 'device_clicks', 'app_device_comb_clicks',
    'ip_app_comb_clicks', 'app_os_comb_clicks'
]
pipeline_stages = []
pipeline_stages.append(
    VectorAssembler(inputCols=features, outputCol='feature_vector'))
rf = RandomForestClassifier(featuresCol='feature_vector',
                            labelCol='is_attributed',
                            numTrees=10,
                            maxBins=500)
pipeline_stages.append(rf)
pipeline = Pipeline(stages=pipeline_stages)

model = pipeline.fit(train_set)
test_output = model.transform(test_set)

test_output_rdd = test_output.rdd
predictionsAndLabels = test_output_rdd.map(
    lambda x: (float(x.prediction), float(x.is_attributed)))
metrics1 = MulticlassMetrics(predictionsAndLabels)
metrics2 = BinaryClassificationMetrics(predictionsAndLabels)
print('ROC of random forest model: {}'.format(metrics2.areaUnderROC))
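# A sketch (not part of the original snippet): metrics1 is constructed above but
# never reported; its multiclass summary could be printed like this.
print('Accuracy of random forest model: {}'.format(metrics1.accuracy))
print(metrics1.confusionMatrix().toArray())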
model.write().overwrite().save(
    'Enter URL where the model has to be saved')  # TODO
Example #2
def main(sc):

    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")

    meta(train_id)

    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]

    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)),
                        VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score",
                                 pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score",
                               pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns to features
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ],
                                outputCol="features")

    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)


    train_d = (train_f.select(col("y"), col("features")) \
                .rdd \
                .map(lambda row: LabeledPoint(float(row.y), MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (float(
        m.predict(MLLibVectors.fromML(row.features))), float(row.y)))
    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print ('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                 estimatorParamMaps=params_list[m],
    #                 evaluator=evaluator,
    #                 numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m

    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))
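    # Note (added): pyspark's CrossValidator has no get_best_score()/get_best_index()/
    # get_best_model() methods. A sketch of the commented-out grid search using the
    # actual API (model_list, params_list, and train are placeholders from the
    # comment above, not defined in this snippet):
    # best_score, op_model = 0.0, None
    # for m in model_list:
    #     cv = CrossValidator(estimator=model_list[m],
    #                         estimatorParamMaps=params_list[m],
    #                         evaluator=BinaryClassificationEvaluator(),
    #                         numFolds=3)
    #     cv_model = cv.fit(train)
    #     if max(cv_model.avgMetrics) > best_score:
    #         best_score = max(cv_model.avgMetrics)
    #         op_model = cv_model.bestModel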

    # Instantiate metrics object
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion Matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Individual label stats
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" %
              (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
Example #3
# Define elephas optimizer
adagrad = elephas_optimizers.Adagrad()

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
estimator.set_nb_epoch(nb_epoch)
estimator.set_batch_size(batch_size)
estimator.set_num_workers(4)
estimator.set_verbosity(2)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

estimator.set_frequency('batch')

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.rdd.map(lambda row: (row.prediction, row.label))
metrics = MulticlassMetrics(prediction_and_label)
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
Example #4
trainParsed = sc.parallelize(map(parsePoint, train_dict))
testParsed = sc.parallelize(map(parsePoint, test_dict))

model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print(metrics.areaUnderROC)
print(metrics.areaUnderPR)

mcMetrics = MulticlassMetrics(testLabelsAndPreds)

#TODO: Do this for classes 1.0,0.0 and not just overall
print(mcMetrics.precision())
print(mcMetrics.recall())
print(mcMetrics.fMeasure())
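# A sketch addressing the TODO above (not part of the original snippet): report the
# per-class metrics instead of only the overall values.
for cls in [0.0, 1.0]:
    print("class %s precision = %s" % (cls, mcMetrics.precision(cls)))
    print("class %s recall = %s" % (cls, mcMetrics.recall(cls)))
    print("class %s F1 = %s" % (cls, mcMetrics.fMeasure(cls, beta=1.0)))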

model.save(sc, "SVMModel")

Example #5
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): Whether to write probability images for the
            complete image catalog rather than a random sample
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility
        config_filename (str): Name of the YAML configuration object under
            ``s3_bucket``

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml
            under ``learner`` key:
                    prefix: The S3 prefix under which CSVs can be read and written
                    pool: Name of CSV file under s3_bucket/prefix giving the
                        comprehensive list of active grid cells
                    incoming_names: Name of CSV file under s3_bucket/prefix giving
                        list of cells used for training/validation
                    image_catalog: Name of CSV file under s3_bucket giving catalog
                        of imagery
                    image_output_pattern: URI pattern used for output of probability
                        images.  Must contain two '{}' tokens to be replaced by the
                        column and row for the relevant cell
                    outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool:
            A CSV of ``name``, ``col``, ``row`` for each grid cell under
            consideration.  Identified by ``pool`` parameter above.

        incoming names:
            CSV containing (at least) ``name``, ``iteration``, and ``usage``
            columns.  Every name in this file must also be contained in the image
            pool.  Location of this file given in YAML file.

        image catalog:
            A CSV minimally containing ``col``, ``row``, ``season``, and ``uri``
            columns.  Season is either 'GS' or 'OS'.  Every grid cell in the
            location pool must be contained here, and must have an entry for both
            seasons.  URI points to TIFF that completely covers listed cell with
            valid image data (no NODATA values).

    Note:

        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows).
        Each cell is then given a pixel resolution (in this case 200x200, but
        whatever is chosen must match the resolution of the label images
        provided in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif``
        files identified by the incoming names CSV).  When we refer to tiles,
        we mean image chips of the stated resolution, indexed by
        ``gps.SpatialKey`` objects.  The key is a col/row pair where row=0,
        col=0 corresponds to the chip in the upper left corner of the bounding
        extent.

    Note:

        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another
        tool.  However, see the `coarse_layout` definition below for specific
        details of the layout.

    """
    params = parse_yaml_from_s3(s3_bucket, config_filename)['learner']
    label_path = parse_yaml_from_s3(
        s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1]
    s3_prefix = params['prefix']
    s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix

    catalog_prefix = params['image_catalog']
    catalog_prefix_fix = params['image_catalog_fix']

    feature_names = functools.reduce(lambda a, b: a + b, [[
        "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n),
        "{}_std_{}".format(season, n)
    ] for season in ["GS", "OS"] for n in range(1, 5)])

    master_layout = gps.LayoutDefinition(
        gps.Extent(-17.541, -35.46, 51.459, 37.54),
        gps.TileLayout(13800, 14600, 200, 200))
    master_metadata = gps.Metadata(
        gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)),
        "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8,
        master_layout.extent, master_layout)

    ####################################
    logger.warn("Reading source tables")

    checkpoint = time.time()
    f_pool = spark\
         .read\
         .option('inferSchema', True)\
         .option('header', True)\
         .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\
         .repartition('col', 'row')

    qs_in = spark \
        .read \
        .option('inferSchema', True) \
        .option('header', True) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \
        .repartition('col', 'row')

    incoming = spark.read\
                    .option('header', True)\
                    .schema(StructType([
                        StructField('name', StringType()),
                        StructField('run', IntegerType()),
                        StructField('iteration', IntegerType()),
                        StructField('processed', BooleanType()),
                        StructField('usage', StringType()),
                        StructField('label', StringType())
                    ]))\
                    .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names']))

    # merge incoming_names and incoming_names_static
    incoming = incoming.union(spark.read \
        .option('header', True) \
        .schema(StructType([
        StructField('name', StringType()),
        StructField('run', IntegerType()),
        StructField('iteration', IntegerType()),
        StructField('processed', BooleanType()),
        StructField('usage', StringType()),
        StructField('label', StringType())
    ])) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static'])))

    incoming = incoming.filter(incoming['run'] == params['runid']).filter(
        incoming['label'] == True)
    test_names = f_pool.join(incoming.select('name'), 'name',
                             'left_anti').withColumn("usage", lit("test"))
    all_names = f_pool.join(incoming.select('name', 'usage'),
                            f_pool.name == incoming.name,
                            how='left')\
                      .select(f_pool.name.alias('name'), 'col', 'row', 'usage')
    num_test_images = test_names.count()

    image_catalog = spark.read\
                          .option('inferSchema', True)\
                          .option('header', True)\
                          .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\
                          .repartition('col', 'row')
    all_image_uris = image_catalog\
                     .filter(image_catalog['season'] == 'GS')\
                     .alias('gs')\
                     .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'),
                           (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                     .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))
    logger.warn(
        "Elapsed time for reading source tables: {}s".format(time.time() -
                                                             checkpoint))
    ####################################
    logger.warn("Reading training labels & building training features")

    checkpoint = time.time()
    training_data = gather_data(all_image_uris,
                                all_names.filter(all_names.usage == 'train'),
                                master_metadata,
                                feature_names,
                                s3_bucket,
                                label_path,
                                include_masks=True)
    training_data.show()
    logger.warn(
        "Elapsed time for reading training labels and feature building: {}s".
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")

    checkpoint = time.time()
    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()
    logger.warn("Elapsed time for balancing data: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    logger.warn("Training model")

    checkpoint = time.time()
    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)
    logger.warn("Elapsed time for training the model: {}s".format(time.time() -
                                                                  checkpoint))

    ####################################
    logger.warn("Validating model results")

    checkpoint = time.time()
    validation_data = gather_data(
        all_image_uris,
        all_names.filter(all_names.usage == 'validate'),
        master_metadata,
        feature_names,
        s3_bucket,
        label_path,
        include_masks=True)

    valid_fit = model.transform(validation_data).select(
        'prediction', 'probability', 'mask')

    metrics = MulticlassMetrics(
        valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist()  # left to right, top to bottom
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]
    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })
    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained different iteration number (10)
    # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv
    # print("############Old Iteration Metrics  to overwrite###########")
    # incoming_previous = pd.read_csv(os.path.join("s3://",s3_bucket,s3_prefix,params['metrics']))
    # print(incoming_previous.to_string())
    # print("############New Iteration Metrics to use to overwrite###########")
    # print(report.to_string())
    pd_df_to_s3_csv(report, s3_bucket,
                    os.path.join(s3_prefix, params['metrics']))
    logger.warn(
        "Elapsed time for validating and saving metrics to s3: {}s".format(
            time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")

    checkpoint = time.time()
    filtered_names = test_names.filter(test_names.usage == "test")
    # filtered_names.cache()
    # filtered_names.show()
    test_features = gather_data(all_image_uris, filtered_names,
                                master_metadata, feature_names, s3_bucket)

    test_features_sample = test_features.sample(True, 0.1)

    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability',
        'prediction')
    # fitted.cache()
    # fitted.show()
    grouped = fitted.groupBy('spatial_key')

    # don't want to use following UDF, but indication is that there is a bug in pyspark preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())

    logger.warn(
        "Elapsed time for classifying test grids: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(
            probability_images))
        checkpoint = time.time()

        if complete_catalog:

            # new catalog
            image_catalog_fix = spark.read \
                .option('inferSchema', True) \
                .option('header', True) \
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix)) \
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix \
                .filter(image_catalog_fix['season'] == 'GS') \
                .alias('gs') \
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row'))) \
                .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            #recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                 .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()
            # TODO: Determine which images to take
            certainty = grouped \
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0)).alias('certainty')).cache()
            certainty.show()

            worst_keys_rdd = certainty \
                .sort('certainty') \
                .select('spatial_key') \
                .limit(round(certainty.count() * 0.05)) \
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys \
                .join(f_pool, (col('spatial_key.col') == col('col')) & (col('spatial_key.row') == col('row'))) \
                .select('name') \
                .withColumn('run', lit(run_id)) \
                .withColumn('iteration', lit(last_iteration + 1)) \
                .withColumn('processed', lit(False)) \
                .withColumn('usage', lit('train')) \
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # sampling testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col','row'])\
                .select('name', 'col', 'row', 'name_col_row')

            #re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            #reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                    .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample, (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # we multiply by 100 to select digits that will be kept after converting from float to int.
        # range of int8 is to 128, so we can only preserve 2 sig figs
        output_tiles = (layer*100).convert_data_type(gps.CellType.INT8)\
                            .tile_to_layout(coarse_layout)\
                            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)

        cog_location = '/tmp/image_{}_{}.tif' if 'image_output_pattern' not in params else params[
            'image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)), pair[1]))
        logger.warn(
            "Elapsed time for writing catalog of probability images: {}s".
            format(time.time() - checkpoint))
Example #6
  grid = ParamGridBuilder().addGrid(gbt.maxDepth, [3,4,5]).build()

  metrics = MulticlassClassificationEvaluator(metricName="f1")
  cv = CrossValidator(estimator=gbt, estimatorParamMaps=grid, evaluator=metrics, numFolds=2)

  # Define pre-processing pipeline
  featureCols = ["AN3", "AN4", "AN5", "AN6", "AN7", "AN8", "AN9", "AN10"]
  stages = [VectorAssembler(inputCols=featureCols, outputCol="va"),
            StandardScaler(inputCol="va", outputCol="features"),
            StringIndexer(inputCol="status", outputCol="label"), cv]
  pipeline = Pipeline(stages=stages)

  pipelineTrained = pipeline.fit(training)
  
  predictions = pipelineTrained.transform(test)
  metrics = MulticlassMetrics(predictions.select(['prediction', 'label']).rdd)
  
  # Define mlflow artifacts to log with the experiment run
  mlflow.log_metric("precision", metrics.precision(1.0))
  mlflow.log_metric("recall", metrics.recall(1.0))
  mlflow.log_metric("f1", metrics.fMeasure(1.0))
  
  mlflow.spark.log_model(pipelineTrained, "turbine_anomalies")
  mlflow.set_tag("model", "gbt") 
  
  # Add confusion matrix to the model
  labels = pipelineTrained.stages[2].labels
  fig = plt.figure()
  sn.heatmap(pd.DataFrame(metrics.confusionMatrix().toArray()), annot=True, fmt='g', xticklabels=labels, yticklabels=labels)
  plt.suptitle("Turbine Damage Prediction. F1={:.2f}".format(metrics.fMeasure(1.0)), fontsize = 18)
  plt.xlabel("Predicted Labels")
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# model on training data regPara: lasso regularisation parameter (L1)
lrModel = LogisticRegression(regParam=0.1).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('catLabel', 'label', 'prediction').show()


evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[18]:

from pyspark.ml.classification import DecisionTreeClassifier

# model on training data maxDepth is the hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)

# (Start of this snippet was truncated; the source RDD name below is an assumption.)
X_spark_rdd = raw_rows_rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# Split the data into training and test sets (25% held out for testing)
(X_train, X_val) = X_spark_rdd.randomSplit(
    [TRAINING_DATA_RATIO, 1 - TRAINING_DATA_RATIO])

start_time = time()

rfc = RandomForest.trainClassifier(X_train,
                                   numClasses=3,
                                   categoricalFeaturesInfo={},
                                   numTrees=15,
                                   featureSubsetStrategy="auto",
                                   impurity='gini',
                                   maxDepth=3,
                                   maxBins=92)

end_time = time()

elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

predictions = rfc.predict(X_val.map(lambda x: x.features))
labels_and_predictions = X_val.map(lambda x: x.label).zip(predictions)

metrics = MulticlassMetrics(labels_and_predictions)
f1Score = metrics.fMeasure()

print("Evaluation Metric : F1-score")
print("F1-Score = {}".format(f1Score))
session = SparkSession.builder.appName('Wine Quality Prediction Model Load').getOrCreate()

#Importing sys for taking the command line parameters
import sys
fileName = sys.argv[1]

#Load Random Forest Model package
from pyspark.mllib.tree import RandomForestModel
loadedRFModel = RandomForestModel.load(session.sparkContext,"myRandomForestClassificationModel")

data = session.read.format('csv').option('header','true').option('inferSchema','true').option('sep',';').load(fileName)

from pyspark.mllib.regression import LabeledPoint
modelData = data.rdd.map(lambda col: LabeledPoint(col[11],col[:11]))

predictionData = loadedRFModel.predict(modelData.map(lambda x: x.features))

labelAndPredictionData = modelData.map(lambda lp: lp.label).zip(predictionData)

#For F1 score using Random Forest with given dataset
from pyspark.mllib.evaluation import MulticlassMetrics
randomFResults = MulticlassMetrics(labelAndPredictionData)
randomFConfMatrix = randomFResults.confusionMatrix().toArray()
randomFPrecision = (randomFConfMatrix[0][0])/(randomFConfMatrix[0][0]+randomFConfMatrix[1][0])
randomFRecall = (randomFConfMatrix[0][0])/(randomFConfMatrix[0][0]+randomFConfMatrix[0][1])
randomFF1=(2*randomFPrecision*randomFRecall)/(randomFPrecision+randomFRecall)
print("=======================================================================================================")
print("F1 score using imported Random Forests model on the given dataset: " + str(randomFF1))
print("=======================================================================================================")
Example #10
# (Start of this snippet was truncated; the evaluator below is reconstructed to
# mirror the areaUnderPR evaluator defined next.)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    metricName="areaUnderROC",
    labelCol='default_payment_next_month')
print('Evaluator areaUnderROC: ' +
      str(evaluator.evaluate(prediction)))  # 0.7294563666075892

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    metricName="areaUnderPR",
    labelCol='default_payment_next_month')
print('Evaluator areaUnderPR : ' +
      str(evaluator.evaluate(prediction)))  # 0.7294563666075892

prediction.groupBy('default_payment_next_month', 'prediction').count().show()

# Metrics
predictionRDD = prediction.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
metrics = MulticlassMetrics(predictionRDD)
# Confusion Matrix
print(metrics.confusionMatrix().toArray())

print('---------------------------Overall statistics------------------------')
print('precision : ' + str(metrics.precision()))
print('recall : ' + str(metrics.recall()))
print('fMeasure : ' + str(metrics.fMeasure()))

print('---------------------------statistics by class------------------------')
labels = [0.0, 1.0]
for label in sorted(labels):
    print('precision : ' + str(metrics.precision(label)))
    print('recall : ' + str(metrics.recall(label)))
    print('fMeasure : ' + str(metrics.fMeasure(label, beta=1.0)))
Example #11

def func(line):
    v = [float(z) for z in line.split(';')]
    # quality is column index 11; the preceding 11 columns (0-10) are the features
    return LabeledPoint(v[11], v[0:11])


c = SparkConf().setAppName("winequality")
s = SparkContext("local", conf=c)
t = s.textFile("s3://winequalitynew/TrainingDataset.csv")
h = t.first()
r = t.filter(lambda z: z != h)
tnew = r.map(func)
dnew = s.textFile("s3://winequalitynew/ValidationDataset.csv")
h = dnew.first()
r = dnew.filter(lambda z: z != h)
dnew2 = r.map(func)
m = RandomForest.trainClassifier(tnew,
                                 numClasses=11,
                                 categoricalFeaturesInfo={},
                                 numTrees=3,
                                 featureSubsetStrategy="auto",
                                 impurity='gini',
                                 maxDepth=4,
                                 maxBins=32)
p = m.predict(dnew2.map(lambda z: z.features))
lpre = dnew2.map(lambda lp: lp.label).zip(p)
me = MulticlassMetrics(lpre)
scr = me.fMeasure()
print(scr)
m.save(s, "s3://winequalitynew/winequality")
Example #12
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Builing model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(
    train_df
)  #if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here

# Model prediction accuracy (F1-score)
pl = pred.select("label", "prediction").rdd.cache()
metrics = MulticlassMetrics(pl)
metrics.fMeasure()

#####################################################################################################
""" Task 1.2
a.	Run the model provided above. 
    Take your time to carefully understand what is happening in this model pipeline.
    You are NOT allowed to make changes to this model's configurations.
    Compute and report the F1-score on the test dataset.
b.	Get and report the schema (column names and data types) of the model's prediction output.

"""
# Your code for this part, IF ANY, starts here
print("the F1-score of the first model: {}".format(metrics.fMeasure()))
print(pred)
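# A sketch for part (b) of the task above (an assumed approach, not the provided
# solution): report the column names and data types of the prediction output.
pred.printSchema()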
rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
rf_model1.numClasses
print(rf_model1.toDebugString)

print(rf_model1.featureImportances)

training21 = rf_model1.transform(training)

PredictionsandLabels = training21.select('prediction','Survived').rdd
PredictionsandLabels.collect()

#Resubstitution approach
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy


# for Bagging - two evaluation approaches
# K-Fold -- Cross Validation approach:
    
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator


evaluator2 = BinaryClassificationEvaluator(labelCol='Survived',rawPredictionCol='prediction')
paramGrid = ParamGridBuilder().addGrid(rf1.numTrees,[50,100]).addGrid(rf1.maxDepth,[5,8]).build()

crossval2 = CrossValidator(estimator=rf1,estimatorParamMaps=paramGrid,
               evaluator=evaluator2,
# #### Decision tree

# In[30]:


sqlContext = SQLContext(sc)
predictions = sqlContext.read.load("/media/milad/Linux/bigdata/predictions.csv", 
                          format='com.databricks.spark.csv', 
                          header='true',inferSchema='true')


# In[32]:


conf_tree = MulticlassMetrics(predictions.rdd.map(tuple))


# In[35]:


conf_tree.confusionMatrix().toArray().transpose()


# #### Naive bayse

# In[36]:


sqlContext = SQLContext(sc)
predictions_nb = sqlContext.read.load("/media/milad/Linux/bigdata/predictions_nb.csv", 
Example #15
                   "hours-per-week")
data.show()

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()

# Splitting the data into training and data set
training, test = data.select("label", "features").randomSplit([0.70, 0.30])

# Create Naive Bayes model and fit the model with training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate prediction from test dataset
pred = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(pred)

# Show model accuracy
print("Accuracy:", accuracy, "\n\n")

#Report
predAndLabels = pred.select("prediction", "label").rdd
metrics = MulticlassMetrics(predAndLabels)
print("Confusion Matrix", metrics.confusionMatrix())
print("Precision", metrics.precision())
print("Recall", metrics.recall())
print("F-measure", metrics.fMeasure())
Example #16
# Python 3 lambdas cannot unpack tuples; each record is (((Community, Year, Month), label), features)
inputrddval = inputrddNorm.filter(lambda rec: rec[0][0][1] >= 2015)

DataTrain = inputrddtrain.map(parseDataF)
DataTest = inputrddtest.map(parseDataF)
DataVal = inputrddval.map(parseDataF)

model = RandomForest.trainClassifier(DataTrain,
                                     numClasses=5,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=20,
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(DataTest.map(lambda x: x.features))
predictionsAndlabels = predictions.zip(DataTest.map(lambda lp: lp.label))

metrics = MulticlassMetrics(predictionsAndlabels)

labelsAndPredictions = DataTest.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    DataTest.count())
print('Test Error = ' + str(testErr))

print(metrics.confusionMatrix().toArray())
print("precision = " + str(metrics.weightedPrecision))
print("recall = " + str(metrics.weightedRecall))
Example #17
    def evaluate(self, model=None, trainingData=None, testingData=None):
        """ Ham kiem thu model, in ra man hinh do do chinh xac va thoi gian tinh toan
        """
        time_train = 0
        time_test = 0

        if (not trainingData):
            trainingData = self.trainingData
        if (not testingData):
            testingData = self.testingData

        if (not model):
            # Train model
            print("Training...")
            start_train = datetime.now()
            model = self.trainModel(trainingData)
            time_train = datetime.now() - start_train

        #print("Num nodes: ", model.stages[2].totalNumNodes, "\n", model.stages[2].toDebugString, file=open("modelDebug.txt","w"))
        # Make predictions
        print("Testing...")
        start_test = datetime.now()
        predictions = model.transform(testingData)
        time_test = datetime.now() - start_test

        # Evaluation for flow
        print("{:*^100}".format(""))
        print("Training time: ", time_train)
        print("Testing time: ", time_test)

        featureImportances = {}
        fi = model.stages[2].featureImportances
        features = loadcols(self.dataset)
        index = 0
        for value in fi:
            featureImportances[features[index]] = value
            index = index + 1
        fiSorted = sorted(featureImportances.items(),
                          key=lambda x: x[1],
                          reverse=True)
        print("{:*^100}".format(" Feature Importances "))
        f = open("features_importance.txt", "w")
        for feature in fiSorted:
            if feature[1] > 0.000:
                print("{!s} : {:.4%}".format(feature[0].strip(), feature[1]))
        # f.write("{!s}\n".format(feature[0].strip()))
        f.close()

        print("{:*^100}".format(" Evaluate for Flow "))

        print("Total predictions:", predictions.count())
        predictions.select("prediction", "indexedLabel",
                           "label").groupBy("label").count().show()

        predictionAndLabels = predictions.select("prediction",
                                                 "indexedLabel").rdd
        metrics = MulticlassMetrics(predictionAndLabels)

        print("Confusion Matrix:")
        for line in metrics.confusionMatrix().toArray():
            print(line)

        print("TPR: {:.3%} \tFPR: {:.3%}".format(
            metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
        print("TNR: {:.3%} \tFNR: {:.3%}".format(
            metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))

        print("Precision: {:.3%} \tRecall: {:.3%} \tAccuracy: {:.3%}".format(
            metrics.precision(1.0), metrics.recall(1.0), metrics.accuracy))

        print(metrics.accuracy)

        print("{:*^100}".format(""))
Example #18
	pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

	pipelineFit = pipeline.fit(spark_df)
	dataset = pipelineFit.transform(spark_df)
	(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
	
	#Logistic Regression
	lr = LogisticRegression(maxIter=100, regParam=0.01, elasticNetParam=0.01)
	lrModel = lr.fit(trainingData)
	predictions = lrModel.transform(testData)
	predictions.show()

	# Evaluate model results
	evaluator = MulticlassClassificationEvaluator(metricName="f1", labelCol='label')
	predictions_and_labels = predictions.select(["prediction","label"])
	metrics = MulticlassMetrics(predictions_and_labels.rdd)
	conf_mat1 = metrics.confusionMatrix()
	
	precision1 = metrics.precision()
	recall1 = metrics.recall()
	f1Score1 = metrics.fMeasure()


	result = evaluator.evaluate(predictions)

	#Naive Bayes classification
	nb=NaiveBayes(smoothing =2)
	nbModel=nb.fit(trainingData)
	nbPredictions=nbModel.transform(testData)
	nbPredictions.show()
Example #19
evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                              labelCol="category")
# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))

# In[16]:

#  confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics
predictionsAndLabelsNB = predictions.select("prediction", "category")

# In[18]:

metricsNB = MulticlassMetrics(predictionsAndLabelsNB.rdd)

accuracyNB2 = metricsNB.accuracy
print("Naive Bayes accuracy:")
print(accuracyNB2)

confusionMatrixNB = metricsNB.confusionMatrix().toArray()
print("Confusion Matrix: ")
print(confusionMatrixNB)

# Try classifying a few basic sentences.

# In[15]:

tf = spark.createDataFrame([
    ("Bactibilia has several consequences to human health", ),
print "Model Ready"

predictions = model.predict(testLab.map(lambda x: x.features))

labelsAndPredictions = testLab.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions = labelsAndPredictions.map(lambda ab: (ab[1], ab[0]))


finalRdd = testData.map(lambda rec:(rec.yelp_id,rec.stars)).zip(predictions) 

testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testLab.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

metrics = MulticlassMetrics(labelsAndPredictions)
# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
confusionMatrx = metrics.confusionMatrix()

print("RANDOM FORREST FOR TEST SET ONLY: ")
print("All Evaluation Measures Stats")
print("Confusion Matix = %s" % confusionMatrx)
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)


#now perform random forrest on all data
Example #21
predictions_loaded = loaded_model.transform(rowDf)
print(predictions_loaded)
result = predictions_loaded.select(["message", "probability",
                                    "prediction"]).collect()

for row in result:
    print(row.message, row.probability, row.prediction)

predictions = loaded_model.transform(df).withColumn(
    "label", df["label"].cast(DoubleType()))

rdd_map = predictions.select(
    "prediction", "label").rdd.map(lambda lp: (lp["prediction"], lp["label"]))
binary_metrics = BinaryClassificationMetrics(rdd_map)
print(
    f"Overall PR and ROC : {binary_metrics.areaUnderPR}, {binary_metrics.areaUnderROC}\n"
)

multiclass_metrics = MulticlassMetrics(rdd_map)

print("Confusion matrix : ")
print(multiclass_metrics.confusionMatrix())
print(
    f"Overall: Precision {multiclass_metrics.precision(1), multiclass_metrics.precision(0)}"
)
print(
    f"Overall: Recall {multiclass_metrics.recall(1), multiclass_metrics.recall(0)}"
)
print(f"Overall Accuracy {multiclass_metrics.accuracy}")
Example #22
def calc_metrics(df, simple_mode=True):
    rdd = df.select("prediction", "Profit").rdd
    metrics = MulticlassMetrics(rdd)
    metrics_dict = {}
    cm = metrics.confusionMatrix().toArray()


    TP = cm[0][0]
    print("TP IS " + str(TP))

    TN = cm[1][1]
    print("TN IS " + str(TN))

    FP = cm[0][1]
    print("FP IS " + str(FP))

    FN = cm[1][0]
    print("FN IS " + str(FN))

    accuracy = (TP + TN) / cm.sum()
    sensitivity = (TP) / (TP + FN)
    specificity = (TN) / (TN + FP)
    precision = (TP) / (TP + FP)
    npv = (TN) / (TN + FN)

    # Overall statistics
    metrics_dict['accuracy'] = accuracy

    metrics_dict['sensitivity'] = sensitivity

    metrics_dict['specificity'] = specificity

    metrics_dict['precision'] = precision

    metrics_dict['npv'] = npv

    # print(metrics_dict)
    # print("Summary Stats")
    print(metrics.confusionMatrix())
    metrics_dict['confusionMatrix'] = metrics.confusionMatrix()
    # print("Accuracy = %.4f" % precision)
    # print("Recall = %.4f" % recall)
    # print("F1 Score = %.4f" % f1Score)

    # print("accuracy ", accuracy)
    # print("sensitivity ", sensitivity)
    # print("specificity ", specificity)
    # print("precision ", precision)

    print(
        "{},{},{},{},{}".format(round(accuracy, 3), round(sensitivity, 3), round(specificity, 3), round(precision, 3),
        round(npv, 3)))
    # print("sensitivity ", sensitivity)
    # print("specificity ", specificity)
    # print("precision ", precision)

    if not simple_mode:
        # Statistics by class
        labels = rdd.map(lambda lp: lp.Profit).distinct().collect()

        for label in sorted(labels):
            print("Class %s accuracy = %.4f" % (label, metrics.precision(label)))
            print("Class %s sensitivity = %.4f" % (label, metrics.recall(label)))
            print("Class %s F1 Measure = %.4f" % (label, metrics.fMeasure(label, beta=1.0)))
        print("\n")

        # Weighted stats
        weightedRecall = metrics.weightedRecall
        print("Weighted sensitivity = %.4f" % weightedRecall)
        metrics_dict['weightedRecall'] = weightedRecall

        weightedPrecision = metrics.weightedPrecision
        print("Weighted precision = %.4f" % weightedPrecision)
        metrics_dict['weightedPrecision'] = weightedPrecision

        # weightedFMeasure = metrics.weightedFMeasure()
        # print("Weighted F(1) Score = %.4f" % weightedFMeasure)
        # metrics_dict['weightedFMeasure'] = weightedFMeasure
        #
        # weightedFMeasure = metrics.weightedFMeasure(beta=0.5)
        # print("Weighted F(1) Score = %.4f" % weightedFMeasure)
        # metrics_dict['weightedFMeasure_beta'] = weightedFMeasure
        #
        # weightedFalsePositiveRate = metrics.weightedFalsePositiveRate
        # print("Weighted F(1) Score = %.4f" % weightedFalsePositiveRate)
        # metrics_dict['weightedFalsePositiveRate'] = weightedFalsePositiveRate
        print("\n")
    return metrics_dict
    d_cat_sim = cat_sim.withColumn("d_cat", udf_get_decision(col("score")))
    d_tdf_sim = tdf_sim.withColumn("d_tdf", udf_get_decision(col("score")))
    d_als_sim = als_sim.withColumn("d_als", udf_get_decision(col("score")))

    # Aggrgate results from attributes, collaborative, and text models.
    agg = d_cat_sim.alias('c').join(d_tdf_sim.alias('t'), col("c.review_id") == col("t.review_id"))\
                              .join(d_als_sim.alias('a'), col("c.review_id") == col("a.review_id"))\
                              .select("c.review_id", "c.business_id", "c.user_id", "c.r_stars", "c.d_cat", "t.d_tdf", "a.d_als")\
                              .filter(col("c.r_stars") != 3).withColumn("label", udf_get_label(col("r_stars")))\
                              .withColumn("d_agg", udf_get_agg_decision(col("c.d_cat"), col("t.d_tdf"), col("a.d_als")))

    # Calculate metrics for content-based filtering - business attributes model.
    cat_labels = agg.select("d_cat", "label").rdd.map(lambda x: (float(x.d_cat), float(x.label)))
    cat_labels_bin = agg.select("d_cat", "label").rdd.map(lambda x: (float(x.d_cat), x.label))
    cat_bin = BinaryClassificationMetrics(cat_labels_bin)
    cat_metrics = MulticlassMetrics(cat_labels)
    cat_precision_1 = cat_metrics.precision(1.0)
    cat_recall_1 = cat_metrics.recall(1.0)
    cat_precision_0 = cat_metrics.precision(0.0)
    cat_recall_0 = cat_metrics.recall(0.0)
    cat_accuracy = cat_metrics.accuracy

    # Calculate metrics for content-based filtering - reviews (text-mining) model.
    tdf_labels = agg.select("d_tdf", "label").rdd.map(lambda x: (float(x.d_tdf), float(x.label)))
    tdf_labels_bin = agg.select("d_tdf", "label").rdd.map(lambda x: (float(x.d_tdf), x.label))
    tdf_bin = BinaryClassificationMetrics(tdf_labels_bin)
    tdf_metrics = MulticlassMetrics(tdf_labels)
    tdf_precision_1 = tdf_metrics.precision(1.0)
    tdf_recall_1 = tdf_metrics.recall(1.0)
    tdf_precision_0 = tdf_metrics.precision(0.0)
    tdf_recall_0 = tdf_metrics.recall(0.0)
train.cache()
test.cache()

# ---------------
# Random Forest:
# ---------------

rf = RandomForestClassifier(numTrees=100,
                            maxDepth=16,
                            labelCol="label",
                            seed=42)
print('Training RandomForest model on training set. \n Model parameters: {}'.
      format(rf._paramMap))
trained_model = rf.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
find_performance_metrics(res, "random forest")

# ---------------
# Logistic regression:
# ---------------
print('Training LogisticRegression model on training set.')
logistic = LogisticRegression(regParam=0.1,
                              labelCol="label")  # , thresholds = [0.2, 0.5])
trained_model = logistic.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
Example #25
 def sp_accuracy(self, prediction_and_labels):
     from pyspark.mllib.evaluation import MulticlassMetrics
     return MulticlassMetrics(prediction_and_labels)
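     # Note (added): despite its name, this returns the full MulticlassMetrics
     # object; the scalar value would be
     # MulticlassMetrics(prediction_and_labels).accuracy.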
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")

accuracy = evaluator.evaluate(predicted)

print(accuracy)

# In[72]:

from pyspark.mllib.evaluation import MulticlassMetrics
pred_label = predicted.select(['label', 'prediction'])

#confmat = pred_label.rdd.map(tuple)

# In[73]:

confmat = pred_label.rdd.map(tuple)

# In[75]:

metrics = MulticlassMetrics(confmat)
confusion_mat = metrics.confusionMatrix()

# In[79]:

print(confusion_mat.toArray())

# In[ ]:
Example #27
# specify layers for the neural network:
# input layer of size 28x28 = 784 (features), two intermediate of size 5 and 4
# and output of size 10 (classes)

#layers = [784, 5, 4, 10]

layers = [784, 100, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers, blockSize=128, seed=1234)
model = trainer.fit(train_vectors_withlabel)
result = model.transform(test_vectors_withlabel)
predictionAndLabels = result.select("prediction", "label")

changedTypedf = predictionAndLabels.withColumn("label", predictionAndLabels["label"].cast(DoubleType()))
test_rdd = changedTypedf.rdd.map(tuple)
metrics = MulticlassMetrics(test_rdd)




#Print F1-score, Recall and Precision for each label.

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

labels = (0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)

x = PrettyTable(['Label', 'Precision', 'Recall', 'F1-score'])

for label in sorted(labels):
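    # Assumed completion of the truncated loop above (the original body was cut off):
    # fill the PrettyTable with per-label precision, recall and F1.
    x.add_row([label, metrics.precision(label), metrics.recall(label),
               metrics.fMeasure(label, beta=1.0)])

print(x)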
Example #28
labels = labeled_data.map(lambda x: x[0])

tf = HashingTF().transform(labeled_data.map(lambda x: x[1]))
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf)
zipped_data = (
    labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1])).cache())

# Do a random split so we can test our model on non-trained data
training, test = zipped_data.randomSplit([0.7, 0.3])

# Train our model
model = NaiveBayes.train(training)

# Use our model to predict
train_preds = (training.map(lambda x: x.label).zip(
    model.predict(training.map(lambda x: x.features))))
test_preds = (test.map(lambda x: x.label).zip(
    model.predict(test.map(lambda x: x.features))))

# Ask PySpark for some metrics on how our model predictions performed
trained_metrics = MulticlassMetrics(
    train_preds.map(lambda x: (x[0], float(x[1]))))
test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))

with open('output_binary.txt', 'w+') as f:
    f.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(trained_metrics.precision()) + '\n')
    f.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(test_metrics.precision()) + '\n')
Example #29
data = data.select(data.label.cast("double"), "educational-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create Naive Bayes model and fit the model with training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
Example #30
    def fit(self, data):
        '''Dataset must at least contain the following two columns:
        label: the class labels
        features: feature vector

        Parameters
        ----------
        data : Dataset<Row>
           input data

        Returns
        -------
        dict
           map with metrics
        '''

        start = time.time()

        classCount = int(data.select(self.label).distinct().count())

        labelIndexer = StringIndexer().setInputCol(self.label) \
                                      .setOutputCol("indexedLabel") \
                                      .fit(data)

        # Split the data into training and test sets (30% held out for testing)
        splits = data.randomSplit([1.0 - self.testFraction, self.testFraction],
                                  self.seed)
        trainingData = splits[0]
        testData = splits[1]

        labels = labelIndexer.labels

        print("\n Class\tTrain\tTest")
        for l in labels:
            print("%s\t%i\t%i" % (l \
                                  ,(trainingData.filter(trainingData[self.label] == l)).count() \
                                  ,(testData.filter(testData[self.label] == l)).count() \
                                  )
                  )

        # Set input columns
        self.predictor.setLabelCol("indexedLabel").setFeaturesCol("features")

        # Convert indexed labels back to original labels
        labelConverter = IndexToString().setInputCol("prediction") \
                                        .setOutputCol("predictedLabel") \
                                        .setLabels(labelIndexer.labels)

        # Chain indexers and forest in a Pipeline
        pipeline = Pipeline().setStages(
            [labelIndexer, self.predictor, labelConverter])

        # Train model. This also runs the indexers
        model = pipeline.fit(trainingData)

        # Make predictions
        predictions = model.transform(testData).cache()

        # Display some sample predictions
        print(f"\nSample predictions: {str(self.predictor).split('_')[0]}"
              )  # TODO predictor.getClass().getSimpleName()
        predictions.sample(False, 0.1, self.seed).show(5)

        predictions = predictions.withColumnRenamed(self.label, "stringLabel")
        predictions = predictions.withColumnRenamed("indexedLabel", self.label)

        # Collect metrics

        pred = predictions.select("prediction", self.label)

        metrics = OrderedDict()
        metrics["Method"] = str(self.predictor).split('_')[0]

        if classCount == 2:
            b = BinaryClassificationMetrics(pred.rdd)
            metrics["AUC"] = str(b.areaUnderROC)
        m = MulticlassMetrics(pred.rdd)
        metrics["F"] = str(m.weightedFMeasure())
        metrics["Accuracy"] = str(m.accuracy)
        metrics["Precision"] = str(m.weightedPrecision)
        metrics["Recall"] = str(m.weightedRecall)
        metrics["False Positive Rate"] = str(m.weightedFalsePositiveRate)
        metrics["True Positive Rate"] = str(m.weightedTruePositiveRate)
        metrics[""] = f"\nConfusion Matrix\n{labels}\n{m.confusionMatrix()}"

        end = time.time()
        print(f"Total time taken: {end-start}\n")

        return metrics
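
# Hypothetical usage of the fit() method above (the wrapper class name, constructor
# arguments, and column names are assumptions; fit() only requires 'label' and
# 'features' columns in the input DataFrame):
# classifier = SomeClassifierWrapper(predictor=RandomForestClassifier(),
#                                    label="label", testFraction=0.3, seed=13)
# metrics = classifier.fit(data)
# for k, v in metrics.items():
#     print(k, v)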