# Random Forest classifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

features = [
    'app', 'device', 'os', 'channel', 'hour', 'dow', 'doy',
    'ip_clicks', 'app_clicks', 'os_clicks', 'device_clicks',
    'app_device_comb_clicks', 'ip_app_comb_clicks', 'app_os_comb_clicks'
]

pipeline_stages = []
pipeline_stages.append(
    VectorAssembler(inputCols=features, outputCol='feature_vector'))
rf = RandomForestClassifier(featuresCol='feature_vector',
                            labelCol='is_attributed',
                            numTrees=10,
                            maxBins=500)
pipeline_stages.append(rf)
pipeline = Pipeline(stages=pipeline_stages)

model = pipeline.fit(train_set)
test_output = model.transform(test_set)
test_output_rdd = test_output.rdd
# Cast the label to float; the MLlib metrics classes expect (double, double) pairs
predictionsAndLabels = test_output_rdd.map(
    lambda x: (x.prediction, float(x.is_attributed)))
metrics1 = MulticlassMetrics(predictionsAndLabels)
metrics2 = BinaryClassificationMetrics(predictionsAndLabels)
print('ROC of random forest model: {}'.format(metrics2.areaUnderROC))

model.write().overwrite().save(
    'Enter URL where the model has to be saved')  # TODO
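# Added sketch: BinaryClassificationMetrics over hard 0/1 predictions yields a
# degenerate one-point ROC curve. Assuming the `test_output` DataFrame produced
# by the pipeline above, scoring with the positive-class probability instead
# gives the usual threshold-swept AUC:
scoresAndLabels = test_output.rdd.map(
    lambda x: (float(x.probability[1]), float(x.is_attributed)))
print('ROC from probabilities: {}'.format(
    BinaryClassificationMetrics(scoresAndLabels).areaUnderROC))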
def main(sc):
    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")
    meta(train_id)
    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]

    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user-defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)),
                        VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns for the training set
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score", pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns for the test set
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score", pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ], outputCol="features")
    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    # Drop rows with missing labels
    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)

    train_d = (train_f.select(col("y"), col("features"))
               .rdd
               .map(lambda row: LabeledPoint(float(row.y),
                                             MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (
        float(m.predict(MLLibVectors.fromML(row.features))), float(row.y)))

    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                         estimatorParamMaps=params_list[m],
    #                         evaluator=evaluator,
    #                         numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m
    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))

    # Instantiate metrics objects
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Individual label stats
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" % (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
# Define Elephas optimizer
adagrad = elephas_optimizers.Adagrad()

# Initialize Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
estimator.set_nb_epoch(nb_epoch)
estimator.set_batch_size(batch_size)
estimator.set_num_workers(4)
estimator.set_verbosity(2)
estimator.set_validation_split(0.1)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_frequency('batch')

# Fitting a model returns a Transformer
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)

# Evaluate the Spark model by evaluating the underlying Keras model
prediction = fitted_pipeline.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

# DataFrames no longer expose .map directly in Spark 2.x; go through .rdd
prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
trainParsed = sc.parallelize(map(parsePoint, train_dict))
testParsed = sc.parallelize(map(parsePoint, test_dict))

model = SVMWithSGD.train(trainParsed, iterations=100)

# Training error (Python 3: tuple-unpacking lambdas are no longer allowed)
trainLabelsAndPreds = trainParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(
    lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test error
testLabelsAndPreds = testParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print(metrics.areaUnderROC)
print(metrics.areaUnderPR)

mcMetrics = MulticlassMetrics(testLabelsAndPreds)
# TODO: Do this for classes 1.0, 0.0 and not just overall
print(mcMetrics.precision())
print(mcMetrics.recall())
print(mcMetrics.fMeasure())

model.save(sc, "SVMModel")
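# Added sketch for the TODO above: MulticlassMetrics also accepts a label
# argument, so the per-class figures can be printed directly.
for cls in (0.0, 1.0):
    print("Class %s precision = %s" % (cls, mcMetrics.precision(cls)))
    print("Class %s recall = %s" % (cls, mcMetrics.recall(cls)))
    print("Class %s F1 = %s" % (cls, mcMetrics.fMeasure(cls, beta=1.0)))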
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog,
            probability_images, seed, config_filename):
    """The primary script

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration
            objects and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): If set, classify and write probability images
            for the complete image catalog rather than a sample
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducibility
        config_filename (str): Name of the YAML configuration object under
            ``s3_bucket``

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml under ``learner`` key:
            prefix: The S3 prefix under which CSVs can be read and written
            pool: Name of CSV file under s3_bucket/prefix giving the
                comprehensive list of active grid cells
            incoming_names: Name of CSV file under s3_bucket/prefix giving the
                list of cells used for training/validation
            image_catalog: Name of CSV file under s3_bucket giving the catalog
                of imagery
            image_output_pattern: URI pattern used for output of probability
                images.  Must contain two '{}' tokens to be replaced by the
                column and row for the relevant cell
            outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool: A CSV of ``name``, ``col``, ``row`` for each grid cell
            under consideration.  Identified by ``pool`` parameter above.
        incoming names: CSV containing (at least) ``name``, ``iteration``, and
            ``usage`` columns.  Every name in this file must also be contained
            in the image pool.  Location of this file given in the YAML file.
        image catalog: A CSV minimally containing ``col``, ``row``, ``season``,
            and ``uri`` columns.  Season is either 'GS' or 'OS'.  Every grid
            cell in the location pool must be contained here, and must have an
            entry for both seasons.  URI points to a TIFF that completely
            covers the listed cell with valid image data (no NODATA values).

    Note:
        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells (in this case, 13792 columns and 14477 rows).
        Each cell is then given a pixel resolution (in this case 200x200, but
        whatever is chosen must match the resolution of the label images
        provided in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif``
        files identified by the incoming names CSV).  When we refer to tiles,
        we mean image chips of the stated resolution, indexed by
        ``gps.SpatialKey`` objects.  The key is a col/row pair where row=0,
        col=0 corresponds to the chip in the upper left corner of the bounding
        extent.

    Note:
        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another
        tool.  However, see the `coarse_layout` definition below for specific
        details of the layout.
""" params = parse_yaml_from_s3(s3_bucket, config_filename)['learner'] label_path = parse_yaml_from_s3( s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1] s3_prefix = params['prefix'] s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix catalog_prefix = params['image_catalog'] catalog_prefix_fix = params['image_catalog_fix'] feature_names = functools.reduce(lambda a, b: a + b, [[ "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n), "{}_std_{}".format(season, n) ] for season in ["GS", "OS"] for n in range(1, 5)]) master_layout = gps.LayoutDefinition( gps.Extent(-17.541, -35.46, 51.459, 37.54), gps.TileLayout(13800, 14600, 200, 200)) master_metadata = gps.Metadata( gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)), "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8, master_layout.extent, master_layout) #################################### logger.warn("Reading source tables") checkpoint = time.time() f_pool = spark\ .read\ .option('inferScheme', True)\ .option('header', True)\ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\ .repartition('col', 'row') qs_in = spark \ .read \ .option('inferScheme', True) \ .option('header', True) \ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \ .repartition('col', 'row') incoming = spark.read\ .option('header', True)\ .schema(StructType([ StructField('name', StringType()), StructField('run', IntegerType()), StructField('iteration', IntegerType()), StructField('processed', BooleanType()), StructField('usage', StringType()), StructField('label', StringType()) ]))\ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names'])) # merge incoming_names and incoming_names_static incoming = incoming.union(spark.read \ .option('header', True) \ .schema(StructType([ StructField('name', StringType()), StructField('run', IntegerType()), StructField('iteration', IntegerType()), StructField('processed', BooleanType()), StructField('usage', StringType()), StructField('label', StringType()) ])) \ .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static']))) incoming = incoming.filter(incoming['run'] == params['runid']).filter( incoming['label'] == True) test_names = f_pool.join(incoming.select('name'), 'name', 'left_anti').withColumn("usage", lit("test")) all_names = f_pool.join(incoming.select('name', 'usage'), f_pool.name == incoming.name, how='left')\ .select(f_pool.name.alias('name'), 'col', 'row', 'usage') num_test_images = test_names.count() image_catalog = spark.read\ .option('inferScheme', True)\ .option('header', True)\ .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\ .repartition('col', 'row') all_image_uris = image_catalog\ .filter(image_catalog['season'] == 'GS')\ .alias('gs')\ .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'), (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\ .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS')) logger.warn( "Elapsed time for reading source tables: {}s".format(time.time() - checkpoint)) #################################### logger.warn("Reading training labels & building training features") checkpoint = time.time() training_data = gather_data(all_image_uris, all_names.filter(all_names.usage == 'train'), master_metadata, feature_names, s3_bucket, label_path, include_masks=True) training_data.show() logger.warn( "Elapsed time for reading training labels and feature building: {}s". 
format(time.time() - checkpoint)) #################################### logger.warn("Balancing data") checkpoint = time.time() balanced_data = balance_samples(spark, training_data, 'mask') balanced_data.show() logger.warn("Elapsed time for balancing data: {}s".format(time.time() - checkpoint)) #################################### logger.warn("Training model") checkpoint = time.time() pipeline = ml_pipeline(feature_names, 'mask') model = pipeline.fit(balanced_data) print(model) logger.warn("Elapsed time for training the model: {}s".format(time.time() - checkpoint)) #################################### logger.warn("Validating model results") checkpoint = time.time() validation_data = gather_data( all_image_uris, all_names.filter(all_names.usage == 'validate'), master_metadata, feature_names, s3_bucket, label_path, include_masks=True) valid_fit = model.transform(validation_data).select( 'prediction', 'probability', 'mask') metrics = MulticlassMetrics( valid_fit.rdd.map(lambda r: (r.prediction, r.mask))) confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist( ) #left to right, top to bottom tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \ 1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1 binmetrics = BinaryClassificationMetrics( valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask']))) last_iteration = incoming.agg(F.max('iteration')).collect()[0][0] report = pd.DataFrame({ 'run': [run_id], 'iteration': [last_iteration + 1], 'tss': [tss], 'accuracy': [metrics.accuracy], 'precision': [metrics.precision(1.0)], 'recall': [metrics.recall(1.0)], 'fpr': [metrics.falsePositiveRate(1.0)], 'tpr': [metrics.truePositiveRate(1.0)], 'AUC': [binmetrics.areaUnderROC], 'aoi': [aoi_name], 'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')] }) # TODO: allow target location to be derived from params (local or s3) # added because of an error where incoming_metrics.csv contained different iteration number (10) # than expected by DB (4). Ryan's guess is that this is due to multiple test clusters overwriting csv # print("############Old Iteration Metrics to overwrite###########") # incoming_previous = pd.read_csv(os.path.join("s3://",s3_bucket,s3_prefix,params['metrics'])) # print(incoming_previous.to_string()) # print("############New Iteration Metrics to use to overwrite###########") # print(report.to_string()) pd_df_to_s3_csv(report, s3_bucket, os.path.join(s3_prefix, params['metrics'])) logger.warn( "Elapsed time for validating and saving metrics to s3: {}s".format( time.time() - checkpoint)) #################################### logger.warn("Classifying test data") checkpoint = time.time() filtered_names = test_names.filter(test_names.usage == "test") # filtered_names.cache() # filtered_names.show() test_features = gather_data(all_image_uris, filtered_names, master_metadata, feature_names, s3_bucket) test_features_sample = test_features.sample(True, 0.1) fitted = model.transform(test_features_sample).select( 'spatial_key', 'column_index', 'row_index', 'probability', 'prediction') # fitted.cache() # fitted.show() grouped = fitted.groupBy('spatial_key') # don't want to use following UDF, but indication is that there is a bug in pyspark preventing vector accesses: # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability # (This did not work without the UDF!) 
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # Added this UDF to select the probability of field rather than no-field
    # when writing the probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())
    logger.warn("Elapsed time for classifying test grids: {}s"
                .format(time.time() - checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images"
                    .format(probability_images))
        checkpoint = time.time()

        if complete_catalog:
            # New catalog
            image_catalog_fix = spark.read\
                .option('inferSchema', True)\
                .option('header', True)\
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix))\
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix\
                .filter(image_catalog_fix['season'] == 'GS')\
                .alias('gs')\
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
                .select(col('gs.col'), col('gs.row'),
                        col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            # Re-collect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                .select('spatial_key', 'column_index', 'row_index',
                        'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # Added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())
        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()

            # TODO: Determine which images to take
            certainty = grouped\
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5),
                                 2.0)).alias('certainty')).cache()
            certainty.show()
            worst_keys_rdd = certainty\
                .sort('certainty')\
                .select('spatial_key')\
                .limit(round(certainty.count() * 0.05))\
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys\
                .join(f_pool, (col('spatial_key.col') == col('col')) &
                              (col('spatial_key.row') == col('row')))\
                .select('name')\
                .withColumn('run', lit(run_id))\
                .withColumn('iteration', lit(last_iteration + 1))\
                .withColumn('processed', lit(False))\
                .withColumn('usage', lit('train'))\
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas "
                "Dataframe, and saving to s3: {}s".format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # Sample testing images (num = probability_images)
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col', 'row'])\
                .select('name', 'col', 'row', 'name_col_row')

            # Re-collect all pixels within the sampled images
            features_images = gather_data(all_image_uris, filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            # Re-classify the sampled testing images
            fitted_images = model.transform(features_images)\
                .select('spatial_key', 'column_index', 'row_index',
                        'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample,
                (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # We multiply by 100 to select digits that will be kept after converting
        # from float to int.  The range of int8 goes to 128, so we can only
        # preserve 2 significant figures.
        output_tiles = (layer * 100).convert_data_type(gps.CellType.INT8)\
            .tile_to_layout(coarse_layout)\
            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)
        cog_location = '/tmp/image_{}_{}.tif' \
            if 'image_output_pattern' not in params \
            else params['image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)),
            pair[1]))
        logger.warn("Elapsed time for writing catalog of probability images: {}s"
                    .format(time.time() - checkpoint))
grid = ParamGridBuilder().addGrid(gbt.maxDepth, [3, 4, 5]).build()
metrics = MulticlassClassificationEvaluator(metricName="f1")
cv = CrossValidator(estimator=gbt, estimatorParamMaps=grid,
                    evaluator=metrics, numFolds=2)

# Define pre-processing pipeline
featureCols = ["AN3", "AN4", "AN5", "AN6", "AN7", "AN8", "AN9", "AN10"]
stages = [
    VectorAssembler(inputCols=featureCols, outputCol="va"),
    StandardScaler(inputCol="va", outputCol="features"),
    StringIndexer(inputCol="status", outputCol="label"),
    cv
]
pipeline = Pipeline(stages=stages)

pipelineTrained = pipeline.fit(training)
predictions = pipelineTrained.transform(test)
metrics = MulticlassMetrics(predictions.select(['prediction', 'label']).rdd)

# Define mlflow artifacts to log with the experiment run
mlflow.log_metric("precision", metrics.precision(1.0))
mlflow.log_metric("recall", metrics.recall(1.0))
mlflow.log_metric("f1", metrics.fMeasure(1.0))
mlflow.spark.log_model(pipelineTrained, "turbine_anomalies")
mlflow.set_tag("model", "gbt")

# Add confusion matrix to the model
labels = pipelineTrained.stages[2].labels
fig = plt.figure()
sn.heatmap(pd.DataFrame(metrics.confusionMatrix().toArray()),
           annot=True, fmt='g', xticklabels=labels, yticklabels=labels)
plt.suptitle("Turbine Damage Prediction. F1={:.2f}".format(metrics.fMeasure(1.0)),
             fontsize=18)
plt.xlabel("Predicted Labels")
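# Added sketch: the heatmap above is only drawn, not attached to the run. With
# mlflow >= 1.13, the figure can be logged as an artifact alongside the metrics:
plt.ylabel("True Labels")
mlflow.log_figure(fig, "confusion_matrix.png")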
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# Model on training data; regParam is the lasso regularisation parameter (L1)
lrModel = LogisticRegression(regParam=0.1).fit(trainData)

# Make predictions on test data
pred = lrModel.transform(testData)
pred.select('catLabel', 'label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[18]:

from pyspark.ml.classification import DecisionTreeClassifier

# Model on training data; maxDepth is the hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# Split the data into training and test sets (25% held out for testing)
(X_train, X_val) = X_spark_rdd.randomSplit(
    [TRAINING_DATA_RATIO, 1 - TRAINING_DATA_RATIO])

start_time = time()
rfc = RandomForest.trainClassifier(X_train,
                                   numClasses=3,
                                   categoricalFeaturesInfo={},
                                   numTrees=15,
                                   featureSubsetStrategy="auto",
                                   impurity='gini',
                                   maxDepth=3,
                                   maxBins=92)
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

predictions = rfc.predict(X_val.map(lambda x: x.features))
labels_and_predictions = X_val.map(lambda x: x.label).zip(predictions)
metrics = MulticlassMetrics(labels_and_predictions)
f1Score = metrics.fMeasure()
print("Evaluation Metric : F1-score")
print("F1-Score = {}".format(f1Score))
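# Added sketch: the trained mllib model can be persisted and reloaded, assuming
# `sc` is the active SparkContext (the wine-quality loading script further below
# expects a model saved under a name like this):
from pyspark.mllib.tree import RandomForestModel
rfc.save(sc, "myRandomForestClassificationModel")
reloaded = RandomForestModel.load(sc, "myRandomForestClassificationModel")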
session = SparkSession.builder.appName(
    'Wine Quality Prediction Model Load').getOrCreate()

# Importing sys for taking the command line parameters
import sys
fileName = sys.argv[1]

# Load Random Forest model package
from pyspark.mllib.tree import RandomForestModel
loadedRFModel = RandomForestModel.load(session.sparkContext,
                                       "myRandomForestClassificationModel")

data = session.read.format('csv').option('header', 'true')\
    .option('inferSchema', 'true').option('sep', ';').load(fileName)

from pyspark.mllib.regression import LabeledPoint
modelData = data.rdd.map(lambda col: LabeledPoint(col[11], col[:11]))
predictionData = loadedRFModel.predict(modelData.map(lambda x: x.features))
labelAndPredictionData = modelData.map(lambda lp: lp.label).zip(predictionData)

# F1 score using Random Forest with the given dataset
from pyspark.mllib.evaluation import MulticlassMetrics
randomFResults = MulticlassMetrics(labelAndPredictionData)
randomFConfMatrix = randomFResults.confusionMatrix().toArray()
# Precision/recall for the first label, read off the confusion matrix
# (this arithmetic assumes an effectively binary label)
randomFPrecision = (randomFConfMatrix[0][0]) / (randomFConfMatrix[0][0] + randomFConfMatrix[1][0])
randomFRecall = (randomFConfMatrix[0][0]) / (randomFConfMatrix[0][0] + randomFConfMatrix[0][1])
randomFF1 = (2 * randomFPrecision * randomFRecall) / (randomFPrecision + randomFRecall)
print("=======================================================================================================")
print("F1 score using imported Random Forests model on the given dataset: " + str(randomFF1))
print("=======================================================================================================")
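# Added sketch: MulticlassMetrics can compute an aggregate score directly,
# avoiding the two-class confusion-matrix arithmetic above when the quality
# label actually takes more than two values:
print("Weighted F1 on the given dataset: " + str(randomFResults.weightedFMeasure()))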
    labelCol='default_payment_next_month')
print('Evaluator areaUnderROC: ' + str(evaluator.evaluate(prediction)))
# 0.7294563666075892

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    metricName="areaUnderPR",
    labelCol='default_payment_next_month')
print('Evaluator areaUnderPR : ' + str(evaluator.evaluate(prediction)))
# 0.7294563666075892

prediction.groupBy('default_payment_next_month', 'prediction').count().show()

# Metrics
predictionRDD = prediction.select(['label', 'prediction'])\
    .rdd.map(lambda line: (line[1], line[0]))
metrics = MulticlassMetrics(predictionRDD)

# Confusion matrix
print(metrics.confusionMatrix().toArray())

print('---------------------------Overall statistics------------------------')
print('precision : ' + str(metrics.precision()))
print('recall    : ' + str(metrics.recall()))
print('fMeasure  : ' + str(metrics.fMeasure()))

print('---------------------------Statistics by class------------------------')
labels = [0.0, 1.0]
for label in sorted(labels):
    print('precision : ' + str(metrics.precision(label)))
    print('recall    : ' + str(metrics.recall(label)))
    print('fMeasure  : ' + str(metrics.fMeasure(label, beta=1.0)))
def func(line):
    v = [float(z) for z in line.split(';')]
    # v[0:11] takes all feature columns before the quality label v[11]
    # (the original sliced v[0:10], silently dropping the last feature)
    return LabeledPoint(v[11], v[0:11])


c = SparkConf().setAppName("winequality")
s = SparkContext("local", conf=c)

t = s.textFile("s3://winequalitynew/TrainingDataset.csv")
h = t.first()
r = t.filter(lambda z: z != h)  # drop the header row
tnew = r.map(func)

dnew = s.textFile("s3://winequalitynew/ValidationDataset.csv")
h = dnew.first()
r = dnew.filter(lambda z: z != h)
dnew2 = r.map(func)

m = RandomForest.trainClassifier(tnew,
                                 numClasses=11,
                                 categoricalFeaturesInfo={},
                                 numTrees=3,
                                 featureSubsetStrategy="auto",
                                 impurity='gini',
                                 maxDepth=4,
                                 maxBins=32)
p = m.predict(dnew2.map(lambda z: z.features))
lpre = dnew2.map(lambda lp: lp.label).zip(p)
me = MulticlassMetrics(lpre)
scr = me.fMeasure()
print(scr)
m.save(s, "s3://winequalitynew/winequality")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Building model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(train_df)  # if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here

# Model prediction accuracy (F1-score)
pl = pred.select("label", "prediction").rdd.cache()
metrics = MulticlassMetrics(pl)
metrics.fMeasure()

#####################################################################################################
"""
Task 1.2

a. Run the model provided above. Take your time to carefully understand what is
   happening in this model pipeline. You are NOT allowed to make changes to this
   model's configurations. Compute and report the F1-score on the test dataset.

b. Get and report the schema (column names and data types) of the model's
   prediction output.
"""

# Your code for this part, IF ANY, starts here
print("the F1-score of the first model: {}".format(metrics.fMeasure()))
print(pred)
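# Added sketch for part (b): printSchema() lists the column names and data types
# of the prediction DataFrame in tree form, which is easier to report than the
# one-line repr from print(pred):
pred.printSchema()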
rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
rf_model1.numClasses
print(rf_model1.toDebugString)
print(rf_model1.featureImportances)

training21 = rf_model1.transform(training)
PredictionsandLabels = training21.select('prediction', 'Survived').rdd
PredictionsandLabels.collect()

# Resubstitution approach
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy

# For bagging -- two evaluation approaches
# K-fold cross-validation approach:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator2 = BinaryClassificationEvaluator(labelCol='Survived',
                                           rawPredictionCol='prediction')
paramGrid = ParamGridBuilder().addGrid(rf1.numTrees, [50, 100])\
                              .addGrid(rf1.maxDepth, [5, 8]).build()
crossval2 = CrossValidator(estimator=rf1,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator2,
# #### Decision tree

# In[30]:

sqlContext = SQLContext(sc)
predictions = sqlContext.read.load("/media/milad/Linux/bigdata/predictions.csv",
                                   format='com.databricks.spark.csv',
                                   header='true', inferSchema='true')

# In[32]:

conf_tree = MulticlassMetrics(predictions.rdd.map(tuple))

# In[35]:

conf_tree.confusionMatrix().toArray().transpose()

# #### Naive Bayes

# In[36]:

sqlContext = SQLContext(sc)
predictions_nb = sqlContext.read.load("/media/milad/Linux/bigdata/predictions_nb.csv",
"hours-per-week") data.show() assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features") data = assembler.transform(data) data.show() # Splitting the data into training and data set training, test = data.select("label", "features").randomSplit([0.70, 0.30]) # Create Navie Bayes model and fit the model with training dataset nb = NaiveBayes() model = nb.fit(training) # Generate prediction from test dataset pred = model.transform(test) # Evaluate the accuracy of the model evaluator = MulticlassClassificationEvaluator() accuracy = evaluator.evaluate(pred) # Show model accuracy print("Accuracy:", accuracy, "\n\n") #Report predAndLabels = pred.select("prediction", "label").rdd metrics = MulticlassMetrics(predAndLabels) print("Confusion Matrix", metrics.confusionMatrix()) print("Precision", metrics.precision()) print("Recall", metrics.recall()) print("F-measure", metrics.fMeasure())
# Python 3 removed tuple parameters in lambdas; index into the nested
# (((Community, Year, Month), label), features) structure instead
inputrddval = inputrddNorm.filter(lambda r: r[0][0][1] >= 2015)

DataTrain = inputrddtrain.map(parseDataF)
DataTest = inputrddtest.map(parseDataF)
DataVal = inputrddval.map(parseDataF)

model = RandomForest.trainClassifier(DataTrain,
                                     numClasses=5,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=20,
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(DataTest.map(lambda x: x.features))
predictionsAndlabels = predictions.zip(DataTest.map(lambda lp: lp.label))
metrics = MulticlassMetrics(predictionsAndlabels)
labelsAndPredictions = DataTest.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(DataTest.count())
print('Test Error = ' + str(testErr))
print(metrics.confusionMatrix().toArray())
print("precision = " + str(metrics.weightedPrecision))
print("recall = " + str(metrics.weightedRecall))
def evaluate(self, model=None, trainingData=None, testingData=None):
    """Test the model; print accuracy and computation time to the screen."""
    time_train = 0
    time_test = 0
    if not trainingData:
        trainingData = self.trainingData
    if not testingData:
        testingData = self.testingData
    if not model:
        # Train model
        print("Training...")
        start_train = datetime.now()
        model = self.trainModel(trainingData)
        time_train = datetime.now() - start_train
        # print("Num nodes: ", model.stages[2].totalNumNodes, "\n",
        #       model.stages[2].toDebugString, file=open("modelDebug.txt", "w"))

    # Make predictions
    print("Testing...")
    start_test = datetime.now()
    predictions = model.transform(testingData)
    time_test = datetime.now() - start_test

    # Evaluation for flow
    print("{:*^100}".format(""))
    print("Training time: ", time_train)
    print("Testing time: ", time_test)

    featureImportances = {}
    fi = model.stages[2].featureImportances
    features = loadcols(self.dataset)
    index = 0
    for value in fi:
        featureImportances[features[index]] = value
        index = index + 1
    fiSorted = sorted(featureImportances.items(), key=lambda x: x[1],
                      reverse=True)
    print("{:*^100}".format(" Feature Importances "))
    f = open("features_importance.txt", "w")
    for feature in fiSorted:
        if feature[1] > 0.000:
            print("{!s} : {:.4%}".format(feature[0].strip(), feature[1]))
            # f.write("{!s}\n".format(feature[0].strip()))
    f.close()

    print("{:*^100}".format(" Evaluate for Flow "))
    print("Total predictions:", predictions.count())
    predictions.select("prediction", "indexedLabel",
                       "label").groupBy("label").count().show()
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    print("Confusion Matrix:")
    for line in metrics.confusionMatrix().toArray():
        print(line)
    # The TPR of class 0 is the TNR of class 1, and its FPR the FNR
    print("TPR: {:.3%} \tFPR: {:.3%}".format(
        metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
    print("TNR: {:.3%} \tFNR: {:.3%}".format(
        metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
    print("Precision: {:.3%} \tRecall: {:.3%} \tAccuracy: {:.3%}".format(
        metrics.precision(1.0), metrics.recall(1.0), metrics.accuracy))
    print(metrics.accuracy)
    print("{:*^100}".format(""))
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors,
                            label_stringIdx])
pipelineFit = pipeline.fit(spark_df)
dataset = pipelineFit.transform(spark_df)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)

# Logistic Regression
lr = LogisticRegression(maxIter=100, regParam=0.01, elasticNetParam=0.01)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.show()

# Evaluate model results
evaluator = MulticlassClassificationEvaluator(metricName="f1", labelCol='label')
predictions_and_labels = predictions.select(["prediction", "label"])
metrics = MulticlassMetrics(predictions_and_labels.rdd)
conf_mat1 = metrics.confusionMatrix()
precision1 = metrics.precision()
recall1 = metrics.recall()
f1Score1 = metrics.fMeasure()
result = evaluator.evaluate(predictions)

# Naive Bayes classification
nb = NaiveBayes(smoothing=2)
nbModel = nb.fit(trainingData)
nbPredictions = nbModel.transform(testData)
nbPredictions.show()
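# Added sketch: the evaluator defined above can score the Naive Bayes run as
# well, making the two models directly comparable on F1:
nbResult = evaluator.evaluate(nbPredictions)
print("LR F1: {:.4f}  NB F1: {:.4f}".format(result, nbResult))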
evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                              labelCol="category")
# Compute the classification error on test data
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))

# In[16]:

# Confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics
predictionsAndLabelsNB = predictions.select("prediction", "category")

# In[18]:

metricsNB = MulticlassMetrics(predictionsAndLabelsNB.rdd)
accuracyNB2 = metricsNB.accuracy
print("Naive Bayes accuracy:")
print(accuracyNB2)
confusionMatrixNB = metricsNB.confusionMatrix().toArray()
print("Confusion Matrix: ")
print(confusionMatrixNB)

# Try classifying a few basic sentences.

# In[15]:

tf = spark.createDataFrame([
    ("Bactibilia has several consequences to human health", ),
print "Model Ready" predictions = model.predict(testLab.map(lambda x: x.features)) labelsAndPredictions = testLab.map(lambda lp: lp.label).zip(predictions) labelsAndPredictions = labelsAndPredictions.map(lambda (a,b): (b,a)) finalRdd = testData.map(lambda rec:(rec.yelp_id,rec.stars)).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testLab.count()) print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString()) metrics = MulticlassMetrics(labelsAndPredictions) # Overall statistics precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() confusionMatrx = metrics.confusionMatrix() print("RANDOM FORREST FOR TEST SET ONLY: ") print("All Evaluation Measures Stats") print("Confusion Matix = %s" % confusionMatrx) print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) #now perform random forrest on all data
predictions_loaded = loaded_model.transform(rowDf)
print(predictions_loaded)
result = predictions_loaded.select(
    ["message", "probability", "prediction"]).collect()
for row in result:
    print(row.message, row.probability, row.prediction)

predictions = loaded_model.transform(df).withColumn(
    "label", df["label"].cast(DoubleType()))
rdd_map = predictions.select(
    "prediction", "label").rdd.map(lambda lp: (lp["prediction"], lp["label"]))
binary_metrics = BinaryClassificationMetrics(rdd_map)
print(f"Overall PR and ROC : {binary_metrics.areaUnderPR}, "
      f"{binary_metrics.areaUnderROC}\n")

multiclass_metrics = MulticlassMetrics(rdd_map)
print("Confusion matrix : ")
print(multiclass_metrics.confusionMatrix())
print(f"Overall: Precision {multiclass_metrics.precision(1), multiclass_metrics.precision(0)}")
print(f"Overall: Recall {multiclass_metrics.recall(1), multiclass_metrics.recall(0)}")
print(f"Overall Accuracy {multiclass_metrics.accuracy}")
def calc_metrics(df, simple_mode=True):
    rdd = df.select("prediction", "Profit").rdd
    metrics = MulticlassMetrics(rdd)
    metrics_dict = {}

    # Map confusion-matrix cells to TP/TN/FP/FN (assumes a binary label)
    cm = metrics.confusionMatrix().toArray()
    TP = cm[0][0]
    print("TP IS " + str(TP))
    TN = cm[1][1]
    print("TN IS " + str(TN))
    FP = cm[0][1]
    print("FP IS " + str(FP))
    FN = cm[1][0]
    print("FN IS " + str(FN))

    accuracy = (TP + TN) / cm.sum()
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    precision = TP / (TP + FP)
    npv = TN / (TN + FN)

    # Overall statistics
    metrics_dict['accuracy'] = accuracy
    metrics_dict['sensitivity'] = sensitivity
    metrics_dict['specificity'] = specificity
    metrics_dict['precision'] = precision
    metrics_dict['npv'] = npv
    # print(metrics_dict)
    # print("Summary Stats")
    print(metrics.confusionMatrix())
    metrics_dict['confusionMatrix'] = metrics.confusionMatrix()
    print("{},{},{},{},{}".format(round(accuracy, 3), round(sensitivity, 3),
                                  round(specificity, 3), round(precision, 3),
                                  round(npv, 3)))

    if not simple_mode:
        # Statistics by class
        labels = rdd.map(lambda lp: lp.Profit).distinct().collect()
        for label in sorted(labels):
            print("Class %s accuracy = %.4f" % (label, metrics.precision(label)))
            print("Class %s sensitivity = %.4f" % (label, metrics.recall(label)))
            print("Class %s F1 Measure = %.4f" %
                  (label, metrics.fMeasure(label, beta=1.0)))
        print("\n")

        # Weighted stats
        weightedRecall = metrics.weightedRecall
        print("Weighted sensitivity = %.4f" % weightedRecall)
        metrics_dict['weightedRecall'] = weightedRecall
        weightedPrecision = metrics.weightedPrecision
        print("Weighted precision = %.4f" % weightedPrecision)
        metrics_dict['weightedPrecision'] = weightedPrecision
        # weightedFMeasure = metrics.weightedFMeasure()
        # metrics_dict['weightedFMeasure'] = weightedFMeasure
        # weightedFMeasure = metrics.weightedFMeasure(beta=0.5)
        # metrics_dict['weightedFMeasure_beta'] = weightedFMeasure
        # weightedFalsePositiveRate = metrics.weightedFalsePositiveRate
        # metrics_dict['weightedFalsePositiveRate'] = weightedFalsePositiveRate
        print("\n")

    return metrics_dict
d_cat_sim = cat_sim.withColumn("d_cat", udf_get_decision(col("score")))
d_tdf_sim = tdf_sim.withColumn("d_tdf", udf_get_decision(col("score")))
d_als_sim = als_sim.withColumn("d_als", udf_get_decision(col("score")))

# Aggregate results from the attributes, collaborative, and text models.
agg = d_cat_sim.alias('c').join(d_tdf_sim.alias('t'),
                                col("c.review_id") == col("t.review_id"))\
    .join(d_als_sim.alias('a'), col("c.review_id") == col("a.review_id"))\
    .select("c.review_id", "c.business_id", "c.user_id", "c.r_stars",
            "c.d_cat", "t.d_tdf", "a.d_als")\
    .filter(col("c.r_stars") != 3)\
    .withColumn("label", udf_get_label(col("r_stars")))\
    .withColumn("d_agg", udf_get_agg_decision(col("c.d_cat"), col("t.d_tdf"),
                                              col("a.d_als")))

# Calculate metrics for content-based filtering - business attributes model.
cat_labels = agg.select("d_cat", "label").rdd.map(
    lambda x: (float(x.d_cat), float(x.label)))
cat_labels_bin = agg.select("d_cat", "label").rdd.map(
    lambda x: (float(x.d_cat), x.label))
cat_bin = BinaryClassificationMetrics(cat_labels_bin)
cat_metrics = MulticlassMetrics(cat_labels)
cat_precision_1 = cat_metrics.precision(1.0)
cat_recall_1 = cat_metrics.recall(1.0)
cat_precision_0 = cat_metrics.precision(0.0)
cat_recall_0 = cat_metrics.recall(0.0)
cat_accuracy = cat_metrics.accuracy

# Calculate metrics for content-based filtering - reviews (text-mining) model.
tdf_labels = agg.select("d_tdf", "label").rdd.map(
    lambda x: (float(x.d_tdf), float(x.label)))
tdf_labels_bin = agg.select("d_tdf", "label").rdd.map(
    lambda x: (float(x.d_tdf), x.label))
tdf_bin = BinaryClassificationMetrics(tdf_labels_bin)
tdf_metrics = MulticlassMetrics(tdf_labels)
tdf_precision_1 = tdf_metrics.precision(1.0)
tdf_recall_1 = tdf_metrics.recall(1.0)
tdf_precision_0 = tdf_metrics.precision(0.0)
tdf_recall_0 = tdf_metrics.recall(0.0)
train.cache()
test.cache()

# ---------------
# Random Forest:
# ---------------
rf = RandomForestClassifier(numTrees=100, maxDepth=16, labelCol="label",
                            seed=42)
print('Training RandomForest model on training set. \n Model parameters: {}'
      .format(rf._paramMap))
trained_model = rf.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
find_performance_metrics(res, "random forest")

# ---------------
# Logistic regression:
# ---------------
print('Training LogisticRegression model on training set.')
logistic = LogisticRegression(regParam=0.1, labelCol="label")
# , thresholds=[0.2, 0.5])
trained_model = logistic.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
def sp_accuracy(self, prediction_and_labels):
    # Return the overall accuracy; the original returned the bare
    # MulticlassMetrics object despite the method name
    from pyspark.mllib.evaluation import MulticlassMetrics
    return MulticlassMetrics(prediction_and_labels).accuracy
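# Added usage sketch (hypothetical names): `prediction_and_labels` is expected
# to be an RDD of (prediction, label) float pairs, e.g.:
#   pl = predictions.select("prediction", "label").rdd.map(tuple)
#   acc = self.sp_accuracy(pl)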
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predicted)
print(accuracy)

# In[72]:

from pyspark.mllib.evaluation import MulticlassMetrics
pred_label = predicted.select(['label', 'prediction'])
# confmat = pred_label.rdd.map(tuple)

# In[73]:

confmat = pred_label.rdd.map(tuple)

# In[75]:

metrics = MulticlassMetrics(confmat)
confusion_mat = metrics.confusionMatrix()

# In[79]:

print(confusion_mat.toArray())

# In[ ]:
# Specify layers for the neural network:
# input layer of size 28x28 = 784 (features), two intermediate layers of size
# 5 and 4 (in the commented-out variant), and output of size 10 (classes)
# layers = [784, 5, 4, 10]
layers = [784, 100, 10]

trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers,
                                         blockSize=128, seed=1234)
model = trainer.fit(train_vectors_withlabel)
result = model.transform(test_vectors_withlabel)
predictionAndLabels = result.select("prediction", "label")
changedTypedf = predictionAndLabels.withColumn(
    "label", predictionAndLabels["label"].cast(DoubleType()))
test_rdd = changedTypedf.rdd.map(tuple)
metrics = MulticlassMetrics(test_rdd)

# Print F1-score, recall and precision for each label
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

labels = (0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
x = PrettyTable(['Label', 'Precision', 'Recall', 'F1-score'])
for label in sorted(labels):
    # Loop body was missing in the original; per-label metrics fill each row
    x.add_row([label, metrics.precision(label), metrics.recall(label),
               metrics.fMeasure(label, beta=1.0)])
print(x)
labels = labeled_data.map(lambda x: x[0])
tf = HashingTF().transform(labeled_data.map(lambda x: x[1]))
idf = IDF(minDocFreq=5).fit(tf)
tfidf = idf.transform(tf)
zipped_data = (labels.zip(tfidf)
               .map(lambda x: LabeledPoint(x[0], x[1]))
               .cache())

# Do a random split so we can test our model on non-trained data
training, test = zipped_data.randomSplit([0.7, 0.3])

# Train our model
model = NaiveBayes.train(training)

# Use our model to predict
train_preds = (training.map(lambda x: x.label)
               .zip(model.predict(training.map(lambda x: x.features))))
test_preds = (test.map(lambda x: x.label)
              .zip(model.predict(test.map(lambda x: x.features))))

# Ask PySpark for some metrics on how our model predictions performed
trained_metrics = MulticlassMetrics(
    train_preds.map(lambda x: (x[0], float(x[1]))))
test_metrics = MulticlassMetrics(
    test_preds.map(lambda x: (x[0], float(x[1]))))

with open('output_binary.txt', 'w+') as f:
    f.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(trained_metrics.precision()) + '\n')
    f.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(test_metrics.precision()) + '\n')
data = data.select(data.label.cast("double"), "educational-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data sets
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create a Naive Bayes model and fit it on the training dataset
nb = NaiveBayes()
model = nb.fit(training)

# Generate predictions on the test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
def fit(self, data):
    '''Dataset must at least contain the following two columns:
        label: the class labels
        features: feature vector

    Parameters
    ----------
    data : Dataset<Row>
        input data

    Returns
    -------
    dict
        map with metrics
    '''
    start = time.time()

    classCount = int(data.select(self.label).distinct().count())

    labelIndexer = StringIndexer().setInputCol(self.label) \
                                  .setOutputCol("indexedLabel") \
                                  .fit(data)

    # Split the data into training and test sets (30% held out for testing)
    splits = data.randomSplit([1.0 - self.testFraction, self.testFraction],
                              self.seed)
    trainingData = splits[0]
    testData = splits[1]

    labels = labelIndexer.labels
    print("\n Class\tTrain\tTest")
    for l in labels:
        print("%s\t%i\t%i" % (
            l,
            trainingData.filter(trainingData[self.label] == l).count(),
            testData.filter(testData[self.label] == l).count()))

    # Set input columns
    self.predictor.setLabelCol("indexedLabel").setFeaturesCol("features")

    # Convert indexed labels back to original labels
    labelConverter = IndexToString().setInputCol("prediction") \
                                    .setOutputCol("predictedLabel") \
                                    .setLabels(labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline().setStages(
        [labelIndexer, self.predictor, labelConverter])

    # Train model.  This also runs the indexers
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData).cache()

    # Display some sample predictions
    print(f"\nSample predictions: {str(self.predictor).split('_')[0]}"
          )  # TODO predictor.getClass().getSimpleName()
    predictions.sample(False, 0.1, self.seed).show(5)

    predictions = predictions.withColumnRenamed(self.label, "stringLabel")
    predictions = predictions.withColumnRenamed("indexedLabel", self.label)

    # Collect metrics
    pred = predictions.select("prediction", self.label)
    metrics = OrderedDict()
    metrics["Method"] = str(self.predictor).split('_')[0]
    if classCount == 2:
        b = BinaryClassificationMetrics(pred.rdd)
        metrics["AUC"] = str(b.areaUnderROC)

    m = MulticlassMetrics(pred.rdd)
    metrics["F"] = str(m.weightedFMeasure())
    metrics["Accuracy"] = str(m.accuracy)
    metrics["Precision"] = str(m.weightedPrecision)
    metrics["Recall"] = str(m.weightedRecall)
    metrics["False Positive Rate"] = str(m.weightedFalsePositiveRate)
    metrics["True Positive Rate"] = str(m.weightedTruePositiveRate)
    metrics[""] = f"\nConfusion Matrix\n{labels}\n{m.confusionMatrix()}"

    end = time.time()
    print(f"Total time taken: {end - start}\n")

    return metrics