Esempio n. 1
0
def testClassification(data):
    """Train a random forest on ``data`` and print predictions and AUC.

    The ``label`` column is string-indexed into ``indexLabel``, the data is
    split 80/20 with a fixed seed, and a small random forest is fitted on the
    training part. The held-out predictions, the (score, label) pairs and the
    area under the ROC curve are printed to stdout.

    :param data: DataFrame with a ``label`` column and a features column
        under the classifier's default name.
    :return: None; all results are printed.
    """
    # Index the raw labels so the classifier gets numeric class indices.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    td = stringIndexer.fit(data).transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4,
                                labelCol="indexLabel", seed=13)

    trainData, testData = td.randomSplit([0.8, 0.2], 13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF.select(
        'label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)

    # DataFrame.map is not available on Spark >= 2.0; mapping over the
    # underlying RDD works on old and new versions alike.
    scoresAndLabels = predictionDF.rdd.map(
        lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
Esempio n. 2
0
def randomForestClassification(df, arguments):
    """Fit a random forest classifier on ``df`` using optional overrides.

    :param df: Training DataFrame; it is passed to ``fit`` unchanged, so it
        must already use the classifier's default label/features columns.
    :param arguments: Object with the attributes ``maxDepth``,
        ``minInstancesPerNode``, ``numTrees`` and ``impurity``. An attribute
        set to ``None`` falls back to the default (5, 1, 20 and ``"gini"``).
    :return: The fitted model.
    :raises ValueError: If a numeric override cannot be parsed as an integer.
    """
    from pyspark.ml.classification import RandomForestClassifier

    def _int_option(value, default):
        # These hyper-parameters are integer-valued; the previous float()
        # coercion handed floats to the estimator.
        return default if value is None else int(value)

    impurity = "gini" if arguments.impurity is None else arguments.impurity

    rf = RandomForestClassifier(
        numTrees=_int_option(arguments.numTrees, 20),
        maxDepth=_int_option(arguments.maxDepth, 5),
        minInstancesPerNode=_int_option(arguments.minInstancesPerNode, 1),
        impurity=impurity)
    model = rf.fit(df)

    return model
Esempio n. 3
0
def rf(ss, data, label_index, feature_indexs, project_url):
    """Train, persist, reload and evaluate a random forest classifier.

    :param ss: Not used by this function; kept for interface compatibility.
    :param data: DataFrame whose rows are converted to lists and indexed by
        position.
    :param label_index: Position of the label value within each row.
    :param feature_indexs: Positions of the feature values within each row.
    :param project_url: Base path; the model is written to
        ``<project_url>/model/multipleClassification/rf`` (overwriting).
    :return: None; accuracy and weighted precision are printed. Both are
        computed on the training set itself.
    """
    # 1. Build the training data set
    def func(x):
        features_data = [x[feature] for feature in feature_indexs]
        # Use the row's value at label_index as the label; the index itself
        # would give every row the same constant label.
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = data.rdd.map(list).map(func).toDF()

    # 2. Train the model
    rf_param = RandomForestClassifier(numTrees=50)
    rf_model = rf_param.fit(training_set)

    # 3. Save the model
    model_path = project_url + '/model/multipleClassification/rf'
    rf_model.write().overwrite().save(model_path)

    # 4. Load the model back
    rf2 = rf_model.load(model_path)

    # 5. Predict
    rf_pred = rf2.transform(training_set)
    rf_pred.select("prediction", "features").show()

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    # 6. Evaluate
    rf_accuracy = MulticlassClassificationEvaluator(
        metricName='accuracy').evaluate(rf_pred)
    print("RF's accuracy is %f" % rf_accuracy)
    rf_precision = MulticlassClassificationEvaluator(
        metricName='weightedPrecision').evaluate(rf_pred)
    print("RF's precision is %f" % rf_precision)
Esempio n. 4
0
def random_forest(df, columns, input_col):
    """
    Fit a random forest classifier on selected columns of a DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to keep; must contain ``input_col``.
    :param input_col: Name of the column to predict.
    :return: Tuple ``(df_model, rf_model)``: the transformed DataFrame with
        the model's output columns, and the fitted model.
    """
    assert_spark_df(df)
    assert isinstance(columns, list), "Error, columns must be a list"
    assert isinstance(input_col, str), "Error, input column must be a string"

    selected = df.select(columns)

    # Every selected column except the target is a feature.
    feature_names = selected.columns
    feature_names.remove(input_col)

    transformer = op.DataFrameTransformer(selected)
    transformer.string_to_index(input_cols=input_col)
    transformer.vector_assembler(input_cols=feature_names)

    classifier = RandomForestClassifier()
    # The indexed target becomes the "label" column the classifier reads.
    transformer.rename_col(columns=[(input_col + "_index", "label")])

    prepared = transformer.df
    rf_model = classifier.fit(prepared)
    return rf_model.transform(transformer.df), rf_model
Esempio n. 5
0
def consulting_project(spark, resources_folder):
    """Fit a random forest on ``dog_food.csv`` and print diagnostics.

    Columns ``A``-``D`` are assembled into a feature vector and a classifier
    is trained to predict ``Spoiled``. The model is scored on the same rows
    it was trained on. Predictions, the evaluator's score and the feature
    importances are printed.

    :param spark: Session used to read the CSV file.
    :param resources_folder: Path prefix, concatenated as-is with the file
        name.
    :return: None.
    """
    csv_path = resources_folder + 'dog_food.csv'
    data = spark.read.csv(csv_path, header=True, inferSchema=True)

    # Quick look at the raw data: schema, sample rows, summary statistics.
    data.printSchema()
    data.show()
    data.describe().show()

    feature_columns = ['A', 'B', 'C', 'D']
    data_prepared = VectorAssembler(inputCols=feature_columns,
                                    outputCol='features').transform(data)

    rfc_model = RandomForestClassifier(
        labelCol='Spoiled', featuresCol='features').fit(data_prepared)
    print(rfc_model)
    rfc_model_pred = rfc_model.transform(data_prepared)

    print("Predicciones del modelo")
    print(rfc_model_pred)
    rfc_model_pred.show()

    print("Evaluación del modelo")
    score = BinaryClassificationEvaluator(
        labelCol='Spoiled').evaluate(rfc_model_pred)
    print(score)

    print("featureImportances")
    importances = rfc_model.featureImportances
    print(importances)
    print(type(importances))
Esempio n. 6
0
 def fit_nb(train):
     """Fit a 20-tree, depth-20 random forest on ``train``.

     Despite the ``nb`` in the name, the estimator built here is a
     ``RandomForestClassifier`` reading the ``label`` column.

     :param train: Training DataFrame.
     :return: The fitted model.
     """
     hyper_params = dict(numTrees=20, maxDepth=20, labelCol="label", seed=42)
     return RandomForestClassifier(**hyper_params).fit(train)
Esempio n. 7
0
def predict(df_train, df_test):
    """Train a random forest on ``df_train`` and predict ``df_test``.

    Both DataFrames must provide the columns ``count1`` ... ``count8``, which
    are assembled into the ``features`` vector. ``df_train`` must also carry
    the ``id`` column, which is used as the label.

    :param df_train: Training DataFrame.
    :param df_test: DataFrame to predict.
    :return: List with the model's prediction for every row of ``df_test``.
    """
    feature_columns = ["count%d" % n for n in range(1, 9)]
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol="features")

    classifier = RandomForestClassifier(numTrees=5, maxDepth=5,
                                        labelCol="id", seed=0)
    model = classifier.fit(assembler.transform(df_train))

    scored = model.transform(assembler.transform(df_test))

    # Flatten the single-column rows into a plain list of values.
    return scored.select("prediction").rdd.flatMap(lambda row: row).collect()
Esempio n. 8
0
def randomForestClassifier(train, test):
    """Fit a random forest on ``train``, preview and return test predictions.

    :param train: Training DataFrame with ``features`` and ``label`` columns.
    :param test: DataFrame to score; it must also contain ``age`` and
        ``job``, which are shown in the 10-row preview.
    :return: The full predictions DataFrame (not only the previewed columns).
    """
    model = RandomForestClassifier(featuresCol='features',
                                   labelCol='label').fit(train)
    predictions = model.transform(test)

    preview_columns = ['age', 'job', 'label', 'rawPrediction', 'prediction',
                       'probability']
    predictions.select(*preview_columns).show(10)
    return predictions
Esempio n. 9
0
 def test_multiclass_randomforest_classification_summary(self):
     """Check the training summary of a weighted 3-class random forest."""
     rows = [(1.0, 2.0, Vectors.dense(1.0)),
             (0.0, 2.0, Vectors.sparse(1, [], [])),
             (2.0, 2.0, Vectors.dense(2.0)),
             (2.0, 2.0, Vectors.dense(1.9))]
     df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
     model = RandomForestClassifier(weightCol="weight").fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary

     # test that api is callable and returns expected types
     self.assertIsInstance(s.predictions, DataFrame)
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.totalIterations, 0)
     self.assertIsInstance(s.labels, list)
     self.assertIsInstance(s.truePositiveRateByLabel, list)
     self.assertIsInstance(s.falsePositiveRateByLabel, list)
     self.assertIsInstance(s.precisionByLabel, list)
     self.assertIsInstance(s.recallByLabel, list)
     self.assertIsInstance(s.fMeasureByLabel(), list)
     self.assertIsInstance(s.fMeasureByLabel(1.0), list)

     # aggregate metrics, each compared to two decimal places
     self.assertAlmostEqual(s.accuracy, 1.0, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)

     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertIsInstance(sameSummary, RandomForestClassificationSummary)
     self.assertNotIsInstance(sameSummary, BinaryRandomForestClassificationSummary)
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Esempio n. 10
0
    def random_forest(df, columns, input_col, **kargs):
        """
        Fit a random forest classifier on a DataFrame and apply it.
        :param df: Pyspark dataframe to analyze.
        :param columns: Columns to select for prediction (resolved through
            ``parse_columns``); must include ``input_col``.
        :param input_col: Column to predict.
        :param kargs: Keyword arguments forwarded to ``RandomForestClassifier``.
        :return: Tuple ``(df_model, rf_model)``: the DataFrame with the
            model's output columns, and the fitted model.
        """
        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        assert isinstance(input_col, str), \
            "Error, input column must be a string"

        # Every selected column except the target is a feature.
        feature_names = df.select(columns).columns
        feature_names.remove(input_col)

        indexed = string_to_index(df, input_cols=input_col)
        assembled = vector_assembler(indexed, input_cols=feature_names)

        classifier = RandomForestClassifier(**kargs)

        # The indexed target becomes the "label" column the classifier reads.
        labelled = assembled.cols.rename([(input_col + "_index", "label")])

        rf_model = classifier.fit(labelled)
        return rf_model.transform(labelled), rf_model
Esempio n. 11
0
def train_random_forest(df):
    """Fit a small random forest on ``df`` using a random seed.

    :param df: DataFrame with a ``label`` column, which is string-indexed
        into ``indexed`` before training.
    :return: Tuple ``(estimator, fitted_model)``.
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    td = stringIndexer.fit(df).transform(df)

    # random.random() lies in [0, 1), so int(random.random()) was always 0
    # and every run used the same seed. Draw a real integer seed instead.
    seed = random.randint(0, 2 ** 31 - 1)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=seed)
    return rf, rf.fit(td)
Esempio n. 12
0
    def random_forest(df, columns, input_col, **kwargs):
        """
        Fit a random forest classifier on a DataFrame and apply it.
        :param df: Pyspark dataframe to analyze.
        :param columns: Columns to select for prediction (resolved through
            ``parse_columns``); must include ``input_col``.
        :param input_col: Column to predict.
        :param kwargs: Keyword arguments forwarded to ``RandomForestClassifier``.
        :return: Tuple ``(df_model, rf_model)``: the DataFrame with the
            model's output columns, and the fitted model.
        """
        columns = parse_columns(df, columns)

        # Every selected column except the target is a feature.
        feature_names = df.select(columns).columns
        feature_names.remove(input_col)

        indexed = string_to_index(df, input_cols=input_col)
        assembled = vector_assembler(indexed, input_cols=feature_names,
                                     output_col="features")

        classifier = RandomForestClassifier(**kwargs)
        assembled.table()

        # The indexed target becomes the "label" column the classifier reads.
        indexed_name = name_col(input_col, STRING_TO_INDEX)
        labelled = assembled.cols.rename(indexed_name, "label")

        rf_model = classifier.fit(labelled)
        return rf_model.transform(labelled), rf_model
Esempio n. 13
0
def testClassification(data):
    """Train a random forest on ``data`` and print predictions and AUC.

    The ``label`` column is string-indexed into ``indexLabel``, the data is
    split 80/20 with a fixed seed, and a small random forest is fitted on the
    training part. The held-out predictions, the (score, label) pairs and the
    area under the ROC curve are printed to stdout.

    :param data: DataFrame with a ``label`` column and a features column
        under the classifier's default name.
    :return: None; all results are printed.
    """
    # Index the raw labels so the classifier gets numeric class indices.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    td = stringIndexer.fit(data).transform(data)

    rf = RandomForestClassifier(numTrees=5,
                                maxDepth=4,
                                labelCol="indexLabel",
                                seed=13)

    trainData, testData = td.randomSplit([0.8, 0.2], 13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF.select(
        'label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)

    # DataFrame.map is not available on Spark >= 2.0; mapping over the
    # underlying RDD works on old and new versions alike.
    scoresAndLabels = predictionDF.rdd.map(
        lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
Esempio n. 14
0
def basic_example(spark, resources_folder):
    """Compare decision tree, random forest and GBT on the sample data.

    Loads ``sample_libsvm_data.txt`` from ``resources_folder``, splits it
    60/40 (unseeded, so results vary between runs), fits the three
    classifiers with default settings, and prints each one's test
    predictions and accuracy, then the random forest's feature importances.

    :param spark: Session used to read the data.
    :param resources_folder: Path prefix, concatenated as-is with the file
        name.
    :return: None.
    """
    source = resources_folder + 'sample_libsvm_data.txt'
    data = spark.read.format('libsvm').load(source)
    data.printSchema()
    data.show()
    train_data, test_data = data.randomSplit([0.6, 0.4])

    dtc_model = DecisionTreeClassifier().fit(train_data)
    rfc_model = RandomForestClassifier().fit(train_data)
    gbtc_model = GBTClassifier().fit(train_data)

    scored = [
        ("DTC Accuracy", dtc_model.transform(test_data)),
        ("RFC Accuracy", rfc_model.transform(test_data)),
        # Original author's note (not verified here): GBT output may lack a
        # rawPrediction column, which binary or multiclass evaluators can
        # require as input.
        ("GBTC Accuracy", gbtc_model.transform(test_data)),
    ]
    for _, predictions in scored:
        predictions.show()

    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    for title, predictions in scored:
        print(title)
        print(acc_eval.evaluate(predictions))

    print(rfc_model.featureImportances)
Esempio n. 15
0
def test_sklearn_interaction():
    """Check that SHAP interaction values of an sklearn forest are symmetric."""
    import sklearn
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    # train a simple sklearn RF model on the iris dataset
    X, y = shap.datasets.iris()
    split = train_test_split(*shap.datasets.iris(),
                             test_size=0.2,
                             random_state=0)
    X_train, X_test, Y_train, Y_test = split
    forest_params = dict(n_estimators=100,
                         max_depth=None,
                         min_samples_split=2,
                         random_state=0)
    model = RandomForestClassifier(**forest_params).fit(X_train, Y_train)

    # verify symmetry of the interaction values (this typically breaks if anything is wrong)
    interaction_vals = shap.TreeExplainer(model).shap_interaction_values(X)
    for class_vals in interaction_vals:
        for sample_vals in class_vals:
            for k in range(len(sample_vals)):
                for l in range(len(sample_vals[k])):
                    difference = sample_vals[k][l] - sample_vals[l][k]
                    assert abs(difference) < 1e-4

    # ensure the interaction plot works
    shap.summary_plot(interaction_vals[0], X, show=False)
def build_randomForest(path):
    """Grid-search a random forest with cross-validation on the data at ``path``.

    The data is loaded and prepared with the project helpers, the ``Cabin``,
    ``Ticket`` and ``Name`` columns are dropped, and ``Survived`` is
    string-indexed into ``indexed``, which is the label column. The fitted
    model is scored on the training data and the score is printed.

    :param path: Location handed to ``load_data``.
    :return: Tuple ``(cvModel, avg_age)``: the fitted cross-validation model
        and the average age computed by ``find_avg_age``.
    """
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    for unused_column in ('Cabin', 'Ticket', 'Name'):
        df = df.drop(unused_column)

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    df = stringIndexer.fit(df).transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = (ParamGridBuilder()
            .addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])
            .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200])
            .build())

    # Score against the same label column the classifier is trained on.
    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid,
                        evaluator=evaluator)
    # Fit the cross-validator, not the bare estimator; otherwise the
    # parameter grid above is never searched.
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()

    print("classification evaluation : %s" % evaluator.evaluate(prediction))

    return cvModel, avg_age
Esempio n. 17
0
    def fit(self,
            df,
            maxDepth=5,
            maxBins=32,
            numTrees=20,
            regParam=0.0,
            featuresCol="features",
            ignoreCols=["id"]):
        """Fit one random forest per label column of ``df``.

        Every column of ``df`` other than ``featuresCol`` and those listed in
        ``ignoreCols`` is treated as a separate label. For a label ``c`` the
        model writes ``c_pred``, ``c_prob`` and ``c_rpred``. The results are
        stored on ``self.featuresCol``, ``self.labelCols`` and
        ``self.models``; the two lists are in matching order.

        :param df: Training DataFrame.
        :param maxDepth: Maximum tree depth for every forest.
        :param maxBins: Maximum number of bins for every forest.
        :param numTrees: Number of trees for every forest.
        :param regParam: Accepted for interface compatibility; not used.
        :param featuresCol: Name of the feature-vector column.
        :param ignoreCols: Columns that are neither features nor labels.
            The list is only read, never mutated.
        :return: None.
        """
        self.featuresCol = featuresCol
        # Exclude the column named by featuresCol rather than a hard-coded
        # "features", and tolerate ignore columns that are absent from df.
        self.labelCols = [
            c for c in df.columns
            if c != featuresCol and c not in ignoreCols
        ]
        self.models = []

        for c in self.labelCols:
            classifier = RandomForestClassifier(featuresCol=featuresCol,
                                                labelCol=c,
                                                predictionCol=c + "_pred",
                                                probabilityCol=c + "_prob",
                                                rawPredictionCol=c + "_rpred",
                                                maxDepth=maxDepth,
                                                maxBins=maxBins,
                                                impurity="gini",
                                                numTrees=numTrees,
                                                seed=None)
            self.models.append(classifier.fit(df))
Esempio n. 18
0
def LearningCurve(df, target):
    """Compute learning-curve points for a random forest on ``df``.

    String columns other than ``target`` are string-indexed, all predictors
    are assembled into ``features``, ``target`` is indexed into ``label``,
    and the data is split 70/30 with a fixed seed. A 200-tree forest is then
    trained on 10%, 20%, ..., 100% samples of the training split.

    :param df: Input DataFrame.
    :param target: Name of the column to predict.
    :return: List of ``[percent, train_error, test_error]`` entries, where
        each error is ``1 - accuracy``.
    """
    string_cols = [
        name for (name, dtype) in df.dtypes
        if dtype == 'string' and name != target
    ]
    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]

    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    df_t = Pipeline(stages=indexers).fit(df).transform(df)

    assembler_features = VectorAssembler(inputCols=num_cols + encoded_cols,
                                         outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    pipeline = Pipeline(stages=[assembler_features, labelIndexer])
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()
    trainingData, testData = df_t.randomSplit([0.7, 0.3], seed=0)

    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy")
    plot_points = []

    # Variable to be adjusted for increment in data%
    step_var = 10

    # The training size does not change between iterations; count it once.
    training_count = trainingData.count()

    for i in range(step_var, 101, step_var):
        # Floor division keeps the sample size an int on Python 3 as well;
        # "/" would hand takeSample a float.
        sample_size = (i * training_count) // 100
        part_Data = trainingData.rdd.takeSample(False, sample_size, seed=i)
        part_Data = sqlContext.createDataFrame(part_Data)

        model = rf.fit(part_Data)

        # Calculating train error
        train_error = 1 - evaluator.evaluate(model.transform(part_Data))

        # Calculating test error
        test_error = 1 - evaluator.evaluate(model.transform(testData))

        plot_points.append([i, train_error, test_error])

    return plot_points
Esempio n. 19
0
def random_forest(train, test, numTrees, impurity):
    """Train a seeded random forest and score it on ``test``.

    :param train: Training DataFrame with ``features`` and ``label`` columns.
    :param test: DataFrame to score.
    :param numTrees: Number of trees in the forest.
    :param impurity: Impurity criterion passed to the classifier.
    :return: The value produced by a default ``BinaryClassificationEvaluator``.
        NOTE(review): per the Spark docs that default metric is
        areaUnderROC, not accuracy - confirm before relying on the name.
    """
    # Train the model.
    classifier = RandomForestClassifier(featuresCol="features",
                                        labelCol="label",
                                        numTrees=numTrees,
                                        impurity=impurity,
                                        seed=13)
    fitted = classifier.fit(train)
    scorer = BinaryClassificationEvaluator()
    return scorer.evaluate(fitted.transform(test))
Esempio n. 20
0
def rf(df):
    """Train a 100-tree random forest, save it, and score the held-out split.

    ``df`` is split 70/30 with a fixed seed. The fitted model is written to
    ``s3a://ffinsight/model_rf`` before the test part is transformed.

    :param df: DataFrame with ``label`` and ``features`` columns.
    :return: Predictions DataFrame for the 30% test split.
    """
    train_part, test_part = df.randomSplit([0.7, 0.3], seed=0)
    classifier = RF(labelCol='label', featuresCol='features', numTrees=100)
    model = classifier.fit(train_part)
    model.save("s3a://ffinsight/model_rf")
    return model.transform(test_part)
Esempio n. 21
0
def prediction(dataset):
    """Fit a depth-5 random forest on 80% of ``dataset`` and score the rest.

    The split is unseeded, so results differ between runs.

    :param dataset: DataFrame with ``features`` and ``Survived`` columns.
    :return: Predictions DataFrame for the 20% test split.
    """
    train_part, test_part = dataset.randomSplit([0.8, 0.2])
    classifier = RandomForestClassifier(labelCol='Survived',
                                        featuresCol='features',
                                        maxDepth=5)
    return classifier.fit(train_part).transform(test_part)
Esempio n. 22
0
def predicted():
    """Fit a random forest on the module-level ``df`` and score the same rows.

    ``Fare``, ``Pclass`` and ``Age`` are the features and ``Survived`` is the
    label. Rows with a null in any of those four columns are dropped.

    :return: DataFrame of the training rows with the model's output columns.
    """
    feature_names = ['Fare', 'Pclass', 'Age']
    complete_rows = df.select('Fare', 'Pclass', 'Age', 'Survived').dropna()
    feat_df = VectorAssembler(inputCols=feature_names,
                              outputCol='features').transform(complete_rows)
    classifier = RandomForestClassifier(featuresCol='features',
                                        labelCol='Survived')
    return classifier.fit(feat_df).transform(feat_df)
def run_random_forest_algorithm(tn_data, ts_data):
    """Fit a random forest on ``tn_data`` and report how it does on ``ts_data``.

    :param tn_data: Training DataFrame with ``scaled_features`` and
        ``output`` columns.
    :param ts_data: DataFrame to score.
    :return: None; the predictions are handed to ``print_perf_eval``.
    """
    classifier = RandomForestClassifier(featuresCol="scaled_features",
                                        labelCol="output",
                                        predictionCol="prediction")
    scored = classifier.fit(tn_data).transform(ts_data)

    print_perf_eval(scored)
Esempio n. 24
0
 def random_forests(self):
     """Fit a random forest for ``temperature`` and return feature importances.

     The training data comes from ``self.select_feature()``, narrowed to the
     ``features`` and ``temperature`` columns. The importances are printed as
     well as returned.
     """
     training_df = self.select_feature().select('features', 'temperature')
     classifier = RandomForestClassifier(labelCol='temperature',
                                         featuresCol='features')
     importances = classifier.fit(training_df).featureImportances
     print(importances)
     return importances
Esempio n. 25
0
def test_sklearn_random_forest_multiclass():
    """Pin the first SHAP value per class for an sklearn forest on iris.

    Class 2 is folded into class 1 before training, so the model has two
    output classes.
    """
    import shap
    from sklearn.ensemble import RandomForestClassifier

    X, y = shap.datasets.iris()
    y[y == 2] = 1
    forest = RandomForestClassifier(n_estimators=100,
                                    max_depth=None,
                                    min_samples_split=2,
                                    random_state=0)
    forest.fit(X, y)

    shap_values = shap.TreeExplainer(forest).shap_values(X)

    # First sample, first feature: about +0.05 for class 0, -0.05 for class 1.
    first_class0 = shap_values[0][0, 0]
    first_class1 = shap_values[1][0, 0]
    assert np.abs(first_class0 - 0.05) < 1e-3
    assert np.abs(first_class1 + 0.05) < 1e-3
Esempio n. 26
0
def doRandomForestClassification(filename):
    """Run the random-forest job for ``filename``, reporting progress.

    Progress is written to the ``spark-jobs`` index through the module-level
    ``es`` client under the module-level ``task_id``. The steps are: read the
    file, index and assemble the columns through the project helpers, split
    into train and test, fit the classifier, and evaluate it.

    :param filename: Path handed to ``openfile``.
    :return: Tuple ``(accuracy, predictions, rfModel)``, where ``accuracy``
        is whatever the evaluator built by ``binaryClassificationEvaluator``
        returns.
    """
    def _report(current, status):
        # Publish one progress update for this job.
        es.update(index='spark-jobs',
                  doc_type='job',
                  id=task_id,
                  body={'doc': {
                      'current': current,
                      'status': status
                  }})

    stages = []
    _report(30, 'Reading file..')
    df = openfile(filename)

    _report(40, 'Mapping..')
    categoricalColumns = checkCategoricalColumns(df)
    numericColumns = checkNumericColumns(df)
    cols = allColumns(df)
    stages, df = indexInputColumns(categoricalColumns, stages, df)
    stages, df = indexOutputColumn(stages, 'deposit', df)
    stages, df = vectorAsFeatures(categoricalColumns, numericColumns, stages,
                                  df)
    selectedCols, df = pipelane(df, stages, cols)

    _report(50, 'Splitting data to train and test..')
    train, test = splitDataToTrainAndTest(df)

    _report(60, 'Training model..')
    rf = RandomForestClassifier(featuresCol='features', labelCol='label')
    rfModel = rf.fit(train)
    predictions = rfModel.transform(test)
    # NOTE(review): the result of this select is discarded. It is kept
    # unchanged in case callers rely on it failing when a column is missing.
    predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                       'probability')

    _report(80, 'Calculating accuracy..')
    evaluator = binaryClassificationEvaluator(predictions)
    accuracy = evaluator.evaluate(predictions)
    return accuracy, predictions, rfModel
Esempio n. 27
0
def trainModal(training_data):
    """Fit the wine-quality style random forest on ``training_data``.

    :param training_data: DataFrame with ``features`` and ``quality`` columns.
    :return: The fitted model (40 trees, depth 15, 25 bins).
    """
    hyper_params = dict(labelCol='quality',
                        featuresCol='features',
                        maxDepth=15,
                        maxBins=25,
                        numTrees=40)
    # Fit the configured estimator on the training data.
    return RandomForestClassifier(**hyper_params).fit(training_data)
def random_Forest(train_data, test_data):
    """Fit a default random forest and print its score on ``test_data``.

    :param train_data: Training DataFrame using the classifier's default
        label and features columns.
    :param test_data: DataFrame to score.
    :return: None; the score is printed.
        NOTE(review): the value comes from a ``BinaryClassificationEvaluator``
        with its default metric, which per the Spark docs is areaUnderROC
        rather than accuracy - confirm.
    """
    # Create initial Random Forest model
    print("Accuracy of Random Forest Classifier :")
    rf = RandomForestClassifier()
    model = rf.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    accuracy = evaluator.evaluate(predictions)
    # Function-call form works on Python 2 and 3 and matches the print above.
    print("The  accuracy = %g" % accuracy)
Esempio n. 29
0
def random_forest(data, test_data):
    """Fit a 200-tree random forest on ``data`` and score ``test_data``.

    The ``label`` column of ``data`` is string-indexed into ``indexed``,
    which is the label the forest trains on.

    :param data: Training DataFrame with ``label`` and ``features`` columns.
    :param test_data: DataFrame to score; it is transformed as given.
    :return: Predictions DataFrame for ``test_data``.
    """
    label_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    indexed_data = label_indexer.fit(data).transform(data)

    classifier = RandomForestClassifier(numTrees=200,
                                        maxDepth=20,
                                        labelCol="indexed",
                                        featuresCol='features',
                                        seed=42)
    return classifier.fit(indexed_data).transform(test_data)
Esempio n. 30
0
def Train_Model(Training_Dataset, Model_Type):
    """Split the data 70/30 and fit the model selected by ``Model_Type``.

    :param Training_Dataset: DataFrame with ``label`` and ``features``
        columns.
    :param Model_Type: ``"LR"`` for logistic regression, ``"LRCV"`` for
        logistic regression tuned with 5-fold cross-validation; any other
        value selects a random forest.
    :return: Tuple ``(model, testData)``: the fitted model and the held-out
        30% split.
    """
    # A fixed seed keeps the split reproducible.
    trainingData, testData = Training_Dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    if Model_Type == "LR":
        from pyspark.ml.classification import LogisticRegression

        estimator = LogisticRegression(maxIter=20, regParam=0.3,
                                       elasticNetParam=0)

    elif Model_Type == "LRCV":
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        # Evaluator that scores every candidate during cross-validation.
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction")

        # Base estimator whose parameters are searched.
        lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

        # Search grid: regularization strength x elastic-net mixing
        # (0.0 is pure ridge).
        grid_builder = ParamGridBuilder()
        grid_builder = grid_builder.addGrid(lr.regParam, [0.1, 0.3, 0.5])
        grid_builder = grid_builder.addGrid(lr.elasticNetParam,
                                            [0.0, 0.1, 0.2])
        paramGrid = grid_builder.build()

        # 5-fold cross-validator over the grid.
        estimator = CrossValidator(estimator=lr,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=evaluator,
                                   numFolds=5)

    else:
        from pyspark.ml.classification import RandomForestClassifier

        estimator = RandomForestClassifier(labelCol="label",
                                           featuresCol="features",
                                           numTrees=100,
                                           maxDepth=4,
                                           maxBins=32)

    # Every branch trains on the same training split.
    model = estimator.fit(trainingData)

    return model, testData
Esempio n. 31
0
def test_single_row_random_forest():
    """Check SHAP additivity when explaining one row of the adult data set."""
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    import sklearn

    split = train_test_split(*shap.datasets.adult(),
                             test_size=0.2,
                             random_state=0)
    X_train, X_test, Y_train, _ = split

    clf = RandomForestClassifier(random_state=202,
                                 n_estimators=10,
                                 max_depth=10)
    clf.fit(X_train, Y_train)
    predicted = clf.predict_proba(X_test)

    ex = shap.TreeExplainer(clf)
    # Explain a single row, passed as a 1-D Series rather than a frame.
    first_row = X_test.iloc[0, :]
    shap_values = ex.shap_values(first_row)

    # Class-0 contributions plus the expected value must reproduce the
    # predicted class-0 probability of that row.
    residual = shap_values[0].sum() + ex.expected_value[0] - predicted[0,0]
    assert np.abs(residual) < 1e-4, \
        "SHAP values don't sum to model output!"
def RF_model(td, n, m, s=50):
    """Fit a random forest on ``td`` after casting ``label`` to double.

    :param td: Training DataFrame with a ``label`` column.
    :param n: Number of trees.
    :param m: Maximum tree depth.
    :param s: Random seed (default 50).
    :return: The fitted model.
    """
    label_as_double = td["label"].cast(DoubleType())
    training = td.withColumn("label", label_as_double)
    classifier = RandomForestClassifier(numTrees=n,
                                        maxDepth=m,
                                        maxBins=32,
                                        labelCol="label",
                                        seed=s)
    return classifier.fit(training)
Esempio n. 33
0
def transform_predictions(dataframe, spark):
    """Train four classifiers on blood-test features and return accuracies.

    The three admission columns are dropped, missing values are handled by
    ``dismiss_missing_values``, nine blood-count columns are assembled into
    ``features``, and the data is split 80/20 with a fixed seed. A random
    forest, a decision tree, a logistic regression and a gradient-boosted
    tree model are each fitted to predict ``SARS-Cov-2 exam result``.

    :param dataframe: Input DataFrame.
    :param spark: Session used to build the result DataFrame.
    :return: Single-column float DataFrame with the test accuracies in the
        order random forest, decision tree, logistic regression, GBT.
    """
    label_col = 'SARS-Cov-2 exam result'
    admission_cols = (
        "Patient addmited to regular ward (1=yes, 0=no)",
        "Patient addmited to semi-intensive unit (1=yes, 0=no)",
        "Patient addmited to intensive care unit (1=yes, 0=no)",
    )
    df_transformed_no_missing = dismiss_missing_values(
        dataframe.drop(*admission_cols))

    # Columns that form the modelling data set.
    outcome_features = [label_col]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets',
                         'Eosinophils', 'Red blood Cells', 'Lymphocytes',
                         'Leukocytes', 'Basophils', 'Monocytes']

    assembler = VectorAssembler(inputCols=required_features,
                                outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)

    # Train/test split.
    training_data, test_data = model_data.randomSplit([0.8, 0.2], seed=2020)

    def accuracy_of(predictions):
        # Each model is scored by a fresh accuracy evaluator on the label.
        scorer = MulticlassClassificationEvaluator(labelCol=label_col,
                                                   metricName='accuracy')
        return scorer.evaluate(predictions)

    # Random forest
    rf_model = RandomForestClassifier(labelCol=label_col,
                                      featuresCol='features',
                                      maxDepth=5).fit(training_data)
    rf_accuracy = accuracy_of(rf_model.transform(test_data))

    # Decision tree (also previews the first 10 scored rows)
    dt_model = DecisionTreeClassifier(featuresCol='features',
                                      labelCol=label_col,
                                      maxDepth=3).fit(training_data)
    dt_predictions = dt_model.transform(test_data)
    dt_predictions.select(outcome_features + required_features).show(10)
    dt_accuracy = accuracy_of(dt_predictions)

    # Logistic regression
    lr_model = LogisticRegression(featuresCol='features',
                                  labelCol=label_col,
                                  maxIter=10).fit(training_data)
    lr_accuracy = accuracy_of(lr_model.transform(test_data))

    # Gradient-boosted trees
    gb_model = GBTClassifier(labelCol=label_col,
                             featuresCol='features').fit(training_data)
    gb_accuracy = accuracy_of(gb_model.transform(test_data))

    accuracies = [rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy]
    rdd = spark.sparkContext.parallelize(accuracies)
    return spark.createDataFrame(rdd, FloatType())
def testClassification(train, test):
    """Fit a small random forest and print multiclass metrics on ``test``.

    :param train: Training DataFrame with an ``indexedLabel`` column and a
        features column under the classifier's default name.
    :param test: DataFrame to score; must also contain ``indexedLabel``.
    :return: None; weighted F-measure, precision and recall are printed.
    """
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    # MulticlassMetrics takes an RDD of (prediction, label) pairs.
    # DataFrame.map is not available on Spark >= 2.0, so map over the
    # underlying RDD, which works on old and new versions alike.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
# In[509]:

# Fit a 15-component PCA on the training features and project both the
# training and the test set into that space (new column "pca").
pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)

train_df = pca.transform(train_df)
test_df = pca.transform(test_df)


# ## Classification algorithms

# In[ ]:

# Random forest trained on the PCA projection; the commented-out line below
# is the variant that trains on the raw "features" column instead.
rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000)
#rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results

# In[ ]:

# Lookup tables between label strings and outcome names.
# NOTE(review): presumably the keys match the values of "indexedResult" -
# confirm against the indexer that produced that column.
label_to_str_map = {'2': 'HOME', '1': 'DRAW', '0': 'AWAY'}
str_to_labelmap = {'HOME': '2', 'DRAW': '1', 'AWAY': '0'}
# Score the test set, keeping the team names, the three B365* columns, the
# class probabilities and the true label.
predictions = model.transform(test_df).select("home_name", "away_name", "B365A", "B365D", "B365H", "probability", 
                                              "indexedResult")

# Test-set size plus counters initialised for the loop that follows.
length = test_df.count()
correct = 0
total_profit = 0
for prediction in predictions.collect():
# MAGIC %md
# MAGIC ####Random Forest
# MAGIC 
# MAGIC Random Forest uses an ensemble of decision trees to improve model accuracy.
# MAGIC 
# MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests).

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest model that reads the "features" column and
# learns the "label" column; every other parameter is left at its default.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Train model with Training Data
rfModel = rf.fit(trainingData)

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
predictions = rfModel.transform(testData)

# COMMAND ----------

# Print the schema of the predictions DataFrame.
predictions.printSchema()

# COMMAND ----------

# View model's predictions and probabilities of each prediction class,
# next to two of the original input columns ("age", "occupation").
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
# display() is not defined in this snippet; it is provided by the notebook environment.
display(selected)
Esempio n. 37
0
#                 1.],
#             [   1.,    0.,    1.,    0.,    2.,  183.,    0.,    1.,    0.,
#                 1.],
#             [   1.,    0.,    0.,    0.,    0.,    0.,  192.,    1.,    1.,
#                 0.],
#             [   0.,    0.,    0.,    0.,    0.,    0.,    1.,  187.,    5.,
#                 0.],
#             [   0.,    1.,    2.,    0.,    0.,    0.,    1.,    5.,  172.,
#                 4.],
#             [   0.,    0.,    0.,    0.,    3.,    0.,    0.,    2.,    2.,
#               176.]])

#section 8.3.2
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(maxDepth=20)
rfmodel = rf.fit(pendttrain)
# RandomForestModel doesn't expose trees field in Python
rfpredicts = rfmodel.transform(pendtvalid)
# Build (prediction, label) pairs for MulticlassMetrics. DataFrame.map was
# removed in Spark 2.0; going through .rdd works on both Spark 1.x and 2.x+.
rfresrdd = rfpredicts.select("prediction", "label").rdd.map(lambda row:  (row.prediction, row.label))
rfmm = MulticlassMetrics(rfresrdd)
# Result is only visible when run interactively (value shown below).
rfmm.precision()
#0.9894640403114979
print(rfmm.confusionMatrix())
#DenseMatrix([[ 211.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,  220.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,    1.,  211.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,    0.,    0.,  175.,    1.,    0.,    0.,    0.,    0.,
#                 0.],
Esempio n. 38
0
    def _train_model_spark(self, data):
        """Fit the Spark model(s) selected by ``self._train_method`` and return them.

        ``data`` is converted to a Spark DataFrame with
        ``self._prepare_data_spark``. Two modes are supported:

        * ``self._train_method`` is a dict: one model is trained for
          ``CHANGE_AMOUNT`` (prediction column ``AmountPrediction``) and one
          for ``CHANGE_DIRECTION`` (prediction column ``DirPrediction``);
          ``self._model`` becomes a dict keyed by those two constants.
        * otherwise: a single model predicting ``TARGET_PRICE`` is trained
          and stored in ``self._model``.

        :param data: input data; its ``keys()`` must support ``difference``
            (presumably a pandas DataFrame -- confirm against the caller).
        :return: ``self._model`` after training.
        :raises ValueError: if a requested training method is not supported.
        """
        df = self._prepare_data_spark(data)
        # Every column that is not a label/price column counts as an input feature.
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            # Floor division keeps the layer size an int on Python 3 as well
            # (true division would produce a float layer size).
            self.ann_hidden_nodes_num = input_num // 2 + 1
        # Layer sizes: input, one hidden layer, output. The output size is
        # overwritten below depending on the task (1 = regression, 2 = direction).
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            # Stop the previous amount model before it is replaced.
            # NOTE(review): assumes the existing self._model is a dict from an
            # earlier dict-mode run -- confirm.
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            # --- Model for the amount of change (regression) ---
            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                # This wrapper is fitted in place; the wrapper itself is kept as the model.
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            # --- Model for the direction of change (classification) ---
            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                # NOTE(review): regParam reuses the *linear* regression
                # regularization setting -- confirm this is intended.
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            # --- Single model predicting the target price directly ---
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                # Stop the previous model before it is replaced.
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                # NOTE(review): epoch is hard-coded to 100 here, whereas the
                # dict-mode branch uses self.ann_epoch_number -- confirm.
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
def main(base_path):
  """Train, evaluate and persist the flight-delay random forest classifier.

  Reads ``{base_path}/data/simple_flight_delay_features_airplanes.json``,
  derives hour-of-day features, buckets ``ArrDelay`` into a class label,
  indexes the categorical columns and assembles a feature vector. A random
  forest is then trained and scored on ``split_count`` random train/test
  splits. The fitted bucketizer, string indexers, vector assembler and
  classifier are saved under ``{base_path}/models``. Metric averages and
  feature importances are printed and appended to pickle logs in the same
  directory, so each run can be compared with the previous one.

  :param base_path: project root containing the ``data`` and ``models``
      directories.
  """
  APP_NAME = "train_spark_mllib_model.py"

  # If there is no SparkSession, create the environment.
  # The lookup goes through globals(): `sc` and `spark` are assigned inside
  # this function, so referring to them by bare name before assignment would
  # always raise UnboundLocalError and a second SparkContext would be created
  # even when one already exists at module level.
  module_globals = globals()
  if "sc" in module_globals and "spark" in module_globals:
    spark = module_globals["spark"]
  else:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql

    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf

  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])

  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()

  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()

  #
  # Check for nulls in features before using Spark ML
  # (one count() job per column)
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay. The five split
  # points below define four buckets (0-3):
  # [-inf, -15), [-15, 0), [0, 30), [30, inf)
  #
  from pyspark.ml.feature import Bucketizer

  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )

  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

  #
  # Extract features with the tools in pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler

  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )

    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

    # Save the fitted indexer model for this column
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)

  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]

  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)

  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()

  #
  # Train and evaluate the classifier on `split_count` random test/train
  # splits, recording four metrics per split
  #

  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3

  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )

    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

    # Instantiate and fit random forest classifier on the training split
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)

    # Save the new model over the old one (each split overwrites the last)
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(test_data)

    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)

      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))

    #
    # Collect feature importances, paired with the assembler's input columns
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)

  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)

  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]

    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy

    std_accuracy = np.std(metric_scores)

    average_stds.append((metric_name, average_accuracy, std_accuracy))

  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

  #
  # Persist the score to a score log that exists between runs
  #
  import pickle

  # Load the score log or initialize an empty one.
  # NOTE: pickle must only be used on files this script wrote itself.
  score_log_filename = "{}/models/score_log.pickle".format(base_path)
  try:
    with open(score_log_filename, "rb") as score_log_file:
      score_log = pickle.load(score_log_file)
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []

  # Compute this run's score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }

  # Compute and display the change in score for each metric.
  # With no previous entry, compare the run with itself (all deltas are 0).
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry

  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))

  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))

  # Append this run's average scores to the log
  score_log.append(score_log_entry)

  # Persist the log for next run
  with open(score_log_filename, "wb") as score_log_file:
    pickle.dump(score_log, score_log_file)

  #
  # Analyze and report feature importance changes
  #

  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance

  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )

  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

  #
  # Compare this run's feature importances with the previous run's
  #

  # Load the feature importance log or initialize an empty one
  feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
  try:
    with open(feature_log_filename, "rb") as feature_log_file:
      feature_log = pickle.load(feature_log_file)
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []

  # Compute and display the change in score for each feature.
  # With no previous entry, compare the run with itself (all deltas are 0).
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance

  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta

  # Sort feature deltas, biggest change first
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )

  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

  # Append this run's average importances to the log
  feature_log.append(feature_importance_entry)

  # Persist the log for next run
  with open(feature_log_filename, "wb") as feature_log_file:
    pickle.dump(feature_log, feature_log_file)
Esempio n. 40
0
# Read the training CSV as text, drop every line equal to `titile` (defined
# elsewhere; presumably the header row -- confirm) and split each remaining
# line on commas.
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
map(lambda x:x.split(","))
# Modulus applied to the hashed feature values in helper1.
D = 2 ** 24 

def helper1(r):
    """Convert one split CSV row into a ``(label, features)`` pair.

    ``r[0]`` is the row id, ``r[-1]`` the target, and every field in between
    is turned into a number by hashing ``"VAR_<4-digit position><value>"``
    and reducing it modulo ``D``.

    Rows that cannot be parsed (non-numeric id or target, malformed fields)
    fall back to label ``0.0`` with an all-zero vector of 1932 entries, so
    the features column always holds a vector.

    NOTE(review): ``hash()`` of a string is salted per process on Python 3
    unless PYTHONHASHSEED is fixed -- confirm the executors share one seed.
    """
    try:
        features = [
            float(abs(hash("VAR_" + '{0:04}'.format(i) + value))) % D
            for i, value in enumerate(r[1:-1])
        ]
        target = float(r[-1])
        # The id itself is unused, but a non-numeric id still rejects the row.
        float(r[0])
        return target, Vectors.dense(features)
    except (ValueError, TypeError, IndexError):
        return (0.0, Vectors.dense([0.0] * 1932))
# Keep only rows with exactly 1934 fields (id + 1932 features + target).
new_rdd = rdd.filter(lambda i : len(i)==1934)
# Turn every row into a (label, features) pair and cache the result.
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
# Random 70/30 train/test split (unseeded).
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Index the label column; the indexer is fitted on the training split only.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)

# Train the forest on the indexed label, then score the test split.
rf = RandomForestClassifier(numTrees=50, maxDepth=25, labelCol="indexed", seed=42)
model = rf.fit(td)
# One output line per test row: "<label>,<first entry of the probability vector>".
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/rf_50_25")

# Check out the features
final_vectorized_features.show()

#
# Train and evaluate the classifier on a single random test/train split
#

# Test/train split (70/30, unseeded)
training_data, test_data = final_vectorized_features.randomSplit([0.7, 0.3])

# Instantiate and fit random forest classifier
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(
  featuresCol="Features_vec",
  labelCol="ArrDelayBucket",
  maxBins=4657,
  maxMemoryInMB=1024
)
model = rfc.fit(training_data)

# Evaluate model using test data
predictions = model.transform(test_data)

# Accuracy of the predictions against the ArrDelayBucket label
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="ArrDelayBucket", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {}".format(accuracy))

# Check a sample: ~0.1% of rows without replacement (seed 18), ordered by
# scheduled departure time, first 6 rows shown
predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
 'title_subjectivity',
 'title_sentiment_polarity',
 'abs_title_subjectivity',
 'abs_title_sentiment_polarity'],outputCol='features' )
# Add the assembled 'features' vector column to the data.
new_data = assembler.transform(data)


# Keep only the feature vector and the raw target ('shares').
final_data = new_data.select('features','shares')
from pyspark.ml.feature import QuantileDiscretizer
# Split 'shares' into two quantile buckets, giving a binary label in 'result'.
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
# Random 70/30 train/test split (unseeded).
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
# `result` is rebound here: from now on it holds the test-set predictions.
result = rfc_model.transform(test_data);
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# No metricName is given, so the evaluator's default metric (areaUnderROC) is
# reported -- despite the variable name, this is not an accuracy.
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
# Peek at the first test row (only displayed when run interactively).
test_data.head(1)


# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]])) 
#segregating the labels and features
selectData = transformDF.select("label","features","id")
#Creating RDD of (id, LabeledPoint) pairs
#DataFrame.map was removed in Spark 2.0; going through .rdd works on both Spark 1.x and 2.x+
lpSelectData = selectData.rdd.map(lambda x : (x.id, LabeledPoint(x.label,x.features)))
#Instantiating string indexer for random forest
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
#fitting the data in stringindexer
si_model = stringIndexer.fit(selectData)
#transforming the data
transformData = si_model.transform(selectData)
#Splitting the data for training and test
(trainingData, testData) = transformData.randomSplit([0.6, 0.4])
#instantiating Random forest model
randomForest = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
#training the model
randomForestModel = randomForest.fit(trainingData)
#transforming test data
result = randomForestModel.transform(testData)
#calculating the accuracy and printing it.
#The forest was trained on "indexed", so its predictions are StringIndexer
#indices and must be compared with "indexed", not with the raw "label".
accuracy = result.filter(result.indexed == result.prediction).count() / float(testData.count())
print("Accuracy = " + str(accuracy))