def clasificador_PerceptronMulticapa(dataFrame, capas, NumIter, TamLote):
    # split into training and test sets
    splits = dataFrame.randomSplit([0.7, 0.3], 1234)
    trainData = splits[0]
    testData = splits[1]

    # Specify the layers for the neural network:
    layers = capas
    # Create the network trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=NumIter,
                                             layers=layers,
                                             blockSize=TamLote,
                                             seed=1234)

    # Train the model
    model = trainer.fit(trainData)
    # compute accuracy on the test set
    result = model.transform(testData)
    predictionAndLabels = result.select('prediction', 'label')
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    accuracy = evaluator.evaluate(predictionAndLabels)
    print('Test Error = %g ' % (1.0 - accuracy))
    print('Accuracy = ', accuracy)

    # Compute AUC; note this feeds the hard 0/1 predictions in as the "raw"
    # scores, so it only approximates the true AUC
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    evaluation = evaluator.evaluate(model.transform(testData))
    print('AUC:', evaluation)
    print('Perceptron Multicapa: maxIter:' + str(NumIter) + ' Layers: ' +
          str(layers) + ' blockSize: ' + str(TamLote))
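
These snippets are excerpts from larger projects and omit their imports. A minimal preamble that covers most of them, assuming an active SparkSession bound to spark, would be:

from pyspark.ml.classification import (
    MultilayerPerceptronClassifier,
    MultilayerPerceptronClassificationModel,
)
from pyspark.ml.evaluation import (
    MulticlassClassificationEvaluator,
    BinaryClassificationEvaluator,
)
from pyspark.ml.linalg import Vectors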
Example #2
 def test_mlp_classification_summary(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                      (1.0, Vectors.dense([0.0, 1.0])),
                                      (1.0, Vectors.dense([1.0, 0.0])),
                                      (0.0, Vectors.dense([1.0, 1.0]))
                                      ],
                                     ["label", "features"])
     mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
     model = mlp.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary()
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertAlmostEqual(s.accuracy, 1.0, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertTrue(isinstance(sameSummary, MultilayerPerceptronClassificationSummary))
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Example #3
    def GetFScore(self, i, ratio):
        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext
        maldataset = sc.textFile("dataset.csv")
        trainHeader = maldataset.first()
        maldataset = maldataset.filter(lambda line: line != trainHeader
                                       ).mapPartitions(lambda x: csv.reader(x))
        maldataset = maldataset.map(lambda l: self.toint(l))
        df = maldataset.map(lambda l: (l[-1], Vectors.dense(l[0:-1])))
        # features must be a flat sequence, not wrapped in an extra list
        maldataset = maldataset.map(
            lambda line: LabeledPoint(line[-1], line[0:len(line) - 1]))
        trainData, testData = maldataset.randomSplit([ratio, 1 - ratio])
        if i > 0:
            return self.BC(trainData, testData, i)

        df = spark.createDataFrame(df.collect(), ["label", "features"])
        splits = df.randomSplit([ratio, 1 - ratio], 1234)
        train = splits[0]
        test = splits[1]
        mlp = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=[35, 100, 100],
                                             blockSize=1,
                                             seed=123)
        model = mlp.fit(train)
        result = model.transform(test)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        return evaluator.evaluate(predictionAndLabels)
Example #4
    def test_raw_and_probability_prediction(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(
            maxIter=100, layers=[4, 5, 4, 3], blockSize=128, seed=123
        )
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045]
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1e-4))
        # Use `assert_allclose` to show the value of `result.rawPrediction` in the assertion error
        # message
        np.testing.assert_allclose(
            result.rawPrediction,
            expected_rawPrediction,
            rtol=0.3,
            # Use the same default value as `np.allclose`
            atol=1e-08,
        )
Example #5
def TrainMLP(trainingData, testData, layers):
    # specify layers for the neural network:
    # input layer of size (features), two intermediate layers
    # and output of size (classes)

    # create the trainer and set its parameters
    mlp = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         blockSize=128)

    # train the model
    start = time.time()
    model = mlp.fit(trainingData)
    end = time.time()
    print('Training MLP model took', end - start)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g, accuracy = %g" % (1.0 - accuracy, accuracy))

    return model
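
A sketch of how TrainMLP might be called; the data path, split, and layer sizes are placeholders, and the function also assumes import time for its timing lines:

# hypothetical usage; adjust the path and layer sizes to your data
data = spark.read.format("libsvm").load("data/sample_multiclass_classification_data.txt")
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)
model = TrainMLP(trainingData, testData, layers=[4, 5, 4, 3])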
Example #6
def main(args):
    spark = SparkSession\
            .builder\
            .master(args[2])\
            .appName(args[1])\
            .getOrCreate()
    
    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])

    (trainingData, testData) = data.randomSplit([0.7, 0.3],seed=1234)

    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

    # train the model
    model = trainer.fit(trainingData)

    # make predictions on the test set (this example does not evaluate them)
    result = model.transform(testData)

    # appendTime is a timing helper defined elsewhere in the source project
    appendTime(sys.argv, start_computing_time)

    spark.stop()
Example #7
def mpc(ss, data, label_index, feature_indexs, project_url):
    # 1. Build the training set
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        # use the row's value at label_index, not the index itself
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = data.rdd.map(lambda x: func(x)).toDF()

    # 2. Train the model
    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None
    mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
    mpc_param.setSeed(1)
    mpc_param.setLayers([4, 2, 2])
    mpc_model = mpc_param.fit(training_set)

    # 3. Save the model
    model_path = project_url + '/model/multipleClassification/mpc'
    mpc_model.write().overwrite().save(model_path)

    # 4. Load the model back
    mpc2 = MultilayerPerceptronClassificationModel.load(model_path)

    # 5. Predict (show() returns None, so there is no point assigning it)
    mpc2.transform(training_set).select("prediction", "features").show()
Example #8
def clasificar_chi2():
    # Read the data and cast each column's values to float
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv", header=True).rdd
    rdd = rdd.map(lambda x: tuple(float(x[i]) for i in range(10)))

    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])
    # Build the vector assembler (features)
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ],
                                outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3,
                             featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Define the network architecture (hyperparameter)
    capas = [3, 4, 6, 2]

    # Build the trainer
    # Hyperparameter: maxIter
    entrenador = MultilayerPerceptronClassifier(featuresCol="featuresSelected",
                                                labelCol="P_Benigno",
                                                maxIter=1000,
                                                layers=capas)
    # Train the model
    modelo = entrenador.fit(df_training)

    # Validate the model
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")

    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show the counts of 0s and 1s in the predictions
    df_predictions.groupby('prediction').count().show()
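
The same predictions can be scored with other metrics by swapping metricName; a small sketch using F1, reusing the column names above (assumes df_predictions is still in scope):

evaluador_f1 = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                 predictionCol="prediction",
                                                 metricName="f1")
print(f"F1: {evaluador_f1.evaluate(df_predictions)}")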
Example #9
def neuralNetwork_model(train, x, y, feature_count):
    layers = [feature_count, feature_count * 3, feature_count * 2, 2]
    mlp = MultilayerPerceptronClassifier(featuresCol=x,
                                         labelCol=y,
                                         maxIter=100,
                                         layers=layers,
                                         blockSize=512,
                                         seed=12345)
    mlpModel = mlp.fit(train)
    return mlpModel
Example #10
def _get_mlp_model(feat_train):
    from pyspark.ml.classification import MultilayerPerceptronClassifier
    global num_features
    layers = [num_features, 10, 10, 2]
    mlp_trainer = MultilayerPerceptronClassifier(maxIter=10,
                                                 layers=layers,
                                                 seed=123,
                                                 stepSize=0.005,
                                                 solver='gd',
                                                 featuresCol="features",
                                                 labelCol="label")
    mlp_model = mlp_trainer.fit(feat_train)
    return mlp_model
Example #11
def make_model(train,val):
	layers = [100, 100, 2]
	# create the trainer and set its parameters
	trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
	model = trainer.fit(train)
	result = model.transform(val)
	predictionAndLabels = result.select("prediction", "label")
	#predictionAndLabels.where(predictionAndLabels['prediction'] == 0 ).show()
	evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
	print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

	#save model
	mlp_path = "s3://projfakenews/mlp"
	model.save(mlp_path)
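
The saved model can be restored later with the matching model class; a minimal sketch, assuming the same S3 path is reachable and a validation DataFrame like val is in scope:

from pyspark.ml.classification import MultilayerPerceptronClassificationModel

reloaded = MultilayerPerceptronClassificationModel.load("s3://projfakenews/mlp")
reloaded.transform(val).select("prediction", "label").show()

Note that model.save(path) fails if the path already exists; Example #7 uses model.write().overwrite().save(path) to sidestep that.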
Example #12
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    # PREDICTION, FEATURES and LABEL are column-name constants from the source module
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
Example #13
def perceptron_multicapa(train, test, capas, num_iter, tamlot):

  layers = capas

  trainer = MultilayerPerceptronClassifier(
        maxIter=num_iter, layers=layers, blockSize=tamlot, seed=13)
  # Train the model
  model = trainer.fit(train)
  # compute accuracy on the test set
  result = model.transform(test)
  predictionAndLabels = result.select('prediction', 'label')
  evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
  accuracy = evaluator.evaluate(predictionAndLabels)

  return accuracy
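
Because perceptron_multicapa returns the accuracy, it lends itself to a small architecture sweep; a hedged sketch in which train, test, and the layer sizes are placeholders:

# hypothetical sweep; the first and last sizes must match your feature
# and class counts
for capas in ([4, 5, 3], [4, 8, 8, 3]):
    acc = perceptron_multicapa(train, test, capas, num_iter=100, tamlot=128)
    print(capas, acc)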
Example #14
def MLPclf(trainingData, testData):

    # NOTE: `layers` must be defined in the enclosing scope before calling this
    mlp = MultilayerPerceptronClassifier().setFeaturesCol(
        "features").setLabelCol("label").setLayers(layers).setSolver(
            "gd").setStepSize(0.3).setMaxIter(1000)

    mlpModel = mlp.fit(trainingData)
    results = mlpModel.transform(testData)

    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedMLP_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    # print(evaluate(label,predict))
    return evaluate(label, predict)
Example #15
    def test_raw_and_probability_prediction(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [57.3955, -124.5462, 67.9943]
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
        self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
Example #16
def run_nlp(train_df, test_df):
    # len(features)
    # layers = [21, 80, 3]
    layers = [train_df.schema["features"].metadata["ml_attr"]["num_attrs"], 160, 150, 50, 10, 2]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(featuresCol='features', labelCol='label', predictionCol='prediction',
                                             maxIter=200, layers=layers, stepSize=0.0003, blockSize=30, tol=0.00001,
                                             seed=seed)
    # train the model
    model = trainer.fit(train_df)
    # compute accuracy on the test set
    predictTest = model.transform(test_df)
    predictionAndLabels = predictTest.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy of MLP = " + str(evaluator.evaluate(predictionAndLabels)))
    return model
Example #17
    def test_multilayer_load(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                         (1.0, Vectors.dense([0.0, 1.0])),
                                         (1.0, Vectors.dense([1.0, 0.0])),
                                         (0.0, Vectors.dense([1.0, 1.0]))],
                                        ["label", "features"])

        mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
        model = mlp.fit(df)
        self.assertEqual(model.getSolver(), "l-bfgs")
        transformed1 = model.transform(df)
        path = tempfile.mkdtemp()
        model_path = path + "/mlp"
        model.save(model_path)
        model2 = MultilayerPerceptronClassificationModel.load(model_path)
        self.assertEqual(model2.getSolver(), "l-bfgs")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))
Example #18
def MLP_train(training):
    """
        Input : 
            normalized tweet-term format training set
        Output : 
            Neural Network training model
    """

    num_cols = training.select(
        'features').collect()[0].features.size  #vocabulary size
    layers = [num_cols, 100, 2]
    MLP_trainer = MultilayerPerceptronClassifier(maxIter=100,
                                                 layers=layers,
                                                 blockSize=128,
                                                 seed=1234)
    model = MLP_trainer.fit(training)

    return model
Example #19
def train_model(train_data, num_features):
    """Train the multilayer perceptron model.

    Params:
    - train_data (pyspark.sql.DataFrame): The training dataset partition
    - num_features (int): The number of features

    Returns:
    - model (pyspark.ml.classification.MultilayerPerceptronClassificationModel): The trained MLP model
    """
    multilayer_perceptron = MultilayerPerceptronClassifier(
        blockSize=1,
        featuresCol="presence_feature_set",
        labelCol="label",
        predictionCol="prediction",
        layers=[num_features, 100, 50, 10, 2])
    model = multilayer_perceptron.fit(train_data)
    return model
Example #20
def multilayer_perceptron_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)

    train, test = data.randomSplit([0.7, 0.3], 1234)
    layers = [len(comment_preprocessed[0].features), 11, 2]
    # sqrt(2000) = 45, sqrt(4000) = 63, log(2000, 2) = 11
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=128,
                                             seed=1234)
    model = trainer.fit(train)
    predictions = model.transform(test)
    evaluate_classification(predictions)

    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
Example #21
def naiveBayeseian():
    # (despite its name, this function trains a multilayer perceptron)

    def parseLine(line):
        keys = [float(x) for x in line.split(",")]
        return (keys[0], Vectors.dense(keys[1:]))

    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    # the ML API needs a DataFrame with label/features columns, not a bare RDD
    data = scdata1.map(parseLine).toDF(["label", "features"])
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    # note: the "precision" metric name was dropped in later Spark releases;
    # use "accuracy" or "weightedPrecision" there
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
Example #22
def multilayer_perceptron_classifier(trainingDataFrame,
                                     maxIter=100,
                                     tol=1e-6,
                                     seed=None,
                                     layers=None,
                                     blockSize=128,
                                     stepSize=0.03,
                                     solver="gd",
                                     initialWeights=None):
    mlp = MultilayerPerceptronClassifier(maxIter=maxIter,
                                         tol=tol,
                                         seed=seed,
                                         layers=layers,
                                         blockSize=blockSize,
                                         stepSize=stepSize,
                                         solver=solver,
                                         initialWeights=initialWeights)
    mlpModel = mlp.fit(trainingDataFrame)
    result = {}
    result["model"] = mlpModel
    return result
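
Since the helper wraps the fitted model in a dict, a caller might use it like this (a sketch; trainingDataFrame and the layer sizes are placeholders, and the DataFrame needs label/features columns):

result = multilayer_perceptron_classifier(trainingDataFrame, layers=[4, 5, 4, 3])
model = result["model"]
model.transform(trainingDataFrame).select("prediction", "label").show()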
Example #23
def train_evaluate(train, test, hidden_layers, num_columns, num_classes,
                   labelCol):
    # specify layers for the neural network:
    layers = [num_columns - 1, *hidden_layers, max(2, num_classes)]

    # create the trainer and set its parameters
    from pyspark.ml.classification import MultilayerPerceptronClassifier
    trainer = MultilayerPerceptronClassifier(labelCol=labelCol,
                                             maxIter=500,
                                             layers=layers)

    # train the model
    model = trainer.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", labelCol)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(labelCol=labelCol,
                                                  metricName="accuracy")
    return evaluator.evaluate(predictionAndLabels)
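
A sketch of sweeping hidden-layer configurations with train_evaluate; the splits and sizes are illustrative (num_columns counts the label column, hence the - 1 above):

# hypothetical grid; assumes a dataset of 5 columns (4 features + label)
for hidden in ([8], [16, 8]):
    acc = train_evaluate(train, test, hidden,
                         num_columns=5, num_classes=2, labelCol="label")
    print(hidden, acc)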
Example #24
def get_results(data):
    results = []
    for i in data:
        splits = i.randomSplit([0.8, 0.2], 1234)
        training = splits[0]
        testing = splits[1]
        training = training.toDF(["label", "features"])
        testing = testing.toDF(["label", "features"])
        numFeatures = training.take(1)[0].features.size

        # The first layer has to match the number of features in the data

        layers = [numFeatures, 4, 5, 2]

        trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers, blockSize=128, seed=1234)
        model = trainer.fit(training)
        result = model.transform(testing)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        answer = "Test set precision = " + str(evaluator.evaluate(predictionAndLabels)) + '\n'
        results.append(answer)
    return sc.parallelize(results)
Example #25
def FeedforwardNeuralNet(input_size):
    # specify layers for the neural network:
    # input layer of size input_size (features), two intermediate layers
    # of size 100 and 20, and output of size 2 (classes)
    layers = [input_size, 100, 20, 2]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=128,
                                             seed=1234)

    # train the model (`train` and `test` are expected to be defined globally)
    model = trainer.fit(train)
    model.write().overwrite().save("save/tencent2vec_nn")

    # compute accuracy on the test set
    result = model.transform(test)
    # result.select("prediction", "label").show(400)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " +
          str(evaluator.evaluate(predictionAndLabels)))
Example #26
def run(data, headers_feature):
    print('-*-*-*- Starting the neural network -*-*-*-')
    start_time = datetime.datetime.now()
    print('Start time', start_time)

    # Get the training and test data
    train_data, test_data = prepare_dataset(data, headers_feature)
    train_data.show()

    # Specify the layers for the neural network:
    # input of 5000 (features),
    # 5 hidden layers of 50 neurons each,
    # and output of 101 (classes)
    layers = [5000, 50, 50, 50, 50, 50, 101]

    # Configure the multilayer perceptron classifier
    mlpc = MultilayerPerceptronClassifier(
        maxIter=100, layers=layers, blockSize=128, seed=1234
    )

    # Fit the classification model
    mlpc_model = mlpc.fit(train_data)

    # print("Coeficientes: " + str(lr_model.coefficients))
    # print("Intercepto: " + str(lr_model.intercept))

    data_to_validate = mlpc_model.transform(test_data)

    # Validate prediction accuracy on the test data
    prediction = data_to_validate.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Precision de la prueba = " + str(evaluator.evaluate(prediction)))

    end_time = datetime.datetime.now()
    print('End time', end_time)
    print('Elapsed time for neural network', end_time - start_time)
Example #27
    rf = RandomForestClassifier(labelCol="label", featuresCol="features")
    rf_model = rf.fit(train_df)
    rf_predictions = rf_model.transform(test_df)
    rf_predictions.take(1)

# COMMAND ----------

from pyspark.ml.classification import MultilayerPerceptronClassifier

if enabled[3]:
    layers = [len(inputCols), 5, 4, 2]
    mp = MultilayerPerceptronClassifier(maxIter=100,
                                        layers=layers,
                                        labelCol="label",
                                        featuresCol="features")
    mp_model = mp.fit(train_df)
    mp_predictions = mp_model.transform(test_df)
    mp_predictions.take(1)

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")

accuracies = []

if enabled[0]:
    dt_accuracy = evaluator.evaluate(dt_predictions)
Example #28
    train = splits[0]
    test = splits[1]

    # specify layers for the neural network:
    # input layer of size 6 (features), one intermediate of size 10,
    # and output of size 2 (classes)
    layers = [6, 10, 2]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=128,
                                             seed=1234)

    # train the model
    model = trainer.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select('prediction', 'label')
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    print('Test set accuracy = ' +
          str(evaluator.evaluate(predictionAndLabels)))

    # Compute AUC (approximated here from the hard 0/1 predictions)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    evaluation = evaluator.evaluate(model.transform(test))
    print('AUC:', evaluation)

    # Stop the SparkContext
    sc.stop()
    # show() already prints the table and returns None, so don't wrap it in print()
    riskdata.stat.crosstab("bad", "reason").show()
    #################################################################
    # Multilayer Perceptron Classifier
    #################################################################

    # specify layers for the neural network:
    # input layer of size 10 (features), two intermediate of size 3 and 2
    # and output of size 2 (classes)
    layers = [10, 3, 2, 2]
    # create the trainer and set its parameters
    MLPtrainer = MultilayerPerceptronClassifier(maxIter = 100, layers = layers,
                                             labelCol = "bad", featuresCol = "predictors",
                                             predictionCol = "prediction", 
                                             blockSize = 1000, seed = 1234)
    # train the model
    MLP_model = MLPtrainer.fit(train)
    
    # compute precision on the test set
    MLP_result = MLP_model.transform(test)
    MLP_predictionAndLabels = MLP_result.select("prediction", "bad")
    MLP_evaluator = MulticlassClassificationEvaluator(metricName="precision")
    #print(MLP_model)
    #print(str(MLP_result.show())) # Print first 20 rows result to output file (plain text)

""""
    #################################################################
    # Decision Tree Classification
    #################################################################
    # Train a DecisionTree model.
    dt_model_spec = DecisionTreeClassifier(labelCol="bad", featuresCol="predictors")
    data = sqlContext.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")
    # Split the data into train and test
    
    data.show() 
    data.printSchema()
    data.select('features').show()
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    print (train.count())
    train.show()
    test = splits[1]
    
    
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    sc.stop()
                                    seed=1234)

# Get the models for each expert using the parameters of the best model defined above
print("Generating and training experts...")
start = time.time()
for expert in range(num_of_experts):

    train_data_experts, test_data_experts = dataframes[expert].randomSplit(
        [0.8, 0.2])

    trainer = MultilayerPerceptronClassifier(maxIter=iters,
                                             layers=layers,
                                             stepSize=lr,
                                             blockSize=128,
                                             seed=1234)
    model = trainer.fit(train_data_experts)
    dict_of_models[expert] = model

# Dictionary to store the predictions of the full dataset for each trained expert
dict_of_predictions = dict()

# Iterate through the expert and predict the values of each dataset
print("Generating predictions...")
for expert in range(num_of_experts):
    dict_of_predictions[expert] = dict_of_models[expert].transform(test_data)

# Create a pandas dataframe whose columns are each predictions of each expert
evaluations = pd.concat([
    dict_of_predictions[x].toPandas().prediction for x in range(num_of_experts)
],
                        axis=1)
       labelCol="indexed", featuresCol="pcaFeatures")
lrModel = lr.fit(trainingData)
#Predict on the test data
lrPredictions = lrModel.transform(testData)
lrPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(lrPredictions)

# COMPARE TO NEURAL NETWORK MULTILAYER PERCEPTRON
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [3, 25, 25, 2]
# layers = [input_dim, hidden layers, output_dim (number of classes)]
nn = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                    blockSize=128, seed=124,
                                    labelCol="indexed", featuresCol="pcaFeatures")
nnModel = nn.fit(trainingData)
#Predict on the test data
nnPredictions = nnModel.transform(testData)
nnPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(nnPredictions)
"""--------------------------------------------------
Modify the code above to:
    - train a logistic regression with the original vars (5% significant p-value)
    - from the selected vars above, train 2 logistic models with regParam = [0.01 and 0.5]
    - train 2 random forest (number of trees = 10 and 100)
    - compare results
"""

#Create the model
rmClassifer10 = RandomForestClassifier(labelCol="indexed", \
                featuresCol="pcaFeatures", numTrees=10)
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            self.ann_hidden_nodes_num = input_num / 2 + 1
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
Example #34
    #########                    Training and Test                    #########

    print("\n======================================================= ")
    print("==================== NEURAL NETWORK =================== ")
    print("=======================================================\n")

    print("\n================== Training ===================\n")

    # train the MLP model
    num_cols = rescaledData.select(
        'features').collect()[0].features.size  #vocabulary size
    layers = [num_cols, 100, 2]
    trainer_MLP = MultilayerPerceptronClassifier(maxIter=100,
                                                 layers=layers,
                                                 blockSize=128,
                                                 seed=1234)
    model_MLP = trainer_MLP.fit(rescaledData)
    print("Done : Neural Network Training")

    print("\n========= Test on Brexit labeled data =========\n ")

    #MLP
    result_MLP = model_MLP.transform(rescaled_test_df_brexit)
    predictionAndLabels = result_MLP.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy_MLP = evaluator.evaluate(predictionAndLabels)
    print("Accuracy MLP = " + str(accuracy_MLP))

    file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
    file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')
Example #35
    # define parameters
    input_layer = 200  # number of features
    output_layer = 10  # output 0~9
    hidden_1 = 150
    hidden_2 = 150
    layers = [input_layer, hidden_1, hidden_2, output_layer]

    MPC = MultilayerPerceptronClassifier(featuresCol='feature',
                                         labelCol='label',
                                         predictionCol='prediction',
                                         maxIter=400,
                                         layers=layers,
                                         blockSize=128,
                                         seed=123)

    model = MPC.fit(pca_train_result)

    result = model.transform(pca_test_result).select("label", "prediction")
    result_lp = result.selectExpr("label",
                                  "cast (prediction as int) prediction")
    final_result = result_lp.rdd
    count = final_result.count()

    # calculate the accuracy

    neutral_zero_value = 0

    def seqOp(a, b):
        if b[0] == b[1]:
            return a
        else: