Beispiel #1
0
def main(inputs):
    """Train and score colour-word classifiers on RGB and on LAB features.

    Reads the CSV at ``inputs`` with ``colour_schema``, splits it 75/25 into
    training and validation sets, fits one pipeline on the raw R/G/B columns
    and one on the LAB columns produced by the ``rgb2lab_query`` SQL
    statement, then plots and prints the validation score of each model.

    Args:
        inputs: Path handed to ``spark.read.csv``.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    # NOTE: the split is unseeded, so scores vary from run to run.
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Stages shared by both pipelines. The classifier is built once; the
    # original constructed an identical second instance that shadowed the first.
    word_indexer = StringIndexer(inputCol="word", outputCol="labelCol", handleInvalid='error')
    classifier = MultilayerPerceptronClassifier(maxIter=400, layers=[3, 30, 11], blockSize=1, seed=123, labelCol="labelCol")
    # No metricName is given, so the evaluator's default metric is reported.
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol", predictionCol="prediction")

    # Pipeline predicting RGB colours -> word.
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    predictions = rgb_model.transform(validation)
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))

    # Pipeline predicting LAB colours -> word; the SQL stage converts the
    # R/G/B columns and passes "word" through for the indexer.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol="features")
    lab_pipeline = Pipeline(stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)

    predictions_lab = lab_model.transform(validation)
    score_lab = evaluator.evaluate(predictions_lab)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for LAB model: %g' % (score_lab, ))
Beispiel #2
0
def main(inputs):
    """Train and score colour-word classifiers on RGB and on LAB features.

    Reads the CSV at ``inputs`` with ``colour_schema``, splits it 75/25, fits
    an RGB pipeline and a LAB pipeline on the same training split, and prints
    the accuracy of each on the same validation split.

    Args:
        inputs: Path handed to ``spark.read.csv``.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    # NOTE: unseeded split, so results differ between runs.
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Pipeline predicting RGB colours -> word.
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # Score the RGB model on the validation data.
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})

    print('Validation score for RGB model: %g' % (score, ))
    # NOTE(review): the LAB call below passes labelCol='word'; confirm which
    # column name plot_predictions expects here.
    plot_predictions(rgb_model, 'RGB', labelCol='target')

    # Pipeline predicting LAB colours -> word. The SQLTransformer stage does
    # the RGB -> LAB conversion inside the pipeline, so the pipeline is fitted
    # and evaluated on the raw RGB splits. The original converted the data
    # first and then ran the same conversion again as the first stage, on a
    # second, independent random split.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                     outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word",
                                  outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget",
                                                 layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(train)
    lrgb_validation = lrgb_model.transform(validation)
    # show() prints the rows itself and returns None, so it is not wrapped
    # in print().
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation,
                                {evaluator.metricName: "accuracy"})

    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def main(inputs):
    """Fit RGB and LAB colour-word classifiers and print their accuracy.

    The CSV at ``inputs`` is read with ``colour_schema`` and split 75/25 with
    a fixed seed. Each pipeline ends in a ``StringIndexer`` on "word" and a
    seeded multilayer perceptron; accuracy is measured on the validation
    split, plotted via ``plot_predictions`` and printed.

    Args:
        inputs: Path handed to ``spark.read.csv``.
    """
    banner = '-------------------------------------------------'
    frame = spark.read.csv(inputs, schema=colour_schema)
    train, validation = (
        part.cache() for part in frame.randomSplit([0.75, 0.25], seed=123))

    def fit_and_score(feature_stages):
        # Complete the pipeline with a fresh indexer and classifier, fit it
        # on the training split and return (fitted model, validation accuracy).
        tail = [
            StringIndexer(inputCol='word', outputCol='label'),
            MultilayerPerceptronClassifier(featuresCol='features',
                                           labelCol='label',
                                           layers=[3, 25, 25],
                                           seed=123),
        ]
        fitted = Pipeline(stages=list(feature_stages) + tail).fit(train)
        scored = fitted.transform(validation)
        accuracy_evaluator = MulticlassClassificationEvaluator(
            predictionCol='prediction', labelCol='label',
            metricName='accuracy')
        return fitted, accuracy_evaluator.evaluate(scored)

    # RGB colours -> word.
    rgb_model, score = fit_and_score(
        [VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')])
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print(banner)
    print('Validation score for RGB model: %g' % (score, ))
    print(banner)

    # LAB colours -> word; the SQL stage converts R/G/B and keeps "word".
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    lab_model, score = fit_and_score([
        SQLTransformer(statement=rgb_to_lab_query),
        VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                        outputCol='features'),
    ])
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print(banner)
    print('Validation score for LAB model:', score)
    print(banner)
def get_model(model_string='LogisticRegression'):
    """
    Get the desired model object for training and classification

    Only the requested estimator is instantiated; the others are kept as
    factories so that looking one model up does not construct all five.

    Args:
    model_string: one of 'LogisticRegression', 'DecisionTreeClassifier',
        'RandomForestClassifier', 'MultilayerPerceptronClassifier' or
        'NaiveBayes'.
    Returns:
    model object from pyspark.ml.classification
    Raises:
    KeyError: if ``model_string`` is not one of the names above.
    """
    model_factories = {
        'LogisticRegression':
        lambda: LogisticRegression(maxIter=20, regParam=0.3,
                                   elasticNetParam=0),
        'DecisionTreeClassifier':
        lambda: DecisionTreeClassifier(),
        'RandomForestClassifier':
        lambda: RandomForestClassifier(numTrees=10),
        # Deep Learning note: the number on neurons in the last layer needs to equal
        # the number of categories. The number of neurons in the first layer
        # needs to be equal to the vocabulary of count vectorizer
        'MultilayerPerceptronClassifier':
        lambda: MultilayerPerceptronClassifier(tol=1e-3,
                                               maxIter=10000,
                                               layers=[500, 100, 20, 6],
                                               blockSize=128,
                                               seed=1234),
        'NaiveBayes':
        lambda: NaiveBayes(),
    }
    return model_factories[model_string]()
def clasificador_PerceptronMulticapa(dataFrame, capas, NumIter, TamLote):
    """Train a multilayer perceptron and print its test accuracy and AUC.

    ``dataFrame`` is split 70/30 with a fixed seed; the network is trained on
    the first part and evaluated on the second. Nothing is returned; all
    results are printed.

    Args:
        dataFrame: DataFrame to split and train on. The evaluators read the
            'label' and 'prediction' columns.
        capas: Layer sizes passed as ``layers`` to the classifier.
        NumIter: Value passed as ``maxIter``.
        TamLote: Value passed as ``blockSize``.
    """
    # Split into training and test sets.
    trainData, testData = dataFrame.randomSplit([0.7, 0.3], 1234)

    # Layer specification for the neural network.
    layers = capas
    # Create the network trainer and set its parameters.
    trainer = MultilayerPerceptronClassifier(maxIter=NumIter,
                                             layers=layers,
                                             blockSize=TamLote,
                                             seed=1234)

    # Train the model.
    model = trainer.fit(trainData)
    # Compute accuracy on the test set.
    result = model.transform(testData)
    predictionAndLabels = result.select('prediction', 'label')
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    accuracy = evaluator.evaluate(predictionAndLabels)
    print('Test Error = %g ' % (1.0 - accuracy))
    print('Accuracy = ', accuracy)

    # Compute AUC, reusing the predictions instead of transforming the test
    # set a second time.
    # NOTE(review): the hard 'prediction' column is used as the raw score, as
    # in the original; confirm this is intended rather than 'rawPrediction'.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    evaluation = evaluator.evaluate(result)
    print('AUC:', evaluation)
    print('Perceptron Multicapa: maxIter:' + str(NumIter) + ' Layers: ' +
          str(layers) + ' blockSize: ' + str(TamLote))
    def GetFScore(self, i, ratio):
        """Score a classifier trained on ``dataset.csv``.

        The file's first line is treated as a header and dropped; every other
        line is parsed as CSV and passed through ``self.toint``. The last
        field of each row is the label, the remaining fields the features.

        Args:
            i: When greater than 0, the work is delegated to
                ``self.BC(trainData, testData, i)`` on ``LabeledPoint`` RDDs.
                Otherwise a multilayer perceptron is trained here.
            ratio: Fraction of rows used for training; the remainder is used
                for testing.

        Returns:
            The value returned by ``self.BC`` when ``i > 0``; otherwise the
            accuracy of the perceptron on the test split (despite the method
            name, the metric requested here is "accuracy").
        """
        spark = SparkSession.builder.getOrCreate()
        sc = spark.sparkContext
        lines = sc.textFile("dataset.csv")
        trainHeader = lines.first()
        rows = lines.filter(lambda line: line != trainHeader
                            ).mapPartitions(lambda x: csv.reader(x))
        rows = rows.map(lambda l: self.toint(l))

        if i > 0:
            # LabeledPoint data is only needed by self.BC.
            labeled = rows.map(
                lambda line: LabeledPoint(line[-1], [line[0:len(line) - 1]]))
            trainData, testData = labeled.randomSplit([ratio, 1 - ratio])
            return self.BC(trainData, testData, i)

        # Build the DataFrame straight from the RDD; the original collected
        # every row to the driver first.
        pairs = rows.map(lambda l: (l[-1], Vectors.dense(l[0:-1])))
        df = spark.createDataFrame(pairs, ["label", "features"])
        train, test = df.randomSplit([ratio, 1 - ratio], 1234)
        mlp = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=[35, 100, 100],
                                             blockSize=1,
                                             seed=123)
        model = mlp.fit(train)
        result = model.transform(test)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        return evaluator.evaluate(predictionAndLabels)
Beispiel #7
0
def mpc(ss, data, label_index, feature_indexs, project_url):
    """Train, save, reload and apply a multilayer perceptron classifier.

    Args:
        ss: Not used by this function.
        data: DataFrame whose rows are indexed with ``label_index`` and
            ``feature_indexs``.
        label_index: Index (or key) of the label field within each row.
        feature_indexs: Indices (or keys) of the feature fields within each
            row, in the order they are placed in the feature vector.
        project_url: Base path; the model is saved under
            ``<project_url>/model/multipleClassification/mpc``.

    Returns:
        None. The predictions on the training set are printed with ``show()``.
    """
    # 1. Build the training data set.
    def func(x):
        features_data = [x[feature] for feature in feature_indexs]
        # The label is the row's value at label_index; the original stored
        # the index itself, giving every row the same constant label.
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = data.rdd.map(func).toDF()

    # 2. Train the model.
    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None
    mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
    mpc_param.setSeed(1)
    # NOTE(review): fixed topology; assumes 4 features and 2 classes - confirm
    # against the callers' feature_indexs.
    mpc_param.setLayers([4, 2, 2])
    mpc_model = mpc_param.fit(training_set)

    # 3. Save the model.
    model_path = project_url + '/model/multipleClassification/mpc'
    mpc_model.write().overwrite().save(model_path)

    # 4. Load the model back.
    mpc2 = MultilayerPerceptronClassificationModel.load(model_path)

    # 5. Predict. show() returns None, so its result is not kept.
    mpc2.transform(training_set).select("prediction", "features").show()
def TrainMLP(trainingData, testData, layers):
    """Fit a multilayer perceptron, report timing and accuracy, return the model.

    Args:
        trainingData: DataFrame the classifier is fitted on.
        testData: DataFrame the fitted model is scored on.
        layers: Layer sizes passed to the classifier (input size, any hidden
            sizes, number of classes).

    Returns:
        The fitted model.
    """
    estimator = MultilayerPerceptronClassifier(
        maxIter=100, layers=layers, blockSize=128)

    # Time only the fit call.
    began = time.time()
    fitted = estimator.fit(trainingData)
    finished = time.time()
    print('Training MLP model took', finished - began)

    # Score the test set on accuracy of prediction vs. true label.
    scored = fitted.transform(testData)
    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    acc = accuracy_evaluator.evaluate(scored)
    print("Test Error = %g, accuracy = %g" % (1.0 - acc, acc))

    return fitted
Beispiel #9
0
    def test_raw_and_probability_prediction(self):
        """Check prediction, probability and rawPrediction of a fitted MLP.

        The model is trained on the sample multiclass libsvm data and applied
        to a single hand-built feature vector.
        """

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(
            maxIter=100, layers=[4, 5, 4, 3], blockSize=128, seed=123
        )
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045]
        # assertEqual, not assertTrue: assertTrue(a, b) uses b as the failure
        # message and would pass for any truthy prediction.
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1e-4))
        # Use `assert_allclose` to show the value of `result.rawPrediction` in the assertion error
        # message
        np.testing.assert_allclose(
            result.rawPrediction,
            expected_rawPrediction,
            rtol=0.3,
            # Use the same default value as `np.allclose`
            atol=1e-08,
        )
Beispiel #10
0
def get_data_transformers():
    """
    Creates Data Transformers
    :return: tokenizer, hasher, classifier
    :rtype: Tokenizer, HashingTF, MultilayerPerceptronClassifier
    """
    # Size of the hashed feature vector; it is also the size of the network's
    # input layer, so both are derived from this one value.
    num_features = 8
    # Tokenizer : Splits each name into words
    tokenizer = Tokenizer(inputCol="name", outputCol="words")
    # HashingTF : builds term frequency feature vectors from text data
    hasher = HTF(inputCol=tokenizer.getOutputCol(), outputCol="features",
                 numFeatures=num_features)
    # Layers for the neural network: input layer of size num_features,
    # three intermediate layers of size 5, 4 and 5, and an output layer of
    # size 2 (classes).
    layers = [num_features, 5, 4, 5, 2]
    # Network params
    maxIter = 20
    blockSize = 128
    seed = 1234
    # Creating the trainer and set its parameters
    classifier = MultilayerPerceptronClassifier(maxIter=maxIter,
                                                layers=layers,
                                                blockSize=blockSize,
                                                seed=seed)
    return tokenizer, hasher, classifier
Beispiel #11
0
 def test_mlp_classification_summary(self):
     """Check that a fitted MLP exposes a training summary with sane values.

     A two-feature, two-class data set is fitted, then the summary's
     attribute types and metric values are asserted, and ``model.evaluate``
     on the same data is checked to return a summary with the same accuracy.
     """
     df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                      (1.0, Vectors.dense([0.0, 1.0])),
                                      (1.0, Vectors.dense([1.0, 0.0])),
                                      (0.0, Vectors.dense([1.0, 1.0]))
                                      ],
                                     ["label", "features"])
     mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
     model = mlp.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary()
     # test that api is callable and returns expected types
     # (assertIsInstance reports the actual type when a check fails)
     self.assertIsInstance(s.predictions, DataFrame)
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertGreater(s.totalIterations, 0)
     self.assertIsInstance(s.labels, list)
     self.assertIsInstance(s.truePositiveRateByLabel, list)
     self.assertIsInstance(s.falsePositiveRateByLabel, list)
     self.assertIsInstance(s.precisionByLabel, list)
     self.assertIsInstance(s.recallByLabel, list)
     self.assertIsInstance(s.fMeasureByLabel(), list)
     self.assertIsInstance(s.fMeasureByLabel(1.0), list)
     self.assertAlmostEqual(s.accuracy, 1.0, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertIsInstance(sameSummary, MultilayerPerceptronClassificationSummary)
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def main(args):
    """Train a multilayer perceptron on LIBSVM data and record the elapsed time.

    Args:
        args: Sequence where ``args[1]`` is the Spark application name,
            ``args[2]`` the master URL and ``args[3]`` the path of the LIBSVM
            input file.
    """
    spark = SparkSession\
            .builder\
            .master(args[2])\
            .appName(args[1])\
            .getOrCreate()

    # Stop the session even when loading or training raises.
    try:
        start_computing_time = time.time()

        # Load the data stored in LIBSVM format as a DataFrame.
        data = spark.read.format("libsvm").load(args[3])

        (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

        # specify layers for the neural network:
        # input layer of size 4 (features), two intermediate of size 5 and 4
        # and output of size 3 (classes)
        layers = [4, 5, 4, 3]

        # create the trainer and set its parameters
        trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

        # train the model
        model = trainer.fit(trainingData)

        # NOTE(review): no action is run on this DataFrame, so the test-set
        # predictions are not materialized before the time is recorded.
        result = model.transform(testData)

        # NOTE(review): reads sys.argv rather than args; presumably main is
        # called with sys.argv - verify against the caller.
        appendTime(sys.argv, start_computing_time)
    finally:
        spark.stop()
def get_pipeline(vector_size=50, class_num=5, stopwords=None):
    '''
    Build the text-classification pipeline.

    The demo pipeline consists of these stages:
    1. labelIndexer: indexes the "label" column (string -> number)
    2. tokenizer: splits each sentence into words
    3. remover: removes stop words
    4. word2vec: turns the words into a low-dimensional vector
    5. mpc: neural-network classifier

    Args:
        vector_size: Word2Vec vector size; also the size of the network's
            input layer.
        class_num: Size of the network's output layer.
        stopwords: Stop words for the remover. When None, the argument is
            not forwarded, so the remover keeps its own default list.

    Returns:
        The unfitted Pipeline.
    '''
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    # Only forward stopWords when a list was supplied; passing None
    # explicitly would set the param to None instead of leaving the default.
    remover_kwargs = {"inputCol": "raw_words", "outputCol": "words"}
    if stopwords is not None:
        remover_kwargs["stopWords"] = stopwords
    remover = StopWordsRemover(**remover_kwargs)
    word2vec = Word2Vec(vectorSize=vector_size,
                        minCount=2,
                        inputCol="words",
                        outputCol="vector")
    # Layer sizes must be integers: floor division keeps the hidden layer an
    # int (true division gave 27.5 for the default arguments).
    layers = [vector_size, (vector_size + class_num) // 2, class_num]
    mpc = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         seed=1234,
                                         featuresCol="vector",
                                         labelCol="indexLabel")
    pipeline = Pipeline(
        stages=[labelIndexer, tokenizer, remover, word2vec, mpc])
    return pipeline
def entrenar(df):
    """Fit a pipeline that predicts "Position" from the player attributes.

    The pipeline assembles the attribute columns into a vector, indexes the
    "Position" label, indexes the feature vector and trains a multilayer
    perceptron. ``df`` is split 70/30; the model is fitted on the first part.

    Args:
        df: DataFrame containing "Position" and the attribute columns listed
            in ``feature_cols``.

    Returns:
        Tuple ``(fitted pipeline model, test DataFrame)``.
    """
    # "Position" is the label, so it is not part of the features (the
    # original list included it, leaking the label into the inputs).
    feature_cols = [
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing",
        "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing",
        "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions",
        "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ]
    vectorAssembler = VectorAssembler(inputCols=feature_cols,
                                      outputCol="features")
    stringIndexer = StringIndexer(inputCol="Position",
                                  outputCol="indexedLabel")
    vectorIndexer = VectorIndexer(inputCol="features",
                                  outputCol="indexedFeatures")

    # Split into training and test data.
    (training_df, test_df) = df.randomSplit([0.7, 0.3])

    # Configure the neural network. The input layer must match the number of
    # assembled feature columns (the original hard-coded 13).
    # NOTE(review): the output layer assumes 2 distinct "Position" values -
    # confirm against the data.
    capas = [len(feature_cols), 13, 13, 2]
    entrenador = MultilayerPerceptronClassifier(layers=capas,
                                                featuresCol="indexedFeatures",
                                                labelCol="indexedLabel",
                                                maxIter=10000)

    # Train the network.
    pipeline = Pipeline(
        stages=[vectorAssembler, stringIndexer, vectorIndexer, entrenador])
    return pipeline.fit(training_df), test_df
Beispiel #15
0
def models():
    """Run ``evaluate`` on a series of classifiers and print each result.

    Each classifier is first evaluated with ``evaluate``'s default arguments,
    then ``featurize_lda`` is called, and finally the random forest, linear
    SVC and GBT classifiers are evaluated again with "ngrams" and "words"
    combined with "TF-IDF".
    """
    def report(template, *evaluate_args):
        # Print the value returned by evaluate() through the %-template.
        print(template % evaluate(*evaluate_args))

    rf_classifier = RandomForestClassifier(labelCol="label",
                                           featuresCol="features")
    report("Random Forest F1 = %g", rf_classifier)
    lsvc = LinearSVC(maxIter=50)
    report("Linear SVC F1 = %g", lsvc)
    gbt = GBTClassifier()
    report("GBT F1 = %g", gbt)

    # NOTE(review): no `layers` is given to the perceptron here - confirm
    # that evaluate() sets it before fitting.
    mlp = MultilayerPerceptronClassifier(seed=1234, featuresCol='features')
    report("MLP F1 = %g", mlp)

    # The FM result is not printed, only its heading.
    fm = FMClassifier()
    print('FM')
    evaluate(fm)
    featurize_lda()

    # Plain n-gram runs (without TF-IDF) are intentionally not executed.
    tfidf_runs = (
        ("Ngram TF-IDF Random Forest F1 = %g", rf_classifier, "ngrams"),
        ("Ngram TF-IDF Linear SVC F1 = %g", lsvc, "ngrams"),
        ("Ngram TF-IDF GBT F1 = %g", gbt, "ngrams"),
        ("Words TF-IDF Random Forest F1 = %g", rf_classifier, "words"),
        ("Words TF-IDF Linear SVC F1 = %g", lsvc, "words"),
        ("Words TF-IDF GBT F1 = %g", gbt, "words"),
    )
    for template, classifier, column in tfidf_runs:
        report(template, classifier, column, "TF-IDF")
Beispiel #16
0
def main(inputs):
    """Fit four colour-word pipelines and return their test scores.

    The CSV at ``inputs`` (with header) is read using ``colour_schema`` and
    split 75/25. A random forest and a multilayer perceptron are each trained
    on RGB features and on LAB features; every fitted model is plotted with
    ``plot_predictions`` and scored on the test split.

    Args:
        inputs: Path handed to ``spark.read.csv``.

    Returns:
        dict mapping 'RGB-forest', 'LAB-forest', 'RGB-MLP' and 'LAB-MLP' to
        the evaluator's score for that pipeline.
    """
    frame = spark.read.csv(inputs, header=True, schema=colour_schema)

    # Feature-preparation stages.
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['labelword']))
    label_indexer = StringIndexer(
        inputCol="labelword", outputCol="labelCol", handleInvalid='error')
    assemble_rgb = VectorAssembler(
        inputCols=['R', 'G', 'B'], outputCol="features")
    assemble_lab = VectorAssembler(
        inputCols=['lL', 'lA', 'lB'], outputCol="features")

    # Classifiers.
    forest = RandomForestClassifier(
        numTrees=22, maxDepth=10, labelCol="labelCol", seed=42)
    mlp = MultilayerPerceptronClassifier(
        maxIter=400, layers=[3, 16, 11], blockSize=1, seed=123,
        labelCol="labelCol")

    rgb_prefix = [label_indexer, assemble_rgb]
    lab_prefix = [to_lab, label_indexer, assemble_lab]
    candidates = [
        ('RGB-forest', Pipeline(stages=rgb_prefix + [forest])),
        ('LAB-forest', Pipeline(stages=lab_prefix + [forest])),
        ('RGB-MLP', Pipeline(stages=rgb_prefix + [mlp])),
        ('LAB-MLP', Pipeline(stages=lab_prefix + [mlp])),
    ]

    # No metricName is passed, so the evaluator's default metric is used.
    scorer = MulticlassClassificationEvaluator(
        labelCol="labelCol", predictionCol="prediction")

    train, test = [part.cache() for part in frame.randomSplit([0.75, 0.25])]

    results = dict()
    for name, candidate in candidates:
        fitted = candidate.fit(train)
        # Visual representation of this model's predictions.
        plot_predictions(fitted, name)
        results[name] = scorer.evaluate(fitted.transform(test))
    return results
def main(inputs):
    """Compare an RGB-based and a LAB-based colour-word perceptron.

    The CSV at ``inputs`` is read with ``colour_schema``; the number of
    distinct "word" values sets the size of the network's output layer. Both
    pipelines are fitted on a 75% training split and scored on the remaining
    25%; the models are plotted and the two validation scores printed.

    Args:
        inputs: Path handed to ``spark.read.csv``.
    """
    frame = spark.read.csv(inputs, schema=colour_schema)

    # One output neuron per distinct word.
    n_labels = frame.select('word').distinct().count()

    train, validation = [
        part.cache() for part in frame.randomSplit([0.75, 0.25])]

    # Indexer and classifier are shared by both pipelines.
    word_indexer = StringIndexer(
        inputCol="word", outputCol="label", handleInvalid="error")
    perceptron = MultilayerPerceptronClassifier(layers=[3, 250, n_labels])

    rgb_stages = [
        VectorAssembler(inputCols=["R", "G", "B"], outputCol="features"),
        word_indexer,
        perceptron,
    ]
    # The SQL stage converts R/G/B to LAB and passes "word" through.
    lab_stages = [
        SQLTransformer(
            statement=rgb2lab_query(passthrough_columns=['word'])),
        VectorAssembler(
            inputCols=["labL", "labA", "labB"], outputCol="features"),
        word_indexer,
        perceptron,
    ]

    # Fit both models on the training split.
    rgb_model = Pipeline(stages=rgb_stages).fit(train)
    lab_model = Pipeline(stages=lab_stages).fit(train)

    # Predict on the validation split.
    rgb_scored = rgb_model.transform(validation)
    lab_scored = lab_model.transform(validation)

    # Evaluator with its default label column and default metric.
    scorer = MulticlassClassificationEvaluator(predictionCol="prediction")
    score_rgb = scorer.evaluate(rgb_scored)
    score_lab = scorer.evaluate(lab_scored)

    for tag, fitted in (('RGB', rgb_model), ('LAB', lab_model)):
        plot_predictions(fitted, tag, labelCol='word')

    print('Validation score for RGB model: %g' % (score_rgb, ))
    print('Validation score for LAB model: %g' % (score_lab, ))
Beispiel #18
0
def clasificar_chi2():
    """Train and evaluate a perceptron on chi-squared-selected features.

    Reads the oncology CSV, converts the first ten columns to float, selects
    the top 3 of six assembled features with ``ChiSqSelector`` against the
    "P_Benigno" label, trains a multilayer perceptron on a 70% split and
    prints its accuracy and predictions on the remaining 30%.
    """
    # Read the data and convert the values of each column to float.
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    # getOrCreate reuses an already-running context; constructing
    # SparkContext(conf=conf) directly raises when one is active, e.g. on a
    # second call of this function in the same process.
    sc = SparkContext.getOrCreate(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv", header=True).rdd
    rdd = rdd.map(lambda x: tuple(float(x[i]) for i in range(10)))

    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])
    # Build the vector assembler (features).
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ],
                                outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3,
                             featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test sets.
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Network architecture (hyperparameter): 3 selected features in,
    # 2 output neurons.
    capas = [3, 4, 6, 2]

    # Build the trainer.
    # Hyperparameter: maxIter
    entrenador = MultilayerPerceptronClassifier(featuresCol="featuresSelected",
                                                labelCol="P_Benigno",
                                                maxIter=1000,
                                                layers=capas)
    # Train the model.
    modelo = entrenador.fit(df_training)

    # Validate the model.
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")

    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show how many predictions fall on each predicted value.
    df_predictions.groupby('prediction').count().show()
Beispiel #19
0
 def mlp(params, features):
     """Build an unfitted multilayer perceptron classifier.

     The network has ``len(features)`` inputs, one hidden layer of size
     ``params[1]`` and 2 outputs; the layer list is printed before the
     classifier is created. ``params[0]`` is used as ``maxIter`` and
     ``params[2]`` as ``stepSize``.
     """
     topology = [len(features), params[1], 2]
     print(topology)
     settings = dict(featuresCol='features',
                     labelCol='Class',
                     maxIter=params[0],
                     layers=topology,
                     stepSize=params[2])
     return MultilayerPerceptronClassifier(**settings)
def neuralNetwork_model(train, x, y, feature_count):
    """Fit a multilayer perceptron with two hidden layers.

    Args:
        train: Training data handed to the estimator's ``fit``.
        x: Name of the features column.
        y: Name of the label column.
        feature_count: Width of the input layer; the hidden layers are
            three times and two times this width.

    Returns:
        The fitted model returned by ``MultilayerPerceptronClassifier.fit``.
    """
    hidden_wide = feature_count * 3
    hidden_narrow = feature_count * 2
    estimator = MultilayerPerceptronClassifier(
        featuresCol=x,
        labelCol=y,
        maxIter=100,
        # The output layer is fixed at two units.
        layers=[feature_count, hidden_wide, hidden_narrow, 2],
        blockSize=512,
        seed=12345,
    )
    return estimator.fit(train)
    def __init__(self, sc, configs):
        """Load the training split, build the text pipeline and fit it.

        Args:
            sc: Spark context. It is wrapped in an ``SQLContext`` and also
                stored on the instance as ``self.spark_context``.
            configs: Nested mapping read for ``['Data']['task']``,
                ``['Data']['split']`` and ``['Training']['undersample']``.
        """
        self.sqlContext = SQLContext(sc)

        # Store the context that was actually passed in; the parameter is
        # named ``sc`` and no ``spark_context`` name exists in this scope.
        self.spark_context = sc

        self.configs = configs
        self.path_to_task = self.configs['Data']['task']
        self.undersample = self.configs['Training']['undersample']

        self.task = Task(self.path_to_task)
        # The last character of the task path is used as the task number.
        self.task_number = self.path_to_task[-1]
        self.split = self.configs['Data']['split']
        self.training = self.task.get_split(self.split,
                                            part='train',
                                            chunks=10)
        # Each training record is unpacked as (ignored, label, user), where
        # a user is an iterable of post strings.
        _, self.labels, self.users = map(list, zip(*self.training))
        # Flatten all posts and keep those with more than 15 tokens.
        self.posts = [post for user in self.users for post in user]
        self.posts = list(filter(lambda p: len(p.split()) > 15, self.posts))
        # Keep only users with more than 10 posts, with their labels.
        self.labels, self.users = zip(
            *filter(lambda p: len(p[1]) > 10, zip(self.labels, self.users)))
        # From here on each user is one space-joined document.
        self.users = [' '.join(user) for user in self.users]

        # Undersampling is on unless the config value is the string 'false'.
        if self.undersample != 'false':
            positives = list(
                filter(lambda s: s[0] == '1', zip(self.labels, self.users)))
            negatives = list(
                filter(lambda s: s[0] == '0', zip(self.labels, self.users)))
            # Keep a random subset of negatives equal in size to positives.
            shuffle(negatives)
            both = positives + negatives[:len(positives)]
            shuffle(both)
            self.labels, self.users = map(list, zip(*both))

        self.tokenizer = Tokenizer(inputCol="text", outputCol="rawWords")
        self.stopWords = StopWordsRemover(
            inputCol="rawWords",
            outputCol="words",
            caseSensitive=False,
            stopWords=StopWordsRemover.loadDefaultStopWords("english"))
        self.cv = CountVectorizer(inputCol="words",
                                  outputCol="rawFeatures",
                                  vocabSize=30000)
        self.idf = IDF(minDocFreq=2,
                       inputCol="rawFeatures",
                       outputCol="features")
        # The input layer width (30000) mirrors the vectorizer's vocabSize.
        self.mlp = MultilayerPerceptronClassifier(maxIter=2000,
                                                  layers=[30000, 80, 100, 2],
                                                  blockSize=128,
                                                  seed=1234)

        self.pipeline = Pipeline(stages=[
            self.tokenizer, self.stopWords, self.cv, self.idf, self.mlp
        ])

        # Fit the whole pipeline on the (optionally undersampled) users.
        self.model = self.pipeline.fit(
            self.create_data_frame(self.users, self.labels))
 def mlpc(self, maxIter=100, blockSize=128, seed=1234):
     """Run a multilayer perceptron classifier through ``self.classify``.

     The run is bracketed by ``self.time_calc.start_time`` and
     ``self.time_calc.end_time`` calls.

     Args:
         maxIter: Passed to the classifier as ``maxIter``.
         blockSize: Passed to the classifier as ``blockSize``.
         seed: Passed to the classifier as ``seed``.
     """
     self.time_calc.start_time('\nMultilayer perceptron classifier')
     perceptron = MultilayerPerceptronClassifier(
         maxIter=maxIter,
         # Network shape: an input layer of 4 units, hidden layers of
         # 5 and 4 units, and an output layer of 2 units.
         layers=[4, 5, 4, 2],
         blockSize=blockSize,
         seed=seed,
     )
     self.classify('mlpc', perceptron)
     self.time_calc.end_time('Multilayer perceptron classifier')
Beispiel #23
0
def getclassifiers():
    """Build the estimators and parameter grids used for model training.

    Each estimator is paired with its grid in a ``Classifier`` wrapper.

    Returns:
        A list of ``Classifier`` objects in the order: logistic regression,
        decision tree, random forest, gradient-boosted trees, linear SVC,
        naive Bayes.
    """
    logreg = LogisticRegression(maxIter=10)
    logreg_grid = (
        ParamGridBuilder()
        .addGrid(logreg.regParam, [0.1, 0.01])
        .addGrid(logreg.elasticNetParam, [0.0, 0.5, 1.0])
        .build()
    )
    logreg_entry = Classifier(logreg, logreg_grid)

    tree = DecisionTreeClassifier()
    tree_grid = (
        ParamGridBuilder()
        .addGrid(tree.maxDepth, [2, 5, 10])
        .addGrid(tree.minInfoGain, [0.0, 0.1])
        .addGrid(tree.maxBins, [6, 32])
        .build()
    )
    tree_entry = Classifier(tree, tree_grid)

    forest = RandomForestClassifier()
    forest_grid = (
        ParamGridBuilder()
        .addGrid(forest.numTrees, [5, 20])
        .addGrid(forest.maxDepth, [2, 5])
        .build()
    )
    forest_entry = Classifier(forest, forest_grid)

    boosted = GBTClassifier(maxDepth=3, maxBins=16, maxIter=5)
    boosted_grid = (
        ParamGridBuilder()
        .addGrid(boosted.maxDepth, [2, 5])
        .addGrid(boosted.maxIter, [5, 20])
        .addGrid(boosted.stepSize, [0.01, 0.1])
        .build()
    )
    boosted_entry = Classifier(boosted, boosted_grid)

    # NOTE(review): the perceptron entry is built but is not part of the
    # returned list -- confirm whether leaving it out is intentional.
    perceptron = MultilayerPerceptronClassifier(layers=[6, 4, 2])
    perceptron_grid = (
        ParamGridBuilder()
        .addGrid(perceptron.maxIter, [20, 100])
        .addGrid(perceptron.tol, [1e-06, 1e-04])
        .build()
    )
    perceptron_entry = Classifier(perceptron, perceptron_grid)

    svc = LinearSVC(maxIter=15)
    svc_grid = (
        ParamGridBuilder()
        .addGrid(svc.threshold, [0.0, 0.05])
        .build()
    )
    svc_entry = Classifier(svc, svc_grid)

    bayes = NaiveBayes()
    bayes_grid = (
        ParamGridBuilder()
        .addGrid(bayes.smoothing, [0.2, 0.5, 1.0])
        .build()
    )
    bayes_entry = Classifier(bayes, bayes_grid)

    return [
        logreg_entry,
        tree_entry,
        forest_entry,
        boosted_entry,
        svc_entry,
        bayes_entry,
    ]
def _get_mlp_model(feat_train):
    """Fit a gradient-descent multilayer perceptron on ``feat_train``.

    The input layer width is read from the module-level ``num_features``.

    Args:
        feat_train: Training data with ``features`` and ``label`` columns.

    Returns:
        The fitted model returned by ``MultilayerPerceptronClassifier.fit``.
    """
    from pyspark.ml.classification import MultilayerPerceptronClassifier

    # ``num_features`` is only read here, so no ``global`` statement is needed.
    topology = [num_features, 10, 10, 2]
    estimator = MultilayerPerceptronClassifier(
        maxIter=10,
        layers=topology,
        seed=123,
        stepSize=0.005,
        solver='gd',
        featuresCol="features",
        labelCol="label",
    )
    return estimator.fit(feat_train)
def NeuralNetworkCV(trainingData, testData):
    """Cross-validate a multilayer perceptron and report test metrics.

    A 5-fold ``CrossValidator`` searches over ``stepSize`` and ``maxIter``.
    The fitted result is applied to ``testData``; accuracy, F1 and a
    confusion matrix are printed along with the elapsed time.

    Args:
        trainingData: Data the cross-validator is fitted on.
        testData: Data scored with the fitted model; must carry a ``label``
            column.

    Returns:
        A tuple ``(f1score, cnf_matrix, cv)``. Note that ``cv`` is the
        ``CrossValidator`` estimator itself, not the fitted model.
    """
    started_at = time.time()

    network = MultilayerPerceptronClassifier(layers=[187, 8, 5])

    # Hyperparameters to tune.
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(network.stepSize, [1, 0.01])
    grid_builder = grid_builder.addGrid(network.maxIter, [100, 1000])
    search_space = grid_builder.build()

    cv = CrossValidator(
        estimator=network,
        estimatorParamMaps=search_space,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=5,
    )
    fitted = cv.fit(trainingData)

    scored = fitted.transform(testData)
    scored_pairs = scored.select("prediction", "label")
    accuracy_metric = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = accuracy_metric.evaluate(scored_pairs)
    f1_metric = MulticlassClassificationEvaluator(metricName="f1")
    f1score = f1_metric.evaluate(scored_pairs)

    # Confusion matrix: labels are ordered by descending frequency in the
    # scored data.
    label_counts = scored.select("label").groupBy("label").count()
    ordered_counts = label_counts.sort('count', ascending=False).toPandas()
    label_order = ordered_counts["label"].values.tolist()

    actual = scored.select("label").toPandas()
    predicted = scored.select("prediction").toPandas()

    cnf_matrix = confusion_matrix(actual, predicted, labels=label_order)
    print(cnf_matrix)
    print("Accuracy K-Folds: ", accuracy)
    print("F1-Score K-Folds: ", f1score)
    for _ in range(2):
        print("")
    print("Doc Parameters : [", fitted.explainParams(), "]")
    for _ in range(2):
        print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("Neural Network K-Folds Execution TIME:", time.time() - started_at)
    return f1score, cnf_matrix, cv
Beispiel #26
0
def main(inputs):
    """Train and score colour-name classifiers on RGB and LAB features.

    Reads the CSV at ``inputs`` with ``colour_schema``, splits it 75/25 into
    training and validation sets, then fits two pipelines that predict the
    ``word`` column: one from raw RGB values and one from LAB values derived
    by a ``SQLTransformer``. Each model is plotted with ``plot_predictions``
    and its validation score is printed.

    Args:
        inputs: Path passed to ``spark.read.csv``.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Pipeline predicting the colour word from RGB values.
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word",
                                                layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # Score the RGB model on the validation data.
    rgb_validation = rgb_model.transform(validation)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # Pipeline RGB colours -> LAB colours -> word; train and evaluate.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)

    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    # Fit on the raw training data: the SQL transformer is already the first
    # pipeline stage, so transforming beforehand would apply the RGB -> LAB
    # conversion twice during training but only once on validation.
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)

    new_vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)

    plot_predictions(new_model, 'LAB', labelCol="word")
Beispiel #27
0
def make_model(train, val):
    """Train a perceptron, print its accuracy on ``val`` and save it.

    Args:
        train: Training data the classifier is fitted on.
        val: Data scored to compute the reported accuracy; must carry a
            ``label`` column.
    """
    # Network shape: 100 inputs, one hidden layer of 100 units, 2 outputs.
    estimator = MultilayerPerceptronClassifier(
        maxIter=100,
        layers=[100, 100, 2],
        blockSize=128,
        seed=1234,
    )
    fitted = estimator.fit(train)

    scored_pairs = fitted.transform(val).select("prediction", "label")
    accuracy_metric = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(accuracy_metric.evaluate(scored_pairs)))

    # Persist the fitted model.
    fitted.save("s3://projfakenews/mlp")
def rgb_classify(type, train, validation, figName):
    """Fit an RGB -> colour-word classifier and return its accuracy.

    Args:
        type: Which classifier to use: ``"MLPC"`` for a multilayer
            perceptron or ``"LogReg"`` for logistic regression.
        train: Training data with ``R``, ``G``, ``B`` and ``word`` columns.
        validation: Data the fitted pipeline is scored on.
        figName: Suffix appended to ``'RGB_'`` for the plot name passed to
            ``plot_predictions``.

    Returns:
        The accuracy of the fitted pipeline on ``validation``.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
    """
    # Choose the classifier first so an unsupported name fails immediately
    # with a clear message rather than as an unbound local further down.
    if type == "MLPC":
        classifier = MultilayerPerceptronClassifier(layers=[3, 25, 25], seed=42)
    elif type == "LogReg":
        classifier = LogisticRegression()
    else:
        raise ValueError(
            f"Unknown classifier type {type!r}; expected 'MLPC' or 'LogReg'")

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='label',
                                 stringOrderType='alphabetAsc')
    rgb_pipe = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipe.fit(train)
    predictions = rgb_model.transform(validation)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label',
                                                  metricName='accuracy')
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB_' + figName, labelCol='word')
    return score
Beispiel #29
0
def MLPclf(trainingData, testData):
    """Train a gradient-descent perceptron and evaluate it on ``testData``.

    The network shape comes from a ``layers`` name defined outside this
    function. Predictions are written to ``res/predictedMLP_spark.txt``.

    Args:
        trainingData: Data with ``features`` and ``label`` columns to fit on.
        testData: Data scored with the fitted model.

    Returns:
        Whatever the externally defined ``evaluate(label, predict)`` returns.
    """
    estimator = MultilayerPerceptronClassifier(
        featuresCol="features",
        labelCol="label",
        layers=layers,
        solver="gd",
        stepSize=0.3,
        maxIter=1000,
    )
    scored = estimator.fit(trainingData).transform(testData)

    # Pull labels and predictions to the driver as arrays.
    label = scored.select("label").toPandas().values
    predict = scored.select("prediction").toPandas().values
    np.savetxt('res/predictedMLP_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    return evaluate(label, predict)
Beispiel #30
0
def perceptron_multicapa(train, test, capas, num_iter, tamlot):
    """Train a multilayer perceptron and return its accuracy on ``test``.

    Args:
        train: Training data the classifier is fitted on.
        test: Data scored to compute accuracy; must carry a ``label`` column.
        capas: Layer sizes passed to the classifier as ``layers``.
        num_iter: Passed to the classifier as ``maxIter``.
        tamlot: Passed to the classifier as ``blockSize``.

    Returns:
        The accuracy reported by ``MulticlassClassificationEvaluator``.
    """
    estimator = MultilayerPerceptronClassifier(
        maxIter=num_iter,
        layers=capas,
        blockSize=tamlot,
        seed=13,
    )
    # Train the model.
    fitted = estimator.fit(train)
    # Score the test set and keep only the columns the evaluator reads.
    scored_pairs = fitted.transform(test).select('prediction', 'label')
    accuracy_metric = MulticlassClassificationEvaluator(metricName='accuracy')
    return accuracy_metric.evaluate(scored_pairs)