# Keep only the columns used for the binary (clean vs. toxic) task.
train_binary_classification = pd.concat(
    [train["id"], train["comment_text"], train["clean"]],
    axis=1,
)

# Split each comment into word tokens.
tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")

# Drop stopwords, matching them case-insensitively.
remover = (
    StopWordsRemover()
    .setInputCol("words")
    .setOutputCol("filtered")
    .setCaseSensitive(False)
)

# Optional bigram stage, currently disabled:
# ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")

# Hash every filtered bag of words into a 1000-dimensional
# term-frequency vector.
hashingTF = (
    HashingTF()
    .setNumFeatures(1000)
    .setInputCol("filtered")
    .setOutputCol("rawFeatures")
)

# Turn the raw term frequencies into TF-IDF features.
idf = (
    IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
    .setMinDocFreq(0)
)

# Classifier stage. NOTE: the variable is called `lr`, but the estimator
# is a linear SVM, not a logistic regression.
lr = LinearSVC(labelCol="label", featuresCol="features", maxIter=20)

# Chain all of the stages above into a single pipeline.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# Binary labels for the test data: class 2 maps to 1 (clean),
# every other class maps to 0 (toxic).
test = [1 if i == 2 else 0 for i in test_data["class"]]

rebalanceDatasetTechnique = ["undersampling"]  # ,"oversampling","no technique"]
for technique in rebalanceDatasetTechnique:
    print('**Processing {} on imbalanced data...**'.format(technique))
    df = spark.createDataFrame(train_binary_classification)
lr = LogisticRegression(featuresCol='scaledFeatures', maxIter=100, regParam=0.3, elasticNetParam=0.8, tol=0.0001, family="binomial") dt = DecisionTreeClassifier(featuresCol='scaledFeatures', seed=seed) rf = RandomForestClassifier(featuresCol='scaledFeatures', seed=seed, numTrees=20) GBDT = GBTClassifier(featuresCol='scaledFeatures', seed=seed) layers = [feature_number, 10, 5, 2] mlp = MultilayerPerceptronClassifier(featuresCol='scaledFeatures', layers=layers, seed=seed) svm = LinearSVC(featuresCol='scaledFeatures', regParam=0.1) nb = NaiveBayes(featuresCol='scaledFeatures', smoothing=1.0) times = [] #model training and testing functions def LR(trainingData, testData): start = time.time() Model = lr.fit(trainingData) end = time.time() times.append(end - start) results = Model.transform(testData)
# Accuracy on the test set (for the predictions produced just above).
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
# `with` guarantees the file is flushed and closed; the original left
# every handle open, so buffered output could be lost.
with open("/tmp/data/ml_data/nbModel_2_accuracy.txt", "w+") as f:
    # Reuse the value computed above instead of evaluating a second time.
    f.write("Test Accuracy: " + str(accuracy))

# Area under the ROC curve on the test set.
evaluator = BinaryClassificationEvaluator()
with open("/tmp/data/ml_data/nbModel_2_test_set_area_under_ROC.txt", "w+") as f:
    f.write("Test set Area Under ROC: " + str(evaluator.evaluate(predictions)))

### Linear support vector machine 1
# Train
lsvc = LinearSVC(maxIter=10, regParam=0.1)
lsvcModel_1 = lsvc.fit(train_set)
predictions = lsvcModel_1.transform(test_set)

# Accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
with open("/tmp/data/ml_data/lsvcModel_1_accuracy.txt", "w+") as f:
    f.write("Test Accuracy: " + str(accuracy))

# Area under the ROC curve on the test set.
evaluator = BinaryClassificationEvaluator()
with open("/tmp/data/ml_data/lsvcModel_1_test_set_area_under_ROC.txt", "w+") as f:
    f.write("Test set Area Under ROC: " + str(evaluator.evaluate(predictions)))
# Fit PCA on the standardized training split and project that split.
pca_model = pca.fit(standardized_features_df70)
pca_train = pca_model.transform(standardized_features_df70)

logger.error("###### pca on standarded scaler using test")
# Project the test split with the PCA model fitted on the TRAINING split.
# The original re-fitted a fresh PCA on the test data, which maps test rows
# onto different principal components than the ones the classifier below is
# trained on (and leaks test-set statistics into preprocessing).
pca_test = pca_model.transform(standardized_features_df30)

logger.error("############# svm")
from pyspark.ml.classification import LinearSVC

# Linear SVM trained on the PCA components.
lsvc = LinearSVC(maxIter=30,
                 regParam=0.1,
                 featuresCol="pca_features",
                 labelCol="label")

# Only the classifier is staged; the indexer / encoder / assembler / min-max
# stages were disabled in the original script.
stages209 = []
stages209 += [lsvc]

from pyspark.ml import Pipeline

pipeline209 = Pipeline().setStages(stages209)
svm7_model209 = pipeline209.fit(pca_train)
svm7_pp_df209 = svm7_model209.transform(pca_test)
# Index the string column 'intl_plan' into a numeric column.
plan_indexer = StringIndexer(inputCol='intl_plan',
                             outputCol='intl_plan_indexed')

# Columns fed to the assembler: the indexed plan flag plus the numeric ones.
input_cols = ['intl_plan_indexed'] + reduced_numeric_cols

# Assemble the input columns into a single 'features' vector.
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

# Standard scaler (unit std, no centering). It is defined here but NOT part
# of the pipeline below, which trains on the unscaled 'features' column.
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=False)

# Configure the linear SVM classifier.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC

# Scaled variant, kept for reference:
# svmclassifier = LinearSVC(labelCol = 'label', featuresCol = 'scaledFeatures')
svmclassifier = LinearSVC(labelCol='label', featuresCol='features')

# Pipeline stages for the SVM (the scaler stage is left out on purpose, see
# the variant above).
# pipeline = Pipeline(stages=[plan_indexer, label_indexer, assembler, scaler, svmclassifier])
svm_stages = [plan_indexer, label_indexer, assembler, svmclassifier]
pipeline = Pipeline(stages=svm_stages)

# Split into train (75%) and test (25%) sets.
train, test = churn_data.randomSplit([0.75, 0.25])

# Imports for hyperparameter tuning and evaluation.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)
) label_Idxstr = IndexToString( inputCol="prediction", outputCol="predicted_class", labels=["False", "True"], ) # Text Vectorization hashTF = HashingTF(inputCol="token_features", outputCol="tf_features") idf = IDF(inputCol="tf_features", outputCol="features", minDocFreq=2) # Classification Models mnb_clf = NaiveBayes(smoothing=1.0) svm_clf = LinearSVC(standardization=False) # Loading Everything to Pipeline pipeline = Pipeline().setStages([ document_assembler, sentence, tokenizer, normalizer, lemmatizer, stopwords_cleaner, finisher, hashTF, idf, label_strIdx, svm_clf,
old_columns_names = df.columns
print(old_columns_names)
new_columns_names = [name + '-new' for name in old_columns_names]

# String-index every column into a numeric '<name>-new' column.
for old_name, new_name in zip(old_columns_names, new_columns_names):
    indexer = StringIndexer(inputCol=old_name, outputCol=new_name)
    df = indexer.fit(df).transform(df)

# All indexed columns except the first one become the feature vector.
vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')
df = vecAss.transform(df)

# Rename the first indexed column to 'label'.
df = df.withColumnRenamed(new_columns_names[0], 'label')

# Build a new table that holds only the label and the features.
data = df.select(['label', 'features'])

# Data overview. show() prints the rows itself and returns None, so it must
# not be wrapped in print() (the original printed a stray "None").
data.show(5, truncate=False)

# Split the data set into training and test sets (4:1, seed 100).
train_data, test_data = data.randomSplit([4.0, 1.0], 100)

from pyspark.ml.classification import LinearSVC

svm = LinearSVC()
svmModel = svm.fit(train_data)
result = svmModel.transform(test_data)

# Accuracy: share of test rows whose prediction equals the label.
correct = result.filter(result.label == result.prediction).count()
print(correct / float(result.count()))  # 0.9797172710510141
# Fit `cv` on the n-gram frame and apply it to the same frame.
models = cv.fit(ngramDataFrame)
result = models.transform(ngramDataFrame)
result1 = result.select(
    "business_id", "text", "stars", "label", "features", "ngrams")

# Re-weight the 'features' column with IDF.
idf = IDF(inputCol="features", outputCol="tdfeatures")
idfModel = idf.fit(result1)
rescaledData = idfModel.transform(result1)

# Keep the IDF-weighted vector under the name 'features' and make the label
# an integer column.
testing = rescaledData.select(
    "business_id", "text", "stars", "label", "tdfeatures", "ngrams")
testing = testing.withColumnRenamed("tdfeatures", "features")
testing = testing.withColumn("label", testing["label"].cast(IntegerType()))

# Train a linear SVM with default settings and pull out its coefficients.
svm = LinearSVC()
model = svm.fit(testing)
coeffs = model.coefficients

# Pair each vocabulary entry with its SVM weight.
vocabulary_ngram = models.vocabulary
weights_ngram = coeffs.toArray()
svm_coeffs_df_ngram = pd.DataFrame({
    'ngram': vocabulary_ngram,
    'weight': weights_ngram,
})

# Convert the table to a Spark DataFrame and write it out as a single
# CSV partition.
sql = SQLContext(sc)
result = sql.createDataFrame(svm_coeffs_df_ngram)
result.coalesce(1).write.csv('bdad_dataset/output/twogramfeatures_' + top_bid)
# Random forest
rf = RandomForestClassifier(labelCol="CANCELLED", featuresCol="features")
rfModel = rf.fit(train)
predictions_rf = rfModel.transform(test)
accuracy_rf = evaluator.evaluate(predictions_rf)

# Naive Bayes
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1.0,
                modelType="multinomial",
                featuresCol="features",
                labelCol="CANCELLED")
nbModel = nb.fit(train)
predictions_nb = nbModel.transform(test)
accuracy_nb = evaluator.evaluate(predictions_nb)  # 0.59431219823991344

# SVM
# The original call used typographic quotes (‘…’) around the column names,
# which is a SyntaxError in Python -- the reason this model "didn't work".
# Plain ASCII quotes fix it.
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10,
                 regParam=0.1,
                 featuresCol='features',
                 labelCol='CANCELLED')
lsvcModel = lsvc.fit(train)
predictions_svm = lsvcModel.transform(test)
accuracy_svm = evaluator.evaluate(predictions_svm)

# Plotting accuracies for all models
# NOTE(review): accuracy_svm is computed above but not included in the plot.
import matplotlib.pyplot as plt

A = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Naive Bayes']
B = [accuracy_lr, accuracy_dt, accuracy_rf, accuracy_nb]
fig = plt.figure()
ax = fig.add_subplot(111)
plt.scatter(A, B)
axes = plt.gca()
# NOTE(review): the recorded Naive Bayes score above is ~0.59, so the scores
# look like fractions; a 0-100 axis would squash them near zero -- confirm
# the intended scale.
axes.set_ylim([0, 100])
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
print("------------------------------------------------")

# Anomaly detection: report every cluster-center coordinate outside [-5, 5].
# All output below uses single-argument print(...) calls, which behave the
# same under Python 2 and Python 3; the original mixed them with
# Python-2-only print statements.
for center in centers:
    for point in center:
        if point > 5 or -5 > point:
            print("anomalia: {0:.15f}".format(point))

# SVM
print("-------------------------------------------------")
print("-----------------------SVM-----------------------")
print("-------------------------------------------------")
svm_classifier = LinearSVC(featuresCol="features",
                           labelCol="Accident_Severity")
svm_model = svm_classifier.fit(training_data)
predictions = svm_model.transform(test_data)

# Share of test rows whose prediction differs from the label.
misclassified = predictions.filter(
    predictions["prediction"] != predictions["Accident_Severity"]).count()
test_error = misclassified / float(test_data.count())
print("Testing error: {0:.4f}".format(test_error))

# Contingency table for the SVM (prediction vs. true label).
cf = predictions.crosstab("prediction", "Accident_Severity")
cf.show()

# SVM evaluation
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction")
# Area under ROC needs the continuous score, i.e. LinearSVC's default
# 'rawPrediction' output column. The original passed the hard 0/1
# 'prediction' column, which collapses the curve to a single threshold.
evaluator = BinaryClassificationEvaluator(labelCol="Accident_Severity",
                                          rawPredictionCol="rawPrediction",
                                          metricName='areaUnderROC')
labelCol='label_0', predictionCol='nb_pred_0', probabilityCol='nb_prob_0', rawPredictionCol='nb_raw_0') nb_1 = NaiveBayes(featuresCol='features', labelCol='label_1', predictionCol='nb_pred_1', probabilityCol='nb_prob_1', rawPredictionCol='nb_raw_1') nb_2 = NaiveBayes(featuresCol='features', labelCol='label_2', predictionCol='nb_pred_2', probabilityCol='nb_prob_2', rawPredictionCol='nb_raw_2') svm_0 = LinearSVC(featuresCol='features', labelCol='label_0', predictionCol='svm_pred_0', rawPredictionCol='svm_raw_0') svm_1 = LinearSVC(featuresCol='features', labelCol='label_1', predictionCol='svm_pred_1', rawPredictionCol='svm_raw_1') svm_2 = LinearSVC(featuresCol='features', labelCol='label_2', predictionCol='svm_pred_2', rawPredictionCol='svm_raw_2') # build pipeline to generate predictions from base classifiers, will be used in task 1.3 gen_base_pred_pipeline = Pipeline( stages=[nb_0, nb_1, nb_2, svm_0, svm_1, svm_2]) gen_base_pred_pipeline_model = gen_base_pred_pipeline.fit(training_set)
# COMMAND ----------

# Score the Naive Bayes predictions and report the metric and its complement.
nb_accuracy = evaluator.evaluate(nb_prediction)
nb_test_error = 1.0 - nb_accuracy
print("Accuracy of NaiveBayes is = %g" % nb_accuracy)
print("Test Error of NaiveBayes = %g " % nb_test_error)

# COMMAND ----------

# MAGIC %md
# MAGIC ###### Support Vector Machine

# COMMAND ----------

from pyspark.ml.classification import LinearSVC

# Train a linear SVM on the training data and predict on the test data.
svm = LinearSVC(labelCol="Survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)

# Show predictions next to the true label and the feature vector.
svm_preview = svm_prediction.select("prediction", "Survived", "features")
svm_preview.show()

# COMMAND ----------

# MAGIC %md
# MAGIC ###### Evaluating the accuracy of Support Vector Machine.

# COMMAND ----------

# Same evaluator as above, applied to the SVM predictions.
svm_accuracy = evaluator.evaluate(svm_prediction)
svm_test_error = 1.0 - svm_accuracy
print("Accuracy of Support Vector Machine is = %g" % svm_accuracy)
print("Test Error of Support Vector Machine = %g " % svm_test_error)
# Pair every parameter map with its average cross-validation metric, keying
# the parameters by name.
eval_metrics = lr_model.avgMetrics
param_res = [
    ({param.name: value for param, value in params.items()}, metric)
    for params, metric in zip(param_maps, eval_metrics)
]

# Bare expression: the sorted list (best metric first) is not stored, it is
# only displayed when this runs as the last statement of a notebook cell.
sorted(param_res, key=lambda entry: entry[1], reverse=True)

# In[85]:

# Create model 2
lsvc = LinearSVC(maxIter=5)
# Wider grid, kept for reference:
# paramGrid2 = ParamGridBuilder().addGrid(lsvc.regParam, [0.3, 0.01]).addGrid(lsvc.maxIter, [10, 5]).build()
grid_builder = ParamGridBuilder().addGrid(lsvc.regParam, [0.3, 0.01])
paramGrid2 = grid_builder.build()
evaluator2 = MulticlassClassificationEvaluator(metricName="f1")

# 3-fold cross-validation over the regParam grid, scored by F1.
crossval1 = CrossValidator(estimator=lsvc,
                           estimatorParamMaps=paramGrid2,
                           evaluator=evaluator2,
                           numFolds=3)
lsvc_model = crossval1.fit(train)

# Score the fitted model on the validation data.
validation_predictions = lsvc_model.transform(validation)
print('lsvcpre value: {}'.format(evaluator2.evaluate(validation_predictions)))

# In[86]:

lsvc_model.getEstimatorParamMaps()
"label", when(col("a.id") == col("b.id"), lit("1")).otherwise(lit("0"))).selectExpr( "label", "text_a", "text_b") matched_df = joined_df.where(col("label") == 1) not_matched_df = joined_df.where(col("label") == 0).limit(1000) labeled_df = matched_df.unionAll(not_matched_df) labeled_df.show(10, False) pipeline_model = pipeline.fit(labeled_df) transform_df = pipeline_model.transform(labeled_df).selectExpr( "cast(label as double) label", "features") # view the transformed data (train_df, test_df) = transform_df.randomSplit([0.7, 0.3], 24) logging.info("Count of training data: {}".format(train_df.count())) logging.info("Count of testing data: {}".format(test_df.count())) svm = LinearSVC(maxIter=5, regParam=0.01) model = svm.fit(train_df) logging.info("Model Coefficient {}".format(model.coefficients)) logging.info("Model Intercept {}".format(model.intercept)) logging.info("Model number of classes {}".format(model.numClasses)) logging.info("Model number of features {}".format(model.numFeatures)) predictions = model.transform(test_df) evaluator_svm = BinaryClassificationEvaluator( rawPredictionCol="prediction") area_under_curve = evaluator_svm.evaluate(predictions) logging.info("Area Under Curve is {}".format(area_under_curve)) new_df = spark.createDataFrame([ ("ALIABBAS BHOJANI", "LIABBAS BHOJANI"), ("ALIABBAS BHOJANI", "MUSTAFA CHALLAWALA") ]).toDF("text_a", "text_b").select( split(col("text_a"), " ").alias("text_a"),
def SparkML(train_df, test_df=None, featuresCol='features', labelCol='label', binaryclass=False, multiclass=False, n_cluster=2, userCol='user', itemCol='item', ratingCol='rating', rank=10, userid=3, itemid=3, itemsCol='items', minSupport=0.3, minConfidence=0.8, stringIndexer=False, inputColStringIndexer=None, outputColStringIndexer=None, oneHotEncoder=False, inputColOneHotEncoder=None, outputColOneHotEncoder=None, vectorAssembler=False, inputColsVectorAssembler=None, outputColsVectorAssembler=None, vectorIndexer=False, inputColsVectorIndexer=None, outputColsVectorIndexer=None, maxCategories=None, classification=False, logisticregression=False, decisiontreeclassifier=False, linearsvc=False, naivebayes=False, randomforestclassifier=False, gbtclassifier=False, regression=False, linearregression=True, decisiontreeregressor=False, randomforestregressor=False, gbtregressor=False, clustering=False, kmeans=False, gaussianmixture=False, lda=False, recommendation=False, als=False, association=False, fpgrowth=False): if classification: if logisticregression: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LRClassifier = LogisticRegression(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', standardization=True, maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5) paramGrid = ParamGridBuilder().addGrid( LRClassifier.maxIter, [10, 20, 50, 100, 200, 
300, 500, 1000, 2000, 5000]).addGrid( LRClassifier.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") LRCV = CrossValidator(estimator=LRClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LRCV) LRC_Pipeline = Pipeline(stages=stagesList) LRC_PipelineModel = LRC_Pipeline.fit(train_df) LRC_Predicted = LRC_PipelineModel.transform(test_df) LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel LRC_Probability = LRC_Predicted.select("Probability").toPandas() LRC_Prediction = LRC_Predicted.select("Prediction").toPandas() LRC_Score = evaluator.evaluate(LRC_Predicted) return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score if decisiontreeclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) DTClassifier = DecisionTreeClassifier( featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='gini', seed=None) paramGrid = ParamGridBuilder().addGrid( DTClassifier.impurity, ["gini", "entropy"]).addGrid( DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( 
DTClassifier.maxBins, [3, 5, 10, 50, 100, 200]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") DTCV = CrossValidator(estimator=DTClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(DTCV) DTC_Pipeline = Pipeline(stages=stagesList) DTC_PipelineModel = DTC_Pipeline.fit(train_df) DTC_Predicted = DTC_PipelineModel.transform(test_df) DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel DTC_Probability = DTC_Predicted.select("Probability").toPandas() DTC_Prediction = DTC_Predicted.select("Prediction").toPandas() DTC_Score = evaluator.evaluate(DTC_Predicted) return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score if linearsvc: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) SVClassifier = LinearSVC(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', rawPredictionCol='RawPrediction', maxIter=100, regParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, threshold=0.0) paramGrid = ParamGridBuilder().addGrid( SVClassifier.maxIter, [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid( SVClassifier.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( 
rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") SVCV = CrossValidator(estimator=SVClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(SVCV) SVC_Pipeline = Pipeline(stages=stagesList) SVC_PipelineModel = SVC_Pipeline.fit(train_df) SVC_Predicted = SVC_PipelineModel.transform(test_df) SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel SVC_Prediction = SVC_Predicted.select("Prediction").toPandas() SVC_Score = evaluator.evaluate(SVC_Predicted) return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score if naivebayes: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) NBClassifier = NaiveBayes(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', smoothing=1.0, modelType='multinomial', thresholds=None) paramGrid = ParamGridBuilder().addGrid( NBClassifier.smoothing, [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") NBCV = CrossValidator(estimator=NBClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, 
numFolds=10) stagesList.append(NBCV) NBC_Pipeline = Pipeline(stages=stagesList) NBC_PipelineModel = NBC_Pipeline.fit(train_df) NBC_Predicted = NBC_PipelineModel.transform(test_df) NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel NBC_Probability = NBC_Predicted.select("Probability").toPandas() NBC_Prediction = NBC_Predicted.select("Prediction").toPandas() NBC_Score = evaluator.evaluate(NBC_Predicted) return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score if randomforestclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) RFClassifier = RandomForestClassifier( featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='gini', numTrees=20, featureSubsetStrategy='auto', seed=None, subsamplingRate=1.0) paramGrid = ParamGridBuilder().addGrid( RFClassifier.impurity, ["gini", "entropy"]).addGrid( RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( RFClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( RFClassifier.numTrees, [5, 10, 20, 50, 100, 200]).addGrid( RFClassifier.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", 
metricName="accuracy") RFCV = CrossValidator(estimator=RFClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(RFCV) RFC_Pipeline = Pipeline(stages=stagesList) RFC_PipelineModel = RFC_Pipeline.fit(train_df) RFC_Predicted = RFC_PipelineModel.transform(test_df) RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel RFC_Probability = RFC_Predicted.select("Probability").toPandas() RFC_Prediction = RFC_Predicted.select("Prediction").toPandas() RFC_Score = evaluator.evaluate(RFC_Predicted) return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score if gbtclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GBClassifier = GBTClassifier(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, lossType='logistic', maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0) paramGrid = ParamGridBuilder().addGrid( GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( GBClassifier.maxIter, [5, 10, 20, 50, 100, 200]).addGrid( GBClassifier.stepSize, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid( GBClassifier.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") GBCV = CrossValidator(estimator=GBClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, 
numFolds=10) stagesList.append(GBCV) GBC_Pipeline = Pipeline(stages=stagesList) GBC_PipelineModel = GBC_Pipeline.fit(train_df) GBC_Predicted = GBC_PipelineModel.transform(test_df) GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel GBC_Prediction = GBC_Predicted.select("Prediction").toPandas() GBC_Score = evaluator.evaluate(GBC_Predicted) return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score if regression: if linearregression: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LRegressor = LinearRegression(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', standardization=True, fitIntercept=True, loss='squaredError', maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, epsilon=1.35) paramGrid = ParamGridBuilder().addGrid( LRegressor.maxIter, [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid( LRegressor.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") LRCV = CrossValidator(estimator=LRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LRCV) LR_Pipeline = Pipeline(stages=stagesList) LR_PipelineModel = LR_Pipeline.fit(train_df) LR_Predicted = LR_PipelineModel.transform(test_df) LR_BestModel = LR_PipelineModel.stages[-1].bestModel LR_Prediction = LR_Predicted.select("Prediction").toPandas() LR_Score = evaluator.evaluate(LR_Predicted) return LR_BestModel, 
LR_Predicted, LR_Prediction, LR_Score if decisiontreeregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='variance', seed=None, varianceCol=None) paramGrid = ParamGridBuilder().addGrid( DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") DTRCV = CrossValidator(estimator=DTRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(DTRCV) DTR_Pipeline = Pipeline(stages=stagesList) DTR_PipelineModel = DTR_Pipeline.fit(train_df) DTR_Predicted = DTR_PipelineModel.transform(test_df) DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel DTR_Prediction = DTR_Predicted.select("Prediction").toPandas() DTR_Score = evaluator.evaluate(DTR_Predicted) return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score if randomforestregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, 
outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) RFRegressor = RandomForestRegressor(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='variance', subsamplingRate=1.0, seed=None, numTrees=20) paramGrid = ParamGridBuilder().addGrid( RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( RFRegressor.numTrees, [5, 10, 20, 50, 100, 200]).addGrid( RFRegressor.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") RFRCV = CrossValidator(estimator=RFRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(RFRCV) RFR_Pipeline = Pipeline(stages=stagesList) RFR_PipelineModel = RFR_Pipeline.fit(train_df) RFR_Predicted = RFR_PipelineModel.transform(test_df) RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel RFR_Prediction = RFR_Predicted.select("Prediction").toPandas() RFR_Score = evaluator.evaluate(RFR_Predicted) return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score if gbtregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GBRegressor = GBTRegressor(featuresCol=featuresCol, labelCol=labelCol, 
predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, subsamplingRate=1.0, lossType='squared', maxIter=20, stepSize=0.1, seed=None, impurity='variance') paramGrid = ParamGridBuilder().addGrid( GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( GBRegressor.maxIter, [5, 10, 20, 50, 100, 200]).addGrid( GBRegressor.stepSize, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid( GBRegressor.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") GBRCV = CrossValidator(estimator=GBRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(GBRCV) GBR_Pipeline = Pipeline(stages=stagesList) GBR_PipelineModel = GBR_Pipeline.fit(train_df) GBR_Predicted = GBR_PipelineModel.transform(test_df) GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel GBR_Prediction = GBR_Predicted.select("Prediction").toPandas() GBR_Score = evaluator.evaluate(GBR_Predicted) return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score if clustering: if kmeans: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) KCluster = KMeans(featuresCol=featuresCol, predictionCol='Prediction', k=n_cluster, initMode='k-means||', initSteps=2, tol=0.0001, maxIter=20, seed=None) paramGrid = ParamGridBuilder().addGrid( KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid( KCluster.maxIter, 
[10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( KCluster.seed, [i for i in range(1001)]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') KMCV = CrossValidator(estimator=KCluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(KMCV) KMC_Pipeline = Pipeline(stages=stagesList) KMC_PipelineModel = KMC_Pipeline.fit(train_df) KMC_Predicted = KMC_PipelineModel.transform(train_df) KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel KMC_Prediction = KMC_Predicted.select("Prediction").toPandas() KMC_Score = evaluator.evaluate(KMC_Predicted) return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score if gaussianmixture: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GMCluster = GaussianMixture(featuresCol=featuresCol, predictionCol='Prediction', probabilityCol='Probability', k=n_cluster, tol=0.01, maxIter=100, seed=None) paramGrid = ParamGridBuilder().addGrid( GMCluster.maxIter, [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( GMCluster.seed, [i for i in range(1001)]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') GMCV = CrossValidator(estimator=GMCluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(GMCV) GMC_Pipeline = Pipeline(stages=stagesList) GMC_PipelineModel = GMC_Pipeline.fit(train_df) GMC_Predicted = GMC_PipelineModel.transform(train_df) 
GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel GMC_Probability = GMC_Predicted.select("Probability").toPandas() GMC_Prediction = GMC_Predicted.select("Prediction").toPandas() GMC_Score = evaluator.evaluate(GMC_Predicted) return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score if lda: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LDACluster = LDA(featuresCol=featuresCol, maxIter=20, seed=None, k=n_cluster, learningOffset=1024.0, learningDecay=0.51, subsamplingRate=0.05) paramGrid = ParamGridBuilder().addGrid( LDACluster.maxIter, [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( LDACluster.seed, [i for i in range(1001)]).addGrid( LDACluster.subsamplingRate, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') LDACV = CrossValidator(estimator=LDACluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LDACV) LDA_Pipeline = Pipeline(stages=stagesList) LDA_PipelineModel = LDA_Pipeline.fit(train_df) LDA_Predicted = LDA_PipelineModel.transform(train_df) LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel LDA_Topics = LDA_BestModel.describeTopics().toPandas() LDA_Score = evaluator.evaluate(LDA_Predicted) return LDA_BestModel, LDA_Topics, LDA_Score if recommendation: if als: ALSR = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol, rank=rank, maxIter=10, regParam=0.1, numUserBlocks=10, 
numItemBlocks=10, alpha=1.0, seed=1) ALSR_Model = ALSR.fit(train_df) ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid=userid) ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid=itemid) return ALSR_Model, ALSR_ForUsers, ALSR_ForItems if association: if fpgrowth: fpg = FPGrowth(minSupport=minSupport, minConfidence=minConfidence, itemsCol=itemsCol, predictionCol='Prediction') fpg_model = fpg.fit(train_df) fpg_freqItemsets = fpg_model.freqItemsets.toPandas() fpg_associationRules = fpg_model.associationRules.toPandas() return fpg_model, fpg_freqItemsets, fpg_associationRules
# Report metrics for the decision-tree model. `predictions` and `metrics` are
# produced by earlier cells that are not visible here.
evaluator = BinaryClassificationEvaluator(labelCol="model_photography")
print("\nModelo de Árbol de Decisión")
print("Test Area Under ROC: " + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
# Precision / recall / F1 are reported for the class labelled 1.0.
print("Precision: " + str(metrics.precision(1.0)))
print("Recall: " + str(metrics.recall(1.0)))
print("F1-Score: " + str(metrics.fMeasure(1.0)))

# # Support vector machine model

# In[155]:

# Build and train the model
lsvc = LinearSVC(featuresCol='features', labelCol='model_photography', maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(train_df)

# Now make some predictions and evaluate the performance
lsv_predictions = lsvcModel.transform(test_df)
# NOTE(review): `test` is not used again in the visible code - confirm a later
# cell needs it.
test = test_df.rdd

# Instantiate metrics object
# Important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = lsv_predictions.select(
    ['prediction', 'model_photography']).withColumn(
        'model_photography',
        F.col('model_photography').cast(FloatType())).orderBy('prediction')
"""Train a linear SVM on a libsvm-formatted file and persist the fitted model."""
# LinearSVC was used below without being imported, which raised NameError.
from pyspark.ml.classification import LinearSVC
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark_session = SparkSession\
        .builder\
        .appName("Spark SVM")\
        .getOrCreate()

    # Always release the Spark session, even when loading, training or
    # saving fails part-way through.
    try:
        # Loads data
        dataset = spark_session\
            .read\
            .format("libsvm")\
            .load("data/classificationDataLibsvm.txt")

        dataset.printSchema()
        dataset.show()

        linear_SVM = LinearSVC(maxIter=10, regParam=0.1)
        svm_model = linear_SVM.fit(dataset)

        print("Coefficients: " + str(svm_model.coefficients))
        print("Intercept: " + str(svm_model.intercept))

        svm_model.save("SVMModel")
    finally:
        spark_session.stop()
# Columns assembled into the feature vector: every column the original code
# listed except the label 'G3'. Kept in one place so the batch (training) and
# streaming paths cannot drift apart.
_FEATURE_COLUMNS = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2',
]


def _prepare_features(df):
    """Run the shared preprocessing and return `df` with a 'features' vector.

    The same steps are applied to the static training frame and to the
    streaming frame; the raw input columns are dropped once assembled.
    """
    # Replace categorical values with numeric ones
    df = categoricalToNumerical(df)
    # Convert the data from string to int
    df = stringToInt(df)
    # Convert categorical variables to numeric (per the original comment)
    df = approvedOrReproved(df)
    # Remove atypical values (outliers)
    df = dropAtypicValues(df)

    assembler = VectorAssembler(inputCols=_FEATURE_COLUMNS,
                                outputCol="features")
    return assembler.transform(df).drop(*_FEATURE_COLUMNS)


def main():
    """Train a LinearSVC on the student CSV, save it, then start a stream.

    The model is fitted on 'Datos streaming/feed/student-por1.csv' with 'G3'
    as label and written to "Modelo1" (overwriting any previous copy).  A
    streaming query over 'Datos streaming/read' is then started with the same
    schema and preprocessing, handing rows to `predict`; the call blocks until
    the query terminates.
    """
    # Create the Spark session
    spark = SparkSession.builder.appName("Student").getOrCreate()

    # ------------------ PREPROCESSING AND MODEL TRAINING ------------------ #
    raw_df = spark.read.csv('Datos streaming/feed/student-por1.csv',
                            sep=';',
                            header=True)
    # Captured before preprocessing so the stream below reads the same layout.
    esquema = raw_df.schema

    df = _prepare_features(raw_df)

    svm = LinearSVC(labelCol="G3",
                    featuresCol="features",
                    maxIter=10,
                    threshold=0.5,
                    aggregationDepth=2,
                    regParam=0.0)
    model = svm.fit(df)
    model.write().overwrite().save("Modelo1")

    # ------------------------------ STREAMING ----------------------------- #
    stream_df = spark.readStream.csv('Datos streaming/read',
                                     sep=';',
                                     header=True,
                                     schema=esquema)
    stream_df = _prepare_features(stream_df)

    # Data output for debugging purposes.
    # NOTE(review): foreach() presumably replaces the sink chosen by
    # format("console") - confirm which one is intended.
    test = stream_df.writeStream.format("console").outputMode("update").foreach(
        predict).trigger(processingTime='65 seconds').start()
    test.awaitTermination()
assembler = VectorAssembler(inputCols=[ "hair", "feathers", "eggs", "milk", "airborne", "aquatic", "predator", "toothed", "backbone", "breathes", "venomous", "fins", "legs", "tail", "domestic", "catsize" ], outputCol="features") # Step - 2: Transform dataframe to vectorized dataframe output = assembler.transform(animals).select("features", "eatable", "cyr_name") output.cache() # Step - 3: Set up the LinearSVC Classifier trainer = LinearSVC(labelCol="eatable", featuresCol="features") # Step - 4: Train the model model = trainer.fit(output) print("Coefficients: " + str(model.coefficients) + " Intercept: " + str(model.intercept)) rawPredictions = model.transform(output) predictions = enrichPredictions(rawPredictions) predictions.show(100) # Step - 5: Evaluate prediction evaluator = BinaryClassificationEvaluator(labelCol="eatable",
# precision: 0.22341727876880027
# recall: 0.6314878892733564
# accuracy: 0.8737938503096747
# auroc: 0.858413849659377
# 19

#============Linear SVM Classifier
# LinearSVC(featuresCol='features', labelCol='label', predictionCol='prediction',
# maxIter=100, regParam=0.0, tol=1e-06, rawPredictionCol='rawPrediction', fitIntercept=True, standardization=True,
# threshold=0.0, weightCol=None, aggregationDepth=2)
#This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. Only supports L2 regularization currently.
#standardization=True

# Time the whole fit + predict + metrics sequence.
a = datetime.now()
svm = LinearSVC(featuresCol='raw_Features', labelCol='Class')
svm_model = svm.fit(training_downsampled)
predictions = svm_model.transform(test_downsampled)  #test
# Cached so the metric computations below reuse the same scored rows.
predictions.cache()
print_binary_metrics(predictions)
b = datetime.now()
# total_seconds() covers the full elapsed span; timedelta.seconds is only the
# seconds *component* and silently drops whole days.
print(int((b - a).total_seconds()))
# actual total: 82183
# actual positive: 4046
# actual negative: 78137
# nP: 12611
# nN: 69572
# TP: 2653
# Summary of the previous model's results; every value printed here is
# computed by earlier cells that are not visible in this part of the file.
print("Correct : ", correct)
print("Wrong: ", wrong)
print("Ratio wrong: " , ratioWrong)
print("Ratio correct: ", ratioCorrect)
print("Ratio true positive : ", truep)
print("Ratio false positive : ", falsep)
print("Ratio true negative : ", truen)
print("Ratio false negative : ", falsen)

# COMMAND ----------

# CV model of LSVC
from pyspark.ml.classification import LinearSVC, LinearSVCModel

# Linear SVM reading the "features" vector and the "label" column.
svm = (
    LinearSVC()
    .setFeaturesCol("features")
    .setLabelCol("label")
)

from pyspark.ml import Pipeline

# The indexers and the assembler are defined in earlier cells.
pipeline = Pipeline().setStages([
    ipindexer,  # categorize international plan
    labelindexer,  # categorize churn
    assembler,  # assemble the feature vector for all columns
    svm])

# Baseline fit of the whole pipeline on the training frame.
pipelineModel = pipeline.fit(trainDF)

# Candidate settings, presumably consumed by a parameter grid / cross-validator
# in a later cell - not visible here.
numFolds = 3
MaxIter = [1000]
RegParam = [0.1, 0.01] # L2 regularization param, set 1.0 with L1 regularization
Tol=[1e-8] # for convergence tolerance for iterative algorithms
# Names accepted for `model_name` / `tuning_method`; validated up front so a
# typo fails immediately instead of as an UnboundLocalError after the split.
_SUPPORTED_MODELS = (
    'LogisticRegression',
    'LinearSVC',
    'DecisionTreeClassifier',
    'GBTClassifier',
    'RandomForestClassifier',
    'OneVsRest',
)
_SUPPORTED_TUNING_METHODS = ('CrossValidator', 'TrainValidationSplit')


def downstream_ml_func(features_df,
                       results_dict,
                       layer_index,
                       model_name='LogisticRegression',
                       extra_config=None,
                       tuning_method=None,
                       seed=2019,
                       test_size=0.2):
    """Train one classifier on `features_df` and record its test accuracy.

    Args:
        features_df: DataFrame with "features" and "label" columns.
        results_dict: dict updated in place; the accuracy is stored under
            `layer_index`.
        layer_index: key under which the accuracy is stored.
        model_name: one of `_SUPPORTED_MODELS`.
        extra_config: optional mapping of estimator param name -> list of
            candidate values used to build the tuning grid.  The special key
            'numFolds' sets the fold count for 'CrossValidator' (default 3).
        tuning_method: None (plain fit), 'CrossValidator' or
            'TrainValidationSplit'.
        seed: seed for the train/test split, tree models and tuners.
        test_size: fraction of rows held out for testing (also the validation
            fraction for 'TrainValidationSplit').

    Returns:
        `results_dict`, with `results_dict[layer_index]` set to the accuracy.

    Raises:
        ValueError: if `model_name` or `tuning_method` is not supported.
    """
    if model_name not in _SUPPORTED_MODELS:
        raise ValueError("Unsupported model_name {!r}; expected one of {}".format(
            model_name, ', '.join(_SUPPORTED_MODELS)))
    if tuning_method is not None and tuning_method not in _SUPPORTED_TUNING_METHODS:
        raise ValueError(
            "Unsupported tuning_method {!r}; expected None or one of {}".format(
                tuning_method, ', '.join(_SUPPORTED_TUNING_METHODS)))
    # Avoid a shared mutable default argument.
    extra_config = {} if extra_config is None else extra_config

    def hyperparameter_tuned_model(clf, train_df):
        """Fit `clf` under the configured tuner and return the fitted tuner model."""
        pipeline = Pipeline(stages=[clf])

        grid_builder = ParamGridBuilder()
        for param_name, candidates in extra_config.items():
            if param_name == 'numFolds':
                continue  # tuner setting, not an estimator param
            # getattr instead of eval: same lookup without executing a string.
            grid_builder = grid_builder.addGrid(getattr(clf, param_name),
                                                candidates)
        paramGrid = grid_builder.build()

        evaluator = MulticlassClassificationEvaluator()
        if tuning_method == 'CrossValidator':
            val_model = CrossValidator(estimator=pipeline,
                                       estimatorParamMaps=paramGrid,
                                       evaluator=evaluator,
                                       numFolds=extra_config.get('numFolds', 3),
                                       seed=seed)
        else:  # 'TrainValidationSplit' (validated above)
            val_model = TrainValidationSplit(
                estimator=pipeline,
                estimatorParamMaps=paramGrid,
                evaluator=evaluator,
                seed=seed,
                # With the default test_size, 80% trains and 20% validates.
                trainRatio=1 - test_size)
        # Run the tuner and choose the best set of parameters.
        return val_model.fit(train_df)

    train_df, test_df = features_df.randomSplit([1 - test_size, test_size],
                                                seed=seed)

    if model_name == 'LogisticRegression':
        clf = LogisticRegression(labelCol="label",
                                 featuresCol="features",
                                 maxIter=10,
                                 regParam=0.1)
    elif model_name == 'LinearSVC':
        clf = LinearSVC(maxIter=5, regParam=0.01)
    elif model_name == 'DecisionTreeClassifier':
        # The tree is trained on StringIndexer-encoded labels.
        # NOTE(review): accuracy below is still computed against the raw
        # "label" column - confirm the two label spaces coincide.
        stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        si_model = stringIndexer.fit(train_df)
        train_df = si_model.transform(train_df)
        clf = DecisionTreeClassifier(maxDepth=2, labelCol="indexed", seed=seed)
    elif model_name == 'GBTClassifier':
        # The StringIndexer fit that used to run here was dropped: its
        # "indexed" output was never read (labelCol is "label").
        clf = GBTClassifier(labelCol="label",
                            featuresCol="features",
                            maxIter=50,
                            maxDepth=5,
                            seed=seed)
    elif model_name == 'RandomForestClassifier':
        # Likewise, the indexed frame was computed here and then discarded.
        clf = RandomForestClassifier(labelCol="label",
                                     featuresCol="features",
                                     seed=seed)
    else:  # 'OneVsRest'
        lr = LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=50,
                                regParam=0.5)
        clf = OneVsRest(labelCol="label",
                        featuresCol="features",
                        predictionCol="prediction",
                        classifier=lr)

    if tuning_method is not None:
        model = hyperparameter_tuned_model(clf, train_df)
    else:
        model = clf.fit(train_df)

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    results_dict[layer_index] = evaluator.evaluate(predictions)
    return results_dict
# Random forest: fit on `train`, score `validation`, report via the shared
# `custom_evaluation` helper (defined elsewhere; `rando_forest` is built in an
# earlier cell).
rando_forest_model = rando_forest.fit(train)
rando_forest_preds = rando_forest_model.transform(validation)
custom_evaluation(rando_forest_preds, 'Random Forest')

# In[128]:

# Gradient-boosted trees (a boosting ensemble; note this is gradient boosting,
# not AdaBoost)
gbtrees = GBTClassifier(maxIter=10)
gbtree_model = gbtrees.fit(train)
gbtree_preds = gbtree_model.transform(validation)
custom_evaluation(gbtree_preds, 'Gradient Boosted Trees')

# In[129]:

# SVM (linear support vector classifier)
svm = LinearSVC(maxIter=10, regParam=0.1)
svm_model = svm.fit(train)
svm_preds = svm_model.transform(validation)
custom_evaluation(svm_preds, 'Support Vector Machine')

# In[130]:

# Logistic regression model
logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = logReg.fit(train)
lr_preds = lrModel.transform(validation)
custom_evaluation(lr_preds, 'Logistic Regression')

# In[131]:

# Visual check for predictions
# Assembler for the text-expression feature group. The contrast-based and
# emotion-based assemblers used below are created earlier in the file.
teb_vectorAssembler = VectorAssembler(inputCols=[
    "Nouns", "Verbs", "Exclamations", "Question_Marks", "Interjections",
    "Ellipsis", "Capitals", "Passive_aggressive_count"
], outputCol='features')

# Vectorise each feature group and keep only what the classifier needs.
contrast_based_transdf = cb_vectorAssembler.transform(contrast_based_features)
contrast_based_df = contrast_based_transdf.select(["features", "label"])

emotion_based_transdf = eb_vectorAssembler.transform(emotion_based_features)
emotion_based_df = emotion_based_transdf.select(["features", "label"])

text_expression_transdf = teb_vectorAssembler.transform(
    text_expression_based_features)
text_expression_based_df = text_expression_transdf.select(
    ["features", "label"])

# One estimator is reused for every feature group and every split.
svc = LinearSVC(maxIter=10, regParam=0.1)

df_list = [contrast_based_df, emotion_based_df, text_expression_based_df]

# Metric accumulators; presumably filled further down the loop body, which
# continues past the visible code.
RMSEs = []
MAEs = []
FS = []
Accuracies = []
Precisions = []
Recalls = []

for item in range(len(df_list)):
    print("-----------------RDD: " + str(item) + " -----------------------")
    # Five independent random 80/20 splits, not a partitioned k-fold: the
    # splits are unseeded, so test sets may overlap and runs are not
    # reproducible.
    for i in range(1, 6):
        print("---------------------FOLD " + str(i) +
              "-----------------------------")
        train, test = df_list[item].randomSplit([0.8, 0.2])
        svcModel = svc.fit(train)
        preds = svcModel.transform(test)
def linearSVC(trainingData,
              testData,
              maxIter,
              regParam,
              aggregationDepth,
              enableCrossValidator=False,
              featuresCol="features",
              labelCol="label",
              predictionCol="prediction",
              tol=1e-6,
              rawPredictionCol="rawPrediction",
              fitIntercept=True,
              standardization=False,
              threshold=0.0):
    """Train a LinearSVC on `trainingData` and predict `testData`.

    Both inputs are RDDs of indexable records where, as read by the code
    below, fields 1..28 are the features, field 30 is the label and field 31
    is a row index.

    Args:
        trainingData: RDD of training records.
        testData: RDD of test records.
        maxIter, regParam, aggregationDepth, tol, fitIntercept,
        standardization, threshold: forwarded to ``LinearSVC``.
        enableCrossValidator: when True, wrap the classifier in a 5-fold
            ``CrossValidator`` (with an empty parameter grid).
        featuresCol, labelCol, predictionCol, rawPredictionCol: column names.
            They are now applied consistently to the DataFrames, the
            evaluator and the result extraction; previously only the
            defaults worked.

    Returns:
        RDD of ``(prediction, label, index)`` tuples.
    """
    print("\nInizio classificazione con LinearSVCClassifier")

    # Initialise the classifier with the given (and default) parameters.
    lsvc = LinearSVC(featuresCol=featuresCol,
                     labelCol=labelCol,
                     predictionCol=predictionCol,
                     maxIter=maxIter,
                     regParam=regParam,
                     tol=tol,
                     rawPredictionCol=rawPredictionCol,
                     fitIntercept=fitIntercept,
                     standardization=standardization,
                     threshold=threshold,
                     aggregationDepth=aggregationDepth)
    print(" -modello creato")

    if enableCrossValidator:
        # Empty grid: cross-validation only evaluates the single
        # configuration above.
        paramGrid = ParamGridBuilder().build()
        # The evaluator must read the same columns the classifier writes.
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol=rawPredictionCol, labelCol=labelCol)
        # k-fold cross validation: estimator is the classifier under test,
        # numFolds is k.
        validator = CrossValidator(estimator=lsvc,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=evaluator,
                                   numFolds=5)  # use 3+ folds in practice
    else:
        validator = lsvc
    print(" -validator creato")

    # Column names follow featuresCol / labelCol so non-default names work.
    training = trainingData.map(
        lambda x: (x[31], Vectors.dense(x[1:29]), x[30])).toDF(
            schema=['index', featuresCol, labelCol]).orderBy('index')

    pipeline = Pipeline(stages=[validator])
    model = pipeline.fit(training)
    print(" -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(
        lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
            schema=[labelCol, featuresCol, 'index']).orderBy('index')

    # (prediction, label, index), read by column name rather than by the
    # position the model happens to append its output columns at.
    predictionsAndLabels = model.transform(test).rdd.map(
        lambda row: (row[predictionCol], row[labelCol], row['index']))
    print(" -" + str(predictionsAndLabels.count()) + " elementi predetti (" +
          str(test.count()) + " elementi usati come test)")

    return predictionsAndLabels
layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

# ---- Linear SVC example ----
from pyspark.ml.classification import LinearSVC

# Load training data
training = spark.read.format("libsvm").load(
    "file:///usr/local/spark/data/mllib/sample_libsvm_data.txt")

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(training)

# Print the coefficients and intercept for linear SVC
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))

# ---- One-vs-Rest example ----
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# load data file.
inputData = spark.read.format("libsvm").load(
    "file:///usr/local/spark/data/mllib/sample_multiclass_classification_data.txt"
)

# generate the train/test split.
# Rebinds `train` / `test`, which the section above also used; the split is
# unseeded, so it differs between runs.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
def SVM(trainingData, testData):
    """Tune a one-vs-rest linear SVM with a hold-out split and compare it to `SVMCV`.

    A ``LinearSVC`` wrapped in ``OneVsRest`` is tuned over `regParam` and
    `maxIter` with an 80/20 ``TrainValidationSplit``, then scored on
    `testData` (accuracy, F1, confusion matrix).  The k-fold variant `SVMCV`
    is run afterwards and the result with the higher F1 score is returned
    (the k-fold one wins ties).

    Args:
        trainingData: DataFrame with "features" and "label" columns.
        testData: DataFrame with the same columns, used for scoring.

    Returns:
        Tuple ``(f1score, confusion_matrix, validator)`` for the better run.
    """
    start_time = time.time()

    print(" ")
    print("--------------------- SUPPORT VECTOR MACHINE ---------------------")

    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)

    # Parameters to tune.  regParam is a double-valued param, so the grid
    # uses floats; integer literals can fail the Python->JVM conversion on
    # some PySpark versions.
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1.0, 0.0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()

    # Tune over the grid to pick the best model.
    tvs = TrainValidationSplit(estimator=ovr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # Validation: 80% training, 20% validation.
                               trainRatio=0.8)

    model = tvs.fit(trainingData)
    prediction = model.transform(testData)

    # Accuracy and F1 on the test set.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="f1")
    f1score = evaluator.evaluate(prediction)

    # Confusion matrix, with classes ordered by descending frequency.
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()

    # Collect labels and predictions in ONE job so the two columns are
    # guaranteed to be row-aligned (two separate toPandas() calls re-run the
    # pipeline and give no ordering guarantee between them).
    scored = prediction.select("label", "prediction").toPandas()
    y_true = scored["label"]
    y_pred = scored["prediction"]

    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)

    print("Accuracy Hold-Out: ", accuracy)
    print("F1-Score Hold-Out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)

    print("SVM HoldOut Execution TIME:", time.time() - start_time)

    # Run the SVM variant that uses k-fold validation.
    f1score_cv, cnf_matrix_cv, cv = SVMCV(trainingData, testData)

    # Return the better of hold-out and k-fold.
    # NOTE(review): the hold-out branch returns the unfitted estimator `tvs`,
    # not the fitted `model` - confirm what callers expect.
    if (f1score <= f1score_cv):
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
#print("RandomForestClassifier parameters:\n" + rf.explainParams() + "\n") model = rf.fit(final_train) predictions = model.transform(final_test) predictions.show() accuracy = evaluator.evaluate(predictions) print("RandomForestClassifier - Test set accuracy = " + str(accuracy)) gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10) #print("GBTClassifier parameters:\n" + gbt.explainParams() + "\n") model = gbt.fit(final_train) predictions = model.transform(final_test) predictions.show() accuracy = evaluator.evaluate(predictions) print("GBTClassifier - Test set accuracy = " + str(accuracy)) lsvc = LinearSVC(maxIter=10, regParam=0.1) #print("LinearSVC parameters:\n" + lsvc.explainParams() + "\n") model = lsvc.fit(final_train) predictions = model.transform(final_test) predictions.show() accuracy = evaluator.evaluate(predictions) print("LinearSVC - Test set accuracy = " + str(accuracy)) nb = NaiveBayes(smoothing=1.0, modelType="multinomial") #print("NaiveBayes parameters:\n" + nb.explainParams() + "\n") model = nb.fit(final_train) predictions = model.transform(final_test) predictions.show() accuracy = evaluator.evaluate(predictions) print("NaiveBayes - Test set accuracy = " + str(accuracy)) '''def cleanup_age():
def lsvc(self, maxIter=10, regParam=0.1):
    """Run a linear support vector classifier through ``self.classify``.

    The run is bracketed by ``self.time_calc.start_time`` and
    ``self.time_calc.end_time``.

    Args:
        maxIter: forwarded to ``LinearSVC``.
        regParam: forwarded to ``LinearSVC``.
    """
    self.time_calc.start_time('\nLinear Support Vector Machine')
    # Registered under the key 'lsvc'; the estimator is passed straight in.
    self.classify('lsvc', LinearSVC(maxIter=maxIter, regParam=regParam))
    self.time_calc.end_time('Linear Support Vector Machine')
"""Fit a LinearSVC on the sample libsvm data set and print its parameters."""
from __future__ import print_function

# $example on$
from pyspark.ml.classification import LinearSVC
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    session_builder = SparkSession.builder.appName("linearSVC Example")
    spark = session_builder.getOrCreate()

    # $example on$
    # Load training data
    libsvm_reader = spark.read.format("libsvm")
    training = libsvm_reader.load("data/mllib/sample_libsvm_data.txt")

    # Fit the model
    lsvcModel = LinearSVC(maxIter=10, regParam=0.1).fit(training)

    # Print the coefficients and intercept for the linear SVC
    coefficients = lsvcModel.coefficients
    intercept = lsvcModel.intercept
    print("Coefficients: " + str(coefficients))
    print("Intercept: " + str(intercept))
    # $example off$

    spark.stop()
# Accuracy of the multilayer-perceptron predictions (`result_MLP` comes from
# earlier code), echoed to stdout and to the open report `file`.
predictionAndLabels = result_MLP.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy_MLP = evaluator.evaluate(predictionAndLabels)
print("Accuracy MLP = " + str(accuracy_MLP))
file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')

print("\n======================================================= ")
print("====================== LINEAR SVC ===================== ")
print("=======================================================\n")

print("\n================== Training ===================\n")

# Train the linear SVC model
trainer_SVC = LinearSVC(maxIter=10, regParam=0.1)
model_linear_svc = trainer_SVC.fit(rescaledData)
print("Done : Linear_SVC training")

print("\n=================== Testing =================== \n")

# SVC test
predictions_svc = model_linear_svc.transform(rescaled_test_df)
#predictions_svc.show()

# Count predictions per class once. The original ran the identical
# countByValue() job twice, once per key; a single pass halves the work and
# guarantees both counts come from the same evaluation.
prediction_counts_svc = predictions_svc.select("prediction").rdd.map(
    lambda x: x["prediction"]).countByValue()
num_pos_svc = prediction_counts_svc[1.0]
num_neg_svc = prediction_counts_svc[0.0]

print("\n== PREDICTION SVC : ==")