def feature_selection(df): assembler = VectorAssembler(inputCols=[ "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes" ], outputCol="features") df = assembler.transform(df) indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4) df = indexer.fit(df).transform(df) # Seleccionamos features que mas suman al modelo selector = ChiSqSelector(numTopFeatures=5, featuresCol="indexedFeatures", labelCol="Position", outputCol="selectedFeatures") resultado = selector.fit(df).transform(df) resultado.select("features", "selectedFeatures").show()
def preprocessed_df(df, label="flg_cmd_lowcostIndex"):
    max_values_to_define_str_cols = 10
    id_col = 'ID_CLIENT'
    dty = dict(df.dtypes)
    str_cols = [k for k, v in dty.items() if v == 'string']
    str_cols.remove(id_col)
    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
        model_str = stringIndexer.fit(df)
        df = model_str.transform(df).drop(c)
    input_cols = df.columns
    input_cols.remove(id_col)
    input_cols.remove(label)
    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    df = assembler.transform(df)
    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=max_values_to_define_str_cols).fit(df)
    return featureIndexer.transform(df), df
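# A minimal usage sketch for preprocessed_df, not part of the original snippet.
# The CSV path "clients.csv" is a hypothetical placeholder; a SparkSession named
# `spark` and a table containing ID_CLIENT plus the flg_cmd_lowcostIndex label
# (the defaults above) are assumed.
raw_df = spark.read.csv("clients.csv", header=True, inferSchema=True)
indexed_df, assembled_df = preprocessed_df(raw_df, label="flg_cmd_lowcostIndex")
indexed_df.select("ID_CLIENT", "indexedFeatures").show(5)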
def feature_selection(df): assembler = VectorAssembler(inputCols=[ "Edad", "Genero", "Zona", "Fumador_Activo", "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria", "Tension_sistolica", "Tension_diastolica", "Colesterol_Total", "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno", "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC", "Creatinina", "Factor_correccion", "Proteinuria", "Farmacos_Antihipertensivos", "Estatina", "Antidiabeticos", "Adherencia_tratamiento" ], outputCol="features") df = assembler.transform(df) indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=15) df = indexer.fit(df).transform(df) # Seleccionamos features que mas suman al modelo selector = ChiSqSelector(numTopFeatures=15, featuresCol="indexedFeatures", labelCol="Diabetes", outputCol="selectedFeatures") resultado = selector.fit(df).transform(df) resultado.select("features", "selectedFeatures").show(100)
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                       (Vectors.dense([0.0]), ),
                                       (Vectors.dense([0.0]), )], ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorIndexer Single',
        [('a', FloatTensorType([None, model.numFeatures]))],
        target_opset=9)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexed'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def training_data_prepare(spark, filename):
    # When processing only a small amount of data, enable this function for speed
    header, rdd = fw.read_data(spark, filename)
    # Initial clean-up of the header
    header = list(header.split(','))
    header.pop(0)
    # # Enable this function instead when running concurrently
    # header, rdd = fw.read_list_data(spark, filename)

    # Print record count, max, min, mean and standard deviation,
    # and compute the Pearson correlation coefficients
    personal_array = data_description(spark, rdd.cache(), header)
    # Print the Pearson correlation coefficients
    personal_show(personal_array, header)
    # Rescale columns whose magnitude is too large
    rdd = annual_premium_scaler(spark, rdd, header)
    # Merge features with strong linear correlation
    rdd, header = damaged_couple(rdd, header)
    # Discretize the age column
    rdd = arrange_for_age(rdd, header)
    # Second cleaning pass using a chi-square test
    cleaned_rdd, cleaned_header = useful_select(spark, rdd, header)

    def map_fuc_rdd(row):
        ret = []
        for i in range(len(row) - 1):
            ret.append(row[i])
        return ret, row[-1]

    cleaned_rdd = cleaned_rdd.map(map_fuc_rdd)

    # Set the number of dimensions for PCA
    # n = 3
    # pp.PCA_builder(spark, rdd, n)

    # Convert the data into the format required by the ML algorithms
    def map_fuc(row):
        features_array = np.array(row[0])
        index_array = np.arange(features_array.size)
        num = features_array.size
        return row[1], Vectors.sparse(num, index_array, features_array)

    labeled_points_rdd = cleaned_rdd.map(map_fuc)
    # print(labeled_points_rdd.first())
    data = spark.createDataFrame(labeled_points_rdd,
                                 schema=['label', 'indexedFeatures'])
    # Apply the feature transformer for further processing
    data = VectorIndexer(inputCol="indexedFeatures", outputCol="features",
                         maxCategories=4).fit(data).transform(data)
    # Drop the column that is no longer needed
    data = data.drop("indexedFeatures")
    data = balanceDataset(data)
    # Split the dataset into training and test sets
    training_data, check_data = data.randomSplit([0.7, 0.3])
    # Oversample the minority class
    training_data = enlarge_data(training_data, 0.15)
    return training_data.cache(), check_data.cache(), cleaned_header
def chiSquareTest(self, categoricalFeatures, maxCategories):
    dataset = self.dataset
    labelColm = self.labelColm
    features = self.features
    length = len(features)
    featureassembler = VectorAssembler(
        inputCols=self.features,
        outputCol="featuresChiSquare", handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    vec_indexer = VectorIndexer(inputCol="featuresChiSquare",
                                outputCol='vecIndexedFeaturesChiSqaure',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSqaure')
    # finalized_data.show()

    # Select features using the chi-square selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vecIndexedFeaturesChiSqaure",
                             outputCol="selectedFeatures", labelCol=labelColm)
    result = selector.fit(dataset).transform(dataset)
    print("chi2 output with top %d features selected" % selector.getNumTopFeatures())
    result.show()

    # Run the chi-square test on the selected features
    r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)
    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)

    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics, categoricalFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)
    result = {'pvalues': chiSquareDict}
    return result
def train_test(self, df):
    df = self.dropNonTCPUDP(df)
    catCols = []
    numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy',
               'num_pkts_out', 'num_pkts_in', 'duration']
    labelCol = 'label'
    data = self.get_dummy(df, catCols, numCols, labelCol)
    data.show()

    labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
    labelIndexer.transform(data)

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures").fit(data)
    featureIndexer.transform(data)

    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    trainingData.cache()
    # trainingData.repartition(200)
    testData.cache()
    # testData.repartition(200)
    trainingData.show(5, False)
    testData.show(5, False)

    rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
    logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("features", "label", "predictedLabel", "prediction")

    # Select (prediction, true label) and compute test error
    print(self.getTestError(predictions))
    self.printMetrics(predictions)
    # print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))
    return model
def testVectorIndexer(spark, data):
    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
def test_data_prepare(spark, filename, header):
    # When processing only a small amount of data, enable this function for speed
    test_header, rdd = fw.read_data(spark, filename)
    # Initial clean-up of the header
    test_header = list(test_header.split(','))
    test_header.pop(0)
    # # Enable this function instead when running concurrently
    # test_header, rdd = fw.read_list_data(spark, filename)

    # Rescale columns whose magnitude is too large
    rdd = annual_premium_scaler(spark, rdd, test_header)
    # Merge strongly correlated features
    rdd, test_header = damaged_couple(rdd, test_header)
    # Discretize the age column
    rdd = arrange_for_age(rdd, test_header)

    # Record the indices of the columns kept in the cleaned training-set header
    num_array = []
    for i in range(len(test_header)):
        if test_header[i] in header:
            num_array.append(i)

    # Filter the data according to that list
    def map_fuc(row):
        ret = []
        for n in num_array:
            ret.append(row[n])
        return ret

    rdd = rdd.map(map_fuc)

    # Convert the data into the format required by the ML algorithms
    def Vectors_map_fuc(row):
        features_array = np.array(row)
        index_array = np.arange(features_array.size)
        num = features_array.size
        return (Vectors.sparse(num, index_array, features_array), )

    labeled_points_rdd = rdd.map(Vectors_map_fuc)
    data = spark.createDataFrame(labeled_points_rdd, schema=['indexedFeatures'])
    data = VectorIndexer(inputCol="indexedFeatures", outputCol="features",
                         maxCategories=4).fit(data).transform(data)
    # Drop the column that is no longer needed
    data = data.drop("indexedFeatures")
    return data.cache()
def training(df):
    # 0. load the cleaning data
    df_cleanning = df.select("id").distinct()
    # Split the data into training and test sets (30% held out for testing)
    (df_training, df_test) = df_cleanning.randomSplit([0.7, 0.3])

    # 1. load the training data
    # Prepare the training set
    df_result = df
    df_result = df_result.select("id", "label", "features")
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df_result)
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=6).fit(df_result)
    df_training.show(10)

    # 1.1 Build the training set
    df_training = df_training.join(df_result, how="left", on="id")
    df_training.show()
    print(df_training.count())

    # 1.2 Build the test set
    df_test = df_test.join(df_result, how="left", on="id")
    df_test.show()
    print(df_test.count())

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model. This also runs the indexers.
    model = pipeline.fit(df_training)

    # Make predictions.
    df_predictions = model.transform(df_test)

    # Select example rows to display.
    df_predictions.show(10)
    df_predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(df_predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/pfizer_model/0.0.4/model_without_prod"
    )
    print(treeModel.toDebugString)
    return treeModel
def decision_tree_classifier(trainingDataFrame, maxCategories=4, maxDepth=5, maxBins=32,
                             minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                             cacheNodeIds=False, checkpointInterval=10, impurity="gini",
                             seed=None):
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel"). \
        setHandleInvalid("keep").fit(trainingDataFrame)
    featureIndexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures",
        maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                maxDepth=maxDepth, maxBins=maxBins,
                                minInstancesPerNode=minInstancesPerNode,
                                minInfoGain=minInfoGain, maxMemoryInMB=maxMemoryInMB,
                                cacheNodeIds=cacheNodeIds,
                                checkpointInterval=checkpointInterval,
                                impurity=impurity, seed=seed)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[2]
    return result
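# A minimal usage sketch for decision_tree_classifier, not part of the original
# snippet. It assumes a SparkSession named `spark` and data that already has
# "label" and "features" columns, such as the bundled libsvm sample used by other
# snippets in this file.
training_df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
fitted = decision_tree_classifier(training_df, maxCategories=4, maxDepth=5)
print(fitted["summary"].toDebugString)  # learned tree structure
fitted["model"].transform(training_df).select("prediction", "indexedLabel").show(5)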
def trainModel(self, trainingData):
    """
    Train the model.
    By default, train on the entire dataset
    (split ratio: 100% training, 0% testing).
    """
    labelIndexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(trainingData)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(trainingData)
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=30, maxDepth=5, maxBins=32,
                                seed=None, impurity="gini")
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])
    model = pipeline.fit(trainingData)
    model.write().overwrite().save(os.path.join(self.modelpath, "detector"))
    return model
def main():
    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    # 2. Load the raw data
    text_file = sc.textFile("s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

    # 3. Transform data
    af = (text_file.map(getVals))

    # 4. Create a DataFrame out of this using the toDF method and cache it
    afdf = af.toDF([
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'duration'
    ]).cache()

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(afdf)

    # 5. Create a train/test split with 70% of data in training set and 30% in test set
    afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model. This also runs the indexer.
    model = pipeline.fit(afdf_train)

    # Make predictions.
    predictions = model.transform(afdf_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    rfModel = model.stages[1]
    print(rfModel)  # summary only

    # Export the fitted model and the pipeline definition
    rfModel.save('s3a://spotifybuck/model-export' +
                 datetime.now().strftime('%Y%m%d%H%M'))
    pipeline.save('s3a://spotifybuck/pipeline-export' +
                  datetime.now().strftime('%Y%m%d%H%M'))

    sc.stop()
def model(classifiers, training, testing, week):
    results = ""
    timing = []
    for classifier in classifiers:
        timeStart = time.time()
        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)
        prediction = model.transform(testing)

        metrics = BinaryClassificationMetrics(
            prediction.select("label", "prediction").rdd)
        results = results + "new," + classifier + "," + week + "," + \
            str(metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"
        timing.append(time.time() - timeStart)
    return results, timing
def prepare(self):
    data = (self.spark_session.read.format(self.data_format)
            .load(self.data_file))
    labelIndexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(data)
    featureIndexer = (VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=self.max_categories)
                      .fit(data))
    self.train_data, self.valid_data = data.randomSplit([0.8, 0.2])

    if self.model_builder.__name__ == 'DecisionTreeClassifier':
        classifier = self.model_builder(labelCol="indexedLabel",
                                        featuresCol="indexedFeatures")
    elif self.model_builder.__name__ == 'RandomForestClassifier':
        classifier = self.model_builder(labelCol="indexedLabel",
                                        featuresCol="indexedFeatures",
                                        numTrees=self.num_trees)
        labelConverter = IndexToString(inputCol="prediction",
                                       outputCol="predictedLabel",
                                       labels=labelIndexer.labels)
    elif self.model_builder.__name__ == 'GBTClassifier':
        classifier = self.model_builder(labelCol="indexedLabel",
                                        featuresCol="indexedFeatures",
                                        maxIter=self.max_iter)

    if self.model_builder.__name__ == 'RandomForestClassifier':
        self.pipeline = Pipeline(stages=[labelIndexer, featureIndexer,
                                         classifier, labelConverter])
    else:
        self.pipeline = Pipeline(stages=[labelIndexer, featureIndexer, classifier])
def _fit(self, dataset, estimator, estimatorParamMaps, samplingrates, numfolds=5):
    all_list = dataset.columns
    all_list.remove('Class')  # all feature column names
    # Assemble the feature columns into a single vector column
    assembler = VectorAssembler().setInputCols(all_list).setOutputCol("features_vector")
    # Standardize the label column name to "label"
    labelIndexer = StringIndexer(inputCol="Class", outputCol="label")
    # Standardize the feature-vector column name; columns with fewer than 10
    # distinct values are indexed as categorical
    featureIndexer = VectorIndexer(
        inputCol="features_vector", outputCol="features", maxCategories=10)
    # Build the ML pipeline from the three stages
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, estimator])
    # Generate the feature-vector column for the training set
    dataset = assembler.transform(dataset)
    # Cross-validation
    best_epm, best_sampling, metricsX = self.Cross_Validation(
        dataset, estimator, estimatorParamMaps, samplingrates, numfolds)
    # Fit the best model and return it
    bestModel = pipeline.fit(
        dataset.sampleBy("Class", fractions={
            1.0: 1.0,
            0.0: best_sampling
        }), best_epm)
    return bestModel, best_epm, best_sampling
def dtr(self):
    # Load and parse the data file, converting it to a DataFrame.
    data = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a decision tree regression model.
    drg = DecisionTreeRegressor(featuresCol="indexedFeatures")

    # Chain indexer and tree in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, drg])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    treeModel = model.stages[1]
    print(treeModel)  # summary only
def DecisionTree():
    IrisData = spark.sparkContext.textFile(
        "file:///home/unbroken/MyFiles/Work/Programming/Spark/DecisionTree/Iris.txt") \
        .map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    IrisData.createOrReplaceTempView("iris")
    df = spark.sql("select * from iris")

    labelIndexer = StringIndexer(inputCol='label', outputCol='labelIndex').fit(IrisData)
    featureIndexer = VectorIndexer(
        inputCol='feature', outputCol='indexFeature').setMaxCategories(4).fit(IrisData)
    labelConverter = IndexToString(
        inputCol='prediction', outputCol='predictionLabel').setLabels(labelIndexer.labels)

    trainningData, testingData = IrisData.randomSplit([0.7, 0.3])

    dtClassifier = DecisionTreeClassifier().setLabelCol('labelIndex').setFeaturesCol('indexFeature')
    pipelineClassifier = Pipeline().setStages(
        [labelIndexer, featureIndexer, dtClassifier, labelConverter])
    modelClassifier = pipelineClassifier.fit(trainningData)
    prediction = modelClassifier.transform(testingData)
    prediction.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol(
        'labelIndex').setPredictionCol('prediction').setMetricName("accuracy")
    accuracy = evaluator.evaluate(prediction)
    print(accuracy)

    treeModelClassifier = modelClassifier.stages[2]
    print("Learned classification tree model:\n" + str(treeModelClassifier.toDebugString))
def entrenar(df):
    vectorAssembler = VectorAssembler(inputCols=[
        "Position", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing",
        "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing",
        "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions",
        "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ], outputCol="features")
    stringIndexer = StringIndexer(inputCol="Position", outputCol="indexedLabel")
    vectorIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # Split into training data and test data
    (training_df, test_df) = df.randomSplit([0.7, 0.3])

    # Configure the neural network
    capas = [13, 13, 13, 2]
    entrenador = MultilayerPerceptronClassifier(layers=capas,
                                                featuresCol="indexedFeatures",
                                                labelCol="indexedLabel",
                                                maxIter=10000)

    # Train the neural network
    pipeline = Pipeline(
        stages=[vectorAssembler, stringIndexer, vectorIndexer, entrenador])
    return pipeline.fit(training_df), test_df
def trainModel(self, trainingData):
    """
    Train the model.
    By default, train on the entire dataset
    (split ratio: 100% training, 0% testing).
    """
    # Convert all labels to numeric indices if not already numeric
    # trainingData.select("label").groupBy("label").count().show()
    labelIndexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(trainingData)

    # Convert all feature values to numeric indices if not already numeric
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(trainingData)

    # Declare the RandomForest algorithm
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=30, maxDepth=5, maxBins=32,
                                seed=None, impurity="gini")

    # Convert predicted labels from numeric form back to the original labels
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Combine all steps into a single pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train the model via the pipeline
    model = pipeline.fit(trainingData)
    model.write().overwrite().save(os.path.join(self.modelpath, "detector"))
    return model
def train_boosted_regression(self, depth=2, n_trees=50, learning_rate=.01, max_cats=6):
    '''
    Train the dataset on gradient-boosted decision trees.
    --------
    Parameters
    depth: int - maximum allowable depth of the decision trees
    n_trees: int - maximum number of boosting iterations
    learning_rate: float - step size with which the model fits
    max_cats: int - features with more distinct values are treated as continuous
    --------
    '''
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=max_cats).fit(self.train)
    gbr = GBTRegressor(labelCol='label', featuresCol="features", maxDepth=depth,
                       maxIter=n_trees, stepSize=learning_rate, maxMemoryInMB=2000)
    pipeline = Pipeline(stages=[featureIndexer, gbr])
    # Train model. This also runs the indexer.
    self.model = pipeline.fit(self.train)
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1]) for i in
                            td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(lp_data)

    rf = get_model()
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)

    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0) \
            .filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())

    for each in predictions.take(10):
        print(each)
def vector_index(cls, input_column, max_categories, output_column="features"):
    """
    author: [email protected]
    Builds a VectorIndexer that indexes categorical values inside a vector column.
    :param input_column: name of the assembled vector column to be indexed
    :param max_categories: features with more distinct values are treated as continuous
    :param output_column: name of the newly appended indexed column
    :return: an (unfitted) VectorIndexer instance
    """
    try:
        cls.logger.debug("Columns to vector index: " + str(input_column))
        cls.logger.info("Vector indexing data")
        vector_indexer = VectorIndexer(inputCol=input_column,
                                       outputCol=output_column,
                                       maxCategories=max_categories,
                                       handleInvalid='keep')
        return vector_indexer
    except Exception as exp:
        cls.logger.error(
            'Exception occurred while applying vector indexer using input_column: '
            + str(input_column))
        raise DataFrameException(exp)
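# Hedged usage sketch, not part of the class above. The enclosing class is not shown
# in this snippet, so `FeatureTransformer` is a hypothetical name, and `assembled_df`
# is assumed to already carry a vector column named "assembled" (e.g. produced by a
# VectorAssembler). The returned indexer still needs to be fit before transforming.
indexer = FeatureTransformer.vector_index(input_column="assembled", max_categories=10)
indexed_df = indexer.fit(assembled_df).transform(assembled_df)
indexed_df.select("features").show(5)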
def UsefulnessPredictionSentmentWithoutCV(trainingdata, model):
    # Data Preprocessing
    assembler = VectorAssembler(inputCols=[
        'num', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos',
        'sentiment_compound', 'Character_adj', 'Character_noun',
        'Character_verb', 'Character_adv'
    ], outputCol="features")
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[assembler, featureIndexer, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    Model = pipeline.fit(trainingdata)
    return Model
def test_random_forrest_regression(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count),
                               x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='error')
    rf = RandomForestRegressor(labelCol="indexedLabel",
                               featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml RandomForest Regressor',
        [('label', StringTensorType([1, 1])),
         ('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data.limit(1))
    data_np = {
        'label': data.limit(1).toPandas().label.values,
        'features': data.limit(1).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def train(data, max_depth, max_bins):
    print("Parameters: max_depth: {} max_bins: {}".format(max_depth, max_bins))
    # spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    # data = spark.read.format("libsvm").load(os.environ['DSX_PROJECT_DIR']+data_path)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                maxDepth=max_depth, maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, dt])

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {} ".format(test_error))
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    tree_model = model.stages[2]
    print(tree_model)

    mlflow.spark.log_model(model, '')
    spark.stop()
def DonusumuBaslat(self):
    sp_df = self.spark_df
    messagebox.showinfo("Uyarı", "Dönüşüm Başladı")  # "Warning", "Transformation started"
    self.data_f = self.get_dummy()
    self.data_f.show(25, False)

    self.labelIndexer = StringIndexer(inputCol='label',
                                      outputCol='indexedLabel').fit(self.data_f)
    self.labelIndexer.transform(self.data_f).show(25, False)

    self.featureIndexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=4).fit(self.data_f)
    self.featureIndexer.transform(self.data_f).show(25, False)

    self.labelConverter = IndexToString(inputCol="prediction",
                                        outputCol="predictedLabel",
                                        labels=self.labelIndexer.labels)

    if self.testTxt.get() == '':
        messagebox.showinfo("Hata", "Lütfen Test oranını girin")  # "Error", "Please enter the test ratio"
    else:
        deger = self.testTxt.get()
        testPoint = float(deger) / 100
        (self.trainingData, self.testData) = self.data_f.randomSplit(
            [1.0 - testPoint, testPoint], seed=100)
        messagebox.showinfo("Başarılı", "Oran Hesaplandı")  # "Success", "Ratio calculated"
        self.DonusumBtn.grid_remove()
def decision_tree_regression(trainingDataFrame, maxCategories=4):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[1]
    return result
def random_forest_regression(trainingDataFrame, maxCategories=4, numTrees=10):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    rf = RandomForestRegressor(featuresCol="indexedFeatures", numTrees=numTrees)
    pipeline = Pipeline(stages=[featureIndexer, rf])
    rfModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = rfModel
    result["summary"] = rfModel.stages[1]
    return result
def gradient_boosted_tree_regression(trainingDataFrame, maxCategories=4, maxIter=10):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=maxIter)
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    gbtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = gbtModel
    result["summary"] = gbtModel.stages[1]
    return result
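# Minimal usage sketch for the three regression helpers above (not from the original
# source). A SparkSession named `spark` and the libsvm sample file that other snippets
# in this file load are assumed; both are illustrative.
train_df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
for helper in (decision_tree_regression, random_forest_regression,
               gradient_boosted_tree_regression):
    fitted = helper(train_df, maxCategories=4)
    fitted["model"].transform(train_df).select("prediction", "label").show(3)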
from pyspark.ml.feature import IndexToString

labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show()


# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()


# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()


# COMMAND ----------
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorIndexerExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
    # $example off$

    spark.stop()