def feature_selection(df):
    # Assemble the candidate columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC",
        "Creatinina", "Factor_correccion", "Proteinuria",
        "Farmacos_Antihipertensivos", "Estatina", "Antidiabeticos",
        "Adherencia_tratamiento"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
def feature_selection(df):
    # Assemble the player attribute columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle",
        "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame(
        [(Vectors.dense([-1.0]),), (Vectors.dense([0.0]),), (Vectors.dense([0.0]),)],
        ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorIndexer Single',
        [('a', FloatTensorType([None, model.numFeatures]))],
        target_opset=9)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexed'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def testVectorIndexer(spark, data):
    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
def vector_indexer_usecase():
    spark = getSparkSession()
    data = spark.read.format("libsvm").load("data/lib_svm.txt")
    data.show(1)

    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show(1, truncate=False)
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
        "thalach", "exang", "oldpeak", "slope", "ca", "thal"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=4,
                             featuresCol="indexedFeatures",
                             labelCol="target",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ], outputCol="features")
    df = assembler.transform(df)

    # VectorIndexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    df = indexer.fit(df).transform(df)

    # Chi-square test
    selector = ChiSqSelector(numTopFeatures=8,
                             featuresCol="indexedFeatures",
                             labelCol="DIABETES",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
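# The ChiSqSelector calls above only display the raw vectors, which makes it hard to
# see which original columns survived the selection. A minimal sketch (not part of the
# original snippets) for mapping the selected indices back to column names, assuming the
# VectorAssembler's input order is preserved through the indexer (VectorIndexer re-encodes
# values but does not reorder features):
def selected_feature_names(assembler, selector_model):
    # selectedFeatures holds indices into the assembled feature vector
    input_cols = assembler.getInputCols()
    return [input_cols[i] for i in selector_model.selectedFeatures]

# Hypothetical usage: selected_feature_names(assembler, selector.fit(df))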
def vectorCategory(self):
    from pyspark.ml.feature import VectorIndexer

    data = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()

    ## The next question: how do we combine multiple columns into a single vector column?
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler

    dataset = self.session.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])
    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")
    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
# DBTITLE 1,Importing the libraries for the Regression Tree
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# COMMAND ----------

# DBTITLE 1,Creating the "featureIndexer" column in the dataframe
indexer = VectorIndexer(inputCol="features",
                        outputCol="featureIndexer",
                        maxCategories=10)

# COMMAND ----------

featureIndexer = indexer.fit(df_assembler)

# COMMAND ----------

indexedData = featureIndexer.transform(df_assembler)
indexedData.show()

# COMMAND ----------

df_assembler.columns

# COMMAND ----------

# DBTITLE 1,Splitting the data into train and test sets
(trainingData, testData) = df_assembler.randomSplit([0.8, 0.2])
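# COMMAND ----------

# The imports above bring in DecisionTreeRegressor, Pipeline and RegressionEvaluator,
# but the snippet stops after the train/test split. A minimal sketch of how those
# pieces are commonly wired together, assuming df_assembler has a numeric "label"
# column (the column name is an assumption, not from the snippet):
dt = DecisionTreeRegressor(labelCol="label", featuresCol="featureIndexer")
pipeline = Pipeline(stages=[featureIndexer, dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
print("RMSE: %g" % evaluator.evaluate(predictions))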
""" @author: zhouning @file:VectorIndexer_demo.py @time:2018/8/8 9:03 @desc: StringIndexer是针对单个类别特征进行转换, 倘若所有特征都已经被组织在一个向量中,又想对其中某些单个分量进行处理时,Spark ML提供了VectorIndexer类来解决向量数据集中的类别特征转换。 通过为其提供maxCategories超参数,它可以自动识别哪些特征是类别型的,并且将原始值转换为类别索引。 它基于不同特征值的数量的数量来识别哪些特征需要被类别化,那些取值可能性最对不超过maxCategories的特征需要会被认为是类别型的。 在下面的例子中,我们读入一个数据集,然后使用VectorIndexer训练出模型,来决定哪些特征需要被作为类别特征,将类别特征转换为索引, 这里设置maxCategories为10,即只有种类小的特征才被认为是类别型特征,否则被认为是连续型特征。 """ from pyspark.sql import SparkSession from pyspark.ml.feature import VectorIndexer spark = SparkSession.builder.appName("logistic_regression").getOrCreate() data = spark.read.format('libsvm').load('sample_libsvm_data.txt') indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=2) indexed_model = indexer.fit(data) categorical_features = indexed_model.categoryMaps print(categorical_features) indexed_data = indexed_model.transform(data) indexed_data.show(truncate=False) spark.stop()
def main(sc):
    sqlContext = SQLContext(sc)

    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5], seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500, impurity="gini",
                                     maxDepth=8, maxBins=32)

    rf_init_data.persist()                     # Cache the data set
    rf_init_model = rf_init.fit(rf_init_data)  # Run the Random Forest Algorithm
    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!
    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    # Creating a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most important features
    col_names = sorted_colnames[:6]

    # Pulling data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation
    df_master_new = scalerModel.transform(df_master_new)

    # The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unscaled features
    df_master_new = df_master_new.drop('features')

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5], seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")
    train_all = m.transform(train_all)
    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to the KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()
    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    # Converting to numpy arrays
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)

    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()
    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # saving the model info - varimp, recall, NN, col_names to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
df1, df2, df3, df4 = mtcars.randomSplit(weights=[0.2, 0.2, 0.15, 0.25], seed=123)
df1.count()
df2.count()
df3.count()
df4.count()

mtcars.show()
iris.show(n=5)

from pyspark.ml.feature import VectorIndexer

iris.select(['species']).distinct().count()

# Note: VectorIndexer expects a vector-typed input column, so fitting it on the
# string column 'species' fails; StringIndexer (below) is the right tool here.
indexer = VectorIndexer(maxCategories=3, inputCol='species', outputCol='indexed_species')
model = indexer.fit(iris)

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='species', outputCol='indexed_species')
model = indexer.fit(iris)
model.transform(iris).show(n=5)
model.transform(iris).select(['species', 'indexed_species']).distinct().show()

species = iris.select(['species'])

mtcars.show(n=5)
mtcars.select(['carb']).distinct().count()
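# A minimal sketch of the pattern VectorIndexer actually expects: assemble the numeric
# columns into a vector first, then index that vector column. The iris column names
# used below are assumptions, not taken from the snippet above.
from pyspark.ml.feature import VectorAssembler, VectorIndexer

assembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],  # assumed names
    outputCol='features')
iris_vec = assembler.transform(iris)
iris_indexed = VectorIndexer(inputCol='features', outputCol='indexed_features',
                             maxCategories=3).fit(iris_vec).transform(iris_vec)
iris_indexed.select('features', 'indexed_features').show(n=5)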
# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 7 17:47:59 2017

@author: Akshaykumar.Kore
"""
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("spark://50.50.50.226:7077").appName("adult1").getOrCreate()

df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
                            (Vectors.dense([0.0, 1.0]),),
                            (Vectors.dense([0.0, 2.0]),)], ["a"])
indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
model = indexer.fit(df)
print(model.transform(df).head().indexed)

'''
model.numFeatures
model.categoryMaps
indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test
params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}
model2 = indexer.fit(df, params)
model2.transform(df).head().vector
vectorIndexerPath = temp_path + "/vector-indexer"
indexer.save(vectorIndexerPath)
loadedIndexer = VectorIndexer.load(vectorIndexerPath)
'''
# assembling features
# transforming all the feature columns to one Vector column
assembler = VectorAssembler(
    inputCols=[x for x in irisML.columns if x not in ignore],
    outputCol="features")
assembled_df = assembler.transform(irisML)

# indexing label col
labelIndexer = StringIndexer(inputCol="species", outputCol="indexedLabel")
lbl_indexed_df = labelIndexer.fit(assembled_df).transform(assembled_df)

# indexing features col
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4)
ftrs_indexed_df = featureIndexer.fit(lbl_indexed_df).transform(lbl_indexed_df)

# declaring classifier
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# pipelining stages, chaining assembler and indexers
pipeline = Pipeline(stages=[assembler, labelIndexer, featureIndexer, dt])

# COMMAND ----------

# training model
model = pipeline.fit(trainingData)

# predicting values
predictions = model.transform(testData)
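# COMMAND ----------

# trainingData and testData are not defined in this snippet; a minimal sketch of how
# they are commonly produced from the raw irisML dataframe, and how the predictions
# might be scored. The split ratio, seed, and use of accuracy are assumptions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

(trainingData, testData) = irisML.randomSplit([0.7, 0.3], seed=42)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: %g" % evaluator.evaluate(predictions))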
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show(10)

# COMMAND ----------

from pyspark.ml.feature import Tokenizer

tkn = Tokenizer()\
    .setInputCol("Description")\
    .setOutputCol("DescOut")
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorIndexerExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
    # $example off$

    spark.stop()
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if feature selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.

        # Load CSV data
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create vector assembler to produce a feature vector for each record for use in MLlib
        # First 45 csv fields are features, the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble feature vector in new dataframe
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

        # Train model
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)
        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32)
        # model.save(sc, path+'model-'+file)

        return model, index