def feature_selection(df):
    # Assemble the candidate columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC",
        "Creatinina", "Factor_correccion", "Proteinuria",
        "Farmacos_Antihipertensivos", "Estatina", "Antidiabeticos",
        "Adherencia_tratamiento"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=15)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=15,
                             featuresCol="indexedFeatures",
                             labelCol="Diabetes",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
def feature_selection(df):
    # Assemble the player attribute columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle",
        "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=5,
                             featuresCol="indexedFeatures",
                             labelCol="Position",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame(
        [(Vectors.dense([-1.0]),), (Vectors.dense([0.0]),), (Vectors.dense([0.0]),)],
        ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorIndexer Single',
        [('a', FloatTensorType([None, model.numFeatures]))],
        target_opset=9)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexed'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def testVectorIndexer(spark, data):
    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
def vector_indexer_usecase():
    spark = getSparkSession()
    data = spark.read.format("libsvm").load("data/lib_svm.txt")
    data.show(1)

    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show(1, truncate=False)
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
        "thalach", "exang", "oldpeak", "slope", "ca", "thal"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute most to the model
    selector = ChiSqSelector(numTopFeatures=4,
                             featuresCol="indexedFeatures",
                             labelCol="target",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ], outputCol="features")
    df = assembler.transform(df)

    # VectorIndexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    df = indexer.fit(df).transform(df)

    # Chi-square test
    selector = ChiSqSelector(numTopFeatures=8,
                             featuresCol="indexedFeatures",
                             labelCol="DIABETES",
                             outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
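# The ChiSqSelector calls above only display the raw vectors, which makes it hard to
# see which original columns survived the selection. A minimal sketch (not part of the
# original snippets) for mapping the selected indices back to column names, assuming the
# VectorAssembler's input order is preserved through the indexer (VectorIndexer re-encodes
# values but does not reorder features):
def selected_feature_names(assembler, selector_model):
    # selectedFeatures holds indices into the assembled feature vector
    input_cols = assembler.getInputCols()
    return [input_cols[i] for i in selector_model.selectedFeatures]

# Hypothetical usage: selected_feature_names(assembler, selector.fit(df))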
def vectorCategory(self):
    from pyspark.ml.feature import VectorIndexer

    data = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed",
                            maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()

    ## The next question: how do we combine multiple columns into a single vector column?
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler

    dataset = self.session.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])
    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")
    output = assembler.transform(dataset)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
# DBTITLE 1,Importing the libraries for the Regression Tree
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# COMMAND ----------

# DBTITLE 1,Creating the "featureIndexer" column in the dataframe
indexer = VectorIndexer(inputCol="features",
                        outputCol="featureIndexer",
                        maxCategories=10)

# COMMAND ----------

featureIndexer = indexer.fit(df_assembler)

# COMMAND ----------

indexedData = featureIndexer.transform(df_assembler)
indexedData.show()

# COMMAND ----------

df_assembler.columns

# COMMAND ----------

# DBTITLE 1,Splitting the data into train and test sets
(trainingData, testData) = df_assembler.randomSplit([0.8, 0.2])
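# COMMAND ----------

# The imports above bring in DecisionTreeRegressor, Pipeline and RegressionEvaluator,
# but the snippet stops after the train/test split. A minimal sketch of how those
# pieces are commonly wired together, assuming df_assembler has a numeric "label"
# column (the column name is an assumption, not from the snippet):
dt = DecisionTreeRegressor(labelCol="label", featuresCol="featureIndexer")
pipeline = Pipeline(stages=[featureIndexer, dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
print("RMSE: %g" % evaluator.evaluate(predictions))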
""" @author: zhouning @file:VectorIndexer_demo.py @time:2018/8/8 9:03 @desc: StringIndexer是针对单个类别特征进行转换, 倘若所有特征都已经被组织在一个向量中,又想对其中某些单个分量进行处理时,Spark ML提供了VectorIndexer类来解决向量数据集中的类别特征转换。 通过为其提供maxCategories超参数,它可以自动识别哪些特征是类别型的,并且将原始值转换为类别索引。 它基于不同特征值的数量的数量来识别哪些特征需要被类别化,那些取值可能性最对不超过maxCategories的特征需要会被认为是类别型的。 在下面的例子中,我们读入一个数据集,然后使用VectorIndexer训练出模型,来决定哪些特征需要被作为类别特征,将类别特征转换为索引, 这里设置maxCategories为10,即只有种类小的特征才被认为是类别型特征,否则被认为是连续型特征。 """ from pyspark.sql import SparkSession from pyspark.ml.feature import VectorIndexer spark = SparkSession.builder.appName("logistic_regression").getOrCreate() data = spark.read.format('libsvm').load('sample_libsvm_data.txt') indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=2) indexed_model = indexer.fit(data) categorical_features = indexed_model.categoryMaps print(categorical_features) indexed_data = indexed_model.transform(data) indexed_data.show(truncate=False) spark.stop()
def main(sc):
    sqlContext = SQLContext(sc)

    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5], seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500, impurity="gini",
                                     maxDepth=8, maxBins=32)

    rf_init_data.persist()                     # Cache the data set
    rf_init_model = rf_init.fit(rf_init_data)  # Run the Random Forest Algorithm
    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!
    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    # Creating a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most important features
    col_names = sorted_colnames[:6]

    # Pulling data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels
    # and all the ones in between are the given features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation
    df_master_new = scalerModel.transform(df_master_new)

    # The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unscaled features
    df_master_new = df_master_new.drop('features')

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5], seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")
    train_all = m.transform(train_all)
    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to the KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()
    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    # Converting to numpy arrays
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)

    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()
    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # saving the model info - varimp, recall, NN, col_names to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
df1, df2, df3, df4 = mtcars.randomSplit(weights=[0.2, 0.2, 0.15, 0.25], seed=123)
df1.count()
df2.count()
df3.count()
df4.count()

mtcars.show()
iris.show(n=5)

from pyspark.ml.feature import VectorIndexer

iris.select(['species']).distinct().count()

# Note: VectorIndexer expects a vector-typed input column, so fitting it on the
# string column 'species' fails; StringIndexer (below) is the right tool here.
indexer = VectorIndexer(maxCategories=3, inputCol='species', outputCol='indexed_species')
model = indexer.fit(iris)

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='species', outputCol='indexed_species')
model = indexer.fit(iris)
model.transform(iris).show(n=5)
model.transform(iris).select(['species', 'indexed_species']).distinct().show()

species = iris.select(['species'])

mtcars.show(n=5)
mtcars.select(['carb']).distinct().count()
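# A minimal sketch of the pattern VectorIndexer actually expects: assemble the numeric
# columns into a vector first, then index that vector column. The iris column names
# used below are assumptions, not taken from the snippet above.
from pyspark.ml.feature import VectorAssembler, VectorIndexer

assembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],  # assumed names
    outputCol='features')
iris_vec = assembler.transform(iris)
iris_indexed = VectorIndexer(inputCol='features', outputCol='indexed_features',
                             maxCategories=3).fit(iris_vec).transform(iris_vec)
iris_indexed.select('features', 'indexed_features').show(n=5)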
# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 7 17:47:59 2017

@author: Akshaykumar.Kore
"""
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("spark://50.50.50.226:7077").appName("adult1").getOrCreate()

df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
                            (Vectors.dense([0.0, 1.0]),),
                            (Vectors.dense([0.0, 2.0]),)], ["a"])
indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
model = indexer.fit(df)
print(model.transform(df).head().indexed)

'''
model.numFeatures
model.categoryMaps
indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test
params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}
model2 = indexer.fit(df, params)
model2.transform(df).head().vector
vectorIndexerPath = temp_path + "/vector-indexer"
indexer.save(vectorIndexerPath)
loadedIndexer = VectorIndexer.load(vectorIndexerPath)
'''
# assembling features
# transforming all the feature columns to one Vector column
assembler = VectorAssembler(
    inputCols=[x for x in irisML.columns if x not in ignore],
    outputCol="features")
assembled_df = assembler.transform(irisML)

# indexing label col
labelIndexer = StringIndexer(inputCol="species", outputCol="indexedLabel")
lbl_indexed_df = labelIndexer.fit(assembled_df).transform(assembled_df)

# indexing features col
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4)
ftrs_indexed_df = featureIndexer.fit(lbl_indexed_df).transform(lbl_indexed_df)

# declaring classifier
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# pipelining stages, chaining assembler and indexers
pipeline = Pipeline(stages=[assembler, labelIndexer, featureIndexer, dt])

# COMMAND ----------

# training model
model = pipeline.fit(trainingData)

# predicting values
predictions = model.transform(testData)
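# COMMAND ----------

# trainingData and testData are not defined in this snippet; a minimal sketch of how
# they are commonly produced from the raw irisML dataframe, and how the predictions
# might be scored. The split ratio, seed, and use of accuracy are assumptions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

(trainingData, testData) = irisML.randomSplit([0.7, 0.3], seed=42)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: %g" % evaluator.evaluate(predictions))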
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show(10)

# COMMAND ----------

from pyspark.ml.feature import Tokenizer

tkn = Tokenizer()\
    .setInputCol("Description")\
    .setOutputCol("DescOut")
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorIndexerExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
    indexerModel = indexer.fit(data)

    categoricalFeatures = indexerModel.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categoricalFeatures),
           ", ".join(str(k) for k in categoricalFeatures.keys())))

    # Create new column "indexed" with categorical values transformed to indices
    indexedData = indexerModel.transform(data)
    indexedData.show()
    # $example off$

    spark.stop()
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if feature selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.

        # Load CSV data
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create vector assembler to produce a feature vector for each record for use in MLlib
        # First 45 csv fields are features, the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble feature vector in new dataframe
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

        # Train model
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)
        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32)
        # model.save(sc, path+'model-'+file)

        return model, index