from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import (MinMaxScaler, OneHotEncoder, StringIndexer,
                                VectorAssembler)


def build_pipeline(pipeconfig: dict) -> Pipeline:
    '''
    Build a Pipeline instance based on a config file
    :param pipeconfig: metadata dictionary
    :return: pyspark.ml.Pipeline
    '''
    # Pipeline metadata
    cats = pipeconfig['variables']['categoricals']
    nums = pipeconfig['variables']['numericals']
    index_names = pipeconfig['metadata']['index_names']
    encoded_names = pipeconfig['metadata']['encoded_names']
    vect_name = pipeconfig['metadata']['vect_name']
    feats_name = pipeconfig['metadata']['feats_name']
    labelcol = pipeconfig['model']['labelCol']
    maxdepth = pipeconfig['model']['maxDepth']
    maxbins = pipeconfig['model']['maxBins']
    maxiter = pipeconfig['model']['maxIter']
    seed = pipeconfig['model']['seed']

    # Build stages: index the categoricals, one-hot encode the indices,
    # assemble numericals plus encodings into one vector, scale it to [0, 1],
    # and finish with a gradient-boosted tree classifier.
    # Note: the multi-column StringIndexer/OneHotEncoder API requires Spark 3.0+.
    stageone = StringIndexer(inputCols=cats, outputCols=index_names)
    stagetwo = OneHotEncoder(dropLast=False,
                             inputCols=stageone.getOutputCols(),
                             outputCols=encoded_names)
    stagethree = VectorAssembler(inputCols=nums + stagetwo.getOutputCols(),
                                 outputCol=vect_name)
    stagefour = MinMaxScaler(inputCol=stagethree.getOutputCol(),
                             outputCol=feats_name)
    stagefive = GBTClassifier(featuresCol=stagefour.getOutputCol(),
                              labelCol=labelcol,
                              maxDepth=maxdepth,
                              maxBins=maxbins,
                              maxIter=maxiter,
                              seed=seed)

    pipeline = Pipeline(stages=[stageone, stagetwo, stagethree,
                                stagefour, stagefive])
    return pipeline
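# A minimal sketch of how build_pipeline might be driven. The config keys
# mirror those read above; the DataFrame `df` and the column names
# ('job', 'marital', 'age', 'income', 'label') are hypothetical
# placeholders, not part of the original code.
example_config = {
    'variables': {'categoricals': ['job', 'marital'],
                  'numericals': ['age', 'income']},
    'metadata': {'index_names': ['job_idx', 'marital_idx'],
                 'encoded_names': ['job_enc', 'marital_enc'],
                 'vect_name': 'raw_features',
                 'feats_name': 'features'},
    'model': {'labelCol': 'label', 'maxDepth': 5, 'maxBins': 32,
              'maxIter': 20, 'seed': 42},
}

pipeline = build_pipeline(example_config)
model = pipeline.fit(df)        # df: a Spark DataFrame with the columns above
scored = model.transform(df)    # adds 'prediction' and 'probability' columns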
from time import time

from cassandra.cluster import Cluster
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (IndexToString, OneHotEncoder, StandardScaler,
                                StringIndexer, VectorAssembler)


def make_class_model(data, sc, model_path, model_name, target,
                     ml_model='default', save=True):
    t0 = time()

    # Stages for the pipeline
    stages = []

    # Index labels, adding metadata to the label column.
    # Fit on the whole dataset to include all labels in the index.
    targetIndexer = StringIndexer(inputCol="target",
                                  outputCol="indexedTarget",
                                  handleInvalid="keep").fit(data)
    stages += [targetIndexer]

    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables by dtype.
    catCols = [
        x for (x, dataType) in trainingData.dtypes
        if (dataType == "string" or dataType == "boolean") and x != "target"
    ]
    numCols = [
        x for (x, dataType) in trainingData.dtypes
        if dataType in ("int", "bigint", "float", "double")
    ]

    # One-hot encode the categorical variables.
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "-index",
                      handleInvalid="keep") for column in catCols
    ]
    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=[
            "{0}-encoded".format(indexer.getOutputCol())
            for indexer in indexers
        ])
    assembler_cat = VectorAssembler(inputCols=encoder.getOutputCols(),
                                    outputCol="categorical-features",
                                    handleInvalid="skip")
    stages += indexers
    stages += [encoder, assembler_cat]

    assembler_num = VectorAssembler(inputCols=numCols,
                                    outputCol="numerical-features",
                                    handleInvalid="skip")

    # Standardize the numerical variables.
    scaler = StandardScaler(inputCol="numerical-features",
                            outputCol="numerical-features_scaled")

    # Combine all features into one vector.
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip")
    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model by default, or another specified model.
    if ml_model == 'default':
        rf = RandomForestClassifier(labelCol="indexedTarget",
                                    featuresCol="features",
                                    numTrees=10)
    else:
        rf = ml_model

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=targetIndexer.labels)
    stages += [rf, labelConverter]

    # Chain indexers and forest in a Pipeline.
    pipeline = Pipeline(stages=stages)

    # Train the model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    # predictions.select("predictedLabel", "target", "features").show(5)

    # Select (prediction, true label) and compute test accuracy.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedTarget",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = %g" % accuracy)

    if save:
        # Save the final model and write its statistics to Cassandra.
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)
        cluster = Cluster(['127.0.0.1'], port=9042)
        session = cluster.connect("models")
        query = ("INSERT INTO models_statistics "
                 "(model_name, timestamp, target, learning_time, model_path, stat) "
                 "VALUES (%s, %s, %s, %s, %s, %s)")
        session.execute(
            query,
            (model_name, timestamp, target, tt, model_path, accuracy))
        session.shutdown()
        cluster.shutdown()

    # Stop the Spark session.
    sc.stop()

    if not save:
        return model, sc
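# A minimal sketch of calling make_class_model, assuming a local Cassandra
# node with a models.models_statistics table (only needed when save=True)
# and a label column literally named "target". The CSV path and app name
# are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("make-class-model").getOrCreate()
df = (spark.read.option("header", True).option("inferSchema", True)
      .csv("/data/train.csv"))  # hypothetical input file

# save=True persists the model and its stats, then stops the Spark session;
# save=False skips persistence and returns (model, sc) instead.
make_class_model(df, spark, model_path="/models/rf_default",
                 model_name="rf_default", target="target")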
# In[24]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import (OneHotEncoderEstimator, StringIndexer,
                                VectorAssembler)

# Encode the categorical data
categorical_variables = ['workclass', 'education', 'marital-status',
                         'occupation', 'relationship', 'race', 'gender',
                         'native_country']

indexers = [StringIndexer(inputCol=column, outputCol=column + "-index")
            for column in categorical_variables]

# OneHotEncoderEstimator is the Spark 2.3/2.4 name; in Spark 3.0+ it was
# renamed to OneHotEncoder with the same multi-column API.
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol())
                for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

# In[25]:

# Create a Pipeline.
pipeline = Pipeline(stages=indexers + [encoder, assembler])
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

# In[26]:
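# A quick sanity check on the transformed output — a sketch assuming
# df_remove carries the census-style columns listed in cell 24. The column
# names follow the "-index" / "-index-encoded" scheme produced above.
model.select("workclass", "workclass-index", "workclass-index-encoded",
             "categorical-features").show(5, truncate=False)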