Example #1
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler


def transform_data_in_pipeline(df):
    """Assemble and standard-scale the numeric feature columns of a Spark DataFrame.

    :param df: input Spark DataFrame containing the columns 'x', 'y',
        'star_rating_number' and 'avg_adr'
    :return: the input DataFrame with 'features' and 'scaledFeatures' columns appended
    """

    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble features vector from Spark dataframe fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Apply standard scaling to unit standard deviation (withMean is left at
    # its default, so the features are not mean-centred)
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the dataframe with the additional transformed features vector
    return pipeline_model.transform(df)
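For context, a minimal usage sketch; the SparkSession named spark and the sample rows below are assumptions, with column names matching the hard-coded assembler inputs:

# Hypothetical input; column names match the hard-coded assembler inputCols
sample = spark.createDataFrame(
    [(1.0, 2.0, 4.0, 95.5), (3.0, 1.5, 3.0, 80.0)],
    ['x', 'y', 'star_rating_number', 'avg_adr'])
transformed = transform_data_in_pipeline(sample)
transformed.select('features', 'scaledFeatures').show(truncate=False)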
Example #2
    def train(cls, spark, sdf, cat_colnames, num_colnames):
        # Index each categorical column into a numeric "<col>_index" column
        string_indexer_list = list()
        for cat_colname in cat_colnames:
            string_indexer = StringIndexer(inputCol=cat_colname,
                                           outputCol=cat_colname + "_index",
                                           handleInvalid="skip")
            string_indexer_list.append(string_indexer)

        # Assemble and standardise the numeric columns, if any were supplied
        out = []
        pipe = []
        if len(num_colnames) > 0:

            assembler = VectorAssembler(inputCols=num_colnames,
                                        outputCol="features_vec")
            standard_scaler = StandardScaler(inputCol="features_vec",
                                             outputCol="features_zs",
                                             withMean=True,
                                             withStd=True)
            out = [standard_scaler.getOutputCol()]
            pipe = [assembler, standard_scaler]
        # Combine the indexed categorical columns and any scaled numeric
        # features into a single vector, then cluster with k-means
        assembler_2 = VectorAssembler(
            inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
            outputCol="features")
        estimator = KMeans(featuresCol="features",
                           predictionCol="cluster_id",
                           k=4)

        clustering_pipeline = Pipeline(stages=string_indexer_list + pipe +
                                       [assembler_2] + [estimator])
        clustering_pipeline_model = clustering_pipeline.fit(sdf)

        return KMeansPipeline(pipeline_model=clustering_pipeline_model)
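A hypothetical call, assuming this classmethod lives on the KMeansPipeline class it returns and that the column names below exist in sdf:

# Hypothetical usage; the column names are placeholders and the wrapper is
# assumed to expose the fitted model as .pipeline_model
model = KMeansPipeline.train(spark, sdf,
                             cat_colnames=['country'],
                             num_colnames=['adr', 'rating'])
clustered = model.pipeline_model.transform(sdf)  # adds a 'cluster_id' column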
Example #3
    def create_standard_pipeline(self, cross_validate=False):
        """
        This method creates a standard pipeline, standard meaning: vectorize, standardize and model.
        :return: a pyspark.ml Pipeline chaining vectorizer, vector converter, standardizer and model
        """

        # Feature columns are created from instance variables
        # feature_columns = [i.name for i in self._feature_cols]

        # Vectorized transformation
        vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                     outputCol='v_features')
        # Cast the vector from mllib to ml
        converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                        outputCol='casted')
        # Standardize estimator
        standardizes = StandardScaler(withMean=self._standardize,
                                      withStd=self._standardize,
                                      inputCol=converter.getOutputCol(),
                                      outputCol="scaled")
        # Keep only the scalar parameters; tuple-valued entries are dropped and
        # the feature/label columns are set explicitly below
        dict_parameters = dict(
            filter(lambda x: not isinstance(x[1], tuple),
                   self._params.items()))
        dict_parameters['featuresCol'] = standardizes.getOutputCol()
        dict_parameters['labelCol'] = self._label_col[0]  # HACK!!!
        #print(label_dict)

        # Instantiate the configured classification algorithm by name
        model = getattr(classification, self._algorithm)(**dict_parameters)

        pipe = Pipeline(stages=[vectorizer, converter, standardizes, model])
        return pipe
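ConvertAllToVecToMl is a custom transformer that is not defined in this snippet; a minimal sketch of what such a converter might look like, assuming its only job is to cast legacy mllib vectors in the input column to the ml vector type:

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.mllib.util import MLUtils


class ConvertAllToVecToMl(Transformer, HasInputCol, HasOutputCol):
    """Hypothetical re-implementation: expose the input column as an ml vector."""

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(ConvertAllToVecToMl, self).__init__()
        self._set(**self._input_kwargs)

    def _transform(self, df):
        # Convert any mllib vectors in the input column to ml vectors, then
        # copy them under the configured output column name
        converted = MLUtils.convertVectorColumnsToML(df, self.getInputCol())
        return converted.withColumn(self.getOutputCol(),
                                    converted[self.getInputCol()])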
Example #4
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler


def Model(Data, Tgt='Target', Indp='Nada'):

    # Assemble the independent feature columns into a single vector
    vector_assembler = VectorAssembler(
        inputCols=Indp, outputCol='assembled_important_features')
    standard_scaler = StandardScaler(inputCol=vector_assembler.getOutputCol(),
                                     outputCol='standardized_features')
    rf = RandomForestClassifier(featuresCol=standard_scaler.getOutputCol(),
                                labelCol=Tgt)
    #	letters_train, letters_test = letters.randomSplit([0.8,0.2], seed=4)
    # Chain assembler, scaler and classifier, then fit on the input data
    pipeline = Pipeline(stages=[vector_assembler, standard_scaler, rf])
    pipeline_model_rf = pipeline.fit(Data)
    return pipeline_model_rf
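A hypothetical invocation; train_df, test_df and the feature column names are placeholders:

# Placeholder DataFrames; 'Target' is the label, the rest are numeric features
fitted = Model(train_df, Tgt='Target', Indp=['age', 'income', 'tenure'])
predictions = fitted.transform(test_df)
predictions.select('Target', 'prediction', 'probability').show(5)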
Example #5
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


def main(spark, data_file, model_file):
    '''Main routine for supervised training

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''

    # Read data
    df = spark.read.parquet(data_file)

    # Take 1/10 data without replacement
    df = df.sample(False, 0.1, seed = 0)

    # Vectorize selected features
    features = ['mfcc_' + '%.2d' % i for i in range(20)]
    assembler = VectorAssembler(inputCols=features, outputCol="vectorized_features")

    # Standardize the features
    scaler = StandardScaler(inputCol="vectorized_features",
                            outputCol="scaled_features",
                            withStd=True, withMean=False)

    # Transform string target variable into numerical
    indexer = StringIndexer(inputCol="genre", outputCol="label",
                            handleInvalid="skip")

    # Build logistic regression
    lr = LogisticRegression(maxIter=20,
                            featuresCol=scaler.getOutputCol(),
                            labelCol=indexer.getOutputCol())

    # Build a pipeline
    pipeline = Pipeline(stages=[assembler, scaler, indexer, lr])

    # Build parameter grid and cross validation
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.elasticNetParam, [0.1, 0.3, 0.5, 0.8]) \
        .addGrid(lr.regParam, [0.1, 0.08, 0.05, 0.02, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(),
                              numFolds=5)

    # Fit the cross-validator and persist the best model
    cvModel = crossval.fit(df)
    cvModel.bestModel.write().overwrite().save(model_file)
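Because only the best PipelineModel is persisted, it can be reloaded directly later; a short sketch, where new_df stands in for whatever data is to be scored:

from pyspark.ml import PipelineModel

# Reload the persisted best model and score new data (new_df is a placeholder)
best_model = PipelineModel.load(model_file)
best_model.transform(new_df).select("label", "prediction").show(5)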
Example #6
            tgt_other_agent_indexer.getOutputCol()
        ],
                                     outputCols=[
                                         "country_code_ohe", "geoname_ohe",
                                         "source_ohe", "src_actor_ohe",
                                         "src_agent_ohe",
                                         "src_other_agent_ohe", "target_ohe",
                                         "tgt_actor_ohe", "tgt_agent_ohe",
                                         "tgt_other_agent_ohe"
                                     ],
                                     handleInvalid='keep',
                                     dropLast=True)

        # Combine all features into a single column
        feature_assembler = VectorAssembler(inputCols=ohe.getOutputCols() +
                                            [goldstein_scaler.getOutputCol()],
                                            outputCol="features")

        # Index root_code labels
        label_indexer = StringIndexer(
            inputCol="root_code",
            outputCol="indexedLabel").setHandleInvalid('skip')

        # Select a subset of important features
        feature_selector = ChiSqSelector(
            percentile=0.5,
            featuresCol=feature_assembler.getOutputCol(),
            labelCol=label_indexer.getOutputCol(),
            outputCol="selected_features")

        # Train a RandomForest model
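The snippet is cut off at the comment above; a sketch of how the announced random forest stage could be attached to the stages that are visible, with the hyperparameters and the final pipeline assembly being assumptions:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Hypothetical continuation: train on the selected features and indexed labels
rf = RandomForestClassifier(featuresCol=feature_selector.getOutputCol(),
                            labelCol=label_indexer.getOutputCol(),
                            numTrees=100)  # assumed hyperparameter

# The indexer and scaler stages cut off above would come first in this list
pipeline = Pipeline(stages=[ohe, feature_assembler, label_indexer,
                            feature_selector, rf])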
Example #7
# |-- genre: string (nullable = true)
# |-- label: integer (nullable = false)

# Preparing Data for Machine Learning

from pyspark.ml.feature import VectorAssembler, PCA, StringIndexer, StandardScaler
from pyspark.ml import Pipeline

numeric_features = [
    t[0] for t in binary_audio_feature_genre.dtypes if t[1] == 'double'
]
assembler = VectorAssembler(inputCols=numeric_features,
                            outputCol="VEC-FEATURES")
standard_scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                                 outputCol="SCALED_FEATURES")
pca = PCA(k=5, inputCol=standard_scaler.getOutputCol(), outputCol="features")

# use Pipeline to chain multiple Transformers and Estimators together to specify our machine learning workflow

pipeline = Pipeline(stages=[assembler, standard_scaler, pca])
pipelineModel = pipeline.fit(training)
training = pipelineModel.transform(training).select('genre', 'features',
                                                    'label')
test = pipelineModel.transform(test).select('genre', 'features', 'label')

# check the training data after transformers

training.show()
test.show()

# Train the Logistic Regression
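The example stops at this comment; a minimal sketch of the logistic regression step it announces, using the features and label columns produced above (maxIter is an arbitrary choice):

from pyspark.ml.classification import LogisticRegression

# Assumed step: fit on the 'features'/'label' columns produced by the pipeline
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=20)
lr_model = lr.fit(training)
lr_model.transform(test).select('genre', 'label', 'prediction').show()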
Example #8
import sys
import time

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    dataset = spark.read.format("csv").options(header='false',
                                               inferSchema='true',
                                               delimiter=',').load(sys.argv[1])
    t0 = time.time()
    assembler = VectorAssembler(inputCols=[
        '_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9',
        '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18',
        '_c20', '_c21'
    ],
                                outputCol="features")
    df = assembler.transform(dataset)
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=False,
                            withMean=True)
    pca = PCA(k=1, inputCol=scaler.getOutputCol(), outputCol="pcaFeatures")
    pipeline = Pipeline(stages=[scaler, pca])
    model = pipeline.fit(df)
    result = model.transform(df).select("pcaFeatures")
    print(time.time() - t0)
    spark.stop()
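A small check one might add just before spark.stop() to inspect the fitted PCA stage, which sits last in the pipeline model:

    # Proportion of variance captured by the single principal component
    pca_model = model.stages[-1]
    print(pca_model.explainedVariance)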
Example #9
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

import mlflow

spark = SparkSession.builder.getOrCreate()
mlflow.pyspark.ml.autolog()

df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)
train, test = df.randomSplit([0.8, 0.2])

assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
scaler = StandardScaler(inputCol=assembler.getOutputCol(), outputCol="scaledFeatures")
lor = LogisticRegression(maxIter=5, featuresCol=scaler.getOutputCol())

# Non-nested pipeline
pipeline = Pipeline(stages=[assembler, scaler, lor])
with mlflow.start_run():
    pipeline_model = pipeline.fit(train)

columns = ["features", "prediction"]
pipeline_model.transform(test).select(columns).show()

# Nested pipeline
nested_pipeline = Pipeline(stages=[Pipeline(stages=[assembler, scaler]), lor])
with mlflow.start_run():
    nested_pipeline_model = nested_pipeline.fit(train)

nested_pipeline_model.transform(test).select(columns).show()
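A sketch of pulling one of the autologged models back out of the tracking store; the use of mlflow.last_active_run() and the 'model' artifact path are assumed autolog defaults:

import mlflow.spark

# Assumes autolog stored the fitted PipelineModel under the "model" artifact path
run = mlflow.last_active_run()
loaded_model = mlflow.spark.load_model("runs:/{}/model".format(run.info.run_id))
loaded_model.transform(test).select(columns).show()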