Example #1
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
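A minimal usage sketch (not from the original source): StandardScaler expects an assembled vector column, so the DataFrame passed to rescale_df must already carry a "features" column. The `spark` session and the toy columns x1/x2 are illustrative assumptions.
# Hedged usage sketch: assemble a "features" vector column first, then rescale it.
# `spark` is assumed to be an active SparkSession.
from pyspark.ml.feature import StandardScaler, VectorAssembler

raw = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x1", "x2"])
assembled = VectorAssembler(inputCols=["x1", "x2"], outputCol="features").transform(raw)
rescale_df(assembled).select("features", "features_scaled").show()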
Example #2
def createPipeline(irisData, lrElasticNetParam, lrRegParam):
    '''Creates a pipeline for converting the data into features and a label in the required format.
    Args: irisData - Input data for feature and label processing
          lrElasticNetParam - ElasticNet mixing parameter of LR; 0 gives an L2 penalty, 1 gives an L1 penalty
          lrRegParam - Regularization parameter
    '''
    strIndexer = StringIndexer().setInputCol('species').setOutputCol(
        'label').fit(irisData)
    va = VectorAssembler(inputCols=[
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    ],
                         outputCol='vec_features')
    ss = StandardScaler().setInputCol(
        va.getOutputCol()).setOutputCol('features').fit(va.transform(irisData))
    lr = LogisticRegression().setFeaturesCol('features')
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictedLabel',
                                   labels=strIndexer.labels)
    stages = [strIndexer, va, ss, lr, labelConverter]
    pipeline = Pipeline().setStages(stages)

    params = ParamGridBuilder().addGrid(lr.elasticNetParam,
                                        lrElasticNetParam).addGrid(
                                            lr.regParam, lrRegParam).build()
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName=lrMetric)  # lrMetric (e.g. 'accuracy') is expected to be defined at module level

    return pipeline, params, evaluator
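A hedged sketch of how the returned triple would typically be used with a CrossValidator. `iris_df`, the grid values, and the module-level lrMetric are assumptions for illustration; the pyspark.ml imports used inside createPipeline are assumed to be in scope.
# Hedged usage sketch: feed the pipeline, param grid and evaluator into a CrossValidator.
from pyspark.ml.tuning import CrossValidator

lrMetric = 'accuracy'  # assumed module-level metric name referenced by createPipeline
pipeline, params, evaluator = createPipeline(iris_df,
                                             lrElasticNetParam=[0.0, 0.5, 1.0],
                                             lrRegParam=[0.01, 0.1])
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=3)
best_model = cv.fit(iris_df).bestModel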
Example #3
def normalize_score(df):
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.feature import StandardScaler

    assembler = VectorAssembler(
        inputCols=["score"],
        outputCol="score_v")

    output = assembler.transform(df)

    # Center each score vector on its mean; withStd=False, so no scaling to unit variance.
    scaler = StandardScaler(inputCol="score_v", outputCol="popularity_score",
                            withStd=False, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(output)

    # Center each feature by subtracting the fitted mean.
    scaledData = scalerModel.transform(output)
    
    return scaledData
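A short usage sketch, assuming an active SparkSession named `spark`: normalize_score only needs a numeric "score" column and returns the mean-centered values in "popularity_score".
# Hedged usage sketch: the helper only requires a numeric "score" column.
df = spark.createDataFrame([(10.0,), (20.0,), (30.0,)], ["score"])
normalize_score(df).select("score", "popularity_score").show()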
Example #4
    def train(cls, spark, sdf, cat_colnames, num_colnames):
        string_indexer_list = list()
        for cat_colname in cat_colnames:
            string_indexer = StringIndexer(inputCol=cat_colname,
                                           outputCol=cat_colname + "_index",
                                           handleInvalid="skip")
            string_indexer_list.append(string_indexer)

        out = []
        pipe = []
        if len(num_colnames) > 0:

            assembler = VectorAssembler(inputCols=num_colnames,
                                        outputCol="features_vec")
            standard_scaler = StandardScaler(inputCol="features_vec",
                                             outputCol="features_zs",
                                             withMean=True,
                                             withStd=True)
            out = [standard_scaler.getOutputCol()]
            pipe = [assembler, standard_scaler]
        assembler_2 = VectorAssembler(
            inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
            outputCol="features")
        estimator = KMeans(featuresCol="features",
                           predictionCol="cluster_id",
                           k=4)

        clustering_pipeline = Pipeline(stages=string_indexer_list + pipe +
                                       [assembler_2] + [estimator])
        clustering_pipeline_model = clustering_pipeline.fit(sdf)

        return KMeansPipeline(pipeline_model=clustering_pipeline_model)
Example #5
    def __init__(self, file_name, spark_context, maxIter=100, regParam=0.0, tol=1e-6, threshold=0.0,
                 aggregationDepth=2):
        self.sqlContext = SQLContext(spark_context)

        self.spark_context = spark_context

        self.data = self.sqlContext.read.options(header='true', inferschema='true', delimiter=',').csv(file_name)

        self.data.cache()

        self.lr_data = self.data.select(col("Class").alias("label"), *features)

        vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")

        standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")

        self.settings = [('maxIter',maxIter), ('regParam',regParam), ('tol',tol), ('threshold',threshold),('aggregationDepth',aggregationDepth)]

        self.SVM = LinearSVC(maxIter=maxIter, regParam=regParam, tol=tol, threshold=threshold,
                             aggregationDepth=aggregationDepth)

        stages = [vectorAssembler, standardScaler, self.SVM]

        pipeline = Pipeline(stages=stages)

        self.model = pipeline.fit(self.lr_data)
Example #6
def df_train_test():
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))
    cols = ["vx"+str(i) for i in range(10)]
    assembler = VectorAssembler(inputCols=cols, outputCol="vx_t1")
    dct = DCT(inverse=False, inputCol="vx_t1", outputCol="vx_t2")
    slicer = VectorSlicer(inputCol="vx_t2", outputCol="vx_t3", indices=[i for i in range(40000)])
    scaler = StandardScaler(inputCol="vx_t3", outputCol="vx", withStd=True, withMean=False)

    pipeline = Pipeline(stages=[assembler, dct, slicer, scaler])
    p_model = pipeline.fit(df_train)

    drop_cols = cols + ["vx_t1", "vx_t2", "vx_t3"]

    df_train = p_model.transform(df_train).drop(*drop_cols)
    df_test = p_model.transform(df_test).drop(*drop_cols)

    df_train.write.mode("overwrite").parquet(os.path.join("datasets", "train.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(os.path.join("datasets", "test.vector.dct.parquet"))

    df_train.printSchema()
    df_test.printSchema()
Example #7
    def create_standard_pipeline(self, cross_validate=False):
        """
        This method creates a standard pipeline, standard meaning: vectorize, standardize and model.
        :return: PySpark Pipeline with vectorizer, converter, standardizer and model stages
        """

        # Feature columns are created from instance variables
        # feature_columns = [i.name for i in self._feature_cols]

        # Vectorized transformation
        vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                     outputCol='v_features')
        # Cast the vector from mllib to ml
        converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                        outputCol='casted')
        # Standardize estimator
        standardizes = StandardScaler(withMean=self._standardize,
                                      withStd=self._standardize,
                                      inputCol=converter.getOutputCol(),
                                      outputCol="scaled")
        # Labels and strings are already set on the model
        dict_parameters = dict(
            filter(lambda x: not isinstance(x[1], tuple),
                   self._params.items()))
        dict_parameters['featuresCol'] = standardizes.getOutputCol()
        dict_parameters['labelCol'] = self._label_col[0]  # HACK!!!
        #print(label_dict)

        # Model is set
        model = eval("classification." + self._algorithm)(**dict_parameters)

        pipe = Pipeline(stages=[vectorizer, converter, standardizes, model])
        return pipe
Example #8
    def train(self, df):
        df = self.build_features_vectors(df)

        scaler = StandardScaler()
        scaler.setInputCol(self.features_values_column)
        scaler.setOutputCol(self.features_values_scaled)
        scaler.setWithMean(self.scaler_with_mean)
        scaler.setWithStd(self.scaler_with_std)
        self.scaler_model = scaler.fit(df)
        df = self.scaler_model.transform(df).persist(
            StorageLevelFactory.get_storage_level(self.storage_level))
        if len(self.categorical_features):
            self._create_indexes(df)
            self._add_categorical_features(df, self.features_values_scaled)

        iforest = IForest(
            featuresCol=self.features_values_scaled,
            predictionCol=self.prediction_column,
            # anomalyScore=self.score_column,
            numTrees=self.num_trees,
            maxSamples=self.max_samples,
            maxFeatures=self.max_features,
            maxDepth=self.max_depth,
            contamination=self.contamination,
            bootstrap=self.bootstrap,
            approxQuantileRelativeError=self.approximate_quantile_relative_error,
            # numCategoricalFeatures=len(self.categorical_features)
        )
        iforest.setSeed(self.seed)
        params = {'threshold': self.threshold}
        self.iforest_model = iforest.fit(df, params)
        df.unpersist()
Example #9
    def standardScaler(self):
        from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler

        dataFrame = self.session.read.format("libsvm").load(
            self.dataDir + "/data/mllib/sample_libsvm_data.txt")
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        scalerModel = scaler.fit(dataFrame)
        scaledData = scalerModel.transform(dataFrame)
        scaledData.show()

        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [min, max].
        scaledData = scalerModel.transform(dataFrame)
        print("Features scaled to range: [%f, %f]" %
              (scaler.getMin(), scaler.getMax()))
        scaledData.select("features", "scaledFeatures").show()

        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [-1, 1].
        scaledData = scalerModel.transform(dataFrame)

        scaledData.select("features", "scaledFeatures").show()
Example #10
    def build_standard_scaler(self,
                              df,
                              with_mean=False,
                              with_std=True,
                              persist_estimator_path=None,
                              input_col='features',
                              output_col='scaled_features'):
        """
        Standard Scaler estimator builder and transformer for dense feature vectors.
        Warning: it builds a dense output, so take care when applying it to sparse input.
        :param df: Spark DataFrame object auto-referenced from the DataFrame class
        :param with_mean: False by default. Centers the data with the mean before scaling
        :param with_std: True by default. Scales the data to unit standard deviation
        :param persist_estimator_path: Path where the fitted model metadata is persisted
        :param input_col: Name of the input column to scale
        :param output_col: Name of the output column to create with scaled features
        :return: scaled Spark DataFrame
        """
        std_scaler = StandardScaler(withMean=with_mean,
                                    withStd=with_std,
                                    inputCol=input_col,
                                    outputCol=output_col)
        if persist_estimator_path:
            self.__logger.info("Compute Feature Standard ScalerModel Metadata")
            self.__logger.warning(
                f"Persist Metadata Model Path: {persist_estimator_path}")
            std_scaler.fit(df).write().overwrite().save(persist_estimator_path)
            self.__logger.info("Loading Scaler Estimator For Prediction")
            return StandardScalerModel.load(persist_estimator_path).transform(df)
        self.__logger.info("Compute Feature Standard Scaler DataFrame")
        return std_scaler.fit(df).transform(df)
Example #11
def main(spark, data_file, model_file):
    '''Main routine for unsupervised training

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''
    DF = spark.read.parquet(data_file)
    mfcc_cols = ["mfcc_{:02d}".format(i) for i in range(20)]
    DF = DF.select(*mfcc_cols)
    assembler = VectorAssembler(inputCols=mfcc_cols, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="features_scaled", withStd=True, withMean=False)
    kmeans = KMeans().setK(100).setSeed(1)

    pipeline = Pipeline(stages=[assembler, scaler, kmeans])
    model = pipeline.fit(DF)
    model.save(model_file)
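A hedged follow-up sketch showing how the persisted pipeline would be reloaded to assign cluster ids; `spark`, `model_file` and `new_data_file` are assumptions.
# Hedged usage sketch: reload the saved pipeline and cluster new tracks.
from pyspark.ml import PipelineModel

loaded = PipelineModel.load(model_file)
clustered = loaded.transform(spark.read.parquet(new_data_file))
clustered.select("prediction").show(5)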
Example #12
def preprocess(df, should_undersample, scaler=None):
    """Scale the data and balance it using random undersampling (RUS)."""
    # Assemble the features so they can be used with MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D", "PSSM_central_0_A",
        "PSSM_r1_1_W", "PSSM_central_-1_V"
    ],
                                outputCol="features")

    out = assembler.transform(df).select("features", "class")

    # Random undersampling (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False,
                                   fraction=fraction,
                                   seed=89)
        out = negative.union(positive)

    # Scale:
    if scaler is None:
        scaler = StandardScaler(withMean=True,
                                withStd=True,
                                inputCol="features",
                                outputCol="scaled_features")
        scaler = scaler.fit(out)
        out = scaler.transform(out)
    else:
        out = scaler.transform(out)

    return out, scaler
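A hedged usage sketch of the scaler-reuse pattern above: fit the scaler on the (undersampled) training split only, then reuse the fitted model on the test split. `df_train` and `df_test` are assumed to carry the six PSSM columns plus "class".
# Hedged usage sketch: fit the scaler once on training data, reuse it on test data.
train_out, fitted_scaler = preprocess(df_train, should_undersample=True)
test_out, _ = preprocess(df_test, should_undersample=False, scaler=fitted_scaler)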
Example #13
def make_pipeline(spark_df):

    for c in spark_df.columns:
        spark_df = spark_df.withColumn(c, spark_df[c].cast("float"))

    stages = []

    cols = ['acc_now_delinq', 'acc_open_past_24mths', 'annual_inc', 'avg_cur_bal', 'funded_amnt']

    #Assembling mixed data type transformations:
    assembler = VectorAssembler(inputCols=cols, outputCol="features")
    stages += [assembler]

    #Scaling features
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
    stages += [scaler]

    #Logistic Regression
    lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='is_default', maxIter=10, regParam=0.1, elasticNetParam=0.1)
    stages += [lr]

    #Creating and running the pipeline:
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(spark_df)

    return pipelineModel
Example #14
def prepare_data():
    """Convenience function to read the data from the files and prepare the features for the k-means model fit.
    """
    # Read data from files.
    _data = load_data()

    # The following features are not normally distributed, so they are log-scaled to bring their
    # distributions closer to normal. This helps the k-means algorithm work better.
    _data = _data.withColumn('log_age', F.log('age')).withColumn('log_avg_buy', F.log('avg_buy'))\
        .withColumn('log_min_buy', F.log('min_buy')).withColumn('log_max_buy', F.log('max_buy'))

    # Select the features to use in k-means. The features will also be standard scaled, that is,
    # mean-centered and scaled to have a standard deviation of one.
    features = _data.columns[4:]

    assembler = VectorAssembler(inputCols=features,
                                outputCol='features_unscaled')
    assembled = assembler.transform(_data)

    scaler = StandardScaler(inputCol='features_unscaled',
                            outputCol='features',
                            withStd=True,
                            withMean=True)
    scaler_model = scaler.fit(assembled)
    scaled_data = scaler_model.transform(assembled)

    return scaled_data, features
Example #15
    def test_standard_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = StandardScaler(inputCol='features',
                                outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStandardScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #16
    def z_score_encoder(self, column_names, mean=True, sd=True):
        """
        Normalize features in column_names
        Args:
            column_names: List(String), feature names for scale columns
            mean: Boolean, default True
            sd: Boolean, default True

        Returns:
            list of pipeline stage
        """

        stages = list()
        footer = self._z_score_footer
        feature_name = [(name, name + footer) for name in column_names]

        for inner_name, output_name in feature_name:

            vector = VectorAssembler().setInputCols([inner_name])\
                .setOutputCol(inner_name + "_vector")

            z_score = StandardScaler().setInputCol(inner_name+"_vector")\
                .setOutputCol(output_name)\
                .setWithMean(mean)\
                .setWithStd(sd)

            stages.extend([vector, z_score])

        return stages
Example #17
def prepare_spark_pipeline_for_DT():
    print('----------Preparing spark pipeline for DT----------')
    label_indexer = StringIndexer(inputCol="price",
                                  outputCol="label",
                                  handleInvalid="keep")
    vector_assembler = VectorAssembler(inputCols=features,
                                       outputCol="unscaled_features")
    standard_scaler = StandardScaler(inputCol="unscaled_features",
                                     outputCol="features")
    DT_model = DecisionTreeRegressor(maxDepth=8)

    stages = [label_indexer, vector_assembler, standard_scaler, DT_model]
    pipeline = Pipeline(stages=stages)

    estimator_param = ParamGridBuilder().addGrid(DT_model.maxDepth,
                                                 [8, 16]).addGrid(
                                                     DT_model.impurity,
                                                     ["variance"]).build()
    eval = RegressionEvaluator(labelCol="label",
                               predictionCol="prediction",
                               metricName="mse")
    return CrossValidator(estimator=pipeline,
                          estimatorParamMaps=estimator_param,
                          evaluator=eval,
                          numFolds=3), eval
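A hedged sketch of running the returned cross-validator and scoring with the returned evaluator; `train_df`, `test_df` and the module-level `features` list are assumptions.
# Hedged usage sketch: fit the cross-validated decision tree and report test MSE.
cv, evaluator = prepare_spark_pipeline_for_DT()
cv_model = cv.fit(train_df)
mse = evaluator.evaluate(cv_model.transform(test_df))
print("Test MSE: {:.4f}".format(mse))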
Example #18
def test_build_pipeline_by_dtypes_with_obvious_transformers(
        spark, string_transformers, number_transformers):
    df = spark.createDataFrame(
        [("Benjamin", 178.1, 0), ("Omar", 178.1, 1)],
        schema=["name", "height", "target"],
    )

    pipeline = pipeasy_spark.build_pipeline_by_dtypes(
        df,
        exclude_columns=["target"],
        string_transformers=[StringIndexer(),
                             OneHotEncoderEstimator()],
        numeric_transformers=[VectorAssembler(),
                              StandardScaler()],
    )

    trained_pipeline = pipeline.fit(df)
    df_transformed = trained_pipeline.transform(df)

    def get_target_values(dataframe):
        return dataframe.rdd.map(lambda row: row["target"]).collect()

    assert set(df.columns) == set(df_transformed.columns)
    assert df.count() == df_transformed.count()
    assert get_target_values(df) == get_target_values(df_transformed)
Example #19
def generate_train_test(data,
                        multiplier_minority, 
                        fraction_majority, 
                        label_col='label',
                        minority_tag=1,
                        train_perc=0.7):
    '''
    Train/test split on the data (after the feature-assembling step)

    multiplier_minority: oversampling multiplier applied to the minority class (passed to resample)
    fraction_majority: sample fraction for the majority group
    label_col: name of the label column
    minority_tag: tag that has very few representatives
    train_perc: fraction of the data that goes to the training set
    '''
    po = data.filter("{} == {}".format(label_col, minority_tag))
    ne = data.filter("{} != {}".format(label_col, minority_tag))
    training_po, testing_po = po.randomSplit([train_perc, 1-train_perc], seed = 100)
    training_ne, testing_ne = ne.randomSplit([train_perc, 1-train_perc], seed = 100)
    training = training_po.union(training_ne)
    training = resample(training, 
                        multiplier_minority=multiplier_minority, 
                        fraction_majority=fraction_majority)
    testing = testing_po.union(testing_ne)
    scaler = StandardScaler(inputCol="features", 
                            outputCol="scaledFeatures",
                            withStd=True, 
                            withMean=False)
    scale_model = scaler.fit(training)
    training = scale_model.transform(training)
    testing = scale_model.transform(testing)
    return training, testing
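A hedged usage sketch, assuming `assembled_df` already has "features" and "label" columns and that the `resample` helper used above is defined alongside this function.
# Hedged usage sketch: split, rebalance the minority class, and get scaled splits back.
training, testing = generate_train_test(assembled_df,
                                        multiplier_minority=3,
                                        fraction_majority=0.5)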
Example #20
    def __init__(self,
                 file_name,
                 spark_context,
                 smoothing=1.0,
                 modelType="multinomial"):
        self.sqlContext = SQLContext(spark_context)

        self.spark_context = spark_context

        self.data = self.sqlContext.read.options(header='true',
                                                 inferschema='true',
                                                 delimiter=',').csv(file_name)

        self.data.cache()

        self.settings = [('smoothing', smoothing), ('modelType', modelType)]

        self.lr_data = self.data.select(col("Class").alias("label"), *features)

        vectorAssembler = VectorAssembler(inputCols=features,
                                          outputCol="unscaled_features")

        standardScaler = StandardScaler(inputCol="unscaled_features",
                                        outputCol="features")

        self.nb = NaiveBayes(smoothing=smoothing, modelType=modelType)

        stages = [vectorAssembler, standardScaler, self.nb]

        pipeline = Pipeline(stages=stages)

        self.model = pipeline.fit(self.lr_data)
Example #21
def main(spark, data_file, model_file):
    '''Main routine for unsupervised training

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''

    # Load the dataframe
    df = spark.read.parquet(data_file)
    # Select the 20 attribute columns labeled mfcc_00, mfcc_01, ..., mfcc_19
    features = [c for c in df.columns if c[:4] == 'mfcc']
    assembler = VectorAssembler(inputCols=features, outputCol='features')

    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

    kmeans = KMeans().setK(100).setFeaturesCol("features_scaled")
    # Build the pipeline with our assembler, standardScaler, and Kmeans stages
    pipeline = Pipeline(stages=[assembler, standardScaler, kmeans])

    model = pipeline.fit(df)
    result = model.transform(df)

    model.write().overwrite().save(model_file)
Example #22
def spark_data_flow():
    input_df = spark.read.parquet(
        ("{path}/"
         "p2p_feature_merge/"
         "{version}").format(path=IN_PAHT, 
                             version=RELATION_VERSION))

    tid_vector_df = input_df.rdd.map(
        get_vectors
    ).toDF(
    ).withColumnRenamed(
        '_1', 'features'
    ).withColumnRenamed(
        '_2', 'bbd_qyxx_id'
    ).withColumnRenamed(
        '_3', 'company_name'
    ).withColumnRenamed(
        '_4', 'platform_name'
    ).withColumnRenamed(
        '_5', 'platform_state'
    )
    
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)
    
    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(tid_vector_df)
    
    # Normalize each feature to have unit standard deviation.
    scaled_df = scalerModel.transform(tid_vector_df)
    
    return scaled_df
Example #23
def vectorize_data(training_data, test_data):
    # Assemble the vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Standardize the data using a StandardScaler fitted on the training set
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True).fit(train_df)
    train_df = scaler.transform(train_df)

    # Select the rows needed
    train_df = train_df.select(['scaledFeatures', TARGET])

    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scaler.transform(test_df)

        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df

    return train_df, new_test_data
Example #24
def fit_eval_model(data, classifier, seed=0):
    train, test = split_data(data, seed)
    scaler = StandardScaler(inputCol='NumFeatures', outputCol='features')
    pipeline = create_pipeline(data, classifier, scaler)
    model = pipeline.fit(train)
    calc_metrics(data, model, test)
    return model
Example #25
def Featurizer(categorical):

    normalizer = StandardScaler(inputCol='numerical',
                                outputCol='numerical_norm')

    label_encoder   = [StringIndexer(inputCol  = col,
                                     outputCol = col + '_label',
                                     handleInvalid  = 'keep')\
                       for col in categorical]

    one_hot_encoder = OneHotEncoderEstimator(\
                       inputCols  = [c + '_label' for c in categorical],
                       outputCols = [c + '_encod' for c in categorical],
                       handleInvalid  = 'keep'
                      )

    assemblerInputs = ['numerical_norm']\
                      + [c + "_encod" for c in categorical]
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")

    featurizer = Pipeline(stages=[normalizer] + label_encoder +
                          [one_hot_encoder] + [assembler])

    return featurizer
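A hedged usage sketch: `df` is assumed to already contain a vector column named "numerical" (for example from a VectorAssembler) plus the listed categorical columns, which are illustrative.
# Hedged usage sketch: fit the featurizer pipeline and produce the assembled "features" column.
featurizer = Featurizer(categorical=["gender", "country"])
featurizer_model = featurizer.fit(df)
df_features = featurizer_model.transform(df).select("features")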
Example #26
def normalizationBySpark(RDDSparkDF):

  scalerSD = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False)
  #scalerMaxMin = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
  #scalerMaxAbs = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
  
  # Compute summary statistics by fitting the StandardScaler
  scalerSDModel = scalerSD.fit(RDDSparkDF)
  #scalerMaxMinModel = scalerMaxMin.fit(RDDSparkDF)
  #scalerMaxAbsModel = scalerMaxAbs.fit(RDDSparkDF)
  # Scale each feature to unit standard deviation (withStd=True, withMean=False).
  scaledDataSD = scalerSDModel.transform(RDDSparkDF)
  #scaledDataMinMax = scalerMaxMinModel.transform(RDDSparkDF)
  #scaledDataMaxAbs = scalerMaxAbsModel.transform(RDDSparkDF)
  
  #Compute summary statistics by fitting the StandardScaler
  #Compute summary statistics and generate MinMaxScalerModel
  #print("Features scaled by SD to range: [%f, %f]" % (scaledDataSD.getMin(), scaledDataSD.getMax()))
  #print("Features scaled by MinMax to range: [%f, %f]" % (scaledDataMinMax.getMin(), scaledDataMinMax.getMax()))
  
  scaledFeatures_outcome = scaledDataSD.rdd.map(extractRow).toDF(newColumns)
  
  leftDF = RDDSparkDF.select(col2)
  df = leftDF.join(scaledFeatures_outcome, ["KEY"])
  
  #return scaledDataSD, scaledDataMinMax,scaledDataMaxAbs,df
  return df
Example #27
def build_scaled_features_pipeline(categorical_columns):
    pipeline_stages = []

    # encode the columns
    for col in categorical_columns:
        indexer = StringIndexer(inputCol=col, outputCol=col + '_idx')
        pipeline_stages += [indexer]

    # build feature vector
    features = [c + "_idx" for c in categorical_columns]
    assembler = VectorAssembler(inputCols=features, outputCol="feature_vec")
    pipeline_stages += [assembler]

    # scale the features
    scaler = StandardScaler(inputCol="feature_vec",
                            outputCol="features",
                            withStd=True,
                            withMean=False)
    pipeline_stages += [scaler]

    # optionally index the label column as well:
    # label_string_indexer = StringIndexer(inputCol='label', outputCol='label_idx')
    # pipeline_stages += [label_string_indexer]

    return Pipeline(stages=pipeline_stages)
Example #28
    def fit(self, sdf):
        """

        :param sdf:
        :return:
        """

        if self.weighter is None:
            raise NotImplementedError(
                "The weighter parameter has not been defined.")

        weights_arr = self.weighter.get_feature_importances(sdf)

        pipeline_lst = [
            VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
            StandardScaler(inputCol="vec", outputCol="standard_vec"),
            ElementwiseProduct(scalingVec=weights_arr,
                               inputCol='standard_vec',
                               outputCol='scaled_vec')
        ]

        _model = Pipeline(stages=pipeline_lst)
        model = _model.fit(sdf)

        self.model = model

        return self
Example #29
def lr_history_data(df_buildFeatures):
    category_features = ['org', 'dst', 'isReturn_type', 'isDirect_type', 'departYear', 'is_vacation', 'vacation_days', 'day_of_vacation']
    stringIndexer_stages = []
    onehotEncoder_stages = []
    for cateIndexer in category_features:
        stringIndexer = StringIndexer(inputCol=cateIndexer, outputCol='stringIndexer_' + cateIndexer)
        onehotEncoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol='onehotEncoder_' + cateIndexer, dropLast=False)
        stringIndexer_stages.append(stringIndexer)
        onehotEncoder_stages.append(onehotEncoder)
    scaler_features = ['departMonth', 'depart_dayofmonth', 'depart_weekofyear', 'depart_dayofweek', 'departQuarter',
                       'intervalDays', 'intervalMonths', 'intervalWeeks', 'intervalQuarters', 'preceding3day_price']
    scaler_assembler = VectorAssembler(inputCols=scaler_features, outputCol='scalerAssembler')
    scaler = StandardScaler(inputCol='scalerAssembler', outputCol='scalerFeatures', withMean=True, withStd=True)
    features = ['onehotEncoder_' + cateIndexer for cateIndexer in category_features]
    features.append('scalerFeatures')
    features_assembler = VectorAssembler(inputCols=features, outputCol='features')
    stages = stringIndexer_stages + onehotEncoder_stages + [scaler_assembler, scaler, features_assembler]
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(df_buildFeatures)
    if hdfs.exists('/predict-2019/pipelineModel_' + todayStr1):
        hdfs.rm('/predict-2019/pipelineModel_' + todayStr1)
    pipelineModel.save('hdfs://10.0.1.218:9000/predict-2019/pipelineModel_' + todayStr1)
    lr_historyData = pipelineModel.transform(df_buildFeatures)\
                                  .select('predictDate', 'departDate', 'flineId_noDate', 'price', 'features')
    # if hdfs.exists('/predict-2019/lr_data_20190624.parquet'):
    #     hdfs.rm('/predict-2019/lr_data_20190624.parquet')
    # lr_data.write.save('hdfs://10.0.1.218:9000/predict-2019/lr_data_20190624.parquet')
    return lr_historyData
Example #30
def train_new_feature_pipeline(df: DataFrame,
                               degree: int = 3) -> PipelineModel:
    """Create a new feature pipeline and fit to training data

    :param df: raw Iris spark sql data frame
    :type df: DataFrame
    :param degree: degree of polynomial feature expansion
    :type degree: int

    :returns: fitted feature pipeline
    :rtype: PipelineModel    
    """
    assembler = VectorAssembler(
        inputCols=[
            "sepal_length_cm",
            "sepal_width_cm",
            "petal_length_cm",
            "petal_width_cm",
        ],
        outputCol="features",
    )
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=True)
    polyExpansion = PolynomialExpansion(degree=degree,
                                        inputCol="scaledFeatures",
                                        outputCol="polyFeatures")
    pipeline = Pipeline(stages=[assembler, scaler, polyExpansion])
    pipeline_model = pipeline.fit(df)
    return pipeline_model
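A hedged usage sketch, assuming `train_df` is an Iris DataFrame with the four *_cm columns named above.
# Hedged usage sketch: fit the feature pipeline on training data, then apply it anywhere.
feature_model = train_new_feature_pipeline(train_df, degree=2)
feature_model.transform(train_df).select("polyFeatures").show(5, truncate=False)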