Example 1
 def test_clear_param(self):
     df = self.spark.createDataFrame([(Vectors.dense([1.0]), ),
                                      (Vectors.dense([2.0]), )], ["a"])
     maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
     model = maScaler.fit(df)
     self.assertTrue(model.isSet(model.outputCol))
     self.assertEqual(model.getOutputCol(), "scaled")
     model.clear(model.outputCol)
     self.assertFalse(model.isSet(model.outputCol))
     self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')
     output = model.transform(df)
     self.assertEqual(model.getOutputCol(), output.schema.names[1])
 def scaling(dataFrame, inputColName):
     outputColName = "scaled " + inputColName
     assembler = VectorAssembler(inputCols=[inputColName], \
                                 outputCol="features")
     assembledDF = assembler.transform(dataFrame)
     scaler = MaxAbsScaler(inputCol="features", \
                           outputCol=outputColName)
     scalerModel = scaler.fit(assembledDF)
     scaledDF = scalerModel.transform(assembledDF).drop("features")
     castVectorToFloat = udf(lambda v : float(v[0]), FloatType())
     scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
     print("Successfully scaled the column '{0:s}' to the range [-1, 1] and created the new column '{1:s}'.".format(inputColName, outputColName))
     return scaledDF
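A minimal usage sketch for the `scaling` helper above (the SparkSession, the sample data, and the column name `price` are assumptions for illustration); the imports also cover the names the helper itself relies on:

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler, MaxAbsScaler

spark = SparkSession.builder.appName("maxabs-scaling-demo").getOrCreate()
df = spark.createDataFrame([(1.0,), (5.0,), (-10.0,)], ["price"])

scaled = scaling(df, "price")   # adds a "scaled price" column with values in [-1, 1]
scaled.show()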
    def standardScaler(self):
        from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler

        dataFrame = self.session.read.format("libsvm").load(
            self.dataDir + "/data/mllib/sample_libsvm_data.txt")
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=False)

        scalerModel = scaler.fit(dataFrame)
        scaledData = scalerModel.transform(dataFrame)
        scaledData.show()

        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [min, max].
        scaledData = scalerModel.transform(dataFrame)
        print("Features scaled to range: [%f, %f]" %
              (scaler.getMin(), scaler.getMax()))
        scaledData.select("features", "scaledFeatures").show()

        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(dataFrame)

        # rescale each feature to range [-1, 1].
        scaledData = scalerModel.transform(dataFrame)

        scaledData.select("features", "scaledFeatures").show()
Example 4
    def maxabs_scale(self, columns='*'):
        '''
        rescale the columns by dividing by the max absolute value
        '''
        if columns == "*":
            columns = self._df.schema.names
        else:
            assert isinstance(columns,
                              list), "Error: columns argument must be a list!"

        for column in columns:
            outputcol = column + '_scaled'
            assembler = VectorAssembler(inputCols=[column],
                                        outputCol='features')
            df = assembler.transform(self._df)
            scaler = MaxAbsScaler(inputCol='features', outputCol=outputcol)
            df = scaler.fit(df).transform(df).drop('features')
            to_float = udf(lambda x: float(x[0]), FloatType())  # specify FloatType; udf defaults to StringType
            self._df = df.withColumn(outputcol, to_float(outputcol))
        return self._df
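A minimal usage sketch (the wrapper instance `prep` and the numeric column names are hypothetical; the method assumes the wrapped DataFrame lives in `self._df`):

scaled = prep.maxabs_scale(columns=["age", "income"])   # hypothetical wrapper instance and columns
scaled.select("age_scaled", "income_scaled").show()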
Example 5
    def test_maxabs_scaler(self):
        data = self.spark.createDataFrame([
            (0, Vectors.dense([1.0, 0.1, -1.0]),),
            (1, Vectors.dense([2.0, 1.1, 1.0]),),
            (2, Vectors.dense([3.0, 10.1, 3.0]),)
        ], ["id", "features"])
        scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml MaxAbsScaler', [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlMaxAbsScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example 6
 def get_scaler(self, option):
     """Set up scaler for dataset."""
     if option == 'standard':
         scaler = StandardScaler(inputCol="features",
                                 outputCol="scaledFeatures",
                                 withStd=True,
                                 withMean=False)
     elif option == 'minmax':
         scaler = MinMaxScaler(inputCol="features",
                               outputCol="scaledFeatures")
     elif option == 'maxabs':
         scaler = MaxAbsScaler(inputCol="features",
                               outputCol="scaledFeatures")
     else:
         scaler = None
     return scaler
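A minimal usage sketch (the owning instance `prep` and a DataFrame `df` with an assembled `features` column are assumptions):

scaler = prep.get_scaler('maxabs')
if scaler is not None:
    scaled_df = scaler.fit(df).transform(df)
    scaled_df.select("features", "scaledFeatures").show()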
def ml_pipeline_factory(inputCols, classifier, param_grid=None):
    """
    Helper function that builds a Spark ML pipeline around a given classifier
    for the given feature columns. The result is a `CrossValidator` that still
    needs to be fitted on the training/validation set with `.fit(df)`.

    INPUT:
        - inputCols (list [string]): list of string names of the feature columns of `df` that shall be considered.
        - classifier: a classifier instance from pyspark.ml.classification
        - param_grid: a ParamGrid that was built for the passed classifier based on pyspark.ml.ParamGridBuilder
    OUTPUT:
        - result (CrossValidator): a Spark ML `CrossValidator` (see the usage sketch after this function).
    """

    # VectorAssembler
    vecAssembler = VectorAssembler(inputCols=inputCols,
                                   outputCol="features",
                                   handleInvalid='skip')

    # Normalizer / Scaler
    """
    TODO Apply Standardization instead of scaling to account for outliers
    """
    maScaler = MaxAbsScaler(inputCol="features", outputCol="features_scaled")

    # Define a pipeline
    pipe = Pipeline(stages=[vecAssembler, maScaler, classifier])

    # Use cross-validation
    cv = CrossValidator(estimator=pipe,
                        evaluator=MulticlassClassificationEvaluator(
                            labelCol='churn', metricName='f1'),
                        estimatorParamMaps=param_grid,
                        numFolds=3,
                        parallelism=4)

    return cv
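A minimal usage sketch (the feature column names and the training DataFrame `train_df` are hypothetical; a `churn` label column is assumed, matching the evaluator above):

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

lr = LogisticRegression(featuresCol="features_scaled", labelCol="churn")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.0, 0.1]).build()

cv = ml_pipeline_factory(["num_sessions", "avg_song_count"], lr, param_grid=grid)
cv_model = cv.fit(train_df)   # train_df: training/validation DataFrame (assumed)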
def loadMaxAbsScaler(path):
    """
        input: path
        return value: scaler [MaxAbsScaler]
    """
    return MaxAbsScaler.load(path)
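A minimal round-trip sketch for `loadMaxAbsScaler` (the save path is an assumption; an active SparkSession is required):

from pyspark.ml.feature import MaxAbsScaler

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scaler.write().overwrite().save("/tmp/maxabs_scaler")   # hypothetical path

reloaded = loadMaxAbsScaler("/tmp/maxabs_scaler")
print(reloaded.getOutputCol())   # -> 'scaledFeatures'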
Example 9
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol(
    "features").setOutputCol("features_minmax_scaled")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features").setOutputCol(
    "features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------
Example 10
from __future__ import print_function

# $example on$
from pyspark.ml.feature import MaxAbsScaler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()

    # $example on$
    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MaxAbsScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [-1, 1].
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    spark.stop()
Example 11
    ).transform(df)
    temp_normalized_vector_col = temp_col_name(assembled)

    trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,})

    scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
        trained_parameters, MinMaxScalerModel, "scaler_model"
    )

    if scaler_model is None:
        scaler = MinMaxScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
        scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

    output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

    scaler = MaxAbsScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)

    output_df = scaler.fit(assembled_wo_nans).transform(assembled)

    # convert the resulting vector back to numeric
    temp_flattened_vector_col = temp_col_name(output_df)
    output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

    # keep only the final scaled column.
    output_column = input_column if output_column is None or not output_column else output_column
    output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
    output_df = output_df.withColumn(output_column, output_column_value)
    final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
    output_df = output_df.select(final_columns)

    return default_spark_with_trained_parameters(output_df, trained_parameters)
Example 12
#Precise daily perspective
fc_5 = ["Hour", "Minute"] + [enc.getOutputCol() for enc in encoders]

#Imprecise daily perspective
fc_6 = ["Hour"] + [enc.getOutputCol() for enc in encoders]

fcs = [fc_1, fc_2, fc_3, fc_4, fc_5, fc_6]
#=========== END FC ===========#

standard_scaler = StandardScaler(inputCol="Features",
                                 outputCol="scaledFeatures",
                                 withStd=False,
                                 withMean=True)
min_max_scaler = MinMaxScaler(inputCol="Features", outputCol="scaledFeatures")
max_abs_scaler = MaxAbsScaler(inputCol="Features", outputCol="scaledFeatures")

norm_standard_scaler = StandardScaler(inputCol="normFeatures",
                                      outputCol="scaledFeatures",
                                      withStd=False,
                                      withMean=True)
norm_min_max_scaler = MinMaxScaler(inputCol="normFeatures",
                                   outputCol="scaledFeatures")
norm_max_abs_scaler = MaxAbsScaler(inputCol="normFeatures",
                                   outputCol="scaledFeatures")

normalizer = Normalizer(inputCol="Features", outputCol="normFeatures")

######END PIPELINE

from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier, DecisionTreeClassifier
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

    def process(self, data_input, data_output):
        """
        A Spark process that performs feature engineering
        :param data_input: data input filename
        :param data_output: data output filename
        """

        df = self.spark.read.parquet(data_input).select('SHP_DATE_CREATED_ID', 'SHP_DATETIME_CREATED_ID', 'SHP_DATE_HANDLING_ID', 'SHP_DATETIME_HANDLING_ID', 'SHP_SENDER_ID', 'SHP_ORDER_COST', 'CAT_CATEG_ID_L7', 'SHP_ADD_ZIP_CODE', 'SHP_DATE_SHIPPED_ID', 'SHP_DATETIME_SHIPPED_ID', 'HT_REAL')

        # 1. SHP_ORDER_COST_INT: cast the SHP_ORDER_COST column from float to integer.
        df = df.withColumn("SHP_ORDER_COST_INT", (df["SHP_ORDER_COST"].cast(IntegerType())))

        # 2. SHP_DAY: add a column indicating the day of the week on which the payment was credited.
        shp_day_udf = udf(self.shp_day, IntegerType())

        df = df.withColumn('SHP_DAY', shp_day_udf(df['SHP_DATE_HANDLING_ID']))

        # 3. WKND_DAY: add a column indicating whether the payment was credited over the weekend.
        weekend_day_udf = udf(self.weekend_day, IntegerType())

        df = df.withColumn('WKND_DAY', weekend_day_udf(df['SHP_DATE_HANDLING_ID']))
        df.select('WKND_DAY').show(10)

        # 4. WK_NUM: add a column indicating the week of the year in which the payment was made.
        week_number_udf = udf(self.week_number, IntegerType())

        df = df.withColumn('WK_NUM', week_number_udf(df['SHP_DATE_HANDLING_ID']))
        df.select('WK_NUM').show(10)

        # 5. MONTH_NUM: add a column indicating the month of the payment.
        month_number_udf = udf(self.month_number, IntegerType())
        df = df.withColumn('MONTH_NUM', month_number_udf(df['SHP_DATE_HANDLING_ID']))

        # 6. TIMESTAMP: add a timestamp for the dates.

        get_timestamp_udf = udf(self.get_timestamp, IntegerType())
        df = df.withColumn('SHP_DATE_HANDLING_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_HANDLING_ID']))
        df = df.withColumn('SHP_DATE_CREATED_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_CREATED_ID']))

        my_handling_time_udf = udf(self.my_handling_time, IntegerType())

        df = df.withColumn('HT', my_handling_time_udf(array('SHP_DATETIME_SHIPPED_ID', 'SHP_DATETIME_HANDLING_ID')))
        shp_sender_indexer = StringIndexer(inputCol="SHP_SENDER_ID", outputCol="SHP_SENDER_ID_NUM").fit(df)
        df = shp_sender_indexer.transform(df)
        shp_sender_indexer = StringIndexer(inputCol="CAT_CATEG_ID_L7", outputCol="CAT_CATEG_ID_L7_NUM").fit(df)
        df = shp_sender_indexer.transform(df)

        #create the vector assembler 
        vec_assembler = VectorAssembler(inputCols=['SHP_DATE_HANDLING_TIMESTAMP', 'SHP_DATE_CREATED_TIMESTAMP','SHP_SENDER_ID_NUM', 'CAT_CATEG_ID_L7_NUM', 
                            'SHP_ORDER_COST_INT', 'SHP_DAY', 'WKND_DAY', 
                            'WK_NUM', 'MONTH_NUM', 'SHP_ADD_ZIP_CODE'], outputCol='features')

        #transform the values
        features_df = vec_assembler.transform(df)

        scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MaxAbsScalerModel
        scalerModel = scaler.fit(features_df)

        # rescale each feature to range [-1, 1].
        scaledData = scalerModel.transform(features_df)

        #Save dataset as parquet
        scaledData.write.format("parquet").mode('overwrite').option("header", "true").save(data_output)
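A minimal invocation sketch (the job instance and both parquet paths are hypothetical; the owning class is assumed to expose a SparkSession as `self.spark`):

job.process("data/shipments.parquet", "output/shipments_scaled.parquet")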
dfAmazon = dfJoin.select('*').where(dfJoin.company == 'AMAZON')
dfGoogle = dfJoin.select('*').where(dfJoin.company == 'GOOGLE')
dfNetflix = dfJoin.select('*').where(dfJoin.company == 'NETFLIX')
dfSnapchat = dfJoin.select('*').where(dfJoin.company == 'SNAPCHAT')
dfMicrosoft = dfJoin.select('*').where(dfJoin.company == 'MICROSOFT')
dfFacebook = dfJoin.select('*').where(dfJoin.company == 'FACEBOOK')
dfFacebook.describe().toPandas().transpose()
display(dfFacebook)

# COMMAND ----------

#Feature scaling using MaxAbsScaler
vectorAssembler = VectorAssembler(
    inputCols=['avg-sentiment', 'avg-followers', 'avg-volume'],
    outputCol='features')
v_dffacebook = vectorAssembler.transform(dfFacebook)
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(v_dffacebook)
scaledData = scalerModel.transform(v_dffacebook)
scaledData.select("features", "scaledFeatures").show()
v_dffacebook1 = scaledData.select(['features', 'scaledFeatures', 'avg-close'])
v_dffacebook1.show()

# COMMAND ----------

#Train test split
train_df, test_df = v_dffacebook1.randomSplit([0.8, 0.2])

# COMMAND ----------

#Linear Regression model
lr = LinearRegression(featuresCol='features', labelCol='avg-close', maxIter=10)
###MaxAbsScaler (-1, 1)
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -8.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, -4.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 8.0]),
)], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()
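For reference, MaxAbsScaler divides each feature by that feature's maximum absolute value seen during `fit`; a minimal NumPy sketch (used only for illustration, not part of the original snippet) reproduces the expected `scaledFeatures` for the toy DataFrame above:

import numpy as np

X = np.array([[1.0, 0.1, -8.0],
              [2.0, 1.0, -4.0],
              [4.0, 10.0, 8.0]])
max_abs = np.abs(X).max(axis=0)   # per-feature max |x|: [4.0, 10.0, 8.0]
print(X / max_abs)                # every feature now lies in [-1, 1]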

# COMMAND ----------

#####Bucketizer transforms continuous features into a column of feature buckets, defined by the bucket boundaries (splits)
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
Example 17
def scaler(input_features):
    scaler = MaxAbsScaler(inputCol="raw_features", outputCol="features")
    scalerModel = scaler.fit(input_features)
    scaledData = scalerModel.transform(input_features).drop("raw_features")
    #scaledData.show(3)
    return scaledData
Example 18
@author: luogan
"""

from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession \
                .builder \
                .appName("dataFrame") \
                .getOrCreate()
dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -8.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, -4.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 8.0]),
)], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()
Example 19
dfJoin = dfJoin.withColumnRenamed("avg(volume)","avg-volume")
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)","avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
dfJoin1 = dfJoin.select("avg-sentiment","avg-followers","avg-volume")
inputFeatures = ["avg-sentiment","avg-followers","avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

#Elbow method
import numpy as np
cost = np.zeros(10)
for k in range(2,10):
    kmeans = KMeans().setK(k).setFeaturesCol("scaledFeatures").setPredictionCol("prediction").setMaxIter(1).setSeed(1)
    model = kmeans.fit(scaledData)
    cost[k] = model.computeCost(scaledData)

# COMMAND ----------
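The loop above collects the clustering cost for k = 2..9; a minimal sketch for inspecting the elbow visually (matplotlib is an assumption, not part of the original snippet):

import matplotlib.pyplot as plt

ks = list(range(2, 10))
plt.plot(ks, cost[2:10], marker="o")
plt.xlabel("k (number of clusters)")
plt.ylabel("within-cluster cost (WSSSE)")
plt.title("Elbow method")
plt.show()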