Example 1
def piStrOneHotEncoding(featurename, dataframe):
    from pyspark.ml.feature import OneHotEncoder
    from pyspark.ml.feature import StringIndexer
    from pyspark.ml.linalg import DenseVector
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    # Index the string column, then one-hot encode the resulting indices
    indexed = dataframe
    indexer = StringIndexer(inputCol=featurename, outputCol=featurename + "HE")
    indexed = indexer.fit(indexed).transform(indexed)
    encoder = OneHotEncoder(inputCols=[featurename + "HE"],
                            outputCols=[featurename + "OHE"])
    indexed = encoder.fit(indexed).transform(indexed)

    # Convert the sparse one-hot vector into a plain list of ints so the
    # feature can be stored in an ordinary array column
    def convertSparseVectortoDenseVectorInt(v):
        v = DenseVector(v)
        return [int(x) for x in v]

    toDenseVectorUdfInt = F.udf(convertSparseVectortoDenseVectorInt,
                                T.ArrayType(T.IntegerType()))

    indexed = indexed.drop(featurename).drop(featurename + "HE").withColumn(
        featurename,
        toDenseVectorUdfInt(featurename + "OHE")).drop(featurename + "OHE")
    #indexer = VectorIndexer(inputCol=featurename+"OHE", outputCol=featurename+"tHE", maxCategories=10)
    #indexerModel = indexer.fit(indexed)
    #indexed = indexerModel.transform(indexed)

    return indexed
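A minimal usage sketch for the helper above, assuming an active SparkSession named spark; the toy column name 'color' is illustrative:

# Hypothetical usage of piStrOneHotEncoding on a small string column
df = spark.createDataFrame([("red",), ("blue",), ("red",)], ["color"])
encoded = piStrOneHotEncoding("color", df)
encoded.show()  # 'color' now holds a dense 0/1 integer array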
Example 2
def add_pref_user_system(spark):
    """
    For each user, add the preferred system (iPad, macOS, Windows, ...)
    as a dummy variable with one-hot encoding. Note that MAX(userSystem)
    picks the alphabetically last value per user, not the statistical mode.
    Input:
        - spark (SparkSession): an initialized pyspark.sql.SparkSession
    """
    pref_user_system = """
                    SELECT userId, MAX(userSystem) as pref_user_system
                    FROM df_table
                    GROUP BY userId
                    """
    features_df = add_features(spark, pref_user_system, ['pref_user_system'])

    # Fill null operating systems with the placeholder "unknown"
    features_df = features_df.fillna("unknown", subset=['pref_user_system'])

    # Index the preferred user system (macOS, iPhone, etc.)
    stringIndexer = StringIndexer(inputCol="pref_user_system",
                                  outputCol="pref_user_system_ind",
                                  stringOrderType="frequencyDesc")

    features_df = stringIndexer.fit(features_df).transform(features_df)

    # Apply one-hot-encoding for the user system
    ohe = OneHotEncoder(inputCol="pref_user_system_ind",
                        outputCol="pref_user_system_ohe",
                        dropLast=True)

    ohe_model = ohe.fit(features_df)
    features_df = ohe_model.transform(features_df)
    features_df.createOrReplaceTempView("features_df")

    return features_df
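If the true per-user mode is wanted rather than MAX, a windowed count does it; a sketch assuming the same df_table view and add_features helper as above:

# Hypothetical variant: pick the most frequent userSystem per user,
# breaking ties alphabetically
pref_user_system_mode = """
                SELECT userId, userSystem AS pref_user_system
                FROM (
                    SELECT userId, userSystem,
                           ROW_NUMBER() OVER (PARTITION BY userId
                                              ORDER BY COUNT(*) DESC,
                                                       userSystem) AS rn
                    FROM df_table
                    GROUP BY userId, userSystem
                )
                WHERE rn = 1
                """
features_df = add_features(spark, pref_user_system_mode, ['pref_user_system'])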
Example 3
def oneHotEncodeColumns(df, cols):
    # One-hot encode each (already indexed) column in place, keeping the
    # original column name
    newdf = df
    for c in cols:
        ohe = OneHotEncoder(inputCol=c, outputCol=c+'-onehot', dropLast=False)
        ohe_model = ohe.fit(newdf)
        newdf = ohe_model.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+'-onehot', c)
    return newdf
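A usage sketch for the loop above; the input columns must already be numeric indices, so a StringIndexer pass comes first (all names illustrative):

# Hypothetical usage: index a string column, then one-hot it in place
from pyspark.ml.feature import StringIndexer, OneHotEncoder

raw = spark.createDataFrame([("a",), ("b",), ("a",)], ["cat"])
indexed = StringIndexer(inputCol="cat", outputCol="cat_idx").fit(raw).transform(raw)
encoded = oneHotEncodeColumns(indexed, ["cat_idx"])
encoded.show()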
Example 4
def one_hot_example(data: DataFrame):
    data_with_id_number = data.withColumn("movieIdNumber",
                                          F.col("movieId").cast("Integer"))
    encoder = OneHotEncoder(inputCol="movieIdNumber",
                            outputCol="movieIdVector")
    model = encoder.fit(data_with_id_number)
    result = model.transform(data_with_id_number)
    print_info(result)
Example 5
def add_origin(df, trained_model=None):
    # Reuse an existing StringIndexerModel when one is passed in, so train
    # and test splits share the same ORIGIN index mapping
    from pyspark.ml.feature import OneHotEncoder, StringIndexer
    if not trained_model:
        indexer = StringIndexer(inputCol='ORIGIN',
                                outputCol='origin_index')
        trained_model = indexer.fit(df)
    indexed = trained_model.transform(df)
    encoder = OneHotEncoder(inputCol='origin_index',
                            outputCol='origin_onehot')
    return trained_model, encoder.fit(indexed).transform(indexed)
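The trained_model parameter exists so the fitted index can be reused at prediction time; a sketch, with train_df and test_df as hypothetical splits sharing the ORIGIN column:

# Fit the index on the training split, then reuse it on the test split
origin_model, train_encoded = add_origin(train_df)
_, test_encoded = add_origin(test_df, trained_model=origin_model)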
Example 6
def oneHotEncoderExample(movieSamples):
    samplesWithIdNumber = movieSamples.withColumn(
        "movieIdNumber",
        F.col("movieId").cast(IntegerType()))
    encoder = OneHotEncoder(inputCols=["movieIdNumber"],
                            outputCols=["movieIdVector"],
                            dropLast=False)
    oneHotEncoderSamples = encoder.fit(samplesWithIdNumber).transform(
        samplesWithIdNumber)
    oneHotEncoderSamples.printSchema()
    oneHotEncoderSamples.show(10)
Example 7
def transform_data(data):
    '''
    Transform dataset
    Input:
        (data) Spark DataFrame
    Output:
        Transformed DataFrame
    '''
    # Save features that are already numerical
    data_num = data.select('age', 'hypertension', 'heart_disease', 'avg_glucose_level')

    # Use StringIndexer to transform categories to numerical values
    inputs = ['gender', 'ever_married', 'work_type',
                'Residence_type', 'smoking_status']
    outputs = ['gender_i', 'ever_married_i', 'work_type_i',
                'Residence_type_i', 'smoking_status_i']
    indexer = StringIndexer(inputCols=inputs, outputCols=outputs)
    indexed = indexer.fit(data).transform(data)
    indexed = indexed.select(*outputs)

    # Use OneHotEncoder to map the numerical values to vectors
    encoder = OneHotEncoder(inputCols=indexed.columns, outputCols=inputs)
    encoded = encoder.fit(indexed).transform(indexed)
    encoded = encoded.select(*inputs)

    # Combine numerical features into a single DataFrame
    # (caveat: Window.orderBy(lit(1)) imposes no real ordering, so this
    # row-number join assumes both frames keep their original row order)
    w = Window.orderBy(lit(1))
    data_num = data_num.withColumn('rn', row_number().over(w)-1)
    encoded = encoded.withColumn('rn', row_number().over(w)-1)
    combined_data = data_num.join(encoded, ['rn']).drop('rn')

    # Combine features into a single feature column using VectorAssembler
    assembler = VectorAssembler(inputCols=combined_data.columns, outputCol='features')
    assembled = assembler.transform(combined_data)

    # Convert ML vectors to NumPy arrays (toArray handles sparse and dense)
    assembled = assembled.toPandas()
    assembled['features'] = assembled['features'].apply(lambda v: v.toArray())

    # Transform feature arrays to columns (assumes import pandas as pd)
    new_columns = range(len(assembled['features'][0]))
    new_data = assembled['features'].to_list()
    assembled = pd.DataFrame(new_data, columns=new_columns)

    return assembled
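The row-number join above is fragile; a Pipeline keeps every derived column on its source row without any join. A sketch of that alternative under the same column names:

# Hypothetical pipeline variant: index, encode and assemble in one pass
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    StringIndexer(inputCols=inputs, outputCols=outputs),
    OneHotEncoder(inputCols=outputs, outputCols=[c + '_ohe' for c in inputs]),
    VectorAssembler(
        inputCols=['age', 'hypertension', 'heart_disease', 'avg_glucose_level']
        + [c + '_ohe' for c in inputs],
        outputCol='features'),
])
transformed = pipeline.fit(data).transform(data)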
Example 8
    def oneHotEncodeColumns(self, df, cols, save_path):
        '''
        :param df: input spark.DataFrame
        :param cols: names of the columns to encode
        :return: new spark.DataFrame with the encoded columns
        '''
        newdf = df
        num = 0
        total = len(cols)
        print("正在onehot特征化...")

        @udf(ArrayType(IntegerType()))
        def toDense(v):
            # Expand the sparse one-hot vector into a dense list of ints
            v = DenseVector(v)
            return [int(x) for x in v]

        for c in cols:
            num += 1
            onehotEncoderPath = save_path + "/onehot-" + c
            print("{0}/{1} one-hot encoding feature: {2}".format(num, total, c))
            stringIndexer = StringIndexer(inputCol=c,
                                          outputCol=c + "Index",
                                          handleInvalid="keep")
            # Fit and transform newdf (not df) so each iteration builds on
            # the previous one instead of discarding it
            model = stringIndexer.fit(newdf)
            indexed = model.transform(newdf)
            ohe = OneHotEncoder(inputCol=c + "Index",
                                outputCol=c + "-sparse",
                                dropLast=False)
            newdf = ohe.fit(indexed).transform(indexed)
            newdf = newdf.withColumn(
                c + "-onehot", toDense(c + "-sparse")).drop(c + "-sparse")
            ohe.write().overwrite().save(onehotEncoderPath)
        print("One-hot encoding done!")
        # newdf.withColumn('updatetime', pyf.current_timestamp())
        # newdf.write.mode("overwrite").saveAsTable("mkt_mldb_tmp.TRAIN_dfhotable")
        return newdf
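Note that ohe.write() above saves the unfitted Estimator; to preserve the learned category sizes it is the fitted OneHotEncoderModel that needs saving. A sketch of that variant:

# Hypothetical fix: persist the fitted model, not the estimator
ohe_model = ohe.fit(indexed)
newdf = ohe_model.transform(indexed)
ohe_model.write().overwrite().save(onehotEncoderPath)
# reload later with:
# from pyspark.ml.feature import OneHotEncoderModel
# OneHotEncoderModel.load(onehotEncoderPath)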
Example 9
def oneHotEncoderExample(movieSamples):
    # Add a movieIdNumber column holding the movie ID as an integer
    samplesWithIdNumber = movieSamples.withColumn(
        "movieIdNumber",
        F.col("movieId").cast(IntegerType()))

    # One-hot encode movieIdNumber into movieIdVector with pyspark.ml.feature.
    # dropLast=True would represent the last category as an all-zero vector,
    # which is not what we want here.
    # There are 1000 movies but 1001 one-hot slots, because no movie has ID 0.
    encoder = OneHotEncoder(inputCols=["movieIdNumber"],
                            outputCols=['movieIdVector'],
                            dropLast=False)

    # Run the transformation
    oneHotEncoderSamples = encoder.fit(samplesWithIdNumber).transform(
        samplesWithIdNumber)

    # Print the schema
    oneHotEncoderSamples.printSchema()
    # Print the first rows
    oneHotEncoderSamples.show(10, False)
Example 10
    def test_model_onehot_encoder(self):
        encoder = OneHotEncoder(inputCols=['index'], outputCols=['indexVec'])
        data = self.spark.createDataFrame(
            [(0.0,), (1.0,), (2.0,), (2.0,), (0.0,), (2.0,)], ['index'])
        model = encoder.fit(data)
        model_onnx = convert_sparkml(
            model, 'Sparkml OneHotEncoder', [('index', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.select("index").toPandas().values.astype(numpy.float32)
        predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
            lambda x: x.toArray().tolist()).values
        expected = numpy.asarray(
            [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])

        paths = save_data_models(data_np, expected, model, model_onnx,
                                basename="SparkmlOneHotEncoder")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['indexVec'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example 11
def one_hot_indexer_example():
    # https://spark.apache.org/docs/latest/mllib-data-types.html#local-vector
    # https://stackoverflow.com/questions/42295001/how-to-interpret-results-of-spark-onehotencoder#42297197
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (3.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoder(
        inputCols=[
            "categoryIndex1",
            "categoryIndex2"],
        outputCols=[
            "categoryVec1",
            "categoryVec2"],
        dropLast=False)
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
Example 12

from __future__ import print_function

# $example on$
from pyspark.ml.feature import OneHotEncoder
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                                (0.0, 1.0), (2.0, 0.0)],
                               ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                            outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
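As the note above says, string features are usually indexed first; a minimal sketch chaining both steps in a Pipeline (assumes an active SparkSession; the 'category' column is illustrative):

# Hypothetical StringIndexer + OneHotEncoder chain inside a Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

raw = spark.createDataFrame([("a",), ("b",), ("c",), ("a",)], ["category"])
pipe = Pipeline(stages=[
    StringIndexer(inputCol="category", outputCol="categoryIndex"),
    OneHotEncoder(inputCols=["categoryIndex"], outputCols=["categoryVec"]),
])
pipe.fit(raw).transform(raw).show()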
Example 13
from pyspark.ml.feature import OneHotEncoder

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Dealing with Cruise Line column: turn string to index and apply onehot encoding

# COMMAND ----------

# StringIndex + Onehot for the Cruiseline Column:
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruiseline_Index')
indexed = indexer.fit(dataset).transform(dataset)

encoder = OneHotEncoder(inputCols=['Cruiseline_Index'],
                        outputCols=['Cruiseline_Onehot'])
encoded = encoder.fit(indexed).transform(indexed)

encoded.show()

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Construct the Feature column for the model by using VectorAssembler

# COMMAND ----------

assembler = VectorAssembler(inputCols=[
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'Cruiseline_Onehot'
], outputCol='features')
Example 14
        .getOrCreate()

    # TODO add timer
    # TODO check resource usage

    # Load training data
    data = spark.read.options(header=True, inferSchema=True)\
                .csv("../UCI_repo/Abalone/abalone.csv")
    data.cache()
    data.printSchema()
    print(data.describe().toPandas().transpose())

    indexer = StringIndexer(inputCol='Sex', outputCol='Sex_cat')
    indexed = indexer.fit(data).transform(data)
    onehot = OneHotEncoder(inputCol='Sex_cat', outputCol='Sex_onehot')
    onehot_encoded = onehot.fit(indexed).transform(indexed)

    for item in onehot_encoded.head(3):
        print(item)
        print('\n')

    assembler = VectorAssembler(inputCols=['Length', 'Diameter', 'Height', 'Whole weight',\
                                         'Shucked weight', 'Viscera weight', 'Shell weight',\
                                         'Sex_onehot'], outputCol='features')

    output = assembler.transform(onehot_encoded)

    final_data = output.select('features', 'Rings')
    train_data, test_data = final_data.randomSplit([0.7, 0.3])
    test_data.describe().show(3)
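A hedged continuation for the split above: fit a simple linear model predicting Rings and check the test RMSE (the model choice is illustrative):

# Hypothetical next step: train and evaluate a linear regression
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='Rings')
lr_model = lr.fit(train_data)
print(lr_model.evaluate(test_data).rootMeanSquaredError)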
Example 16
}).show()  # group by and compute statistics

df = df.withColumn(
    'age-square',
    col('age')**2)  # add a column named age-square that is the square of age

# df = df.select([...]) # can easily change the order of columns

df.filter(df['native-country'] == 'Holand-Netherlands').count()  # count dutch

df.groupby('native-country').agg({
    'native-country': 'count'
}).sort(asc('count(native-country)')).show(
)  # show statistics that shows the count of each native-country ordered by the count asc

df_remove = df.filter(
    df['native-country'] != 'Holand-Netherlands')  # remove dutch cases

stringIndexer = StringIndexer(inputCol='workclass',
                              outputCol='workclass-encoded'
                              )  # index the string column as the first step of one-hot encoding
model = stringIndexer.fit(df)
indexed = model.transform(
    df)  # workclass transformed to workclass-encoded (float)

encoder = OneHotEncoder(dropLast=False,
                        inputCol='workclass-encoded',
                        outputCol='workclass-vec')
one_hot_model = encoder.fit(indexed)
encoded_df = one_hot_model.transform(indexed)
Example 17
string_indexer = StringIndexer(inputCol="country_code",
                               outputCol="country_indexed",
                               handleInvalid="skip")
indexed_df = string_indexer.fit(w2v_df).transform(w2v_df)
indexed_df.select("country_code", "country_indexed").display()

# COMMAND ----------

# MAGIC %md Transforming label indices into binary vectors using OneHotEncoder

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder

ohe = OneHotEncoder(inputCol="country_indexed", outputCol="country_ohe")
ohe_df = ohe.fit(indexed_df).transform(indexed_df)
ohe_df.select("country_code", "country_ohe").display()

# COMMAND ----------

# MAGIC %md Run StringIndexer on `quantity` column as well.

# COMMAND ----------

qty_indexer = StringIndexer(inputCol="quantity",
                            outputCol="quantity_indexed",
                            handleInvalid="skip")
qty_df = qty_indexer.fit(ohe_df).transform(ohe_df)
qty_df.select("quantity", "quantity_indexed").display()

# COMMAND ----------
Example 18
	if args.mode != 'test':
		dataset = dataset.withColumn('duration', F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
		dataset = dataset.withColumn('duration', F.log(F.lit(1e-6))/F.col('duration'))
		dataset = dataset.withColumn('duration', F.exp(F.col('duration')))
	stringIndex_model = None
	if args.mode == 'train':
		stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
		stringIndex_model = stringIndexer.fit(dataset)
		stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	else:
		stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	dataset = stringIndex_model.transform(dataset)
	encoder_model = None
	if args.mode == 'train':
		encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
		encoder_model = encoder.fit(dataset)
		encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	else:
		encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	dataset = encoder_model.transform(dataset)
	feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
	assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
	dataset = assembler.transform(dataset)
	scaler_model = None
	if args.mode == 'train':
		scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
		scaler_model = scaler.fit(dataset)
		scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	else:
		scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	dataset = scaler_model.transform(dataset)
Example 19
string_index_encoder = StringIndexer(
    inputCols=categorical_columns,
    outputCols=[c + '_str_ind' for c in categorical_columns],
    stringOrderType='alphabetAsc')
train_car_data_frame = string_index_encoder.fit(car_data_frame).transform(
    car_data_frame)
train_car_data_frame.head(1)

# In[ ]:

one_hot_encoder = OneHotEncoder(
    inputCols=[c + '_str_ind' for c in categorical_columns],
    outputCols=[c + '_vec' for c in categorical_columns],
    dropLast=False)
train_car_data_frame = one_hot_encoder.fit(train_car_data_frame).transform(
    train_car_data_frame)
train_car_data_frame.head(1)

# In[ ]:

numeric_columns = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
Example 20
# One-hot encode the columns already converted from categorical to numeric

# Categorical index columns that need to be one-hot encoded
cat_columns = [
    'sex_index', 'marital_index', 'relationship_index', 'workclass_index',
    'occupation_index'
]

#new column names after OHE
cat_columns_enc = [col + '_enc' for col in cat_columns]

ohe = OneHotEncoder(inputCols=cat_columns,
                    outputCols=cat_columns_enc,
                    dropLast=True)
ohe = ohe.fit(df2)
df2 = ohe.transform(df2)

#dropping the older columns
for col in cat_columns:
    df2 = df2.drop(col)

print('Dataframe and columns after OHE:')
df2.show(2)
print(df2.columns)

# Converting the preprocessed columns into vectors
# Pyspark expects the features to be coalesced together, so this snippet of code
# combines all the features that will be used for prediction of the target
# and stores them in the same dataframe under a new column 'vectors'
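The snippet that the comment above announces appears to have been cut; a hypothetical reconstruction of the assembly step (the numeric column names are placeholders):

# Hypothetical reconstruction of the missing VectorAssembler step
from pyspark.ml.feature import VectorAssembler

numeric_cols = ['age', 'hours_per_week']  # placeholder numeric features
assembler = VectorAssembler(inputCols=numeric_cols + cat_columns_enc,
                            outputCol='vectors')
df2 = assembler.transform(df2)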
Example 21
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features",
                        outputCol="scaled_features",
                        withStd=True,
                        withMean=False)
scalerModel = scaler.fit(irisLpDf)
irisNormalizedDf = scalerModel.transform(irisLpDf)
irisNormalizedDf.printSchema()
""" Label OneHotEncoder (Estimator \ the Transformer is now deprecated)"""

print("One Hot Encoder")
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(dropLast=False,
                        inputCol="label",
                        outputCol="encoded_label")
encoder_model = encoder.fit(irisNormalizedDf)
irisNormalizedencodedDf = encoder_model.transform(irisNormalizedDf)
irisNormalizedencodedDf.printSchema()
""" PCA Principal Component Analysis"""

print("PCA Analysis")
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="scaled_features", outputCol="pcaFeatures")
pca_model = pca.fit(irisNormalizedencodedDf)
iris_pcaDf = pca_model.transform(irisNormalizedencodedDf)
iris_pcaDf.printSchema()
print(
    "PCA model explained variance", pca_model.explainedVariance,
    "Cumulative explained variance", pca_model.explainedVariance[0] +
    pca_model.explainedVariance[1] + pca_model.explainedVariance[2])
""" Define the final DataFrame : apply the pipeline"""
Example 22
def test_log_stage_type_params(spark_session):
    from pyspark.ml.base import Estimator, Transformer, Model
    from pyspark.ml.evaluation import Evaluator, BinaryClassificationEvaluator
    from pyspark.ml.param import Param, Params
    from pyspark.ml.feature import Binarizer, OneHotEncoder

    class TestingEstimator(Estimator):

        transformer = Param(Params._dummy(), "transformer",
                            "a transformer param")
        model = Param(Params._dummy(), "model", "a model param")
        evaluator = Param(Params._dummy(), "evaluator", "an evaluator param")

        def setTransformer(self, transformer: Transformer):
            return self._set(transformer=transformer)

        def setModel(self, model: Model):
            return self._set(model=model)

        def setEvaluator(self, evaluator: Evaluator):
            return self._set(evaluator=evaluator)

        def _fit(self, dataset):
            return TestingModel()

    class TestingModel(Model):
        def _transform(self, dataset):
            return dataset

    binarizer = Binarizer(threshold=1.0,
                          inputCol="values",
                          outputCol="features")
    df = spark_session.createDataFrame([(0.0, ), (1.0, ), (2.0, )], ["input"])
    ohe = OneHotEncoder().setInputCols(["input"]).setOutputCols(["output"])
    ohemodel = ohe.fit(df)
    bcd = BinaryClassificationEvaluator(metricName="areaUnderROC")

    estimator = TestingEstimator().setTransformer(binarizer).setModel(
        ohemodel).setEvaluator(bcd)
    param_map = get_params_to_log(estimator)
    assert param_map["transformer"] == "Binarizer"
    assert param_map["model"] == "OneHotEncoderModel"
    assert param_map["evaluator"] == "BinaryClassificationEvaluator"

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        estimator.fit(df)
        metadata = _gen_estimator_metadata(estimator)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
        assert isinstance(estimator_info["hierarchy"]["params"], dict)
        assert estimator_info["hierarchy"]["params"]["transformer"][
            "name"] == "Binarizer"
        assert estimator_info["hierarchy"]["params"]["model"][
            "name"] == "OneHotEncoderModel"
        assert (estimator_info["hierarchy"]["params"]["evaluator"]["name"] ==
                "BinaryClassificationEvaluator")
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(estimator)))
Example 23
scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)
scaledHousing.select('scaled_features').show()

# 3-2: not entirely clear what happens here (TODO: revisit)
distinct = renamedHousing.select('ocean_proximity').distinct().collect()
print(distinct)
renamedHousing.agg(countDistinct("ocean_proximity")).show()

indexer = StringIndexer().setInputCol('ocean_proximity').setOutputCol('idx_ocean_proximity')
idxHousing = indexer.fit(renamedHousing).transform(renamedHousing)
idxHousing.show()

encoder = OneHotEncoder().setInputCol('idx_ocean_proximity').setOutputCol('one_hot_ocean_proximity')
ohHousing = encoder.fit(idxHousing).transform(idxHousing)
ohHousing.show()

#4
numPipeline = [imputer,va,scaler]
catPipeline = [indexer,encoder]
pipeline = Pipeline(stages=numPipeline)
newHousing = pipeline.fit(renamedHousing).transform(renamedHousing)
newHousing = newHousing.drop('features')
newHousing.show()
pipeline = pipeline.setStages(catPipeline)
newHousing = pipeline.fit(newHousing).transform(newHousing)
newHousing.show()


va2 = VectorAssembler().setInputCols(['scaled_features','one_hot_ocean_proximity']).setOutputCol('features')
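va2 is defined above but never applied; presumably the final step looks like this sketch:

# Hypothetical final assembly of scaled numeric and one-hot features
finalHousing = va2.transform(newHousing)
finalHousing.select('features').show(5)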
Example 24
indexer.fit(spark_df).transform(spark_df).show(5)

temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("gender_label",
                               temp_sdf["gender_label"].cast("integer"))

spark_df = spark_df.drop('segment')
spark_df = spark_df.drop('geography')
spark_df = spark_df.drop('gender')

##################################################
# One Hot Encoding
##################################################

encoder = OneHotEncoder(inputCols=["age_cat"], outputCols=["age_cat_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

encoder = OneHotEncoder(inputCols=["hascrcard"],
                        outputCols=["hascrcard_cat_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

##################################################
# definition of TARGET
##################################################

stringIndexer = StringIndexer(inputCol='exited', outputCol='label')
temp_sdf = stringIndexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("label", temp_sdf["label"].cast("integer"))

##################################################
Example 25
    categorical_features_names = [
        'tonal_chords_key', 'tonal_chords_scale', 'tonal_key_key',
        'tonal_key_scale'
    ]
    indexed_names = list(
        map(lambda x: x + '_indexed', categorical_features_names))
    encoded_names = list(map(lambda x: x + '_encoded', indexed_names))
    indexer = StringIndexer(inputCols=categorical_features_names,
                            outputCols=indexed_names).setHandleInvalid("keep")
    df_ = indexer.fit(df_).transform(df_) \
        .drop(*categorical_features_names)

    encoder = OneHotEncoder(inputCols=indexed_names,
                            outputCols=encoded_names,
                            dropLast=False)
    df_ = encoder.fit(df_).transform(df_)
    for name in encoded_names:
        df_ = df_.withColumn(name + "_arr", one_hot_vector_to_array(col(name)))

    # cleanup
    for name in indexed_names:
        df_ = df_.drop(name)
    for name in encoded_names:
        df_ = df_.drop(name)

    # flatten array columns
    df_ = flatten_df_arrays(df_) \
        .persist(StorageLevel.DISK_ONLY)

    feature_columns = df_.columns
    feature_columns.remove('rec_MBID')
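A hedged continuation: assemble the remaining flattened columns into a single feature vector:

# Hypothetical next step: combine all flattened feature columns
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_ = assembler.transform(df_)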