Beispiel #1
0
    def run(self):
        parquet_path = self.settings.parquet_path
        tf_path = self.settings.tf_path
        target = self.settings.target

        spark = SparkSession.builder.getOrCreate()

        with open("transform_spark.txt", "w") as file:
            file.write("spark context" + str(spark.sparkContext))
            file.write("===SeessionID===")
            file.write(str(id(spark)))

        df = spark.read.option("header", "true") \
            .option("inferSchema", "true") \
            .parquet(parquet_path)
        df.repartition(10)

        # DATA TYPE SUMMARY
        data_types = defaultdict(list)
        for entry in df.schema.fields:
            data_types[str(entry.dataType)].append(entry.name)

        # NUMERIC PIPELINE
        numeric_features = data_types["DoubleType"] + data_types["IntegerType"]
        if target in numeric_features:
            numeric_features.remove(target)

        for c in data_types["IntegerType"]:
            df = df.withColumn(c, df[c].cast("double"))

        imputer = Imputer(
            inputCols=numeric_features,
            outputCols=[num + "_imputed" for num in numeric_features])
        numeric_imputed = VectorAssembler(inputCols=imputer.getOutputCols(),
                                          outputCol="imputed")
        scaler = StandardScaler(inputCol="imputed", outputCol="scaled")
        num_assembler = VectorAssembler(inputCols=["scaled"],
                                        outputCol="numeric")
        num_pipeline = Pipeline(stages=[imputer, numeric_imputed, scaler] +
                                [num_assembler])
        df = num_pipeline.fit(df).transform(df)

        # CATEGORY PIPELINE
        category_features = [
            var for var in data_types["StringType"] if var != target
        ]
        cat_missing = {}
        for var in category_features:
            cat_missing[var] = "unknown"  # Impute category features
        df = df.fillna(cat_missing)
        useful_category_features = []

        for var in category_features:
            # Drop if distinct values in a category column is greater than 15% of sample number.
            if df.select(var).distinct().count() < 0.15 * df.count():
                useful_category_features.append(var)

        indexers = [
            StringIndexer(inputCol=c,
                          outputCol="{0}_indexed".format(c),
                          handleInvalid='skip')
            for c in useful_category_features
        ]

        encoders = [
            OneHotEncoder(inputCol=indexer.getOutputCol(),
                          outputCol="{0}_encoded".format(
                              indexer.getOutputCol())) for indexer in indexers
        ]

        cat_assembler = VectorAssembler(
            inputCols=[encoder.getOutputCol() for encoder in encoders],
            outputCol="category")
        cat_pipeline = Pipeline(stages=indexers + encoders + [cat_assembler])
        df = cat_pipeline.fit(df).transform(df)

        # Integrate features
        features_processed = VectorAssembler(inputCols=["category", "numeric"],
                                             outputCol="features")
        tot_pipeline = Pipeline(stages=[features_processed])
        processed = tot_pipeline.fit(df).transform(df)
        processed.write.mode("overwrite").parquet(tf_path)

        feature_info = {
            "numeric_features": numeric_features,
            "category_features": category_features
        }

        with open("./feature_info.pickle", "wb") as handle:
            pickle.dump(feature_info, handle)
# DATA TYPE SUMMARY
data_types = defaultdict(list)
for entry in df.schema.fields:
    data_types[str(entry.dataType)].append(entry.name)

# NUMERIC PIPELINE
numeric_features = data_types["DoubleType"] + data_types["IntegerType"]
if target in numeric_features:
    numeric_features.remove(target)

for c in data_types["IntegerType"]:
    df = df.withColumn(c, df[c].cast("double"))

imputer = Imputer(inputCols=numeric_features,
                  outputCols=[num + "_imputed" for num in numeric_features])
numeric_imputed = VectorAssembler(inputCols=imputer.getOutputCols(),
                                  outputCol="imputed")
scaler = StandardScaler(inputCol="imputed", outputCol="scaled")
num_assembler = VectorAssembler(inputCols=["scaled"], outputCol="numeric")
num_pipeline = Pipeline(stages=[imputer, numeric_imputed, scaler] +
                        [num_assembler])
df = num_pipeline.fit(df).transform(df)

# CATEGORY PIPELINE
category_features = [var for var in data_types["StringType"] if var != target]
cat_missing = {}
for var in category_features:
    cat_missing[var] = "unknown"  # Impute category features
df = df.fillna(cat_missing)
useful_category_features = []