def discrete(self):
    # Bucketizer
    from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = self.session.createDataFrame(data, ["features"])
    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    # QuantileDiscretizer
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = self.session.createDataFrame(data, ["id", "hour"])
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
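# A minimal standalone sketch of the same fixed-split bucketing (assumes an active
# SparkSession bound to `spark`; the variable names are illustrative). Each bucket
# is the half-open interval [splits[i], splits[i+1]), with the last bucket also
# including its upper bound, so the example rows -999.9, -0.5, -0.3, 0.0, 0.2, 999.9
# land in buckets 0, 1, 1, 2, 2 and 3.
from pyspark.ml.feature import Bucketizer

sketch_df = spark.createDataFrame([(-999.9,), (-0.5,), (0.2,)], ["features"])
Bucketizer(splits=[-float("inf"), -0.5, 0.0, 0.5, float("inf")],
           inputCol="features", outputCol="bucketedFeatures") \
    .transform(sketch_df) \
    .show()  # expected bucket indices: 0.0, 1.0, 2.0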
def bucketizer_splits(dataFrame, inputCol, splits=[-float('inf'), -0.5, 0.0, 0.5, float('inf')]):
    # Discretize the column into buckets at the given boundaries
    bucketizer = Bucketizer(splits=splits, inputCol=inputCol,
                            outputCol='%s_bucketizer' % (inputCol))  # splits defines the bucket boundaries
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' % (len(bucketizer.getSplits()) - 1))
    return bucketedData
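# Usage sketch for the helper above (assumes an active SparkSession bound to
# `spark`; the column name "value" and the sample rows are illustrative).
values_df = spark.createDataFrame([(-1.2,), (-0.4,), (0.3,), (0.9,)], ["value"])
bucketizer_splits(values_df, "value").show()  # adds a "value_bucketizer" column of bucket indices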
def buckert(self, df, column):
    """Bucketize a column at the specified boundaries with Bucketizer."""
    splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # Discretize the column into buckets at the given boundaries
    bucketizer = Bucketizer(splits=splits, inputCol=column,
                            outputCol=column + '_bucketed')  # splits defines the bucket boundaries
    bucketedData = bucketizer.transform(df)
    print('Bucketizer output with %d buckets' % (len(bucketizer.getSplits()) - 1))
    return bucketedData
def _bucketize_age_column(
        self, dataframe: DataFrame, input_col: str,
        output_col: str) -> Tuple[DataFrame, int, List[str]]:
    bucketizer = Bucketizer(splits=self.age_groups, inputCol=input_col, outputCol=output_col)
    # "keep" routes invalid/missing ages into an extra bucket instead of failing.
    output = bucketizer.setHandleInvalid("keep").transform(dataframe)
    splits = [s for s in bucketizer.getSplits()]
    # Human-readable label for each bucket, e.g. "[18.0, 35.0)".
    mapping = [
        "[{}, {})".format(splits[i], splits[i + 1])
        for i in range(len(splits) - 1)
    ]
    n_age_groups = len(mapping)
    return output, n_age_groups, mapping
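# A quick illustration of the label mapping built above, using a hypothetical set
# of age boundaries (these values are assumptions for the sketch, not taken from
# the original self.age_groups).
age_groups = [0.0, 18.0, 35.0, 65.0, float("inf")]
labels = ["[{}, {})".format(age_groups[i], age_groups[i + 1])
          for i in range(len(age_groups) - 1)]
print(labels)       # ['[0.0, 18.0)', '[18.0, 35.0)', '[35.0, 65.0)', '[65.0, inf)']
print(len(labels))  # 4 age groups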
def pre_processing(dataFrame):
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
def test_list_float(self):
    # Integer splits are coerced to floats by the param type converter.
    b = Bucketizer(splits=[1, 4])
    self.assertEqual(b.getSplits(), [1.0, 4.0])
    self.assertTrue(all([type(v) == float for v in b.getSplits()]))
    # Non-numeric split values are rejected.
    self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
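# A companion sketch (not from the original test suite) checking the handleInvalid
# parameter, whose accepted values are "error", "skip" and "keep"; "keep" places
# NULL/NaN inputs into an extra bucket at transform time.
from pyspark.ml.feature import Bucketizer

b = Bucketizer(splits=[-float("inf"), 0.0, float("inf")],
               inputCol="x", outputCol="x_bucket", handleInvalid="keep")
assert b.getHandleInvalid() == "keep"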
##### Bucketizer transforms a column of continuous features into a column of bucket indices, using user-defined split points
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

# COMMAND ----------

### ElementwiseProduct multiplies each input vector element-wise by the given scaling vector
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Create some vector data; also works for sparse vectors
data = [(Vectors.dense([1.0, 2.0, 3.0]), ), (Vectors.dense([4.0, 5.0, 6.0]), )]
df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
# Batch transform the vectors to create new column:
transformer.transform(df).show()
from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BucketizerExample")\
        .getOrCreate()

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = spark.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
    # $example off$

    spark.stop()
def create_sample(df, target, num, cat, n_feat, sample_set):
    '''
    Create a final sample set: calculate WOE/IV, select features based on IV,
    and replace variable values with WOEs.

    Rule of thumb for Information Value:
      useless             : < .02
      weak                : between .02 and .1
      medium              : between .1 and .3
      strong              : between .3 and .5
      too good to be true : > .5
    '''
    print('*** Calculating WOE and IV ....***')
    woe_iv = pd.DataFrame([], columns=['feat_name', 'bucket', 'all', 'event', 'tot_event', 'nonevent',
                                       'tot_nonevent', 'pct_event', 'pct_nonevent', 'woe', 'iv'])

    # Continuous: build decile-style boundaries from the observed min/max of each column
    col_decile = {}
    for c in num:
        min_ = df.groupby().min(c).collect()[0][0]
        max_ = df.groupby().max(c).collect()[0][0]
        col_decile.setdefault(c, []).append(np.linspace(min_, max_, 11).tolist())

    for c in num:
        buckets = Bucketizer(splits=col_decile[c][0], inputCol=c, outputCol='bucket', handleInvalid='keep')
        bucketed = buckets.getSplits()
        df_binned = buckets.transform(df)
        # buckets = QuantileDiscretizer(numBuckets=10, inputCol=c, outputCol="bucket", handleInvalid='keep')
        # bucketed = buckets.fit(df).getSplits()
        # df_binned = buckets.fit(df).transform(df)
        print('Boundaries for ' + c + ' : ' + str(bucketed))
        df_binned.persist()
        df_binned = (df_binned.withColumn('feat_name', F.lit(c))
                     .withColumn('bucket', F.col('bucket').cast('string'))
                     .select('*',
                             F.count('*').over(W.partitionBy('feat_name')).alias('feat_total'),
                             F.count('*').over(W.partitionBy('feat_name', 'bucket')).alias('all'),
                             F.sum(target).over(W.partitionBy('feat_name', 'bucket')).alias('event'))
                     .withColumn('nonevent', F.col('all') - F.col('event'))
                     .withColumn('tot_event', F.sum(target).over(W.partitionBy('feat_name')))
                     .withColumn('tot_nonevent', F.col('feat_total') - F.col('tot_event'))
                     .withColumn('pct_event', F.round(F.when(F.col('event') == 0, (.5 / F.col('tot_event')))
                                                      .otherwise(F.col('event') / F.col('tot_event')), 3))
                     .withColumn('pct_nonevent', F.round(F.when(F.col('nonevent') == 0, (.5 / F.col('tot_nonevent')))
                                                         .otherwise(F.col('nonevent') / F.col('tot_nonevent')), 3))
                     .withColumn('woe', F.log(F.col('pct_nonevent') / F.col('pct_event')))
                     .withColumn('iv', F.col('woe') * (F.col('pct_nonevent') - F.col('pct_event')))
                     .select('feat_name', 'bucket', 'all', 'event', 'tot_event', 'nonevent', 'tot_nonevent',
                             'pct_event', 'pct_nonevent', 'woe', 'iv')
                     .distinct()
                     .orderBy('feat_name', 'bucket'))
        df_tmp = df_binned.toPandas()
        woe_iv = woe_iv.append(df_tmp, ignore_index=True)

    # Categorical: each distinct value is its own bucket
    for c in cat:
        df_cat = (df.withColumn('feat_name', F.lit(c)).withColumnRenamed(c, 'bucket')
                  .select('*',
                          F.count('*').over(W.partitionBy('feat_name')).alias('feat_total'),
                          F.count('*').over(W.partitionBy('feat_name', 'bucket')).alias('event+non_event'),
                          F.sum(target).over(W.partitionBy('feat_name', 'bucket')).alias('event'))
                  .withColumn('nonevent', F.col('event+non_event') - F.col('event'))
                  .withColumn('tot_event', F.sum(target).over(W.partitionBy('feat_name')))
                  .withColumn('tot_nonevent', F.col('feat_total') - F.col('tot_event'))
                  .withColumn('pct_event', F.round(F.when(F.col('event') == 0, (.5 / F.col('tot_event')))
                                                   .otherwise(F.col('event') / F.col('tot_event')), 3))
                  .withColumn('pct_nonevent', F.round(F.when(F.col('nonevent') == 0, (.5 / F.col('tot_nonevent')))
                                                      .otherwise(F.col('nonevent') / F.col('tot_nonevent')), 3))
                  .withColumn('woe', F.log(F.col('pct_nonevent') / F.col('pct_event')))
                  .withColumn('iv', F.col('woe') * (F.col('pct_nonevent') - F.col('pct_event')))
                  .select('feat_name', 'bucket', 'event+non_event', 'event', 'tot_event', 'nonevent',
                          'tot_nonevent', 'pct_event', 'pct_nonevent', 'woe', 'iv')
                  .distinct()
                  .orderBy('feat_name', 'bucket'))
        df_tmp = df_cat.toPandas()
        woe_iv = woe_iv.append(df_tmp, ignore_index=True)

    woe_iv = spark.createDataFrame(woe_iv)
    print('*** WOE/IV Table ***')
    woe_iv.show(truncate=False)

    print('\n*** Feature importance by Information Value, Top ' + str(n_feat) + ' features ***')
    ivCalc = woe_iv.groupby('feat_name').agg(F.sum('iv').alias('iv')).orderBy('iv', ascending=False)
    ivCalc.show(n_feat, truncate=False)

    print('\n*** Selected features with IV >= .02 ***')
    select_feat = ivCalc.filter('iv >= .02').select('feat_name').rdd.flatMap(lambda x: x).collect()

    print('\n*** Replace values with WOEs ***')
    # Bucketize continuous variables except for irrelevant ones
    buckets = [Bucketizer(splits=col_decile[c][0], inputCol=c, outputCol=c + '_bucket', handleInvalid='keep')
               for c in num]
    # buckets = [QuantileDiscretizer(numBuckets=10, inputCol=c, outputCol=c+"_bucket", handleInvalid='keep') for c in num]
    pipeline = Pipeline(stages=buckets)
    df_bucketed = pipeline.fit(df).transform(df)
    cols = [c for c in df_bucketed.columns if c.endswith('_bucket')] + cat

    woe_list = [row.asDict() for row in woe_iv.select('feat_name', 'bucket', 'woe').collect()]

    def woe_mapper(feat, bucket):
        for d in woe_list:
            if d['feat_name'] == feat and d['bucket'] == bucket:
                return d['woe']

    woe_mapper_udf = F.udf(woe_mapper, DoubleType())

    for c in cols:
        if c.endswith('_bucket'):
            df_bucketed = df_bucketed.withColumn(c.replace('_bucket', '_woe'),
                                                 F.lit(woe_mapper_udf(F.lit(c[:-len('_bucket')]),
                                                                      F.col(c).cast('string'))))
        else:
            df_bucketed = df_bucketed.withColumn(c + '_woe', F.lit(woe_mapper_udf(F.lit(c), F.col(c))))

    # Create a data set with selected features
    df_model = df_bucketed.select(target, *[x + '_woe' for x in select_feat])
    df_model.printSchema()
    df_model.show(truncate=False)
    df_model.write.mode('overwrite').saveAsTable(sample_set)
    print('\n*** Finished creating a final data set for fitting...')
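# A small pure-Python illustration of the WOE/IV arithmetic used above (the counts
# are made-up numbers for the sketch). For one bucket:
#   woe             = ln(pct_nonevent / pct_event)
#   iv contribution = woe * (pct_nonevent - pct_event)
import math

event, tot_event = 30, 100          # events in this bucket / events overall
nonevent, tot_nonevent = 170, 900   # non-events in this bucket / non-events overall
pct_event = event / tot_event
pct_nonevent = nonevent / tot_nonevent
woe = math.log(pct_nonevent / pct_event)
iv_contribution = woe * (pct_nonevent - pct_event)
print(round(woe, 3), round(iv_contribution, 3))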
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BucketizerExample")\
        .getOrCreate()

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    dataFrame = spark.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
    # $example off$

    spark.stop()