def discrete(self):
        # Bucketizer
        from pyspark.ml.feature import Bucketizer

        splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

        data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
        dataFrame = self.session.createDataFrame(data, ["features"])

        bucketizer = Bucketizer(splits=splits,
                                inputCol="features",
                                outputCol="bucketedFeatures")

        # Transform original data into its bucket index.
        bucketedData = bucketizer.transform(dataFrame)

        print("Bucketizer output with %d buckets" %
              (len(bucketizer.getSplits()) - 1))
        bucketedData.show()
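        # Expected bucket indices for these splits (each bucket covers [lower, upper);
        # the last bucket also includes its upper bound):
        #   -999.9 -> 0.0, -0.5 -> 1.0, -0.3 -> 1.0, 0.0 -> 2.0, 0.2 -> 2.0, 999.9 -> 3.0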

        # QuantileDiscretizer
        from pyspark.ml.feature import QuantileDiscretizer

        data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
        df = self.session.createDataFrame(data, ["id", "hour"])

        discretizer = QuantileDiscretizer(numBuckets=3,
                                          inputCol="hour",
                                          outputCol="result")

        result = discretizer.fit(df).transform(df)
        result.show()
Example #2
def bucketizer_splits(dataFrame,
                      inputCol,
                      splits=[-float('inf'), -0.5, 0.0, 0.5,
                              float('inf')]):
    # Discretize the column by bucketing it at the given boundaries
    bucketizer = Bucketizer(splits=splits,
                            inputCol=inputCol,
                            outputCol='%s_bucketizer' %
                            (inputCol))  # `splits` defines the bucket boundaries
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketedData
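# A minimal usage sketch with hypothetical data; the import mirrors the other
# examples in this collection.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer  # required by bucketizer_splits above

spark = SparkSession.builder.appName("BucketizerSplitsDemo").getOrCreate()
demo_df = spark.createDataFrame([(-1.2,), (0.3,), (0.7,)], ['features'])  # hypothetical values
bucketizer_splits(demo_df, 'features').show()  # adds a 'features_bucketizer' bucket-index column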
Example #3
 def bucketize(self, df, column):
     """
     Bucketize `column` at the specified split boundaries using Bucketizer.
     """
     splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
     # Discretize the column by bucketing it at the given boundaries
     bucketizer = Bucketizer(splits=splits,
                             inputCol=column,
                             outputCol=column + '_bucketed')  # `splits` defines the bucket boundaries
     bucketedData = bucketizer.transform(df)
     print('Bucketizer output with %d buckets' %
           (len(bucketizer.getSplits()) - 1))
     return bucketedData
Example #4
 def _bucketize_age_column(
         self, dataframe: DataFrame, input_col: str,
         output_col: str) -> Tuple[DataFrame, int, List[str]]:
     bucketizer = Bucketizer(splits=self.age_groups,
                             inputCol=input_col,
                             outputCol=output_col)
     output = bucketizer.setHandleInvalid("keep").transform(dataframe)
     splits = [s for s in bucketizer.getSplits()]
     mapping = [
         "[{}, {})".format(splits[i], splits[i + 1])
         for i in range(len(splits) - 1)
     ]
     n_age_groups = len(mapping)
     return output, n_age_groups, mapping
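# A hedged illustration of the return values: if self.age_groups were
# [0.0, 18.0, 35.0, 65.0, float('inf')] (hypothetical boundaries), the method
# would return n_age_groups == 4 and
# mapping == ['[0.0, 18.0)', '[18.0, 35.0)', '[35.0, 65.0)', '[65.0, inf)'].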
def pre_processing(dataFrame):

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
Example #6
 def test_list_float(self):
     b = Bucketizer(splits=[1, 4])
     self.assertEqual(b.getSplits(), [1.0, 4.0])
     self.assertTrue(all([type(v) == float for v in b.getSplits()]))
     self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
##### Bucketizer transforms a column of continuous features into a column of feature buckets, with the buckets defined by the split boundaries
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits,
                        inputCol="features",
                        outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

# COMMAND ----------

### ElementwiseProduct multiplies each input vector element-wise by the given scaling vector
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Create some vector data; also works for sparse vectors
data = [(Vectors.dense([1.0, 2.0, 3.0]), ), (Vectors.dense([4.0, 5.0, 6.0]), )]
df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector",
                                 outputCol="transformedVector")
# Batch transform the vectors to create a new column:
transformer.transform(df).show()
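# Expected element-wise products with the scaling vector [0.0, 1.0, 2.0]:
#   [1.0, 2.0, 3.0] -> [0.0, 2.0, 6.0]
#   [4.0, 5.0, 6.0] -> [0.0, 5.0, 12.0]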
Example #9
# This snippet relies on pandas, numpy, and the PySpark helpers below;
# `spark` must be an existing SparkSession.
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import Bucketizer
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window as W


def create_sample(df, target, num, cat, n_feat, sample_set):
    '''
    Create a final sample set: calculate WOE/IV, select features based on IV,
    and replace variable values with their WOEs.

    Rule-of-thumb interpretation of IV:
      useless             : < .02
      weak                : .02 to .1
      medium              : .1 to .3
      strong              : .3 to .5
      too good to be true : > .5
    '''
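    # For each feature bucket, weight of evidence and information value are
    #   WOE = ln(pct_nonevent / pct_event)
    #   IV  = sum over buckets of (pct_nonevent - pct_event) * WOE
    # where pct_event / pct_nonevent are the bucket's share of all events / non-events,
    # exactly as computed column-by-column below.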
    print('*** Calculating WOE and IV ....***')
    woe_iv = pd.DataFrame([], columns=['feat_name','bucket','all','event','tot_event','nonevent','tot_nonevent','pct_event','pct_nonevent','woe','iv'])
    # Continuous
    col_decile = {}
    for c in num:
        min_ = df.groupby().min(c).collect()[0][0]
        max_ = df.groupby().max(c).collect()[0][0]
        col_decile.setdefault(c, []).append(np.linspace(min_, max_, 11).tolist())
        
    for c in num:
        buckets = Bucketizer(splits=col_decile[c][0], inputCol = c, outputCol = 'bucket', handleInvalid='keep')
        bucketed = buckets.getSplits()   
        df_binned = buckets.transform(df)
        
#        buckets = QuantileDiscretizer(numBuckets = 10, inputCol = c, outputCol = "bucket",handleInvalid='keep') 
#        bucketed = buckets.fit(df).getSplits()
#        df_binned = buckets.fit(df).transform(df)
        
        print('Boundaries for ' + c +  ' : ' + str(bucketed))
        
        df_binned.persist()        
        df_binned = (df_binned.withColumn('feat_name', F.lit(c)).withColumn('bucket', F.col('bucket').cast('string'))
                              .select('*', F.count('*').over(W.partitionBy('feat_name')).alias('feat_total'), F.count('*').over(W.partitionBy('feat_name', 'bucket')).alias('all'), \
                                     F.sum(target).over(W.partitionBy('feat_name', 'bucket')).alias('event'))
                              .withColumn('nonevent', F.col('all')-F.col('event'))
                              .withColumn('tot_event', F.sum(target).over(W.partitionBy('feat_name')))
                              .withColumn('tot_nonevent', F.col('feat_total')-F.col('tot_event'))
                              .withColumn('pct_event', F.round(F.when(F.col('event')==0, (.5/F.col('tot_event'))).otherwise(F.col('event')/F.col('tot_event')), 3))
                              .withColumn('pct_nonevent', F.round(F.when(F.col('nonevent')==0, (.5/F.col('tot_nonevent'))).otherwise(F.col('nonevent')/F.col('tot_nonevent')), 3))
                              .withColumn('woe', F.log(F.col('pct_nonevent')/F.col('pct_event')))
                              .withColumn('iv', F.col('woe')*(F.col('pct_nonevent')-F.col('pct_event')))
                              .select('feat_name','bucket','all','event','tot_event','nonevent','tot_nonevent','pct_event','pct_nonevent','woe','iv')
                              .distinct()
                              .orderBy('feat_name', 'bucket'))
        df_tmp = df_binned.toPandas()
        woe_iv = pd.concat([woe_iv, df_tmp], ignore_index=True)
    
    # Categorical
    for c in cat:
        df_cat = (df.withColumn('feat_name', F.lit(c)).withColumnRenamed(c, 'bucket')
                  .select('*', F.count('*').over(W.partitionBy('feat_name')).alias('feat_total'), \
                          F.count('*').over(W.partitionBy('feat_name', 'bucket')).alias('all'), \
                          F.sum(target).over(W.partitionBy('feat_name', 'bucket')).alias('event'))
                  .withColumn('nonevent', F.col('all')-F.col('event'))
                  .withColumn('tot_event', F.sum(target).over(W.partitionBy('feat_name')))
                  .withColumn('tot_nonevent', F.col('feat_total')-F.col('tot_event'))
                  .withColumn('pct_event', F.round(F.when(F.col('event')==0, (.5/F.col('tot_event'))).otherwise(F.col('event')/F.col('tot_event')), 3))
                  .withColumn('pct_nonevent', F.round(F.when(F.col('nonevent')==0, (.5/F.col('tot_nonevent'))).otherwise(F.col('nonevent')/F.col('tot_nonevent')),3))
                  .withColumn('woe', F.log(F.col('pct_nonevent')/F.col('pct_event')))
                  .withColumn('iv', F.col('woe')*(F.col('pct_nonevent')-F.col('pct_event')))
                  .select('feat_name','bucket','all','event','tot_event','nonevent','tot_nonevent','pct_event','pct_nonevent','woe','iv')
                  .distinct()
                  .orderBy('feat_name', 'bucket'))
        df_tmp = df_cat.toPandas()
        woe_iv = pd.concat([woe_iv, df_tmp], ignore_index=True)
    
    woe_iv = spark.createDataFrame(woe_iv)
    print('*** WOE/IV Table ***')
    woe_iv.show(truncate=False)
    
    print('\n*** Feature importance by Information Value, Top ' + str(n_feat) + ' features ***')
    ivCalc = woe_iv.groupby('feat_name').agg(F.sum('iv').alias('iv')).orderBy('iv', ascending=False)
    ivCalc.show(n_feat, truncate=False)
    
    print('\n*** Selected features with IV >= .02 ***')
    select_feat = ivCalc.filter('iv >= .02').select('feat_name').rdd.flatMap(lambda x:x).collect()
    
    print('\n*** Replace values with WOEs ***')
    # Bucketize continuous variables except for irrelevant ones
    buckets = [Bucketizer(splits=col_decile[c][0], inputCol = c, outputCol = c+'_bucket', handleInvalid='keep') for c in num]
#    buckets = [QuantileDiscretizer(numBuckets = 10, inputCol = c, outputCol = c+"_bucket",handleInvalid='keep') for c in num]
    pipeline = Pipeline(stages=buckets)
    df_bucketed = pipeline.fit(df).transform(df)
        
    cols = [c for c in df_bucketed.columns if c.endswith('_bucket')] + cat
    woe_list = [row.asDict() for row in woe_iv.select('feat_name','bucket','woe').collect()]
    def woe_mapper(feat, bucket):
        for d in woe_list:
            if d['feat_name'] == feat and d['bucket'] == bucket:
                return d['woe']
    
    woe_mapper_udf = F.udf(woe_mapper, DoubleType())
    for c in cols:
        if c.endswith('_bucket'):
            df_bucketed = df_bucketed.withColumn(c.replace('_bucket','_woe'), F.lit(woe_mapper_udf(F.lit(c[:-len('_bucket')]), F.col(c).cast('string'))))
        else:
            df_bucketed = df_bucketed.withColumn(c+'_woe', F.lit(woe_mapper_udf(F.lit(c), F.col(c))))
    
    # Create a data set with select features
    df_model = df_bucketed.select(target, *[x+'_woe' for x in select_feat])
    df_model.printSchema()
    df_model.show(truncate=False)
    df_model.write.mode('overwrite').saveAsTable(sample_set)
    print('\n*** Finished creating a final data set for fitting...')
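# A minimal usage sketch; the DataFrame, column names, and output table are hypothetical:
#   create_sample(df,
#                 target='label',                     # binary target column
#                 num=['age', 'income'],              # continuous predictors
#                 cat=['gender', 'region'],           # categorical predictors
#                 n_feat=10,                          # show the top-10 features by IV
#                 sample_set='analytics.woe_sample')  # output Hive table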
Example #10
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BucketizerExample")\
        .getOrCreate()

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    dataFrame = spark.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
    bucketedData.show()
    # $example off$

    spark.stop()