Example 1
    def data_transformation(self, df):

        df = df.withColumn(
            "date",
            from_unixtime(unix_timestamp('DateStringYYYYMMDD', 'yyyyMMdd')))
        df = df.withColumn("Year", year("date"))  # extract year
        df = df.withColumn("Day", dayofmonth("date"))  # extract day
        df = df.withColumn("Month", month("date"))  # extract month

        # bin continuous variable based on quantiles (temperature)
        discretizer = QuantileDiscretizer(numBuckets=6,
                                          inputCol="AvgHourlyTemp",
                                          outputCol="temp_quantile")
        result = discretizer.fit(df).transform(df)
        df = result

        # based on variation seen in boxplots
        df = df.withColumn(
            'store_variability',
            F.when(F.col("Store_ID").isin('16', '17', '18', '20', '22', '31'), "variable")
            .when(F.col("Store_ID").isin('11', '2', '21', '23', '32', '34', '36', '38'), "not-variable"))

        # based on mean revenue per store id
        df = df.withColumn(
            'store_rank',
            F.when(F.col("Store_ID").isin('31', '17', '20', '38', '2', '36', '21'), "great")
            .when(F.col("Store_ID").isin('18', '32', '34', '23'), "good")
            .when(F.col("Store_ID").isin('11', '22', '16'), "ok"))

        return df
    def discrete(self):
        # Bucketizer
        from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

        splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

        data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
        dataFrame = self.session.createDataFrame(data, ["features"])

        bucketizer = Bucketizer(splits=splits,
                                inputCol="features",
                                outputCol="bucketedFeatures")

        # Transform original data into its bucket index.
        bucketedData = bucketizer.transform(dataFrame)

        print("Bucketizer output with %d buckets" %
              (len(bucketizer.getSplits()) - 1))
        bucketedData.show()

        # QuantileDiscretizer

        data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
        df = self.session.createDataFrame(data, ["id", "hour"])

        discretizer = QuantileDiscretizer(numBuckets=3,
                                          inputCol="hour",
                                          outputCol="result")

        result = discretizer.fit(df).transform(df)
        result.show()
Example 3
    def quantile_discretizer(self, df, column, num_buckets):
        """
        Quantile-based bucketing into the specified number of buckets (QuantileDiscretizer)
        """
        print('QuantileDiscretizerExample')
        df = df.repartition(1)
        # discretize by quantile-based bucketing
        if isinstance(column, list):
            output_column = [str(v) + '_quant' for v in column]
            print(len(column), len(output_column), len(num_buckets))
            discretizer = QuantileDiscretizer(relativeError=0.01,
                                              handleInvalid="error",
                                              numBucketsArray=num_buckets,
                                              inputCols=column,
                                              outputCols=output_column)
        else:
            # numBuckets sets the number of buckets
            discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                              relativeError=0.01,
                                              handleInvalid="error",
                                              inputCol=column,
                                              outputCol=column + '_quant')

        result = discretizer.setHandleInvalid("keep").fit(df).transform(df)
        return result
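A brief usage sketch for both branches of the helper above (the instance name fe, the DataFrame df, and the column names are illustrative):

# single column: adds an "age_quant" column with 10 quantile buckets
out_single = fe.quantile_discretizer(df, "age", 10)
# multiple columns: each numBucketsArray entry pairs with the matching input column
out_multi = fe.quantile_discretizer(df, ["age", "income"], [10, 5])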
 def Run(self):
     def Lift(cum_resp, decil_no):
         return (cum_resp / float(decil_no + 1))

     # order by predicted probability and assign deciles with QuantileDiscretizer
     self._df = self._df.orderBy(self._proba_column, ascending=False)
     self._df = self._df.withColumn("id", monotonically_increasing_id())
     # w = Window.orderBy(desc(self._proba_column))
     # self._df = self._df.withColumn('id', row_number().over(w))
     discretizer = QuantileDiscretizer(numBuckets=10, inputCol="id", outputCol="deciles")
     self._df = discretizer.fit(self._df).transform(self._df)

     # responders / non-responders per decile
     Rank = self._df.groupby('deciles').agg(
         F.count(self._df[self._proba_column]).alias('cnt'),
         F.count(when(self._df[self._target_column] == int(self._posLabel), True)).alias('cnt_resp'))
     Rank = Rank.withColumn('cnt_non_resp', Rank['cnt'] - Rank['cnt_resp'])
     Rank = Rank.orderBy('deciles', ascending=True)

     # cumulative counts and percentages over deciles
     cumsum_window = (Window.orderBy(Rank['deciles'])
                      .rangeBetween(Window.unboundedPreceding, Window.currentRow))
     Rank = Rank.withColumn("cum_resp", F.sum('cnt_resp').over(cumsum_window))
     Rank = Rank.withColumn("cum_non_resp", F.sum('cnt_non_resp').over(cumsum_window))
     Rank = Rank.withColumn("% Responders(Cumulative)",
                            F.round(old_div(Rank["cum_resp"] * 100,
                                            Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
     Rank = Rank.withColumn("% Non-Responders(Cumulative)",
                            F.round(old_div(Rank["cum_non_resp"] * 100,
                                            Rank.select(F.sum('cnt_non_resp')).collect()[0][0]), 2))
     Rank = Rank.withColumn("cum_population", F.sum("cnt").over(cumsum_window))
     Rank = Rank.withColumn("pop_pct_per_decile",
                            F.round(old_div(Rank["cnt"] * 100,
                                            Rank.select(F.sum('cnt')).collect()[0][0])))
     Rank = Rank.withColumn("% Population(Cumulative)",
                            F.round(F.sum('pop_pct_per_decile').over(cumsum_window)))
     # KS = cumulative % responders - cumulative % non-responders
     Rank = Rank.withColumn("KS", F.round(Rank["% Responders(Cumulative)"] -
                                          Rank["% Non-Responders(Cumulative)"], 2))
     Rank = Rank.withColumn("Lift at Decile",
                            F.round(old_div(Rank["cnt_resp"] * Rank["pop_pct_per_decile"] * 100,
                                            Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
     Rank = Rank.withColumn("id", monotonically_increasing_id())
     Lift_udf = udf(lambda x, y: Lift(x, y), FloatType())
     Rank = Rank.withColumn("Total_Lift", F.round(Lift_udf("cum_resp", "id"), 2))
     Rank = Rank.drop('id')
     return Rank
Example 5
    def test_quantilediscretizer_converter(self):
        iris = load_iris()
        features = [
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                             columns=features + ["target"])
        df = sql.createDataFrame(pd_df).select("sepal_length")

        quantile = QuantileDiscretizer(inputCol="sepal_length",
                                       outputCol="sepal_length_bucket",
                                       numBuckets=2)
        model = quantile.fit(df)

        test_df = df
        torch_model = convert(model, "torch", test_df)
        self.assertTrue(torch_model is not None)

        spark_output = model.transform(test_df).select(
            "sepal_length_bucket").toPandas()
        torch_output_np = torch_model.transform(pd_df[["sepal_length"]])
        np.testing.assert_allclose(spark_output.to_numpy(),
                                   torch_output_np,
                                   rtol=1e-06,
                                   atol=1e-06)
Example 6
def load(dev):
    """Loads the submissions from MongoDB database."""
    logging.info('Loading submissions...')
    df = dev.read.format('com.mongodb.spark.sql.DefaultSource').load()

    df.createOrReplaceTempView('submissions')
    df.printSchema()

    query = 'select score, upvote_ratio, is_nfsw, text_embedded from \
    submissions'

    df = dev.sql(query)

    df = df.rdd.map(
        lambda r: Row(score=r['score'],
                      upvote_ratio=r['upvote_ratio'],
                      is_nfsw=float(r['is_nfsw']),
                      text_embedded=Vectors.dense(r['text_embedded']))).toDF()
    qd = QuantileDiscretizer(numBuckets=6,
                             inputCol='upvote_ratio',
                             outputCol='upvote_class')
    df = qd.fit(df).transform(df)

    df.printSchema()
    df.show()
    return df
Example 7
def Binning(df, num_col, no_of_buckets):

	for (a,b) in df.dtypes:
			if a==num_col:
				o_dtype = b


	tdf = df.withColumn(num_col, col(num_col).cast('double'))
	qds = QuantileDiscretizer(numBuckets=no_of_buckets, inputCol=num_col, outputCol="bucket_no")
	bucketizer = qds.fit(tdf)
	splits = bucketizer.getSplits()
	tdf = bucketizer.transform(tdf)

	bucket_dict = dict()

	for i in range(no_of_buckets):
		bucket_dict[float(i)] = str(splits[i]) + ' to ' + str(splits[i+1])

	#tdf = tdf.withColumn('bucket_no', col(num_col).cast('string'))

	mapping_expr=create_map([lit(x) for x in chain(*bucket_dict.items())])
	tdf = tdf.withColumn(num_col + '_bucket_range', mapping_expr.getItem(col('bucket_no')))

	tdf = tdf.withColumn(num_col, col(num_col).cast(o_dtype))
	
	return tdf, bucket_dict
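A short usage sketch for Binning (the DataFrame sales_df and the column name "price" are illustrative):

binned_df, bucket_ranges = Binning(sales_df, "price", 5)
binned_df.select("price", "bucket_no", "price_bucket_range").show(5)
# bucket_ranges maps each bucket index to a human-readable "low to high" string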
Example 8
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import QuantileDiscretizer

    discretizer = QuantileDiscretizer(
        numBuckets=args["num_buckets"], inputCol=features["col"], outputCol="_"
    ).fit(data)

    return discretizer.getSplits()
Example 9
def quantile_discretizer(dataFrame, inputCol, numBuckets=4):
    # discretize by quantile-based bucketing
    discretizer = QuantileDiscretizer(numBuckets=numBuckets,
                                      inputCol=inputCol,
                                      outputCol='%s_bucketizer' %
                                      (inputCol))  # numBuckets sets the number of buckets
    bucketedData = discretizer.fit(dataFrame).transform(dataFrame)
    return bucketedData
Example 10
def do_quantile_discretizer(input_data,
                            result_data,
                            column,
                            prefix="buckets_",
                            num_buckets=100):
    discretizer = QuantileDiscretizer(inputCol=column,
                                      outputCol=prefix + column,
                                      numBuckets=num_buckets)
    fittedBucketer = discretizer.fit(input_data)
    return fittedBucketer.transform(result_data)
Example 11
 def qcut(input_col, output_col, num_buckets):
     """
     Bin columns into n buckets. Quantile Discretizer
     :param input_col: Input column to be processed
     :param output_col: Output columns with the bin number
     :param num_buckets: Number of buckets in which the column will be divided
     :return:
     """
     discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                       inputCol=input_col,
                                       outputCol=output_col)
     return discretizer.fit(self).transform(self)
def ReturnQuartile(df, cols):
    #This function groups a column into quartiles.  The input is cols, which is a list of columns
    for c in cols:
        #Create the quartiles
        discretizer = QuantileDiscretizer(numBuckets=4,
                                          inputCol=c,
                                          outputCol=c + "bin",
                                          relativeError=0.01,
                                          handleInvalid="error")
        #Apply the quartiles
        df = discretizer.fit(df).transform(df)
    return df
Example 13
 def qcut(input_col, output_col, num_buckets):
     """
     Bin columns into n buckets
     :param input_col:
     :param output_col:
     :param num_buckets:
     :return:
     """
     discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                       inputCol=input_col,
                                       outputCol=output_col)
     return discretizer.fit(self).transform(self)
Example 14
def quantiles(df, input_col):
    try:
        qds = QuantileDiscretizer(
            # 254 is used so the 255th can be inf
            numBuckets=254,
            inputCol=input_col,
            outputCol='bucketed',
            relativeError=1. / 2550,
            handleInvalid='error')
        return qds.fit(df).getSplits()
    except Exception as e:
        print(e)
        raise
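The splits returned by quantiles() can be reused later with a plain Bucketizer so that new data is cut at the same points; a minimal sketch, with DataFrame and column names purely illustrative:

from pyspark.ml.feature import Bucketizer

splits = quantiles(train_df, "score")
bucketizer = Bucketizer(splits=splits, inputCol="score",
                        outputCol="score_bucketed", handleInvalid="keep")
scored = bucketizer.transform(new_df)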
    def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
        """Produces a PySpark dataframe containing a categorical field based on a specified field.

        :param dataframe: the PySpark dataframe
        :param base_field: the field that provides the values used to create the categorical field
        :param categorical_field: the name of the categorical field to be created
        :param levels: the number of levels to be created in the categorical field
        :param increment: the value to add to each level (Default value = 0)
        :returns: the PySpark dataframe containing a categorical field and all fields in the supplied dataframe
        """
        dataframe = self.fix_data_type(dataframe, [base_field], 'double')
        discretizer = QuantileDiscretizer(numBuckets=levels, inputCol=base_field, outputCol=categorical_field)
        dataframe = discretizer.fit(dataframe).transform(dataframe)
        return(dataframe.withColumn(categorical_field, dataframe[categorical_field].cast('int')+increment))
Example 16
 def quantile_bucketize(col_list: list, num_buckets: int = 3):
     
     from pyspark.ml.feature import QuantileDiscretizer
     df = self
     
     for c in col_list:
         if c in df.schema.names:
             non_zero_values = df.select(c).where(col(c)!=0)
             bucketizer = QuantileDiscretizer(
                 numBuckets = num_buckets,
                 inputCol = c,
                 outputCol = c+"_bucket"
                 ).fit(non_zero_values).setHandleInvalid('keep')
             df = bucketizer.transform(df).drop(c)
     return df
Example 17
 def qcut(columns, num_buckets, handle_invalid="skip"):
     """
     Bin columns into n buckets. Quantile Discretizer
     :param columns: Input columns to be processed
     :param num_buckets: Number of buckets in which the column will be divided
     :param handle_invalid:
     :return:
     """
     df = self
     columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
     for col_name in columns:
         output_col = col_name + "_qcut"
         discretizer = QuantileDiscretizer(numBuckets=num_buckets, inputCol=col_name, outputCol=output_col,
                                           handleInvalid=handle_invalid)
         df = discretizer.fit(df).transform(df)
     return df
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count
    # group by movieId: count(1) as ratingCount (number of ratings per movie),
    # avg(rating) as avgRating,
    # variance(rating) as ratingVar  -- the rating variance
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))  # wrap avgRating in a one-element vector, as required by the scaler below
    movieFeatures.show(10)

    ######## feature processing pipeline ########
    # bucketing
    # bucket the continuous ratingCount into 100 equal-frequency (quantile) buckets
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    # normalize the average-rating vector
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")

    # build the pipeline
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)

    # cast the bucket index to integer and extract the scaled value from its one-element vector
    movieProcessedFeatures = movieProcessedFeatures.withColumn('ratingCountBucket', F.col('ratingCountBucket').cast(IntegerType()))\
        .withColumn('scaleAvgRating', udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))).drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
Example 19
    def test_pipeline3(self):
        iris = load_iris()
        features = [
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                             columns=features + ["label"])
        df = sql.createDataFrame(pd_df)

        quantile1 = QuantileDiscretizer(inputCol="sepal_length",
                                        outputCol="sepal_length_bucket",
                                        numBuckets=2)
        quantile2 = QuantileDiscretizer(inputCol="sepal_width",
                                        outputCol="sepal_width_bucket",
                                        numBuckets=2)
        features = ["sepal_length_bucket", "sepal_width_bucket"] + features
        assembler = VectorAssembler(inputCols=features, outputCol="features")
        pipeline = Pipeline(
            stages=[quantile1, quantile2, assembler,
                    LogisticRegression()])
        model = pipeline.fit(df)

        df = df.select(
            ["sepal_length", "sepal_width", "petal_length", "petal_width"])
        pd_df = pd_df[[
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]]
        torch_model = convert(model, "torch", df)
        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            np.array(model.transform(df).select(
                "prediction").collect()).reshape(-1),
            torch_model.predict(pd_df),
            rtol=1e-06,
            atol=1e-06,
        )

        np.testing.assert_allclose(
            np.array(
                model.transform(df).select("probability").collect()).reshape(
                    -1, 3),
            torch_model.predict_proba(pd_df),
            rtol=1e-06,
            atol=1e-05,
        )
def _sparse_feature_with_quantile(train_data: DataFrame, test_data: DataFrame,
                                  sc: SparkContext):
    """
    Discretize continuous features.
    :param train_data:
    :param test_data:
    :param sc: sparkContext
    :return: DataFrames with the discretized features, plus the bucket splits
    """
    feature_bucket_splits_dict = dict()
    # fit splitter
    for col in config.CONTINUOUS_COLUMNS:
        sparser = QuantileDiscretizer(
            numBuckets=config.CONTINUOUS_COLUMNS_BUCKET_NUM,
            inputCol=col,
            outputCol=config.BUCKET_FEATURE_PREFIX + col,
            relativeError=0.01,
            handleInvalid="error")
        sparser_model = sparser.fit(train_data)
        feature_bucket_splits_dict[col] = sparser_model.getSplits()
    # save the splits so that online serving uses the same buckets
    utils.write_to_hdfs(sc,
                        config.CONTINUOUS_COLUMNS_BUCKETS,
                        json.dumps(feature_bucket_splits_dict),
                        overwrite=True)
    print("[INFO] save continuous columns buckets {0}".format(
        config.CONTINUOUS_COLUMNS_BUCKETS))

    # transform data
    for k, v in feature_bucket_splits_dict.items():
        input_col, output_col, splits = k, config.BUCKET_FEATURE_PREFIX + k, v

        bucket_model = feature.Bucketizer(inputCol=input_col,
                                          outputCol=output_col,
                                          splits=splits)
        train_data = bucket_model.setHandleInvalid("skip").transform(
            train_data)
        test_data = bucket_model.setHandleInvalid("skip").transform(test_data)

        train_data.select(output_col).show(10, False)

        print("[INFO] continuous sparse transform {0} to {1}".format(
            input_col, output_col))

    return train_data, test_data, feature_bucket_splits_dict
Example 21
def _load_dataframe(sc):
    """Loads the submissions from MongoDB database."""
    df = sc.read.format(SPARK_SQL_DEFAULT).load()
    df.createOrReplaceTempView('submissions')

    query = 'select score, upvote_ratio, is_nfsw, text_embedded\
        from submissions'

    df = sc.sql(query)

    df = df.rdd.map(
        lambda r: Row(score=r['score'],
                      upvote_ratio=r['upvote_ratio'],
                      is_nfsw=float(r['is_nfsw']),
                      text_embedded=Vectors.dense(r['text_embedded']))).toDF()
    qd = QuantileDiscretizer(numBuckets=6,
                             inputCol='upvote_ratio',
                             outputCol='upvote_class')
    df = qd.fit(df).transform(df)

    return df
Example 22
    def evaluateKs(self,
                   predictions: 'sparkdf',
                   tableName: 'string',
                   prob: 'string' = "core") -> 'double':
        predictions.createOrReplaceTempView(tableName)
        result = self.spark.sql("SELECT score as prob, label FROM %s" %
                                tableName)
        viewName = tableName + "_result"
        result.createOrReplaceTempView(viewName)
        quantileDiscretizer = QuantileDiscretizer(numBuckets=10,
                                                  inputCol='prob',
                                                  outputCol='prob_cut')
        discreDF = quantileDiscretizer.fit(result).transform(result)
        cut_view_name = viewName + "_with_cut"
        discreDF.createOrReplaceTempView(cut_view_name)
        sql_str = r"SELECT count(label) as label_all, sum(label) as label_bad, min(prob) as min,  max(prob) as max, prob_cut FROM " + cut_view_name + " group by prob_cut order by prob_cut"
        resultLocal = self.spark.sql(sql_str).collect()
        ks, ks_cut_local = self.compute_ks(resultLocal)
        print("ks:\t%s" % str(ks))
        for line in ks_cut_local:
            print(line)

        return float(ks)
    def create_categorical_feature(self,
                                   dataframe,
                                   base_field,
                                   categorical_field,
                                   levels,
                                   increment=0):
        """Produces a PySpark dataframe containing a categorical field based on a specified field.

        :param dataframe: the PySpark dataframe
        :param base_field: the field that provides the values used to create the categorical field
        :param categorical_field: the name of the categorical field to be created
        :param levels: the number of levels to be created in the categorical field
        :param increment: the value to add to each level (Default value = 0)
        :returns: the PySpark dataframe containing a categorical field and all fields in the supplied dataframe
        """
        dataframe = self.fix_data_type(dataframe, [base_field], 'double')
        discretizer = QuantileDiscretizer(numBuckets=levels,
                                          inputCol=base_field,
                                          outputCol=categorical_field)
        dataframe = discretizer.fit(dataframe).transform(dataframe)
        return (dataframe.withColumn(
            categorical_field,
            dataframe[categorical_field].cast('int') + increment))
Example 24
def numerical_example(data: DataFrame):
    movie_features = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("ratingCount"),
        F.avg("rating").alias("avgRating"),
        F.variance("rating").alias("ratingVar")).withColumn(
            "avgRatingVec", udf_avg_rating_to_vec(F.col("avgRating")))
    print_info(movie_features)
    # bucketing
    rating_count_discretizer = QuantileDiscretizer(
        numBuckets=100, inputCol="ratingCount", outputCol="ratingCountBucket")
    # normalization
    rating_scaler = MinMaxScaler(inputCol="avgRatingVec",
                                 outputCol="scaleAvgRating")
    pipeline_stage = [rating_count_discretizer, rating_scaler]
    feature_pipeline = Pipeline(stages=pipeline_stage)
    movie_processed_features = feature_pipeline.fit(movie_features).transform(
        movie_features)
    print_info(movie_processed_features)
Example 25
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()
    # calculate average movie rating score and rating count
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    movieFeatures.show(10)
    # bucketing
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)
    movieProcessedFeatures.show(10)
Example 26
def create_quantilesDiscretizer(input_col: str, nq:int) -> QuantileDiscretizer:
    """
    Create a QuantileDiscretizer for the specified column.
    Uses the input column name plus "_encoded" as the output column.
    
    Parameters
    ----------
    input_col: str
        Name of the Input Column
    nq: int
        Number of Quantiles to use
        
    Return
    ------
    QuantileDiscretizer
    """
    output_col = input_col + "_encoded"
    return QuantileDiscretizer(numBuckets=nq,
                               relativeError=0.05,
                               handleInvalid='keep',
                               inputCol=input_col,
                               outputCol=output_col)
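A minimal usage sketch for the factory above (the DataFrame df and the column name "age" are illustrative):

discretizer = create_quantilesDiscretizer("age", nq=10)
df_encoded = discretizer.fit(df).transform(df)  # adds the "age_encoded" column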
def pipeline_preprocess(inp_df):

    # since categorical variables need both string indexing and
    # one-hot encoding, we merge both operations into the pipeline stages

    stages = []  #this will host the steps for our data transformation

    for col in cat_cols:
        stringIndexer = StringIndexer(inputCol= col, \
                                      outputCol=col + "Index")
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer \
                                                    .getOutputCol()], \
                                    outputCols = [col + "catVec"])
        stages += [stringIndexer, encoder]

    acctlen_bin = QuantileDiscretizer(numBuckets=4, \
                                        inputCol = "accountlength", \
                                        outputCol="acctlen_bin")
    stages += [acctlen_bin]

    # Create the label_Idx for the Output Column
    label_Idx = StringIndexer(inputCol="churn", outputCol="label")

    stages += [label_Idx]

    # Create the vector assembler stage to assemble data into labels
    # and features
    numeric_cols = num_cols.copy()
    numeric_cols.remove("accountlength")
    numeric_cols.append("acctlen_bin")

    inp_features = [c + "catVec" for c in cat_cols] + numeric_cols
    assembler = VectorAssembler(inputCols=inp_features, \
                                outputCol="features")
    stages += [assembler]

    data_preprocess = Pipeline(stages=stages).fit(inp_df)

    return data_preprocess
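A brief usage sketch (the input DataFrame churn_df is illustrative); pipeline_preprocess returns a fitted PipelineModel, so the same object can transform both training and scoring data:

preprocess_model = pipeline_preprocess(churn_df)
model_input = preprocess_model.transform(churn_df).select("features", "label")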
Example 28
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("QuantileDiscretizerExample")\
        .getOrCreate()

    # $example on$
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = spark.createDataFrame(data, ["id", "hour"])
    # $example off$

    # Output of QuantileDiscretizer for such small datasets can depend on the number of
    # partitions. Here we force a single partition to ensure consistent results.
    # Note this is not necessary for normal use cases
    df = df.repartition(1)

    # $example on$
    discretizer = QuantileDiscretizer(numBuckets=3,
                                      inputCol="hour",
                                      outputCol="result")

    result = discretizer.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
 'weekday_is_friday',
 'weekday_is_saturday',
 'weekday_is_sunday',
 'is_weekend',
 'global_subjectivity',
 'global_sentiment_polarity',
 'title_subjectivity',
 'title_sentiment_polarity',
 'abs_title_subjectivity',
 'abs_title_sentiment_polarity'],outputCol='features' )
new_data = assembler.transform(data)


final_data = new_data.select('features','shares')
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
test_data.head(1)


# import os, sys
# import pandas
Example 30
# QuantileDiscretizer takes a column with continuous features and outputs a
# column with binned categorical features. The number of bins is set by the
# numBuckets parameter. It is possible that the number of buckets used will be
# smaller than this value, for example, if there are too few distinct values of
# the input to create enough distinct quantiles.

# NaN values: NaN values will be removed from the column during
# QuantileDiscretizer fitting. This will produce a Bucketizer model for making
# predictions. During the transformation, Bucketizer will raise an error when
# it finds NaN values in the dataset, but the user can also choose to either
# keep or remove NaN values within the dataset by setting handleInvalid. If the
# user chooses to keep NaN values, they will be handled specially and placed
# into their own bucket, for example, if 4 buckets are used, then non-NaN data
# will be put into buckets[0-3], but NaNs will be counted in a special
# bucket[4].

spark = SparkSession.builder.appName("QuantileDiscretizer").getOrCreate()

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
df = spark.createDataFrame(data, ["id", "hour"])

discretizer = QuantileDiscretizer(inputCol="hour", outputCol="result",
                                  numBuckets=3)
discretizerModel = discretizer.fit(df)

result = discretizerModel.transform(df)
result.show()
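# A hedged sketch (not part of the original example): with handleInvalid="keep",
# the NaN rows described in the comments above are ignored during fitting and are
# placed into their own extra bucket on transform.
dfWithNaN = spark.createDataFrame(
    [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, float("nan"))], ["id", "hour"])
keepDiscretizer = QuantileDiscretizer(inputCol="hour", outputCol="result",
                                      numBuckets=3, handleInvalid="keep")
keepDiscretizer.fit(dfWithNaN).transform(dfWithNaN).show()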

spark.stop()
# $example on$
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("QuantileDiscretizerExample")\
        .getOrCreate()

    # $example on$
    data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
    df = spark.createDataFrame(data, ["id", "hour"])
    # $example off$

    # Output of QuantileDiscretizer for such small datasets can depend on the number of
    # partitions. Here we force a single partition to ensure consistent results.
    # Note this is not necessary for normal use cases
    df = df.repartition(1)

    # $example on$
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

    result = discretizer.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()

from __future__ import print_function

# $example on$
from pyspark.ml.feature import QuantileDiscretizer

# $example off$
from pyspark.sql import SparkSession


if __name__ == "__main__":
    spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()

    # $example on$
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    dataFrame = spark.createDataFrame(data, ["id", "hour"])

    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

    result = discretizer.fit(dataFrame).transform(dataFrame)
    result.show()
    # $example off$

    spark.stop()
Example 33
    df_val = df_val.withColumn(output_col, (f.hash(f.col(feature))))
    df_val = df_val.withColumn(
        output_col,
        f.when(f.col(output_col) < 0,
               f.col(output_col) * -1 % 50).otherwise(f.col(output_col) % 50))

# Set the numbers of quantiles/buckets for the baseline approach
nq = 50

from pyspark.ml.feature import QuantileDiscretizer, StringIndexer, FeatureHasher, HashingTF, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

AllQuantileDiscretizers = [
    QuantileDiscretizer(numBuckets=nq,
                        inputCol=col,
                        outputCol=(col + "_bucketized"),
                        handleInvalid="keep") for col in numeric_features
]

AllStringIndexers = [
    StringIndexer(inputCol=col, outputCol=(col + "_indexed"))
    for col in categorical_features
]

### FeatureHasher has been replaced by hardcoded feature hashing + bucketing in the preprocessing step
#AllFeatureHashers = [FeatureHasher(numFeatures=nq,
#                           inputCols=[col],
#                           outputCol=(col + "_hashed")) for col in id_features]

#AllHashingTF = [HashingTF(inputCol=col,
#                          outputCol=(col + "_vectorized")) for col in text_features]
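These per-column stages are typically chained into a single Pipeline; a minimal sketch under that assumption (the training DataFrame train_df is hypothetical):

from pyspark.ml import Pipeline

baseline_pipeline = Pipeline(stages=AllQuantileDiscretizers + AllStringIndexers)
train_prepared = baseline_pipeline.fit(train_df).transform(train_df)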
Example 34
# -*- coding: utf-8 -*-
"""
Created on Tue Jul  4 15:01:21 2017

@author: 10639497
"""

from __future__ import print_function
from pyspark.sql import SparkSession

session = SparkSession.builder.appName("Quantilte").getOrCreate()

from pyspark.ml.feature import QuantileDiscretizer

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
dataset = session.createDataFrame(data, ['id', 'hour'])

dataset.show()
discretizer = QuantileDiscretizer(numBuckets=3,
                                  inputCol="hour",
                                  outputCol="result")
result = discretizer.fit(dataset).transform(dataset).select('result')
result.show()