Example #1
 def test_list_list_float(self):
     b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
     self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
     self.assertTrue(all([type(v) == list for v in b.getSplitsArray()]))
     self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]]))
     self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]]))
     self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
     self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))
 def bucketize(self, df, c):
     bucketizer4 = Bucketizer(splits=[-float("inf"), 0, 0.25, 0.5, 0.75, 1.0, float("inf")],
                              inputCol=c, outputCol="B4_" + c)
     bucketizer10 = Bucketizer(splits=[-float("inf"), 0, 0.1, 0.2, 0.3, 0.4, 0.5,
                                       0.6, 0.7, 0.8, 0.9, 1.0, float("inf")],
                               inputCol=c, outputCol="B10_" + c)

     df = bucketizer4.transform(df.select('snapshotDate', 'ID', c))
     df = bucketizer10.transform(df)

     return df.select('snapshotDate', 'ID', 'B4_' + c, 'B10_' + c)
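
A self-contained sketch of the same two-granularity bucketing on a toy DataFrame, assuming an active SparkSession named spark (all column names and values here are hypothetical):

from pyspark.ml.feature import Bucketizer

toy = spark.createDataFrame([("2020-01-31", 1, 0.37), ("2020-01-31", 2, 0.82)],
                            ["snapshotDate", "ID", "score"])
b4 = Bucketizer(splits=[-float("inf"), 0, 0.25, 0.5, 0.75, 1.0, float("inf")],
                inputCol="score", outputCol="B4_score")
b4.transform(toy).show()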
Example #3
def calc(df, col_x: str, col_y: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a histogram

    Returns
    -------
        (buckets, weights): tuple of two lists
    """
    # Calculate buckets
    data = df[[col_x, col_y]]

    # Check that the selected x column has a numeric type
    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist2d method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets_x = utils.spark_buckets(data,
                                    col_x,
                                    bins=bins,
                                    bin_width=bin_width)
    buckets_y = utils.spark_buckets(data,
                                    col_y,
                                    bins=bins,
                                    bin_width=bin_width)

    # Generate DF with buckets
    bucketizer = Bucketizer(splits=buckets_x,
                            inputCol=col_x,
                            outputCol="bucket_x")
    buckets_df = bucketizer.transform(data)
    bucketizer = Bucketizer(splits=buckets_y,
                            inputCol=col_y,
                            outputCol="bucket_y")
    buckets_df = bucketizer.transform(buckets_df)

    histogram = buckets_df.groupby("bucket_x", "bucket_y").agg(
        F.count(col_x).alias("count"))

    # Create weights matrix (locally)
    hist_pd = histogram.toPandas()
    weights = np.zeros((bins, bins))
    for index, row in hist_pd.iterrows():
        weights[int(row["bucket_x"]), int(row["bucket_y"])] = row["count"]

    # Mask values that are zero so they look transparent
    weights = np.ma.masked_where(weights == 0, weights)

    return buckets_x, buckets_y, weights
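
A minimal plotting sketch for the values returned by calc, assuming matplotlib is installed, df is a Spark DataFrame with numeric columns "x" and "y" (both names are hypothetical), and that utils.spark_buckets returns the bin edges for each axis:

import matplotlib.pyplot as plt

buckets_x, buckets_y, weights = calc(df, "x", "y", bins=50)
# pcolormesh expects the bin edges along each axis and a (ny, nx) matrix;
# masked (zero-count) cells are drawn as transparent
plt.pcolormesh(buckets_x, buckets_y, weights.T)
plt.colorbar()
plt.show()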
def main_emm_recode_demos(emm_raw_sdf):

    recode_demo_pipeline = Pipeline(stages=[
        Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age1"),
        Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age7"),
        Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age',
                   outputCol="age8"),
        #        Bucketizer(splits=[-25, 0, 25., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income1"),
        #        Bucketizer(splits=[-25, 0, 25., 35., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income9"),
        IfElseTransformer(
            vals=[83], inputCol='hispanicid', outputCol='hispanic'),
        IfElseTransformer(
            vals=['M'], inputCol='gender_char', outputCol='gender'),
        IfElseTransformer(vals=[86], inputCol='raceid', outputCol='race_back'),
        IfElseTransformer(vals=[88], inputCol='raceid',
                          outputCol='race_asian'),
        YesNoTransformer(inputCol='dvr_flag', outputCol='dvr'),
        YesNoTransformer(inputCol='cable_plus_flag', outputCol='cableplus'),
        YesNoTransformer(inputCol='video_game_owner_flag',
                         outputCol='video_game'),
        YesNoTransformer(inputCol='internet_access_flag',
                         outputCol='internet'),
        YesNoTransformer(inputCol='pay_cable_flag', outputCol='paycable'),
        YesNoTransformer(
            inputCol='television_high_definition_display_capability_flag',
            outputCol='hdtv'),
        YesNoTransformer(inputCol='alternative_delivery_flag',
                         outputCol='satellite'),
        IsInTransformer(isin_bins=[[0, 1], [2], [3, 4, 5, 6, 7], [8]],
                        inputCol='nielsen_occupation_code',
                        outputCol='occupation1'),
        IsInTransformer(isin_bins=[[0, 8, 9, 10, 11, 12], [13, 14, 15], [16],
                                   [18, 19, 20]],
                        inputCol='education_level_number',
                        outputCol='education7'),
        IsInTransformer(isin_bins=[[16, 18, 19, 20],
                                   [0, 8, 9, 10, 11, 12, 13, 14, 15]],
                        inputCol='education_level_number',
                        outputCol='education2'),
        IsInTransformer(isin_bins=[['A'], ['B'], ['C'], ['D']],
                        inputCol='county_size_code',
                        outputCol='county_size')
    ])

    return recode_demo_pipeline.fit(emm_raw_sdf).transform(emm_raw_sdf)
    def discrete(self):
        # Bucketizer
        from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

        splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

        data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
        dataFrame = self.session.createDataFrame(data, ["features"])

        bucketizer = Bucketizer(splits=splits,
                                inputCol="features",
                                outputCol="bucketedFeatures")

        # Transform original data into its bucket index.
        bucketedData = bucketizer.transform(dataFrame)

        print("Bucketizer output with %d buckets" %
              (len(bucketizer.getSplits()) - 1))
        bucketedData.show()

        # QuantileDiscretizer

        data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
        df = self.session.createDataFrame(data, ["id", "hour"])

        discretizer = QuantileDiscretizer(numBuckets=3,
                                          inputCol="hour",
                                          outputCol="result")

        result = discretizer.fit(df).transform(df)
        result.show()
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """
    Devide party ids by percentage of missing ctus into a list of 5 buckets
    > 0   < 0.25
    > 0.25 < 0.5
    > 0.5 < 0.75
    > 0.75 < 0.99
    > 0.99
    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       2|                   0.33|    1.0|
    |       3|                    1.0|    4.0|
    |       4|                   0.75|    3.0|
    |       5|                    0.6|    2.0|
    |       6|                    0.6|    2.0|
    +--------+-----------------------+-------+
    """

    bucketizer = Bucketizer(splits=[0, 0.25, 0.5, 0.75, 0.99, float('Inf')],
                            inputCol="percentage_missing_ctus",
                            outputCol="buckets")
    df_of_buckets_ratio_between_imputed_distinct_ctus = (
        bucketizer.setHandleInvalid("keep")
                  .transform(percentage_of_missing_ctus_per_partyid)
    )
    return df_of_buckets_ratio_between_imputed_distinct_ctus
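
A minimal usage sketch, assuming an active SparkSession named spark; the rows mirror the docstring example:

from pyspark.ml.feature import Bucketizer

percentage_of_missing_ctus_per_partyid = spark.createDataFrame(
    [(1, 0.2), (2, 0.33), (3, 1.0), (4, 0.75), (5, 0.6), (6, 0.6)],
    ["party_id", "percentage_missing_ctus"])
create_buckets(percentage_of_missing_ctus_per_partyid).show()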
Example #7
    def _compute_hist(sdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        colname = sdf.columns[-1]

        bucket_name = "__{}_bucket".format(colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(
            splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
        )
        # after bucketing values, groups and counts them
        result = (
            bucketizer.transform(sdf)
            .select(bucket_name)
            .groupby(bucket_name)
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=bucket_name)
        )

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]})
        # merges the bins with counts on it and fills remaining ones with zeros
        pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]]
        pdf.columns = [bucket_name]

        return pdf[bucket_name]
    def get_binned_stat(self, df, colname, col_stat, n_split=10):

        splits = CommonUtils.frange(col_stat["min"],
                                    col_stat["max"],
                                    num_steps=n_split)
        splits = sorted(splits)
        splits_range = [(splits[idx], splits[idx + 1])
                        for idx in range(len(splits) - 1)]

        splits_data = {"splits": splits, "splits_range": splits_range}
        splits = splits_data["splits"]
        double_df = df.withColumn(colname, df[colname].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
        bucketizer.setSplits(splits)
        binned_df = bucketizer.transform(double_df)
        histogram_df = binned_df.groupBy("BINNED_INDEX").count().toPandas()
        str_splits_range = [
            " to ".join([str(x[0]), str(x[1])]) for x in splits_range
        ]
        bin_name_dict = dict(zip(range(len(splits_range)), str_splits_range))
        bin_name_dict[n_split] = "null"
        histogram_df["orderIndex"] = histogram_df["BINNED_INDEX"].apply(
            lambda x: n_split if pd.isnull(x) else x)
        histogram_df["bins"] = histogram_df["orderIndex"].apply(
            lambda x: bin_name_dict[int(x)])
        relevant_df = histogram_df[["bins", "count", "orderIndex"]]
        histogram_dict = relevant_df.T.to_dict().values()
        histogram_dict = sorted(histogram_dict, key=lambda x: x["orderIndex"])
        output = []
        for val in histogram_dict:
            output.append({"name": val["bins"], "value": val["count"]})
        return output
Example #9
    def calc_histogram(self, bins):
        bucket_name = '__{}_bucket'.format(self.colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(splits=bins,
                                inputCol=self.colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        # after bucketing values, groups and counts them
        result = (bucketizer
                  .transform(self.data._kdf._sdf)
                  .select(bucket_name)
                  .groupby(bucket_name)
                  .agg(F.count('*').alias('count'))
                  .toPandas()
                  .sort_values(by=bucket_name))

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1),
                                'bucket': bins[:-1]})
        # merges the bins with counts on it and fills remaining ones with zeros
        data = indexes.merge(result, how='left', on=[bucket_name]).fillna(0)[['count']]
        data.columns = [bucket_name]

        return data
    def test_bucketizer(self):
        values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
        data = self.spark.createDataFrame(values, ["features"])
        model = Bucketizer(splits=[-float("inf"), 0.5, 1.4,
                                   float("inf")],
                           inputCol="features",
                           outputCol="buckets")

        feature_count = len(data.select('features').first())
        model_onnx = convert_sparkml(
            model, 'Sparkml Bucketizer',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.setHandleInvalid("error").transform(data)
        expected = predicted.select("buckets").toPandas().values.astype(
            numpy.float32)
        data_np = [data.toPandas().values.astype(numpy.float32)]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['buckets'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
    def OneHotEncoder(self):
        """
        Converts string-type categories to indexes, splits continuous data interval to indexes,
        encodes the categorical data using One-Hot encoding.

        """
        splits = [-float("inf"), 500, 1200, 1700, float("inf")]
        self.bucketizer = Bucketizer(
            splitsArray=[splits, splits, splits],
            inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],
            outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])

        self.varIdxer = StringIndexer(
            inputCol="OrigDest",
            outputCol="IndOrigDest").setHandleInvalid("skip")

        self.oneHot = OneHotEncoder(inputCols=[
            'Month', 'DayOfWeek', 'CatCRSDepTime', 'CatCRSArrTime',
            'IndOrigDest', 'CatDepTime'
        ],
                                    outputCols=[
                                        'HotMonth', 'HotDayOfWeek',
                                        'HotCRSCatDepTime', 'HotCRSCatArrTime',
                                        'HotIndOrigDest', 'HotDepTime'
                                    ]).setHandleInvalid("keep")
    def get_column_hist(self, column, bins):
        """return a list of counts corresponding to bins"""
        bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping
        if bins[0] == -np.inf or bins[0] == -float("inf"):
            added_min = False
            bins[0] = -float("inf")
        else:
            added_min = True
            bins.insert(0, -float("inf"))

        if bins[-1] == np.inf or bins[-1] == float("inf"):
            added_max = False
            bins[-1] = float("inf")
        else:
            added_max = True
            bins.append(float("inf"))

        temp_column = self.spark_df.select(column).where(col(column).isNotNull())
        bucketizer = Bucketizer(
            splits=bins, inputCol=column, outputCol="buckets")
        bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

        # This is painful to do, but: bucketizer cannot handle values outside of a range
        # (hence adding -/+ infinity above)

        # Further, it *always* follows the numpy convention of lower_bound <= bin < upper_bound
        # for all but the last bin

        # But, since the last bin in our case will often be +infinity, we need to
        # find the number of values exactly equal to the upper bound to add those

        # We'll try for an optimization by asking for it at the same time
        if added_max:
            upper_bound_count = temp_column.select(column).filter(col(column) == bins[-2]).count()
        else:
            upper_bound_count = 0

        hist_rows = bucketed.groupBy("buckets").count().collect()
        # Spark only returns buckets that have nonzero counts.
        hist = [0] * (len(bins) - 1)
        for row in hist_rows:
            hist[int(row["buckets"])] = row["count"]

        hist[-2] += upper_bound_count

        if added_min:
            below_bins = hist.pop(0)
            bins.pop(0)
            if below_bins > 0:
                logger.warning("Discarding histogram values below lowest bin.")

        if added_max:
            above_bins = hist.pop(-1)
            bins.pop(-1)
            if above_bins > 0:
                logger.warning("Discarding histogram values above highest bin.")

        return hist
Example #13
def model_train(zipcode, complaint, day):
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header",
                                              "true").load("Data_Final/*.csv")
    infer_schema = "true"
    first_row_is_header = "true"
    delimiter = ","
    data311.registerTempTable("data311")
    data311 = data311.withColumn("ResTimeH",
                                 data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week',
                                 dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH",
                            outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)
    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())

    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups",
                            outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol(
        "Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories",
                                 featuresCol="Scaled_ip_features")
    # setup the pipeline
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    # fit the pipeline model and transform the data as defined
    pipeline_model = pipeline.fit(X)

    zipcode = int(zipcode)
    day = int(day)
    input_variables = pd.DataFrame(
        [[zipcode, complaint, day]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    input_variables = spark.createDataFrame(input_variables)

    transformed = pipeline_model.transform(input_variables)
    ans = transformed.select(collect_list('prediction')).first()[0]

    if ans[0] == 0.0:
        prediction = "Your complaint will be resolved within 2 hours."
    elif ans[0] == 1.0:
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours."
    return prediction
Example #14
def transform_spark(data, columns, args, transformed_column_name):
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    new_b = Bucketizer(
        splits=args["bucket_boundaries"], inputCol=columns["num"], outputCol=transformed_column_name
    )
    return new_b.transform(data).withColumn(
        transformed_column_name, F.col(transformed_column_name).cast("int")
    )
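
A minimal call sketch for transform_spark, assuming an active SparkSession named spark; the column name, boundaries, and output name below are hypothetical:

data = spark.createDataFrame([(3.0,), (42.0,), (150.0,)], ["price"])
bucketed = transform_spark(
    data,
    columns={"num": "price"},
    args={"bucket_boundaries": [-float("inf"), 10.0, 100.0, float("inf")]},
    transformed_column_name="price_bucket",
)
bucketed.show()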
def get_binned_dataframe(df, bin_name, variable_name, edges):
    '''
    Produces a dataframe with a new column `bin_name` corresponding
    to the variable `variable_name` binned with the given `edges`.
    '''
    splits = [-float('inf')]+list(edges)+[float('inf')]
    bucketizer = Bucketizer(
        splits=splits, inputCol=variable_name, outputCol=bin_name)
    binnedDF = bucketizer.transform(df)
    return binnedDF
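
A short usage sketch, assuming an active SparkSession named spark and a hypothetical numeric column "pt":

df = spark.createDataFrame([(12.3,), (47.1,), (88.8,)], ["pt"])
binned = get_binned_dataframe(df, "pt_bin", "pt", edges=[20, 40, 60, 80, 100])
binned.select("pt", "pt_bin").show()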
Example #16
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = get_buckets(sdf.select(col).rdd.map(itemgetter(0)), n)
        stages.append(Bucketizer(splits=splits,
                                 inputCol=col,
                                 outputCol="__{}_bucket".format(col),
                                 handleInvalid="skip"))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
def age_recoder(spark_df, age_col):
    """
    :param spark_df:
    :param age_col:
    :return:
    """

    age1 = Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age1")
    age7 = Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age7")
    age8 = Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age8")

    sdf_1 = age1.setHandleInvalid("keep").transform(spark_df)
    sdf_2 = age7.setHandleInvalid("keep").transform(sdf_1)
    res_sdf = age8.setHandleInvalid("keep").transform(sdf_2)

    return res_sdf
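
A minimal usage sketch, assuming an active SparkSession named spark (the ages are arbitrary sample values):

demo_sdf = spark.createDataFrame([(3,), (17,), (42,), (70,)], ["age"])
age_recoder(demo_sdf, "age").select("age", "age1", "age7", "age8").show()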
Example #18
def bucketizer_splits(dataFrame,
                      inputCol,
                      splits=[-float('inf'), -0.5, 0.0, 0.5,
                              float('inf')]):
    # Discretize into buckets using the given split boundaries
    bucketizer = Bucketizer(splits=splits,
                            inputCol=inputCol,
                            outputCol='%s_bucketizer' %
                            (inputCol))  # splits defines the bucket boundaries
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketedData
 def test_save_and_load_on_nested_list_params(self):
     temp_path = tempfile.mkdtemp()
     splitsArray = [
         [-float("inf"), 0.5, 1.4, float("inf")],
         [-float("inf"), 0.1, 1.2, float("inf")],
     ]
     bucketizer = Bucketizer(splitsArray=splitsArray,
                             inputCols=["values", "values"],
                             outputCols=["b1", "b2"])
     savePath = temp_path + "/bk"
     bucketizer.write().overwrite().save(savePath)
     loadedBucketizer = Bucketizer.load(savePath)
     assert loadedBucketizer.getSplitsArray() == splitsArray
Example #20
 def buckert(self, df, column):
     """
     按指定边界 分桶Bucketizer
     """
     splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
     # 按给定边界分桶离散化——按边界分桶
     bucketizer = Bucketizer(splits=splits,
                             inputCol=column,
                             outputCol=column + '_bucketed')  # splits指定分桶边界
     bucketedData = bucketizer.transform(df)
     print('Bucketizer output with %d buckets' %
           (len(bucketizer.getSplits()) - 1))
     return bucketedData
def pre_processing(dataFrame):

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    return bucketedData
Example #22
def add_age_id(spark, df, logger):
    """Calculate the age_id by splitting the visitor age into buckets"""
    agebucketizer = Bucketizer(splits=[ float('-Inf'), 0, 2, 11, 16, 21,
                                        26, 36, 46, 56, 66, float('Inf') ],
                                inputCol="i94bir",
                                outputCol="agebuckets")
    agebuck_df = agebucketizer.setHandleInvalid("keep").transform(df)
    age_id_df = agebuck_df.withColumn("age_id", when(col("i94bir") == -1, 999)\
                                                .otherwise(col("agebuckets")
                                                .cast(IntegerType()))
                                    )
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
Example #23
 def _bucketize_age_column(
         self, dataframe: DataFrame, input_col: str,
         output_col: str) -> Tuple[DataFrame, int, List[str]]:
     bucketizer = Bucketizer(splits=self.age_groups,
                             inputCol=input_col,
                             outputCol=output_col)
     output = bucketizer.setHandleInvalid("keep").transform(dataframe)
     splits = list(bucketizer.getSplits())
     mapping = [
         "[{}, {})".format(splits[i], splits[i + 1])
         for i in range(len(splits) - 1)
     ]
     n_age_groups = len(mapping)
     return output, n_age_groups, mapping
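
For illustration, the mapping format produced above, assuming hypothetical age_groups of [0, 18, 65, float("inf")]:

splits = [0.0, 18.0, 65.0, float("inf")]
mapping = ["[{}, {})".format(splits[i], splits[i + 1]) for i in range(len(splits) - 1)]
# mapping == ['[0.0, 18.0)', '[18.0, 65.0)', '[65.0, inf)']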
Example #24
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(splits=[float('-Inf'), 0, 4, 8, 11, 15, 22,
                                      29, float('Inf')],
                              inputCol="duration_days",
                              outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn("visitduration_id",
                                   when(isnull(col("arrdate")) |
                                        isnull(col("depdate")), 999)\
                                   .otherwise(col("ddbuckets").cast(IntegerType()))
                                 )
    logger.info("Added duration_id")
    return dur_id_df
Example #25
    def _transform_data(self, data):
        data_handling = self.data_settings.get('data_handling', {})

        # interactions
        if data_handling.get('interactions', False):
            columns_list = list(data.columns)
            columns_list.remove(self.model_settings['variable_to_predict'])
            for col1 in columns_list:
                for col2 in columns_list:
                    if col1 != col2:
                        name = str(col1) + '_' + str(col2)
                        reverse_name = str(col2) + '_' + str(col1)
                        if reverse_name not in list(data.columns):
                            data = data.withColumn(name, (F.col(col1) + 1) *
                                                   (F.col(col2) + 1))

        # binning
        for feature_to_bin in data_handling.get("features_to_bin", []):
            min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
            max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
            full_bins = [min_val - 1] + feature_to_bin['bins'] + [max_val + 1]

            bucketizer = Bucketizer(splits=full_bins,
                                    inputCol=feature_to_bin['name'],
                                    outputCol=feature_to_bin['name'] +
                                    '_binned')

            data = bucketizer.transform(data)

        # transformation
        for col in data_handling.get("features_handling", {}).keys():
            transformation_array = data_handling["features_handling"][col].get(
                "transformation", [])
            # applying transformations
            for feature_transformation_method in transformation_array:
                data = data.withColumn(
                    col + '_' + feature_transformation_method,
                    getattr(F, feature_transformation_method)(col))

        # dropping features
        features_to_remove = data_handling.get('features_to_remove', [])
        if len(features_to_remove) > 0:
            data = data.drop(*[
                feature for feature in features_to_remove
                if feature in data.columns
            ])
        return data
 def generateGroupedMeasureDataDict(self, measure_column):
     splits_data = self.get_measure_column_splits(self._data_frame,
                                                  measure_column, 4)
     splits = splits_data["splits"]
     double_df = self._data_frame.withColumn(
         measure_column,
         self._data_frame[measure_column].cast(DoubleType()))
     bucketizer = Bucketizer(inputCol=measure_column,
                             outputCol="BINNED_INDEX")
     bucketizer.setSplits(splits)
     binned_df = bucketizer.transform(double_df)
     unique_bins = binned_df.select("BINNED_INDEX").distinct().collect()
     unique_bins = [int(x[0]) for x in unique_bins]
     binned_index_dict = dict(zip(unique_bins, splits_data["splits_range"]))
     output = {"bins": binned_index_dict, "data": binned_df}
     return output
Example #27
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = np.linspace(
            *sdf.agg(F.min(col), F.max(col)).rdd.map(tuple).collect()[0],
            n + 1)
        bucket_name = '__{}_bucket'.format(col)
        stages.append(
            Bucketizer(splits=splits,
                       inputCol=col,
                       outputCol=bucket_name,
                       handleInvalid="skip"))

    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
 def bucketize(self, df, field):
     df = df.withColumn(field, df[field].cast("double"))
     max_val = df.agg({field: "max"}).collect()[0][0]
     min_val = df.agg({field: "min"}).collect()[0][0]
     stddev = df.agg({field: "stddev"}).collect()[0][0]
     number_of_buckets = 1
     if stddev != 0:
         number_of_buckets = (max_val - min_val) // stddev
     buckets = np.arange(number_of_buckets, dtype=float).tolist()
     buckets = [-float('inf')] + buckets + [float('inf')]
     bucketizer = Bucketizer(splits=buckets,
                             inputCol=field,
                             outputCol=field + '_bucketized')
     print("Bucketizing column: ", field)
     bucketized_features = bucketizer.transform(df)
     return bucketized_features
def transform_data(content_items):
    content_items = content_items.withColumn('receive_date',
                                             F.to_date(
                                                 F.col('time'))).drop('time')
    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    content_items = bucketizer.transform(content_items) \
        .drop('days_from_eula') \
        .withColumn(
            'days_from_eula_bin',
            convert_to_char(F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE)
        )

    print('content item data transformed')
    return content_items
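
The constants and the convert_to_char UDF used above are defined elsewhere in the original module; a minimal sketch of plausible stand-ins, offered purely as labeled assumptions:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Assumed values, not taken from the original module
DAYS_FROM_EULA_BINS = [0, 7, 30, 90, 365, float('inf')]
INT_TO_CHAR_BASELINE = ord('a')
convert_to_char = F.udf(lambda i: chr(i) if i is not None else None, StringType())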
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # bucketedData = bucketizer.transform(df)
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result