Example #1
 def bucketize(self, df, c):   
     bucketizer4 = Bucketizer(splits=[-float("inf"), 0, 0.25, 0.5, 0.75, 1.0 ,float("inf")], inputCol=c, outputCol="B4_"+c) 
     bucketizer10 = Bucketizer(splits=[-float("inf"), 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 ,float("inf")], inputCol=c, outputCol="B10_"+c)   
      
     df = bucketizer4.transform(df.select('snapshotDate','ID',c))
     df = bucketizer10.transform(df)
                      
     return( df.select('snapshotDate','ID','B4_'+c, 'B10_'+c) )
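A minimal driver sketch for the method above (the Bucketizer import, the SparkSession spark, the instance obj, and the 'score' column already scaled to [0, 1] are all assumptions for illustration):

from pyspark.ml.feature import Bucketizer

df = spark.createDataFrame(
    [("2021-01-31", 1, 0.37), ("2021-01-31", 2, 0.82)],
    ["snapshotDate", "ID", "score"],
)
binned = obj.bucketize(df, "score")  # obj: an instance of the enclosing class (hypothetical)
binned.show()  # columns: snapshotDate, ID, B4_score, B10_score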
Example #2
def calc(df, col_x: str, col_y: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a 2D histogram

    Returns
    -------
        (buckets_x, buckets_y, weights): x and y bucket edges plus a masked 2D weights matrix
    """
    # Select the two columns of interest
    data = df[[col_x, col_y]]

    # Check that the x column has a numeric type
    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist2d method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets_x = utils.spark_buckets(data,
                                    col_x,
                                    bins=bins,
                                    bin_width=bin_width)
    buckets_y = utils.spark_buckets(data,
                                    col_y,
                                    bins=bins,
                                    bin_width=bin_width)

    # Generate DF with buckets
    bucketizer = Bucketizer(splits=buckets_x,
                            inputCol=col_x,
                            outputCol="bucket_x")
    buckets_df = bucketizer.transform(data)
    bucketizer = Bucketizer(splits=buckets_y,
                            inputCol=col_y,
                            outputCol="bucket_y")
    buckets_df = bucketizer.transform(buckets_df)

    histogram = buckets_df.groupby("bucket_x", "bucket_y").agg(
        F.count(col_x).alias("count"))

    # Create weights matrix (locally)
    hist_pd = histogram.toPandas()
    weights = np.zeros((bins, bins))
    for index, row in hist_pd.iterrows():
        weights[int(row["bucket_x"]), int(row["bucket_y"])] = row["count"]

    # Mask values that are zero so they look transparent
    weights = np.ma.masked_where(weights == 0, weights)


    return buckets_x, buckets_y, weights
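A hedged plotting sketch for the three return values, assuming utils.spark_buckets yields bins + 1 edges so the edge lists line up with the bins x bins weights matrix (sdf and the column names are placeholders):

import matplotlib.pyplot as plt

bx, by, w = calc(sdf, "length", "width", bins=50)
# pcolormesh expects C shaped (len(Y)-1, len(X)-1); w is indexed [x, y], so transpose it
plt.pcolormesh(bx, by, w.T)
plt.colorbar()
plt.show()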
Example #3
    def calc_histogram(self, bins):
        bucket_name = '__{}_bucket'.format(self.colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(splits=bins,
                                inputCol=self.colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        # after bucketing values, groups and counts them
        result = (bucketizer.transform(
            self.data._kdf._sdf).select(bucket_name).groupby(bucket_name).agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    by=bucket_name))

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({
            bucket_name: np.arange(0,
                                   len(bins) - 1),
            'bucket': bins[:-1]
        })
        # merges the bins with counts on it and fills remaining ones with zeros
        data = indexes.merge(result, how='left',
                             on=[bucket_name]).fillna(0)[['count']]
        data.columns = [bucket_name]

        return data
Example #4
    def _compute_hist(sdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        colname = sdf.columns[-1]

        bucket_name = "__{}_bucket".format(colname)
        # creates a Bucketizer to get corresponding bin of each value
        bucketizer = Bucketizer(
            splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
        )
        # after bucketing values, groups and counts them
        result = (
            bucketizer.transform(sdf)
            .select(bucket_name)
            .groupby(bucket_name)
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=bucket_name)
        )

        # generates a pandas DF with one row for each bin
        # we need this as some of the bins may be empty
        indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]})
        # merges the bins with counts on it and fills remaining ones with zeros
        pdf = indexes.merge(result, how="left", on=[bucket_name]).fillna(0)[["count"]]
        pdf.columns = [bucket_name]

        return pdf[bucket_name]
    def discrete(self):
        # Bucketizer
        from pyspark.ml.feature import Bucketizer

        splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

        data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
        dataFrame = self.session.createDataFrame(data, ["features"])

        bucketizer = Bucketizer(splits=splits,
                                inputCol="features",
                                outputCol="bucketedFeatures")

        # Transform original data into its bucket index.
        bucketedData = bucketizer.transform(dataFrame)

        print("Bucketizer output with %d buckets" %
              (len(bucketizer.getSplits()) - 1))
        bucketedData.show()

        # QuantileDiscretizer
        from pyspark.ml.feature import QuantileDiscretizer

        data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
        df = self.session.createDataFrame(data, ["id", "hour"])

        discretizer = QuantileDiscretizer(numBuckets=3,
                                          inputCol="hour",
                                          outputCol="result")

        result = discretizer.fit(df).transform(df)
        result.show()
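For reference, with the splits and data above the Bucketizer step should assign indices as sketched below; each bucket is half-open [lower, upper), except the last, which also includes its upper bound:

# features  -> bucketedFeatures
# -999.9    -> 0.0   bucket [-inf, -0.5)
# -0.5      -> 1.0   bucket [-0.5, 0.0)
# -0.3      -> 1.0
#  0.0      -> 2.0   bucket [0.0, 0.5)
#  0.2      -> 2.0
#  999.9    -> 3.0   bucket [0.5, inf]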
    def get_binned_stat(self, df, colname, col_stat, n_split=10):

        splits = CommonUtils.frange(col_stat["min"],
                                    col_stat["max"],
                                    num_steps=n_split)
        splits = sorted(splits)
        splits_range = [(splits[idx], splits[idx + 1])
                        for idx in range(len(splits) - 1)]

        splits_data = {"splits": splits, "splits_range": splits_range}
        splits = splits_data["splits"]
        double_df = df.withColumn(colname, df[colname].cast(DoubleType()))
        bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
        bucketizer.setSplits(splits)
        binned_df = bucketizer.transform(double_df)
        histogram_df = binned_df.groupBy("BINNED_INDEX").count().toPandas()
        str_splits_range = [
            " to ".join([str(x[0]), str(x[1])]) for x in splits_range
        ]
        bin_name_dict = dict(zip(range(len(splits_range)), str_splits_range))
        bin_name_dict[n_split] = "null"
        histogram_df["orderIndex"] = histogram_df["BINNED_INDEX"].apply(
            lambda x: n_split if pd.isnull(x) else x)
        histogram_df["bins"] = histogram_df["orderIndex"].apply(
            lambda x: bin_name_dict[int(x)])
        relevant_df = histogram_df[["bins", "count", "orderIndex"]]
        histogram_dict = relevant_df.T.to_dict().values()
        histogram_dict = sorted(histogram_dict, key=lambda x: x["orderIndex"])
        output = []
        for val in histogram_dict:
            output.append({"name": val["bins"], "value": val["count"]})
        return output
Example #7
    def discretize(self, test=False):
        """
        Discretize a continous feature into a discrete one
        """

        for col in list(self.config_dict.keys()):
            # check if the discretizer transformation needs to be applied
            if self.config_dict[col]["discretize"]["apply"]:
                splits = self.config_dict[col]["discretize"]["value"]
                splits = [-math.inf] + splits
                splits = splits + [math.inf]
                bucketizer = Bucketizer(splits=splits,
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                if test:
                    self.test_data = bucketizer.transform(self.test_data)
                else:
                    self.train_data = bucketizer.transform(self.train_data)
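A hypothetical config_dict entry that would drive the method above; the key names mirror what the code reads, and the split values are illustrative only:

config_dict = {
    "age": {
        "discretize": {
            "apply": True,
            # interior split points; -inf and +inf are added by the method itself
            "value": [18, 30, 45, 60],
        }
    }
}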
def get_binned_dataframe(df, bin_name, variable_name, edges):
    '''
    Produces a dataframe with a new column `bin_name` corresponding
    to the variable `variable_name` binned with the given `edges`.
    '''
    splits = [-float('inf')]+list(edges)+[float('inf')]
    bucketizer = Bucketizer(
        splits=splits, inputCol=variable_name, outputCol=bin_name)
    binnedDF = bucketizer.transform(df)
    return binnedDF
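A short usage sketch; the dataframe, the 'pt' column, and the edge values are placeholders:

edges = [20.0, 40.0, 60.0, 100.0]
binned = get_binned_dataframe(df, "pt_bin", "pt", edges)
# "pt_bin" holds the bucket index: 0 for pt < 20, ..., len(edges) for pt >= 100
binned.groupBy("pt_bin").count().show()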
Example #9
def transform_spark(data, columns, args, transformed_column_name):
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    new_b = Bucketizer(
        splits=args["bucket_boundaries"], inputCol=columns["num"], outputCol=transformed_column_name
    )
    return new_b.transform(data).withColumn(
        transformed_column_name, F.col(transformed_column_name).cast("int")
    )
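This snippet follows a transformer interface where columns and args are plain dicts; a hedged sketch of a call, with every name hypothetical:

columns = {"num": "age"}
args = {"bucket_boundaries": [-float("inf"), 18.0, 35.0, 65.0, float("inf")]}
out = transform_spark(spark_df, columns, args, "age_bucket")
out.select("age", "age_bucket").show()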
Example #10
 def buckert(self, df, column):
     """
     Bucketize a column by explicit split boundaries (Bucketizer)
     """
     splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
     # Discretize into buckets using the given split boundaries
     bucketizer = Bucketizer(splits=splits,
                             inputCol=column,
                             outputCol=column + '_bucketed')  # splits defines the bucket edges
     bucketedData = bucketizer.transform(df)
     print('Bucketizer output with %d buckets' %
           (len(bucketizer.getSplits()) - 1))
     return bucketedData
Example #11
def bucketizer_splits(dataFrame,
                      inputCol,
                      splits=[-float('inf'), -0.5, 0.0, 0.5,
                              float('inf')]):
    # Discretize into buckets using the given split boundaries
    bucketizer = Bucketizer(splits=splits,
                            inputCol=inputCol,
                            outputCol='%s_bucketizer' %
                            (inputCol))  # splits defines the bucket edges
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketedData
def pre_processing(dataFrame):

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)

    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
Example #13
    def _transform_data(self, data):
        data_handling = self.data_settings.get('data_handling', {})

        # interactions
        if data_handling.get('interactions', False):
            columns_list = list(data.columns)
            columns_list.remove(self.model_settings['variable_to_predict'])
            for col1 in columns_list:
                for col2 in columns_list:
                    if col1 != col2:
                        name = str(col1) + '_' + str(col2)
                        reverse_name = str(col2) + '_' + str(col1)
                        if reverse_name not in list(data.columns):
                            data = data.withColumn(name, (F.col(col1) + 1) *
                                                   (F.col(col2) + 1))

        # binning
        for feature_to_bin in data_handling.get("features_to_bin", []):
            min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
            max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
            full_bins = [(min_val - 1)
                         ] + feature_to_bin['bins'] + [(max_val + 1)]

            bucketizer = Bucketizer(splits=full_bins,
                                    inputCol=feature_to_bin['name'],
                                    outputCol=feature_to_bin['name'] +
                                    '_binned')

            data = bucketizer.transform(data)

        # transformation
        for col in data_handling.get("features_handling", {}).keys():
            transformation_array = data_handling["features_handling"][col].get(
                "transformation", [])
            # applying transformations
            for feature_transformation_method in transformation_array:
                data = data.withColumn(
                    col + '_' + feature_transformation_method,
                    eval('F.' + feature_transformation_method)(col))

        # dropping features
        features_to_remove = data_handling.get('features_to_remove', [])
        if len(features_to_remove) > 0:
            data = data.drop(*[
                feature for feature in features_to_remove
                if feature in data.columns
            ])
        return data
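A hypothetical data_settings fragment that exercises each branch of _transform_data above; the keys follow what the method reads, the values are illustrative:

data_settings = {
    "data_handling": {
        "interactions": True,
        "features_to_bin": [
            {"name": "income", "bins": [25000, 50000, 100000]},
        ],
        "features_handling": {
            "income": {"transformation": ["log", "sqrt"]},
        },
        "features_to_remove": ["raw_id"],
    }
}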
 def generateGroupedMeasureDataDict(self, measure_column):
     splits_data = self.get_measure_column_splits(self._data_frame,
                                                  measure_column, 4)
     splits = splits_data["splits"]
     double_df = self._data_frame.withColumn(
         measure_column,
         self._data_frame[measure_column].cast(DoubleType()))
     bucketizer = Bucketizer(inputCol=measure_column,
                             outputCol="BINNED_INDEX")
     bucketizer.setSplits(splits)
     binned_df = bucketizer.transform(double_df)
     unique_bins = binned_df.select("BINNED_INDEX").distinct().collect()
     unique_bins = [int(x[0]) for x in unique_bins]
     binned_index_dict = dict(zip(unique_bins, splits_data["splits_range"]))
     output = {"bins": binned_index_dict, "data": binned_df}
     return output
 def bucketize(self, df, field):
     df = df.withColumn(field, df[field].cast("double"))
     max = df.agg({field: "max"}).collect()[0][0]
     min = df.agg({field: "min"}).collect()[0][0]
     stddev = df.agg({field: "stddev"}).collect()[0][0]
     number_of_buckets = 1
     if stddev != 0:
         number_of_buckets = ((max - min) // (stddev))
      buckets = np.arange(number_of_buckets, dtype=float).tolist()  # np.float was removed in recent NumPy
     buckets = [-float('inf')] + buckets + [float('inf')]
     bucketizer = Bucketizer(splits=buckets,
                             inputCol=field,
                             outputCol=field + '_bucketized')
     print("Bucketizing column: ", field)
     bucketized_features = bucketizer.transform(df)
     return bucketized_features
def transform_data(content_items):
    content_items = content_items.withColumn('receive_date',
                                             F.to_date(
                                                 F.col('time'))).drop('time')
    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    content_items = bucketizer.transform(content_items) \
        .drop('days_from_eula') \
        .withColumn(
            'days_from_eula_bin',
            convert_to_char(F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE)
        )

    print('content item data transformed')
    return content_items
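The snippet above relies on module-level names that are not shown; hypothetical definitions consistent with how they are used might look like this:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# bin edges for days_from_eula (illustrative values only)
DAYS_FROM_EULA_BINS = [-float("inf"), 0, 7, 30, 90, 365, float("inf")]
# offset so that bucket indices map onto printable characters
INT_TO_CHAR_BASELINE = ord("a")
# UDF that turns an integer code point into a one-character string
convert_to_char = F.udf(lambda i: chr(i) if i is not None else None, StringType())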
 def bucketize(self, splits, target_col):
     self._bucket_name = 'bucket_' + target_col
     bucketizer = Bucketizer(inputCol=target_col,
                             outputCol=self._bucket_name)
     splits.sort()
     bucketizer.setSplits(splits)
     column_data_types = {
         field.name: field.dataType
         for field in self._data_frame.schema.fields
     }
      if not isinstance(column_data_types[target_col], DoubleType):
         self._data_frame = self._data_frame.select(*[
             col(target_col).cast('double').alias(target_col) if column ==
             target_col else column for column in self._data_frame.columns
         ])
     self._data_frame = bucketizer.transform(self._data_frame)
     return self._bucket_name
Example #18
    def test_measures(self, targetDimension, testMeasure):
        chisquare_result = ChiSquareResult()
        df = self._data_frame.withColumn(
            testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
        measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
        if float(measureSummaryDict["count"]) > 10:
            maxval = float(measureSummaryDict["max"])
            minval = float(measureSummaryDict["min"])
            step = (maxval - minval) / 5.0
            splits = [
                math.floor(minval), minval + step, minval + (step * 2),
                minval + (step * 3), minval + (step * 4),
                math.ceil(maxval)
            ]
            bucketizer = Bucketizer(splits=splits,
                                    inputCol=testMeasure,
                                    outputCol="bucketedColumn")
            # bucketedData = bucketizer.transform(df)
            bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
            pivot_table = bucketedData.stat.crosstab(
                "{}".format(targetDimension), 'bucketedColumn')
        else:
            pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                           testMeasure)

        rdd = list(
            chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
        data_matrix = Matrices.dense(pivot_table.count(),
                                     len(pivot_table.columns) - 1, rdd)
        result = Statistics.chiSqTest(data_matrix)
        chisquare_result.set_params(result)
        freq_table = self._get_contingency_table_of_freq(pivot_table)
        freq_table.update_col2_names(splits)
        freq_table.set_tables()
        chisquare_result.set_table_result(freq_table)
        # Cramers V Calculation
        stat_value = result.statistic
        n = freq_table.get_total()
        t = min(len(freq_table.column_one_values),
                len(freq_table.column_two_values))

        v_value = math.sqrt(float(stat_value) / (n * float(t)))
        chisquare_result.set_v_value(v_value)
        chisquare_result.set_split_values([float(x) for x in splits])
        # chisquare_result.set_buckeddata(bucketedData)
        return chisquare_result
Example #19
 def bin_columns(self, colsToBin):
     for bincol in colsToBin:
         if self._pandas_flag:
             try:
                 minval, maxval = float(min(
                     self._data_frame[bincol])), float(
                         max(self._data_frame[bincol]))
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 self._data_frame[bincol] = pd.cut(
                     self._data_frame[bincol],
                     bins=splitsData["splits"],
                     labels=list(splitsData['bin_mapping'].values()),
                     right=True,
                     include_lowest=True)
             except Exception as e:
                 print("Binning failed for : ", bincol)
         else:
             try:
                 minval, maxval = self._data_frame.select([
                     FN.max(bincol).alias("max"),
                     FN.min(bincol).alias("min")
                 ]).collect()[0]
                 n_split = 10
                 splitsData = CommonUtils.get_splits(
                     minval, maxval, n_split)
                 splits = splitsData["splits"]
                 self._data_frame = self._data_frame.withColumn(
                     bincol, self._data_frame[bincol].cast(DoubleType()))
                 bucketizer = Bucketizer(inputCol=bincol,
                                         outputCol="BINNED_INDEX")
                 bucketizer.setSplits(splits)
                 self._data_frame = bucketizer.transform(self._data_frame)
                 mapping_expr = create_map([
                     lit(x) for x in chain(
                         *list(splitsData["bin_mapping"].items()))
                 ])
                 # self._data_frame = self._data_frame.withColumnRenamed("bincol",bincol+"JJJLLLLKJJ")
                 self._data_frame = self._data_frame.withColumn(
                     bincol, mapping_expr.getItem(col("BINNED_INDEX")))
                 self._data_frame = self._data_frame.select(self.columns)
             except Exception as e:
                 print("Binning failed for : ", bincol)
Example #20
def strat_histogram(sdf, colname, bins=10, categorical=False):
    if categorical:
        result = sdf.cols[colname]._value_counts(dropna=False, raw=True)

        if hasattr(result.index, 'levels'):
            indexes = pd.MultiIndex.from_product(
                result.index.levels[:-1] +
                [result.reset_index()[colname].unique().tolist()],
                names=result.index.names)
            result = (pd.DataFrame(index=indexes).join(
                result.to_frame(),
                how='left').fillna(0)[result.name].astype(result.dtype))

        start_values = result.index.tolist()
    else:
        bucket_name = '__{}_bucket'.format(colname)
        strata = sdf._handy.strata_colnames
        colnames = strata + ensure_list(bucket_name)

        start_values = np.linspace(
            *sdf.agg(F.min(colname),
                     F.max(colname)).rdd.map(tuple).collect()[0], bins + 1)
        bucketizer = Bucketizer(splits=start_values,
                                inputCol=colname,
                                outputCol=bucket_name,
                                handleInvalid="skip")
        result = (
            bucketizer.transform(sdf).select(colnames).groupby(colnames).agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    by=colnames))

        indexes = pd.DataFrame({
            bucket_name: np.arange(0, bins),
            'bucket': start_values[:-1]
        })
        if len(strata):
            indexes = (indexes.assign(key=1).merge(
                result[strata].drop_duplicates().assign(key=1),
                on='key').drop(columns=['key']))
        result = indexes.merge(result, how='left', on=strata +
                               [bucket_name]).fillna(0)[strata +
                                                        [bucket_name, 'count']]

    return start_values, result
Example #21
def calc(df, column: str, bins=50, bin_width=None):
    """
    Calculate the buckets and weights for a histogram

    Returns
    -------
        (buckets, weights): bucket edges and a pandas Series of counts
    """
    if bins is None and bin_width is None:
        raise ValueError("Must indicate bins or bin_width")
    elif bins is not None and bin_width is not None:
        raise ValueError("bins and bin_width arguments are mutually exclusive")

    # Calculate buckets
    data = df[[column]]

    int_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    col_type = data.schema.fields[0].dataType
    if not isinstance(col_type, int_types):
        raise ValueError(
            "hist method requires numerical or datetime columns, nothing to plot."
        )

    # Calculate buckets
    buckets = utils.spark_buckets(data, column, bins=bins, bin_width=bin_width)

    # Calculate counts based on the buckets
    bucketizer = Bucketizer(splits=buckets, inputCol=column, outputCol="bucket")
    buckets_df = bucketizer.transform(data)

    histogram = buckets_df.groupby("bucket").agg(F.count(column).alias("count"))
    histogram = histogram.orderBy("bucket", ascending=True)

    # Create weights (locally)
    hist_pd = histogram.toPandas()

    # Create a new DF with complete buckets and empty counts if needed
    full_buckets = pd.DataFrame(columns=["bucket"])
    full_buckets["bucket"] = np.arange(len(buckets))
    full_buckets = full_buckets.merge(hist_pd, on="bucket", how="left")
    weights = full_buckets["count"]

    return buckets, weights
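A hedged sketch of drawing the returned edges and counts with matplotlib, assuming buckets holds bins + 1 edges and weights holds one (possibly NaN) count per bucket; sdf and "price" are placeholders:

import matplotlib.pyplot as plt

buckets, weights = calc(sdf, "price", bins=30)
counts = weights.fillna(0).tolist()[: len(buckets) - 1]  # one count per bucket
plt.hist(buckets[:-1], bins=buckets, weights=counts)
plt.show()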
Example #22
    def biVariate(self, columns=None, buckets=5):
        if not columns:
            columns = self.data.drop(self.targetColumn, self.idColumn).columns
        # Implements functionality of pd.melt: transforms the dataframe from wide to long
        # Create and explode an array of (column_name, column_value) structs
        melter = explode(
            array([
                struct(lit(colnames).alias("key"),
                       col(colnames).alias("val")) for colnames in columns
            ])).alias("kvs")

        long_data = self.data.select(melter, self.targetColumn) \
            .selectExpr(self.targetColumn, "kvs.key AS key", "kvs.val AS val")

        observations = self.count
        split_val = [
            i / buckets for i in range(buckets, (observations * buckets) +
                                       1, observations - 1)
        ]
        bucketizer = Bucketizer(splits=split_val,
                                inputCol="row",
                                outputCol="bucket")

        biv = bucketizer.transform(
            long_data.select(
                self.targetColumn,
                'key',
                'val',
                row_number().over(Window.partitionBy('key').orderBy('val')).alias('row')
            )
        ) \
        .groupby('key', 'bucket') \
        .agg(
            count('*').alias('num_records'),
            min('val').alias('bucket_min'),
            max('val').alias('bucket_max'),
            sum('target').alias('ones')
        ) \
        .withColumn('event_rate', 100 * col('ones') / col('num_records')) \
        .orderBy('key', 'bucket')

        return biv.toPandas()
Example #23
 def bin_columns(self, colsToBin):
     for bincol in colsToBin:
         minval, maxval = self._data_frame.select(
             [FN.max(bincol).alias("max"),
              FN.min(bincol).alias("min")]).collect()[0]
         n_split = 10
         splitsData = CommonUtils.get_splits(minval, maxval, n_split)
         splits = splitsData["splits"]
         self._data_frame = self._data_frame.withColumn(
             bincol, self._data_frame[bincol].cast(DoubleType()))
         bucketizer = Bucketizer(inputCol=bincol, outputCol="BINNED_INDEX")
         bucketizer.setSplits(splits)
         self._data_frame = bucketizer.transform(self._data_frame)
         mapping_expr = create_map(
             [lit(x) for x in chain(*splitsData["bin_mapping"].items())])
         self._data_frame = self._data_frame.withColumnRenamed(
             "bincol", bincol + "JJJLLLLKJJ")
         self._data_frame = self._data_frame.withColumn(
             bincol, mapping_expr.getItem(col("BINNED_INDEX")))
         self._data_frame = self._data_frame.select(self.columns)
Example #24
def spark_cut(df, col_name, bins, labels):
    """
    Turns a continuous variable into categorical.
    :param df: a spark dataframe
    :param col_name: the continuous column to be categorized.
    :param bins: lower and upper bounds. must be sorted ascending and encompass the col entire range.
    :param labels: labels for each category. should be len(bins)-1
    :return: a spark dataframe with the specified column binned and labeled as specified.
    """
    bucketizer = Bucketizer(splits=bins,
                            inputCol=col_name,
                            outputCol=col_name + '_binned')

    df = bucketizer.transform(df)
    label_array = F.array(*(F.lit(label) for label in labels))
    df = df.withColumn(
        col_name,
        label_array.getItem(F.col(col_name + '_binned').cast('integer')))
    df = df.drop(col_name + '_binned')
    return df
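A brief hypothetical usage of spark_cut; bins must span the column's full range (Bucketizer raises on out-of-range values by default) and labels must have one fewer entry than bins:

bins = [0.0, 18.0, 65.0, 200.0]
labels = ["minor", "adult", "senior"]
df_cut = spark_cut(df, "age", bins=bins, labels=labels)  # df and "age" are placeholders
df_cut.groupBy("age").count().show()  # "age" now holds the labels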
Example #25
    def main(self, sc, *args):
        """ For each input files, i.e. train and test 'initiated, apply the same set of transformatons
        """

        sqlContext = SQLContext(sc)
        # For each key in the output dictionary of the Initiate task, i.e. train and test
        for inputFile in Initiate(self.input_file, self.output_path).output():
            df = sqlContext.read.csv(Initiate(
                self.input_file, self.output_path).output()[inputFile].path,
                                     sep=",",
                                     header=True,
                                     inferSchema=True)

            # Select final list of features
            list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
            df = df.select(*list_features)

            # Replace missing values
            cols_missing = ["Age"]
            for col in cols_missing:
                imputer = Imputer(inputCols=[col],
                                  outputCols=[
                                      "{}_replace_missings".format(col)
                                  ]).setMissingValue(26.0)
                df = imputer.fit(df).transform(df)

            # Discretize
            cols_disc = {
                "Age_replace_missings":
                [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
                "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
            }
            for col in cols_disc:
                bucketizer = Bucketizer(splits=cols_disc[col],
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                df = bucketizer.transform(df)

            df.write.csv(self.output()[inputFile].path, header=True)
Example #26
 def bucketize(self, splits, target_col):
     self._bucket_name = 'bucket_' + target_col
     if self._pandas_flag:
         ''' TO DO: this method is not being used anywhere '''
         pass
     else:
         bucketizer = Bucketizer(inputCol=target_col,
                                 outputCol=self._bucket_name)
         splits.sort()
         bucketizer.setSplits(splits)
         column_data_types = {
             field.name: field.dataType
             for field in self._data_frame.schema.fields
         }
          if not isinstance(column_data_types[target_col], DoubleType):
             self._data_frame = self._data_frame.select(*[
                 col(target_col).cast('double').
                 alias(target_col) if column == target_col else column
                 for column in self._data_frame.columns
             ])
         self._data_frame = bucketizer.transform(self._data_frame)
     return self._bucket_name
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BucketizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-0.5, ), (-0.3, ), (0.0, ), (0.2, )]
    dataFrame = sqlContext.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    # $example off$

    sc.stop()
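SparkContext/SQLContext is the legacy entry point used in that example; under Spark 2.x and later the same demo can be written against SparkSession, roughly as follows:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.appName("BucketizerExample").getOrCreate()

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
dataFrame = spark.createDataFrame([(-0.5,), (-0.3,), (0.0,), (0.2,)], ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
bucketizer.transform(dataFrame).show()

spark.stop()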
Example #28
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------

contDF = spark.range(20).selectExpr("cast(id as double)")


# COMMAND ----------

from pyspark.ml.feature import Bucketizer
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()
Example #30
    def compute_hist(psdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        sdf = psdf._internal.spark_frame
        scols = []
        input_column_names = []
        for label in psdf._internal.column_labels:
            input_column_name = name_like_string(label)
            input_column_names.append(input_column_name)
            scols.append(
                psdf._internal.spark_column_for(label).alias(
                    input_column_name))
        sdf = sdf.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname,
                       bucket_name) in enumerate(zip(colnames, bucket_names)):
            # creates a Bucketizer to get corresponding bin of each value
            bucketizer = Bucketizer(splits=bins,
                                    inputCol=colname,
                                    outputCol=bucket_name,
                                    handleInvalid="skip")

            bucket_df = bucketizer.transform(sdf)

            if output_df is None:
                output_df = bucket_df.select(
                    F.lit(group_id).alias("__group_id"),
                    F.col(bucket_name).alias("__bucket"))
            else:
                output_df = output_df.union(
                    bucket_df.select(
                        F.lit(group_id).alias("__group_id"),
                        F.col(bucket_name).alias("__bucket")))

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        result = (output_df.groupby("__group_id", "__bucket").agg(
            F.count("*").alias("count")).toPandas().sort_values(
                by=["__group_id", "__bucket"]))

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        output_series = []
        for i, (input_column_name,
                bucket_name) in enumerate(zip(input_column_names,
                                              bucket_names)):
            current_bucket_result = result[result["__group_id"] == i]
            # generates a pandas DF with one row for each bin
            # we need this as some of the bins may be empty
            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
            # merges the bins with counts on it and fills remaining ones with zeros
            pdf = indexes.merge(current_bucket_result,
                                how="left",
                                on=["__bucket"]).fillna(0)[["count"]]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series
def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  from tabulate import tabulate
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
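
  # Not in the original script: if the report above lists any nulls, one option
  # (a sketch only; the name `features_no_nulls` is hypothetical) is to drop
  # those rows before assembling features, since VectorAssembler rejects null
  # inputs by default.
  features_no_nulls = features_with_hour.dropna()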
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (0, 1, 2, 3)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
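  # With these splits, Bucketizer assigns half-open buckets [x, y) (the last
  # bucket also includes its upper bound):
  #   0: early         (ArrDelay < -15)
  #   1: on-time       (-15 <= ArrDelay < 0)
  #   2: slightly late (0 <= ArrDelay < 30)
  #   3: very late     (ArrDelay >= 30)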
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features with tools in pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop 3 times for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on the training data
    from pyspark.ml.classification import RandomForestClassifier
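    # Not in the original comments: maxBins must be at least as large as the
    # number of distinct values in the highest-cardinality indexed column
    # (likely Route or TailNum here), because the StringIndexer output columns
    # carry categorical metadata that the random forest respects.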
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_score = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_score
    
    std_score = np.std(metric_scores)
    
    average_stds.append((metric_name, average_score, std_score))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the scores to a score log that persists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Delta"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
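
# A typical way to run this as a script (an assumption; the snippet does not
# show its entry point, and taking base_path from the first CLI argument is
# illustrative only):
if __name__ == "__main__":
  import sys
  main(sys.argv[1])
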
manual_bucketized_features = features_with_route.withColumn(
  "ArrDelayBucket",
  dummy_function_udf(features['ArrDelay'])
)
manual_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay
#
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
bucketizer = Bucketizer(
  splits=splits,
  inputCol="ArrDelay",
  outputCol="ArrDelayBucket"
)
ml_bucketized_features = bucketizer.transform(features_with_route)

# Check the buckets out
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Extract features with tools in pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into categorical indexes, then drop the original fields
for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
               "Origin", "Dest", "Route"]:
  string_indexer = StringIndexer(
    inputCol=column,
    outputCol=column + "_index"
  )
  string_indexer_model = string_indexer.fit(ml_bucketized_features)
  ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
Example #34
0
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BucketizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
    dataFrame = sqlContext.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    bucketedData.show()
    # $example off$

    sc.stop()
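
# For reference (expected output, not part of the original example): with
# splits [-inf, -0.5, 0.0, 0.5, inf], Bucketizer assigns each value to the
# half-open bucket [x, y) containing it, so:
#   -0.5 -> 1.0, -0.3 -> 1.0, 0.0 -> 2.0, 0.2 -> 2.0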