def standardize_train_test_data(train_df, columns):
    """Append standardised (z-score) copies of the given columns to a Spark DataFrame.

    For every name in ``columns`` a new column ``<name>_norm`` is added, computed
    as ``(value - mean) / std_dev`` where both statistics are aggregated over
    ``train_df`` itself.

    Only the training frame is transformed here. The statistics are returned so
    the caller can apply the same training mean and standard deviation to any
    other (e.g. test) DataFrame.

    Args:
        train_df: Spark DataFrame holding the training data.
        columns: list of column-name strings to standardise.

    Returns:
        A 3-tuple ``(train_df, averages, std_devs)``:

        * ``train_df`` - the input frame with one ``<name>_norm`` column added
          per requested column.
        * ``averages`` - a single ``Row`` of means, keyed by the column name.
        * ``std_devs`` - a single ``Row`` of standard deviations, keyed by
          ``<name>_stddev``.
    """
    # The aggregates must be Spark column expressions: numpy.mean cannot operate
    # on a Spark Column (it fails before any Spark job is started).
    from pyspark.sql import functions as F

    mean_exprs = [F.mean(train_df[name]).alias(name) for name in columns]
    stddev_exprs = [
        F.stddev(train_df[name]).alias(name + '_stddev') for name in columns
    ]

    # Each aggregation collapses the frame to one row; pull it to the driver so
    # the statistics can be used as plain Python values below.
    averages = train_df.agg(*mean_exprs).collect()[0]
    std_devs = train_df.agg(*stddev_exprs).collect()[0]

    # Standardise the training data column by column, using the training stats.
    for name in columns:
        centred = train_df[name] - averages[name]
        train_df = train_df.withColumn(
            name + '_norm', centred / std_devs[name + '_stddev'])

    return train_df, averages, std_devs
Example #2
0
def test_summary_stddev(pyspark, summarizers, tests_utils, price, forecast):
    """Check ``summarizers.stddev("price")`` on ``price`` left-joined with ``forecast``.

    The summarized result, converted to pandas, must equal a one-row frame
    with ``time`` 0 and ``price_stddev`` 1.802775638.
    """
    expected_columns = ["time", "price_stddev"]
    expected_rows = [(0, 1.802775638)]
    expected_pdf = make_pdf(expected_rows, expected_columns)

    # Join forecast onto price by "id" before summarizing the price column.
    price_with_forecast = price.leftJoin(forecast, key="id")
    summary = price_with_forecast.summarize(summarizers.stddev("price"))

    pdt.assert_frame_equal(summary.toPandas(), expected_pdf)
Example #3
0
 def test_summary_stddev(self):
     """Summarize the stddev of "price" after left-joining forecast onto price.

     The pandas result must match a single expected row:
     time = 0, price_stddev = 1.802775638.
     """
     from ts.flint import summarizers

     price_df = self.price()
     forecast_df = self.forecast()

     expected_rows = [(0, 1.802775638)]
     expected_pdf = test_utils.make_pdf(expected_rows, ["time", "price_stddev"])

     # Join on "id", summarize, and convert to pandas in one chain.
     result = (
         price_df.leftJoin(forecast_df, key="id")
         .summarize(summarizers.stddev("price"))
         .toPandas()
     )
     pdt.assert_frame_equal(result, expected_pdf)
Example #4
0
    def test_summary_stddev(self):
        """Verify the stddev summarizer over "price" on the price/forecast left join."""
        from ts.flint import summarizers

        prices = self.price()
        forecasts = self.forecast()

        # Expected pandas frame: one row, time 0, price_stddev 1.802775638.
        column_names = ["time", "price_stddev"]
        expected_pdf = make_pdf([(0, 1.802775638)], column_names)

        joined_on_id = prices.leftJoin(forecasts, key="id")
        summarized = joined_on_id.summarize(summarizers.stddev("price"))
        actual_pdf = summarized.toPandas()

        pdt.assert_frame_equal(actual_pdf, expected_pdf)
Example #5
0
    def test_summary_compose(self):
        """Verify that max, min, mean and stddev of "price" can be summarized together."""
        from ts.flint import summarizers

        prices = self.price()

        # One expected row: time, then one value per summarizer in request order.
        column_names = [
            "time",
            "price_max",
            "price_min",
            "price_mean",
            "price_stddev",
        ]
        expected_pdf = make_pdf([(0, 6.0, 0.5, 3.25, 1.802775638)], column_names)

        max_price = summarizers.max("price")
        min_price = summarizers.min("price")
        mean_price = summarizers.mean("price")
        stddev_price = summarizers.stddev("price")
        composed = [max_price, min_price, mean_price, stddev_price]

        actual_pdf = prices.summarize(composed).toPandas()
        pdt.assert_frame_equal(actual_pdf, expected_pdf)
Example #6
0
def test_summary_compose(pyspark, summarizers, tests_utils, price):
    """Check that several summarizers over "price" compose into one result frame.

    Requests max, min, mean and stddev in a single ``summarize`` call and
    compares the pandas result against the expected one-row frame.
    """
    expected_row = (0, 6.0, 0.5, 3.25, 1.802775638)
    expected_columns = [
        "time", "price_max", "price_min", "price_mean", "price_stddev",
    ]
    expected_pdf = make_pdf([expected_row], expected_columns)

    # Build the summarizers in the same order as the expected columns.
    composed = [
        getattr(summarizers, kind)("price")
        for kind in ("max", "min", "mean", "stddev")
    ]

    actual_pdf = price.summarize(composed).toPandas()
    pdt.assert_frame_equal(actual_pdf, expected_pdf)
Example #7
0
 def test_summary_compose(self):
     """Summarize "price" with max, min, mean and stddev in a single call.

     The pandas result must equal the one-row frame built below.
     """
     from ts.flint import summarizers

     price_df = self.price()

     expected_columns = [
         "time", "price_max", "price_min", "price_mean", "price_stddev",
     ]
     expected_values = (0, 6.0, 0.5, 3.25, 1.802775638)
     expected_pdf = test_utils.make_pdf([expected_values], expected_columns)

     # Summarizers listed in the same order as the expected columns.
     requested = [
         summarizers.max("price"),
         summarizers.min("price"),
         summarizers.mean("price"),
         summarizers.stddev("price"),
     ]
     summary = price_df.summarize(requested)

     pdt.assert_frame_equal(summary.toPandas(), expected_pdf)