def test_default_new_col_name(self):
        """Without an explicit ``countdown_col_name`` the default is used."""
        countdown = FindDaysUntilEvent(event_col='a',
                                       datetime_col='dt',
                                       read_key='test_input_single_part',
                                       store_key='test_output')
        countdown.initialize()
        countdown.execute()

        result = process_manager.service(DataStore)['test_output']
        self.assertIn('days_until_event', result.columns)
    def test_store_key_used(self):
        """The output dataframe must be stored under ``store_key``."""
        countdown = FindDaysUntilEvent(event_col='a',
                                       datetime_col='dt',
                                       read_key='test_input_single_part',
                                       store_key='test_output')
        countdown.initialize()
        countdown.execute()

        store = process_manager.service(DataStore)
        self.assertIn('test_output', store)
    def test_null_after_last_event(self):
        """Rows after the most recent event carry a null (NaN) countdown."""
        countdown = FindDaysUntilEvent(event_col='a',
                                       datetime_col='dt',
                                       read_key='test_input_single_part',
                                       store_key='test_output')
        countdown.initialize()
        countdown.execute()

        store = process_manager.service(DataStore)
        result = store['test_output'].toPandas().set_index('dt')

        # 2017-01-04 lies beyond the last event in the fixture.
        self.assertTrue(np.isnan(result.at['2017-01-04', 'days_until_event']))
    def test_countdown(self):
        """The countdown decreases by one per day and is zero on event day."""
        countdown = FindDaysUntilEvent(event_col='a',
                                       datetime_col='dt',
                                       read_key='test_input_single_part',
                                       store_key='test_output')
        countdown.initialize()
        countdown.execute()

        store = process_manager.service(DataStore)
        result = store['test_output'].toPandas().set_index('dt')

        expected = {'2017-01-01': 2, '2017-01-02': 1, '2017-01-03': 0}
        for day, days_left in expected.items():
            self.assertEqual(result.loc[day, 'days_until_event'], days_left)
    def test_all_null_if_no_events(self):
        """A partition that never sees an event is null throughout."""
        countdown = FindDaysUntilEvent(event_col='a',
                                       datetime_col='dt',
                                       read_key='test_input_two_parts',
                                       store_key='test_output',
                                       partitionby_cols=['b'])
        countdown.initialize()
        countdown.execute()

        store = process_manager.service(DataStore)
        result = store['test_output'].toPandas().set_index('dt')

        # Partition b == 2 has no events in the fixture.
        eventless = result[result['b'] == 2]['days_until_event']
        self.assertTrue(np.isnan(eventless).all())
    def test_partitionby_cols(self):
        """Partitioning by 'b' keeps the partition column and every row.

        The output must retain the partition column ``b``, contain both
        partition values (1 and 2) and no others, and keep all 8 input rows.
        """
        link = FindDaysUntilEvent(read_key='test_input_two_parts',
                                  store_key='test_output',
                                  datetime_col='dt',
                                  event_col='a',
                                  partitionby_cols=['b'])

        link.initialize()
        link.execute()

        ds = process_manager.service(DataStore)
        pdf = ds['test_output'].toPandas().set_index('dt')

        # BUGFIX: the membership check must run against the dataframe's
        # column list, not the Spark dataframe object itself — consistent
        # with the other tests in this class.
        self.assertIn('b', ds['test_output'].columns)
        self.assertIn(1, list(pdf['b']))
        self.assertIn(2, list(pdf['b']))
        self.assertFalse(any(list(pdf['b'] > 2)))
        self.assertEqual(8, len(pdf))
# Example #7 (0)
# This is useful if we want to do a regression analysis, predicting how long
# we have until the next failure.
#
# We count down towards the newly created column `failure_sum_0d`, since this
# column will be >0 whenever there was a failure and 0 otherwise.
#
# The newly created column `days_until_failure` will be `null` after the most
# recent failure. You can check the behaviors of the column using the following
# commands after running the macro in an interactive session:
# >>> cols = ['date','machine','failure_sum_0d','days_until_failure']
# >>> ds['df_agg'].select(cols).orderBy(['machine','date']).show(100)

# Configure the countdown link: reads 'df_daily' from the datastore and
# writes the result (with the new 'days_until_failure' column) to
# 'df_countdown'. The countdown runs independently per machine.
countdown_link = FindDaysUntilEvent(
    name='days_until_failure',
    read_key='df_daily',
    store_key='df_countdown',
    datetime_col='date',
    # column we're counting down towards:
    event_col='failure_sum_0d',
    # new col name:
    countdown_col_name='days_until_failure',
    # countdown restarts within each machine's own timeline:
    partitionby_cols=['machine'])

daily_chain.add(countdown_link)

# STEP 4: Look back in time using sliding windows. Here we loop through the
# windows sizes in `settings['lookback_windows']` and make the aggregations for
# windows of each of the chosen sizes. This adds new columns per window per
# chosen aggregation, so we get new columns like `cycle_time_sec_sum_3d` for the
# total cycle time in a window up to 3 days back.
lookback_chain = Chain('lookback_windows')

# First we move the daily dataframe to a new location in the datastore,