def test_default_new_col_name(self):
    """The link falls back to 'days_until_event' when no column name is given."""
    link = FindDaysUntilEvent(
        read_key='test_input_single_part',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
    )
    link.initialize()
    link.execute()

    # The output dataframe should carry the default countdown column.
    store = process_manager.service(DataStore)
    self.assertIn('days_until_event', store['test_output'].columns)
def test_store_key_used(self):
    """The result is stored in the datastore under the configured store_key."""
    link = FindDaysUntilEvent(
        read_key='test_input_single_part',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
    )
    link.initialize()
    link.execute()

    store = process_manager.service(DataStore)
    self.assertIn('test_output', store)
def test_null_after_last_event(self):
    """Rows after the most recent event get a null (NaN) countdown value."""
    link = FindDaysUntilEvent(
        read_key='test_input_single_part',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
    )
    link.initialize()
    link.execute()

    store = process_manager.service(DataStore)
    pdf = store['test_output'].toPandas().set_index('dt')
    # '2017-01-04' lies after the last event, so no countdown is defined.
    after_last = pdf.loc['2017-01-04', 'days_until_event']
    self.assertTrue(np.isnan(after_last))
def test_countdown(self):
    """The countdown column decreases by one per day, reaching 0 on the event day."""
    link = FindDaysUntilEvent(
        read_key='test_input_single_part',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
    )
    link.initialize()
    link.execute()

    store = process_manager.service(DataStore)
    pdf = store['test_output'].toPandas().set_index('dt')
    # Event on 2017-01-03: 2 days out, 1 day out, event day itself.
    expected = {'2017-01-01': 2, '2017-01-02': 1, '2017-01-03': 0}
    for day, days_left in expected.items():
        self.assertEqual(pdf.loc[day, 'days_until_event'], days_left)
def test_all_null_if_no_events(self):
    """A partition that contains no events gets null (NaN) for every row."""
    link = FindDaysUntilEvent(
        read_key='test_input_two_parts',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
        partitionby_cols=['b'],
    )
    link.initialize()
    link.execute()

    ds = process_manager.service(DataStore)
    pdf = ds['test_output'].toPandas().set_index('dt')
    # Partition b == 2 has no events; assumes the fixture provides it —
    # guard against a vacuous pass on an empty selection.
    no_event_part = pdf.loc[pdf['b'] == 2, 'days_until_event']
    self.assertFalse(no_event_part.empty)
    # Use vectorized .all() instead of all(list(...)) over a Python loop.
    self.assertTrue(np.isnan(no_event_part).all())
def test_partitionby_cols(self):
    """Partitioning columns survive into the output and partitions stay separate."""
    link = FindDaysUntilEvent(
        read_key='test_input_two_parts',
        store_key='test_output',
        datetime_col='dt',
        event_col='a',
        partitionby_cols=['b'],
    )
    link.initialize()
    link.execute()

    ds = process_manager.service(DataStore)
    pdf = ds['test_output'].toPandas().set_index('dt')
    # BUG FIX: membership must be tested against .columns — a Spark DataFrame
    # does not support `in` directly (see test_default_new_col_name).
    self.assertIn('b', ds['test_output'].columns)
    # Both partitions are present and no unexpected partition values appear.
    self.assertIn(1, list(pdf['b']))
    self.assertIn(2, list(pdf['b']))
    self.assertFalse((pdf['b'] > 2).any())
    # Both 4-row partitions are kept: 8 rows total.
    self.assertEqual(8, len(pdf))
# This is useful if we want to do a regression analysis, predicting how long # we have until the next failure. # # We count down towards the newly created column `failure_sum_0d`, since this # column will be >0 whenever there was a failure and 0 otherwise. # # The newly created column `days_until_failure` will be `null` after the most # recent failure. You can check the behaviors of the column using the following # commands after running the macro in an interactive session: # >>> cols = ['date','machine','failure_sum_0d','days_until_failure'] # >>> ds['df_agg'].select(cols).orderBy(['machine','date']).show(100) countdown_link = FindDaysUntilEvent( name='days_until_failure', read_key='df_daily', store_key='df_countdown', datetime_col='date', # column we're counting down towards: event_col='failure_sum_0d', # new col name: countdown_col_name='days_until_failure', partitionby_cols=['machine']) daily_chain.add(countdown_link) # STEP 4: Look back in time using sliding windows. Here we loop through the # windows sizes in `settings['lookback_windows']` and make the aggregations for # windows of each of the chosen sizes. This adds new columns per window per # chosen aggregation, so we get new columns like `cycle_time_sec_sum_3d` for the # total cycle time in a window up to 3 days back. lookback_chain = Chain('lookback_windows') # First we move the daily dataframe to a new location in the datastore,