def test_requires(self, connect_s3_mock):
    """Verify requires() yields an UncheckedExternalURL per matched key under every source.

    Fix: the production code calls connect_s3(), so the connection object the
    task sees is the patched mock's return_value, not the mock itself — this
    matches the other test_requires in this file.  Configuring get_bucket on
    the bare mock left the bucket listing unwired.
    """
    s3_conn_mock = connect_s3_mock.return_value
    bucket_mock = s3_conn_mock.get_bucket.return_value

    class FakeKey(object):
        """A test double of the structure returned by boto when listing keys in an S3 bucket."""
        def __init__(self, path):
            self.key = path
            self.size = 10

    bucket_mock.list.return_value = [FakeKey(path) for path in self.SAMPLE_KEY_PATHS]
    task = PathSelectionByDateIntervalTask(
        source=self.SOURCE,
        interval=Month.parse('2014-03'),
        pattern=[r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'],
        expand_interval=datetime.timedelta(0),
    )
    # Only the two March keys under FakeServerGroup match the pattern.
    expected_paths = [
        'FakeServerGroup/tracking.log-20140318.gz',
        'FakeServerGroup/tracking.log-20140319-1395256622.gz',
    ]
    self.assertItemsEqual(
        task.requires(),
        [UncheckedExternalURL(source + path) for path in expected_paths for source in self.SOURCE]
    )
def test_requires(self, connect_s3_mock):
    """The task's requirements should be one UncheckedExternalURL per matching key per source."""
    # connect_s3() is called by the task, so the connection is the mock's return_value.
    bucket = connect_s3_mock.return_value.get_bucket.return_value

    class FakeKey(object):
        """A test double of the structure returned by boto when listing keys in an S3 bucket."""
        def __init__(self, path):
            self.key = path
            self.size = 10

    bucket.list.return_value = [FakeKey(key_path) for key_path in self.SAMPLE_KEY_PATHS]

    task = PathSelectionByDateIntervalTask(
        source=self.SOURCE,
        interval=Month.parse('2014-03'),
        pattern=[r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'],
        expand_interval=datetime.timedelta(0),
    )

    matched = [
        'FakeServerGroup/tracking.log-20140318.gz',
        'FakeServerGroup/tracking.log-20140319-1395256622.gz',
    ]
    expected_urls = [
        UncheckedExternalURL(root + relative_path)
        for relative_path in matched
        for root in self.SOURCE
    ]
    self.assertItemsEqual(task.requires(), expected_urls)
def requires(self):
    """Select the files in this task's source/interval that match its pattern."""
    selection_args = {
        'source': self.source,
        'interval': self.interval,
        'pattern': self.pattern,
        'date_pattern': self.date_pattern,
        'expand_interval': datetime.timedelta(0),
    }
    return PathSelectionByDateIntervalTask(**selection_args)
def test_timestamped_urls(self):
    """Timestamp-suffixed keys are matched; expanding the interval by a day pulls in the next key."""
    timestamp_pattern = r'.*?FakeServerGroup/tracking.log-.*-(?P<timestamp>\d{10})\.gz'
    cases = [
        (datetime.timedelta(0), [
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
        ]),
        (datetime.timedelta(1), [
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
            'FakeServerGroup/tracking.log-20140401-1396379384.gz',
        ]),
    ]
    for expansion, expected in cases:
        task = PathSelectionByDateIntervalTask(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=[timestamp_pattern],
            expand_interval=expansion,
        )
        self.assert_only_matched(task, expected)
def test_edge_urls(self):
    """Only the key under the edge server group should match the edge pattern."""
    edge_pattern = r'.*?FakeEdgeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'
    task = PathSelectionByDateIntervalTask(
        source=self.SOURCE,
        interval=Month.parse('2014-03'),
        pattern=[edge_pattern],
        expand_interval=datetime.timedelta(0),
    )
    self.assert_only_matched(task, ['FakeEdgeServerGroup/tracking.log-20140324-1395670621.gz'])
def requires(self):
    """Yield the historical paypal records plus one transaction task per day."""
    # Previously-written paypal records in the warehouse over the selection interval.
    yield PathSelectionByDateIntervalTask(
        source=[url_path_join(self.warehouse_path, 'payments')],
        interval=self.selection_interval,
        pattern=['.*dt=(?P<date>\\d{4}-\\d{2}-\\d{2})/paypal\\.tsv'],
        expand_interval=datetime.timedelta(0),
        date_pattern='%Y-%m-%d',
    )
    # A fresh per-day pull of transactions for each day in the run interval.
    for transaction_date in self.run_interval:
        yield PaypalTransactionsByDayTask(
            account_id=self.account_id,
            output_root=self.output_root,
            date=transaction_date,
            overwrite=self.overwrite,
        )
def test_multiple_filtering_of_urls(self):
    """Supplying several patterns matches the union of keys matched by each."""
    patterns = [
        r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz',
        r'.*?FakeEdgeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz',
        r'.*tracking_\d{3,5}\.log\.gz$',
    ]
    task = PathSelectionByDateIntervalTask(
        source=self.SOURCE,
        interval=Month.parse('2014-03'),
        pattern=patterns,
        expand_interval=datetime.timedelta(0),
    )
    union_of_matches = [
        'FakeServerGroup/tracking.log-20140318.gz',
        'FakeServerGroup/tracking.log-20140319-1395256622.gz',
        'FakeEdgeServerGroup/tracking.log-20140324-1395670621.gz',
        'FakeOldServerGroup3/tracking_14602.log.gz',
    ]
    self.assert_only_matched(task, union_of_matches)
def requires(self):
    """Internal method to actually calculate required tasks once."""
    # Warehouse records already written for this merchant over the selection interval.
    # Doubled braces keep the regex's {4}/{2} quantifiers intact through .format().
    merchant_pattern = '.*dt=(?P<date>\\d{{4}}-\\d{{2}}-\\d{{2}})/cybersource_{}\\.tsv'.format(self.merchant_id)
    yield PathSelectionByDateIntervalTask(
        source=[url_path_join(self.warehouse_path, 'payments')],
        interval=self.selection_interval,
        pattern=[merchant_pattern],
        expand_interval=datetime.timedelta(0),
        date_pattern='%Y-%m-%d',
    )
    # A per-day processing task for each day in the run interval.
    for run_date in self.run_interval:
        yield DailyProcessFromCybersourceTask(
            merchant_id=self.merchant_id,
            output_root=self.output_root,
            run_date=run_date,
            overwrite=self.overwrite,
            is_empty_transaction_allowed=self.is_empty_transaction_allowed,
        )
def requires_hadoop(self):
    """Build (once) and return the data treated as input to the Hadoop job."""
    if self.cached_hadoop_requirements:
        return self.cached_hadoop_requirements

    # Pass in the historical data as well as the overwritten output of
    # LastDailyIpAddressOfUserTask.  Rather than requiring specific files, we
    # select whatever exists in the historical date range; that leaves room in
    # future to collapse historical data into fewer files (e.g. cook a month of
    # daily files down to one file of last IP addresses per user per course)
    # without this code caring.
    historical_interval = luigi.date_interval.Custom(
        self.interval.date_a, self.overwrite_from_date)
    requirements = {
        'path_selection_task': PathSelectionByDateIntervalTask(
            source=[url_path_join(self.warehouse_path, 'last_ip_of_user')],
            pattern=[LastDailyIpAddressOfUserTask.FILEPATH_PATTERN],
            interval=historical_interval,
            expand_interval=datetime.timedelta(0),
            date_pattern='%Y-%m-%d',
        ),
    }
    if self.overwrite_n_days > 0:
        # LastDailyIpAddressOfUserTask's output() is only a marker, so its real
        # data files must be handed to this job explicitly as hadoop input via
        # downstream_input_tasks().
        user_addresses_task = self.requires_local()['user_addresses_task']
        requirements['downstream_input_tasks'] = user_addresses_task.downstream_input_tasks()

    self.cached_hadoop_requirements = requirements
    return self.cached_hadoop_requirements
def test_default_pattern(self):
    """With no override, the pattern tuple comes from the configured defaults."""
    task = PathSelectionByDateIntervalTask(interval=Month.parse('2014-03'))
    configured_defaults = (
        r'.*tracking.log-(?P<date>\d{8}).*\.gz',
        r'.*tracking.notalog-(?P<date>\d{8}).*\.gz',
    )
    self.assertEquals(task.pattern, configured_defaults)
def test_default_source(self):
    """With no override, the source tuple comes from the configured input roots."""
    task = PathSelectionByDateIntervalTask(interval=Month.parse('2014-03'))
    configured_roots = ('s3://fake/input/', 's3://fake/input2/')
    self.assertEquals(task.source, configured_roots)
def test_pattern_override(self):
    """An explicit pattern list overrides the configured default and is stored as a tuple."""
    override = ['baz']
    task = PathSelectionByDateIntervalTask(
        interval=Month.parse('2014-03'),
        pattern=override,
    )
    self.assertEquals(task.pattern, ('baz',))
def test_pattern_from_config(self):
    """Without an override, the pattern is read from the configuration section."""
    task = PathSelectionByDateIntervalTask(interval=Month.parse('2014-03'))
    self.assertEquals(task.pattern, ('foobar',))