Ejemplo n.º 1
0
    def test_requires(self, connect_s3_mock):
        """Verify ``requires()`` returns a URL task for each expected path under each source."""
        # Here the injected mock itself plays the role of the S3 connection.
        bucket = connect_s3_mock.get_bucket.return_value

        class FakeKey(object):
            """Minimal stand-in for the key objects boto yields when listing an S3 bucket."""

            def __init__(self, path):
                self.size = 10
                self.key = path

        bucket.list.return_value = list(map(FakeKey, self.SAMPLE_KEY_PATHS))

        task_kwargs = dict(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=[r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'],
            expand_interval=datetime.timedelta(0),
        )
        task = PathSelectionByDateIntervalTask(**task_kwargs)

        expected_paths = [
            'FakeServerGroup/tracking.log-20140318.gz',
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
        ]

        # Ask the task first, then build the expectation: one URL per (path, source) pair.
        required = task.requires()
        expected_urls = []
        for path in expected_paths:
            for source in self.SOURCE:
                expected_urls.append(UncheckedExternalURL(source + path))

        self.assertItemsEqual(required, expected_urls)
Ejemplo n.º 2
0
    def test_requires(self, connect_s3_mock):
        """Verify ``requires()`` returns a URL task for each expected path under each source."""
        # The connection is what the patched factory returns; its bucket serves the listing.
        bucket = connect_s3_mock.return_value.get_bucket.return_value

        class FakeKey(object):
            """Minimal stand-in for the key objects boto yields when listing an S3 bucket."""

            def __init__(self, path):
                self.size = 10
                self.key = path

        bucket.list.return_value = list(map(FakeKey, self.SAMPLE_KEY_PATHS))

        task_kwargs = dict(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=[r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'],
            expand_interval=datetime.timedelta(0),
        )
        task = PathSelectionByDateIntervalTask(**task_kwargs)

        expected_paths = [
            'FakeServerGroup/tracking.log-20140318.gz',
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
        ]

        # Ask the task first, then build the expectation: one URL per (path, source) pair.
        required = task.requires()
        expected_urls = []
        for path in expected_paths:
            for source in self.SOURCE:
                expected_urls.append(UncheckedExternalURL(source + path))

        self.assertItemsEqual(required, expected_urls)
Ejemplo n.º 3
0
 def requires(self):
     """Return a path-selection task built from this task's own parameters.

     ``source``, ``interval``, ``pattern`` and ``date_pattern`` are forwarded
     unchanged; ``expand_interval`` is always a zero ``timedelta``.
     """
     selection_kwargs = dict(
         source=self.source,
         interval=self.interval,
         pattern=self.pattern,
         date_pattern=self.date_pattern,
         expand_interval=datetime.timedelta(0),
     )
     return PathSelectionByDateIntervalTask(**selection_kwargs)
Ejemplo n.º 4
0
 def test_timestamped_urls(self):
     """Check the timestamp-capturing pattern with a zero and a one-day ``expand_interval``."""
     timestamp_pattern = r'.*?FakeServerGroup/tracking.log-.*-(?P<timestamp>\d{10})\.gz'

     # Each case: (days passed to expand_interval, paths expected to be the only matches).
     cases = [
         (0, [
             'FakeServerGroup/tracking.log-20140319-1395256622.gz',
         ]),
         (1, [
             'FakeServerGroup/tracking.log-20140319-1395256622.gz',
             'FakeServerGroup/tracking.log-20140401-1396379384.gz',
         ]),
     ]

     for expand_days, expected_matches in cases:
         task = PathSelectionByDateIntervalTask(
             source=self.SOURCE,
             interval=Month.parse('2014-03'),
             pattern=[timestamp_pattern],
             expand_interval=datetime.timedelta(expand_days),
         )
         self.assert_only_matched(task, expected_matches)
Ejemplo n.º 5
0
    def test_edge_urls(self):
        """Check that the edge-server-group pattern matches only the expected sample path."""
        edge_pattern = r'.*?FakeEdgeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'
        task_kwargs = dict(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=[edge_pattern],
            expand_interval=datetime.timedelta(0),
        )
        task = PathSelectionByDateIntervalTask(**task_kwargs)

        expected_matches = ['FakeEdgeServerGroup/tracking.log-20140324-1395670621.gz']
        self.assert_only_matched(task, expected_matches)
Ejemplo n.º 6
0
    def requires(self):
        """Yield the payments path-selection task, then one PayPal task per day of ``run_interval``."""
        # First requirement: paypal.tsv files under the warehouse 'payments' directory.
        payments_root = url_path_join(self.warehouse_path, 'payments')
        selection_task = PathSelectionByDateIntervalTask(
            source=[payments_root],
            interval=self.selection_interval,
            pattern=['.*dt=(?P<date>\\d{4}-\\d{2}-\\d{2})/paypal\\.tsv'],
            expand_interval=datetime.timedelta(0),
            date_pattern='%Y-%m-%d',
        )
        yield selection_task

        # Remaining requirements: a per-day transactions task, created lazily as iterated.
        for day in self.run_interval:
            daily_task = PaypalTransactionsByDayTask(
                account_id=self.account_id,
                output_root=self.output_root,
                date=day,
                overwrite=self.overwrite,
            )
            yield daily_task
Ejemplo n.º 7
0
    def test_multiple_filtering_of_urls(self):
        """Check that several patterns combine, including one with no ``date`` group."""
        patterns = [
            r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz',
            r'.*?FakeEdgeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz',
            r'.*tracking_\d{3,5}\.log\.gz$',
        ]
        task_kwargs = dict(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=patterns,
            expand_interval=datetime.timedelta(0),
        )
        task = PathSelectionByDateIntervalTask(**task_kwargs)

        expected_matches = [
            'FakeServerGroup/tracking.log-20140318.gz',
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
            'FakeEdgeServerGroup/tracking.log-20140324-1395670621.gz',
            'FakeOldServerGroup3/tracking_14602.log.gz',
        ]
        self.assert_only_matched(task, expected_matches)
Ejemplo n.º 8
0
    def requires(self):
        """Yield the payments path-selection task, then one Cybersource task per run date.

        The selection pattern embeds ``self.merchant_id`` in the expected file name.
        """
        payments_root = url_path_join(self.warehouse_path, 'payments')
        interval = self.selection_interval
        # Doubled braces survive str.format() as literal regex quantifiers.
        file_regex = '.*dt=(?P<date>\\d{{4}}-\\d{{2}}-\\d{{2}})/cybersource_{}\\.tsv'.format(self.merchant_id)
        yield PathSelectionByDateIntervalTask(
            source=[payments_root],
            interval=interval,
            pattern=[file_regex],
            expand_interval=datetime.timedelta(0),
            date_pattern='%Y-%m-%d',
        )

        # One daily processing task per date, created lazily as the generator is consumed.
        for run_date in self.run_interval:
            daily_task = DailyProcessFromCybersourceTask(
                merchant_id=self.merchant_id,
                output_root=self.output_root,
                run_date=run_date,
                overwrite=self.overwrite,
                is_empty_transaction_allowed=self.is_empty_transaction_allowed
            )
            yield daily_task
    def requires_hadoop(self):
        """Return the requirements treated as input to the Hadoop job.

        The result is a dict built on first use and stored on
        ``self.cached_hadoop_requirements``; later calls return the stored value.
        It always holds ``'path_selection_task'`` and, when ``overwrite_n_days``
        is positive, also ``'downstream_input_tasks'``.
        """
        if self.cached_hadoop_requirements:
            return self.cached_hadoop_requirements

        # Historical data is passed to the job alongside the overwritten output of
        # LastDailyIpAddressOfUserTask, so select whatever already exists in the
        # historical date range.  Selecting by path means historical data could
        # later be collapsed into fewer files (e.g. one file per month holding the
        # last IP address per user per course) without this code caring.
        history_interval = luigi.date_interval.Custom(
            self.interval.date_a, self.overwrite_from_date)
        history_root = url_path_join(self.warehouse_path, 'last_ip_of_user')

        requirements = {
            'path_selection_task': PathSelectionByDateIntervalTask(
                source=[history_root],
                pattern=[LastDailyIpAddressOfUserTask.FILEPATH_PATTERN],
                interval=history_interval,
                expand_interval=datetime.timedelta(0),
                date_pattern='%Y-%m-%d',
            ),
        }

        if self.overwrite_n_days > 0:
            # LastDailyIpAddressOfUserTask outputs only a marker, so its real
            # output has to be fetched explicitly to serve as Hadoop input.
            user_addresses_task = self.requires_local()['user_addresses_task']
            requirements['downstream_input_tasks'] = user_addresses_task.downstream_input_tasks()

        self.cached_hadoop_requirements = requirements
        return self.cached_hadoop_requirements
Ejemplo n.º 10
0
 def test_default_pattern(self):
     """Check the ``pattern`` a task gets when none is passed to the constructor.

     NOTE(review): the expected values presumably come from test configuration
     set up outside this method -- confirm against the fixture.
     """
     task = PathSelectionByDateIntervalTask(interval=Month.parse('2014-03'))
     expected_pattern = (
         r'.*tracking.log-(?P<date>\d{8}).*\.gz',
         r'.*tracking.notalog-(?P<date>\d{8}).*\.gz',
     )
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(task.pattern, expected_pattern)
Ejemplo n.º 11
0
 def test_default_source(self):
     """Check the ``source`` a task gets when none is passed to the constructor.

     NOTE(review): the expected values presumably come from test configuration
     set up outside this method -- confirm against the fixture.
     """
     task = PathSelectionByDateIntervalTask(interval=Month.parse('2014-03'))
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(task.source, ('s3://fake/input/', 's3://fake/input2/'))
Ejemplo n.º 12
0
 def test_pattern_override(self):
     """Check that a ``pattern`` list passed to the constructor is exposed as a tuple."""
     task = PathSelectionByDateIntervalTask(
         interval=Month.parse('2014-03'),
         pattern=['baz']
     )
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(task.pattern, ('baz',))
Ejemplo n.º 13
0
 def test_pattern_from_config(self):
     """Check the ``pattern`` a task gets when none is passed to the constructor.

     NOTE(review): the expected ``'foobar'`` value presumably comes from
     configuration set up outside this method -- confirm against the fixture.
     """
     task = PathSelectionByDateIntervalTask(
         interval=Month.parse('2014-03')
     )
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(task.pattern, ('foobar',))