Example #1
0
    def _get_requirements(self):
        """
        Gather the set of requirements needed to run the task.

        This can be a rather expensive operation: for S3 sources it must use the S3 API to
        list every file in the source bucket and then select only those that match the
        configured pattern(s) and fall inside the given date range.
        """
        # Pick a URL generator per source, dispatching on the URL scheme prefix.
        generators = []
        for src in self.source:
            if src.startswith('s3'):
                generator = self._get_s3_urls(src)
            elif src.startswith('hdfs'):
                generator = self._get_hdfs_urls(src)
            else:
                generator = self._get_local_urls(src)
            generators.append(generator)

        log.debug('Matching urls using pattern(s)="%s"', self.pattern)
        log.debug('Date interval: %s <= date < %s',
                  self.interval.date_a.isoformat(),
                  self.interval.date_b.isoformat())

        # Walk every generated URL in order and keep only those the filter accepts.
        matched = []
        for generator in generators:
            for candidate in generator:
                if self.should_include_url(candidate):
                    matched.append(UncheckedExternalURL(candidate))
        return matched
Example #2
0
    def test_requires(self, connect_s3_mock):
        # The mocked boto connection hands back a mocked bucket whose listing we control.
        bucket_mock = connect_s3_mock.return_value.get_bucket.return_value

        class StubKey(object):
            """A test double of the structure returned by boto when listing keys in an S3 bucket."""
            def __init__(self, path):
                self.key = path
                self.size = 10

        bucket_mock.list.return_value = [StubKey(key_path) for key_path in self.SAMPLE_KEY_PATHS]

        task = PathSelectionByDateIntervalTask(
            source=self.SOURCE,
            interval=Month.parse('2014-03'),
            pattern=[r'.*?FakeServerGroup/tracking.log-(?P<date>\d{8}).*\.gz'],
            expand_interval=datetime.timedelta(0),
        )

        # Only the March 2014 log files from the sample listing should survive the filter.
        matched_paths = [
            'FakeServerGroup/tracking.log-20140318.gz',
            'FakeServerGroup/tracking.log-20140319-1395256622.gz',
        ]
        expected_urls = [
            UncheckedExternalURL(prefix + path)
            for path in matched_paths
            for prefix in self.SOURCE
        ]
        self.assertItemsEqual(task.requires(), expected_urls)
    def downstream_input_tasks(self):
        """
        Return external tasks pointing at this job's per-date output files.

        A MultiOutputMapReduceJobTask only exposes a marker as its output, which cannot be
        used as input by other jobs; the UncheckedExternalURL tasks returned here can.
        """
        return [
            UncheckedExternalURL(self.output_path_for_key(day.isoformat()))
            for day in self.interval  # pylint: disable=not-an-iterable
        ]
    def downstream_input_tasks(self):
        """
        Provide a list of tasks that a downstream task would use as input.

        This is necessary because a MultiOutputMapReduceJobTask returns a marker as output.
        Note that the existence of the underlying urls is not verified here: one output file
        per date in the interval is assumed. A MapReduce job consuming these as input would
        fail if data were missing for any date, so this task creates empty output files for
        dates that have no data.
        """
        # One external task per date; urls are derived from the per-date output path.
        output_urls = (self.output_path_for_key(day.isoformat()) for day in self.interval)
        return [UncheckedExternalURL(url) for url in output_urls]