Example 1
    def assert_new_tmp_bucket(self, location, **runner_kwargs):
        """Assert that if we create an DataprocJobRunner with the given keyword
        args, it'll create a new tmp bucket with the given location
        constraint.
        """
        bucket_cache = self._gcs_client._cache_buckets

        existing_buckets = set(bucket_cache.keys())

        runner = DataprocJobRunner(conf_paths=[], **runner_kwargs)

        bucket_name, path = parse_gcs_uri(runner._cloud_tmp_dir)
        runner._create_fs_tmp_bucket(bucket_name, location=location)

        self.assertTrue(bucket_name.startswith('mrjob-'))
        self.assertNotIn(bucket_name, existing_buckets)
        self.assertEqual(path, 'tmp/')

        current_bucket = bucket_cache[bucket_name]
        self.assertEqual(current_bucket['location'], location)

        # Verify that we set up bucket lifecycle rules with 28-day retention
        first_lifecycle_rule = current_bucket['lifecycle']['rule'][0]
        self.assertEqual(first_lifecycle_rule['action'], dict(type='Delete'))
        self.assertEqual(first_lifecycle_rule['condition'],
                         dict(age=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS))
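The assertions above lean on parse_gcs_uri splitting a gs:// URI into a (bucket, blob path) pair. Below is a minimal sketch of that behaviour, under the assumption that the URI follows the usual gs://bucket/key layout; mrjob's real parse_gcs_uri may differ in edge cases such as error handling.

    from urllib.parse import urlparse

    def parse_gcs_uri_sketch(uri):
        """Split 'gs://bucket/key' into ('bucket', 'key')."""
        parts = urlparse(uri)
        if parts.scheme != 'gs' or not parts.netloc:
            raise ValueError('not a GCS URI: %r' % uri)
        return parts.netloc, parts.path.lstrip('/')

    # e.g. parse_gcs_uri_sketch('gs://mrjob-us-central1-abc123/tmp/')
    # returns ('mrjob-us-central1-abc123', 'tmp/'), which is why the test
    # expects path == 'tmp/'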
Example 2
    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if not e.resp.status == 404:
                raise

            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_args()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id
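The comment in _launch_cluster quotes the name pattern the Dataproc API enforces. The standalone sketch below (an illustration, not part of the runner) shows how a generated cluster ID could be validated against that pattern before the create call.

    import re

    # pattern quoted in the comment above, anchored to require a full match
    _CLUSTER_NAME_RE = re.compile(r'^[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?$')

    def _check_cluster_name(name):
        """Raise early if a cluster ID would be rejected by the Dataproc API."""
        if not _CLUSTER_NAME_RE.match(name):
            raise ValueError('invalid Dataproc cluster name: %r' % name)

    # a name like 'mrjob-us-central1-b-abc123def456' passes; names containing
    # uppercase letters or ending in '-' do not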
Example 5
    def put_gcs(self, gcs_uri, data):
        """Put data at gcs_uri, creating a bucket if necessary"""
        bucket, name = parse_gcs_uri(gcs_uri)

        try:
            self._fs.get_bucket(bucket)
        except google_errors.HttpError:
            self._fs.create_bucket(project=_TEST_PROJECT, name=bucket)

        bytes_io_obj = BytesIO(data)
        self.upload_io(bytes_io_obj, gcs_uri)
Example 7
File: case.py Project: Yelp/mrjob
    def put_gcs_multi(self, gcs_uri_to_data_map):
        client = self.storage_client()

        for uri, data in gcs_uri_to_data_map.items():
            bucket_name, blob_name = parse_gcs_uri(uri)

            bucket = client.bucket(bucket_name)
            if not bucket.exists():
                bucket.create()

            blob = bucket.blob(blob_name)
            blob.upload_from_string(data)
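A hypothetical call to this helper; the bucket name and payloads are made up, and upload_from_string accepts either bytes or text.

    # each bucket is created on demand before its blob is uploaded
    self.put_gcs_multi({
        'gs://walrus/data/foo': b'foo\n',
        'gs://walrus/data/bar': b'bar\n',
    })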
Example 9
    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            self.fs.put(path, gcs_uri, chunk_size=self._fs_chunk_size())

        self._wait_for_fs_sync()
Example 10
    def _upload_local_files_to_fs(self):
        """Copy local files tracked by self._upload_mgr to FS."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        log.info('Copying non-input files into %s' % self._upload_mgr.prefix)

        for path, gcs_uri in self._upload_mgr.path_to_uri().items():
            log.debug('uploading %s -> %s' % (path, gcs_uri))

            # TODO - mtai @ davidmarin - Implement put function for other FSs
            self.fs.put(path, gcs_uri)

        self._wait_for_fs_sync()
Example 11
    def download_io(self, src_uri, io_obj):
        """
        Clobber GCSFilesystem._download_io
        """
        bucket, name = parse_gcs_uri(src_uri)

        object_dict = _get_deep(self._cache_objects, [bucket, name])

        if not object_dict:
            raise Exception

        object_data = object_dict['_data']
        io_obj.write(object_data)
        return io_obj
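download_io above (and upload_io further down) navigate a nested dict keyed first by bucket and then by object name. _get_deep and _set_deep are not shown in this listing; the helpers below are a plausible sketch of them, written here only for illustration.

    def _get_deep(d, keys, default=None):
        """Walk nested dicts along keys, returning default if a level is missing."""
        for k in keys:
            if not isinstance(d, dict) or k not in d:
                return default
            d = d[k]
        return d

    def _set_deep(d, keys, value):
        """Create intermediate dicts as needed and set value at the leaf key."""
        for k in keys[:-1]:
            d = d.setdefault(k, {})
        d[keys[-1]] = value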
Example 14
    def _test_cloud_tmp_cleanup(self, mode, tmp_len):
        stdin = BytesIO(b"foo\nbar\n")

        mr_job = MRTwoStepJob(["-r", "dataproc", "-v", "-", "--cleanup", mode])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

            runner.run()

            # this is set and unset before we can get at it unless we do this
            list(runner.stream_output())

        objects_in_bucket = self._gcs_fs.api_client._cache_objects[tmp_bucket]
        self.assertEqual(len(objects_in_bucket), tmp_len)
Example 15
    def _test_cloud_tmp_cleanup(self, mode, tmp_len):
        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v', '-', '--cleanup', mode])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

            runner.run()

            # this is set and unset before we can get at it unless we do this
            list(runner.cat_output())

        objects_in_bucket = self._gcs_fs.api_client._cache_objects[tmp_bucket]
        self.assertEqual(len(objects_in_bucket), tmp_len)
Example 16
    def upload_io(self, io_obj, dest_uri):
        """
        Clobber GCSFilesystem._upload_io
        """
        bucket, name = parse_gcs_uri(dest_uri)

        assert bucket in self._cache_buckets

        io_obj.seek(0)

        data = io_obj.read()

        # TODO - io_obj.close() ?  Not sure if callers of this function would
        # expect their io_objs to be closed

        object_resp = _insert_object_resp(bucket=bucket, name=name, data=data)

        _set_deep(self._cache_objects, [bucket, name], object_resp)

        return object_resp
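A hypothetical round trip through this mock, assuming mock_fs is an instance of the mock filesystem these methods belong to, the target bucket already exists in _cache_buckets (upload_io asserts this), and _insert_object_resp stores the raw bytes under '_data' as download_io expects.

    from io import BytesIO

    buf = BytesIO(b'hello')
    mock_fs.upload_io(buf, 'gs://mrjob-test/tmp/hello.txt')

    out = mock_fs.download_io('gs://mrjob-test/tmp/hello.txt', BytesIO())
    assert out.getvalue() == b'hello'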
Example 18
    def _test_cloud_tmp_cleanup(self, mode, tmp_len):
        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v', '-', '--cleanup', mode])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

            runner.run()

            # this is set and unset before we can get at it unless we do this
            list(runner.cat_output())

            fs = runner.fs

        # with statement finishes, cleanup runs

        self.assertEqual(len(list(fs.client.bucket(tmp_bucket).list_blobs())),
                         tmp_len)
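Tests built on this helper pass one of mrjob's cleanup modes (such as 'ALL' or 'NONE') together with the number of tmp objects expected to remain afterwards. The callers below are placeholders; the expected counts depend on the job and are not taken from the project.

    def test_cleanup_all(self):
        # everything in the cloud tmp dir should be removed
        self._test_cloud_tmp_cleanup('ALL', 0)

    def test_cleanup_none(self):
        # tmp objects are left behind when cleanup is disabled
        self._test_cloud_tmp_cleanup('NONE', 5)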