Example #1
    def test_forward_put_with_part_size(self):
        fs = CompositeFilesystem()

        fs.add_fs('s3', self.s3_fs)

        fs.put('/path/to/file', 's3://walrus/file', part_size_mb=99999)
        self.s3_fs.put.assert_called_once_with(
            '/path/to/file', 's3://walrus/file', 99999)
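
These tests refer to self.s3_fs and self.hadoop_fs, which are presumably mock filesystems built in the test case's setUp(). A minimal sketch of such a fixture, assuming unittest.mock and mocks spec'd against mrjob's real filesystem classes (so that an attribute neither class defines, such as the client lookup in Example #4, raises AttributeError); the real fixture may differ:

import unittest
from unittest.mock import Mock

from mrjob.fs.hadoop import HadoopFilesystem
from mrjob.fs.s3 import S3Filesystem


class CompositeFilesystemTestCase(unittest.TestCase):
    # hypothetical fixture; the real test case may build its mocks differently

    def setUp(self):
        # mock S3 filesystem that only claims s3:// URIs
        self.s3_fs = Mock(spec=S3Filesystem)
        self.s3_fs.can_handle_path.side_effect = (
            lambda path: path.startswith('s3://'))

        # mock Hadoop filesystem that claims any URI (hdfs://, s3://, ...)
        self.hadoop_fs = Mock(spec=HadoopFilesystem)
        self.hadoop_fs.can_handle_path.side_effect = (
            lambda path: '://' in path)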
Example #2
    def test_forward_put(self):
        # put() is a special case since the path that matters comes second
        fs = CompositeFilesystem()

        fs.add_fs('s3', self.s3_fs)

        fs.put('/path/to/file', 's3://walrus/file')
        self.s3_fs.put.assert_called_once_with(
            '/path/to/file', 's3://walrus/file')
Example #3
    def test_forward_join(self):
        # join() is a special case since it takes multiple arguments
        fs = CompositeFilesystem()

        fs.add_fs('s3', self.s3_fs)

        self.assertEqual(fs.join('s3://walrus/fish', 'salmon'),
                         self.s3_fs.join.return_value)
        self.s3_fs.join.assert_called_once_with(
            's3://walrus/fish', 'salmon')
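
Examples #1-#3 exercise the two forwarding special cases called out in the comments: put() must dispatch on its second argument (the destination path), and join() takes a variable number of arguments. A minimal sketch of how CompositeFilesystem could forward them, where _fs_by_name and _fs_for_path() are hypothetical internal names (can_handle_path() and _disabled do appear in the examples below):

    # Hypothetical forwarding helpers; the real CompositeFilesystem may differ.
    def _fs_for_path(self, path):
        """Return the first registered, non-disabled filesystem that
        claims *path*."""
        for name, fs in self._fs_by_name.items():
            if name not in self._disabled and fs.can_handle_path(path):
                return fs
        raise IOError('no filesystem can handle %s' % path)

    def put(self, src, path, part_size_mb=None):
        # dispatch on the *destination* path, which comes second
        fs = self._fs_for_path(path)
        if part_size_mb is None:
            return fs.put(src, path)
        return fs.put(src, path, part_size_mb)

    def join(self, path, *paths):
        # dispatch on the first path, then pass every argument through
        return self._fs_for_path(path).join(path, *paths)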
Example #4
    def test_forward_fs_extensions(self):
        fs = CompositeFilesystem()

        fs.add_fs('s3', self.s3_fs)
        fs.add_fs('hadoop', self.hadoop_fs)

        self.assertEqual(fs.create_bucket, self.s3_fs.create_bucket)
        self.assertEqual(fs.get_hadoop_version,
                         self.hadoop_fs.get_hadoop_version)

        self.assertRaises(AttributeError, lambda: fs.client)
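
Example #4 shows that filesystem-specific extensions (create_bucket on S3, get_hadoop_version on Hadoop) are reachable directly on the composite object, and that an unknown attribute raises AttributeError. That kind of forwarding is typically done with __getattr__; a minimal sketch, again using the hypothetical _fs_by_name registry:

    # Hypothetical implementation; the real CompositeFilesystem may differ.
    def __getattr__(self, name):
        # forward extension methods to the first filesystem that has them
        # (relies on dicts preserving insertion order, Python 3.7+)
        for fs_name, fs in self._fs_by_name.items():
            if fs_name not in self._disabled and hasattr(fs, name):
                return getattr(fs, name)

        raise AttributeError(name)

Note that Examples #7 and #8 below also access sub-filesystems by their registered name (self.fs.gcs, self.fs.hadoop), so the real lookup presumably checks the registry's keys as well.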
Example #5
    def test_pick_fs(self):
        fs = CompositeFilesystem()

        fs.add_fs('s3', self.s3_fs)
        fs.add_fs('hadoop', self.hadoop_fs)

        self.assertEqual(fs.ls('s3://walrus/fish'),
                         self.s3_fs.ls.return_value)
        # hadoop fs could have handled it, but s3_fs got it first
        self.assertTrue(self.hadoop_fs.can_handle_path('s3://walrus/fish'))
        self.assertFalse(self.hadoop_fs.ls.called)

        self.assertEqual(fs.ls('hdfs:///user/hadoop/'),
                         self.hadoop_fs.ls.return_value)

        # don't move on to the next FS on an error (unlike old
        # CompositeFilesystem implementation)
        self.s3_fs.ls.side_effect = IOError

        self.assertRaises(IOError, fs.ls, 's3://walrus/fish')
Example #6
    def test_disable_fs(self):
        class NoCredentialsError(Exception):
            pass

        fs = CompositeFilesystem()

        # tentatively use S3 filesystem, if set up
        fs.add_fs('s3', self.s3_fs,
                  disable_if=lambda ex: isinstance(ex, NoCredentialsError))
        fs.add_fs('hadoop', self.hadoop_fs)

        self.s3_fs.ls.side_effect = NoCredentialsError

        # calling ls() on S3 fs disables it, so we move on to hadoop fs
        self.assertEqual(fs.ls('s3://walrus/'),
                         self.hadoop_fs.ls.return_value)
        self.assertTrue(self.s3_fs.ls.called)

        self.assertIn('s3', fs._disabled)

        # now that s3 fs is disabled, we won't even try to call it
        self.assertEqual(fs.cat('s3://walrus/fish'),
                         self.hadoop_fs.cat.return_value)
        self.assertFalse(self.s3_fs.cat.called)
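
This example adds the disable_if hook passed to add_fs(): when a forwarded call raises an exception that disable_if recognizes, the filesystem is added to _disabled and the call falls through to the next filesystem; any other exception propagates, as Example #5 showed. A minimal sketch of that dispatch loop, with _fs_by_name and _disable_if as hypothetical internal names; single-path methods like ls() and cat() would be thin wrappers around it:

    # Hypothetical dispatch loop; the real CompositeFilesystem may differ.
    def _forward(self, method_name, path, *args, **kwargs):
        for name, fs in self._fs_by_name.items():
            if name in self._disabled or not fs.can_handle_path(path):
                continue

            try:
                return getattr(fs, method_name)(path, *args, **kwargs)
            except Exception as ex:
                disable_if = self._disable_if.get(name)
                if disable_if and disable_if(ex):
                    # e.g. NoCredentialsError: disable this fs, try the next
                    self._disabled.add(name)
                    continue
                # unlike the old CompositeFilesystem, don't swallow errors
                raise

        raise IOError('no filesystem can handle %s' % path)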
Example #7
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'cluster_properties',
        'core_instance_config',
        'gcloud_bin',
        'master_instance_config',
        'network',
        'project_id',
        'service_account',
        'service_account_scopes',
        'subnet',
        'task_instance_config',
    }

    # no Spark support yet (see #1765)
    _STEP_TYPES = {'jar', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError('You must install google-cloud-logging and '
                              'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and at least 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances and
        # does NOT include the master. In other words, the minimum cluster
        # size is 3 machines: 1 master and 2 core instance workers.
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException('Dataproc expects at LEAST %d workers' %
                                    _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read from file already
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _fix_zone_and_region_opts(self):
        """Ensure that exactly one of region and zone is set."""
        if self._opts['region'] and self._opts['zone']:
            log.warning('you do not need to set region if you set zone')
            self._opts['region'] = None
            return

        if not (self._opts['region'] or self._opts['zone']):
            if environ.get('CLOUDSDK_COMPUTE_ZONE'):
                self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
            elif environ.get('CLOUDSDK_COMPUTE_REGION'):
                self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
            else:
                self._opts['region'] = _DEFAULT_GCE_REGION

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
            ))

    def _combine_opts(self, opt_list):
        """Blank out conflicts between *network*/*subnet* and
        *region*/*zone*."""
        opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone'])
        opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet'])

        # now combine opts, with region/zone blanked out
        return super(DataprocJobRunner, self)._combine_opts(opt_list)

    @property
    def cluster_client(self):
        return google.cloud.dataproc_v1beta2.ClusterControllerClient(
            **self._client_create_kwargs())

    @property
    def job_client(self):
        return google.cloud.dataproc_v1beta2.JobControllerClient(
            **self._client_create_kwargs())

    @property
    def logging_client(self):
        return google.cloud.logging.Client(credentials=self._credentials,
                                           project=self._project_id)

    def _client_create_kwargs(self):
        if self._opts['region']:
            endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
            return dict(channel=google.api_core.grpc_helpers.create_channel(
                endpoint, credentials=self._credentials))
        else:
            return dict(credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. Use "cluster_client"'
            ' or "job_client" instead.')

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            location = self._opts['region'] or _zone_to_region(
                self._opts['zone'])

            self._fs.add_fs(
                'gcs',
                GCSFilesystem(
                    credentials=self._credentials,
                    project_id=self._project_id,
                    part_size=self._upload_part_size(),
                    location=location,
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
                prefix='mrjob-'):
            tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)

            # NOTE - ambiguous GCP behavior: bucket locations are returned
            # in UPPERCASE even though the docs suggest lowercase (ticket
            # filed Apr 23, 2016; still the case as of Feb 12, 2018 with
            # google-cloud-sdk)
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name

    def _region(self):
        # region of cluster, which is either the region set by the user,
        # or the region derived from the zone they set.
        # used to pick bucket location and name cluster
        return self._opts['region'] or _zone_to_region(self._opts['zone'])

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._wait_for_fs_sync()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError('Output path %s already exists!' %
                          (self._output_dir, ))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._working_dir_mgr.paths('archive'):
            self._upload_mgr.add(path)

        if self._opts['hadoop_streaming_jar']:
            self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    ### Running the job ###

    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # close our SSH tunnel, if any
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for job in self._list_jobs(cluster_name=self._cluster_id,
                                   state_matcher=_STATE_MATCHER_ACTIVE):
            # Kill all active jobs with the same job_prefix as this job
            job_id = job.reference.job_id

            if not job_id.startswith(job_prefix):
                continue

            self._cancel_job(job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._delete_cluster(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _streaming_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a hadoop streaming job.
        """
        return dict(hadoop_job=dict(
            args=self._hadoop_streaming_jar_args(step_num),
            main_jar_file_uri=self._hadoop_streaming_jar_uri(),
        ))

    def _jar_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a Hadoop job that runs a JAR"""
        step = self._get_step(step_num)

        hadoop_job = {}

        hadoop_job['args'] = (self._interpolate_jar_step_args(
            step['args'], step_num))

        jar_uri = self._upload_mgr.uri(step['jar'])

        # can't specify main_class and main_jar_file_uri; see
        # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        if step.get('main_class'):
            hadoop_job['jar_file_uris'] = [jar_uri]
            hadoop_job['main_class'] = step['main_class']
        else:
            hadoop_job['main_jar_file_uri'] = jar_uri

        return dict(hadoop_job=hadoop_job)

    def _hadoop_streaming_jar_uri(self):
        if self._opts['hadoop_streaming_jar']:
            return self._upload_mgr.uri(self._opts['hadoop_streaming_jar'])
        else:
            return _HADOOP_STREAMING_JAR_URI

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        self.fs.mkdir(self._job_tmpdir)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_kwargs()
            self._create_cluster(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in ('RUNNING', 'UPDATING'):
            cluster = self._get_cluster(cluster_id)
            cluster_state = cluster.status.State.Name(cluster.status.state)

            if cluster_state in ('ERROR', 'DELETING'):
                raise DataprocException(cluster)

            self._wait_for_api('cluster to accept jobs')

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # run each step, one at a time
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(job_id,
                                            step_num=step_num,
                                            num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        step = self._get_step(step_num)

        # Clean-up step name
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Build step

        # job_kwarg is a single-item dict, where the key is 'hadoop_job',
        # 'spark_job', etc.
        if step['type'] == 'streaming':
            job_kwarg = self._streaming_step_job_kwarg(step_num)
        elif step['type'] == 'jar':
            job_kwarg = self._jar_step_job_kwarg(step_num)
        else:
            raise NotImplementedError('Unsupported step type: %r' %
                                      step['type'])

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._submit_job(step_name, job_kwarg)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result.reference.job_id
        assert job_id == step_name

        return job_id

    def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        log_interpretation['step'] = {}
        step_type = self._get_step(step_num)['type']

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            log_interpretation['step']['driver_output_uri'] = (
                job.driver_output_resource_uri)

            self._interpret_step_logs(log_interpretation, step_type)

            progress = log_interpretation['step'].get('progress')
            if progress:
                log.info(' ' + progress['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            if job_state == 'ERROR':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n\n' %
                              _format_error(error))

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(step_num=step_num,
                                          num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    ### log interpretation ###

    # step

    def _interpret_step_logs(self, log_interpretation, step_type):
        """Hook for interpreting step logs.

        Unlike with most runners, you may call this multiple times and it
        will continue to parse the step log incrementally, which is useful
        for getting job progress."""
        # don't turn this off even if read_logs opt is false; it's
        # the only way this runner can track job progress

        driver_output_uri = log_interpretation.get('step',
                                                   {}).get('driver_output_uri')

        if driver_output_uri:
            self._update_step_interpretation(log_interpretation['step'],
                                             driver_output_uri)

    def _update_step_interpretation(self, step_interpretation,
                                    driver_output_uri):
        new_lines = self._get_new_driver_output_lines(driver_output_uri)
        _interpret_new_dataproc_step_stderr(step_interpretation, new_lines)

    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri, dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                # log the location of job driver output just once
                log.info('  Parsing job driver output from %s*' %
                         driver_output_uri)
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs.gcs._get_blob(log_uri)

            try:
                new_data = log_blob.download_as_string(start=state['pos'])
            except (google.api_core.exceptions.NotFound,
                    google.api_core.exceptions.RequestRangeNotSatisfiable):
                # blob was just created, or no more data is available
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines

    # history

    def _interpret_history_log(self, log_interpretation):
        """Does nothing. We can't get the history logs, and we don't need
        them."""
        if not self._read_logs():
            return

        log_interpretation.setdefault('history', {})

    # task

    def _interpret_task_logs(self,
                             log_interpretation,
                             step_type,
                             error_attempt_ids=(),
                             partial=True):
        """Scan node manager log to find failed container IDs of failed
        tasks, and then scan the corresponding stderr and syslogs."""
        if 'task' in log_interpretation and (
                partial or not log_interpretation['task'].get('partial')):
            return  # already interpreted

        if not self._read_logs():
            return

        step_interpretation = log_interpretation.get('step') or {}

        application_id = step_interpretation.get('application_id')
        if not application_id:
            log.warning(
                "Can't parse node manager logs; missing application ID")
            return

        log_interpretation['task'] = self._task_log_interpretation(
            application_id, step_type, partial)

    def _task_log_interpretation(self,
                                 application_id,
                                 step_type,
                                 partial=True):
        """Helper for :py:meth:`_interpret_task_logs`"""
        # not bothering with _read_logs() since this is a helper method
        result = {}

        for container_id in self._failed_task_container_ids(application_id):
            error = _parse_task_syslog_records(
                self._task_syslog_records(application_id, container_id,
                                          step_type))

            if not error.get('hadoop_error'):
                # not sure if this ever happens, since we already know
                # which containers failed
                continue

            error['container_id'] = container_id

            # fix weird munging of java stacktrace
            error['hadoop_error']['message'] = _fix_java_stack_trace(
                error['hadoop_error']['message'])

            task_error = _parse_task_stderr(
                self._task_stderr_lines(application_id, container_id,
                                        step_type))

            if task_error:
                task_error['message'] = _fix_traceback(task_error['message'])
                error['task_error'] = task_error

            result.setdefault('errors', []).append(error)

            # if partial is true, bail out when we find the first task error
            if task_error and partial:
                result['partial'] = True
                return result

        return result

    def _failed_task_container_ids(self, application_id):
        """Stream container IDs of failed tasks, in reverse order."""
        container_id_prefix = 'container' + application_id[11:]

        log_filter = self._make_log_filter(
            'yarn-yarn-nodemanager',
            {'jsonPayload.class': _CONTAINER_EXECUTOR_CLASS_NAME})

        log.info('Scanning node manager logs for IDs of failed tasks...')

        # it doesn't seem to work to do self.logging_client.logger();
        # there's some RPC dispute about whether the log name should
        # be qualified by project name or not
        entries = self.logging_client.list_entries(
            filter_=log_filter, order_by=google.cloud.logging.DESCENDING)

        for entry in entries:
            message = entry.payload.get('message')
            if not message:
                continue

            m = _CONTAINER_EXIT_RE.match(message)
            if not m:
                continue

            returncode = int(m.group('returncode'))
            if not returncode:
                continue

            container_id = m.group('container_id')
            # skip containers that belong to some other step
            if not container_id.startswith(container_id_prefix):
                continue

            log.debug('  %s' % container_id)
            yield container_id

    def _task_stderr_lines(self, application_id, container_id, step_type):
        """Yield lines from a specific stderr log."""
        log_filter = self._make_log_filter(
            'yarn-userlogs',
            {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'stderr',
            })

        log.info('    reading stderr log...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        # use log4j parsing to handle tab -> newline conversion
        for record in _log_entries_to_log4j(entries):
            for line in record['message'].split('\n'):
                yield line

    def _task_syslog_records(self, application_id, container_id, step_type):
        """Yield log4j records from a specific syslog.
        """
        log_filter = self._make_log_filter(
            'yarn-userlogs',
            {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'syslog',
            })

        log.info('    reading syslog...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        return _log_entries_to_log4j(entries)

    # misc

    def _make_log_filter(self, log_name=None, extra_values=None):
        # we only want logs from this project, cluster, and region
        d = {}

        d['resource.labels.cluster_name'] = self._cluster_id
        d['resource.labels.project_id'] = self._project_id
        d['resource.labels.region'] = self._region()
        d['resource.type'] = 'cloud_dataproc_cluster'

        if log_name:
            d['logName'] = 'projects/%s/logs/%s' % (self._project_id, log_name)

        if extra_values:
            d.update(extra_values)

        return _log_filter_str(d)

    def counters(self):
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]

    ### Bootstrapping ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cluster = self._get_cluster(self._cluster_id)
        self._image_version = (cluster.config.software_config.image_version)
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(self._image_version,
                                           _DATAPROC_IMAGE_TO_HADOOP_VERSION)

    def _bootstrap_pre_commands(self):
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())."""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__

        # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
        # through the gcloud utility and the Google Cloud Console
        cluster_metadata['mrjob-max-secs-idle'] = str(
            int(self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            metadata=cluster_metadata,
            service_account_scopes=self._opts['service_account_scopes'],
        )

        if self._opts['network']:
            gce_cluster_config['network_uri'] = self._opts['network']

        if self._opts['subnet']:
            gce_cluster_config['subnetwork_uri'] = self._opts['subnet']

        if self._opts['service_account']:
            gce_cluster_config['service_account'] = (
                self._opts['service_account'])

        if self._opts['service_account_scopes']:
            gce_cluster_config['service_account_scopes'] = (
                self._opts['service_account_scopes'])

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(gce_cluster_config=gce_cluster_config,
                              initialization_actions=[
                                  dict(executable_file=init_script_uri)
                                  for init_script_uri in gcs_init_script_uris
                              ])

        # Task tracker
        master_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=1,
            instance_type=self._opts['master_instance_type'],
        )
        if self._opts['master_instance_config']:
            master_conf.update(self._opts['master_instance_config'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type'])
        if self._opts['core_instance_config']:
            worker_conf.update(self._opts['core_instance_config'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id,
            zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True)
        if self._opts['task_instance_config']:
            secondary_worker_conf.update(self._opts['task_instance_config'])

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        if secondary_worker_conf.get('num_instances'):
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        cluster_config['lifecycle_config'] = dict(idle_delete_ttl=dict(
            seconds=int(self._opts['max_mins_idle'] * 60)))

        software_config = {}

        if self._opts['cluster_properties']:
            software_config['properties'] = _values_to_text(
                self._opts['cluster_properties'])

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            software_config['image_version'] = self._opts['image_version']

        if software_config:
            cluster_config['software_config'] = software_config

        # in Python 2, dict keys loaded from JSON will be unicode, which
        # the Google protobuf objects don't like
        if PY2:
            cluster_config = _clean_json_dict_keys(cluster_config)

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(cluster_name=cluster_id,
                                               **self._project_id_and_region())

    def _create_cluster(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa

        self.cluster_client.create_cluster(cluster=cluster_data,
                                           **self._project_id_and_region())

    def _delete_cluster(self, cluster_id):
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id, **self._project_id_and_region())

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        return self.job_client.get_job(job_id=job_id,
                                       **self._project_id_and_region())

    def _cancel_job(self, job_id):
        return self.job_client.cancel_job(job_id=job_id,
                                          **self._project_id_and_region())

    def _submit_job(self, step_name, job_kwarg):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa

        submit_job_kwargs = dict(job=dict(
            reference=dict(project_id=self._project_id, job_id=step_name),
            placement=dict(cluster_name=self._cluster_id),
            **job_kwarg),
                                 **self._project_id_and_region())

        log.debug('  submit_job(%s)' %
                  ', '.join('%s=%r' % (k, v)
                            for k, v in sorted(submit_job_kwargs.items())))

        return self.job_client.submit_job(**submit_job_kwargs)

    def _project_id_and_region(self):
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            # ('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        if not self._cluster_id:
            return

        gcloud_bin = self._opts['gcloud_bin'] or ['gcloud']

        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return gcloud_bin + [
            'compute',
            'ssh',
            '--zone',
            zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)
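
As the class docstring above suggests, DataprocJobRunner can be instantiated without a script just to use its GCS filesystem and Dataproc clients. A minimal sketch, assuming default Google credentials and a project ID are already configured in the environment; the bucket URI is a placeholder:

from mrjob.dataproc import DataprocJobRunner

# conf_paths=[] skips loading mrjob.conf
runner = DataprocJobRunner(conf_paths=[])

# the composite filesystem (GCS + local) defined by the fs property above
for uri in runner.fs.ls('gs://your-bucket/tmp/'):
    print(uri)

# the Dataproc API clients are also exposed directly
cluster_client = runner.cluster_client
job_client = runner.job_client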
Example #8
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    def _default_opts(self):
        return combine_dicts(
            super(HadoopJobRunner, self)._default_opts(),
            dict(hadoop_tmp_dir='tmp/mrjob', ))

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # don't pass [] to fs; this means not to use hadoop until
            # fs.set_hadoop_bin() is called (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop', HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar
                or self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for jar_dir in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % jar_dir)

            streaming_jars = []
            for path in self.fs.ls(jar_dir):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._py_files():
            self._upload_mgr.add(path)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type'])
                and self._spark_master() != 'yarn'
                and self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not 'yarn'")

    def _spark_master(self):
        return self._opts['spark_master'] or 'yarn'

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'], ))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

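        # a rough sketch of the command this builds, assuming the hadoop
        # binary resolves to plain 'hadoop' (jar path and streaming args
        # are illustrative, not actual values):
        #
        #   ['hadoop', 'jar', '/path/to/hadoop-streaming.jar',
        #    '-files', ..., '-mapper', ..., '-reducer', ...]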
        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
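        #
        # e.g. 'file:///home/hadoop/lib/my.jar' becomes the plain local
        # path '/home/hadoop/lib/my.jar' (path is illustrative)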
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(self._interpolate_jar_step_args(
                step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']

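        # sketch of the return value, assuming the hadoop binary resolves
        # to plain 'hadoop': a single catch-all URI glob mapped to the
        # command used to copy manifest inputs to the local filesystem:
        #
        #   [('*://*', 'hadoop fs -copyToLocal')]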
        return [
            ('*://*', cmd_line(cp_to_local)),
        ]

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [
            _pick_counters(log_interpretation)
            for log_interpretation in self._log_interpretations
        ]
Example #10
0
File: runner.py Project: stug/mrjob
class MRJobRunner(object):
    """Abstract base class for all runners"""

    # this class handles the basic runner framework, options and config files,
    # arguments to mrjobs, and setting up job working dirs and environments.
    # this will put files from setup scripts, py_files, and bootstrap_mrjob
    # into the job's working dir, but won't actually run/import them
    #
    # command lines to run substeps (including Spark) are handled by
    # mrjob.bin.MRJobBinRunner

    #: alias for this runner, used on the command line with ``-r``
    alias = None

    # libjars is only here because the job can set it; might want to
    # handle this with a warning from the launcher instead
    OPT_NAMES = {
        'bootstrap_mrjob', 'check_input_paths', 'cleanup',
        'cleanup_on_failure', 'cmdenv', 'jobconf', 'label', 'libjars',
        'local_tmp_dir', 'owner', 'py_files', 'read_logs', 'setup',
        'upload_archives', 'upload_dirs', 'upload_files'
    }

    # re-define this as a set of step types supported by your runner
    _STEP_TYPES = None

    # if this is true, when bootstrap_mrjob is true, create a mrjob.zip
    # and patch it into the *py_files* option
    _BOOTSTRAP_MRJOB_IN_PY_FILES = True

    ### methods to call from your batch script ###

    def __init__(self,
                 mr_job_script=None,
                 conf_paths=None,
                 extra_args=None,
                 file_upload_args=None,
                 hadoop_input_format=None,
                 hadoop_output_format=None,
                 input_paths=None,
                 output_dir=None,
                 partitioner=None,
                 sort_values=None,
                 stdin=None,
                 steps=None,
                 step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    :mrjob-opt:`hadoop_streaming_jar`).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see
                                     :mrjob-opt:`hadoop_streaming_jar`).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :param steps: a list of descriptions of steps to run (see :doc:`step`
                      for description formats)
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None
                     for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)])

        log.debug('Active configuration:')
        log.debug(
            pprint.pformat({
                opt_key: self._obfuscate_opt(opt_key, opt_value)
                for opt_key, opt_value in self._opts.items()
            }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark
        # if not using a setup script.
        self._spark_files = []
        self._spark_archives = []

        # set this to an :py:class:`~mrjob.setup.UploadDirManager` in
        # runners that upload files to HDFS, S3, etc.
        self._upload_mgr = None

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key()

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file',
                                        hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive',
                                        hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir',
                                        hash_path,
                                        must_name='upload_archives')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add('archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where to keep the input manifest
        self._input_manifest_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # check and store *steps*
        self._steps = None
        if steps is None:
            if not mr_job_script:
                self._steps = []
            # otherwise we'll load steps on-the-fly, see _load_steps()
        else:
            self._check_steps(steps)
            self._steps = copy.deepcopy(steps)

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Options ####

    def _default_opts(self):
        try:
            owner = getpass.getuser()
        except Exception:
            owner = None

        return dict(
            check_input_paths=True,
            cleanup=['ALL'],
            cleanup_on_failure=['NONE'],
            owner=owner,
        )

    def _combine_confs(self, source_and_opt_list):
        """Combine several opt dictionaries into one.

        *source_and_opt_list* is a list of tuples of *source*,
        *opts* where *opts* is a dictionary and *source* is either
        None or a description of where the opts came from (usually a path).

        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        """
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list
        ]

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one. *opt_list*
        is a list of dictionaries containing validated options

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc.
        about the options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner function. This
        won't necessarily include every opt name (we default to
        :py:func:`~mrjob.conf.combine_value`).
        """
        return _combiners(self.OPT_NAMES)

    def _fix_opts(self, opts, source=None):
        """Take an options dictionary, and either return a sanitized
        version of it, or raise an exception.

        *source* is either a string describing where the opts came from
        or None.

        This ensures that opt dictionaries are really dictionaries
        and handles deprecated options.
        """
        if source is None:
            source = 'defaults'  # defaults shouldn't trigger warnings

        if not isinstance(opts, dict):
            raise TypeError('options for %s (from %s) must be a dict' %
                            (self.alias, source))

        deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)

        results = {}

        for k, v in sorted(opts.items()):
            # rewrite deprecated aliases
            if k in deprecated_aliases:
                if v is None:  # don't care
                    continue

                aliased_opt = deprecated_aliases[k]

                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' %
                            (k, source, aliased_opt))

                if opts.get(aliased_opt) is not None:
                    continue  # don't overwrite non-aliased opt

                k = aliased_opt

            if k in self.OPT_NAMES:
                results[k] = None if v is None else self._fix_opt(k, v, source)
            elif v:
                log.warning('Unexpected option %s (from %s)' % (k, source))

        return results

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix a single option, returning its correct value or raising
        an exception. This is not called for options that are ``None``.

        This currently handles cleanup opts.

        Override this if you require additional opt validation or cleanup.
        """
        if opt_key in ('cleanup', 'cleanup_on_failure'):
            return self._fix_cleanup_opt(opt_key, opt_value, source)
        else:
            return opt_value

    def _fix_cleanup_opt(self, opt_key, opt_value, source):
        """Fix a cleanup option, or raise ValueError."""
        if isinstance(opt_value, string_types):
            opt_value = [opt_value]

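        # e.g. the string 'ALL' has become ['ALL'] at this point, while
        # something like ['NONE', 'LOGS'] will fail the check below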
        if 'NONE' in opt_value and len(set(opt_value)) > 1:
            raise ValueError('Cannot clean up both nothing and something!'
                             ' (%s option from %s)' % (opt_key, source))

        for cleanup_type in opt_value:
            if cleanup_type not in CLEANUP_CHOICES:
                raise ValueError(
                    '%s must be one of %s, not %s (from %s)' %
                    (opt_key, ', '.join(CLEANUP_CHOICES), opt_value, source))

        return opt_value

    def _obfuscate_opt(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to obfuscate
        credentials, etc."""
        return opt_value

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem.
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem()
            self._fs.add_fs('local', LocalFilesystem())
        return self._fs

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise :py:class:`~mrjob.step.StepFailedException` if there
        are any problems (except on
        :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the
        actual exception that caused the step to fail).
        """
        if self._ran_job:
            raise ValueError('Job already ran!')

        if self._num_steps() == 0:
            raise ValueError('Job has no steps!')

        self._create_dir_archives()
        # TODO: no point in checking input paths if we're going to
        # make a manifest out of them
        self._check_input_paths()
        self._add_input_files_for_upload()
        self._create_input_manifest_if_needed()
        self._run()
        self._ran_job = True

        last_step = self._get_steps()[-1]

        # only print this message if the last step uses our output dir
        if 'args' not in last_step or OUTPUT in last_step['args']:
            log.info('job output is in %s' % self._output_dir)

    def cat_output(self):
        """Stream the job's output as a stream of ``bytes``. If there are
        multiple output files, there will be an empty bytestring
        (``b''``) between them.

        .. versionadded:: 0.6.0

           In previous versions, you'd use :py:meth:`stream_output`.
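
        A usage sketch (assumes ``mr_job`` is an instantiated
        :py:class:`~mrjob.job.MRJob`; the output path is illustrative)::

            with mr_job.make_runner() as runner:
                runner.run()
                with open('output.bin', 'wb') as f:
                    for chunk in runner.cat_output():
                        f.write(chunk)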
        """
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise ValueError('Run the job before streaming output')

        if self._closed is True:
            log.warning(
                'WARNING! Trying to stream output from a closed runner, output'
                ' will probably be empty.')

        log.info('Streaming final output from %s...' % output_dir)

        def split_path(path):
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:
                    break

                yield name

                path = base

        def ls_output():
            for filename in self.fs.ls(output_dir):
                subpath = filename[len(output_dir):]
                # Hadoop ignores files and dirs inside the output dir
                # whose names start with '_' or '.'. See #1337.
                if not (any(name[0] in '_.' for name in split_path(subpath))):
                    yield filename

        for i, filename in enumerate(ls_output()):
            if i > 0:
                yield b''  # EOF of previous file

            for chunk in self.fs._cat_file(filename):
                yield chunk

    def stream_output(self):
        """Like :py:meth:`cat_output` except that it groups bytes into
        lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``.

        .. deprecated:: 0.6.0
        """
        log.warning('stream_output() is deprecated and will be removed in'
                    ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                    ' instead.')

        return to_lines(self.cat_output())

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
        else:
            return mode or self._opts['cleanup']

    def _cleanup_cloud_tmp(self):
        """Cleanup any files/directories on cloud storage (e.g. S3) we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        """
        pass  # only EMR runner does this

    def _cleanup_hadoop_tmp(self):
        """Cleanup any files/directories on HDFS we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        """
        pass  # only Hadoop runner does this

    def _cleanup_local_tmp(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple
        times.

        This particular function removes the local tmp directory
        referenced by self._local_tmp_dir.

        This won't remove output_dir if it's outside of our tmp dir.
        """
        if self._local_tmp_dir:
            log.info('Removing temp directory %s...' % self._local_tmp_dir)
            try:
                shutil.rmtree(self._local_tmp_dir)
            except OSError as e:
                log.exception(e)

        self._local_tmp_dir = None

    def _cleanup_cluster(self):
        """Terminate the cluster if there is one."""
        pass  # this only happens on EMR

    def _cleanup_logs(self):
        """Cleanup any log files that are created as a side-effect of the job.
        """
        pass  # this only happens on EMR

    def _cleanup_job(self):
        """Stop any jobs that we created that are still running."""
        pass  # currently disabled (see #1241)

    def cleanup(self, mode=None):
        """Clean up running jobs, temp files, and logs, subject to the
        *cleanup* option passed to the constructor.

        If you create your runner in a ``with`` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...

            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should be
                     a list of strings from
                     :py:data:`~mrjob.options.CLEANUP_CHOICES`
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        if self._script_path and not self._ran_job:
            if mode_has('CLUSTER', 'ALL'):
                self._cleanup_cluster()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
            self._cleanup_cloud_tmp()

        if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
            self._cleanup_hadoop_tmp()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job.
        """
        raise NotImplementedError

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""
        self.cleanup()

    ### more runner information ###

    def get_opts(self):
        """Get options set for this runner, as a dict."""
        log.warning('get_opts() is deprecated and will be removed in v0.7.0')
        return copy.deepcopy(self._opts)

    def get_job_key(self):
        """Get the unique key for the job run by this runner.
        This has the format ``label.owner.date.time.microseconds``
        """
        return self._job_key

    def get_output_dir(self):
        """Find the directory containing the job output. If the job hasn't
        run yet, returns None"""
        if self._script_path and not self._ran_job:
            return None

        return self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
        all.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()"""
        if not self._local_tmp_dir:
            tmp_dir = (self._opts['local_tmp_dir'] or tempfile.gettempdir())

            path = os.path.join(tmp_dir, self._job_key)
            log.info('Creating temp directory %s' % path)
            if os.path.isdir(path):
                shutil.rmtree(path)
            os.makedirs(path)
            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job. Optionally,
        you can specify a custom label or owner (otherwise we use
        :py:meth:`_label` and :py:meth:`_owner`).

        We use this to choose the output directory, etc. for the job.
        """
        if label is None:
            label = self._label()

        if owner is None:
            owner = self._owner()

        now = datetime.datetime.utcnow()
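        # e.g. 'mr_word_freq_count.dave.20190101.235959.123456'
        # (label, owner, UTC date, time, microseconds; values illustrative)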
        return '%s.%s.%s.%06d' % (label, owner, now.strftime('%Y%m%d.%H%M%S'),
                                  now.microsecond)

    def _label(self):
        """Return *label* opt, or if not set, the name of the file
        containing the MRJob, minus extension, or if none, ``'no_script'``"""
        if self._opts['label']:
            return self._opts['label']
        elif self._script_path:
            return os.path.basename(self._script_path).split('.')[0]
        else:
            return 'no_script'

    def _owner(self):
        """Return *owner* opt (which defaults to :py:func:`getpass.getuser`),
        or ``'no_user'`` if not set."""
        if self._opts['owner']:
            # owner opt defaults to getpass.getuser()
            return self._opts['owner']
        else:
            return 'no_user'

    def _get_steps(self):
        """If *steps* was not set at init time, call the job script to
        find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`.
        """
        if self._steps is None:
            log.warning('querying jobs for steps is deprecated and'
                        ' will go away in v0.7.0')
            steps = self._load_steps()
            self._check_steps(steps)
            self._steps = steps

        return self._steps

    def _load_steps(self):
        """Ask job how many steps it has, and whether
        there are mappers and reducers for each step.

        Returns output as described in :ref:`steps-format`.

        If this is called, you can assume self._script_path is set.
        """
        raise NotImplementedError

    def _check_steps(self, steps):
        """Look at the step definition (*steps*). If it is not supported by
        the runner, raise :py:class:`NotImplementedError`. If it is not
        supported by mrjob, raise :py:class:`ValueError`.
        """
        if not self._STEP_TYPES:
            # use __class__.__name__ because only MRJobRunner would
            # trigger this
            raise NotImplementedError('%s cannot run steps!' %
                                      self.__class__.__name__)

        for step_num, step in enumerate(steps):
            if step.get('type') not in self._STEP_TYPES:
                raise NotImplementedError(
                    'step %d has type %r, but %s runner only supports:'
                    ' %s' % (step_num, step.get('type'), self.alias, ', '.join(
                        sorted(self._STEP_TYPES))))

            if step.get('input_manifest') and step_num != 0:
                raise ValueError('step %d may not take an input manifest (only'
                                 ' first step can)' % step_num)

            # some step types assume a MRJob script
            if not self._script_path:
                if step['type'] == 'spark':
                    raise ValueError(
                        "SparkStep (step %d) can't run without a MRJob script"
                        " (try SparkScriptStep instead)" % step_num)

                elif step['type'] == 'streaming':
                    for mrc in ('mapper', 'combiner', 'reducer'):
                        if not step.get(mrc):
                            continue

                        substep = step[mrc]
                        if substep['type'] == 'script':
                            raise ValueError(
                                "%s (step %d) can't run without a MRJob"
                                " script" % (mrc, step_num))

    def _get_step(self, step_num):
        """Get a single step (calls :py:meth:`_get_steps`)."""
        return self._get_steps()[step_num]

    def _num_steps(self):
        """Get the number of steps (calls :py:meth:`_get_steps`)."""
        return len(self._get_steps())

    def _uses_input_manifest(self):
        """Does the first step take an input manifest?"""
        return bool(self._get_step(0).get('input_manifest'))

    def _has_streaming_steps(self):
        """Are any of our steps Hadoop Streaming steps?"""
        return any(step['type'] == 'streaming' for step in self._get_steps())

    def _has_spark_steps(self):
        """Are any of our steps Spark steps? (e.g. spark, spark_jar,
        spark_script)

        Generally used to determine if we need to install Spark on a cluster.
        """
        return any(
            _is_spark_step_type(step['type']) for step in self._get_steps())

    def _has_pyspark_steps(self):
        """Do any of our steps involve running Python on Spark?
        Includes spark and spark_script types, but not spark_jar.

        Generally used to tell if we need a Spark setup script.
        """
        return any(
            _is_pyspark_step_type(step['type']) for step in self._get_steps())

    def _args_for_task(self, step_num, mrc):
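        # e.g. for step 0's mapper this returns something like
        # ['--step-num=0', '--mapper'], plus any extra args for the job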
        return [
            '--step-num=%d' % step_num,
            '--%s' % mrc,
        ] + self._mr_job_extra_args()

    def _mr_job_extra_args(self, local=False):
        """Return arguments to add to every invocation of MRJob.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        """
        result = []

        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if local:
                    result.append(extra_arg['path'])
                else:
                    result.append(self._working_dir_mgr.name(**extra_arg))
            else:
                result.append(extra_arg)

        return result

    def _dir_archive_path(self, dir_path):
        """Assign a path for the archive of *dir_path* but don't
        actually create anything."""
        if dir_path not in self._dir_to_archive_path:
            # we can check local paths now
            if not (is_uri(dir_path) or os.path.isdir(dir_path)):
                raise OSError('%s is not a directory!' % dir_path)

            name = name_uniquely(dir_path,
                                 names_taken=self._dir_archive_names_taken)
            self._dir_archive_names_taken.add(name)

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]

    def _create_dir_archives(self):
        """Call this to create all dir archives"""
        for dir_path in sorted(set(self._dir_to_archive_path)):
            self._create_dir_archive(dir_path)

    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`archive_dir`"""
        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):
            os.makedirs(os.path.dirname(tar_gz_path))

        # for remote files
        tmp_download_path = os.path.join(self._get_local_tmp_dir(),
                                         'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError('attempted to archive %s into itself!' %
                                  tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' %
                             (path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs.cat(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        self._dir_archives_created.add(tar_gz_path)

    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return self._opts['interpreter'] is None
        else:
            return bool(self._opts['bootstrap_mrjob'])

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be."""
        if self._input_manifest_path:
            return [self._input_manifest_path]

        if '-' in self._input_paths:
            if self._stdin_path is None:
                # tell the user, so they don't think the process has stalled
                log.info('reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test data)
                        if not line.endswith(b'\n'):
                            line += b'\n'
                        stdin_file.write(line)

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p for p in self._input_paths]

    def _create_input_manifest_if_needed(self):
        """Create a file with a list of URIs of input files."""
        if self._input_manifest_path or not self._uses_input_manifest():
            return

        uris = []

        log.info('finding input files to add to manifest...')

        for path in self._get_input_paths():
            log.debug('  in %s' % path)
            if is_uri(path):
                # URIs might be globs
                for uri in self.fs.ls(path):
                    uris.append(uri)
            else:
                # local paths are expected to be single files
                # (shell would resolve globs)
                if self._upload_mgr:
                    uris.append(self._upload_mgr.uri(path))
                else:
                    # just make sure job can find files from its working dir
                    uris.append(os.path.abspath(path))

        log.info('found %d input files' % len(uris))

        path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
        self._write_script(uris, path, 'input manifest')

        self._input_manifest_path = path
        if self._upload_mgr:
            self._upload_mgr.add(self._input_manifest_path)

    def _check_input_paths(self):
        """Check that input exists prior to running the job, if the
        `check_input_paths` option is true."""
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            self._check_input_path(path)

    def _check_input_path(self, path):
        """Raise :py:class:`IOError` if the given input does not exist or
        is otherwise invalid. Override this to provide custom check
        behavior."""
        if path == '-':
            return  # STDIN always exists

        if not self.fs.can_handle_path(path):
            return  # no way to check (e.g. non-S3 URIs on EMR)

        if not self.fs.exists(path):
            raise IOError('Input path %s does not exist!' % (path, ))

    def _add_input_files_for_upload(self):
        """If there is an upload manager, add input files to it."""
        if self._upload_mgr:
            for path in self._get_input_paths():
                self._upload_mgr.add(path)

    def _intermediate_output_dir(self, step_num, local=False):
        """A directory for intermediate output for the given step number."""
        join = os.path.join if local else posixpath.join

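        # e.g. step 2's intermediate output lands in a subdir named '0002'
        # of the step output dir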
        return join(self._step_output_dir or self._default_step_output_dir(),
                    '%04d' % step_num)

    def _default_step_output_dir(self):
        """Where to put output for steps other than the last one,
        if not specified by the *output_dir* constructor keyword.
        Usually you want this to be on HDFS (most efficient).

        Define this in your runner subclass.
        """
        raise NotImplementedError

    def _step_input_uris(self, step_num):
        """A list of URIs to use as input for the given step. For all
        except the first step, this list will have a single item (a
        directory)."""
        if step_num == 0:
            return [
                self._upload_mgr.uri(path)
                if self._upload_mgr else to_uri(path)
                for path in self._get_input_paths()
            ]
        else:
            return [to_uri(self._intermediate_output_dir(step_num - 1))]

    def _step_output_uri(self, step_num):
        """URI to use as output for the given step. This is either an
        intermediate dir (see :py:meth:`intermediate_output_uri`) or
        ``self._output_dir`` for the final step."""
        if step_num == len(self._get_steps()) - 1:
            return to_uri(self._output_dir)
        else:
            return to_uri(self._intermediate_output_dir(step_num))

    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        Also translate jobconfs to the current Hadoop version, if necessary.
        """

        step = self._get_step(step_num)

        # _sort_values_jobconf() isn't relevant to Spark,
        # but it doesn't do any harm either

        jobconf = combine_jobconfs(self._sort_values_jobconf(),
                                   self._opts['jobconf'], step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        # and log a warning
        hadoop_version = self.get_hadoop_version()
        if hadoop_version:
            jobconf = translate_jobconf_dict(jobconf, hadoop_version)

        return jobconf

    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        """
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
            else:
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf

    def _sort_values_partitioner(self):
        """Partitioner to use with *sort_values* keyword to the constructor."""
        if self._sort_values:
            return _SORT_VALUES_PARTITIONER
        else:
            return None

    def _upload_args(self):
        # just upload every file and archive in the working dir manager
        return self._upload_args_helper('-files', None, '-archives', None)

    def _upload_args_helper(self, files_opt_str, files, archives_opt_str,
                            archives):
        args = []

        file_hash_paths = list(self._arg_hash_paths('file', files))
        if file_hash_paths:
            args.append(files_opt_str)
            args.append(','.join(file_hash_paths))

        archive_hash_paths = list(self._arg_hash_paths('archive', archives))
        if archive_hash_paths:
            args.append(archives_opt_str)
            args.append(','.join(archive_hash_paths))

        return args

    def _arg_hash_paths(self, type, named_paths=None):
        """Helper function for the *upload_args methods."""
        if named_paths is None:
            # just return everything managed by _working_dir_mgr
            named_paths = sorted(
                self._working_dir_mgr.name_to_path(type).items())

        for name, path in named_paths:
            if not name:
                name = self._working_dir_mgr.name(type, path)

            if self._upload_mgr:
                uri = self._upload_mgr.uri(path)
            else:
                uri = path

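            # each yielded value has the form 'URI#NAME', e.g.
            # 'hdfs:///user/hadoop/tmp/mrjob/files/foo.py#foo.py' (path is
            # illustrative); Hadoop makes the upload visible as NAME in the
            # task's working directory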
            yield '%s#%s' % (uri, name)

    def _write_script(self, lines, path, description):
        """Write text of a setup script, input manifest, etc. to the given
        file.

        By default, this writes binary data. Redefine
        :py:meth:`_write_script_lines` to use other line endings.

        :param lines: a list of lines as ``str``
        :param path: path of file to write to
        :param description: what we're writing to, for debug messages
        """
        log.debug('Writing %s to %s:' % (description, path))
        for line in lines:
            log.debug('  ' + line)

        self._write_script_lines(lines, path)

    def _write_script_lines(self, lines, path):
        """Write text to the given file. By default, this writes
        binary data, but can be redefined to use local line endings."""
        with open(path, 'wb') as f:
            for line in lines:
                f.write((line + '\n').encode('utf-8'))
Example #11
0
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.

    See :ref:`running-on-your-spark-cluster` for more information.

    The Spark runner can also run "classic" MRJobs directly on Spark, without
    using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.

    .. versionadded:: 0.6.8
    """
    alias = 'spark'

    # other than ``spark_*``, these options are only used for filesystems
    #
    # max_output_files doesn't appear here because it can only be read from
    # the command line, not mrjob.conf (see #2040)
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'emulate_map_input_file',
        'gcs_region',  # used when creating buckets on GCS
        'hadoop_bin',
        'project_id',  # used by GCS filesystem
        's3_endpoint',
        's3_region',  # used when creating buckets on S3
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # everything except Hadoop JARs
    # streaming jobs will be run using mrjob/spark/harness.py (see #1972)
    _STEP_TYPES = {
        'spark', 'spark_jar', 'spark_script', 'streaming',
    }

    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        if self._opts['spark_tmp_dir']:
            self._check_spark_tmp_dir_opt()

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []

        # TODO: we may eventually want log interpretation, but it shouldn't
        # include counters, as they are not found in logs.

    def _check_spark_tmp_dir_opt(self):
        # warn if spark_tmp_dir isn't actually visible to Spark executors
        # (see #2062)
        tmp_dir_is_local = to_uri(
            self._opts['spark_tmp_dir']).startswith('file://')
        spark_master_is_local = self._spark_master().startswith('local')

        if tmp_dir_is_local != spark_master_is_local:
            log.warning(
                'Warning: executors on Spark master %s may not be able to'
                ' access spark_tmp_dir %s' %
                (self._spark_master(), self._opts['spark_tmp_dir']))

    def _check_step(self, step, step_num):
        """Don't try to run steps that include commands or use manifests."""
        super(SparkMRJobRunner, self)._check_step(step, step_num)

        if step.get('input_manifest'):
            raise NotImplementedError(
                'spark runner does not support input manifests')

        # we don't currently support commands, but we *could* (see #1956).
        if step['type'] == 'streaming':
            if not self._mrjob_cls:
                raise ValueError(
                    'You must set mrjob_cls to run streaming steps')

            for mrc in ('mapper', 'combiner', 'reducer'):
                if step.get(mrc):
                    if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
                        raise NotImplementedError(
                            "step %d's %s runs a command, but spark"
                            " runner does not support commands" % (
                                step_num, mrc))

    def _default_opts(self):
        return combine_dicts(
            super(SparkMRJobRunner, self)._default_opts(),
            dict(
                cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
            ),
        )

    def _run(self):
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _pick_spark_tmp_dir(self):
        if self._opts['spark_tmp_dir']:
            return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)
        else:
            master = self._spark_master() or 'local'
            if master.startswith('local'):  # including local-cluster
                # need a local temp dir
                # add "-spark" so we don't collide with default local temp dir
                return os.path.join(
                    gettempdir(), self._job_key + '-spark')
            else:
                # use HDFS (same default as HadoopJobRunner)
                return posixpath.join(
                    fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

    def _default_step_output_dir(self):
        return self.fs.join(self._spark_tmp_dir, 'step-output')

    def _counter_output_dir(self, step_num):
        return self.fs.join(
            self._spark_tmp_dir, 'counter-output-step-%d' % step_num)

    def counters(self):
        return deepcopy(self._counters)

    @property
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['project_id'],
                    location=self._opts['gcs_region'],
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ), disable_if=_is_permanent_google_error)

            # Hadoop FS is responsible for all URIs that fall through to it
            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    # making mr_job_script visible in Spark

    def _job_script_module_name(self):
        """A unique module name to use with the MRJob script."""
        return re.sub(r'[^\w\d]', '_', self._job_key)

    def _create_job_script_zip(self):
        if not self._job_script_zip_path:
            zip_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
            name_in_zip = self._job_script_module_name() + '.py'

            log.debug('archiving %s -> %s as %s' % (
                self._script_path, zip_path, name_in_zip))
            with _create_zip_file(zip_path) as zip_file:
                zip_file.write(self._script_path, arcname=name_in_zip)

            self._job_script_zip_path = zip_path

        return self._job_script_zip_path

    def _py_files(self):
        """Patch in :py:attr:`_job_script_zip_path`, if running streaming
        steps."""
        py_files = super(SparkMRJobRunner, self)._py_files()

        if self._has_streaming_steps():
            py_files.append(self._create_job_script_zip())

        return py_files

    # running the job

    def _run_steps_on_spark(self):
        steps = self._get_steps()

        for group in self._group_steps(steps):
            step_num = group['step_num']
            last_step_num = step_num + len(group['steps']) - 1

            # the Spark harness can run several streaming steps in one job
            if step_num == last_step_num:
                step_desc = 'step %d' % (step_num + 1)
            else:
                step_desc = 'steps %d-%d' % (step_num + 1, last_step_num + 1)

            log.info('Running %s of %d' % (step_desc, len(steps)))

            self._run_step_on_spark(group['steps'][0], step_num, last_step_num)

    def _group_steps(self, steps):
        """Group streaming steps together."""
        # a list of dicts with:
        #
        # type -- shared type of steps
        # steps -- list of steps in group
        # step_num -- (0-indexed) number of first step
        groups = []

        for step_num, step in enumerate(steps):
            # should we add *step* to existing group of streaming steps?
            if (step['type'] == 'streaming' and groups and
                    groups[-1]['type'] == 'streaming' and
                    step.get('jobconf') ==
                    groups[-1]['steps'][0].get('jobconf')):
                groups[-1]['steps'].append(step)
            else:
                # start a new step group
                groups.append(dict(
                    type=step['type'],
                    steps=[step],
                    step_num=step_num))

        return groups

    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num, last_step_num=last_step_num,
                num_steps=self._num_steps())

    def _spark_script_path(self, step_num):
        """For streaming steps, return the path of the harness script
        (and handle other spark step types the usual way)."""
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._spark_harness_path()
        else:
            return super(SparkMRJobRunner, self)._spark_script_path(step_num)

    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-input-format',
                     self._hadoop_input_format or ''])

        # --hadoop-output-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-output-format',
                     self._hadoop_output_format or ''])

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc', json.dumps(steps)])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        if self._opts['emulate_map_input_file']:
            args.append('--emulate-map-input-file')

        return args

    def _spark_harness_path(self):
        """Where to find the Spark harness."""
        path = mrjob.spark.harness.__file__
        if path.endswith('.pyc'):
            path = path[:-1]
        return path

    # "streaming" steps run on Spark too

    def _has_spark_steps(self):
        """Treat streaming steps as Spark steps."""
        return (super(SparkMRJobRunner, self)._has_spark_steps() or
                self._has_streaming_steps())

    def _has_hadoop_streaming_steps(self):
        # the Spark runner doesn't run "streaming" steps on Hadoop
        return False

    def _has_streaming_steps(self):
        """Are any of our steps "streaming" steps that would normally run
        on Hadoop Streaming?"""
        return any(step['type'] == 'streaming'
                   for step in self._get_steps())

    def _is_pyspark_step(self, step):
        """Treat streaming steps as Spark steps that use Python."""
        return (super(SparkMRJobRunner, self)._is_pyspark_step(step) or
                step['type'] == 'streaming')
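
The grouping rule in _group_steps() above is small enough to demonstrate on its
own. Below is a minimal, self-contained sketch (not part of mrjob; the step
dicts are hypothetical) showing how consecutive streaming steps with the same
jobconf are batched into one Spark job, while any other step type starts a new
group:

    def group_steps(steps):
        """Standalone re-statement of SparkMRJobRunner._group_steps()."""
        groups = []
        for step_num, step in enumerate(steps):
            if (step['type'] == 'streaming' and groups and
                    groups[-1]['type'] == 'streaming' and
                    step.get('jobconf') ==
                    groups[-1]['steps'][0].get('jobconf')):
                groups[-1]['steps'].append(step)
            else:
                groups.append(dict(
                    type=step['type'], steps=[step], step_num=step_num))
        return groups

    steps = [
        dict(type='streaming'),
        dict(type='streaming'),
        dict(type='spark'),
        dict(type='streaming', jobconf={'mapreduce.job.reduces': '1'}),
    ]

    print([(g['step_num'], g['type'], len(g['steps']))
           for g in group_steps(steps)])
    # -> [(0, 'streaming', 2), (2, 'spark', 1), (3, 'streaming', 1)]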
Example #12
0
File: runner.py Project: Yelp/mrjob
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.

    See :ref:`running-on-your-spark-cluster` for more information.

    The Spark runner can also run "classic" MRJobs directly on Spark, without
    using Hadoop streaming. See :ref:`classic-mrjobs-on-spark`.

    .. versionadded:: 0.6.8
    """
    alias = 'spark'

    # other than ``spark_*``, these options are only used for filesystems
    #
    # max_output_files doesn't appear here because it can only be read from
    # the command line, not mrjob.conf (see #2040)
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'gcs_region',  # used when creating buckets on GCS
        'hadoop_bin',
        'project_id',  # used by GCS filesystem
        's3_endpoint',
        's3_region',  # used when creating buckets on S3
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # everything except Hadoop JARs
    # streaming jobs will be run using mrjob/spark/harness.py (see #1972)
    _STEP_TYPES = {
        'spark', 'spark_jar', 'spark_script', 'streaming',
    }

    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []

        # TODO: we may eventually want log interpretation, but it shouldn't
        # include counters, as they are not found in logs.

    def _check_step(self, step, step_num):
        """Don't try to run steps that include commands or use manifests."""
        super(SparkMRJobRunner, self)._check_step(step, step_num)

        if step.get('input_manifest'):
            raise NotImplementedError(
                'spark runner does not support input manifests')

        # we don't currently support commands, but we *could* (see #1956).
        if step['type'] == 'streaming':
            if not self._mrjob_cls:
                raise ValueError(
                    'You must set mrjob_cls to run streaming steps')

            for mrc in ('mapper', 'combiner', 'reducer'):
                if step.get(mrc):
                    if 'command' in step[mrc] or 'pre_filter' in step[mrc]:
                        raise NotImplementedError(
                            "step %d's %s runs a command, but spark"
                            " runner does not support commands" % (
                                step_num, mrc))

    def _default_opts(self):
        return combine_dicts(
            super(SparkMRJobRunner, self)._default_opts(),
            dict(
                cloud_part_size_mb=_DEFAULT_CLOUD_PART_SIZE_MB,
            ),
        )

    def _run(self):
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _pick_spark_tmp_dir(self):
        if self._opts['spark_tmp_dir']:
            return self.fs.join(self._opts['spark_tmp_dir'], self._job_key)
        else:
            master = self._spark_master() or 'local'
            if master.startswith('local'):  # including local-cluster
                # need a local temp dir
                # add "-spark" so we don't collide with default local temp dir
                return os.path.join(
                    gettempdir(), self._job_key + '-spark')
            else:
                # use HDFS (same default as HadoopJobRunner)
                return posixpath.join(
                    fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)
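
    # Illustrative note (not in the original source): with the default options
    # this works out roughly as follows, assuming a job key like
    # 'mr_wc.user.20240101.000000':
    #
    #   spark_master unset or local*  -> <system tmp dir>/mr_wc.user.20240101.000000-spark
    #   any other master (e.g. yarn)  -> an HDFS path under tmp/mrjob/, fully
    #                                    qualified by fully_qualify_hdfs_path()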

    def _default_step_output_dir(self):
        return self.fs.join(self._spark_tmp_dir, 'step-output')

    def _counter_output_dir(self, step_num):
        return self.fs.join(
            self._spark_tmp_dir, 'counter-output-step-%d' % step_num)

    def counters(self):
        return deepcopy(self._counters)

    @property
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['project_id'],
                    location=self._opts['gcs_region'],
                    object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
                ), disable_if=_is_permanent_google_error)

            # Hadoop FS is responsible for all URIs that fall through to it
            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs
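
    # Illustrative note (not in the original source): the filesystems above are
    # consulted in the order they were added (s3, gcs, hadoop, local), and the
    # first one that can handle a path's URI is used. The S3 and GCS
    # filesystems are disabled only on permanent errors (e.g. missing
    # credentials), in which case a URI like 's3://bucket/key' falls through to
    # the Hadoop filesystem.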

    # making mr_job_script visible in Spark

    def _job_script_module_name(self):
        """A unique module name to use with the MRJob script."""
        return re.sub(r'[^\w\d]', '_', self._job_key)

    def _create_job_script_zip(self):
        if not self._job_script_zip_path:
            zip_path = os.path.join(self._get_local_tmp_dir(), 'script.zip')
            name_in_zip = self._job_script_module_name() + '.py'

            log.debug('archiving %s -> %s as %s' % (
                self._script_path, zip_path, name_in_zip))
            with _create_zip_file(zip_path) as zip_file:
                zip_file.write(self._script_path, arcname=name_in_zip)

            self._job_script_zip_path = zip_path

        return self._job_script_zip_path
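
    # Illustrative note (not in the original source): the zip holds a single
    # file, the job script renamed to <module name>.py, where the module name
    # is the job key with non-word characters replaced by '_'. Shipping it
    # with the job's Python files lets the Spark harness import the MRJob
    # class as '<module name>.<class name>', which is exactly the first
    # argument built by _spark_script_args() below.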

    def _py_files(self):
        """Patch in :py:attr:`_job_script_zip_path`, if running streaming
        steps."""
        py_files = super(SparkMRJobRunner, self)._py_files()

        if self._has_streaming_steps():
            py_files.append(self._create_job_script_zip())

        return py_files

    # running the job

    def _run_steps_on_spark(self):
        steps = self._get_steps()

        for group in self._group_steps(steps):
            step_num = group['step_num']
            last_step_num = step_num + len(group['steps']) - 1

            # the Spark harness can run several streaming steps in one job
            if step_num == last_step_num:
                step_desc = 'step %d' % (step_num + 1)
            else:
                step_desc = 'steps %d-%d' % (step_num + 1, last_step_num + 1)

            log.info('Running %s of %d' % (step_desc, len(steps)))

            self._run_step_on_spark(group['steps'][0], step_num, last_step_num)

    def _group_steps(self, steps):
        """Group streaming steps together."""
        # a list of dicts with:
        #
        # type -- shared type of steps
        # steps -- list of steps in group
        # step_num -- (0-indexed) number of first step
        groups = []

        for step_num, step in enumerate(steps):
            # should we add *step* to existing group of streaming steps?
            if (step['type'] == 'streaming' and groups and
                    groups[-1]['type'] == 'streaming' and
                    step.get('jobconf') ==
                    groups[-1]['steps'][0].get('jobconf')):
                groups[-1]['steps'].append(step)
            else:
                # start a new step group
                groups.append(dict(
                    type=step['type'],
                    steps=[step],
                    step_num=step_num))

        return groups

    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num, last_step_num=last_step_num,
                num_steps=self._num_steps())

    def _spark_script_path(self, step_num):
        """For streaming steps, return the path of the harness script
        (and handle other spark step types the usual way)."""
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._spark_harness_path()
        else:
            return super(SparkMRJobRunner, self)._spark_script_path(step_num)

    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-input-format',
                     self._hadoop_input_format or ''])

        # --hadoop-output-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-output-format',
                     self._hadoop_output_format or ''])

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc', json.dumps(steps)])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        return args
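
    # Illustrative note (not in the original source): for a single streaming
    # step, the harness args assembled above look roughly like this (module
    # name, class name, and URIs are made up):
    #
    #   mr_wc_user_20240101_000000.MRWordCount      <- module.Class to run
    #   hdfs:///.../input-a,hdfs:///.../input-b     <- comma-joined input URIs
    #   hdfs:///.../output                          <- output dir of last step
    #   --hadoop-input-format '' --hadoop-output-format ''
    #   --no-sort-values
    #   --steps-desc '[{"type": "streaming", ...}]'
    #   --counter-output-dir hdfs:///.../counter-output-step-0
    #   --first-step-num 0 --last-step-num 0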

    def _spark_harness_path(self):
        """Where to find the Spark harness."""
        path = mrjob.spark.harness.__file__
        if path.endswith('.pyc'):
            path = path[:-1]
        return path

    # "streaming" steps run on Spark too

    def _has_spark_steps(self):
        """Treat streaming steps as Spark steps."""
        return (super(SparkMRJobRunner, self)._has_spark_steps() or
                self._has_streaming_steps())

    def _has_hadoop_streaming_steps(self):
        # the Spark runner doesn't run "streaming" steps on Hadoop
        return False

    def _has_streaming_steps(self):
        """Are any of our steps "streaming" steps that would normally run
        on Hadoop Streaming?"""
        return any(step['type'] == 'streaming'
                   for step in self._get_steps())

    def _is_pyspark_step(self, step):
        """Treat streaming steps as Spark steps that use Python."""
        return (super(SparkMRJobRunner, self)._is_pyspark_step(step) or
                step['type'] == 'streaming')
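
For context, here is a minimal sketch of how this runner is typically driven
from user code. It assumes a hypothetical job module mr_word_count.py that
defines MRWordCount; the input path is made up as well:

    from mr_word_count import MRWordCount

    # '-r spark' selects SparkMRJobRunner; spark-submit must be findable
    # (or configured via spark_submit_bin)
    mr_job = MRWordCount(args=['-r', 'spark', 'input.txt'])

    with mr_job.make_runner() as runner:
        runner.run()

        # read back the output of the last step
        for key, value in mr_job.parse_output(runner.cat_output()):
            print(key, value)

        # one counter dict per step ({} for non-streaming steps)
        print(runner.counters())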
Example #13
0
class HadoopJobRunner(MRJobBinRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on your Hadoop cluster.
    Invoked when you run your job with ``-r hadoop``.

    Input and support files can be either local or on HDFS; use ``hdfs://...``
    URLs to refer to files on HDFS.
    """
    alias = 'hadoop'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'hadoop_bin',
        'hadoop_extra_args',
        'hadoop_log_dirs',
        'hadoop_streaming_jar',
        'hadoop_tmp_dir',
        'spark_deploy_mode',
        'spark_master',
    }

    # supports everything (so far)
    _STEP_TYPES = {
        'jar', 'spark', 'spark_jar', 'spark_script', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(
                self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or
            posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []

    def _default_opts(self):
        return combine_dicts(
            super(HadoopJobRunner, self)._default_opts(),
            dict(
                hadoop_tmp_dir='tmp/mrjob',
                spark_deploy_mode='client',
                spark_master='yarn',
            )
        )

    @property
    def fs(self):
        """:py:class:`mrjob.fs.base.Filesystem` object for HDFS and the local
        filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            # if hadoop_bin is an empty list, pass None to the filesystem;
            # this means "don't use hadoop until fs.set_hadoop_bin() is
            # called" (used for running hadoop over SSH).
            hadoop_bin = self._opts['hadoop_bin'] or None

            self._fs.add_fs('hadoop',
                            HadoopFilesystem(hadoop_bin))
            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        return self.fs.hadoop.get_hadoop_version()

    def get_hadoop_bin(self):
        """Find the hadoop binary. A list: binary followed by arguments."""
        return self.fs.hadoop.get_hadoop_bin()

    def get_hadoop_streaming_jar(self):
        """Find the path of the hadoop streaming jar, or None if not found."""
        if not (self._hadoop_streaming_jar or
                self._searched_for_hadoop_streaming_jar):

            self._hadoop_streaming_jar = self._find_hadoop_streaming_jar()

            if self._hadoop_streaming_jar:
                log.info('Found Hadoop streaming jar: %s' %
                         self._hadoop_streaming_jar)
            else:
                log.warning('Hadoop streaming jar not found. Use'
                            ' --hadoop-streaming-jar')

            self._searched_for_hadoop_streaming_jar = True

        return self._hadoop_streaming_jar

    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None
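
    # Illustrative note (not in the original source): the sort key above
    # prefers shallower paths first and shorter jar names second, so given e.g.
    #   /usr/lib/hadoop-mapreduce/hadoop-streaming-2.8.5-amzn-1.jar
    #   /usr/lib/hadoop-mapreduce/hadoop-streaming.jar
    # the unversioned hadoop-streaming.jar is chosen.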

    def _hadoop_dirs(self):
        """Yield all possible hadoop directories (used for streaming jar
        and logs). May yield duplicates"""
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL',
                     'HADOOP_MAPRED_HOME'):
            path = os.environ.get(name)
            if path:
                yield path

        # guess it from the path of the Hadoop binary
        hadoop_home = _hadoop_prefix_from_bin(self.get_hadoop_bin()[0])
        if hadoop_home:
            yield hadoop_home

        # try HADOOP_*_HOME
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield path

    def _hadoop_streaming_jar_dirs(self):
        """Yield all possible places to look for the Hadoop streaming jar.
        May yield duplicates.
        """
        for hadoop_dir in self._hadoop_dirs():
            yield hadoop_dir

        # use hard-coded paths to work out-of-the-box on EMR
        for path in _EMR_HADOOP_STREAMING_JAR_DIRS:
            yield path

    def _hadoop_log_dirs(self, output_dir=None):
        """Yield all possible places to look for hadoop logs."""
        # hadoop_log_dirs opt overrides all this
        if self._opts['hadoop_log_dirs']:
            for path in self._opts['hadoop_log_dirs']:
                yield path
            return

        hadoop_log_dir = os.environ.get('HADOOP_LOG_DIR')
        if hadoop_log_dir:
            yield hadoop_log_dir

        yarn = uses_yarn(self.get_hadoop_version())

        if yarn:
            yarn_log_dir = os.environ.get('YARN_LOG_DIR')
            if yarn_log_dir:
                yield yarn_log_dir

            yield _DEFAULT_YARN_HDFS_LOG_DIR

        if output_dir:
            # Cloudera style of logging
            yield posixpath.join(output_dir, '_logs')

        for hadoop_dir in self._hadoop_dirs():
            yield posixpath.join(hadoop_dir, 'logs')

        # hard-coded fallback paths
        if yarn:
            for path in _FALLBACK_HADOOP_YARN_LOG_DIRS:
                yield path

        for path in _FALLBACK_HADOOP_LOG_DIRS:
            yield path

    def _run(self):
        self._find_binaries_and_jars()
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files_to_hdfs()
        self._run_job_in_hadoop()

    def _find_binaries_and_jars(self):
        """Find hadoop and (if needed) spark-submit bin up-front, before
        continuing with the job.

        (This is just for user-interaction purposes; these would otherwise
        lazy-load as needed.)
        """
        # this triggers looking for Hadoop binary
        self.get_hadoop_version()

        if self._has_streaming_steps():
            self.get_hadoop_streaming_jar()

        if self._has_spark_steps():
            self.get_spark_submit_bin()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        for path in self._working_dir_mgr.paths():
            self._upload_mgr.add(path)
        for path in self._py_files():
            self._upload_mgr.add(path)

    def _upload_local_files_to_hdfs(self):
        """Copy files managed by self._upload_mgr to HDFS
        """
        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s...' % self._upload_mgr.prefix)
        for path, uri in self._upload_mgr.path_to_uri().items():
            self._upload_to_hdfs(path, uri)

    def _upload_to_hdfs(self, path, target):
        log.debug('  %s -> %s' % (path, target))
        self.fs.hadoop.put(path, target)

    def _dump_stdin_to_local_file(self):
        """Dump sys.stdin to a local file, and return the path to it."""
        stdin_path = posixpath.join(self._get_local_tmp_dir(), 'STDIN')
        # prompt user, so they don't think the process has stalled
        log.info('reading from STDIN')

        log.debug('dumping stdin to local file %s...' % stdin_path)
        # use a with block so the file is flushed and closed before we
        # return the path
        with open(stdin_path, 'wb') as stdin_file:
            for line in self._stdin:
                stdin_file.write(line)

        return stdin_path

    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())

    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type']) and
                self._opts['spark_master'] != 'yarn' and
                self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")

    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise ValueError('Bad step type: %r' % (step['type'],))

    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        return (self.get_hadoop_bin() + ['jar', hadoop_streaming_jar] +
                self._hadoop_streaming_jar_args(step_num))

    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        args = []

        args.extend(self.get_hadoop_bin())

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args.extend(['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        if step.get('args'):
            args.extend(
                self._interpolate_step_args(step['args'], step_num))

        return args

    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env

    def _default_step_output_dir(self):
        return posixpath.join(self._hadoop_tmp_dir, 'step-output')

    def _cleanup_hadoop_tmp(self):
        if self._hadoop_tmp_dir:
            log.info('Removing HDFS temp directory %s...' %
                     self._hadoop_tmp_dir)
            try:
                self.fs.rm(self._hadoop_tmp_dir)
            except Exception as e:
                log.exception(e)

    def _manifest_download_commands(self):
        cp_to_local = self.get_hadoop_bin() + ['fs', '-copyToLocal']

        return [
            ('*://*', cmd_line(cp_to_local)),
        ]
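
    # Illustrative note (not in the original source): with the default
    # hadoop_bin this returns a single rule along the lines of
    #   ('*://*', 'hadoop fs -copyToLocal')
    # i.e. any URI listed in an input manifest gets downloaded via
    # `hadoop fs -copyToLocal`.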

    ### LOG (implementation of LogInterpretationMixin) ###

    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]

    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        if not self._read_logs():
            return

        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]
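
As with the Spark runner, a minimal usage sketch (the job module, jar path, and
input URI below are hypothetical):

    from mr_word_count import MRWordCount

    # '-r hadoop' selects HadoopJobRunner; --hadoop-streaming-jar is only
    # needed when the automatic search in _find_hadoop_streaming_jar() fails
    mr_job = MRWordCount(args=[
        '-r', 'hadoop',
        '--hadoop-streaming-jar',
        '/opt/hadoop/share/hadoop/tools/lib/hadoop-streaming.jar',
        'hdfs:///data/words.txt',  # input may be local or already on HDFS
    ])

    with mr_job.make_runner() as runner:
        runner.run()
        for key, value in mr_job.parse_output(runner.cat_output()):
            print(key, value)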
Example #14
0
class DataprocJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
    """Runs an :py:class:`~mrjob.job.MRJob` on Google Cloud Dataproc.
    Invoked when you run your job with ``-r dataproc``.

    :py:class:`DataprocJobRunner` runs your job in a Dataproc cluster, which
    is basically a temporary Hadoop cluster.

    Input, support, and jar files can be either local or on GCS; use
    ``gs://...`` URLs to refer to files on GCS.

    This class has some useful utilities for talking directly to GCS and
    Dataproc, so you may find it useful to instantiate it without a script::

        from mrjob.dataproc import DataprocJobRunner
        ...
    """
    alias = 'dataproc'

    OPT_NAMES = HadoopInTheCloudJobRunner.OPT_NAMES | {
        'cluster_properties',
        'core_instance_config',
        'gcloud_bin',
        'master_instance_config',
        'network',
        'project_id',
        'service_account',
        'service_account_scopes',
        'subnet',
        'task_instance_config',
    }

    # no Spark support yet (see #1765)
    _STEP_TYPES = {'jar', 'streaming'}

    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud-logging and '
                'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances.
        # num_core_instances refers ONLY to the number of CORE instances; it
        # does NOT include the one required master instance.
        # In other words, the minimum cluster size is 3 machines: 1 master and
        # 2 "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read from file already
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []

    def _fix_zone_and_region_opts(self):
        """Ensure that exactly one of region and zone is set."""
        if self._opts['region'] and self._opts['zone']:
            log.warning('you do not need to set region if you set zone')
            self._opts['region'] = None
            return

        if not (self._opts['region'] or self._opts['zone']):
            if environ.get('CLOUDSDK_COMPUTE_ZONE'):
                self._opts['zone'] = environ['CLOUDSDK_COMPUTE_ZONE']
            elif environ.get('CLOUDSDK_COMPUTE_REGION'):
                self._opts['region'] = environ['CLOUDSDK_COMPUTE_REGION']
            else:
                self._opts['region'] = _DEFAULT_GCE_REGION
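
    # Illustrative note (not in the original source): the precedence worked out
    # above is:
    #   zone option set                  -> use the zone (any region is dropped)
    #   region option set                -> use the region
    #   $CLOUDSDK_COMPUTE_ZONE set       -> use it as the zone
    #   $CLOUDSDK_COMPUTE_REGION set     -> use it as the region
    #   otherwise                        -> fall back to _DEFAULT_GCE_REGION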

    def _default_opts(self):
        return combine_dicts(
            super(DataprocJobRunner, self)._default_opts(),
            dict(
                bootstrap_python=True,
                check_cluster_every=_DEFAULT_CHECK_CLUSTER_EVERY,
                cleanup=['CLUSTER', 'JOB', 'LOCAL_TMP'],
                cloud_fs_sync_secs=_DEFAULT_CLOUD_FS_SYNC_SECS,
                image_version=_DEFAULT_IMAGE_VERSION,
                instance_type=_DEFAULT_INSTANCE_TYPE,
                master_instance_type=_DEFAULT_INSTANCE_TYPE,
                num_core_instances=_DATAPROC_MIN_WORKERS,
                num_task_instances=0,
            )
        )

    def _combine_opts(self, opt_list):
        """Blank out conflicts between *network*/*subnet* and
        *region*/*zone*."""
        opt_list = _blank_out_conflicting_opts(opt_list, ['region', 'zone'])
        opt_list = _blank_out_conflicting_opts(opt_list, ['network', 'subnet'])

        # now combine opts, with region/zone blanked out
        return super(DataprocJobRunner, self)._combine_opts(opt_list)

    @property
    def cluster_client(self):
        return google.cloud.dataproc_v1beta2.ClusterControllerClient(
            **self._client_create_kwargs())

    @property
    def job_client(self):
        return google.cloud.dataproc_v1beta2.JobControllerClient(
            **self._client_create_kwargs())

    @property
    def logging_client(self):
        return google.cloud.logging.Client(credentials=self._credentials,
                                           project=self._project_id)

    def _client_create_kwargs(self):
        if self._opts['region']:
            endpoint = '%s-%s' % (self._opts['region'], _DEFAULT_ENDPOINT)
            return dict(
                channel=google.api_core.grpc_helpers.create_channel(
                    endpoint, credentials=self._credentials))
        else:
            return dict(credentials=self._credentials)

    @property
    def api_client(self):
        raise NotImplementedError(
            '"api_client" was disabled in v0.6.2. Use "cluster_client"'
            ' or "job_client" instead.')

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for SSH, S3, GCS, and
        the local filesystem.
        """
        if self._fs is None:
            self._fs = CompositeFilesystem()

            location = self._opts['region'] or _zone_to_region(
                self._opts['zone'])

            self._fs.add_fs('gcs', GCSFilesystem(
                credentials=self._credentials,
                project_id=self._project_id,
                part_size=self._upload_part_size(),
                location=location,
                object_ttl_days=_DEFAULT_CLOUD_TMP_DIR_OBJECT_TTL_DAYS,
            ))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
                prefix='mrjob-'):
            tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)

            # NOTE - ambiguous GCP behavior: bucket location is returned as
            # UPPERCASE even though the docs suggest lowercase. Ticket filed
            # Apr 23, 2016; still true as of Feb. 12, 2018, as observed with
            # google-cloud-sdk.
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name
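
    # Illustrative note (not in the original source): when no matching
    # 'mrjob-' bucket exists in the region, the generated default looks like
    #   gs://mrjob-us-central1-<random hex>/tmp/
    # i.e. the region plus a random identifier, per the comment above.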

    def _region(self):
        # region of cluster, which is either the region set by the user,
        # or the region derived from the zone they set.
        # used to pick bucket location and name cluster
        return self._opts['region'] or _zone_to_region(self._opts['zone'])

    def _run(self):
        self._launch()
        self._run_steps()

    def _launch(self):
        self._prepare_for_launch()
        self._launch_cluster()

    def _prepare_for_launch(self):
        self._check_output_not_exists()
        self._create_setup_wrapper_scripts()
        self._add_bootstrap_files_for_upload()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._wait_for_fs_sync()

    def _check_output_not_exists(self):
        """Verify the output path does not already exist. This avoids
        provisioning a cluster only to have Hadoop refuse to launch.
        """
        if self.fs.exists(self._output_dir):
            raise IOError(
                'Output path %s already exists!' % (self._output_dir,))

    def _add_bootstrap_files_for_upload(self):
        """Add files needed by the bootstrap script to self._upload_mgr.

        Tar up mrjob if bootstrap_mrjob is True.

        Create the master bootstrap script if necessary.

        """
        # lazily create mrjob.zip
        if self._bootstrap_mrjob():
            self._create_mrjob_zip()
            self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)

        # all other files needed by the script are already in
        # _bootstrap_dir_mgr
        for path in self._bootstrap_dir_mgr.paths():
            self._upload_mgr.add(path)

        # now that we know where the above files live, we can create
        # the master bootstrap script
        self._create_master_bootstrap_script_if_needed()
        if self._master_bootstrap_script_path:
            self._upload_mgr.add(self._master_bootstrap_script_path)

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        if self._opts['hadoop_streaming_jar']:
            self._upload_mgr.add(self._opts['hadoop_streaming_jar'])

        for step in self._get_steps():
            if step.get('jar'):
                self._upload_mgr.add(step['jar'])

    ### Running the job ###

    def cleanup(self, mode=None):
        super(DataprocJobRunner, self).cleanup(mode=mode)

        # close our SSH tunnel, if any
        self._kill_ssh_tunnel()

        # stop the cluster if it belongs to us (it may have stopped on its
        # own already, but that's fine)
        if self._cluster_id and not self._opts['cluster_id']:
            self._cleanup_cluster()

    def _cleanup_cloud_tmp(self):
        # delete all the files we created
        if not self._job_tmpdir:
            return

        try:
            log.info('Removing all files in %s' % self._job_tmpdir)
            self.fs.rm(self._job_tmpdir)
            self._job_tmpdir = None
        except Exception as e:
            log.exception(e)

    # TODO - mtai @ davidmarin - Re-enable log support and supporting cleanup
    def _cleanup_logs(self):
        super(DataprocJobRunner, self)._cleanup_logs()

    def _cleanup_job(self):
        job_prefix = self._dataproc_job_prefix()
        for job in self._list_jobs(
                cluster_name=self._cluster_id,
                state_matcher=_STATE_MATCHER_ACTIVE):
            # Kill all active jobs with the same job_prefix as this job
            job_id = job.reference.job_id

            if not job_id.startswith(job_prefix):
                continue

            self._cancel_job(job_id)
            self._wait_for_api('job cancellation')

    def _cleanup_cluster(self):
        if not self._cluster_id:
            # If we don't have a cluster, then we can't terminate it.
            return

        try:
            log.info("Attempting to terminate cluster")
            self._delete_cluster(self._cluster_id)
        except Exception as e:
            log.exception(e)
            return
        log.info('cluster %s successfully terminated' % self._cluster_id)

    def _wait_for_api(self, msg):
        _wait_for(msg, self._opts['check_cluster_every'])

    def _wait_for_fs_sync(self):
        """Sleep for a little while, to give FS a chance to sync up.
        """
        _wait_for('GCS sync (eventual consistency)',
                  self._opts['cloud_fs_sync_secs'])

    def _streaming_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a hadoop streaming job.
        """
        return dict(
            hadoop_job=dict(
                args=self._hadoop_streaming_jar_args(step_num),
                main_jar_file_uri=self._hadoop_streaming_jar_uri(),
            )
        )

    def _jar_step_job_kwarg(self, step_num):
        """Returns a map from ``'hadoop_job'`` to a dict representing
        a Hadoop job that runs a JAR"""
        step = self._get_step(step_num)

        hadoop_job = {}

        hadoop_job['args'] = (
            self._interpolate_jar_step_args(step['args'], step_num))

        jar_uri = self._upload_mgr.uri(step['jar'])

        # can't specify main_class and main_jar_file_uri; see
        # https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        if step.get('main_class'):
            hadoop_job['jar_file_uris'] = [jar_uri]
            hadoop_job['main_class'] = step['main_class']
        else:
            hadoop_job['main_jar_file_uri'] = jar_uri

        return dict(hadoop_job=hadoop_job)

    def _hadoop_streaming_jar_uri(self):
        if self._opts['hadoop_streaming_jar']:
            return self._upload_mgr.uri(self._opts['hadoop_streaming_jar'])
        else:
            return _HADOOP_STREAMING_JAR_URI

    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        self.fs.mkdir(self._job_tmpdir)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(), random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info(
                'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

            cluster_data = self._cluster_create_kwargs()
            self._create_cluster(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id

    def _wait_for_cluster_ready(self, cluster_id):
        # See https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters#State  # noqa
        cluster_state = None

        # Poll until cluster is ready
        while cluster_state not in ('RUNNING', 'UPDATING'):
            cluster = self._get_cluster(cluster_id)
            cluster_state = cluster.status.State.Name(cluster.status.state)

            if cluster_state in ('ERROR', 'DELETING'):
                raise DataprocException(cluster)

            self._wait_for_api('cluster to accept jobs')

        return cluster_id

    def _dataproc_job_prefix(self):
        return _cleanse_gcp_job_id(self._job_key)

    def _run_steps(self):
        """Wait for every step of the job to complete, one by one."""
        total_steps = self._num_steps()
        # launch each step and wait for it to complete
        for step_num in range(total_steps):
            job_id = self._launch_step(step_num)

            self._wait_for_step_to_complete(
                job_id, step_num=step_num, num_steps=total_steps)

            log.info('Completed Dataproc Hadoop Job - %s', job_id)

        # After all steps completed, wait for the last output (which is
        # usually written to GCS) to sync
        self._wait_for_fs_sync()

    def _launch_step(self, step_num):
        step = self._get_step(step_num)

        # Build a unique step name from the cleansed job prefix
        step_name = '%s---step-%05d-of-%05d' % (
            self._dataproc_job_prefix(), step_num + 1, self._num_steps())

        # Build step

        # job_kwarg is a single-item dict, where the key is 'hadoop_job',
        # 'spark_job', etc.
        if step['type'] == 'streaming':
            job_kwarg = self._streaming_step_job_kwarg(step_num)
        elif step['type'] == 'jar':
            job_kwarg = self._jar_step_job_kwarg(step_num)
        else:
            raise NotImplementedError(
                'Unsupported step type: %r' % step['type'])

        # Submit it
        log.info('Submitting Dataproc Hadoop Job - %s', step_name)
        result = self._submit_job(step_name, job_kwarg)
        log.info('Submitted Dataproc Hadoop Job - %s', step_name)

        job_id = result.reference.job_id
        assert job_id == step_name

        return job_id
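
    # Hedged illustration (not part of mrjob): the step-name format built in
    # _launch_step() above. The job prefix in the usage comment is made up;
    # only the '---step-NNNNN-of-NNNNN' shape matters.
    @staticmethod
    def _example_step_name(job_prefix, step_num, num_steps):
        """Format a step name the same way _launch_step() does."""
        return '%s---step-%05d-of-%05d' % (job_prefix, step_num + 1, num_steps)

    # e.g. _example_step_name('my-job-prefix', 0, 2)
    # -> 'my-job-prefix---step-00001-of-00002'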

    def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        log_interpretation['step'] = {}
        step_type = self._get_step(step_num)['type']

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            log_interpretation['step']['driver_output_uri'] = (
                job.driver_output_resource_uri)

            self._interpret_step_logs(log_interpretation, step_type)

            progress = log_interpretation['step'].get('progress')
            if progress:
                log.info(' ' + progress['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING',
                             'CANCEL_PENDING', 'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            if job_state == 'ERROR':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n\n' %
                              _format_error(error))

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(
                    step_num=step_num, num_steps=num_steps)

    def _default_step_output_dir(self):
        # put intermediate data in HDFS
        return 'hdfs:///tmp/mrjob/%s/step-output' % self._job_key

    ### log interpretation ###

    # step

    def _interpret_step_logs(self, log_interpretation, step_type):
        """Hook for interpreting step logs.

        Unlike with most runners, you may call this multiple times and it
        will continue to parse the step log incrementally, which is useful
        for getting job progress."""
        # don't turn this off even if read_logs opt is false; it's
        # the only way this runner can track job progress

        driver_output_uri = log_interpretation.get(
            'step', {}).get('driver_output_uri')

        if driver_output_uri:
            self._update_step_interpretation(
                log_interpretation['step'], driver_output_uri)

    def _update_step_interpretation(
            self, step_interpretation, driver_output_uri):
        new_lines = self._get_new_driver_output_lines(driver_output_uri)
        _interpret_new_dataproc_step_stderr(step_interpretation, new_lines)

    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri,
            dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                # log the location of job driver output just once
                log.info(
                    '  Parsing job driver output from %s*' % driver_output_uri)
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs.gcs._get_blob(log_uri)

            try:
                new_data = log_blob.download_as_string(start=state['pos'])
            except (google.api_core.exceptions.NotFound,
                    google.api_core.exceptions.RequestRangeNotSatisfiable):
                # blob was just created, or no more data is available
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines
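
    # Hedged sketch (not part of mrjob) of the line-buffering idea used
    # above: append raw bytes to a buffer, emit only complete
    # (newline-terminated) lines, and keep any trailing partial line for the
    # next call. This is a simplified, stand-alone version of what
    # _get_new_driver_output_lines() does with its per-URI state dict.
    @staticmethod
    def _example_split_complete_lines(buf, new_data):
        """Return (complete_lines, leftover_buffer)."""
        buf += new_data

        pieces = buf.split(b'\n')
        leftover = pieces.pop()  # b'' if buf ended with a newline

        return [piece + b'\n' for piece in pieces], leftover

    # e.g. _example_split_complete_lines(b'', b'line 1\nline 2\npart')
    # -> ([b'line 1\n', b'line 2\n'], b'part')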

    # history

    def _interpret_history_log(self, log_interpretation):
        """Does nothing. We can't get the history logs, and we don't need
        them."""
        if not self._read_logs():
            return

        log_interpretation.setdefault('history', {})

    # task

    def _interpret_task_logs(self, log_interpretation, step_type,
                             error_attempt_ids=(), partial=True):
        """Scan node manager log to find failed container IDs of failed
        tasks, and then scan the corresponding stderr and syslogs."""
        if 'task' in log_interpretation and (
                partial or not log_interpretation['task'].get('partial')):
            return   # already interpreted

        if not self._read_logs():
            return

        step_interpretation = log_interpretation.get('step') or {}

        application_id = step_interpretation.get('application_id')
        if not application_id:
            log.warning(
                "Can't parse node manager logs; missing application ID")
            return

        log_interpretation['task'] = self._task_log_interpretation(
            application_id, step_type, partial)

    def _task_log_interpretation(
            self, application_id, step_type, partial=True):
        """Helper for :py:meth:`_interpret_task_logs`"""
        # not bothering with _read_logs() since this is a helper method
        result = {}

        for container_id in self._failed_task_container_ids(application_id):
            error = _parse_task_syslog_records(
                self._task_syslog_records(
                    application_id, container_id, step_type))

            if not error.get('hadoop_error'):
                # not sure if this ever happens, since we already know
                # which containers failed
                continue

            error['container_id'] = container_id

            # fix weird munging of java stacktrace
            error['hadoop_error']['message'] = _fix_java_stack_trace(
                error['hadoop_error']['message'])

            task_error = _parse_task_stderr(
                self._task_stderr_lines(
                    application_id, container_id, step_type))

            if task_error:
                task_error['message'] = _fix_traceback(task_error['message'])
                error['task_error'] = task_error

            result.setdefault('errors', []).append(error)

            # if partial is true, bail out when we find the first task error
            if task_error and partial:
                result['partial'] = True
                return result

        return result

    def _failed_task_container_ids(self, application_id):
        """Stream container IDs of failed tasks, in reverse order."""
        container_id_prefix = 'container' + application_id[11:]

        log_filter = self._make_log_filter(
            'yarn-yarn-nodemanager',
            {'jsonPayload.class': _CONTAINER_EXECUTOR_CLASS_NAME})

        log.info('Scanning node manager logs for IDs of failed tasks...')

        # it doesn't seem to work to do self.logging_client.logger();
        # there's some RPC dispute about whether the log name should
        # be qualified by project name or not
        entries = self.logging_client.list_entries(
            filter_=log_filter, order_by=google.cloud.logging.DESCENDING)

        for entry in entries:
            message = entry.payload.get('message')
            if not message:
                continue

            m = _CONTAINER_EXIT_RE.match(message)
            if not m:
                continue

            returncode = int(m.group('returncode'))
            if not returncode:
                continue

            container_id = m.group('container_id')
            # matches some other step
            if not container_id.startswith(container_id_prefix):
                continue

            log.debug('  %s' % container_id)
            yield container_id
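
    # Hedged sketch (not part of mrjob): how a container-exit message might
    # be matched. Both the regex and the sample message below are
    # hypothetical (the real pattern is _CONTAINER_EXIT_RE, not shown here);
    # the point is just that a nonzero exit code plus a matching container
    # ID prefix marks a failed task container.
    @staticmethod
    def _example_match_container_exit(message):
        """Return (container_id, returncode), or None if no match."""
        import re  # local import so this sketch is self-contained

        # hypothetical pattern; not mrjob's actual _CONTAINER_EXIT_RE
        exit_re = re.compile(
            r'Exit code from container (?P<container_id>\S+)'
            r' is\s*:\s*(?P<returncode>\d+)')

        m = exit_re.search(message)
        if not m:
            return None

        return m.group('container_id'), int(m.group('returncode'))

    # e.g. _example_match_container_exit(
    #     'Exit code from container container_1234_0001_01_000002 is : 143')
    # -> ('container_1234_0001_01_000002', 143)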

    def _task_stderr_lines(self, application_id, container_id, step_type):
        """Yield lines from a specific stderr log."""
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'stderr',
            })

        log.info('    reading stderr log...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        # use log4j parsing to handle tab -> newline conversion
        for record in _log_entries_to_log4j(entries):
            for line in record['message'].split('\n'):
                yield line

    def _task_syslog_records(self, application_id, container_id, step_type):
        """Yield log4j records from a specific syslog.
        """
        log_filter = self._make_log_filter(
            'yarn-userlogs', {
                'jsonPayload.application': application_id,
                'jsonPayload.container': container_id,
                # TODO: pick based on step_type
                'jsonPayload.container_logname': 'syslog',
            })

        log.info('    reading syslog...')
        entries = self.logging_client.list_entries(filter_=log_filter)

        return _log_entries_to_log4j(entries)

    # misc

    def _make_log_filter(self, log_name=None, extra_values=None):
        # we only want logs from this project, cluster, and region
        d = {}

        d['resource.labels.cluster_name'] = self._cluster_id
        d['resource.labels.project_id'] = self._project_id
        d['resource.labels.region'] = self._region()
        d['resource.type'] = 'cloud_dataproc_cluster'

        if log_name:
            d['logName'] = 'projects/%s/logs/%s' % (
                self._project_id, log_name)

        if extra_values:
            d.update(extra_values)

        return _log_filter_str(d)
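
    # Hedged sketch (not part of mrjob): one plausible way to turn the dict
    # built above into a Cloud Logging advanced filter string (AND-joined
    # key = "value" clauses). The real conversion happens in
    # _log_filter_str(), whose exact output isn't shown here, and the values
    # in the usage comment are made up.
    @staticmethod
    def _example_log_filter_str(d):
        """Join a dict of label/value pairs into an AND-ed filter."""
        return ' AND '.join(
            '%s = "%s"' % (k, v) for k, v in sorted(d.items()))

    # e.g. _example_log_filter_str(
    #     {'resource.type': 'cloud_dataproc_cluster',
    #      'resource.labels.cluster_name': 'mrjob-us-west1-abc123'})
    # -> 'resource.labels.cluster_name = "mrjob-us-west1-abc123" AND '
    #    'resource.type = "cloud_dataproc_cluster"'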

    def counters(self):
        return [_pick_counters(log_interpretation)
                for log_interpretation in self._log_interpretations]

    ### Bootstrapping ###

    def get_hadoop_version(self):
        if self._hadoop_version is None:
            self._store_cluster_info()
        return self._hadoop_version

    def get_image_version(self):
        """Get the version that our cluster is running.
        """
        if self._image_version is None:
            self._store_cluster_info()
        return self._image_version

    def _store_cluster_info(self):
        """Set self._image_version and self._hadoop_version."""
        if not self._cluster_id:
            raise ValueError('cluster has not yet been created')

        cluster = self._get_cluster(self._cluster_id)
        self._image_version = (
            cluster.config.software_config.image_version)
        # protect against new versions, including patch versions
        # we didn't explicitly request. See #1428
        self._hadoop_version = map_version(
            self._image_version, _DATAPROC_IMAGE_TO_HADOOP_VERSION)
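
    # Hedged sketch (not part of mrjob) of the idea behind map_version() as
    # used above: given a version string and a table keyed by known
    # versions, pick the value for the closest known version that is <= the
    # one we got, so unexpected patch releases still map to something
    # sensible. The table in the usage comment is made up and is not
    # _DATAPROC_IMAGE_TO_HADOOP_VERSION.
    @staticmethod
    def _example_map_version(version, version_map):
        """Approximate map_version(): the closest key <= *version* wins."""
        def as_tuple(v):
            return tuple(int(part) for part in v.split('.'))

        candidates = [k for k in version_map
                      if as_tuple(k) <= as_tuple(version)]
        if not candidates:
            # fall back to the smallest known version
            return version_map[min(version_map, key=as_tuple)]

        return version_map[max(candidates, key=as_tuple)]

    # e.g. _example_map_version('1.2.54', {'1.0': '2.7.2', '1.2': '2.8.2'})
    # -> '2.8.2'  (made-up version numbers, for illustration only)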

    def _bootstrap_pre_commands(self):
        # don't run the bootstrap script in / (see #1601)
        return [
            'mkdir /tmp/mrjob',
            'cd /tmp/mrjob',
        ]

    def _bootstrap_python(self):
        """Return a (possibly empty) list of parsed commands (in the same
        format as returned by parse_setup_cmd())"""
        if not self._opts['bootstrap_python']:
            return []

        if PY2:
            # Python 2 is already installed; install pip and dev packages
            return [
                ['sudo apt-get install -y python-pip python-dev'],
            ]
        else:
            return [
                ['sudo apt-get install -y python3 python3-pip python3-dev'],
            ]

    def get_cluster_id(self):
        return self._cluster_id

    def _cluster_create_kwargs(self):
        gcs_init_script_uris = []
        if self._master_bootstrap_script_path:
            gcs_init_script_uris.append(
                self._upload_mgr.uri(self._master_bootstrap_script_path))

        cluster_metadata = dict()
        cluster_metadata['mrjob-version'] = mrjob.__version__

        # TODO: remove mrjob-max-secs-idle once lifecycle_config is visible
        # through the gcloud utility and the Google Cloud Console
        cluster_metadata['mrjob-max-secs-idle'] = str(int(
            self._opts['max_mins_idle'] * 60))

        gce_cluster_config = dict(
            metadata=cluster_metadata,
            service_account_scopes=self._opts['service_account_scopes'],
        )

        if self._opts['network']:
            gce_cluster_config['network_uri'] = self._opts['network']

        if self._opts['subnet']:
            gce_cluster_config['subnetwork_uri'] = self._opts['subnet']

        if self._opts['service_account']:
            gce_cluster_config['service_account'] = (
                self._opts['service_account'])

        if self._opts['service_account_scopes']:
            gce_cluster_config['service_account_scopes'] = (
                self._opts['service_account_scopes'])

        if self._opts['zone']:
            gce_cluster_config['zone_uri'] = _gcp_zone_uri(
                project=self._project_id, zone=self._opts['zone'])

        cluster_config = dict(
            gce_cluster_config=gce_cluster_config,
            initialization_actions=[
                dict(executable_file=init_script_uri)
                for init_script_uri in gcs_init_script_uris
            ]
        )

        # Master node
        master_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=1, instance_type=self._opts['master_instance_type'],
        )
        if self._opts['master_instance_config']:
            master_conf.update(self._opts['master_instance_config'])

        # Compute + storage
        worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_core_instances'],
            instance_type=self._opts['core_instance_type']
        )
        if self._opts['core_instance_config']:
            worker_conf.update(self._opts['core_instance_config'])

        # Compute ONLY
        secondary_worker_conf = _gcp_instance_group_config(
            project=self._project_id, zone=self._opts['zone'],
            count=self._opts['num_task_instances'],
            instance_type=self._opts['task_instance_type'],
            is_preemptible=True
        )
        if self._opts['task_instance_config']:
            secondary_worker_conf.update(self._opts['task_instance_config'])

        cluster_config['master_config'] = master_conf
        cluster_config['worker_config'] = worker_conf
        if secondary_worker_conf.get('num_instances'):
            cluster_config['secondary_worker_config'] = secondary_worker_conf

        cluster_config['lifecycle_config'] = dict(
            idle_delete_ttl=dict(
                seconds=int(self._opts['max_mins_idle'] * 60)))

        software_config = {}

        if self._opts['cluster_properties']:
            software_config['properties'] = _values_to_text(
                self._opts['cluster_properties'])

        # See - https://cloud.google.com/dataproc/dataproc-versions
        if self._opts['image_version']:
            software_config['image_version'] = self._opts['image_version']

        if software_config:
            cluster_config['software_config'] = software_config

        # in Python 2, dict keys loaded from JSON will be unicode, which
        # the Google protobuf objects don't like
        if PY2:
            cluster_config = _clean_json_dict_keys(cluster_config)

        kwargs = dict(project_id=self._project_id,
                      cluster_name=self._cluster_id,
                      config=cluster_config)

        return self._add_extra_cluster_params(kwargs)

    ### Dataproc-specific Stuff ###

    def _get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _create_cluster(self, cluster_data):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/create  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.clusters/get  # noqa

        self.cluster_client.create_cluster(
            cluster=cluster_data,
            **self._project_id_and_region()
        )

    def _delete_cluster(self, cluster_id):
        return self.cluster_client.delete_cluster(
            cluster_name=cluster_id,
            **self._project_id_and_region()
        )

    def _list_jobs(self, cluster_name=None, state_matcher=None):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/list#JobStateMatcher  # noqa
        list_kwargs = self._project_id_and_region()

        if cluster_name:
            list_kwargs['cluster_name'] = cluster_name

        if state_matcher:
            list_kwargs['job_state_matcher'] = state_matcher

        return self.job_client.list_jobs(**list_kwargs)

    def _get_job(self, job_id):
        return self.job_client.get_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _cancel_job(self, job_id):
        return self.job_client.cancel_job(
            job_id=job_id,
            **self._project_id_and_region()
        )

    def _submit_job(self, step_name, job_kwarg):
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs/submit  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#HadoopJob  # noqa
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobReference  # noqa

        submit_job_kwargs = dict(
            job=dict(
                reference=dict(project_id=self._project_id, job_id=step_name),
                placement=dict(cluster_name=self._cluster_id),
                **job_kwarg
            ),
            **self._project_id_and_region()
        )

        log.debug('  submit_job(%s)' % ', '.join(
            '%s=%r' % (k, v) for k, v in sorted(submit_job_kwargs.items())))

        return self.job_client.submit_job(**submit_job_kwargs)

    def _project_id_and_region(self):
        return dict(
            project_id=self._project_id,
            region=(self._opts['region'] or 'global'),
        )

    def _manifest_download_commands(self):
        return [
            # TODO: SSH in and figure out how to use gsutil or similar
            # ('gs://*', 'gsutil cp'),
            ('*://*', 'hadoop fs -copyToLocal'),
        ]

    ### SSH hooks ###

    def _job_tracker_host(self):
        return '%s-m' % self._cluster_id

    def _ssh_tunnel_config(self):
        return _SSH_TUNNEL_CONFIG

    def _launch_ssh_proc(self, args):
        ssh_proc = super(DataprocJobRunner, self)._launch_ssh_proc(args)

        # enter an empty passphrase if creating a key for the first time
        ssh_proc.stdin.write(b'\n\n')

        return ssh_proc

    def _ssh_launch_wait_secs(self):
        """Wait 20 seconds because gcloud has to update project metadata
        (unless we were going to check the cluster sooner anyway)."""
        return min(20.0, self._opts['check_cluster_every'])

    def _ssh_tunnel_args(self, bind_port):
        if not self._cluster_id:
            return

        gcloud_bin = self._opts['gcloud_bin'] or ['gcloud']

        cluster = self._get_cluster(self._cluster_id)
        zone = cluster.config.gce_cluster_config.zone_uri.split('/')[-1]

        return gcloud_bin + [
            'compute', 'ssh',
            '--zone', zone,
            self._job_tracker_host(),
            '--',
        ] + self._ssh_tunnel_opts(bind_port)
Ejemplo n.º 15
0
class MRJobRunner(object):
    """Abstract base class for all runners"""

    # this class handles the basic runner framework, options and config files,
    # arguments to mrjobs, and setting up job working dirs and environments.
    # this will put files from setup scripts, py_files, and bootstrap_mrjob
    # into the job's working dir, but won't actually run/import them
    #
    # command lines to run substeps (including Spark) are handled by
    # mrjob.bin.MRJobBinRunner

    #: alias for this runner, used on the command line with ``-r``
    alias = None

    # libjars is only here because the job can set it; might want to
    # handle this with a warning from the launcher instead
    OPT_NAMES = {
        'bootstrap_mrjob',
        'check_input_paths',
        'cleanup',
        'cleanup_on_failure',
        'cmdenv',
        'jobconf',
        'label',
        'libjars',
        'local_tmp_dir',
        'owner',
        'py_files',
        'read_logs',
        'setup',
        'upload_archives',
        'upload_dirs',
        'upload_files'
    }

    # re-define this as a set of step types supported by your runner
    _STEP_TYPES = None

    # if this is true, when bootstrap_mrjob is true, create a mrjob.zip
    # and patch it into the *py_files* option
    _BOOTSTRAP_MRJOB_IN_PY_FILES = True

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 sort_values=None, stdin=None, steps=None,
                 step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    :mrjob-opt:`hadoop_streaming_jar`).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see
                                     :mrjob-opt:`hadoop_streaming_jar`).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :param steps: a list of descriptions of steps to run (see :doc:`step`
                      for description formats)
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        self._ran_job = False

        # opts are made from:
        #
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]
        )

        log.debug('Active configuration:')
        log.debug(pprint.pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark
        # if not using a setup script.
        self._spark_files = []
        self._spark_archives = []

        # set this to an :py:class:`~mrjob.setup.UploadDirManager` in
        # runners that upload files to HDFS, S3, etc.
        self._upload_mgr = None

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key()

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.7.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_dirs')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add(
                'archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where to keep the input manifest
        self._input_manifest_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # check and store *steps*
        self._steps = None
        if steps is None:
            if not mr_job_script:
                self._steps = []
            # otherwise we'll load steps on-the-fly, see _load_steps()
        else:
            self._check_steps(steps)
            self._steps = copy.deepcopy(steps)

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Options ###

    def _default_opts(self):
        try:
            owner = getpass.getuser()
        except:
            owner = None

        return dict(
            check_input_paths=True,
            cleanup=['ALL'],
            cleanup_on_failure=['NONE'],
            owner=owner,
        )

    def _combine_confs(self, source_and_opt_list):
        """Combine several opt dictionaries into one.

        *source_and_opt_list* is a list of tuples of *source*,
        *opts* where *opts* is a dictionary and *source* is either
        None or a description of where the opts came from (usually a path).

        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        """
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list
        ]

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one. *opt_list*
        is a list of dictionaries containing validated options

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc.
        about the options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner funciton. This
        won't necessarily include every opt name (we default to
        :py:func:`~mrjob.conf.combine_value`).
        """
        return _combiners(self.OPT_NAMES)
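
    # Hedged sketch (not part of mrjob) of the precedence idea behind
    # _combine_opts() above: later sources win, except that None means
    # "not set" and never overrides an earlier value. The real combiners
    # (from mrjob.conf) also merge dict opts and concatenate list opts,
    # which this simplified version ignores.
    @staticmethod
    def _example_combine_opts(*opt_dicts):
        """Combine opt dicts; later non-None values take precedence."""
        combined = {}

        for opts in opt_dicts:
            for key, value in opts.items():
                if value is not None:
                    combined[key] = value

        return combined

    # e.g. _example_combine_opts(
    #     {'owner': 'default_user', 'label': None},
    #     {'owner': None, 'label': 'my_job'})
    # -> {'owner': 'default_user', 'label': 'my_job'}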

    def _fix_opts(self, opts, source=None):
        """Take an options dictionary, and either return a sanitized
        version of it, or raise an exception.

        *source* is either a string describing where the opts came from
        or None.

        This ensures that opt dictionaries are really dictionaries
        and handles deprecated options.
        """
        if source is None:
            source = 'defaults'  # defaults shouldn't trigger warnings

        if not isinstance(opts, dict):
            raise TypeError(
                'options for %s (from %s) must be a dict' %
                (self.alias, source))

        deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)

        results = {}

        for k, v in sorted(opts.items()):
            # rewrite deprecated aliases
            if k in deprecated_aliases:
                if v is None:  # don't care
                    continue

                aliased_opt = deprecated_aliases[k]

                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' % (
                                k, source, aliased_opt))

                if opts.get(aliased_opt) is not None:
                    continue  # don't overwrite the non-aliased opt

                k = aliased_opt

            if k in self.OPT_NAMES:
                results[k] = None if v is None else self._fix_opt(k, v, source)
            elif v:
                log.warning('Unexpected option %s (from %s)' % (k, source))

        return results

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix a single option, returning its correct value or raising
        an exception. This is not called for options that are ``None``.

        This currently handles cleanup opts.

        Override this if you require additional opt validation or cleanup.
        """
        if opt_key in ('cleanup', 'cleanup_on_failure'):
            return self._fix_cleanup_opt(opt_key, opt_value, source)
        else:
            return opt_value

    def _fix_cleanup_opt(self, opt_key, opt_value, source):
        """Fix a cleanup option, or raise ValueError."""
        if isinstance(opt_value, string_types):
            opt_value = [opt_value]

        if 'NONE' in opt_value and len(set(opt_value)) > 1:
            raise ValueError(
                'Cannot clean up both nothing and something!'
                ' (%s option from %s)' % (opt_key, source))

        for cleanup_type in opt_value:
            if cleanup_type not in CLEANUP_CHOICES:
                raise ValueError(
                    '%s must be one of %s, not %s (from %s)' % (
                        opt_key, ', '.join(CLEANUP_CHOICES), opt_value,
                        source))

        return opt_value

    def _obfuscate_opt(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to obfuscate
        credentials, etc."""
        return opt_value

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem.
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem()
            self._fs.add_fs('local', LocalFilesystem())
        return self._fs

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise :py:class:`~mrjob.step.StepFailedException` if there
        are any problems (except on
        :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the
        actual exception that caused the step to fail).
        """
        if self._ran_job:
            raise ValueError('Job already ran!')

        if self._num_steps() == 0:
            raise ValueError('Job has no steps!')

        self._create_dir_archives()
        # TODO: no point in checking input paths if we're going to
        # make a manifest out of them
        self._check_input_paths()
        self._add_input_files_for_upload()
        self._create_input_manifest_if_needed()
        self._run()
        self._ran_job = True

        last_step = self._get_steps()[-1]

        # only print this message if the last step uses our output dir
        if 'args' not in last_step or OUTPUT in last_step['args']:
            log.info('job output is in %s' % self._output_dir)

    def cat_output(self):
        """Stream the jobs output, as a stream of ``bytes``. If there are
        multiple output files, there will be an empty bytestring
        (``b''``) between them.

        .. versionadded:: 0.6.0

           In previous versions, you'd use :py:meth:`stream_output`.
        """
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise ValueError('Run the job before streaming output')

        if self._closed is True:
            log.warning(
                'WARNING! Trying to stream output from a closed runner, output'
                ' will probably be empty.')

        log.info('Streaming final output from %s...' % output_dir)

        def split_path(path):
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:
                    break

                yield name

                path = base

        def ls_output():
            for filename in self.fs.ls(output_dir):
                subpath = filename[len(output_dir):]
                # Hadoop ignores files and dirs inside the output dir
                # whose names start with '_' or '.'. See #1337.
                if not (any(name[0] in '_.'
                            for name in split_path(subpath))):
                    yield filename

        for i, filename in enumerate(ls_output()):
            if i > 0:
                yield b''  # EOF of previous file

            for chunk in self.fs._cat_file(filename):
                yield chunk
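
    # Hedged illustration (not part of mrjob) of the filtering rule used by
    # ls_output() above: any path component under the output dir that starts
    # with '_' or '.' (e.g. _SUCCESS, _logs/, .part-00000.crc) is skipped.
    # This simplified version splits on '/' rather than using os.path.split,
    # and the paths in the usage comment are made up.
    @staticmethod
    def _example_is_hidden_output(output_dir, path):
        """Return True if *path* would be skipped when streaming output."""
        subpath = path[len(output_dir):]

        return any(name.startswith(('_', '.'))
                   for name in subpath.split('/') if name)

    # e.g. with output_dir = 'gs://bucket/out':
    #   'gs://bucket/out/part-00000'  -> False (streamed)
    #   'gs://bucket/out/_SUCCESS'    -> True  (skipped)
    #   'gs://bucket/out/_logs/x.log' -> True  (skipped)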

    def stream_output(self):
        """Like :py:meth:`cat_output` except that it groups bytes into
        lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``.

        .. deprecated:: 0.6.0
        """
        log.warning('stream_output() is deprecated and will be removed in'
                    ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                    ' instead.')

        return to_lines(self.cat_output())

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
        else:
            return mode or self._opts['cleanup']

    def _cleanup_cloud_tmp(self):
        """Cleanup any files/directories on cloud storage (e.g. S3) we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        """
        pass  # only cloud runners (e.g. EMR, Dataproc) do this

    def _cleanup_hadoop_tmp(self):
        """Cleanup any files/directories on HDFS we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        """
        pass  # only Hadoop runner does this

    def _cleanup_local_tmp(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple
        times.

        This particular function removes the local tmp directory
        stored in self._local_tmp_dir.

        This won't remove output_dir if it's outside of our tmp dir.
        """
        if self._local_tmp_dir:
            log.info('Removing temp directory %s...' % self._local_tmp_dir)
            try:
                rmtree(self._local_tmp_dir)
            except OSError as e:
                log.exception(e)

        self._local_tmp_dir = None

    def _cleanup_cluster(self):
        """Terminate the cluster if there is one."""
        pass  # only cloud runners (e.g. EMR, Dataproc) do this

    def _cleanup_logs(self):
        """Cleanup any log files that are created as a side-effect of the job.
        """
        pass  # this only happens on EMR

    def _cleanup_job(self):
        """Stop any jobs that we created that are still running."""
        pass  # currently disabled (see #1241)

    def cleanup(self, mode=None):
        """Clean up running jobs, temp files, and logs, subject to the
        *cleanup* option passed to the constructor.

        If you create your runner in a ``with`` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...

            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should be
                     a list of strings from
                     :py:data:`~mrjob.options.CLEANUP_CHOICES`
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        if self._script_path and not self._ran_job:
            if mode_has('CLUSTER', 'ALL'):
                self._cleanup_cluster()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
            self._cleanup_cloud_tmp()

        if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
            self._cleanup_hadoop_tmp()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job.
        """
        raise NotImplementedError

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""
        self.cleanup()

    ### more runner information ###

    def get_opts(self):
        """Get options set for this runner, as a dict."""
        log.warning('get_opts() is deprecated and will be removed in v0.7.0')
        return copy.deepcopy(self._opts)

    def get_job_key(self):
        """Get the unique key for the job run by this runner.
        This has the format ``label.owner.date.time.microseconds``
        """
        return self._job_key

    def get_output_dir(self):
        """Find the directory containing the job output. If the job hasn't
        run yet, returns None"""
        if self._script_path and not self._ran_job:
            return None

        return self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
        all.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()"""
        if not self._local_tmp_dir:
            tmp_dir = (self._opts['local_tmp_dir'] or
                       tempfile.gettempdir())

            path = os.path.join(tmp_dir, self._job_key)
            log.info('Creating temp directory %s' % path)
            if os.path.isdir(path):
                rmtree(path)
            os.makedirs(path)
            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job. Optionally,
        you can specify a custom label or owner (otherwise we use
        :py:meth:`_label` and :py:meth:`_owner`).

        We use this to choose the output directory, etc. for the job.
        """
        if label is None:
            label = self._label()

        if owner is None:
            owner = self._owner()

        now = datetime.datetime.utcnow()
        return '%s.%s.%s.%06d' % (
            label, owner,
            now.strftime('%Y%m%d.%H%M%S'), now.microsecond)
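
    # Hedged illustration (not part of mrjob): the key format produced by
    # _make_unique_job_key() above. The label, owner, and timestamp in the
    # usage comment are made up; only the
    # 'label.owner.YYYYMMDD.HHMMSS.microseconds' shape matters.
    @staticmethod
    def _example_job_key(label, owner, now):
        """Format a job key the same way _make_unique_job_key() does."""
        return '%s.%s.%s.%06d' % (
            label, owner, now.strftime('%Y%m%d.%H%M%S'), now.microsecond)

    # e.g. _example_job_key('mr_word_count', 'dave',
    #                       datetime.datetime(2018, 1, 1, 12, 0, 0, 250))
    # -> 'mr_word_count.dave.20180101.120000.000250'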

    def _label(self):
        """Return *label* opt, or if not set, the name of the file
        containing the MRJob, minus extension, or if none, ``'no_script'``"""
        if self._opts['label']:
            return self._opts['label']
        elif self._script_path:
            return os.path.basename(self._script_path).split('.')[0]
        else:
            return 'no_script'

    def _owner(self):
        """Return *owner* opt (which defaults to :py:func:`getpass.getuser`),
        or ``'no_user'`` if not set."""
        if self._opts['owner']:
            # owner opt defaults to getpass.getuser()
            return self._opts['owner']
        else:
            return 'no_user'

    def _get_steps(self):
        """If *steps* was not set at init time, call the job script to
        find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`.
        """
        if self._steps is None:
            log.warning(
                'querying jobs for steps is deprecated and'
                ' will go away in v0.7.0')
            steps = self._load_steps()
            self._check_steps(steps)
            self._steps = steps

        return self._steps

    def _load_steps(self):
        """Ask job how many steps it has, and whether
        there are mappers and reducers for each step.

        Returns output as described in :ref:`steps-format`.

        If this is called, you can assume self._script_path is set.
        """
        raise NotImplementedError

    def _check_steps(self, steps):
        """Look at the step definition (*steps*). If it is not supported by
        the runner, raise :py:class:`NotImplementedError`. If it is not
        supported by mrjob, raise :py:class:`ValueError`.
        """
        if not self._STEP_TYPES:
            # use __class__.__name__ because only MRJobRunner would
            # trigger this
            raise NotImplementedError(
                '%s cannot run steps!' % self.__class__.__name__)

        for step_num, step in enumerate(steps):
            self._check_step(step, step_num)

    def _check_step(self, step, step_num):
        """Raise an exception if the given step is invalid
        (:py:class:`ValueError`) or not handled by this runner
        (:py:class:`NotImplementedError`).

        By default, we check that *step* has a supported step type,
        only uses an input manifest if it's the first step, and that
        :py:attr:`_script_path` exists if necessary. You can re-define
        this in your subclass.
        """
        if step.get('type') not in self._STEP_TYPES:
            raise NotImplementedError(
                'step %d has type %r, but %s runner only supports:'
                ' %s' % (step_num, step.get('type'), self.alias,
                         ', '.join(sorted(self._STEP_TYPES))))

        if step.get('input_manifest') and step_num != 0:
            raise ValueError(
                'step %d may not take an input manifest (only'
                ' first step can)' % step_num)

        # some step types assume a MRJob script
        if not self._script_path:
            if step['type'] == 'spark':
                raise ValueError(
                    "SparkStep (step %d) can't run without a MRJob script"
                    " (try SparkScriptStep instead)" % step_num)

            elif step['type'] == 'streaming':
                for mrc in ('mapper', 'combiner', 'reducer'):
                    if not step.get(mrc):
                        continue

                    substep = step[mrc]
                    if substep['type'] == 'script':
                        raise ValueError(
                            "%s (step %d) can't run without a MRJob"
                            " script" % (mrc, step_num))

    def _get_step(self, step_num):
        """Get a single step (calls :py:meth:`_get_steps`)."""
        return self._get_steps()[step_num]

    def _num_steps(self):
        """Get the number of steps (calls :py:meth:`get_steps`)."""
        return len(self._get_steps())

    def _uses_input_manifest(self):
        """Does the first step take an input manifest?"""
        return bool(self._get_step(0).get('input_manifest'))

    def _has_streaming_steps(self):
        """Are any of our steps Hadoop Streaming steps?"""
        return any(step['type'] == 'streaming'
                   for step in self._get_steps())

    def _has_spark_steps(self):
        """Are any of our steps Spark steps? (e.g. spark, spark_jar,
        spark_script)

        Generally used to determine if we need to install Spark on a cluster.
        """
        return any(_is_spark_step_type(step['type'])
                   for step in self._get_steps())

    def _has_pyspark_steps(self):
        """Do any of our steps involve running Python on Spark?
        Includes spark and spark_script types, but not spark_jar.

        Generally used to tell if we need a Spark setup script.
        """
        return any(_is_pyspark_step_type(step['type'])
                   for step in self._get_steps())

    def _args_for_task(self, step_num, mrc):
        return [
            '--step-num=%d' % step_num,
            '--%s' % mrc,
        ] + self._mr_job_extra_args()
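    # Illustrative note (not in the original source): for step 0's mapper,
    # _args_for_task(0, 'mapper') returns
    #     ['--step-num=0', '--mapper'] + <whatever _mr_job_extra_args() adds>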

    def _mr_job_extra_args(self, local=False):
        """Return arguments to add to every invocation of MRJob.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        """
        result = []

        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if local:
                    result.append(extra_arg['path'])
                else:
                    result.append(self._working_dir_mgr.name(**extra_arg))
            else:
                result.append(extra_arg)

        return result

    def _dir_archive_path(self, dir_path):
        """Assign a path for the archive of *dir_path* but don't
        actually create anything."""
        if dir_path not in self._dir_to_archive_path:
            # we can check local paths now
            if not (is_uri(dir_path) or os.path.isdir(dir_path)):
                raise OSError('%s is not a directory!' % dir_path)

            name = name_uniquely(
                dir_path, names_taken=self._dir_archive_names_taken)
            self._dir_archive_names_taken.add(name)

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]
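    # Illustrative note (not in the original source): a directory like
    # /home/user/wordlists gets an archive path such as
    #     <local tmp dir>/archives/wordlists.tar.gz
    # where name_uniquely() disambiguates colliding basenames.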

    def _create_dir_archives(self):
        """Call this to create all dir archives"""
        for dir_path in sorted(set(self._dir_to_archive_path)):
            self._create_dir_archive(dir_path)

    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`archive_dir`"""
        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):
            os.makedirs(os.path.dirname(tar_gz_path))

        # for remote files
        tmp_download_path = os.path.join(
            self._get_local_tmp_dir(), 'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError(
                        'attempted to archive %s into itself!' % tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' % (
                        path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs.cat(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        self._dir_archives_created.add(tar_gz_path)

    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return self._opts['interpreter'] is None
        else:
            return bool(self._opts['bootstrap_mrjob'])
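    # Illustrative note (not in the original source): when bootstrap_mrjob
    # isn't set explicitly, this returns True exactly when no custom
    # 'interpreter' option is set.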

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be."""
        if self._input_manifest_path:
            return [self._input_manifest_path]

        if '-' in self._input_paths:
            if self._stdin_path is None:
                # prompt user, so they don't think the process has stalled
                log.info('reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test data)
                        if not line.endswith(b'\n'):
                            line += b'\n'
                        stdin_file.write(line)

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p for p in self._input_paths]
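    # Illustrative note (not in the original source): given input paths
    # ['data.txt', '-'], the '-' entry is replaced with the STDIN dump,
    # yielding something like ['data.txt', '<local tmp dir>/STDIN'].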

    def _create_input_manifest_if_needed(self):
        """Create a file with a list of URIs of input files."""
        if self._input_manifest_path or not self._uses_input_manifest():
            return

        uris = []

        log.info('finding input files to add to manifest...')

        for path in self._get_input_paths():
            log.debug('  in %s' % path)
            if is_uri(path):
                # URIs might be globs
                for uri in self.fs.ls(path):
                    uris.append(uri)
            else:
                # local paths are expected to be single files
                # (shell would resolve globs)
                if self._upload_mgr:
                    uris.append(self._upload_mgr.uri(path))
                else:
                    # just make sure job can find files from its working dir
                    uris.append(os.path.abspath(path))

        log.info('found %d input files' % len(uris))

        path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
        self._write_script(uris, path, 'input manifest')

        self._input_manifest_path = path
        if self._upload_mgr:
            self._upload_mgr.add(self._input_manifest_path)

    def _check_input_paths(self):
        """Check that input exists prior to running the job, if the
        `check_input_paths` option is true."""
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            self._check_input_path(path)

    def _check_input_path(self, path):
        """Raise :py:class:`IOError` if the given input does not exist or
        is otherwise invalid. Override this to provide custom check
        behavior."""
        if path == '-':
            return  # STDIN always exists

        if not self.fs.can_handle_path(path):
            return  # no way to check (e.g. non-S3 URIs on EMR)

        if not self.fs.exists(path):
            raise IOError(
                'Input path %s does not exist!' % (path,))

    def _add_input_files_for_upload(self):
        """If there is an upload manager, add input files to it."""
        if self._upload_mgr:
            for path in self._get_input_paths():
                self._upload_mgr.add(path)

    def _intermediate_output_dir(self, step_num, local=False):
        """A directory for intermediate output for the given step number."""
        join = os.path.join if local else posixpath.join

        return join(
            self._step_output_dir or self._default_step_output_dir(),
            '%04d' % step_num)
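    # Illustrative note (not in the original source): step 2's intermediate
    # output would go to something like <step output dir>/0002, joined with
    # posixpath for URIs or os.path when *local* is True.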

    def _default_step_output_dir(self):
        """Where to put output for steps other than the last one,
        if not specified by the *output_dir* constructor keyword.
        Usually you want this to be on HDFS (most efficient).

        Define this in your runner subclass.
        """
        raise NotImplementedError

    def _step_input_uris(self, step_num):
        """A list of URIs to use as input for the given step. For all
        except the first step, this list will have a single item (a
        directory)."""
        if step_num == 0:
            return [self._upload_mgr.uri(path) if self._upload_mgr
                    else to_uri(path)
                    for path in self._get_input_paths()]
        else:
            return [to_uri(self._intermediate_output_dir(step_num - 1))]

    def _step_output_uri(self, step_num):
        """URI to use as output for the given step. This is either an
        intermediate dir (see :py:meth:`_intermediate_output_dir`) or
        ``self._output_dir`` for the final step."""
        if step_num == len(self._get_steps()) - 1:
            return to_uri(self._output_dir)
        else:
            return to_uri(self._intermediate_output_dir(step_num))

    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        Also translate jobconfs to the current Hadoop version, if necessary.
        """

        step = self._get_step(step_num)

        # _sort_values_jobconf() isn't relevant to Spark,
        # but it doesn't do any harm either

        jobconf = combine_jobconfs(self._sort_values_jobconf(),
                                   self._opts['jobconf'],
                                   step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        # and log a warning
        hadoop_version = self.get_hadoop_version()
        if hadoop_version:
            jobconf = translate_jobconf_dict(jobconf, hadoop_version)

        return jobconf

    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        """
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
            else:
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf

    def _sort_values_partitioner(self):
        """Partitioner to use with *sort_values* keyword to the constructor."""
        if self._sort_values:
            return _SORT_VALUES_PARTITIONER
        else:
            return None

    def _upload_args(self):
        # just upload every file and archive in the working dir manager
        return self._upload_args_helper('-files', None, '-archives', None)

    def _upload_args_helper(
            self, files_opt_str, files, archives_opt_str, archives,
            always_use_hash=True):
        args = []

        file_hash_paths = list(
            self._arg_hash_paths('file', files,
                                 always_use_hash=always_use_hash))
        if file_hash_paths:
            args.append(files_opt_str)
            args.append(','.join(file_hash_paths))

        archive_hash_paths = list(
            self._arg_hash_paths('archive', archives,
                                 always_use_hash=always_use_hash))
        if archive_hash_paths:
            args.append(archives_opt_str)
            args.append(','.join(archive_hash_paths))

        return args

    def _arg_hash_paths(self, type, named_paths=None, always_use_hash=True):
        """Helper function for the *upload_args methods."""
        if named_paths is None:
            # just return everything managed by _working_dir_mgr
            named_paths = sorted(
                self._working_dir_mgr.name_to_path(type).items())

        for name, path in named_paths:
            if not name:
                name = self._working_dir_mgr.name(type, path)

            if self._upload_mgr:
                uri = self._upload_mgr.uri(path)
            else:
                uri = path

            if not always_use_hash and _basename(uri) == name:
                yield uri
            else:
                yield '%s#%s' % (uri, name)
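    # Illustrative note (not in the original source): this yields values of
    # the form 'uri#name_in_working_dir', e.g.
    #     's3://bucket/files/helper.py#helper.py'
    # omitting the '#name' suffix only when always_use_hash is False and the
    # basename already matches the working-dir name.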

    def _write_script(self, lines, path, description):
        """Write text of a setup script, input manifest, etc. to the given
        file.

        By default, this writes binary data. Redefine
        :py:meth:`_write_script_lines` to use other line endings.

        :param lines: a list of lines as ``str``
        :param path: path of file to write to
        :param description: what we're writing to, for debug messages
        """
        log.debug('Writing %s to %s:' % (description, path))
        for line in lines:
            log.debug('  ' + line)

        self._write_script_lines(lines, path)

    def _write_script_lines(self, lines, path):
        """Write text to the given file. By default, this writes
        binary data, but can be redefined to use local line endings."""
        with open(path, 'wb') as f:
            for line in lines:
                f.write((line + '\n').encode('utf-8'))
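
    # A minimal sketch (not from the original source) of redefining
    # _write_script_lines() in a runner subclass to use the platform's
    # native line endings (text mode translates '\n' on write):
    #
    #     def _write_script_lines(self, lines, path):
    #         with open(path, 'w') as f:
    #             for line in lines:
    #                 f.write(line + '\n')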
Example #16
0
class SparkMRJobRunner(MRJobBinRunner):
    """Runs a :py:class:`~mrjob.job.MRJob` on your Spark cluster (with or
    without Hadoop). Invoked when you run your job with ``-r spark``.
    """
    alias = 'spark'

    # other than ``spark_*``, these options are only used for filesystems
    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {
        'aws_access_key_id',
        'aws_secret_access_key',
        'aws_session_token',
        'cloud_fs_sync_secs',
        'cloud_part_size_mb',
        'google_project_id',  # used by GCS filesystem
        'hadoop_bin',
        's3_endpoint',
        's3_region',  # only used along with s3_endpoint
        'spark_deploy_mode',
        'spark_master',
        'spark_tmp_dir',  # where to put temp files in Spark
    }

    # everything except Hadoop JARs
    # streaming jobs will be run using mrjob_spark_harness.py (see #1972)
    _STEP_TYPES = {
        'spark', 'spark_jar', 'spark_script',  # 'streaming',
    }

    def __init__(self, **kwargs):
        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = posixpath.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

    def _default_opts(self):
        return combine_dicts(
            super(SparkMRJobRunner, self)._default_opts(),
            dict(
                spark_master='local[*]',
                spark_deploy_mode='client',
            )
        )

    def _run(self):
        self.get_spark_submit_bin()  # find spark-submit up front
        self._create_setup_wrapper_scripts()
        self._add_job_files_for_upload()
        self._upload_local_files()
        self._run_steps_on_spark()

    def _add_job_files_for_upload(self):
        """Add files needed for running the job (setup and input)
        to self._upload_mgr."""
        if self._upload_mgr:
            for path in self._working_dir_mgr.paths():
                self._upload_mgr.add(path)

        # no need to upload py_files, spark-submit handles this

    def _pick_spark_tmp_dir(self):
        if self._opts['spark_tmp_dir']:
            if is_uri(self._opts['spark_tmp_dir']):
                return posixpath.join(
                    self._opts['spark_tmp_dir'], self._job_key)
            else:
                return os.path.join(
                    self._opts['spark_tmp_dir'], self._job_key)
        elif self._spark_master_is_local():
            # need a local temp dir
            # add "-spark" so we don't collide with default local temp dir
            return os.path.join(
                gettempdir(), self._job_key + '-spark')
        else:
            # use HDFS (same default as HadoopJobRunner)
            return posixpath.join(
                fully_qualify_hdfs_path('tmp/mrjob'), self._job_key)

    def _default_step_output_dir(self):
        return posixpath.join(self._spark_tmp_dir, 'step-output')

    @property
    def fs(self):
        # Spark supports basically every filesystem there is

        if not self._fs:
            self._fs = CompositeFilesystem()

            if boto3_installed:
                self._fs.add_fs('s3', S3Filesystem(
                    aws_access_key_id=self._opts['aws_access_key_id'],
                    aws_secret_access_key=self._opts['aws_secret_access_key'],
                    aws_session_token=self._opts['aws_session_token'],
                    s3_endpoint=self._opts['s3_endpoint'],
                    s3_region=self._opts['s3_region'],
                ), disable_if=_is_permanent_boto3_error)

            if google_libs_installed:
                self._fs.add_fs('gcs', GCSFilesystem(
                    project_id=self._opts['google_project_id']
                ), disable_if=_is_permanent_google_error)

            self._fs.add_fs('hadoop', HadoopFilesystem(
                self._opts['hadoop_bin']))

            self._fs.add_fs('local', LocalFilesystem())

        return self._fs

    def _upload_local_files(self):
        # in local mode, nothing to upload
        if not self._upload_mgr:
            return

        self.fs.mkdir(self._upload_mgr.prefix)

        log.info('Copying local files to %s' % self._upload_mgr.prefix)
        for src_path, uri in self._upload_mgr.path_to_uri().items():
            log.debug('  %s -> %s' % (src_path, uri))
            self.fs.put(src_path, uri)

    def _run_steps_on_spark(self):
        for step_num, step in enumerate(self._get_steps()):
            self._run_step_on_spark(step, step_num)

    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
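

# Usage sketch (not part of the original mrjob source). MRWordCount and
# mr_word_count.py are hypothetical stand-ins for your own job module;
# '-r spark' selects this runner, as documented on the class above.
if __name__ == '__main__':
    from mr_word_count import MRWordCount  # hypothetical job class

    job = MRWordCount(['-r', 'spark', 'input.txt'])
    with job.make_runner() as runner:
        runner.run()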