Example #1
0
    def test_dot_underscore(self):
        sd = UploadDirManager('hdfs:///')

        sd.add('._')
        sd.add('._.txt')
        sd.add('._foo')

        self.assertEqual(sd.path_to_uri(), {
            '._': 'hdfs:///1',
            '._.txt': 'hdfs:///1.txt',
            '._foo': 'hdfs:///foo'
        })
Example #2
0
    def __init__(self, **kwargs):
        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = posixpath.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']
Example #3
0
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.hadoop.HadoopJobRunner` takes the same arguments
        as :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(HadoopJobRunner, self).__init__(**kwargs)

        if self._opts['hadoop_home']:
            log.warning(
                'hadoop_home is deprecated since 0.5.0 and will be removed'
                ' in v0.6.0. In most cases, mrjob will now find the hadoop'
                ' binary and streaming jar without help. If not, use the'
                ' hadoop_bin and hadoop_streaming_jar options.')

        self._hadoop_tmp_dir = fully_qualify_hdfs_path(
            posixpath.join(self._opts['hadoop_tmp_dir'], self._job_key))

        # Keep track of local files to upload to HDFS. We'll add them
        # to this manager just before we need them.
        hdfs_files_dir = posixpath.join(self._hadoop_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(hdfs_files_dir)

        # Set output dir if it wasn't set explicitly
        self._output_dir = fully_qualify_hdfs_path(
            self._output_dir or posixpath.join(self._hadoop_tmp_dir, 'output'))

        # Fully qualify step_output_dir, if set
        if self._step_output_dir:
            self._step_output_dir = fully_qualify_hdfs_path(
                self._step_output_dir)

        # Track job and (YARN) application ID to enable log parsing
        self._application_id = None
        self._job_id = None

        # Keep track of where the hadoop streaming jar is
        self._hadoop_streaming_jar = self._opts['hadoop_streaming_jar']
        self._searched_for_hadoop_streaming_jar = False

        # Keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # List of dicts (one for each step) potentially containing
        # the keys 'history', 'step', and 'task' ('step' will always
        # be filled because it comes from the hadoop jar command output,
        # others will be filled as needed)
        self._log_interpretations = []
Example #4
0
    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a spark runner

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark

        SparkMRJobRunner ignores the keyword arguments *hadoop_input_format*,
        *hadoop_output_format*, and *sort_values* (see
        :py:meth:`MRJobRunner.__init__`). These are only set by the job as a
        way to communicate certain attributes to the runner, and the Spark
        runner instead inspects the job directly.
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []
Example #5
0
    def __init__(self, max_output_files=None, mrjob_cls=None, **kwargs):
        """Create a Spark runner.

        :param max_output_files: limit on number of output files when
                                 running streaming jobs. Can only be
                                 set on command line (not config file)
        :param mrjob_cls: class of the job you want to run. Used for
                          running streaming steps in Spark
        """
        # need to set this before checking steps in superclass __init__()
        self._mrjob_cls = mrjob_cls

        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._max_output_files = max_output_files

        if self._opts['spark_tmp_dir']:
            self._check_spark_tmp_dir_opt()

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = self.fs.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']

        # where to store a .zip file containing the MRJob, with a unique
        # module name
        self._job_script_zip_path = None

        # counters, one per job step. (Counters will be {} for non-streaming
        # steps because Spark doesn't have counters).
        self._counters = []
Example #6
0
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError('You must install google-cloud-logging and '
                              'google-cloud-storage to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException('Dataproc expects at LEAST %d workers' %
                                    _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # see #1820
        if self._opts['image_id']:
            log.warning('mrjob does not yet support custom machine images'
                        ' on Dataproc')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=[_FULL_SCOPE])  # needed for $GOOGLE_APPLICATION_CREDENTIALS

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        if self._opts['service_account_scopes']:
            self._opts['service_account_scopes'] = [
                _fully_qualify_scope_uri(s)
                for s in self._opts['service_account_scopes']
            ]

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # map driver_output_uri to a dict with the keys:
        # log_uri: uri of file we're reading from
        # pos: position in file
        # buffer: bytes read from file already
        self._driver_output_state = {}

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []
Example #7
0
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (self._opts['gcp_project']
                             or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (self._opts['region']
                            or self.gcloud_config()['compute.region'])
        self._gce_zone = (self._opts['zone']
                          or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage working dir for bootstrap script
        self._bootstrap_dir_mgr = BootstrapWorkingDirManager()

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        for cmd in self._bootstrap:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    self._bootstrap_dir_mgr.add(**maybe_path_dict)

        # we'll create the script later
        self._master_bootstrap_script_path = None

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []
Example #8
0
 def uri_adds_trailing_slash(self):
     sd = UploadDirManager('s3://bucket/dir')
     sd.add('foo/bar.py')
     self.assertEqual(sd.uri('foo/bar.py'), 's3://bucket/dir/bar.py')
     self.assertEqual(sd.path_to_uri(),
                      {'foo/bar.py': 's3://bucket/dir/bar.py'})
Example #9
0
 def test_uri(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.uri('foo/bar.py'), 'hdfs:///bar.py')
Example #10
0
 def test_add_is_idempotent(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #11
0
 def test_simple(self):
     sd = UploadDirManager('hdfs:///')
     sd.add('foo/bar.py')
     self.assertEqual(sd.path_to_uri(), {'foo/bar.py': 'hdfs:///bar.py'})
Example #12
0
 def test_empty(self):
     sd = UploadDirManager('hdfs:///')
     self.assertEqual(sd.path_to_uri(), {})
Example #13
0
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException(
                'Dataproc expects at LEAST %d workers' % _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # Lazy-load gcloud config as needed - invocations fail in PyCharm
        # debugging
        self._gcloud_config = None

        # Google Cloud Platform - project
        self._gcp_project = (
            self._opts['gcp_project'] or self.gcloud_config()['core.project'])

        # Google Compute Engine - Region / Zone
        self._gce_region = (
            self._opts['region'] or self.gcloud_config()['compute.region'])
        self._gce_zone = (
            self._opts['zone'] or self.gcloud_config()['compute.zone'])

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []
Example #14
0
    def __init__(self, **kwargs):
        """:py:class:`~mrjob.dataproc.DataprocJobRunner` takes the same
        arguments as
        :py:class:`~mrjob.runner.MRJobRunner`, plus some additional options
        which can be defaulted in :ref:`mrjob.conf <mrjob.conf>`.
        """
        super(DataprocJobRunner, self).__init__(**kwargs)

        # check for library support
        if google is None:
            raise ImportError(
                'You must install google-cloud and google-cloud-dataproc'
                ' to connect to Dataproc')

        # Dataproc requires a master and >= 2 core instances
        # num_core_instances refers ONLY to number of CORE instances and does
        # NOT include the required 1 instance for master
        # In other words, minimum cluster size is 3 machines, 1 master and 2
        # "num_core_instances" workers
        if self._opts['num_core_instances'] < _DATAPROC_MIN_WORKERS:
            raise DataprocException('Dataproc expects at LEAST %d workers' %
                                    _DATAPROC_MIN_WORKERS)

        if (self._opts['core_instance_type'] !=
                self._opts['task_instance_type']):
            raise DataprocException(
                'Dataproc v1 expects core/task instance types to be identical')

        # load credentials and project ID
        self._credentials, auth_project_id = google.auth.default(
            scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES)

        self._project_id = self._opts['project_id'] or auth_project_id

        if not self._project_id:
            raise DataprocException(
                'project_id must be set. Use --project_id or'
                ' set $GOOGLE_CLOUD_PROJECT')

        self._fix_zone_and_region_opts()

        # cluster_id can be None here
        self._cluster_id = self._opts['cluster_id']

        self._api_client = None
        self._gcs_fs = None
        self._fs = None

        # BEGIN - setup directories
        base_tmpdir = self._get_tmpdir(self._opts['cloud_tmp_dir'])

        self._cloud_tmp_dir = _check_and_fix_fs_dir(base_tmpdir)

        # use job key to make a unique tmp dir
        self._job_tmpdir = self._cloud_tmp_dir + self._job_key + '/'

        # pick/validate output dir
        if self._output_dir:
            self._output_dir = _check_and_fix_fs_dir(self._output_dir)
        else:
            self._output_dir = self._job_tmpdir + 'output/'
        # END - setup directories

        # manage local files that we want to upload to GCS. We'll add them
        # to this manager just before we need them.
        fs_files_dir = self._job_tmpdir + 'files/'
        self._upload_mgr = UploadDirManager(fs_files_dir)

        # when did our particular task start?
        self._dataproc_job_start = None

        # init hadoop, ami version caches
        self._image_version = None
        self._hadoop_version = None

        # This will be filled by _run_steps()
        # NOTE - log_interpretations will be empty except job_id until we
        # parse task logs
        self._log_interpretations = []