Esempio n. 1
    def __init__(self, **kwargs):
        """Set up cluster and bootstrap state on top of the parent runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None
Esempio n. 2
 def test_cant_give_same_path_different_types(self):
     # NOTE(review): only the setup is present here; no assertion follows,
     # so the conflict implied by the test name is never actually checked.
     manager = WorkingDirManager()
     manager.add('archive', 'foo/', name='')
Esempio n. 3
 def test_simple(self):
     wd = WorkingDirManager()
     wd.add('archive', 's3://bucket/path/to/baz.tar.gz')
     wd.add('file', 'foo/')
     self.assertEqual(wd.name_to_path('file'), {'': 'foo/'})
                      {'baz.tar.gz': 's3://bucket/path/to/baz.tar.gz'})
                      set(['foo/', 's3://bucket/path/to/baz.tar.gz']))
Esempio n. 4
    def test_allow_hidden_files(self):
        wd = WorkingDirManager()
        wd.add('archive', '_foo.tar.gz')
        wd.add('file', '.bazrc')

        self.assertEqual('archive', '_foo.tar.gz'), '_foo.tar.gz')
        self.assertEqual('file', '.bazrc'), '.bazrc')
Esempio n. 5
 def test_auto_names_are_different_from_assigned_names(self):
     wd = WorkingDirManager()
     wd.add('file', 'foo/', name='')
     wd.add('file', 'foo/')  # use default name
                      {'': 'foo/',
                       '': 'foo/'})
     self.assertEqual(wd.paths(), set(['foo/']))
Esempio n. 6
 def test_empty(self):
     """A manager with nothing added reports no names and no paths."""
     manager = WorkingDirManager()

     # query with each type, then with no type argument at all
     for type_args in (('archive',), ('file',), ()):
         self.assertEqual(manager.name_to_path(*type_args), {})

     # same check for paths(): no type argument first, then each type
     for type_args in ((), ('archive',), ('file',)):
         self.assertEqual(manager.paths(*type_args), set())
Esempio n. 7
    def __init__(self, **kwargs):
        """Set up cluster and bootstrap state on top of the parent runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None
Esempio n. 8
    def test_allow_hidden_files(self):
        wd = WorkingDirManager()
        wd.add('archive', '_foo.tar.gz')
        wd.add('file', '.bazrc')

        self.assertEqual('archive', '_foo.tar.gz'), '_foo.tar.gz')
        self.assertEqual('file', '.bazrc'), '.bazrc')
Esempio n. 9
 def test_simple(self):
     wd = WorkingDirManager()
     wd.add('archive', 's3://bucket/path/to/baz.tar.gz')
     wd.add('file', 'foo/')
                      {'': 'foo/'})
                      {'baz.tar.gz': 's3://bucket/path/to/baz.tar.gz'})
         set(['foo/', 's3://bucket/path/to/baz.tar.gz']))
Esempio n. 10
 def test_lazy_naming(self):
     wd = WorkingDirManager()
     wd.add('file', '')  # by default
     wd.add('file', '', name='')
     self.assertEqual(wd.name_to_path('file'), {
         '': '',
         '': ''
Esempio n. 11
 def test_auto_names_are_different_from_assigned_names(self):
     wd = WorkingDirManager()
     wd.add('file', 'foo/', name='')
     wd.add('file', 'foo/')  # use default name
                      {'': 'foo/',
                       '': 'foo/'})
     self.assertEqual(wd.paths(), set(['foo/']))
Esempio n. 12
    def test_archive_file_suffix(self):
        wd = WorkingDirManager(archive_file_suffix='.file')

        wd.add('archive', 'foo.tar.gz')

        self.assertEqual('archive', 'foo.tar.gz'), 'foo')
        self.assertEqual('archive_file', 'foo.tar.gz'),
Esempio n. 13
    def __init__(self, **kwargs):
        """Set up cluster, bootstrap and SSH-tunnel state for the runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

        # ssh state

        # the process for the SSH tunnel
        self._ssh_proc = None

        # if this is true, stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # store the (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None
Esempio n. 14
    def __init__(self, **kwargs):
        """Set up cluster, bootstrap and SSH-tunnel state for the runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

        # ssh state

        # the process for the SSH tunnel
        self._ssh_proc = None

        # if this is true, stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # store the (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None
Esempio n. 15
 def test_auto_names_are_different_from_assigned_names(self):
     manager = WorkingDirManager()
     manager.add("file", "foo/", name="")
     manager.add("file", "foo/")  # use default name

     # NOTE(review): this literal repeats the key "", so it holds a single
     # entry; presumably two distinct names were intended -- confirm.
     expected = {"": "foo/", "": "foo/"}
     self.assertEqual(manager.name_to_path("file"), expected)
Esempio n. 16
 def test_okay_to_give_same_path_same_name(self):
     """Repeating an identical add() call leaves a single name mapping."""
     manager = WorkingDirManager()

     # register the very same (type, path, name) twice
     for _ in range(2):
         manager.add("file", "foo/", name="")

     self.assertEqual(manager.name_to_path("file"), {"": "foo/"})
Esempio n. 17
 def test_explicit_name_collision(self):
     """The second add() with an explicit name is expected to raise."""
     manager = WorkingDirManager()
     manager.add("file", "", name="")

     # NOTE(review): both calls pass identical empty-string arguments here;
     # presumably the paths were meant to differ -- confirm.
     with self.assertRaises(ValueError):
         manager.add("file", "", name="")
Esempio n. 18
class HadoopInTheCloudJobRunner(MRJobBinRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {

    # so far, every service provides the ability to run bootstrap scripts

    def __init__(self, **kwargs):
        """Set up cluster and bootstrap state on top of the parent runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

    ### Options ###

    def _fix_opts(self, opts, source=None):
        opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
            opts, source=source)

        # patch max_hours_idle into max_mins_idle (see #1663)
        if opts.get('max_hours_idle') is not None:
                'max_hours_idle is deprecated and will be removed in v0.7.0.' +
                (' Please use max_mins_idle instead'
                 if opts.get('max_mins_idle') is None else ''))

        if opts.get('max_mins_idle') is None:
            if opts.get('max_hours_idle') is not None:
                opts['max_mins_idle'] = opts['max_hours_idle'] * 60
                opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE

        # warn about issues with
        if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE:
            log.warning('Setting max_mins_idle to less than %.1f may result'
                        ' in cluster shutting down before job can run' %

        return opts

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        Also propagate core instance type to task instance type, if it's
        not already set.
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides core_instance_type
            # set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
                        opt_priority[k] = i

            # instance_type only affects master_instance_type if there are
            # no other instances
            if opts['num_core_instances'] or opts['num_task_instances']:
                propagate_to = ['core_instance_type', 'task_instance_type']
                propagate_to = ['master_instance_type']

            for k in propagate_to:
                if opts[k] is None or (
                        opt_priority[k] < opt_priority['instance_type']):
                    opts[k] = opts['instance_type']

        if not opts['task_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

        return opts

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Redefine this to return a (possibly empty) list of parsed commands
        (in the same format as returned by parse_setup_cmd())' to make sure a
        compatible version of Python is installed

        If the *bootstrap_python* option is false, should always return ``[]``.
        return []

    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory
        (usually via Hadoop). Redefine this as needed; for example, on EMR,
        we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
        installed at bootstrap time."""
        return 'hadoop fs -copyToLocal'

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:

        # don't bother if we're not starting a cluster
        if self._cluster_id:

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):

        # create if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file', 'name': None, 'path': self._mrjob_zip_path}

            # find out where python keeps its libraries
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" %

            # remove anything that might be in the way (see #1567)
            mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob'])

            # unzip
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
                ['sudo %s -m compileall -q'
                 ' -f $__mrjob_PYTHON_LIB/mrjob && true' %

        path = os.path.join(self._get_local_tmp_dir(), '')'writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(
            self._bootstrap + mrjob_bootstrap)
        for line in contents:
            log.debug('BOOTSTRAP: ' + line)

        with open(path, 'wb') as f:
            for line in contents:
                f.write(line.encode('utf-8') + b'\n')

        self._master_bootstrap_script_path = path

    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)
        out = []

        # shebang, precommands

        # store $PWD
        out.append('# store $PWD')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370

        # download files
        out.append('  # download files and mark them executable')

        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

        # download and unarchive archives
        archive_names_and_paths = sorted(
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                out.append('  %s %s %s' % (
                    cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                out.append('  ' + unarchive_cmd % dict(
                    dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                    '  chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))


        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(**token))
                    # it's raw script
                    line += token

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

    def _start_of_sh_script(self):
        """Return the opening lines of the shell script as a list of
        strings (no trailing newlines).

        NOTE(review): only the shebang line is produced here; nothing is
        appended for pre-commands.
        """
        interpreter = self._sh_bin()

        # a non-absolute interpreter is launched through env
        if not interpreter[0].startswith('/'):
            interpreter = ['/usr/bin/env'] + interpreter

        # hook for 'set -e', etc. (see #1549)

        return ['#!' + cmd_line(interpreter)]

    ### Launching Clusters ###

    def _add_extra_cluster_params(self, params):
        """Return a dict with the *extra_cluster_params* opt patched into
        *params*, and ``None`` values removed."""
        params = params.copy()
        params = {k: v for k, v in params.items() if v is not None}

        return params
Esempio n. 19
class HadoopInTheCloudJobRunner(MRJobRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobRunner.OPT_NAMES | {

    # so far, every service provides the ability to run bootstrap scripts

    def __init__(self, **kwargs):
        """Set up cluster and bootstrap state on top of the parent runner.

        All keyword arguments are forwarded unchanged to the superclass
        constructor. Path-dict tokens found in the bootstrap commands are
        modified in place ('dir' tokens become 'archive' tokens) and then
        registered with ``self._bootstrap_dir_mgr``.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if not isinstance(token, dict):
                    # not a path dict; nothing to register
                    continue

                # convert dir archive tokens to archives
                if token['type'] == 'dir':
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                # actually register the path; without this the manager
                # created above would stay empty
                self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

    ### Options ###

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        Also propagate core instance type to task instance type, if it's
        not already set.
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides core_instance_type
            # set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
                        opt_priority[k] = i

            # instance_type only affects master_instance_type if there are
            # no other instances
            if opts['num_core_instances'] or opts['num_task_instances']:
                propagate_to = ['core_instance_type', 'task_instance_type']
                propagate_to = ['master_instance_type']

            for k in propagate_to:
                if opts[k] is None or (opt_priority[k] <
                    opts[k] = opts['instance_type']

        if not opts['task_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

        return opts

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Redefine this to return a (possibly empty) list of parsed commands
        (in the same format as returned by parse_setup_cmd())' to make sure a
        compatible version of Python is installed

        If the *bootstrap_python* option is false, should always return ``[]``.
        return []

    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory
        (usually via Hadoop). Redefine this as needed; for example, on EMR,
        we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
        installed at bootstrap time."""
        return 'hadoop fs -copyToLocal'

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:

        # don't bother if we're not starting a cluster
        if self._cluster_id:

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):

        # create if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file',
                'name': None,
                'path': self._mrjob_zip_path

            # find out where python keeps its libraries
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" % cmd_line(self._python_bin())

            # remove anything that might be in the way (see #1567)
            mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob'])

            # unzip
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
                'sudo %s -m compileall -q'
                ' -f $__mrjob_PYTHON_LIB/mrjob && true' %

        path = os.path.join(self._get_local_tmp_dir(), '')'writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(self._bootstrap +
        for line in contents:
            log.debug('BOOTSTRAP: ' + line)

        with open(path, 'wb') as f:
            for line in contents:
                f.write(line.encode('utf-8') + b'\n')

        self._master_bootstrap_script_path = path

    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)
        out = []

        # shebang, precommands

        # store $PWD
        out.append('# store $PWD')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370

        # download files
        out.append('  # download files and mark them executable')

        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

        # download and unarchive archives
        archive_names_and_paths = sorted(
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                    '  %s %s %s' %
                    (cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                out.append('  ' + unarchive_cmd %
                                dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                out.append('  chmod u+rx -R $__mrjob_PWD/%s' %


        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(**token))
                    # it's raw script
                    line += token

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

    def _start_of_sh_script(self):
        """Return the opening lines of the shell script as a list of
        strings (no trailing newlines).

        NOTE(review): only the shebang line is produced here; nothing is
        appended for pre-commands.
        """
        interpreter = self._sh_bin()

        # a non-absolute interpreter is launched through env
        if not interpreter[0].startswith('/'):
            interpreter = ['/usr/bin/env'] + interpreter

        # hook for 'set -e', etc. (see #1549)

        return ['#!' + cmd_line(interpreter)]
Esempio n. 20
 def test_cant_name_unknown_paths(self):
     wd = WorkingDirManager()
     self.assertRaises(ValueError,, 'file', '')
     self.assertRaises(ValueError,, 'file', '', name='')
Esempio n. 21
 def test_eager_naming(self):
     wd = WorkingDirManager()
     wd.add("file", "")  # by default
     self.assertEqual("file", ""), "")
     # whoops, picked that name too soon!
     self.assertRaises(ValueError, wd.add, "file", "", name="")
Esempio n. 22
 def test_explicit_name_collision(self):
     """The second add() with an explicit name is expected to raise."""
     manager = WorkingDirManager()
     manager.add('file', '', name='')

     # NOTE(review): both calls pass identical empty-string arguments here;
     # presumably the paths were meant to differ -- confirm.
     with self.assertRaises(ValueError):
         manager.add('file', '', name='')
Esempio n. 23
 def test_eager_naming(self):
     wd = WorkingDirManager()
     wd.add('file', '')  # by default
     self.assertEqual('file', ''), '')
     # whoops, picked that name too soon!
     self.assertRaises(ValueError, wd.add, 'file', '', name='')
Esempio n. 24
class MRJobRunner(object):
    """Abstract base class for all runners"""

    # this class handles the basic runner framework, options and config files,
    # arguments to mrjobs, and setting up job working dirs and environments.
    # this will put files from setup scripts, py_files, and bootstrap_mrjob
    # into the job's working dir, but won't actually run/import them
    # command lines to run substeps (including Spark) are handled by
    # mrjob.bin.MRJobBinRunner

    #: alias for this runner; used for picking section of
    #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``,
    #: or ``'hadoop'``
    alias = None

    # libjars is only here because the job can set it; might want to
    # handle this with a warning from the launcher instead
    OPT_NAMES = {

    # if this is true, when bootstrap_mrjob is true, add it through the
    # setup script

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 sort_values=None, stdin=None, step_output_dir=None,
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers to receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        self._ran_job = False

        # opts are made from:
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]

        log.debug('Active configuration:')
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
                'archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'


        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Options ####

    def _default_opts(self):
            owner = getpass.getuser()
            owner = None

        return dict(

    def _combine_confs(self, source_and_opt_list):
        """Combine several opt dictionaries into one.

        *source_and_opt_list* is a list of tuples of *source*,
        *opts* where *opts* is a dictionary and *source* is either
        None or a description of where the opts came from (usually a path).

        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one.

        *opt_list* is a list of dictionaries containing validated
        options. They are merged by ``combine_opts()`` using the
        combiners from :py:meth:`_opt_combiners`.

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc.
        about the options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner function, built
        from ``self.OPT_NAMES``.

        This won't necessarily include every opt name.
        NOTE(review): opts without an entry presumably fall back to a
        default combiner inside ``combine_opts()`` -- confirm there.
        """
        return _combiners(self.OPT_NAMES)

    def _fix_opts(self, opts, source=None):
        """Take an options dictionary, and either return a sanitized
        version of it, or raise an exception.

        *source* is either a string describing where the opts came from
        or None.

        This ensures that opt dictionaries are really dictionaries
        and handles deprecated options: a deprecated name is rewritten
        to its current name (with a warning), unless the current name is
        also set in *opts*, in which case the deprecated one is dropped.
        Opts not in ``self.OPT_NAMES`` are dropped with a warning.

        :return: a new dict containing only known opt names
        :raises TypeError: if *opts* is not a dict
        """
        if source is None:
            source = 'defaults'  # defaults shouldn't trigger warnings

        if not isinstance(opts, dict):
            raise TypeError(
                'options for %s (from %s) must be a dict' %
                (self.alias, source))

        # used below as a mapping from deprecated opt name to current name
        deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)

        results = {}

        for k, v in sorted(opts.items()):

            # rewrite deprecated aliases
            if k in deprecated_aliases:
                if v is None:  # don't care
                    continue

                # the current name of *this* opt, not the whole mapping
                aliased_opt = deprecated_aliases[k]

                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' % (
                                k, source, aliased_opt))

                if opts.get(aliased_opt) is not None:
                    # skip only this opt; returning here would throw away
                    # *results* and hand None back to the caller
                    continue  # don't overwrite non-aliased opt

                k = aliased_opt

            if k in self.OPT_NAMES:
                results[k] = None if v is None else self._fix_opt(k, v, source)
            else:
                log.warning('Unexpected option %s (from %s)' % (k, source))

        return results

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix a single option, returning its correct value or raising
        an exception. This is not called for options that are ``None``.

        This currently handles cleanup opts.

        Override this if you require additional opt validation or cleanup.
        if opt_key in ('cleanup', 'cleanup_on_failure'):
            return self._fix_cleanup_opt(opt_key, opt_value, source)
            return opt_value

    def _fix_cleanup_opt(self, opt_key, opt_value, source):
        """Fix a cleanup option, or raise ValueError.

        A single string is wrapped in a list. Returns the (possibly
        wrapped) value once every entry has been checked against
        ``CLEANUP_CHOICES``.

        :raises ValueError: if ``'NONE'`` is combined with another
                            choice, or an entry isn't in
                            ``CLEANUP_CHOICES``
        """
        if isinstance(opt_value, string_types):
            opt_value = [opt_value]

        if 'NONE' in opt_value and len(set(opt_value)) > 1:
            raise ValueError(
                'Cannot clean up both nothing and something!'
                ' (%s option from %s)' % (opt_key, source))

        for cleanup_type in opt_value:
            if cleanup_type not in CLEANUP_CHOICES:
                raise ValueError(
                    '%s must be one of %s, not %s (from %s)' % (
                        opt_key, ', '.join(CLEANUP_CHOICES), opt_value,
                        source))

        return opt_value

    def _obfuscate_opt(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to obfuscate
        credentials, etc."""
        return opt_value

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem.

        Created on first access and cached in ``self._fs``. This is a
        property because the rest of the runner uses it as an attribute
        (e.g. ``self.fs.exists(path)``).
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem(LocalFilesystem())
        return self._fs

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise :py:class:`~mrjob.step.StepFailedException` if there
        are any problems (except on
        :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the
        actual exception that caused the step to fail).

        :raises AssertionError: if there is no job script, or if the job
                                has already been run
        """
        if not self._script_path:
            raise AssertionError("No script to run!")

        if self._ran_job:
            raise AssertionError("Job already ran!")

        # only mark the job as run once _run() returns, so that a failed
        # run is still cleaned up according to cleanup_on_failure
        self._run()
        self._ran_job = True

    def cat_output(self):
        """Stream the jobs output, as a stream of ``bytes``. If there are
        multiple output files, there will be an empty bytestring
        (``b''``) between them.

        .. versionadded:: 0.6.0

           In previous versions, you'd use :py:meth:`stream_output`.
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise AssertionError('Run the job before streaming output')

        if self._closed is True:
                'WARNING! Trying to stream output from a closed runner, output'
                ' will probably be empty.')'Streaming final output from %s...' % output_dir)

        def split_path(path):
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:

                yield name

                path = base

        def ls_output():
            for filename in
                subpath = filename[len(output_dir):]
                if not (any(name.startswith('_')
                            for name in split_path(subpath))):
                    yield filename

        for i, filename in enumerate(ls_output()):
            if i > 0:
                yield b''  # EOF of previous file

            for chunk in self.fs._cat_file(filename):
                yield chunk

    def stream_output(self):
        """Like :py:meth:`cat_output` except that it groups bytes into
        lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``.

        .. deprecated:: 0.6.0
        """
        log.warning('stream_output() is deprecated and will be removed in'
                    ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                    ' instead.')

        return to_lines(self.cat_output())

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
            return mode or self._opts['cleanup']

    def _cleanup_cloud_tmp(self):
        """Cleanup any files/directories on cloud storage (e.g. S3) we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        pass  # only EMR runner does this

    def _cleanup_hadoop_tmp(self):
        """Cleanup any files/directories on HDFS we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        pass  # only Hadoop runner does this

    def _cleanup_local_tmp(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple

        This particular function removes any local tmp directories
        added to the list self._local_tmp_dirs

        This won't remove output_dir if it's outside of our tmp dir.
        if self._local_tmp_dir:
  'Removing temp directory %s...' % self._local_tmp_dir)
            except OSError as e:

        self._local_tmp_dir = None

    def _cleanup_cluster(self):
        """Terminate the cluster if there is one."""
        pass  # this only happens on EMR

    def _cleanup_logs(self):
        """Cleanup any log files that are created as a side-effect of the job.
        pass  # this only happens on EMR

    def _cleanup_job(self):
        """Stop any jobs that we created that are still running."""
        pass  # currently disabled (see #1241)

    def cleanup(self, mode=None):
        """Clean up running jobs, temp files, and logs, subject to the
        *cleanup* option passed to the constructor.

        If you create your runner in a :keyword:`with` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...

            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should be
                     a list of strings from :py:data:`CLEANUP_CHOICES`

        Marks the runner as closed (``self._closed``) when done.
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        # the cluster and job are only cleaned up if the job didn't finish
        if self._script_path and not self._ran_job:
            if mode_has('CLUSTER', 'ALL'):
                self._cleanup_cluster()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
            self._cleanup_cloud_tmp()

        if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
            self._cleanup_hadoop_tmp()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job.

        Not implemented in this base class.
        """
        raise NotImplementedError

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""

    ### more runner information ###

    def get_opts(self):
        """Return a deep copy of this runner's options dict.

        Deprecated: logs a warning every time it is called.
        """
        log.warning('get_opts() is deprecated and will be removed in v0.7.0')
        opts_snapshot = copy.deepcopy(self._opts)
        return opts_snapshot

    def get_job_key(self):
        """Get the unique key for the job run by this runner.
        This has the format ``label.owner.date.time.microseconds``
        (see :py:meth:`_make_unique_job_key`).
        """
        return self._job_key

    def get_output_dir(self):
        """Return the directory holding the job's output.

        Returns None while there is a job script that hasn't been run
        yet; otherwise returns ``self._output_dir``.
        """
        still_pending = self._script_path and not self._ran_job
        return None if still_pending else self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
        all.

        This base implementation always returns None.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()"""
        if not self._local_tmp_dir:
            path = os.path.join(self._opts['local_tmp_dir'], self._job_key)
  'Creating temp directory %s' % path)
            if os.path.isdir(path):
            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job.

        We use this to choose the output directory, etc. for the job.
        # use the name of the script if one wasn't explicitly
        # specified
        if not label:
            if self._script_path:
                label = os.path.basename(self._script_path).split('.')[0]
                label = 'no_script'

        if not owner:
            owner = 'no_user'

        now = datetime.datetime.utcnow()
        return '%s.%s.%s.%06d' % (
            label, owner,
            now.strftime('%Y%m%d.%H%M%S'), now.microsecond)

    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        if self._steps is None:
            self._steps = self._load_steps()

        return self._steps

    def _load_steps(self):
        """Ask job how many steps it has, and whether
        there are mappers and reducers for each step.

        Returns output as described in :ref:`steps-format`.
        raise NotImplementedError

    def _get_step(self, step_num):
        """Get a single step (calls :py:meth:`_get_steps`)."""
        return self._get_steps()[step_num]

    def _num_steps(self):
        """Get the number of steps (calls :py:meth:`get_steps`)."""
        return len(self._get_steps())

    def _has_streaming_steps(self):
        """Are any of our steps Hadoop streaming steps?"""
        return any(step['type'] == 'streaming'
                   for step in self._get_steps())

    def _has_spark_steps(self):
        """Return True if at least one step's type is a Spark step type
        (either spark or spark_script), False otherwise."""
        for step in self._get_steps():
            if _is_spark_step_type(step['type']):
                return True
        return False

    def _args_for_task(self, step_num, mrc):
        return [
            '--step-num=%d' % step_num,
            '--%s' % mrc,
        ] + self._mr_job_extra_args()

    def _mr_job_extra_args(self, local=False):
        """Return arguments to add to every invocation of MRJob.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        result = []

        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if local:

        return result

    def _dir_archive_path(self, dir_path):
        """Assign a path for the archive of *dir_path* but don't
        actually create anything.

        The path is ``<local tmp dir>/archives/<name>.tar.gz`` and is
        remembered in ``self._dir_to_archive_path``, so repeated calls
        for the same *dir_path* return the same path.

        :raises OSError: if *dir_path* is not a URI and is not a
                         directory on the local filesystem
        """
        if dir_path not in self._dir_to_archive_path:
            # we can check local paths now
            if not (is_uri(dir_path) or os.path.isdir(dir_path)):
                raise OSError('%s is not a directory!' % dir_path)

            name = name_uniquely(
                dir_path, names_taken=self._dir_archive_names_taken)
            # record the name so a later directory can't be assigned the
            # same archive filename (set.add() is idempotent, so this is
            # harmless even if name_uniquely() were to record it itself)
            self._dir_archive_names_taken.add(name)

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]

    def _create_dir_archives(self):
        """Call this to create all dir archives"""
        for dir_path in sorted(set(self._dir_to_archive_path)):

    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`archive_dir`"""
        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist')

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):

        # for remote files
        tmp_download_path = os.path.join(
            self._get_local_tmp_dir(), 'tmp-download')'Archiving %s -> %s' % (dir_path, tar_gz_path))

        with, mode='w:gz') as tar_gz:
            for path in
                # only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError(
                        'attempted to archive %s into itself!' % tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

          '  downloading %s -> %s' % (
                        path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in
                    local_path = tmp_download_path
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)


    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return self._opts['interpreter'] is None
            return bool(self._opts['bootstrap_mrjob'])

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be.

        If ``'-'`` is one of the input paths, ``self._stdin`` is copied
        (once) to a file named ``STDIN`` in the local tmp dir, and every
        ``'-'`` in the returned list is replaced with that file's path.
        """
        if '-' in self._input_paths:
            if self._stdin_path is None:
                # prompt user, so they don't think the process has stalled
                log.info('reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test data)
                        if not line.endswith(b'\n'):
                            line += b'\n'
                        stdin_file.write(line)

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p for p in self._input_paths]

    def _check_input_paths(self):
        """Check that input exists prior to running the job, if the
        `check_input_paths` option is true."""
        if not self._opts['check_input_paths']:

        for path in self._input_paths:
            if path == '-':
                    continue  # STDIN always exists

            if not self.fs.can_handle_path(path):
                continue  # e.g. non-S3 URIs on EMR

            if not self.fs.exists(path):
                raise IOError(
                    'Input path %s does not exist!' % (path,))

    def _intermediate_output_uri(self, step_num, local=False):
        """A URI for intermediate output for the given step number."""
        join = os.path.join if local else posixpath.join

        return join(
            self._step_output_dir or self._default_step_output_dir(),
            '%04d' % step_num)

    def _default_step_output_dir(self):
        """Where to put output for steps other than the last one,
        if not specified by the *output_dir* constructor keyword.
        Usually you want this to be on HDFS (most efficient).

        Define this in your runner subclass.
        raise NotImplementedError

    def _step_input_uris(self, step_num):
        """A list of URIs to use as input for the given step. For all
        except the first step, this list will have a single item (a
        if step_num == 0:
            return [self._upload_mgr.uri(path)
                    for path in self._get_input_paths()]
            return [self._intermediate_output_uri(step_num - 1)]

    def _step_output_uri(self, step_num):
        """URI to use as output for the given step. This is either an
        intermediate dir (see :py:meth:`intermediate_output_uri`) or
        ``self._output_dir`` for the final step."""
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
            return self._intermediate_output_uri(step_num)

    def _interpolate_input_and_output(self, args, step_num):
        """Replace :py:data:`~mrjob.step.INPUT` and
        :py:data:`~mrjob.step.OUTPUT` in *args* (arguments to a jar or
        Spark step) with the input and output URIs for step *step_num*.

        If there are multiple input paths (i.e. on the first step), they'll
        be joined with a comma.

        Returns a new list; all other arguments are passed through as-is.
        """

        def interpolate(arg):
            if arg == mrjob.step.INPUT:
                return ','.join(self._step_input_uris(step_num))
            elif arg == mrjob.step.OUTPUT:
                return self._step_output_uri(step_num)
            else:
                return arg

        return [interpolate(arg) for arg in args]

    def _create_mrjob_zip(self):
        """Make a zip of the mrjob library, without .pyc or .pyo files.
        This will also set ``self._mrjob_zip_path`` and return it.

        It's safe to call this method multiple times (we'll only create
        the zip file once.)

        :raises Exception: if the ``mrjob`` module wasn't loaded from an
                           ``__init__.*`` file, so we can't locate the
                           package directory
        """
        if not self._mrjob_zip_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                # interpolate the path into the message (passing it as a
                # second argument to Exception would leave "%s" unformatted)
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip')

            def filter_path(path):
                filename = os.path.basename(path)
                return not(filename.lower().endswith('.pyc') or
                           filename.lower().endswith('.pyo') or
                           # filter out emacs backup files
                           filename.endswith('~') or
                           # filter out emacs lock files
                           filename.startswith('.#') or
                           # filter out MacFuse resource forks
                           filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' % (
                mrjob_dir, zip_path, os.path.join('mrjob', '')))
            zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob')

            self._mrjob_zip_path = zip_path

        return self._mrjob_zip_path

    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        Also translate jobconfs to the current Hadoop version, if necessary.

        :param step_num: number of the step to build the jobconf for
        :return: the combined jobconf dictionary
        """
        step = self._get_step(step_num)

        # _sort_values_jobconf() isn't relevant to Spark,
        # but it doesn't do any harm either

        # NOTE(review): the tail of this call was lost in SOURCE; the
        # runner-wide 'jobconf' opt and the step's own 'jobconf' are
        # reconstructed from the docstring -- confirm against upstream.
        jobconf = combine_dicts(self._sort_values_jobconf(),
                                self._opts['jobconf'],
                                step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        # and log a warning
        hadoop_version = self.get_hadoop_version()
        if hadoop_version:
            jobconf = translate_jobconf_dict(jobconf, hadoop_version)

        return jobconf

    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf

    def _sort_values_partitioner(self):
        """Partitioner to use with *sort_values* keyword to the constructor."""
        if self._sort_values:
            return _SORT_VALUES_PARTITIONER
            return None

    def _parse_setup_and_py_files(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*.
        setup = []

        # py_files
        for path in self._opts['py_files']:
            # Spark (at least v1.3.1) doesn't work with # and --py-files,
            # see #1375
            if '#' in path:
                raise ValueError("py_files cannot contain '#'")
            path_dict = parse_legacy_hash_path('file', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:

        return setup

    def _upload_args(self):
        # just upload every file and archive in the working dir manager
        return self._upload_args_helper('-files', None, '-archives', None)

    def _upload_args_helper(
            self, files_opt_str, files, archives_opt_str, archives):
        args = []

        file_hash_paths = list(self._arg_hash_paths('file', files))
        if file_hash_paths:

        archive_hash_paths = list(self._arg_hash_paths('archive', archives))
        if archive_hash_paths:

        return args

    def _arg_hash_paths(self, type, named_paths=None):
        """Helper function for the *upload_args methods."""
        if named_paths is None:
            # just return everything managed by _working_dir_mgr
            named_paths = sorted(

        for name, path in named_paths:
            if not name:
                name =, path)
            uri = self._upload_mgr.uri(path)
            yield '%s#%s' % (uri, name)
Esempio n. 25
 def test_okay_to_give_same_path_same_name(self):
     # Adding the same (type, path, name) twice must not raise, and must
     # leave a single entry in the manager.
     # NOTE(review): the name literals are empty strings in SOURCE and
     # look stripped -- confirm the intended values against upstream.
     wd = WorkingDirManager()
     wd.add('file', 'foo/', name='')
     wd.add('file', 'foo/', name='')
     self.assertEqual(wd.name_to_path('file'),
                      {'': 'foo/'})
Esempio n. 26
 def test_explicit_name_collision(self):
     # Two *different* paths may not be given the same explicit name.
     # NOTE(review): SOURCE had empty-string literals for both paths and
     # names, which made this identical to the "same path, same name"
     # case that is expected to succeed; the values below are
     # placeholders chosen to exercise a real collision.
     wd = WorkingDirManager()
     wd.add('file', 'foo.py', name='qux.py')
     self.assertRaises(ValueError, wd.add, 'file', 'bar.py', name='qux.py')
Esempio n. 27
 def test_cant_give_same_path_different_types(self):
     """Registering a path as a file after it was registered as an
     archive (under the same name) raises ValueError."""
     manager = WorkingDirManager()
     manager.add("archive", "foo/", name="")

     with self.assertRaises(ValueError):
         manager.add("file", "foo/", name="")
Esempio n. 28
    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 sort_values=None, stdin=None, step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments. Items may also be path
                           dicts of type ``'file'``, which get uploaded.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job. Deprecated; pass dicts to
                                 *extra_args* instead.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin.
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.

        Any other keyword arguments (*opts*) are treated as options set
        on the command line.
        """
        # NOTE(review): several lines of this method were missing from
        # SOURCE (closing parens, working-dir-manager registrations,
        # ``else:``); they are reconstructed from the surrounding code and
        # should be confirmed against upstream.
        from pprint import pformat

        self._ran_job = False

        # opts are made from:
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)]
        )

        log.debug('Active configuration:')
        log.debug(pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                # only file path dicts are supported as extra args
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_dirs')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add(
                'archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            # on Python 3, read bytes rather than text from stdin
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
Esempio n. 29
 def test_lazy_naming(self):
     # Adds one file without a name and one with an explicit name, then
     # checks the resulting name -> path mapping.
     # NOTE(review): every path/name literal below is an empty string, so
     # the expected dict repeats the key "" and collapses to {"": ""}.
     # The original literals appear to have been lost -- restore them from
     # upstream before relying on this test.
     wd = WorkingDirManager()
     wd.add("file", "")  # by default
     wd.add("file", "", name="")
     self.assertEqual(wd.name_to_path("file"), {"": "", "": ""})
Esempio n. 30
class MRJobRunner(object):
    """Abstract base class for all runners"""

    #: alias for this runner; used for picking the section of
    #: :py:mod:`mrjob.conf` to load. One of ``'local'``, ``'emr'``,
    #: or ``'hadoop'``
    alias = None

    # if this is true, when bootstrap_mrjob is true, add it through the
    # setup script

    OPTION_STORE_CLASS = RunnerOptionStore

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin.
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.

        Any other keyword arguments (*opts*) are passed, along with
        *conf_paths*, to ``OPTION_STORE_CLASS`` to build the runner's
        options.
        """
        self._ran_job = False

        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # we'll create the wrapper script later
        self._setup_wrapper_script_path = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._file_upload_args.append((arg, arg_file))

        # set up uploading
        for path in self._opts['upload_files']:
            self._working_dir_mgr.add(**parse_legacy_hash_path(
                'file', path, must_name='upload_files'))
        for path in self._opts['upload_archives']:
            self._working_dir_mgr.add(**parse_legacy_hash_path(
                'archive', path, must_name='upload_archives'))

        # python_archives, setup, setup_cmds, and setup_scripts
        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmds() for details
        self._setup = self._parse_setup()
        for cmd in self._setup:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):
                    # path dicts name files/archives the command needs
                    self._working_dir_mgr.add(**maybe_path_dict)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            # on Python 3, read bytes rather than text from stdin
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # if this is True, we have to pipe input into the sort command
        # rather than feed it multiple files
        self._sort_is_windows_sort = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem. Methods on :py:class:`~mrjob.fs.base.Filesystem` objects
        will be forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob
        0.6.0, but **this behavior is deprecated.**

        The filesystem object is created on first access and cached in
        ``self._fs``.
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem(LocalFilesystem())
        return self._fs

    def __getattr__(self, name):
        """Fall back to the filesystem object for attributes the runner
        doesn't define itself, logging a deprecation warning.

        :raises AttributeError: if ``self.fs`` has no such attribute either
        """
        # For backward compatibility, forward filesystem methods
        try:
            value = getattr(self.fs, name)
        except AttributeError:
            # report the failure against the name the caller asked for
            raise AttributeError(name)

        # friendly deprecation warning
        is_func = ismethod(value) or isfunction(value)
        class_name = self.__class__.__name__
        log.warning(
            'deprecated: %s %s.fs.%s%s directly'
            ' (%s.%s is going away in v0.6.0)' % (
                'call' if is_func else 'access',
                class_name,
                name,
                '()' if is_func else '',
                class_name,
                name))

        return value

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise an exception if there are any problems.

        :raises AssertionError: if the runner has no job script, or the job
                                has already been run
        """
        if not self._script_path:
            raise AssertionError("No script to run!")

        if self._ran_job:
            raise AssertionError("Job already ran!")

        self._run()
        # only mark the job as run once _run() returns without raising
        self._ran_job = True

    def stream_output(self):
        """Stream raw lines from the job's output. You can parse these
        using the read() method of the appropriate HadoopStreamingProtocol
        class.

        Files whose path below the output dir has any component starting
        with ``_`` are skipped.

        :raises AssertionError: if the job hasn't been run yet
        """
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise AssertionError('Run the job before streaming output')

        if self._closed is True:
            log.warning(
                'WARNING! Trying to stream output from a closed runner, output'
                ' will probably be empty.')

        log.info('Streaming final output from %s' % output_dir)

        def split_path(path):
            # yield the components of *path*, last one first
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:
                    break

                yield name

                path = base

        for filename in self.fs.ls(output_dir):
            subpath = filename[len(output_dir):]
            if not any(name.startswith('_') for name in split_path(subpath)):
                for line in self.fs._cat_file(filename):
                    yield line

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
            return mode or self._opts['cleanup']

    def _cleanup_local_tmp(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple

        This particular function removes any local tmp directories
        added to the list self._local_tmp_dirs

        This won't remove output_dir if it's outside of our tmp dir.
        if self._local_tmp_dir:
  'removing tmp directory %s' % self._local_tmp_dir)
            except OSError as e:

        self._local_tmp_dir = None

    def _cleanup_remote_tmp(self):
        """Cleanup any files/directories on the remote machine (S3) we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        pass  # this only happens on EMR

    def _cleanup_logs(self):
        """Cleanup any log files that are created as a side-effect of the job.
        pass  # this only happens on EMR

    def _cleanup_job(self):
        """Stop any jobs that we created that are still running."""
        pass  # this only happens on EMR

    def _cleanup_job_flow(self):
        """Terminate the job flow if there is one."""
        pass  # this only happens on EMR

    def cleanup(self, mode=None):
        """Clean up running jobs, temp files, and logs, subject to the
        *cleanup* option passed to the constructor.

        If you create your runner in a :keyword:`with` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...

            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should be
                     a list of strings from :py:data:`CLEANUP_CHOICES`
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        # only stop jobs/job flows if the job didn't finish running
        if self._script_path and not self._ran_job:
            if mode_has('JOB_FLOW', 'ALL'):
                self._cleanup_job_flow()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'TMP', 'REMOTE_TMP'):
            self._cleanup_remote_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job, ignoring
        earlier steps in the same job flow.

        :raises NotImplementedError: always, in this base implementation
        """
        raise NotImplementedError

    def _print_counters(self, step_nums=None):
        """Log this run's counters in a user-friendly way.

        :type step_nums: list of int
        :param step_nums: Optional list of indexes of steps in
                          ``self.counters()`` to filter on.

        Prints step nums 1-indexed (e.g. "step 1"), but *step_nums* is
        0-indexed (e.g. [0]).
        """
        for step_num, step_counters in enumerate(self.counters()):
            if step_nums is None or step_num in step_nums:
                log.info('Counters from step %d:' % (step_num + 1))
                if step_counters:
                    for group, group_counters in sorted(step_counters.items()):
                        log.info('\t%s' % group)
                        for counter, amount in sorted(group_counters.items()):
                            log.info('\t\t%s=%d' % (counter, amount))
                else:
                    log.info('  (none found)')

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""

    ### more runner information ###

    def get_opts(self):
        """Return this runner's options, as a dict.

        The result is a deep copy of ``self._opts``.
        """
        opts_snapshot = copy.deepcopy(self._opts)
        return opts_snapshot

    def get_job_key(self):
        """Get the unique key for the job run by this runner.
        This has the format ``label.owner.date.time.microseconds``
        """
        return self._job_key

    def get_job_name(self):
        """Alias for :py:meth:`get_job_key`. Will be removed in v0.6.0.

        .. deprecated:: 0.5.0
        """
        # Logger.warn() is a deprecated alias; use warning() like the rest
        # of this file
        log.warning('get_job_name() has been renamed to get_job_key().'
                    ' get_job_name() will be removed in v0.6.0')
        return self.get_job_key()

    def get_output_dir(self):
        """Return the directory holding the job's output, or None if
        there is a job script that hasn't been run yet."""
        job_pending = self._script_path and not self._ran_job
        return None if job_pending else self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the job flow.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates, with a default of 0.20.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
        all.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()

        The directory is named after the job key, inside the
        *local_tmp_dir* option. It is created on the first call; later
        calls return the same path.
        """
        if not self._local_tmp_dir:
            # imported locally so this method is self-contained
            import shutil

            path = os.path.join(self._opts['local_tmp_dir'], self._job_key)
            log.info('creating tmp directory %s' % path)
            if os.path.isdir(path):
                # start from an empty directory
                shutil.rmtree(path)
            os.makedirs(path)
            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job.

        We use this to choose the output directory, etc. for the job.
        # use the name of the script if one wasn't explicitly
        # specified
        if not label:
            if self._script_path:
                label = os.path.basename(self._script_path).split('.')[0]
                label = 'no_script'

        if not owner:
            owner = 'no_user'

        now = datetime.datetime.utcnow()
        return '%s.%s.%s.%06d' % (
            label, owner,
            now.strftime('%Y%m%d.%H%M%S'), now.microsecond)

    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        if self._steps is None:
            if not self._script_path:
                self._steps = []
                args = (self._executable(True) + ['--steps'] +
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                         {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception(
                        'error getting step information: \n%s' % stderr)

                # on Python 3, convert stdout to str so we can json.loads() it
                if not isinstance(stdout, str):
                    stdout = stdout.decode('utf_8')

                    steps = json.loads(stdout)
                except ValueError:
                    raise ValueError("Bad --steps response: \n%s" % stdout)

                # verify that this is a proper step description
                if not steps or not stdout:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step['type'] not in STEP_TYPES:
                        raise ValueError(
                            'unexpected step type %r in steps %r' % (
                                step['type'], stdout))

                self._steps = steps

        return self._steps

    def _get_step(self, step_num):
        """Get a single step (calls :py:meth:`_get_steps`)."""
        return self._get_steps()[step_num]

    def _num_steps(self):
        """Get the number of steps (calls :py:meth:`get_steps`)."""
        return len(self._get_steps())

    def _interpreter(self, steps=False):
        if steps:
            return (self._opts['steps_interpreter'] or
                    self._opts['interpreter'] or
            return (self._opts['interpreter'] or

    def _executable(self, steps=False):
        if steps:
            return self._interpreter(steps=True) + [self._script_path]
            return self._interpreter() + [
      'file', self._script_path)]

    def _python_bin(self, steps=False):
        if steps:
            return (self._opts['steps_python_bin'] or
            return (self._opts['python_bin'] or

    def _default_python_bin(self, local=False):
        """The default python command. If local is true, try to use
        sys.executable. Otherwise use 'python' or 'python3' as appropriate.

        This returns a single-item list (because it's a command).
        if local and sys.executable:
            return [sys.executable]
        elif PY2:
            return ['python']
            # e.g. python3
            return ['python%d' % sys.version_info[0]]

    def _script_args_for_step(self, step_num, mrc):
        assert self._script_path

        args = self._executable() + [
            '--step-num=%d' % step_num,
            '--%s' % mrc,
        ] + self._mr_job_extra_args()

        if self._setup_wrapper_script_path:
            return (self._opts['sh_bin'] +
                        'file', self._setup_wrapper_script_path)] +
            return args

    def _substep_cmd_line(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            # never wrap custom hadoop streaming commands in bash
            return step[mrc]['command'], False

        elif step[mrc]['type'] == 'script':
            cmd = cmd_line(self._script_args_for_step(step_num, mrc))

            # filter input and pipe for great speed, if user asks
            # but we have to wrap the command in bash
            if 'pre_filter' in step[mrc]:
                return '%s | %s' % (step[mrc]['pre_filter'], cmd), True
                return cmd, False
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))

    def _render_substep(self, step_num, mrc):
        step = self._get_step(step_num)

        if mrc in step:
            return self._substep_cmd_line(step_num, mrc)
            if mrc == 'mapper':
                return 'cat', False
                return None, False

    def _hadoop_streaming_commands(self, step_num):
        version = self.get_hadoop_version()

        # Hadoop streaming stuff
        mapper, bash_wrap_mapper = self._render_substep(
            step_num, 'mapper')

        combiner, bash_wrap_combiner = self._render_substep(
            step_num, 'combiner')

        reducer, bash_wrap_reducer = self._render_substep(
            step_num, 'reducer')

        if (combiner is not None and
            not supports_combiners_in_hadoop_streaming(version)):

            # krazy hack to support combiners on hadoop <0.20
            bash_wrap_mapper = True
            mapper = "%s | sort | %s" % (mapper, combiner)

            # take the combiner away, hadoop will just be confused
            combiner = None
            bash_wrap_combiner = False

        if bash_wrap_mapper:
            mapper = bash_wrap(mapper)

        if bash_wrap_combiner:
            combiner = bash_wrap(combiner)

        if bash_wrap_reducer:
            reducer = bash_wrap(reducer)

        return mapper, combiner, reducer

    def _mr_job_extra_args(self, local=False):
        """Return arguments to add to every invocation of MRJob.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        return (self._get_file_upload_args(local=local) +
                self._get_strict_protocols_args() +

    def _get_file_upload_args(self, local=False):
        """Arguments used to pass through config files, etc from the job
        runner through to the local directory where the script is run.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        args = []
        for arg, path_dict in self._file_upload_args:
            if local:
        return args

    def _get_strict_protocols_args(self):
        """Arguments used to control protocol behavior in the job.

        This just adds --no-strict-protocols when strict_protocols
        is false.
        # These are only in the runner so that we can default them from
        # mrjob.conf, which will allow us to eventually remove them.
        # See issue #726.
        if not self._opts['strict_protocols']:
            return ['--no-strict-protocols']
            return []

    def _create_setup_wrapper_script(
            self, dest='', local=False):
        """Create the wrapper script, and write it into our local temp
        directory (by default, to a file named

        This will set ``self._setup_wrapper_script_path``, and add it to

        This will do nothing if ``self._setup`` is empty or
        this method has already been called.

        If *local* is true, use local line endings (e.g. Windows). Otherwise,
        use UNIX line endings (see #1071).
        if self._setup_wrapper_script_path:

        setup = self._setup

        if self._bootstrap_mrjob() and self.BOOTSTRAP_MRJOB_IN_SETUP:
            # patch setup to add mrjob.tar.gz to PYTYHONPATH
            mrjob_tar_gz = self._create_mrjob_tar_gz()
            path_dict = {'type': 'archive', 'name': None, 'path': mrjob_tar_gz}
            setup = [['export PYTHONPATH=', path_dict, ':$PYTHONPATH']] + setup

        if not setup:

        path = os.path.join(self._get_local_tmp_dir(), dest)'writing wrapper script to %s' % path)

        contents = self._setup_wrapper_script_content(setup)
        for line in contents:
            log.debug('WRAPPER: ' + line.rstrip('\n'))

        if local:
            with open(path, 'w') as f:
                for line in contents:
            with open(path, 'wb') as f:
                for line in contents:

        self._setup_wrapper_script_path = path
        self._working_dir_mgr.add('file', self._setup_wrapper_script_path)

    def _parse_setup(self):
        """Parse the *setup* option with

        If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
        true, create mrjob.tar.gz (if it doesn't exist already) and
        prepend a setup command that adds it to PYTHONPATH.

        Also patch in the deprecated
        options *python_archives*, *setup_cmd*, and *setup_script*
        as setup commands.
        setup = []

        # python_archives
        for path in self._opts['python_archives']:
            path_dict = parse_legacy_hash_path('archive', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:

        # setup_cmds
        if self._opts['setup_cmds']:
                "setup_cmds is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead.")

        for cmd in self._opts['setup_cmds']:
            if not isinstance(cmd, string_types):
                cmd = cmd_line(cmd)

        # setup_scripts
        if self._opts['setup_scripts']:
                "setup_scripts is deprecated since v0.4.2 and will be removed"
                " in v0.6.0. Consider using setup instead.")

        for path in self._opts['setup_scripts']:
            path_dict = parse_legacy_hash_path('file', path)

        return setup

    def _setup_wrapper_script_content(self, setup, mrjob_tar_gz_name=None):
        """Return a (Bourne) shell script that runs the setup commands and then
        executes whatever is passed to it (this will be our mapper/reducer),
        as a list of strings (one for each line, including newlines).

        We obtain a file lock so that two copies of the setup commands
        cannot run simultaneously on the same machine (this helps for running
        :command:`make` on a shared source code archive, for example).

        :param setup: list of commands, each a list of raw strings and
                      path dicts
        :param mrjob_tar_gz_name: unused by this method
        """
        out = []

        def writeln(line=''):
            out.append(line + '\n')

        # we're always going to execute this script as an argument to
        # sh, so there's no need to add a shebang (e.g. #!/bin/sh)

        writeln('# store $PWD')
        writeln('__mrjob_PWD=$PWD')
        writeln()

        writeln('# obtain exclusive file lock')
        # Basically, we're going to tie file descriptor 9 to our lockfile,
        # use a subprocess to obtain a lock (which we somehow inherit too),
        # and then release the lock by closing the file descriptor.
        # File descriptors 10 and higher are used internally by the shell,
        # so 9 is as out-of-the-way as we can get.
        writeln('exec 9>/tmp/wrapper.lock.%s' % self._job_key)
        # would use flock(1), but it's not always available
        writeln("%s -c 'import fcntl; fcntl.flock(9, fcntl.LOCK_EX)'" %
                cmd_line(self._python_bin()))
        writeln()

        writeln('# setup commands')
        # group setup commands so we can redirect their input/output (see
        # below). Don't use parens; this would invoke a subshell, which would
        # keep us from exporting environment variables to the task.
        writeln('{')
        for cmd in setup:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '  # indent, since these commands are in a group
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._working_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            writeln(line)
        # redirect setup commands' input/output so they don't interfere
        # with the task (see Issue #803).
        writeln('} 0</dev/null 1>&2')
        writeln()

        writeln('# release exclusive file lock')
        writeln('exec 9>&-')
        writeln()

        writeln('# run task from the original working directory')
        writeln('cd $__mrjob_PWD')
        # hand control to the command line we were given
        writeln('"$@"')

        return out

    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return self._opts['interpreter'] is None
            return bool(self._opts['bootstrap_mrjob'])

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be."""
        if '-' in self._input_paths:
            if self._stdin_path is None:
                # prompt user, so they don't think the process has stalled
      'reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test data)
                        if not line.endswith(b'\n'):
                            line += b'\n'

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p for p in self._input_paths]

    def _create_mrjob_tar_gz(self):
        """Make a tarball of the mrjob library, without .pyc or .pyo files,
        This will also set ``self._mrjob_tar_gz_path`` and return it.

        Typically called from

        It's safe to call this method multiple times (we'll only create
        the tarball once.)
        if not self._mrjob_tar_gz_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob",

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            tar_gz_path = os.path.join(self._get_local_tmp_dir(),

            def filter_path(path):
                filename = os.path.basename(path)
                return not(file_ext(filename).lower() in ('.pyc', '.pyo') or
                           # filter out emacs backup files
                           filename.endswith('~') or
                           # filter out emacs lock files
                           filename.startswith('.#') or
                           # filter out MacFuse resource forks

            log.debug('archiving %s -> %s as %s' % (
                mrjob_dir, tar_gz_path, os.path.join('mrjob', '')))
                mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob')

            self._mrjob_tar_gz_path = tar_gz_path

        return self._mrjob_tar_gz_path

    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        Also translate jobconfs to the current Hadoop version, if necessary.

        :param step_num: index of the step whose ``jobconf`` (if any) is
                         combined with the *jobconf* option
        """
        step = self._get_step(step_num)
        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        self._update_jobconf_for_hadoop_version(
            jobconf, self.get_hadoop_version())

        return jobconf

    def _update_jobconf_for_hadoop_version(self, jobconf, hadoop_version):
        """If *jobconf* (a dict) contains jobconf variables from the wrong
        version of Hadoop, add variables for the right one.

        If *hadoop_version* is empty, do nothing.
        if not hadoop_version:  # this happens for sim runner

        translations = {}  # for warning, below

        for key, value in sorted(jobconf.items()):
            new_key = translate_jobconf(key, hadoop_version)
            if new_key not in jobconf:
                jobconf[new_key] = value
                translations[key] = new_key

        if translations:
                "Detected hadoop configuration property names that"
                " do not match hadoop version %s:"
                "\nThey have been translated as follows\n %s",
                    "%s: %s" % (key, new_key) for key, new_key
                    in sorted(translations.items())]))

    def _hadoop_args_for_step(self, step_num):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        assert 0 <= step_num < self._num_steps()

        args = []

        # hadoop_extra_args

        # new-style jobconf
        version = self.get_hadoop_version()

        # translate the jobconf configuration names to match
        # the hadoop version
        jobconf = self._jobconf_for_step(step_num)

        if uses_generic_jobconf(version):
            for key, value in sorted(jobconf.items()):
                if value is not None:
                    args.extend(['-D', '%s=%s' % (key, value)])
        # old-style jobconf
            for key, value in sorted(jobconf.items()):
                if value is not None:
                    args.extend(['-jobconf', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._opts['cmdenv'].items()):
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == self._num_steps() - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args

    def _arg_hash_paths(self, type, upload_mgr):
        """Helper function for the *upload_args methods."""
        for name, path in self._working_dir_mgr.name_to_path(type).items():
            uri = self._upload_mgr.uri(path)
            yield '%s#%s' % (uri, name)

    def _upload_args(self, upload_mgr):
        args = []

        # TODO: does Hadoop have a way of coping with paths that have
        # commas in their names?

        file_hash_paths = list(self._arg_hash_paths('file', upload_mgr))
        if file_hash_paths:

        archive_hash_paths = list(self._arg_hash_paths('archive', upload_mgr))
        if archive_hash_paths:

        return args

    def _pre_0_20_upload_args(self, upload_mgr):
        """-files/-archive args for Hadoop prior to 0.20.203"""
        args = []

        for file_hash in self._arg_hash_paths('file', upload_mgr):

        for archive_hash in self._arg_hash_paths('archive', upload_mgr):

        return args

    def _invoke_sort(self, input_paths, output_path):
        """Use the local sort command to sort one or more input files. Raise
        an exception if there is a problem.

        This is is just a wrapper to handle limitations of Windows sort
        (see Issue #288).

        :type input_paths: list of str
        :param input_paths: paths of one or more input files
        :type output_path: str
        :param output_path: where to pipe sorted output into
        if not input_paths:
            raise ValueError('Must specify at least one input path.')

        # ignore locale when sorting
        env = os.environ.copy()
        env['LC_ALL'] = 'C'

        # Make sure that the tmp dir environment variables are changed if
        # the default is changed.
        env['TMP'] = self._opts['local_tmp_dir']
        env['TMPDIR'] = self._opts['local_tmp_dir']
        env['TEMP'] = self._opts['local_tmp_dir']'writing to %s' % output_path)

        err_path = os.path.join(self._get_local_tmp_dir(), 'sort-stderr')

        # assume we're using UNIX sort unless we know otherwise
        if (not self._sort_is_windows_sort) or len(input_paths) == 1:
            with open(output_path, 'wb') as output:
                with open(err_path, 'wb') as err:
                    args = ['sort'] + list(input_paths)
          '> %s' % cmd_line(args))
                        check_call(args, stdout=output, stderr=err, env=env)
                    except CalledProcessError:

        # Looks like we're using Windows sort
        self._sort_is_windows_sort = True'Piping files into sort for Windows compatibility')
        with open(output_path, 'wb') as output:
            with open(err_path, 'wb') as err:
                args = ['sort']
      '> %s' % cmd_line(args))
                proc = Popen(args, stdin=PIPE, stdout=output, stderr=err,

                # shovel bytes into the sort process
                for input_path in input_paths:
                    with open(input_path, 'rb') as input:
                        while True:
                            buf =
                            if not buf:


                if proc.returncode == 0:

        # looks like there was a problem. log it and raise an error
        with open(err_path) as err:
            for line in err:
                log.error('STDERR: %s' % line.rstrip('\r\n'))
        raise CalledProcessError(proc.returncode, args)
Esempio n. 31
 def test_cant_auto_name_unless_added_as_auto(self):
     """A path added under an explicit name can be looked up by that
     name, but asking for its auto-assigned name is an error."""
     wd = WorkingDirManager()
     wd.add("file", "", name="")
     self.assertEqual(wd.name("file", "", ""), "")
     self.assertRaises(ValueError, wd.name, "file", "")
Esempio n. 32
    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 sort_values=None, stdin=None, step_output_dir=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments. An item may also be a path
                           dict of type ``'file'``; that file is added to
                           the job's working directory.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job. Deprecated; pass dicts to
                                 *extra_args* instead.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar.
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     with the ``-outputformat`` option. Note
                                     that if you write your own class, you'll
                                     need to include it in your own custom
                                     streaming jar.
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin.
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        """
        # function-scope import: the module's import block is not visible
        # from this method
        import pprint

        self._ran_job = False

        # opts are made from:
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None
                     for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)])

        log.debug('Active configuration:')
        log.debug(pprint.pformat({
            opt_key: self._obfuscate_opt(opt_key, opt_value)
            for opt_key, opt_value in self._opts.items()
        }))

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(label=self._opts['label'],
                                                  owner=self._opts['owner'])

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                # only file path dicts are supported
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                self._working_dir_mgr.add(**extra_arg)
                self._spark_files.append(
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._working_dir_mgr.add(**arg_file)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        # NOTE(review): the must_name keyword below is presumed; confirm
        # against parse_legacy_hash_path()'s signature
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file', hash_path,
                                        must_name='upload_files')
            self._working_dir_mgr.add(**uf)
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive', hash_path,
                                        must_name='upload_archives')
            self._working_dir_mgr.add(**ua)
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir', hash_path,
                                        must_name='upload_dirs')
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add('archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    self._working_dir_mgr.add(**token)

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
        else:
            # on Python 3, read bytes rather than str
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
Esempio n. 33
 def test_cant_give_same_path_different_types(self):
     """Adding a path as a file fails once the same path and name have
     been added as an archive."""
     wd = WorkingDirManager()
     wd.add('archive', 'foo/', name='')
     self.assertRaises(ValueError,
                       wd.add, 'file', 'foo/', name='')
Esempio n. 34
 def test_empty(self):
     """A freshly created manager tracks no names and no paths."""
     manager = WorkingDirManager()
     for path_type in ('archive', 'file'):
         self.assertEqual(manager.name_to_path(path_type), {})
     self.assertEqual(manager.paths(), set())
class MRJobRunner(object):
    """Abstract base class for all runners"""

    #: alias for this runner; used for picking section of
    #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``,
    #: or ``'hadoop'``
    alias = None

    # if this is true, when bootstrap_mrjob is true, add it through the
    # setup script

    OPTION_STORE_CLASS = RunnerOptionStore

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_path=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, conf_paths=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_path: str, None, or False
        :param conf_path: Deprecated. Alternate path to read configs from, or
                          ``False`` to ignore all config files. Use
                          *conf_paths* instead.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitoner class, e.g.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``StringIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        self._ran_job = False

        if conf_path is not None:
            if conf_paths is not None:
                raise ValueError("Can't specify both conf_path and conf_paths")
                log.warn("The conf_path argument to MRJobRunner() is"
                         " deprecated. Use conf_paths instead.")
                if conf_path is False:
                    conf_paths = []
                    conf_paths = [conf_path]
        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_name = self._make_unique_job_name(
            label=self._opts['label'], owner=self._opts['owner'])

        # export the unique name to a environment variable
        if self._opts['export_job_name']:
            self._opts['cmdenv'].update({'MRJOB_JOB_NAME': self._job_name})

        # we'll create the wrapper script later
        self._setup_wrapper_script_path = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._file_upload_args.append((arg, arg_file))

        # set up uploading
        for path in self._opts['upload_files']:
                'file', path, must_name='upload_files'))
        for path in self._opts['upload_archives']:
                'archive', path, must_name='upload_archives'))

        # python_archives, setup, setup_cmds, and setup_scripts
        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmds() for details
        self._setup = self._parse_setup()
        for cmd in self._setup:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        self._stdin = stdin or sys.stdin
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # if this is True, we have to pipe input into the sort command
        # rather than feed it multiple files
        self._sort_is_windows_sort = None

    ### Filesystem object ###

    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem. Methods on :py:class:`~mrjob.fs.base.Filesystem` objects
        will be forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob
        0.5, but **this behavior is deprecated.**

        The filesystem object is created on first use and cached in
        ``self._fs``; later calls return the same object.
        """
        # NOTE(review): __getattr__() in this class reads ``self.fs`` as a
        # plain attribute, so a ``@property`` decorator may have been lost
        # above this definition -- confirm before relying on either form.
        if self._fs is None:
            self._fs = LocalFilesystem()
        return self._fs

    def __getattr__(self, name):
        # For backward compatibility, forward filesystem methods
            return getattr(self.fs, name)
        except AttributeError:
            raise AttributeError(name)

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise an exception if there are any problems.

        :raises AssertionError: if no job script was given, or if this
                                runner has already run a job
        """
        if not self._script_path:
            raise AssertionError("No script to run!")

        if self._ran_job:
            raise AssertionError("Job already ran!")

        # NOTE(review): nothing between the guards and the flag below
        # actually launches the job; presumably a call into the
        # runner-specific implementation belongs here -- confirm.
        self._ran_job = True

    def stream_output(self):
        """Stream raw lines from the job's output. You can parse these
        using the read() method of the appropriate HadoopStreamingProtocol
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise AssertionError('Run the job before streaming output')'Streaming final output from %s' % output_dir)

        def split_path(path):
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:

                yield name

                path = base

        for filename in
            subpath = filename[len(output_dir):]
            if not any(name.startswith('_') for name in split_path(subpath)):
                for line in self._cat_file(filename):
                    yield line

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
            return mode or self._opts['cleanup']

    def _cleanup_local_scratch(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple

        This particular function removes any local tmp directories
        added to the list self._local_tmp_dirs

        This won't remove output_dir if it's outside of our scratch dir.
        if self._local_tmp_dir:
  'removing tmp directory %s' % self._local_tmp_dir)
            except OSError, e:

        self._local_tmp_dir = None
Esempio n. 36
 def test_lazy_naming(self):
     wd = WorkingDirManager()
     wd.add('file', '')  # by default
     wd.add('file', '', name='')
                      {'': '', '': ''})
Esempio n. 37
 def test_okay_to_give_same_path_same_name(self):
     wd = WorkingDirManager()
     wd.add('file', 'foo/', name='')
     wd.add('file', 'foo/', name='')
                      {'': 'foo/'})
Esempio n. 38
 def test_eager_naming(self):
     wd = WorkingDirManager()
     wd.add('file', '')  # by default
     self.assertEqual('file', ''), '')
     # whoops, picked that name too soon!
     self.assertRaises(ValueError, wd.add, 'file', '', name='')
Esempio n. 39
 def test_bad_path_type(self):
     wd = WorkingDirManager()
     self.assertRaises(ValueError, wd.add, 'dir', '')
     self.assertRaises(ValueError, wd.name_to_path, 'dir')
     self.assertRaises(ValueError,, 'dir', '')
Esempio n. 40
 def test_cant_auto_name_unless_added_as_auto(self):
     wd = WorkingDirManager()
     wd.add('file', '', name='')
     self.assertEqual('file', '', ''), '')
             , 'file', '')
Esempio n. 41
 def test_cant_auto_name_unless_added_as_auto(self):
     wd = WorkingDirManager()
     wd.add('file', '', name='')
     self.assertEqual('file', '', ''), '')
             , 'file', '')
Esempio n. 42
class HadoopInTheCloudJobRunner(MRJobBinRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {

    # so far, every service provides the ability to run bootstrap scripts

    def __init__(self, **kwargs):
        """Set up cluster, bootstrap and SSH tunnel state.

        All keyword arguments are handed unchanged to the parent class's
        constructor.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # taken from the *cluster_id* option; when that is unset, this is
        # filled in later, once we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # parsed bootstrap commands: Python setup first, then the
        # user-supplied ones
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # manager for files used while bootstrapping
        # NOTE(review): nothing is registered with this manager in the loop
        # below -- confirm whether path tokens should be added to it.
        self._bootstrap_dir_mgr = WorkingDirManager()

        for bootstrap_cmd in self._bootstrap:
            for part in bootstrap_cmd:
                # only path dicts of type 'dir' need rewriting
                if not isinstance(part, dict) or part['type'] != 'dir':
                    continue

                # directories become archives
                part['path'] = self._dir_archive_path(part['path'])
                part['type'] = 'archive'

        # the master bootstrap script is only written when it's needed
        self._master_bootstrap_script_path = None

        # --- SSH tunnel state ---

        # process running the SSH tunnel, if any
        self._ssh_proc = None

        # once true, we stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None

    ### Options ###

    def _default_opts(self):
        return combine_dicts(
            super(HadoopInTheCloudJobRunner, self)._default_opts(),
                cloud_part_size_mb=100,  # 100 MB
                # don't use a list because it makes it hard to read option
                # values when running in verbose mode. See #1284
                ssh_bind_ports=xrange(40001, 40841),
                # ssh_bin isn't included here. For example, the Dataproc
                # runner launches ssh through the gcloud util

    def _fix_opts(self, opts, source=None):
        opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(opts,

        # cloud_part_size_mb should be a number
        if opts.get('cloud_part_size_mb') is not None:
            if not isinstance(opts['cloud_part_size_mb'],
                              (integer_types, float)):
                raise TypeError('cloud_part_size_mb must be a number')

        # patch max_hours_idle into max_mins_idle (see #1663)
        if opts.get('max_hours_idle') is not None:
                'max_hours_idle is deprecated and will be removed in v0.7.0.' +
                (' Please use max_mins_idle instead'
                 if opts.get('max_mins_idle') is None else ''))

            if opts.get('max_mins_idle') is None:
                opts['max_mins_idle'] = opts['max_hours_idle'] * 60

        return opts

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        Also propagate core instance type to task instance type, if it's
        not already set.
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides core_instance_type
            # set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
                        opt_priority[k] = i

            # instance_type only affects master_instance_type if there are
            # no other instances
            if opts['num_core_instances'] or opts['num_task_instances']:
                propagate_to = ['core_instance_type', 'task_instance_type']
                propagate_to = ['master_instance_type']

            for k in propagate_to:
                if opts[k] is None or (opt_priority[k] <
                    opts[k] = opts['instance_type']

        if not opts['task_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

        return opts

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Redefine this to return a (possibly empty) list of parsed commands
        (in the same format as returned by parse_setup_cmd())' to make sure a
        compatible version of Python is installed

        If the *bootstrap_python* option is false, should always return ``[]``.
        return []

    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory
        (usually via Hadoop). Redefine this as needed; for example, on EMR,
        we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
        installed at bootstrap time."""
        return 'hadoop fs -copyToLocal'

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:

        # don't bother if we're not starting a cluster
        if self._cluster_id:

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):

        # create if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file',
                'name': None,
                'path': self._mrjob_zip_path

            # find out where python keeps its libraries
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" % cmd_line(self._python_bin())

            # remove anything that might be in the way (see #1567)
            mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob'])

            # unzip
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
                'sudo %s -m compileall -q'
                ' -f $__mrjob_PYTHON_LIB/mrjob && true' %

        path = os.path.join(self._get_local_tmp_dir(), '')'writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(self._bootstrap +

        self._write_script(contents, path, 'master bootstrap script')

        self._master_bootstrap_script_path = path

    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)
        out = []

        # shebang, precommands

        # for example, create a tmp dir and cd to it
        if self._bootstrap_pre_commands():

        # store $PWD
        out.append('# store $PWD')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370

        # download files
        out.append('  # download files and mark them executable')

        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

        # download and unarchive archives
        archive_names_and_paths = sorted(
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                    '  %s %s %s' %
                    (cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                out.append('  ' + unarchive_cmd %
                                dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                out.append('  chmod u+rx -R $__mrjob_PWD/%s' %


        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(**token))
                    # it's raw script
                    line += token

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

    def _bootstrap_pre_commands(self):
        """A list of hard-coded commands to run at the beginning of the
        bootstrap script. Currently used by dataproc to cd into a tmp dir."""
        return []

    def _start_of_sh_script(self):
        """Return the opening lines of a shell script, without trailing
        newlines, as a list.

        As written, the list holds just the shebang line built from
        ``self._sh_bin()``.
        """
        shell_args = self._sh_bin()

        # a shell that isn't given as an absolute path is run through env
        if not shell_args[0].startswith('/'):
            shell_args = ['/usr/bin/env'] + shell_args

        # NOTE(review): a hook for 'set -e', etc. (see #1549) is mentioned
        # at this point, but no pre-commands are appended -- confirm
        # whether a line is missing.
        return ['#!' + cmd_line(shell_args)]

    ### Launching Clusters ###

    def _add_extra_cluster_params(self, params):
        """Return a dict with the *extra_cluster_params* opt patched into
        *params*, and ``None`` values removed."""
        params = params.copy()
        params = {k: v for k, v in params.items() if v is not None}

        return params

    ### SSH Tunnel ###

    def _ssh_tunnel_args(self, bind_port):
        """Redefine this in your subclass. You will probably want to call
        :py:meth:`_ssh_tunnel_opts` somewhere in here.

        Should return the list of args used to run the command
        to open the SSH tunnel, bound to *bind_port* on your computer,
        or ``None`` if it isn't possible to set up an SSH tunnel.
        return None

    def _ssh_tunnel_config(self):
        """Redefine this in your subclass. Should return a dict with the
        following keys:

        *localhost*: once we SSH in, is the web interface?
                     reachable at ``localhost``
        *name*: either ``'job tracker'`` or ``'resource manager'``
        *path*: path of main page on web interface (e.g. "/cluster")
        *port*: port number of the web interface
        raise NotImplementedError

    def _launch_ssh_proc(self, args):
        """Start the SSH tunnel command *args* and return the resulting
        :py:class:`subprocess.Popen`, with stdin, stdout and stderr all
        connected to pipes. You usually don't need to redefine this.
        """
        # log the command line before launching it
        log.debug('> %s' % cmd_line(args))

        stdio = dict(stdin=PIPE, stdout=PIPE, stderr=PIPE)
        return Popen(args, **stdio)

    def _ssh_launch_wait_secs(self):
        """Wait this long after launching the SSH process before checking
        for failure (default 1 second). You may redefine this."""
        return 1.0

    def _set_up_ssh_tunnel(self):
        """Call this whenever you think it is possible to SSH to your cluster.
        This sets :py:attr:`_ssh_proc`. Does nothing if :mrjob-opt:`ssh_tunnel`
        is not set, or there is already a tunnel process running.
        # did the user request an SSH tunnel?
        if not self._opts['ssh_tunnel']:

        # no point in trying to launch a nonexistent command twice
        if self._give_up_on_ssh_tunnel:

        # did we already launch the SSH tunnel process? is it still running?
        if self._ssh_proc:
            if self._ssh_proc.returncode is None:
                log.warning('  Oops, ssh subprocess exited with return code'
                            ' %d, restarting...' % self._ssh_proc.returncode)
                self._ssh_proc = None

        tunnel_config = self._ssh_tunnel_config()

        bind_port = None
        popen_exception = None
        ssh_tunnel_args = []

        for bind_port in self._pick_ssh_bind_ports():
            ssh_proc = None
            ssh_tunnel_args = self._ssh_tunnel_args(bind_port)

            # can't launch SSH tunnel right now
            if not ssh_tunnel_args:

                ssh_proc = self._launch_ssh_proc(ssh_tunnel_args)
            except OSError as ex:
                # e.g. OSError(2, 'File not found')
                popen_exception = ex  # warning handled below

            if ssh_proc:

                # still running. We are golden
                if ssh_proc.returncode is None:
                    self._ssh_proc = ssh_proc

        if self._ssh_proc:
            if self._opts['ssh_tunnel_is_open']:
                bind_host = socket.getfqdn()
                bind_host = 'localhost'
            self._ssh_tunnel_url = 'http://%s:%d%s' % (bind_host, bind_port,
  '  Connect to %s at: %s' %
                     (tunnel_config['name'], self._ssh_tunnel_url))

            if popen_exception:
                # this only happens if the ssh binary is not present
                # or not executable (so tunnel_config and the args to the
                # ssh binary don't matter)
                log.warning("    Couldn't open SSH tunnel: %s" %
                self._give_up_on_ssh_tunnel = True
                log.warning('    Failed to open ssh tunnel to %s' %

    def _kill_ssh_tunnel(self):
        """Send SIGKILL to SSH tunnel, if it's running."""
        if not self._ssh_proc:

        if self._ssh_proc.returncode is None:
  'Killing our SSH tunnel (pid %d)' %


                os.kill(, SIGKILL)
            except Exception as e:

        self._ssh_proc = None
        self._ssh_tunnel_url = None

    def _ssh_tunnel_opts(self, bind_port):
        """Options to SSH related to setting up a tunnel (rather than
        SSHing in). Helper for :py:meth:`_ssh_tunnel_args`.
        args = self._ssh_local_tunnel_opt(bind_port) + [
        if self._opts['ssh_tunnel_is_open']:
            args.extend(['-g', '-4'])  # -4: listen on IPv4 only

        return args

    def _ssh_local_tunnel_opt(self, bind_port):
        """Helper for :py:meth:`_ssh_tunnel_opts`."""
        tunnel_config = self._ssh_tunnel_config()

        return [
            '%d:%s:%d' % (

    def _pick_ssh_bind_ports(self):
        """Pick a list of ports to try binding our SSH tunnel to.

        We will try to bind the same port for any given cluster (Issue #67)
        # don't perturb the random number generator
        random_state = random.getstate()
            # seed random port selection on cluster ID
            num_picks = min(_MAX_SSH_RETRIES,
            return random.sample(self._opts['ssh_bind_ports'], num_picks)
Esempio n. 43
class MRJobRunner(object):
    """Abstract base class for all runners"""

    #: alias for this runner; used for picking section of
    #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``,
    #: ``'hadoop'``, or ``'mapr'``
    alias = None

    # class that __init__() instantiates (with the runner alias, the keyword
    # options and the config paths) to build self._opts
    OPTION_STORE_CLASS = RunnerOptionStore

    ### methods to call from your batch script ###

    def __init__(self, mr_job_script=None, conf_path=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, conf_paths=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_path: str, None, or False
        :param conf_path: Deprecated. Alternate path to read configs from, or
                          ``False`` to ignore all config files. Use
                          *conf_paths* instead.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``StringIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        self._ran_job = False

        if conf_path is not None:
            if conf_paths is not None:
                raise ValueError("Can't specify both conf_path and conf_paths")
                log.warn("The conf_path argument to MRJobRunner() is"
                         " deprecated. Use conf_paths instead.")
                if conf_path is False:
                    conf_paths = []
                    conf_paths = [conf_path]
        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # setup cmds and wrapper script
        self._setup_scripts = []
        for path in self._opts['setup_scripts']:
            setup_script = parse_legacy_hash_path('file', path)

        # we'll create the wrapper script later
        self._wrapper_script_path = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._file_upload_args.append((arg, arg_file))

        # set up uploading
        for path in self._opts['upload_files']:
                'file', path, must_name='upload_files'))
        for path in self._opts['upload_archives']:
                'archive', path, must_name='upload_archives'))

        # set up python archives
        self._python_archives = []
        for path in self._opts['python_archives']:

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        self._stdin = stdin or sys.stdin
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # give this job a unique name
        self._job_name = self._make_unique_job_name(
            label=self._opts['label'], owner=self._opts['owner'])

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # info about our steps. this is basically a cache for self._get_steps()
        self._steps = None

        # if this is True, we have to pipe input into the sort command
        # rather than feed it multiple files
        self._sort_is_windows_sort = None

    ### Filesystem object ###

    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem. Methods on :py:class:`~mrjob.fs.base.Filesystem` objects
        will be forwarded to :py:class:`~mrjob.runner.MRJobRunner` until mrjob
        0.5, but **this behavior is deprecated.**
        if self._fs is None:
            self._fs = LocalFilesystem()
        return self._fs

    def __getattr__(self, name):
        # For backward compatibility, forward filesystem methods
            return getattr(self.fs, name)
        except AttributeError:
            raise AttributeError(name)

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise an exception if there are any problems.
        if not self._script_path:
            raise AssertionError("No script to run!")

        if self._ran_job:
            raise AssertionError("Job already ran!")

        self._ran_job = True

    def stream_output(self):
        """Stream raw lines from the job's output. You can parse these
        using the read() method of the appropriate HadoopStreamingProtocol
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise AssertionError('Run the job before streaming output')'Streaming final output from %s' % output_dir)

        def split_path(path):
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:

                yield name

                path = base

        for filename in
            subpath = filename[len(output_dir):]
            if not any(name.startswith('_') for name in split_path(subpath)):
                for line in self._cat_file(filename):
                    yield line

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
            return mode or self._opts['cleanup']

    def _cleanup_local_scratch(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple

        This particular function removes any local tmp directories
        added to the list self._local_tmp_dirs

        This won't remove output_dir if it's outside of our scratch dir.
        if self._local_tmp_dir:
  'removing tmp directory %s' % self._local_tmp_dir)
            except OSError, e:

        self._local_tmp_dir = None
Esempio n. 44
 def test_empty(self):
     """A freshly created manager reports no archives and no files."""
     manager = WorkingDirManager()
     for kind in ("archive", "file"):
         self.assertEqual(manager.name_to_path(kind), {})
Esempio n. 45
    def __init__(self, mr_job_script=None, conf_paths=None,
                 extra_args=None, file_upload_args=None,
                 hadoop_input_format=None, hadoop_output_format=None,
                 input_paths=None, output_dir=None, partitioner=None,
                 stdin=None, **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           streaming should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        self._ran_job = False

        self._opts = self.OPTION_STORE_CLASS(self.alias, opts, conf_paths)
        self._fs = None

        self._working_dir_mgr = WorkingDirManager()

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(
            label=self._opts['label'], owner=self._opts['owner'])

        # we'll create the wrapper script later
        self._setup_wrapper_script_path = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._file_upload_args.append((arg, arg_file))

        # set up uploading
        for path in self._opts['upload_files']:
                'file', path, must_name='upload_files'))
        for path in self._opts['upload_archives']:
                'archive', path, must_name='upload_archives'))

        # python_archives, setup, setup_cmds, and setup_scripts
        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmds() for details
        self._setup = self._parse_setup()
        for cmd in self._setup:
            for maybe_path_dict in cmd:
                if isinstance(maybe_path_dict, dict):

        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # if this is True, we have to pipe input into the sort command
        # rather than feed it multiple files
        self._sort_is_windows_sort = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False
Esempio n. 46
 def test_simple(self):
     """Files and archives added to the manager are reported per type."""
     archive_uri = "s3://bucket/path/to/baz.tar.gz"
     expected_files = {"": "foo/"}
     expected_archives = {"baz.tar.gz": archive_uri}

     manager = WorkingDirManager()
     manager.add("archive", archive_uri)
     manager.add("file", "foo/")

     self.assertEqual(manager.name_to_path("file"), expected_files)
     self.assertEqual(manager.name_to_path("archive"), expected_archives)
Esempio n. 47
class MRJobRunner(object):
    """Abstract base class for all runners"""

    # this class handles the basic runner framework, options and config files,
    # arguments to mrjobs, and setting up job working dirs and environments.
    # this will put files from setup scripts, py_files, and bootstrap_mrjob
    # into the job's working dir, but won't actually run/import them
    # command lines to run substeps (including Spark) are handled by
    # mrjob.bin.MRJobBinRunner

    #: alias for this runner; used for picking section of
    #: :py:mod:``mrjob.conf`` to load one of ``'local'``, ``'emr'``,
    #: or ``'hadoop'``
    alias = None

    # names of the options this runner accepts; anything else is reported
    # as unexpected when options are sanitized.
    # libjars is only here because the job can set it; might want to
    # handle this with a warning from the launcher instead
    OPT_NAMES = {
        'bootstrap_mrjob',
        'check_input_paths',
        'cleanup',
        'cleanup_on_failure',
        'cmdenv',
        'jobconf',
        'label',
        'libjars',
        'local_tmp_dir',
        'owner',
        'py_files',
        'setup',
        'upload_archives',
        'upload_dirs',
        'upload_files',
    }

    # if this is true, when bootstrap_mrjob is true, add it through the
    # setup script

    ### methods to call from your batch script ###

    def __init__(self,
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the
                              :py:class:`~mrjob.job.MRJob`. If this is None,
                              you won't actually be able to :py:meth:`run` the
                              job, but other utilities (e.g. :py:meth:`ls`)
                              will work.
        :type conf_paths: None or list
        :param conf_paths: List of config files to combine and use, or None to
                           search for mrjob.conf in the default locations.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the
                           mr_job script. This is a hook to allow jobs to take
                           additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                                 The file at the given path will be uploaded
                                 to the local directory of the mr_job script
                                 when it runs, and then passed into the script
                                 with ``--ARGNAME``. Useful for passing in
                                 SQLite DBs and other configuration files to
                                 your job.
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                    class. Passed to Hadoop along with your
                                    first step with the ``-inputformat``
                                    option. Note that if you write your own
                                    class, you'll need to include it in your
                                    own custom streaming jar (see
                                    *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop
                                     ``OutputFormat`` class. Passed to Hadoop
                                     along with your first step with the
                                     ``-outputformat`` option. Note that if you
                                     write your own class, you'll need to
                                     include it in your own custom streaming
                                     jar (see *hadoop_streaming_jar*).
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and
                            recursively walks directories (e.g.
                            ``['data/common/', 'data/training/*.gz']``). If
                            this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: An empty/non-existent directory where Hadoop
                           should put the final output from the job.
                           If you don't specify an output directory, we'll
                           output into a subdirectory of this job's temporary
                           directory. You can control this from the command
                           line with ``--output-dir``. This option cannot be
                           set from configuration files. If used with the
                           hadoop runner, this path does not need to be fully
                           qualified with ``hdfs://`` URIs because it's
                           understood that it has to be on HDFS.
        :type partitioner: str
        :param partitioner: Optional name of a Hadoop partitioner class.
                            Hadoop streaming will use this to determine how
                            mapper output should be sorted and distributed
                            to reducers.
        :type sort_values: bool
        :param sort_values: if true, set partitioners and jobconf variables
                            so that reducers receive the values
                            associated with any key in sorted order (sorted by
                            their *encoded* value). Also known as secondary
                            sort.
        :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                      as stdin. This is a hook for testing; if you set
                      ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                      get passed through to the runner. If for some reason
                      your lines are missing newlines, we'll add them;
                      this makes it easier to write automated tests.
        :type step_output_dir: str
        :param step_output_dir: An empty/non-existent directory where Hadoop
                                should put output from all steps other than
                                the last one (this only matters for multi-step
                                jobs). Currently ignored by local runners.
        self._ran_job = False

        # opts are made from:
        # empty defaults (everything set to None)
        # runner-specific defaults
        # opts from config file(s)
        # opts from command line
        self._opts = self._combine_confs(
            [(None, {key: None
                     for key in self.OPT_NAMES})] +
            [(None, self._default_opts())] +
            load_opts_from_mrjob_confs(self.alias, conf_paths) +
            [('the command line', opts)])

        log.debug('Active configuration:')
                opt_key: self._obfuscate_opt(opt_key, opt_value)
                for opt_key, opt_value in self._opts.items()

        self._fs = None

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        self._working_dir_mgr = WorkingDirManager()

        # mapping from dir to path for corresponding archive. we pick
        # paths during init(), but don't actually create the archives
        # until self._create_dir_archives() is called
        self._dir_to_archive_path = {}
        # dir archive names (the filename minus ".tar.gz") already taken
        self._dir_archive_names_taken = set()
        # set of dir_archives that have actually been created
        self._dir_archives_created = set()

        # track (name, path) of files and archives to upload to spark.
        # these are a subset of those in self._working_dir_mgr
        self._spark_files = []
        self._spark_archives = []

        self._upload_mgr = None  # define in subclasses that use this

        self._script_path = mr_job_script
        if self._script_path:
            self._working_dir_mgr.add('file', self._script_path)

        # give this job a unique name
        self._job_key = self._make_unique_job_key(label=self._opts['label'],

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []
        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if extra_arg.get('type') != 'file':
                    raise NotImplementedError
                    (extra_arg['name'], extra_arg['path']))

        # extra file arguments to our job
        if file_upload_args:
            log.warning('file_upload_args is deprecated and will be removed'
                        ' in v0.6.0. Pass dicts to extra_args instead.')
            for arg, path in file_upload_args:
                arg_file = parse_legacy_hash_path('file', path)
                self._extra_args.extend([arg, arg_file])
                self._spark_files.append((arg_file['name'], arg_file['path']))

        # set up uploading
        for hash_path in self._opts['upload_files']:
            uf = parse_legacy_hash_path('file',
            self._spark_files.append((uf['name'], uf['path']))

        for hash_path in self._opts['upload_archives']:
            ua = parse_legacy_hash_path('archive',
            self._spark_archives.append((ua['name'], ua['path']))

        for hash_path in self._opts['upload_dirs']:
            # pick name based on directory path
            ud = parse_legacy_hash_path('dir',
            # but feed working_dir_mgr the archive's path
            archive_path = self._dir_archive_path(ud['path'])
            self._working_dir_mgr.add('archive', archive_path, name=ud['name'])
            self._spark_archives.append((ud['name'], archive_path))

        # py_files

        # self._setup is a list of shell commands with path dicts
        # interleaved; see mrjob.setup.parse_setup_cmd() for details
        self._setup = self._parse_setup_and_py_files()
        for cmd in self._setup:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archives tokens to archives
                    if token['type'] == 'dir':
                        # feed the archive's path to self._working_dir_mgr
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'


        # Where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        if PY2:
            self._stdin = stdin or sys.stdin
            self._stdin = stdin or sys.stdin.buffer
        self._stdin_path = None  # temp file containing dump from stdin

        # where a zip file of the mrjob library is stored locally
        self._mrjob_zip_path = None

        # store output_dir
        self._output_dir = output_dir

        # store partitioner
        self._partitioner = partitioner

        # store sort_values
        self._sort_values = sort_values

        # store step_output_dir
        self._step_output_dir = step_output_dir

        # store hadoop input and output formats
        self._hadoop_input_format = hadoop_input_format
        self._hadoop_output_format = hadoop_output_format

        # A cache for self._get_steps(); also useful as a test hook
        self._steps = None

        # this variable marks whether a cleanup has happened and this runner's
        # output stream is no longer available.
        self._closed = False

    ### Options ####

    def _default_opts(self):
            owner = getpass.getuser()
            owner = None

        return dict(

    def _combine_confs(self, source_and_opt_list):
        """Combine several opt dictionaries into one.

        *source_and_opt_list* is a list of tuples of *source*,
        *opts* where *opts* is a dictionary and *source* is either
        None or a description of where the opts came from (usually a path).

        Only override this if you need truly fine-grained control,
        including knowledge of the options' source.
        opt_list = [
            self._fix_opts(opts, source)
            for source, opts in source_and_opt_list

        return self._combine_opts(opt_list)

    def _combine_opts(self, opt_list):
        """Combine several opt dictionaries into one. *opt_list*
        is a list of dictionaries containing validated options

        The per-option combiner functions come from
        :py:meth:`_opt_combiners`.

        Override this if you need to base options off the values of
        other options, but don't need to issue warnings etc.
        about the options' source.
        """
        return combine_opts(self._opt_combiners(), *opt_list)

    def _opt_combiners(self):
        """A dictionary mapping opt name to combiner function. This
        won't necessarily include every opt name (opts without an entry
        are presumably handled by a default combiner).
        """
        return _combiners(self.OPT_NAMES)

    def _fix_opts(self, opts, source=None):
        """Take an options dictionary, and either return a sanitized
        version of it, or raise an exception.

        *source* is either a string describing where the opts came from
        or None.

        This ensures that opt dictionaries are really dictionaries
        and handles deprecated options.
        if source is None:
            source = 'defaults'  # defaults shouldn't trigger warnings

        if not isinstance(opts, dict):
            raise TypeError(
                'options for %s (from %s) must be a dict' % self.runner_alias,

        deprecated_aliases = _deprecated_aliases(self.OPT_NAMES)

        results = {}

        for k, v in sorted(opts.items()):

            # rewrite deprecated aliases
            if k in deprecated_aliases:
                if v is None:  # don't care

                aliased_opt = deprecated_aliases

                log.warning('Deprecated option %s (from %s) has been renamed'
                            ' to %s and will be removed in v0.7.0' %
                            (k, source, aliased_opt))

                if opts.get(aliased_opt) is not None:
                    return  # don't overwrite non-aliased opt

                k = aliased_opt

            if k in self.OPT_NAMES:
                results[k] = None if v is None else self._fix_opt(k, v, source)
                log.warning('Unexpected option %s (from %s)' % (k, source))

        return results

    def _fix_opt(self, opt_key, opt_value, source):
        """Fix a single option, returning its correct value or raising
        an exception. This is not called for options that are ``None``.

        This currently handles cleanup opts.

        Override this if you require additional opt validation or cleanup.
        if opt_key in ('cleanup', 'cleanup_on_failure'):
            return self._fix_cleanup_opt(opt_key, opt_value, source)
            return opt_value

    def _fix_cleanup_opt(self, opt_key, opt_value, source):
        """Normalize a cleanup option to a list of cleanup types.

        A bare string is wrapped in a one-element list. Raises
        :py:class:`ValueError` if ``'NONE'`` is combined with any other
        cleanup type, or if an entry isn't in ``CLEANUP_CHOICES``.
        """
        # a single cleanup type may be given as a plain string
        cleanup_types = (
            [opt_value] if isinstance(opt_value, string_types) else opt_value)

        if 'NONE' in cleanup_types and len(set(cleanup_types)) > 1:
            raise ValueError('Cannot clean up both nothing and something!'
                             ' (%s option from %s)' % (opt_key, source))

        if any(choice not in CLEANUP_CHOICES for choice in cleanup_types):
            raise ValueError(
                '%s must be one of %s, not %s (from %s)' %
                (opt_key, ', '.join(CLEANUP_CHOICES), cleanup_types, source))

        return cleanup_types

    def _obfuscate_opt(self, opt_key, opt_value):
        """Return value of opt to show in debug printout. Used to obfuscate
        credentials, etc."""
        return opt_value

    ### Filesystem object ###

    @property
    def fs(self):
        """:py:class:`~mrjob.fs.base.Filesystem` object for the local
        filesystem.

        Created on first access and cached in ``self._fs``. This is a
        property because the rest of this class uses it as an attribute
        (e.g. ``self.fs.exists(path)``).
        """
        if self._fs is None:
            # wrap LocalFilesystem in CompositeFilesystem to get IOError
            # on URIs (see #1185)
            self._fs = CompositeFilesystem(LocalFilesystem())
        return self._fs

    ### Running the job and parsing output ###

    def run(self):
        """Run the job, and block until it finishes.

        Raise :py:class:`~mrjob.step.StepFailedException` if there
        are any problems (except on
        :py:class:`~mrjob.inline.InlineMRJobRunner`, where we raise the
        actual exception that caused the step to fail).

        Raises :py:class:`AssertionError` if there is no script to run,
        or if the job was already run.
        """
        if not self._script_path:
            raise AssertionError("No script to run!")

        if self._ran_job:
            raise AssertionError("Job already ran!")

        # the subclass does the actual work; only mark the job as run
        # once that returns without raising
        self._run()
        self._ran_job = True

    def cat_output(self):
        """Stream the job's output, as a stream of ``bytes``. If there are
        multiple output files, there will be an empty bytestring
        (``b''``) between them.

        Files with any path component (relative to the output dir) that
        starts with ``_`` are skipped.

        Raises :py:class:`AssertionError` if the job hasn't been run yet.

        .. versionadded:: 0.6.0

           In previous versions, you'd use :py:meth:`stream_output`.
        """
        output_dir = self.get_output_dir()
        if output_dir is None:
            raise AssertionError('Run the job before streaming output')

        if self._closed is True:
            log.warning(
                'WARNING! Trying to stream output from a closed runner, output'
                ' will probably be empty.')

        log.info('Streaming final output from %s...' % output_dir)

        def split_path(path):
            # yield path components, last one first
            while True:
                base, name = os.path.split(path)

                # no more elements
                if not name:
                    break

                yield name

                path = base

        def ls_output():
            for filename in self.fs.ls(output_dir):
                subpath = filename[len(output_dir):]
                if not (any(
                        name.startswith('_') for name in split_path(subpath))):
                    yield filename

        for i, filename in enumerate(ls_output()):
            if i > 0:
                yield b''  # EOF of previous file

            for chunk in self.fs._cat_file(filename):
                yield chunk

    def stream_output(self):
        """Like :py:meth:`cat_output` except that it groups bytes into
        lines. Equivalent to ``mrjob.util.to_lines(runner.cat_output())``.

        Logs a deprecation warning every time it is called.

        .. deprecated:: 0.6.0
        """
        log.warning('stream_output() is deprecated and will be removed in'
                    ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                    ' instead.')

        return to_lines(self.cat_output())

    def _cleanup_mode(self, mode=None):
        """Actual cleanup action to take based on various options"""
        if self._script_path and not self._ran_job:
            return mode or self._opts['cleanup_on_failure']
            return mode or self._opts['cleanup']

    def _cleanup_cloud_tmp(self):
        """Cleanup any files/directories on cloud storage (e.g. S3) we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        pass  # only EMR runner does this

    def _cleanup_hadoop_tmp(self):
        """Cleanup any files/directories on HDFS we created
        while running this job. Should be safe to run this at any time, or
        multiple times.
        pass  # only Hadoop runner does this

    def _cleanup_local_tmp(self):
        """Cleanup any files/directories on the local machine we created while
        running this job. Should be safe to run this at any time, or multiple

        This particular function removes any local tmp directories
        added to the list self._local_tmp_dirs

        This won't remove output_dir if it's outside of our tmp dir.
        if self._local_tmp_dir:
  'Removing temp directory %s...' % self._local_tmp_dir)
            except OSError as e:

        self._local_tmp_dir = None

    def _cleanup_cluster(self):
        """Terminate the cluster if there is one."""
        pass  # this only happens on EMR

    def _cleanup_logs(self):
        """Cleanup any log files that are created as a side-effect of the job.
        pass  # this only happens on EMR

    def _cleanup_job(self):
        """Stop any jobs that we created that are still running."""
        pass  # currently disabled (see #1241)

    def cleanup(self, mode=None):
        """Clean up running jobs, temp files, and logs, subject to the
        *cleanup* option passed to the constructor.

        If you create your runner in a :keyword:`with` block,
        :py:meth:`cleanup` will be called automatically::

            with mr_job.make_runner() as runner:
                ...

            # cleanup() called automatically here

        :param mode: override *cleanup* passed into the constructor. Should be
                     a list of strings from :py:data:`CLEANUP_CHOICES`
        """
        mode = self._cleanup_mode(mode)

        def mode_has(*args):
            return any((choice in mode) for choice in args)

        # the cluster and job are only cleaned up if the job didn't
        # run to completion
        if self._script_path and not self._ran_job:
            if mode_has('CLUSTER', 'ALL'):
                self._cleanup_cluster()

            if mode_has('JOB', 'ALL'):
                self._cleanup_job()

        if mode_has('ALL', 'TMP', 'CLOUD_TMP'):
            self._cleanup_cloud_tmp()

        if mode_has('ALL', 'TMP', 'HADOOP_TMP'):
            self._cleanup_hadoop_tmp()

        if mode_has('ALL', 'TMP', 'LOCAL_TMP'):
            self._cleanup_local_tmp()

        if mode_has('ALL', 'LOGS'):
            self._cleanup_logs()

        self._closed = True

    def counters(self):
        """Get counters associated with this run in this form::

            [{'group name': {'counter1': 1, 'counter2': 2}},
             {'group name': ...}]

        The list contains an entry for every step of the current job.

        Not implemented in this class; raises
        :py:class:`NotImplementedError`.
        """
        raise NotImplementedError

    ### hooks for the with statement ###

    def __enter__(self):
        """Don't do anything special at start of with block"""
        return self

    def __exit__(self, type, value, traceback):
        """Call self.cleanup() at end of with block."""

    ### more runner information ###

    def get_opts(self):
        """Return a deep copy of this runner's options, as a dict.

        Logs a deprecation warning every time it is called.
        """
        log.warning('get_opts() is deprecated and will be removed in v0.7.0')
        opts_copy = copy.deepcopy(self._opts)
        return opts_copy

    def get_job_key(self):
        """Get the unique key for the job run by this runner
        (``self._job_key``).

        NOTE(review): presumably of the form
        ``label.owner.date.time.microseconds``, as built by
        :py:meth:`_make_unique_job_key` -- confirm where ``_job_key``
        is assigned.
        """
        return self._job_key

    def get_output_dir(self):
        """Return the directory containing the job's output, or ``None``
        if we have a script that hasn't run yet."""
        waiting_to_run = self._script_path and not self._ran_job
        return None if waiting_to_run else self._output_dir

    ### other methods you need to implement in your subclass ###

    def get_hadoop_version(self):
        """Return the version number of the Hadoop environment as a string if
        Hadoop is being used or simulated. Return None if not applicable.

        :py:class:`~mrjob.emr.EMRJobRunner` infers this from the cluster.
        :py:class:`~mrjob.hadoop.HadoopJobRunner` gets this from
        ``hadoop version``. :py:class:`~mrjob.local.LocalMRJobRunner` has an
        additional `hadoop_version` option to specify which version it
        simulates.
        :py:class:`~mrjob.inline.InlineMRJobRunner` does not simulate Hadoop at
        all.

        This implementation always returns ``None``.
        """
        return None

    # you'll probably want to add your own __init__() and cleanup() as well

    def _run(self):
        """Run the job."""
        raise NotImplementedError

    ### internal utilities for implementing MRJobRunners ###

    def _get_local_tmp_dir(self):
        """Create a tmp directory on the local filesystem that will be
        cleaned up by self.cleanup()"""
        if not self._local_tmp_dir:
            path = os.path.join(self._opts['local_tmp_dir'], self._job_key)
  'Creating temp directory %s' % path)
            if os.path.isdir(path):
            self._local_tmp_dir = path

        return self._local_tmp_dir

    def _make_unique_job_key(self, label=None, owner=None):
        """Come up with a useful unique ID for this job.

        We use this to choose the output directory, etc. for the job.
        # use the name of the script if one wasn't explicitly
        # specified
        if not label:
            if self._script_path:
                label = os.path.basename(self._script_path).split('.')[0]
                label = 'no_script'

        if not owner:
            owner = 'no_user'

        now = datetime.datetime.utcnow()
        return '%s.%s.%s.%06d' % (label, owner, now.strftime('%Y%m%d.%H%M%S'),

    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        if self._steps is None:
            self._steps = self._load_steps()

        return self._steps

    def _load_steps(self):
        """Ask job how many steps it has, and whether
        there are mappers and reducers for each step.

        Returns output as described in :ref:`steps-format`.
        raise NotImplementedError

    def _get_step(self, step_num):
        """Get a single step (calls :py:meth:`_get_steps`)."""
        return self._get_steps()[step_num]

    def _num_steps(self):
        """Get the number of steps (calls :py:meth:`get_steps`)."""
        return len(self._get_steps())

    def _has_streaming_steps(self):
        """Are any of our steps Hadoop streaming steps?"""
        return any(step['type'] == 'streaming' for step in self._get_steps())

    def _has_spark_steps(self):
        """Are any of our steps Spark steps (either spark or spark_script)"""
        return any(
            _is_spark_step_type(step['type']) for step in self._get_steps())

    def _args_for_task(self, step_num, mrc):
        return [
            '--step-num=%d' % step_num,
            '--%s' % mrc,
        ] + self._mr_job_extra_args()

    def _mr_job_extra_args(self, local=False):
        """Return arguments to add to every invocation of MRJob.

        :type local: boolean
        :param local: if this is True, use files' local paths rather than
            the path they'll have inside Hadoop streaming
        result = []

        for extra_arg in self._extra_args:
            if isinstance(extra_arg, dict):
                if local:

        return result

    def _dir_archive_path(self, dir_path):
        """Assign a path for the archive of *dir_path* but don't
        actually create anything."""
        if dir_path not in self._dir_to_archive_path:
            # we can check local paths now
            if not (is_uri(dir_path) or os.path.isdir(dir_path)):
                raise OSError('%s is not a directory!' % dir_path)

            name = name_uniquely(dir_path,

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]

    def _create_dir_archives(self):
        """Call this to create all dir archives"""
        for dir_path in sorted(set(self._dir_to_archive_path)):

    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`_create_dir_archives`.

        Create the ``.tar.gz`` assigned to *dir_path* by
        :py:meth:`_dir_archive_path`, unless it was already created.
        Files behind a URI are first downloaded to a scratch file in
        our local tmp dir.

        Raises :py:class:`OSError` if *dir_path* doesn't exist, is a
        file, or would be archived into itself.
        """
        import tarfile

        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):
            os.makedirs(os.path.dirname(tar_gz_path))

        # for remote files
        tmp_download_path = os.path.join(self._get_local_tmp_dir(),
                                         'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError('attempted to archive %s into itself!' %
                                  dir_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' %
                             (path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs._cat_file(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        # NOTE(review): assumes _dir_archives_created is a set
        self._dir_archives_created.add(tar_gz_path)


    def _bootstrap_mrjob(self):
        """Should we bootstrap mrjob?"""
        if self._opts['bootstrap_mrjob'] is None:
            return self._opts['interpreter'] is None
            return bool(self._opts['bootstrap_mrjob'])

    def _get_input_paths(self):
        """Get the paths to input files, dumping STDIN to a local
        file if need be."""
        if '-' in self._input_paths:
            if self._stdin_path is None:
                # prompt user, so they don't think the process has stalled
      'reading from STDIN')

                stdin_path = os.path.join(self._get_local_tmp_dir(), 'STDIN')
                log.debug('dumping stdin to local file %s' % stdin_path)
                with open(stdin_path, 'wb') as stdin_file:
                    for line in self._stdin:
                        # catch missing newlines (often happens with test data)
                        if not line.endswith(b'\n'):
                            line += b'\n'

                self._stdin_path = stdin_path

        return [self._stdin_path if p == '-' else p for p in self._input_paths]

    def _check_input_paths(self):
        """Check that input exists prior to running the job, if the
        `check_input_paths` option is true."""
        if not self._opts['check_input_paths']:

        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if not self.fs.can_handle_path(path):
                continue  # e.g. non-S3 URIs on EMR

            if not self.fs.exists(path):
                raise IOError('Input path %s does not exist!' % (path, ))

    def _intermediate_output_uri(self, step_num, local=False):
        """A URI for intermediate output for the given step number."""
        join = os.path.join if local else posixpath.join

        return join(self._step_output_dir or self._default_step_output_dir(),
                    '%04d' % step_num)

    def _default_step_output_dir(self):
        """Where to put output for steps other than the last one,
        if not specified by the *output_dir* constructor keyword.
        Usually you want this to be on HDFS (most efficient).

        Define this in your runner subclass.
        raise NotImplementedError

    def _step_input_uris(self, step_num):
        """A list of URIs to use as input for the given step. For all
        except the first step, this list will have a single item (a
        if step_num == 0:
            return [
                self._upload_mgr.uri(path) for path in self._get_input_paths()
            return [self._intermediate_output_uri(step_num - 1)]

    def _step_output_uri(self, step_num):
        """URI to use as output for the given step. This is either an
        intermediate dir (see :py:meth:`intermediate_output_uri`) or
        ``self._output_dir`` for the final step."""
        if step_num == len(self._get_steps()) - 1:
            return self._output_dir
            return self._intermediate_output_uri(step_num)

    def _interpolate_input_and_output(self, args, step_num):
        """Replace :py:data:`~mrjob.step.INPUT` and
        :py:data:`~mrjob.step.OUTPUT` in arguments to a jar or Spark

        If there are multiple input paths (i.e. on the first step), they'll
        be joined with a comma.
        def interpolate(arg):
            if arg == mrjob.step.INPUT:
                return ','.join(self._step_input_uris(step_num))
            elif arg == mrjob.step.OUTPUT:
                return self._step_output_uri(step_num)
                return arg

        return [interpolate(arg) for arg in args]

    def _create_mrjob_zip(self):
        """Make a zip of the mrjob library, without .pyc or .pyo files,
        editor backup/lock files or resource forks.

        This will also set ``self._mrjob_zip_path`` and return it.

        It's safe to call this method multiple times (we'll only create
        the zip file once.)
        """
        if not self._mrjob_zip_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            # NOTE(review): file name reconstructed -- confirm
            zip_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.zip')

            def filter_path(path):
                filename = os.path.basename(path)
                return not (filename.lower().endswith('.pyc')
                            or filename.lower().endswith('.pyo') or
                            # filter out emacs backup files
                            filename.endswith('~') or
                            # filter out emacs lock files
                            filename.startswith('.#') or
                            # filter out MacFuse resource forks
                            # NOTE(review): prefix reconstructed -- confirm
                            filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' %
                      (mrjob_dir, zip_path, os.path.join('mrjob', '')))
            zip_dir(mrjob_dir, zip_path, filter=filter_path, prefix='mrjob')

            self._mrjob_zip_path = zip_path

        return self._mrjob_zip_path

    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        The result combines, in order, :py:meth:`_sort_values_jobconf`,
        the *jobconf* option, and the step's own ``jobconf`` (if any).

        Also translate jobconfs to the current Hadoop version, if necessary.
        """
        step = self._get_step(step_num)

        # _sort_values_jobconf() isn't relevant to Spark,
        # but it doesn't do any harm either

        jobconf = combine_dicts(self._sort_values_jobconf(),
                                self._opts['jobconf'], step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        # and log a warning
        hadoop_version = self.get_hadoop_version()
        if hadoop_version:
            jobconf = translate_jobconf_dict(jobconf, hadoop_version)

        return jobconf

    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf

    def _sort_values_partitioner(self):
        """Partitioner to use with *sort_values* keyword to the constructor."""
        if self._sort_values:
            return _SORT_VALUES_PARTITIONER
            return None

    def _parse_setup_and_py_files(self):
        """Parse the *setup* option with
        :py:func:`mrjob.setup.parse_setup_cmd()`, and patch in *py_files*.
        setup = []

        # py_files
        for path in self._opts['py_files']:
            # Spark (at least v1.3.1) doesn't work with # and --py-files,
            # see #1375
            if '#' in path:
                raise ValueError("py_files cannot contain '#'")
            path_dict = parse_legacy_hash_path('file', path)
            setup.append(['export PYTHONPATH=', path_dict, ':$PYTHONPATH'])

        # setup
        for cmd in self._opts['setup']:

        return setup

    def _upload_args(self):
        # just upload every file and archive in the working dir manager
        return self._upload_args_helper('-files', None, '-archives', None)

    def _upload_args_helper(self, files_opt_str, files, archives_opt_str,
        args = []

        file_hash_paths = list(self._arg_hash_paths('file', files))
        if file_hash_paths:

        archive_hash_paths = list(self._arg_hash_paths('archive', archives))
        if archive_hash_paths:

        return args

    def _arg_hash_paths(self, type, named_paths=None):
        """Helper function for the *upload_args methods."""
        if named_paths is None:
            # just return everything managed by _working_dir_mgr
            named_paths = sorted(

        for name, path in named_paths:
            if not name:
                name =, path)
            uri = self._upload_mgr.uri(path)
            yield '%s#%s' % (uri, name)
Esempio n. 48
class HadoopInTheCloudJobRunner(MRJobBinRunner):
    """Abstract base class for all Hadoop-in-the-cloud services."""

    alias = '_cloud'

    OPT_NAMES = MRJobBinRunner.OPT_NAMES | {

    # so far, every service provides the ability to run bootstrap scripts

    def __init__(self, **kwargs):
        """Set up cluster, bootstrap and SSH tunnel state.

        All keyword arguments are passed through to the superclass
        constructor.
        """
        super(HadoopInTheCloudJobRunner, self).__init__(**kwargs)

        # if *cluster_id* is not set, ``self._cluster_id`` will be
        # set when we create or join a cluster
        self._cluster_id = self._opts['cluster_id']

        # bootstrapping
        self._bootstrap = self._bootstrap_python() + self._parse_bootstrap()

        # add files to manager
        self._bootstrap_dir_mgr = WorkingDirManager()

        for cmd in self._bootstrap:
            for token in cmd:
                if isinstance(token, dict):
                    # convert dir archive tokens to archives
                    if token['type'] == 'dir':
                        token['path'] = self._dir_archive_path(token['path'])
                        token['type'] = 'archive'

                    # NOTE(review): assumes path dicts carry exactly the
                    # keyword arguments WorkingDirManager.add() takes
                    self._bootstrap_dir_mgr.add(**token)

        # we'll create this script later, as needed
        self._master_bootstrap_script_path = None

        # ssh state

        # the process for the SSH tunnel
        self._ssh_proc = None

        # if this is true, stop trying to launch the SSH tunnel
        self._give_up_on_ssh_tunnel = False

        # store the (tunneled) URL of the job tracker/resource manager
        self._ssh_tunnel_url = None

    ### Options ###

    def _default_opts(self):
        """Default option values: the superclass's defaults, combined
        with the cloud-specific defaults below."""
        # NOTE(review): only the defaults still visible in this copy of
        # the file are listed; others may have been lost -- confirm
        # against the original before relying on this.
        return combine_dicts(
            super(HadoopInTheCloudJobRunner, self)._default_opts(),
            dict(
                cloud_part_size_mb=100,  # 100 MB
                # don't use a list because it makes it hard to read option
                # values when running in verbose mode. See #1284
                ssh_bind_ports=xrange(40001, 40841),
                # ssh_bin isn't included here. For example, the Dataproc
                # runner launches ssh through the gcloud util
            ),
        )

    def _fix_opts(self, opts, source=None):
        """Sanitize *opts* as the superclass does, then validate and
        patch the cloud-specific options.

        Raises :py:class:`TypeError` if *cloud_part_size_mb* is set but
        isn't a number. If the deprecated *max_hours_idle* is set, log a
        warning, and use it to fill in *max_mins_idle* unless that is
        already set.
        """
        opts = super(HadoopInTheCloudJobRunner, self)._fix_opts(
            opts, source=source)

        # cloud_part_size_mb should be a number
        if opts.get('cloud_part_size_mb') is not None:
            if not isinstance(opts['cloud_part_size_mb'],
                              (integer_types, float)):
                raise TypeError('cloud_part_size_mb must be a number')

        # patch max_hours_idle into max_mins_idle (see #1663)
        if opts.get('max_hours_idle') is not None:
            log.warning(
                'max_hours_idle is deprecated and will be removed in v0.7.0.' +
                (' Please use max_mins_idle instead'
                 if opts.get('max_mins_idle') is None else ''))

            if opts.get('max_mins_idle') is None:
                opts['max_mins_idle'] = opts['max_hours_idle'] * 60

        return opts

    def _combine_opts(self, opt_list):
        """Propagate *instance_type* to other instance type opts, if not
        already set.

        *instance_type* goes to the core and task instance types if there
        are any core or task instances, and to the master instance type
        otherwise. It also overrides a more specific instance type that
        was set earlier in *opt_list* than *instance_type* itself.

        Also propagate core instance type to task instance type, if it's
        not already set.
        """
        opts = super(HadoopInTheCloudJobRunner, self)._combine_opts(opt_list)

        if opts['instance_type']:
            # figure out how late in the configs opt was set (setting
            # --instance_type on the command line overrides core_instance_type
            # set in configs)
            opt_priority = {k: -1 for k in opts}

            for i, sub_opts in enumerate(opt_list):
                for k, v in sub_opts.items():
                    if v == opts[k]:
                        opt_priority[k] = i

            # instance_type only affects master_instance_type if there are
            # no other instances
            if opts['num_core_instances'] or opts['num_task_instances']:
                propagate_to = ['core_instance_type', 'task_instance_type']
            else:
                propagate_to = ['master_instance_type']

            for k in propagate_to:
                if opts[k] is None or (
                        opt_priority[k] < opt_priority['instance_type']):
                    opts[k] = opts['instance_type']

        if not opts['task_instance_type']:
            opts['task_instance_type'] = opts['core_instance_type']

        return opts

    ### Bootstrapping ###

    def _bootstrap_python(self):
        """Redefine this to return a (possibly empty) list of parsed commands
        (in the same format as returned by parse_setup_cmd())' to make sure a
        compatible version of Python is installed

        If the *bootstrap_python* option is false, should always return ``[]``.
        return []

    def _cp_to_local_cmd(self):
        """Command to copy files from the cloud to the local directory
        (usually via Hadoop). Redefine this as needed; for example, on EMR,
        we sometimes have to use ``aws s3 cp`` because ``hadoop`` isn't
        installed at bootstrap time."""
        return 'hadoop fs -copyToLocal'

    def _parse_bootstrap(self):
        """Parse the *bootstrap* option with
        return [parse_setup_cmd(cmd) for cmd in self._opts['bootstrap']]

    def _create_master_bootstrap_script_if_needed(self):
        """Helper for :py:meth:`_add_bootstrap_files_for_upload`.

        Create the master bootstrap script and write it into our local
        temp directory. Set self._master_bootstrap_script_path.

        This will do nothing if there are no bootstrap scripts or commands,
        or if it has already been called."""
        if self._master_bootstrap_script_path:

        # don't bother if we're not starting a cluster
        if self._cluster_id:

        # Also don't bother if we're not bootstrapping
        if not (self._bootstrap or self._bootstrap_mrjob()):

        # create if we need it, and add commands to install it
        mrjob_bootstrap = []
        if self._bootstrap_mrjob():
            assert self._mrjob_zip_path
            path_dict = {
                'type': 'file', 'name': None, 'path': self._mrjob_zip_path}

            # find out where python keeps its libraries
                "__mrjob_PYTHON_LIB=$(%s -c "
                "'from distutils.sysconfig import get_python_lib;"
                " print(get_python_lib())')" %

            # remove anything that might be in the way (see #1567)
            mrjob_bootstrap.append(['sudo rm -rf $__mrjob_PYTHON_LIB/mrjob'])

            # unzip
                ['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])

            # re-compile pyc files now, since mappers/reducers can't
            # write to this directory. Don't fail if there is extra
            # un-compileable crud in the tarball (this would matter if
            # sh_bin were 'sh -e')
                ['sudo %s -m compileall -q'
                 ' -f $__mrjob_PYTHON_LIB/mrjob && true' %

        path = os.path.join(self._get_local_tmp_dir(), '')'writing master bootstrap script to %s' % path)

        contents = self._master_bootstrap_script_content(
            self._bootstrap + mrjob_bootstrap)

        self._write_script(contents, path, 'master bootstrap script')

        self._master_bootstrap_script_path = path

    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)
        out = []

        # shebang, precommands

        # for example, create a tmp dir and cd to it
        if self._bootstrap_pre_commands():

        # store $PWD
        out.append('# store $PWD')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370

        # download files
        out.append('  # download files and mark them executable')

        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))

        # download and unarchive archives
        archive_names_and_paths = sorted(
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                out.append('  %s %s %s' % (
                    cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                out.append('  ' + unarchive_cmd % dict(
                    dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                    '  chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))


        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(**token))
                    # it's raw script
                    line += token

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out

    def _bootstrap_pre_commands(self):
        """A list of hard-coded commands to run at the beginning of the
        bootstrap script. Currently used by dataproc to cd into a tmp dir."""
        return []

    def _start_of_sh_script(self):
        """Return the opening lines of a generated shell script.

        The result is a list of lines (without trailing newlines): the
        shebang first, followed by any shell pre-commands.
        """
        lines = []

        # shebang; an interpreter given without an absolute path has to be
        # looked up through env
        sh_bin = self._sh_bin()
        if not sh_bin[0].startswith('/'):
            sh_bin = ['/usr/bin/env'] + sh_bin
        lines.append('#!' + cmd_line(sh_bin))

        # hook for 'set -e', etc. (see #1549)
        # NOTE(review): the call below was missing from this listing even
        # though the docstring promises pre-commands; the helper name
        # _sh_pre_commands() is assumed -- confirm it exists on this class
        lines.extend(self._sh_pre_commands())

        return lines

    ### Launching Clusters ###

    def _add_extra_cluster_params(self, params):
        """Return a dict with the *extra_cluster_params* opt patched into
        *params*, and ``None`` values removed."""
        params = deepcopy(params)

        for k, v in sorted(self._opts['extra_cluster_params'].items()):
            _patch_params(params, k, v)

        return params

    ### SSH Tunnel ###

    def _ssh_tunnel_args(self, bind_port):
        """Redefine this in your subclass. You will probably want to call
        :py:meth:`_ssh_tunnel_opts` somewhere in here.

        Should return the list of args used to run the command
        to open the SSH tunnel, bound to *bind_port* on your computer,
        or ``None`` if it isn't possible to set up an SSH tunnel.
        return None

    def _ssh_tunnel_config(self):
        """Redefine this in your subclass. Should return a dict with the
        following keys:

        *localhost*: once we SSH in, is the web interface?
                     reachable at ``localhost``
        *name*: either ``'job tracker'`` or ``'resource manager'``
        *path*: path of main page on web interface (e.g. "/cluster")
        *port*: port number of the web interface
        raise NotImplementedError

    def _launch_ssh_proc(self, args):
        """Start the SSH tunnel subprocess and return its
        :py:class:`subprocess.Popen` object.

        The command line is logged at debug level first. All three standard
        streams of the child are pipes. You usually don't need to redefine
        this.
        """
        command_text = cmd_line(args)
        log.debug('> %s' % command_text)

        piped_streams = dict(stdin=PIPE, stdout=PIPE, stderr=PIPE)
        return Popen(args, **piped_streams)

    def _ssh_launch_wait_secs(self):
        """Wait this long after launching the SSH process before checking
        for failure (default 1 second). You may redefine this."""
        return 1.0

    def _set_up_ssh_tunnel(self):
        """Call this whenever you think it is possible to SSH to your cluster.
        This sets :py:attr:`_ssh_proc`. Does nothing if :mrjob-opt:`ssh_tunnel`
        is not set, or there is already a tunnel process running.
        # did the user request an SSH tunnel?
        if not self._opts['ssh_tunnel']:

        # no point in trying to launch a nonexistent command twice
        if self._give_up_on_ssh_tunnel:

        # did we already launch the SSH tunnel process? is it still running?
        if self._ssh_proc:
            if self._ssh_proc.returncode is None:
                log.warning('  Oops, ssh subprocess exited with return code'
                            ' %d, restarting...' % self._ssh_proc.returncode)
                self._ssh_proc = None

        tunnel_config = self._ssh_tunnel_config()

        bind_port = None
        popen_exception = None
        ssh_tunnel_args = []

        for bind_port in self._pick_ssh_bind_ports():
            ssh_proc = None
            ssh_tunnel_args = self._ssh_tunnel_args(bind_port)

            # can't launch SSH tunnel right now
            if not ssh_tunnel_args:

                ssh_proc = self._launch_ssh_proc(ssh_tunnel_args)
            except OSError as ex:
                # e.g. OSError(2, 'File not found')
                popen_exception = ex   # warning handled below

            if ssh_proc:

                # still running. We are golden
                if ssh_proc.returncode is None:
                    self._ssh_proc = ssh_proc

        if self._ssh_proc:
            if self._opts['ssh_tunnel_is_open']:
                bind_host = socket.getfqdn()
                bind_host = 'localhost'
            self._ssh_tunnel_url = 'http://%s:%d%s' % (
                bind_host, bind_port, tunnel_config['path'])
  '  Connect to %s at: %s' % (
                tunnel_config['name'], self._ssh_tunnel_url))

            if popen_exception:
                # this only happens if the ssh binary is not present
                # or not executable (so tunnel_config and the args to the
                # ssh binary don't matter)
                    "    Couldn't open SSH tunnel: %s" % popen_exception)
                self._give_up_on_ssh_tunnel = True
                    '    Failed to open ssh tunnel to %s' %

    def _kill_ssh_tunnel(self):
        """Send SIGKILL to SSH tunnel, if it's running."""
        if not self._ssh_proc:

        if self._ssh_proc.returncode is None:
  'Killing our SSH tunnel (pid %d)' %


                if hasattr(signal, 'SIGKILL'):
                    os.kill(, signal.SIGKILL)
                    # Windows doesn't have SIGKILL, see #1892
                    os.kill(, signal.SIGABRT)
            except Exception as e:

        self._ssh_proc = None
        self._ssh_tunnel_url = None

    def _ssh_tunnel_opts(self, bind_port):
        """Options to SSH related to setting up a tunnel (rather than
        SSHing in). Helper for :py:meth:`_ssh_tunnel_args`.
        args = self._ssh_local_tunnel_opt(bind_port) + [
            '-N', '-n', '-q',
        if self._opts['ssh_tunnel_is_open']:
            args.extend(['-g', '-4'])  # -4: listen on IPv4 only

        return args

    def _ssh_local_tunnel_opt(self, bind_port):
        """Helper for :py:meth:`_ssh_tunnel_opts`."""
        tunnel_config = self._ssh_tunnel_config()

        return [
            '-L', '%d:%s:%d' % (

    def _pick_ssh_bind_ports(self):
        """Pick a list of ports to try binding our SSH tunnel to.

        We will try to bind the same port for any given cluster (Issue #67)
        # don't perturb the random number generator
        random_state = random.getstate()
            # seed random port selection on cluster ID
            num_picks = min(_MAX_SSH_RETRIES,
            return random.sample(self._opts['ssh_bind_ports'], num_picks)