Code Example #1
File: test_conf.py Project: Affirm/mrjob
    def test_include_relative_to_real_path(self):
        os.mkdir(os.path.join(self.tmp_dir, 'conf'))

        base_conf_path = os.path.join(self.tmp_dir, 'conf', 'mrjob.base.conf')
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, 'conf', 'mrjob.conf')
        conf_symlink_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': 'mrjob.base.conf'}, f)

        os.symlink(os.path.join('conf', 'mrjob.conf'), conf_symlink_path)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(real_base_conf_path, {}), (conf_path, {})])

        # relative include should work from the symlink even though
        # it's not in the same directory as mrjob.base.conf
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_symlink_path),
            [(real_base_conf_path, {}), (conf_symlink_path, {})])
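These symlink tests hinge on mrjob resolving a relative include against the real (symlink-free) path of the including file. A self-contained sketch of why that matters, using plain os calls rather than mrjob itself:

import os
import tempfile

# Sketch only: shows why realpath() is the right base for relative includes.
tmp_dir = tempfile.mkdtemp()
os.mkdir(os.path.join(tmp_dir, 'conf'))
open(os.path.join(tmp_dir, 'conf', 'mrjob.base.conf'), 'w').close()
open(os.path.join(tmp_dir, 'conf', 'mrjob.conf'), 'w').close()

symlink_path = os.path.join(tmp_dir, 'mrjob.conf')
os.symlink(os.path.join('conf', 'mrjob.conf'), symlink_path)

# Resolving 'mrjob.base.conf' against the symlink's own directory misses:
naive = os.path.join(os.path.dirname(symlink_path), 'mrjob.base.conf')
assert not os.path.exists(naive)

# Resolving against the symlink's real path finds the file in conf/:
real_dir = os.path.dirname(os.path.realpath(symlink_path))
assert os.path.exists(os.path.join(real_dir, 'mrjob.base.conf'))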
Code Example #2
File: test_conf.py Project: zhiaozhou/mrjob
    def test_include_relative_to_real_path(self):
        os.mkdir(os.path.join(self.tmp_dir, 'conf'))

        base_conf_path = os.path.join(self.tmp_dir, 'conf', 'mrjob.base.conf')
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, 'conf', 'mrjob.conf')
        conf_symlink_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': 'mrjob.base.conf'}, f)

        os.symlink(os.path.join('conf', 'mrjob.conf'), conf_symlink_path)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(real_base_conf_path, {}), (conf_path, {})])

        # relative include should work from the symlink even though
        # it's not in the same directory as mrjob.base.conf
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_symlink_path),
            [(real_base_conf_path, {}), (conf_symlink_path, {})])
Code Example #3
File: test_conf.py Project: JeffersonK/mrjob
    def test_load_mrjob_conf_and_load_opts(self):
        conf_path = os.path.join(self.tmp_dir, "mrjob.conf.2")
        with open(conf_path, "w") as f:
            f.write('{"runners": {"foo": {"qux": "quux"}}}')

        self.assertEqual(load_mrjob_conf(conf_path=conf_path), {"runners": {"foo": {"qux": "quux"}}})
        self.assertEqual(load_opts_from_mrjob_conf("foo", conf_path=conf_path)[0][1], {"qux": "quux"})
        # test missing options
        with logger_disabled("mrjob.conf"):
            self.assertEqual(load_opts_from_mrjob_conf("bar", conf_path=conf_path)[0][1], {})
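The `[0][1]` indexing reflects the return shape in this generation of mrjob: a list of `(path, opts_dict)` tuples ordered from lowest to highest precedence. A hypothetical helper (`flatten_opts` is not part of mrjob) showing how callers typically collapse that list, just as `build_config()` in example #13 does:

def flatten_opts(path_opt_pairs):
    # Merge [(path, opts), ...] into one dict; later (higher-precedence)
    # files override earlier ones.
    combined = {}
    for _path, opts in path_opt_pairs:
        combined.update(opts)
    return combined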
Code Example #4
File: conf_test.py Project: gimlids/LTPM
    def test_load_mrjob_conf_and_load_opts(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
        with open(conf_path, 'w') as f:
            f.write('{"runners": {"foo": {"qux": "quux"}}}')

        assert_equal(load_mrjob_conf(conf_path=conf_path),
                     {'runners': {'foo': {'qux': 'quux'}}})
        assert_equal(load_opts_from_mrjob_conf('foo', conf_path=conf_path),
                     {'qux': 'quux'})
        # test missing options
        with logger_disabled('mrjob.conf'):
            assert_equal(
                load_opts_from_mrjob_conf('bar', conf_path=conf_path), {})
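This example (along with #10 and #12, also from older copies of the codebase) evidently predates the list-of-tuples return value: here `load_opts_from_mrjob_conf` hands back the runner's options dict directly, which is why the assertions compare against `{'qux': 'quux'}` and `{}` without any tuple indexing.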
Code Example #5
    def test_recursive_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': conf_path}, f)

        self.assertEqual(load_opts_from_mrjob_conf('foo', conf_path),
                         [(conf_path, {})])
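As this test shows, a config file that includes itself is still loaded exactly once: the loader apparently skips paths it has already read, so include cycles (see the doubly recursive examples below) terminate rather than recursing forever.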
Code Example #6
File: test_conf.py Project: zhiaozhou/mrjob
    def test_nested_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')
        conf_path_3 = os.path.join(self.tmp_dir, 'mrjob.3.conf')

        # accidentally reversed the order of nested includes when
        # trying to make precedence work; this test would catch that

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': conf_path_1}, f)

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_2, conf_path_3]}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path_3, 'w') as f:
            dump_mrjob_conf({}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(conf_path_2, {}),
             (conf_path_3, {}),
             (conf_path_1, {}),
             (conf_path, {})])
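The expected order encodes precedence: included files come first (lowest precedence), nested includes are flattened depth-first, and the including file itself lands last so its options override anything it pulls in.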
Code Example #7
File: test_conf.py Project: Affirm/mrjob
    def test_nested_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')
        conf_path_3 = os.path.join(self.tmp_dir, 'mrjob.3.conf')

        # accidentally reversed the order of nested includes when
        # trying to make precedence work; this test would catch that

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': conf_path_1}, f)

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_2, conf_path_3]}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path_3, 'w') as f:
            dump_mrjob_conf({}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(conf_path_2, {}),
             (conf_path_3, {}),
             (conf_path_1, {}),
             (conf_path, {})])
Code Example #8
File: test_conf.py Project: Affirm/mrjob
    def test_recursive_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': conf_path}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(conf_path, {})])
Code Example #9
File: test_conf.py Project: tummykung/mrjob
    def test_load_mrjob_conf_and_load_opts(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
        with open(conf_path, 'w') as f:
            f.write('{"runners": {"foo": {"qux": "quux"}}}')

        with no_handlers_for_logger('mrjob.conf'):
            self.assertEqual(
                load_mrjob_conf(conf_path=conf_path),
                {'runners': {'foo': {'qux': 'quux'}}})
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path=conf_path)[0][1],
            {'qux': 'quux'})
        # test missing options
        with logger_disabled('mrjob.conf'):
            self.assertEqual(
                load_opts_from_mrjob_conf('bar', conf_path=conf_path)[0][1],
                {})
Code Example #10
    def test_load_and_load_opts_use_find_mrjob_conf(self):
        os.environ['HOME'] = self.tmp_dir

        dot_mrjob_path = os.path.join(self.tmp_dir, '.mrjob.conf')
        with open(dot_mrjob_path, 'w') as f:
            f.write('{"runners": {"foo": {"bar": "baz"}}}')

        assert_equal(load_mrjob_conf(), {'runners': {'foo': {'bar': 'baz'}}})
        assert_equal(load_opts_from_mrjob_conf('foo'), {'bar': 'baz'})
Code Example #11
File: test_conf.py Project: JeffersonK/mrjob
    def test_load_and_load_opts_use_find_mrjob_conf(self):
        os.environ["HOME"] = self.tmp_dir

        dot_mrjob_path = os.path.join(self.tmp_dir, ".mrjob.conf")
        with open(dot_mrjob_path, "w") as f:
            f.write('{"runners": {"foo": {"bar": "baz"}}}')

        self.assertEqual(load_mrjob_conf(), {"runners": {"foo": {"bar": "baz"}}})
        self.assertEqual(load_opts_from_mrjob_conf("foo")[0][1], {"bar": "baz"})
Code Example #12
File: conf_test.py Project: gimlids/LTPM
    def test_load_and_load_opts_use_find_mrjob_conf(self):
        os.environ['HOME'] = self.tmp_dir

        dot_mrjob_path = os.path.join(self.tmp_dir, '.mrjob.conf')
        with open(dot_mrjob_path, 'w') as f:
            f.write('{"runners": {"foo": {"bar": "baz"}}}')

        assert_equal(load_mrjob_conf(),
                     {'runners': {'foo': {'bar': 'baz'}}})
        assert_equal(load_opts_from_mrjob_conf('foo'), {'bar': 'baz'})
Code Example #13
File: main.py Project: niranjangit6/aws-dev
def build_config():
    runner_kwargs = dict()
    opts = load_opts_from_mrjob_conf('emr', conf_path=conf_path())
    for _, kwargs in opts:
        runner_kwargs.update(kwargs)
    _add_subnet(runner_kwargs)
    _add_key_pair(runner_kwargs)
    _add_tmp_dir(runner_kwargs)
    logger.info('build_config() -- conf_path: %s', conf_path())
    return runner_kwargs
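A plausible caller for `build_config()`; note that `conf_path()` and the `_add_*` functions above are project-local helpers, and the wiring below is an illustration rather than part of the original module ('my_job.py' is a placeholder script path):

from mrjob.emr import EMRJobRunner

# Hypothetical usage: feed the merged conf options straight into a runner.
runner_kwargs = build_config()
with EMRJobRunner(mr_job_script='my_job.py', **runner_kwargs) as runner:
    runner.run()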
Code Example #14
File: test_conf.py Project: kartheek6/mrjob
    def test_doubly_recursive_include(self):
        conf_path_1 = os.path.join(self.tmp_dir, "mrjob.1.conf")
        conf_path_2 = os.path.join(self.tmp_dir, "mrjob.2.conf")

        with open(conf_path_1, "w") as f:
            dump_mrjob_conf({"include": conf_path_2}, f)

        with open(conf_path_2, "w") as f:
            dump_mrjob_conf({"include": conf_path_1}, f)

        self.assertEqual(load_opts_from_mrjob_conf("foo", conf_path_1), [(conf_path_2, {}), (conf_path_1, {})])
Code Example #15
    def test_doubly_recursive_include(self):
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': conf_path_2}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({'include': conf_path_1}, f)

        self.assertEqual(load_opts_from_mrjob_conf('foo', conf_path_1),
                         [(conf_path_2, {}), (conf_path_1, {})])
Code Example #16
File: test_conf.py Project: kartheek6/mrjob
    def test_relative_include(self):
        base_conf_path = os.path.join(self.tmp_dir, "mrjob.base.conf")
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, "mrjob.conf")

        with open(base_conf_path, "w") as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, "w") as f:
            dump_mrjob_conf({"include": "mrjob.base.conf"}, f)

        self.assertEqual(load_opts_from_mrjob_conf("foo", conf_path), [(real_base_conf_path, {}), (conf_path, {})])
Code Example #17
File: test_conf.py Project: Affirm/mrjob
    def test_doubly_recursive_include(self):
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': conf_path_2}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({'include': conf_path_1}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path_1),
            [(conf_path_2, {}), (conf_path_1, {})])
Code Example #18
    def test_relative_include(self):
        base_conf_path = os.path.join(self.tmp_dir, 'mrjob.base.conf')
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': 'mrjob.base.conf'}, f)

        self.assertEqual(load_opts_from_mrjob_conf('foo', conf_path),
                         [(real_base_conf_path, {}), (conf_path, {})])
Code Example #19
    def test_tilde_in_include(self):
        # regression test for #1308

        os.environ['HOME'] = self.tmp_dir
        base_conf_path = os.path.join(self.tmp_dir, 'mrjob.base.conf')
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': '~/mrjob.base.conf'}, f)

        self.assertEqual(load_opts_from_mrjob_conf('foo', conf_path),
                         [(base_conf_path, {}), (conf_path, {})])
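The expected result here is the plain `base_conf_path`, not its realpath: the `~` presumably goes through `os.path.expanduser()` against the `HOME` the test just set, so no symlink resolution is involved.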
Code Example #20
File: test_conf.py Project: kartheek6/mrjob
    def test_include_relative_to_real_path(self):
        os.mkdir(os.path.join(self.tmp_dir, "conf"))

        base_conf_path = os.path.join(self.tmp_dir, "conf", "mrjob.base.conf")
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, "conf", "mrjob.conf")
        conf_symlink_path = os.path.join(self.tmp_dir, "mrjob.conf")

        with open(base_conf_path, "w") as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, "w") as f:
            dump_mrjob_conf({"include": "mrjob.base.conf"}, f)

        os.symlink(os.path.join("conf", "mrjob.conf"), conf_symlink_path)

        self.assertEqual(load_opts_from_mrjob_conf("foo", conf_path), [(real_base_conf_path, {}), (conf_path, {})])

        # relative include should work from the symlink even though
        # it's not in the same directory as mrjob.base.conf
        self.assertEqual(
            load_opts_from_mrjob_conf("foo", conf_symlink_path), [(real_base_conf_path, {}), (conf_symlink_path, {})]
        )
Code Example #21
File: test_conf.py Project: Affirm/mrjob
    def test_relative_include(self):
        base_conf_path = os.path.join(self.tmp_dir, 'mrjob.base.conf')
        real_base_conf_path = os.path.realpath(base_conf_path)

        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': 'mrjob.base.conf'}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(real_base_conf_path, {}), (conf_path, {})])
Code Example #22
File: test_conf.py Project: Affirm/mrjob
    def test_tilde_in_include(self):
        # regression test for #1308

        os.environ['HOME'] = self.tmp_dir
        base_conf_path = os.path.join(self.tmp_dir, 'mrjob.base.conf')
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(base_conf_path, 'w') as f:
            dump_mrjob_conf({}, f)

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': '~/mrjob.base.conf'}, f)

        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(base_conf_path, {}), (conf_path, {})])
Code Example #23
    def test_include_order_beats_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_1, conf_path_2]}, f)

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_2]}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({}, f)

        # shouldn't matter that conf_path_1 includes conf_path_2
        self.assertEqual(load_opts_from_mrjob_conf('foo', conf_path),
                         [(conf_path_1, {}), (conf_path_2, {}),
                          (conf_path, {})])
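In other words, a file's position in the `include` list determines its precedence: `conf_path_2` keeps its later, higher-precedence slot even though `conf_path_1` drags it in earlier as well.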
Code Example #24
File: test_conf.py Project: kartheek6/mrjob
    def test_include_order_beats_include(self):
        conf_path = os.path.join(self.tmp_dir, "mrjob.conf")
        conf_path_1 = os.path.join(self.tmp_dir, "mrjob.1.conf")
        conf_path_2 = os.path.join(self.tmp_dir, "mrjob.2.conf")

        with open(conf_path, "w") as f:
            dump_mrjob_conf({"include": [conf_path_1, conf_path_2]}, f)

        with open(conf_path_1, "w") as f:
            dump_mrjob_conf({"include": [conf_path_2]}, f)

        with open(conf_path_2, "w") as f:
            dump_mrjob_conf({}, f)

        # shouldn't matter that conf_path_1 includes conf_path_2
        self.assertEqual(
            load_opts_from_mrjob_conf("foo", conf_path), [(conf_path_1, {}), (conf_path_2, {}), (conf_path, {})]
        )
Code Example #25
File: test_conf.py Project: Affirm/mrjob
    def test_include_order_beats_include(self):
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
        conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')

        with open(conf_path, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_1, conf_path_2]}, f)

        with open(conf_path_1, 'w') as f:
            dump_mrjob_conf({'include': [conf_path_2]}, f)

        with open(conf_path_2, 'w') as f:
            dump_mrjob_conf({}, f)

        # shouldn't matter that conf_path_1 includes conf_path_2
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path),
            [(conf_path_1, {}), (conf_path_2, {}), (conf_path, {})])
Code Example #26
File: runner.py Project: icio/mrjob
    def __init__(self, alias, opts, conf_path):
        super(RunnerOptionStore, self).__init__()

        # sanitize incoming options and issue warnings for bad keys
        opts = self.validated_options(
            opts, 'Got unexpected keyword arguments: %s')

        unsanitized_opt_dicts = load_opts_from_mrjob_conf(
            alias, conf_path=conf_path)

        for path, mrjob_conf_opts in unsanitized_opt_dicts:
            self.cascading_dicts.append(self.validated_options(
                mrjob_conf_opts, 'Got unexpected opts from %s: %%s' % path))

        self.cascading_dicts.append(opts)

        self.populate_values_from_cascading_dicts()

        self._validate_cleanup()
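The cascading-dict idea is easy to sketch standalone: dicts are consulted lowest-precedence-first and later values win. A toy reduction (not mrjob's actual `RunnerOptionStore`, which additionally runs per-key combiner functions):

def resolve(cascading_dicts):
    # Combine dicts lowest-precedence-first; treat None as 'not set here'.
    final = {}
    for d in cascading_dicts:
        for key, value in d.items():
            if value is not None:
                final[key] = value
    return final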
Code Example #27
File: runner.py Project: Jyrsa/mrjob
    def __init__(self, mr_job_script=None, conf_path=None,
                 extra_args=None, file_upload_args=None,
                 input_paths=None, output_dir=None, stdin=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work.
        :type conf_path: str
        :param conf_path: Alternate path to read configs from, or ``False`` to ignore all config files.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. For example: ``['--protocol', 'repr']``. This is a hook to allow jobs to take additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job.
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: an empty/non-existent directory where Hadoop streaming should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``.
        :param stdin: an iterable (can be a ``StringIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests.

        All runners also take the following options as keyword arguments.
        These can be defaulted in your :mod:`mrjob.conf` file:

        :type base_tmp_dir: str
        :param base_tmp_dir: path to put local temp dirs inside. By default we just call :py:func:`tempfile.gettempdir`
        :type bootstrap_mrjob: bool
        :param bootstrap_mrjob: should we automatically tar up the mrjob library and install it when we run the mrjob? Set this to ``False`` if you've already installed ``mrjob`` on your Hadoop cluster.
        :type cleanup: str
        :param cleanup: is :py:meth:`cleanup` allowed to clean up logs and scratch files? See :py:data:`CLEANUP_CHOICES`.
        :type cmdenv: dict
        :param cmdenv: environment variables to pass to the job inside Hadoop streaming
        :type hadoop_extra_args: list of str
        :param hadoop_extra_args: extra arguments to pass to hadoop streaming
        :type hadoop_input_format: str
        :param hadoop_input_format: name of an optional Hadoop ``InputFormat`` class. Passed to Hadoop along with your first step with the ``-inputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*).
        :type hadoop_output_format: str
        :param hadoop_output_format: name of an optional Hadoop ``OutputFormat`` class. Passed to Hadoop along with your first step with the ``-outputformat`` option. Note that if you write your own class, you'll need to include it in your own custom streaming jar (see *hadoop_streaming_jar*).
        :type hadoop_streaming_jar: str
        :param hadoop_streaming_jar: path to a custom hadoop streaming jar.
        :type jobconf: dict
        :param jobconf: ``-jobconf`` args to pass to hadoop streaming. This should be a map from property name to value. Equivalent to passing ``['-jobconf', 'KEY1=VALUE1', '-jobconf', 'KEY2=VALUE2', ...]`` to ``hadoop_extra_args``.
        :type label: str
        :param label: description of this job to use as the part of its name. By default, we use the script's module name, or ``no_script`` if there is none. This used to be called *job_name_prefix* (which still works but is deprecated).
        :type owner: str
        :param owner: who is running this job. Used solely to set the job name. By default, we use :py:func:`getpass.getuser`, or ``no_user`` if it fails.
        :type python_archives: list of str
        :param python_archives: same as upload_archives, except they get added to the job's :envvar:`PYTHONPATH`
        :type python_bin: str
        :param python_bin: Name/path of alternate python binary for mappers/reducers (e.g. for use with :py:mod:`virtualenv`). Defaults to ``'python'``.
        :type setup_cmds: list
        :param setup_cmds: a list of commands to run before each mapper/reducer step (e.g. ``['cd my-src-tree; make', 'mkdir -p /tmp/foo']``). You can specify commands as strings, which will be run through the shell, or lists of args, which will be invoked directly. We'll use file locking to ensure that multiple mappers/reducers running on the same node won't run *setup_cmds* simultaneously (it's safe to run ``make``).
        :type setup_scripts: list of str
        :param setup_scripts: files that will be copied into the local working directory and then run. These are run after *setup_cmds*. Like with *setup_cmds*, we use file locking to keep multiple mappers/reducers on the same node from running *setup_scripts* simultaneously.
        :type steps_python_bin: str
        :param steps_python_bin: Name/path of alternate python binary to use to query the job about its steps (e.g. for use with :py:mod:`virtualenv`). Rarely needed. Defaults to ``sys.executable`` (the current python interpreter).
        :type upload_archives: list of str
        :param upload_archives: a list of archives (e.g. tarballs) to unpack in the local directory of the mr_job script when it runs. You can set the local name of the dir we unpack into by appending ``#localname`` to the path; otherwise we just use the name of the archive file (e.g. ``foo.tar.gz``)
        :type upload_files: list of str
        :param upload_files: a list of files to copy to the local directory of the mr_job script when it runs. You can set the local name of the dir we unpack into by appending ``#localname`` to the path; otherwise we just use the name of the file
        """
        # enforce correct arguments
        self._fix_deprecated_opts(opts)
        allowed_opts = set(self._allowed_opts())
        unrecognized_opts = set(opts) - allowed_opts
        if unrecognized_opts:
            log.warn('got unexpected keyword arguments: ' +
                     ', '.join(sorted(unrecognized_opts)))
            opts = dict((k, v) for k, v in opts.iteritems()
                        if k in allowed_opts)

        # issue a warning for unknown opts from mrjob.conf and filter them out
        mrjob_conf_opts = load_opts_from_mrjob_conf(
            self.alias, conf_path=conf_path)
        self._fix_deprecated_opts(mrjob_conf_opts)
        unrecognized_opts = set(mrjob_conf_opts) - set(self._allowed_opts())
        if unrecognized_opts:
            log.warn('got unexpected opts from mrjob.conf: ' +
                     ', '.join(sorted(unrecognized_opts)))
            mrjob_conf_opts = dict((k, v)
                                   for k, v in mrjob_conf_opts.iteritems()
                                   if k in allowed_opts)

        # make sure all opts are at least set to None
        blank_opts = dict((key, None) for key in allowed_opts)

        # combine all of these options
        # only __init__() methods should modify self._opts!
        self._opts = self.combine_opts(blank_opts, self._default_opts(),
                                       mrjob_conf_opts, opts)

        # we potentially have a lot of files to copy, so we keep track
        # of them as a list of dictionaries, with the following keys:
        #
        # 'path': the path to the file on the local system
        # 'name': a unique name for the file when we copy it into HDFS etc.
        # if this is blank, we'll pick one
        # 'cache': if 'file', copy into mr_job_script's working directory
        # on the Hadoop nodes. If 'archive', uncompress the file
        self._files = []

        # validate cleanup
        if not self._opts['cleanup'] in CLEANUP_CHOICES:
            raise ValueError(
                'cleanup must be one of %s, not %r' %
                (', '.join(CLEANUP_CHOICES), self._opts['cleanup']))

        # add the script to our list of files (don't actually commit to
        # uploading it)
        if mr_job_script:
            self._script = {'path': mr_job_script}
            self._files.append(self._script)
            self._ran_job = False
        else:
            self._script = None
            self._ran_job = True # don't allow user to call run()

        # setup cmds and wrapper script
        self._setup_scripts = []
        for path in self._opts['setup_scripts']:
            file_dict = self._add_file_for_upload(path)
            self._setup_scripts.append(file_dict)

        # we'll create the wrapper script later
        self._wrapper_script = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                file_dict = self._add_file_for_upload(path)
                self._file_upload_args.append((arg, file_dict))

        # set up uploading
        for path in self._opts['upload_archives']:
            self._add_archive_for_upload(path)
        for path in self._opts['upload_files']:
            self._add_file_for_upload(path)

        # set up python archives
        self._python_archives = []
        for path in self._opts['python_archives']:
            self._add_python_archive(path)

        # where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-'] # by default read from stdin
        self._stdin = stdin or sys.stdin
        self._stdin_path = None # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # give this job a unique name
        self._job_name = self._make_unique_job_name(
            label=self._opts['label'], owner=self._opts['owner'])

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # info about our steps. this is basically a cache for self._get_steps()
        self._steps = None
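Runners of this vintage were rarely constructed by hand; the documented pattern goes through a job's `make_runner()`. A minimal sketch, assuming a hypothetical `MRWordCount` job defined in `mr_word_count.py`:

from mr_word_count import MRWordCount  # hypothetical job module

mr_job = MRWordCount(args=['input.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        print(key, value)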
Code Example #28
    def __init__(self,
                 mr_job_script=None,
                 conf_path=None,
                 extra_args=None,
                 file_upload_args=None,
                 input_paths=None,
                 output_dir=None,
                 stdin=None,
                 **opts):
        """All runners take the following keyword arguments:

        :type mr_job_script: str
        :param mr_job_script: the path of the ``.py`` file containing the :py:class:`~mrjob.job.MRJob`. If this is None, you won't actually be able to :py:meth:`run` the job, but other utilities (e.g. :py:meth:`ls`) will work.
        :type conf_path: str
        :param conf_path: Alternate path to read configs from, or ``False`` to ignore all config files.
        :type extra_args: list of str
        :param extra_args: a list of extra cmd-line arguments to pass to the mr_job script. For example: ``['--protocol', 'repr']``. This is a hook to allow jobs to take additional arguments.
        :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``. The file at the given path will be uploaded to the local directory of the mr_job script when it runs, and then passed into the script with ``--ARGNAME``. Useful for passing in SQLite DBs and other configuration files to your job.
        :type input_paths: list of str
        :param input_paths: Input files for your job. Supports globs and recursively walks directories (e.g. ``['data/common/', 'data/training/*.gz']``). If this is left blank, we'll read from stdin
        :type output_dir: str
        :param output_dir: an empty/non-existent directory where Hadoop streaming should put the final output from the job. If you don't specify an output directory, we'll output into a subdirectory of this job's temporary directory. You can control this from the command line with ``--output-dir``.
        :param stdin: an iterable (can be a ``StringIO`` or even a list) to use as stdin. This is a hook for testing; if you set ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll get passed through to the runner. If for some reason your lines are missing newlines, we'll add them; this makes it easier to write automated tests.

        All runners also take the following options as keyword arguments.
        These can be defaulted in your :mod:`mrjob.conf` file:

        :type base_tmp_dir: str
        :param base_tmp_dir: path to put local temp dirs inside. By default we just call :py:func:`tempfile.gettempdir`
        :type bootstrap_mrjob: bool
        :param bootstrap_mrjob: should we automatically tar up the mrjob library and install it when we run the mrjob? Set this to ``False`` if you've already installed ``mrjob`` on your Hadoop cluster.
        :type cleanup: str
        :param cleanup: is :py:meth:`cleanup` allowed to clean up logs and scratch files? See :py:data:`CLEANUP_CHOICES`.
        :type cmdenv: dict
        :param cmdenv: environment variables to pass to the job inside Hadoop streaming
        :type hadoop_extra_args: list of str
        :param hadoop_extra_args: extra arguments to pass to hadoop streaming
        :type jobconf: dict
        :param jobconf: ``-jobconf`` args to pass to hadoop streaming. This should be a map from property name to value. Equivalent to passing ``['-jobconf', 'KEY1=VALUE1', '-jobconf', 'KEY2=VALUE2', ...]`` to ``hadoop_extra_args``.
        :type label: str
        :param label: description of this job to use as the part of its name. By default, we use the script's module name, or ``no_script`` if there is none. This used to be called *job_name_prefix* (which still works but is deprecated).
        :type owner: str
        :param owner: who is running this job. Used solely to set the job name. By default, we use :py:func:`getpass.getuser`, or ``no_user`` if it fails.
        :type python_archives: list of str
        :param python_archives: same as upload_archives, except they get added to the job's :envvar:`PYTHONPATH`
        :type python_bin: str
        :param python_bin: Name/path of alternate python binary for mappers/reducers (e.g. for use with :py:mod:`virtualenv`). Defaults to ``'python'``.
        :type setup_cmds: list
        :param setup_cmds: a list of commands to run before each mapper/reducer step (e.g. ``['cd my-src-tree; make', 'mkdir -p /tmp/foo']``). You can specify commands as strings, which will be run through the shell, or lists of args, which will be invoked directly. We'll use file locking to ensure that multiple mappers/reducers running on the same node won't run *setup_cmds* simultaneously (it's safe to run ``make``).
        :type setup_scripts: list of str
        :param setup_scripts: files that will be copied into the local working directory and then run. These are run after *setup_cmds*. Like with *setup_cmds*, we use file locking to keep multiple mappers/reducers on the same node from running *setup_scripts* simultaneously.
        :type upload_archives: list of str
        :param upload_archives: a list of archives (e.g. tarballs) to unpack in the local directory of the mr_job script when it runs. You can set the local name of the dir we unpack into by appending ``#localname`` to the path; otherwise we just use the name of the archive file (e.g. ``foo.tar.gz``)
        :type upload_files: list of str
        :param upload_files: a list of files to copy to the local directory of the mr_job script when it runs. You can set the local name of the dir we unpack into by appending ``#localname`` to the path; otherwise we just use the name of the file
        """
        # enforce correct arguments
        self._fix_deprecated_opts(opts)
        allowed_opts = set(self._allowed_opts())
        unrecognized_opts = set(opts) - allowed_opts
        if unrecognized_opts:
            log.warn('got unexpected keyword arguments: ' +
                     ', '.join(sorted(unrecognized_opts)))
            opts = dict(
                (k, v) for k, v in opts.iteritems() if k in allowed_opts)

        # issue a warning for unknown opts from mrjob.conf and filter them out
        mrjob_conf_opts = load_opts_from_mrjob_conf(self.alias,
                                                    conf_path=conf_path)
        self._fix_deprecated_opts(mrjob_conf_opts)
        unrecognized_opts = set(mrjob_conf_opts) - set(self._allowed_opts())
        if unrecognized_opts:
            log.warn('got unexpected opts from mrjob.conf: ' +
                     ', '.join(sorted(unrecognized_opts)))
            mrjob_conf_opts = dict((k, v)
                                   for k, v in mrjob_conf_opts.iteritems()
                                   if k in allowed_opts)

        # make sure all opts are at least set to None
        blank_opts = dict((key, None) for key in allowed_opts)

        # combine all of these options
        # only __init__() methods should modify self._opts!
        self._opts = self.combine_opts(blank_opts, self._default_opts(),
                                       mrjob_conf_opts, opts)

        # we potentially have a lot of files to copy, so we keep track
        # of them as a list of dictionaries, with the following keys:
        #
        # 'path': the path to the file on the local system
        # 'name': a unique name for the file when we copy it into HDFS etc.
        # if this is blank, we'll pick one
        # 'cache': if 'file', copy into mr_job_script's working directory
        # on the Hadoop nodes. If 'archive', uncompress the file
        self._files = []

        # validate cleanup
        if not self._opts['cleanup'] in CLEANUP_CHOICES:
            raise ValueError(
                'cleanup must be one of %s, not %r' %
                (', '.join(CLEANUP_CHOICES), self._opts['cleanup']))

        # add the script to our list of files (don't actually commit to
        # uploading it)
        if mr_job_script:
            self._script = {'path': mr_job_script}
            self._files.append(self._script)
            self._ran_job = False
        else:
            self._script = None
            self._ran_job = True  # don't allow user to call run()

        # setup cmds and wrapper script
        self._setup_scripts = []
        for path in self._opts['setup_scripts']:
            file_dict = self._add_file_for_upload(path)
            self._setup_scripts.append(file_dict)

        # we'll create the wrapper script later
        self._wrapper_script = None

        # extra args to our job
        self._extra_args = list(extra_args) if extra_args else []

        # extra file arguments to our job
        self._file_upload_args = []
        if file_upload_args:
            for arg, path in file_upload_args:
                file_dict = self._add_file_for_upload(path)
                self._file_upload_args.append((arg, file_dict))

        # set up uploading
        for path in self._opts['upload_archives']:
            self._add_archive_for_upload(path)
        for path in self._opts['upload_files']:
            self._add_file_for_upload(path)

        # set up python archives
        self._python_archives = []
        for path in self._opts['python_archives']:
            self._add_python_archive(path)

        # where to read input from (log files, etc.)
        self._input_paths = input_paths or ['-']  # by default read from stdin
        self._stdin = stdin or sys.stdin
        self._stdin_path = None  # temp file containing dump from stdin

        # where a tarball of the mrjob library is stored locally
        self._mrjob_tar_gz_path = None

        # store output_dir
        self._output_dir = output_dir

        # give this job a unique name
        self._job_name = self._make_unique_job_name(label=self._opts['label'],
                                                    owner=self._opts['owner'])

        # a local tmp directory that will be cleaned up when we're done
        # access/make this using self._get_local_tmp_dir()
        self._local_tmp_dir = None

        # info about our steps. this is basically a cache for self._get_steps()
        self._steps = None
Code Example #29
File: test_conf.py Project: kartheek6/mrjob
    def test_recursive_include(self):
        conf_path = os.path.join(self.tmp_dir, "mrjob.conf")
        with open(conf_path, "w") as f:
            dump_mrjob_conf({"include": conf_path}, f)

        self.assertEqual(load_opts_from_mrjob_conf("foo", conf_path), [(conf_path, {})])
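Taken together, the test examples reduce to a short round trip: serialize a config with `dump_mrjob_conf`, then read it back with `load_opts_from_mrjob_conf`. A minimal end-to-end sketch against the list-of-tuples API:

import os
import tempfile

from mrjob.conf import dump_mrjob_conf, load_opts_from_mrjob_conf

conf_path = os.path.join(tempfile.mkdtemp(), 'mrjob.conf')
with open(conf_path, 'w') as f:
    dump_mrjob_conf({'runners': {'local': {'cmdenv': {'TZ': 'UTC'}}}}, f)

for path, opts in load_opts_from_mrjob_conf('local', conf_path):
    print(path, opts)  # expected: conf_path {'cmdenv': {'TZ': 'UTC'}}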