def test_symlink_to_duplicate_conf_path(self): conf_path = os.path.join(self.tmp_dir, "mrjob.conf") with open(conf_path, "w") as f: dump_mrjob_conf({}, f) conf_symlink_path = os.path.join(self.tmp_dir, "mrjob.conf.symlink") os.symlink("mrjob.conf", conf_symlink_path) self.assertEqual(load_opts_from_mrjob_confs("foo", [conf_path, conf_symlink_path]), [(conf_symlink_path, {})]) self.assertEqual(load_opts_from_mrjob_confs("foo", [conf_symlink_path, conf_path]), [(conf_path, {})])
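# Illustrative sketch (not mrjob's actual implementation, and the helper name
# below is hypothetical): the behavior the test above asserts. Paths that
# resolve to the same real file are loaded only once, and the *last*
# occurrence in conf_paths is the one that's kept.
import os

def _dedupe_conf_paths(conf_paths):
    """Drop earlier duplicates of any path, comparing by real path."""
    real_paths = [os.path.realpath(p) for p in conf_paths]
    return [p for i, p in enumerate(conf_paths)
            if real_paths[i] not in real_paths[i + 1:]]

# _dedupe_conf_paths(['mrjob.conf', 'mrjob.conf.symlink'])
# keeps only 'mrjob.conf.symlink' when both resolve to the same file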
def __init__(self, alias, opts, conf_paths):
    """
    :param alias: Runner alias (e.g. ``'local'``)
    :param opts: Keyword args to runner's constructor (usually from the
                 command line).
    :param conf_paths: An iterable of paths to config files
    """
    super(RunnerOptionStore, self).__init__()

    # sanitize incoming options and issue warnings for bad keys
    opts = self.validated_options(opts)

    unsanitized_opt_dicts = load_opts_from_mrjob_confs(
        alias, conf_paths=conf_paths)

    for path, mrjob_conf_opts in unsanitized_opt_dicts:
        self.cascading_dicts.append(self.validated_options(
            mrjob_conf_opts, from_where=(' from %s' % path)))

    self.cascading_dicts.append(opts)

    if (len(self.cascading_dicts) > 2 and
            all(len(d) == 0 for d in self.cascading_dicts[2:-1]) and
            (len(conf_paths or []) > 0)):
        log.warning('No configs specified for %s runner' % alias)

    self.populate_values_from_cascading_dicts()

    log.debug('Active configuration:')
    log.debug(pprint.pformat(self))
def __init__(self, alias, opts, conf_paths):
    """
    :param alias: Runner alias (e.g. ``'local'``)
    :param opts: Options from the command line
    :param conf_paths: Either a file path or an iterable of paths to config
                       files
    """
    super(RunnerOptionStore, self).__init__()

    # sanitize incoming options and issue warnings for bad keys
    opts = self.validated_options(
        opts, 'Got unexpected keyword arguments: %s')

    unsanitized_opt_dicts = load_opts_from_mrjob_confs(
        alias, conf_paths=conf_paths)

    for path, mrjob_conf_opts in unsanitized_opt_dicts:
        self.cascading_dicts.append(self.validated_options(
            mrjob_conf_opts, 'Got unexpected opts from %s: %%s' % path))

    self.cascading_dicts.append(opts)

    if (len(self.cascading_dicts) > 2 and
            all(len(d) == 0 for d in self.cascading_dicts[2:-1])):
        log.warning('No configs specified for %s runner' % alias)

    self.populate_values_from_cascading_dicts()

    self._validate_cleanup()
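# Illustrative sketch (standalone code, not mrjob's own combiners) of the
# basic precedence rule behind the cascading dicts above: later dicts win,
# so command-line opts override config-file opts, which override empty
# defaults. (mrjob's real combiners also merge dict-, list- and path-typed
# opts in type-specific ways.)

def cascade(dicts):
    """Merge dicts left to right; later non-None values win."""
    result = {}
    for d in dicts:
        for key, value in d.items():
            if value is not None:
                result[key] = value
    return result

cascade([
    {'owner': None, 'label': None},        # empty defaults
    {'owner': 'conf-owner'},               # from a config file
    {'owner': 'cli-owner', 'label': 'x'},  # from the command line
])
# -> {'owner': 'cli-owner', 'label': 'x'}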
def test_conf_path_order_beats_include(self):
    conf_path_1 = os.path.join(self.tmp_dir, 'mrjob.1.conf')
    conf_path_2 = os.path.join(self.tmp_dir, 'mrjob.2.conf')

    with open(conf_path_1, 'w') as f:
        dump_mrjob_conf({'include': conf_path_2}, f)
    with open(conf_path_2, 'w') as f:
        dump_mrjob_conf({}, f)

    # shouldn't matter that conf_path_1 includes conf_path_2
    self.assertEqual(
        load_opts_from_mrjob_confs('foo', [conf_path_1, conf_path_2]),
        [(conf_path_1, {}), (conf_path_2, {})])
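# For reference (illustrative, not generated output): after the dumps above,
# the two files contain roughly the following, where <conf_path_2> stands for
# the real temp path. Listing conf_path_2 explicitly in conf_paths is what
# decides its precedence, not the include.
#
#     # mrjob.1.conf
#     include: <conf_path_2>
#
#     # mrjob.2.conf
#     {}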
def __init__(self, mr_job_script=None, conf_paths=None,
             extra_args=None, file_upload_args=None,
             hadoop_input_format=None, hadoop_output_format=None,
             input_paths=None, output_dir=None, partitioner=None,
             sort_values=None, stdin=None, step_output_dir=None,
             **opts):
    """All runners take the following keyword arguments:

    :type mr_job_script: str
    :param mr_job_script: the path of the ``.py`` file containing the
                          :py:class:`~mrjob.job.MRJob`. If this is None,
                          you won't actually be able to :py:meth:`run` the
                          job, but other utilities (e.g. :py:meth:`ls`)
                          will work.
    :type conf_paths: None or list
    :param conf_paths: List of config files to combine and use, or None to
                       search for mrjob.conf in the default locations.
    :type extra_args: list of str
    :param extra_args: a list of extra cmd-line arguments to pass to the
                       mr_job script. This is a hook to allow jobs to take
                       additional arguments.
    :param file_upload_args: a list of tuples of ``('--ARGNAME', path)``.
                             The file at the given path will be uploaded
                             to the local directory of the mr_job script
                             when it runs, and then passed into the script
                             with ``--ARGNAME``. Useful for passing in
                             SQLite DBs and other configuration files to
                             your job.
    :type hadoop_input_format: str
    :param hadoop_input_format: name of an optional Hadoop ``InputFormat``
                                class. Passed to Hadoop along with your
                                first step with the ``-inputformat``
                                option. Note that if you write your own
                                class, you'll need to include it in your
                                own custom streaming jar (see
                                :mrjob-opt:`hadoop_streaming_jar`).
    :type hadoop_output_format: str
    :param hadoop_output_format: name of an optional Hadoop
                                 ``OutputFormat`` class. Passed to Hadoop
                                 along with your first step with the
                                 ``-outputformat`` option. Note that if
                                 you write your own class, you'll need to
                                 include it in your own custom streaming
                                 jar (see
                                 :mrjob-opt:`hadoop_streaming_jar`).
    :type input_paths: list of str
    :param input_paths: Input files for your job. Supports globs and
                        recursively walks directories (e.g.
                        ``['data/common/', 'data/training/*.gz']``). If
                        this is left blank, we'll read from stdin.
    :type output_dir: str
    :param output_dir: An empty/non-existent directory where Hadoop
                       should put the final output from the job.
                       If you don't specify an output directory, we'll
                       output into a subdirectory of this job's temporary
                       directory. You can control this from the command
                       line with ``--output-dir``. This option cannot be
                       set from configuration files. If used with the
                       hadoop runner, this path does not need to be fully
                       qualified with ``hdfs://`` URIs because it's
                       understood that it has to be on HDFS.
    :type partitioner: str
    :param partitioner: Optional name of a Hadoop partitioner class, e.g.
                        ``'org.apache.hadoop.mapred.lib.HashPartitioner'``.
                        Hadoop streaming will use this to determine how
                        mapper output should be sorted and distributed
                        to reducers.
    :type sort_values: bool
    :param sort_values: if true, set partitioners and jobconf variables so
                        that reducers receive the values associated with
                        any key in sorted order (sorted by their *encoded*
                        value). Also known as secondary sort.
    :param stdin: an iterable (can be a ``BytesIO`` or even a list) to use
                  as stdin. This is a hook for testing; if you set
                  ``stdin`` via :py:meth:`~mrjob.job.MRJob.sandbox`, it'll
                  get passed through to the runner. If for some reason
                  your lines are missing newlines, we'll add them; this
                  makes it easier to write automated tests.
    :type step_output_dir: str
    :param step_output_dir: An empty/non-existent directory where Hadoop
                            should put output from all steps other than
                            the last one (this only matters for multi-step
                            jobs). Currently ignored by local runners.
    """
    self._ran_job = False

    # opts are made from:
    #
    # empty defaults (everything set to None)
    # runner-specific defaults
    # opts from config file(s)
    # opts from command line
    self._opts = self._combine_confs(
        [(None, {key: None for key in self.OPT_NAMES})] +
        [(None, self._default_opts())] +
        load_opts_from_mrjob_confs(self.alias, conf_paths) +
        [('the command line', opts)]
    )

    log.debug('Active configuration:')
    log.debug(pprint.pformat({
        opt_key: self._obfuscate_opt(opt_key, opt_value)
        for opt_key, opt_value in self._opts.items()
    }))

    self._fs = None

    # a local tmp directory that will be cleaned up when we're done
    # access/make this using self._get_local_tmp_dir()
    self._local_tmp_dir = None

    self._working_dir_mgr = WorkingDirManager()

    # mapping from dir to path for corresponding archive. we pick
    # paths during init(), but don't actually create the archives
    # until self._create_dir_archives() is called
    self._dir_to_archive_path = {}
    # dir archive names (the filename minus ".tar.gz") already taken
    self._dir_archive_names_taken = set()
    # set of dir_archives that have actually been created
    self._dir_archives_created = set()

    # track (name, path) of files and archives to upload to spark.
    # these are a subset of those in self._working_dir_mgr
    self._spark_files = []
    self._spark_archives = []

    self._upload_mgr = None  # define in subclasses that use this

    self._script_path = mr_job_script
    if self._script_path:
        self._working_dir_mgr.add('file', self._script_path)

    # give this job a unique name
    self._job_key = self._make_unique_job_key(
        label=self._opts['label'], owner=self._opts['owner'])

    # extra args to our job
    self._extra_args = list(extra_args) if extra_args else []
    for extra_arg in self._extra_args:
        if isinstance(extra_arg, dict):
            if extra_arg.get('type') != 'file':
                raise NotImplementedError
            self._working_dir_mgr.add(**extra_arg)
            self._spark_files.append(
                (extra_arg['name'], extra_arg['path']))

    # extra file arguments to our job
    if file_upload_args:
        log.warning('file_upload_args is deprecated and will be removed'
                    ' in v0.6.0. Pass dicts to extra_args instead.')
        for arg, path in file_upload_args:
            arg_file = parse_legacy_hash_path('file', path)
            self._working_dir_mgr.add(**arg_file)
            self._extra_args.extend([arg, arg_file])
            self._spark_files.append((arg_file['name'], arg_file['path']))

    # set up uploading
    for hash_path in self._opts['upload_files']:
        uf = parse_legacy_hash_path('file', hash_path,
                                    must_name='upload_files')
        self._working_dir_mgr.add(**uf)
        self._spark_files.append((uf['name'], uf['path']))

    for hash_path in self._opts['upload_archives']:
        ua = parse_legacy_hash_path('archive', hash_path,
                                    must_name='upload_archives')
        self._working_dir_mgr.add(**ua)
        self._spark_archives.append((ua['name'], ua['path']))

    for hash_path in self._opts['upload_dirs']:
        # pick name based on directory path
        ud = parse_legacy_hash_path('dir', hash_path,
                                    must_name='upload_archives')
        # but feed working_dir_mgr the archive's path
        archive_path = self._dir_archive_path(ud['path'])
        self._working_dir_mgr.add(
            'archive', archive_path, name=ud['name'])
        self._spark_archives.append((ud['name'], archive_path))

    # py_files

    # self._setup is a list of shell commands with path dicts
    # interleaved; see mrjob.setup.parse_setup_cmd() for details
    self._setup = self._parse_setup_and_py_files()
    for cmd in self._setup:
        for token in cmd:
            if isinstance(token, dict):
                # convert dir archives tokens to archives
                if token['type'] == 'dir':
                    # feed the archive's path to self._working_dir_mgr
                    token['path'] = self._dir_archive_path(token['path'])
                    token['type'] = 'archive'

                self._working_dir_mgr.add(**token)

    # Where to read input from (log files, etc.)
    self._input_paths = input_paths or ['-']  # by default read from stdin
    if PY2:
        self._stdin = stdin or sys.stdin
    else:
        self._stdin = stdin or sys.stdin.buffer
    self._stdin_path = None  # temp file containing dump from stdin

    # where a zip file of the mrjob library is stored locally
    self._mrjob_zip_path = None

    # store output_dir
    self._output_dir = output_dir

    # store partitioner
    self._partitioner = partitioner

    # store sort_values
    self._sort_values = sort_values

    # store step_output_dir
    self._step_output_dir = step_output_dir

    # store hadoop input and output formats
    self._hadoop_input_format = hadoop_input_format
    self._hadoop_output_format = hadoop_output_format

    # A cache for self._get_steps(); also useful as a test hook
    self._steps = None

    # this variable marks whether a cleanup has happened and this runner's
    # output stream is no longer available.
    self._closed = False
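# Usage sketch (illustrative; the script and input paths are hypothetical):
# constructing a runner directly with some of the keyword arguments
# documented above. In normal use, MRJob.make_runner() builds the runner
# from the parsed command line for you.
#
#     from mrjob.local import LocalMRJobRunner
#
#     runner = LocalMRJobRunner(
#         mr_job_script='my_job.py',         # hypothetical MRJob script
#         conf_paths=[],                     # [] means: use no config files
#         input_paths=['data/input.txt'],    # hypothetical input file
#         output_dir='/tmp/my_job_output')
#     runner.run()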