def libjars(self):
    """Optional list of paths of jar files to run our job with using
    Hadoop's ``-libjar`` option. Normally setting :py:attr:`LIBJARS`
    is sufficient.

    By default, this combines :option:`libjars` options from the command
    lines with :py:attr:`LIBJARS`, with command line arguments taking
    precedence.

    Paths from :py:attr:`LIBJARS` are interpreted as relative to the
    directory containing the script (paths from the command-line are
    relative to the current working directory). Note that ``~`` and
    environment variables in paths will always be expanded by the job
    runner (see :mrjob-opt:`libjars`).

    .. versionadded:: 0.5.3
    """
    # LIBJARS entries are resolved relative to the script's own directory
    script_dir = os.path.dirname(self.mr_job_script())

    # libjar paths will eventually be combined with combine_path_lists,
    # which will expand environment variables. We don't want to assume
    # a path like $MY_DIR/some.jar is always relative ($MY_DIR could start
    # with /), but we also don't want to expand environment variables
    # prematurely.
    paths_from_libjars = []
    for path in self.LIBJARS or []:
        if os.path.isabs(expand_path(path)):
            # absolute once expanded; keep the unexpanded original
            paths_from_libjars.append(path)
        else:
            paths_from_libjars.append(os.path.join(script_dir, path))

    # command-line libjars go last, so they take precedence
    return combine_lists(paths_from_libjars, self.options.libjars)
def _job_kwargs(self):
    """Keyword arguments to the runner class that can be specified
    by the job/launcher itself."""
    # use the most basic combiners; leave magic like resolving paths
    # and blanking out jobconf values to the runner
    return {
        # command-line has the final say on jobconf and libjars
        'jobconf': combine_dicts(self.jobconf(), self.options.jobconf),
        # NOTE(review): libjars() appears to already fold in
        # self.options.libjars, which would make command-line jars show
        # up twice here — confirm whether the double combine is intended
        'libjars': combine_lists(self.libjars(), self.options.libjars),
        'partitioner': self.partitioner(),
        'sort_values': self.sort_values(),
        # TODO: should probably put self.options last below for consistency
        'upload_archives': combine_lists(
            self.options.upload_archives, self.archives()),
        'upload_dirs': combine_lists(
            self.options.upload_dirs, self.dirs()),
        'upload_files': combine_lists(
            self.options.upload_files, self.files()),
    }
def emr_job_runner_kwargs(self):
    """Extend the base EMR runner kwargs with AWS credentials (via
    cmdenv) and bootstrap commands that install pip, aws-cli, and boto."""
    args = super(DownloadToS3, self).emr_job_runner_kwargs()

    # forward local AWS credentials to the EMR instances
    # (raises KeyError if either env var is unset locally)
    creds = {
        'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'],
        'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'],
    }
    args['cmdenv'] = combine_dicts(args['cmdenv'], creds)

    # install pip, aws-cli, and boto at bootstrap time
    bootstrap_cmds = [
        'sysctl -w "net.ipv4.tcp_window_scaling=0"',
        'sudo apt-get install python-pip',
        'sudo pip install awscli',
        'sudo pip install boto',
    ]
    args['bootstrap_cmds'] = combine_lists(args['bootstrap_cmds'],
                                           bootstrap_cmds)

    return args
def test_concatenation(self):
    """Lists and tuples are concatenated in order; None is skipped."""
    # use self.assertEqual for consistency with the other tests
    # (this one used the nose-style assert_equal helper)
    self.assertEqual(combine_lists([1, 2], None, (3, 4)), [1, 2, 3, 4])
def test_empty(self):
    """With no arguments at all, combine_lists() yields an empty list."""
    # use self.assertEqual for consistency with the other tests
    # (this one used the nose-style assert_equal helper)
    self.assertEqual(combine_lists(), [])
def test_mix_lists_and_scalars(self):
    """Sequences are flattened and bare scalars appended, in order."""
    result = combine_lists([1, 2], 3, (4, 5), 6)
    self.assertEqual(result, [1, 2, 3, 4, 5, 6])
def test_scalars(self):
    """Scalars are kept as-is — even falsy ones — but None is dropped."""
    result = combine_lists(None, False, b'\x00', 42, 3.14)
    self.assertEqual(result, [False, b'\x00', 42, 3.14])
def test_strings(self):
    """Strings count as scalars (not iterables of chars); None is dropped."""
    result = combine_lists('one', None, 'two', u'three')
    self.assertEqual(result, ['one', 'two', u'three'])
def test_concatenation(self):
    """Lists and tuples are concatenated in order; None is skipped."""
    result = combine_lists([1, 2], None, (3, 4))
    self.assertEqual(result, [1, 2, 3, 4])
def test_strings(self):
    """Strings count as scalars (not iterables of chars); None is dropped."""
    result = combine_lists("one", None, "two", u"three")
    self.assertEqual(result, ["one", "two", u"three"])
def test_empty(self):
    """With no arguments at all, combine_lists() yields an empty list."""
    result = combine_lists()
    self.assertEqual(result, [])
def test_dicts(self):
    """Dicts are treated as scalar values (kept whole, even when empty)."""
    result = combine_lists({1: 2}, None, {})
    self.assertEqual(result, [{1: 2}, {}])