def spark_app_config_template(self, master, name=None):
    # Generate the default name at call time, not at definition time:
    # a uuid default argument would be evaluated once and shared by all calls.
    name = name if name else str(uuid.uuid4())
    _config = Configuration.create()
    _config.set(section=name, key=TaskOptions.SPARK_APP_CONFIG_MASTER, value=master)
    _config.set(section=name,
                key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                value=os.path.join(os.path.dirname(__file__), "resources", "spark", "SparkExample.jar"))
    _config.set(section=name,
                key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS,
                value="example.spark.WordCounter")
    return _config
def test_run_preconfigured_job_without_parameters_substitution(self):
    _test_id = str(uuid.uuid4())
    _job_name = "TEST_PIG_{}".format(_test_id)
    _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
    _output_dir = "/tmp/data_{}".format(_test_id)
    _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
    # escape '$' explicitly so the shell does not expand it before Pig sees it
    _commands += "B = foreach A generate \\$0 as id;"
    _commands += "STORE B into '{}';".format(_output_dir)
    # create job configuration; it can also be loaded from an .ini file
    _config = Configuration.create()
    _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING, _commands)
    _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, "enabled")
    _config.set(
        _job_name,
        TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
        "input_dir={}\noutput_dir={}".format(_input_dir, _output_dir),
    )
    try:
        _pig = Pig.load_preconfigured_job(config=_config, job_name=_job_name)
        _result = _pig.run()
        _result.if_failed_raise(AssertionError("test_run_preconfigured_job failed"))
        self.assertTrue(HDFS(_output_dir).exists(), "Cannot find job output")
    finally:
        self.delete_file_in_hdfs(_input_dir)
        self.delete_file_in_hdfs(_output_dir)
def __init__(self, name, config, executable, executor, main_class=None, shell_command="hadoop jar"):
    self.executor = executor
    self.executable = executable
    self._config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
    self.name = name if name else "MR_TASK_{0}".format(uuid.uuid4())
    self.main_class = main_class
    self._shell_command = shell_command
    self._process = None
def test_wrap_with_quotes(self):
    _pc = Pig(config=Configuration.create(), job_name=None, command_executor=None)
    self.assertEqual("", _pc._wrap_with_quotes_(""))
    self.assertEqual(None, _pc._wrap_with_quotes_(None))
    self.assertEqual('"test"', _pc._wrap_with_quotes_("test"))
    self.assertEqual("'test'", _pc._wrap_with_quotes_("'test'"))
    self.assertEqual("'te\"st'", _pc._wrap_with_quotes_('te"st'))
    self.assertEqual('"te\'st"', _pc._wrap_with_quotes_("te'st"))
def test_create_new_config(self):
    _config = Configuration.create()
    _section = 'new_section'
    _key = 'new_key'
    _value = 'new_value'
    _config.set(section=_section, key=_key, value=_value)
    self.assertTrue(_config.has(_section, _key), "Config option was not added")
    self.assertEqual(_value, _config.get(_section, _key))
def __init__(self, agent=None, conf_file=None, config=None, executor=execute_shell_command):
    """
    Creates a wrapper for the Flume command line utility
    :param agent: name of the Flume agent
    :param conf_file: path to the agent configuration file
    :param config: job configurations
    :param executor: custom executor
    """
    self.name = agent if agent else "FLUME_AGENT_{0}".format(uuid.uuid4())
    self._executor = executor
    self._config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
    self.__set_attr__(TaskOptions.CONFIG_KEY_AGENT_NAME, agent)
    self.__set_attr__(TaskOptions.CONFIG_KEY_CONF_FILE, conf_file)
def __init__(self, name=None, config=None, executor=execute_shell_command):
    """
    Creates a wrapper for the Hive command line utility
    :param name: name of the Hive task
    :param config: job configurations
    :param executor: custom executor
    """
    super(Hive, self).__init__()
    self.name = name if name else "HIVE_TASK_{0}".format(uuid.uuid4())
    self.__executor = executor
    self._config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
def __init__(self, config=None, name=None, executor=execute_shell_command):
    """
    :param config: configurations
    :param name: name of the config section containing specific application configurations
    :param executor: the interface used by the client to launch the Spark application
    """
    super(SparkApplication, self).__init__()
    self.executor = executor
    self._configs = config if config else Configuration.create()
    self.name = name if name else "SPARK_JOB_{0}".format(uuid.uuid4())
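# Usage sketch, mirroring the Spark config template above. Only the constructor
# appears in this section, so run() is an assumed launch method; the master URL,
# jar name, and main class are illustrative values.
def example_submit_spark_application():
    _name = "SPARK_WORDCOUNT_EXAMPLE"
    _config = Configuration.create()
    _config.set(section=_name, key=TaskOptions.SPARK_APP_CONFIG_MASTER, value="local[*]")
    _config.set(section=_name, key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR, value="SparkExample.jar")
    _config.set(section=_name, key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS, value="example.spark.WordCounter")
    _spark = SparkApplication(config=_config, name=_name)
    _spark.run()  # assumed method for submitting the configured application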
def test_fsimage_from_config_without_acls(self):
    config = Configuration.create(readonly=False, accepts_nulls=True)
    config.set(section=CONFIG_HDFS_DIRS_KEY, key='/raw/sales', value=None)
    snapshot = FsSnapshot.load_from_config(config=config, fs_section=CONFIG_HDFS_DIRS_KEY)
    files = snapshot.files
    self.assertTrue('/raw/sales' in files, 'File was not added to fs snapshot')
    self.assertTrue(len(files['/raw/sales']) == 0, 'ACLs should be ignored for the current configuration')
def prepare_streaming_job(config=None, name=None, jar="hadoop-streaming.jar", executor=execute_shell_command):
    """
    Creates an instance of StreamingJob
    :param config: job configurations
    :param name: name of the job
    :param jar: executing jar
    :param executor: interface used by the client to run the command
    :return: StreamingJob template
    :rtype: StreamingJob
    """
    MapReduce.LOG.info("MapReduce streaming job")
    config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
    MapReduce.__validate_configs(config, name, "StreamingJob", TaskOptions.KEYS_FOR_MAPREDUCE)
    return StreamingJob(
        config=config,
        name=name if name else "MR_STREAMING_JOB_{0}".format(uuid.uuid4()),
        jar=jar,
        executor=executor,
    )
def load_commands_from_string(commands, command_executor=execute_shell_command):
    """
    Creates an instance of the Pig client configured to parse and run commands from a string.
    :param commands: commands to execute (within quotes)
    :param command_executor: the interface used by the client to run the command
    :type commands: str
    :rtype: Pig
    """
    _config = Configuration.create(readonly=False, accepts_nulls=True)
    _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
    _pig = Pig(config=_config, job_name=_job_name, command_executor=command_executor)
    _pig.execute_commands(commands=commands)
    return _pig
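# Usage sketch for the inline loader above. The run() and if_failed_raise()
# calls mirror the preconfigured-job test earlier in this section; the Pig
# Latin statement and paths are illustrative only.
def example_run_inline_pig_script():
    _pig = Pig.load_commands_from_string(
        "A = load '/tmp/input' using PigStorage(','); STORE A into '/tmp/output';"
    )
    _result = _pig.run()
    _result.if_failed_raise(AssertionError("inline Pig script failed"))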
def load_commands_from_file(path, command_executor=execute_shell_command):
    """
    Creates an instance of the Pig client configured to run commands from the specified script file.
    :param path: path to the script to execute
    :param command_executor: the interface used by the client to run the command
    :type path: str
    :rtype: Pig
    """
    Pig.LOG.info("Loading Pig script from file : {0}".format(path))
    _config = Configuration.create(readonly=False, accepts_nulls=True)
    _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
    _pig = Pig(config=_config, job_name=_job_name, command_executor=command_executor)
    _pig.execute_script(path=path)
    return _pig
def prepare_mapreduce_job(jar, main_class=None, config=None, name=None, executor=execute_shell_command):
    """
    Creates an instance of MapReduceJob
    :param jar: executing jar
    :param main_class: fully qualified name of the job's main class
    :param config: job configurations
    :param name: name of the job
    :param executor: interface used by the client to run the command
    :return: MapReduceJob template
    :rtype: MapReduceJob
    """
    MapReduce.LOG.info("MapReduce job")
    config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
    MapReduce.__validate_configs(config, name, "MapReduceJob", TaskOptions.KEYS_FOR_STREAMING_JOB)
    return MapReduceJob(
        name=name if name else "MR_JOB_{0}".format(uuid.uuid4()),
        config=config,
        jar=jar,
        main_class=main_class,
        executor=executor,
    )
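# Usage sketch for the factory above. The jar name and main class are
# hypothetical, and run() is an assumed entry point for launching the job;
# only the constructors appear in this section.
def example_run_mapreduce_job():
    _job = MapReduce.prepare_mapreduce_job(
        jar="example-wordcount.jar",      # hypothetical application jar
        main_class="example.WordCount",   # hypothetical main class
        name="MR_WORDCOUNT_EXAMPLE",
    )
    _job.run()  # assumed method for launching the configured job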
def test_fsimage_from_config(self):
    config = Configuration.create(readonly=False, accepts_nulls=True)
    config.set(section=CONFIG_ACLS_KEY, key='confidential', value='user:su:rwx')
    config.set(section=CONFIG_ACLS_KEY, key='sales', value='group:sales:r-x')
    config.set(section=CONFIG_HDFS_DIRS_KEY, key='/raw/sales', value='confidential,sales')
    snapshot = FsSnapshot.load_from_config(config=config,
                                           fs_section=CONFIG_HDFS_DIRS_KEY,
                                           acl_section=CONFIG_ACLS_KEY)
    files = snapshot.files
    self.assertTrue('/raw/sales' in files, 'File was not added to fs snapshot')
    self.assertTrue('user:su:rwx' in files['/raw/sales'], "'confidential' access lvl was not mapped to file")
    self.assertTrue('group:sales:r-x' in files['/raw/sales'], "'sales' access lvl was not mapped to file")
    self.assertFalse('default:fake:r-x' in files['/raw/sales'], 'Error in access lvl mapping')
def test_try_execute_empty_command(self):
    self.assertRaises(PigCommandError,
                      Pig(config=Configuration.create(), job_name=None, command_executor=None).run)