Example #1
 def spark_app_config_template(self, master, name=None):
     # a uuid4() default argument would be evaluated once at definition time
     # and shared across calls, so generate a fresh name per call instead
     name = name or str(uuid.uuid4())
     _config = Configuration.create()
     _config.set(section=name, key=TaskOptions.SPARK_APP_CONFIG_MASTER, value=master)
     _config.set(section=name, key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                 value=os.path.join(os.path.dirname(__file__), "resources", "spark", "SparkExample.jar"))
     _config.set(section=name, key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS, value="example.spark.WordCounter")
     return _config
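
For context, pairing this template with the SparkApplication wrapper from Example #11 might look like the sketch below; the master URL "local[2]" is an illustrative value, not taken from the source.

    # hedged usage sketch; "local[2]" is an illustrative master URL
    _name = "SPARK_JOB_{0}".format(uuid.uuid4())
    _config = self.spark_app_config_template(master="local[2]", name=_name)
    _app = SparkApplication(config=_config, name=_name)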
Example #2
    def test_run_preconfigured_job_without_parameters_substitution(self):
        _test_id = str(uuid.uuid4())
        _job_name = "TEST_PIG_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
        _output_dir = "/tmp/data_{}".format(_test_id)

        _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
        _commands += "B = foreach A generate \$0 as id;"
        _commands += "STORE B into '{}';".format(_output_dir)
        # create job configuration. can also be loaded from .ini file
        _config = Configuration.create()
        _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING, _commands)
        _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, "enabled")
        _config.set(
            _job_name,
            TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
            "input_dir={}\noutput_dir={}".format(_input_dir, _output_dir),
        )
        try:
            _pig = Pig.load_preconfigured_job(config=_config, job_name=_job_name)
            _result = _pig.run()
            _result.if_failed_raise(AssertionError("test_run_preconfigured_job failed"))
            self.assertTrue(HDFS(_output_dir).exists(), "Cannot find job output")
        finally:
            self.delete_file_in_hdfs(_input_dir)
            self.delete_file_in_hdfs(_output_dir)
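
Assembled, the three _commands fragments form the following Pig Latin script (runtime paths shown as placeholders). Assuming the wrapper passes the command string through a double-quoted shell context, the escaped \$0 reaches Pig as a plain $0, i.e. the first field of each loaded row:

    -- the script built by the _commands concatenation above
    A = load '<input_dir>' using PigStorage(',');
    B = foreach A generate $0 as id;
    STORE B into '<output_dir>';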
Example #3
    def test_run_preconfigured_job_without_parameters_substitution(self):
        _test_id = str(uuid.uuid4())
        _job_name = "TEST_PIG_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(
            self.temp_file("hello,world,world", ".txt"))
        _output_dir = "/tmp/data_{}".format(_test_id)

        _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
        _commands += "B = foreach A generate \$0 as id;"
        _commands += "STORE B into '{}';".format(_output_dir)
        # create job configuration. can also be loaded from .ini file
        _config = Configuration.create()
        _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING,
                    _commands)
        _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, 'enabled')
        _config.set(
            _job_name, TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
            'input_dir={}\noutput_dir={}'.format(_input_dir, _output_dir))
        try:
            _pig = Pig.load_preconfigured_job(config=_config,
                                              job_name=_job_name)
            _result = _pig.run()
            _result.if_failed_raise(
                AssertionError("test_run_preconfigured_job failed"))
            self.assertTrue(
                HDFS(_output_dir).exists(), "Cannot find job output")
        finally:
            self.delete_file_in_hdfs(_input_dir)
            self.delete_file_in_hdfs(_output_dir)
Example #4
 def __init__(self, name, config, executable, executor, main_class=None, shell_command="hadoop jar"):
     self.executor = executor
     self.executable = executable
     self._config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
     self.name = name if name else "MR_TASK_{0}".format(uuid.uuid4())
     self.main_class = main_class
     self._shell_command = shell_command
     self._process = None
Example #5
 def test_wrap_with_quotes(self):
     _pc = Pig(config=Configuration.create(), job_name=None, command_executor=None)
     self.assertEqual("", _pc._wrap_with_quotes_(""))
     self.assertEqual(None, _pc._wrap_with_quotes_(None))
     self.assertEqual('"test"', _pc._wrap_with_quotes_("test"))
     self.assertEqual("'test'", _pc._wrap_with_quotes_("'test'"))
     self.assertEqual("'te\"st'", _pc._wrap_with_quotes_('te"st'))
     self.assertEqual('"te\'st"', _pc._wrap_with_quotes_("te'st"))
Example #6
 def test_create_new_config(self):
     _config = Configuration.create()
     _section = 'new_section'
     _key = 'new_key'
     _value = 'new_value'
     _config.set(section=_section, key=_key, value=_value)
     self.assertTrue(_config.has(_section, _key), "Config option was not added")
     self.assertEqual(_value, _config.get(_section, _key))
Example #7
 def test_create_new_config(self):
     _config = Configuration.create()
     _section = 'new_section'
     _key = 'new_key'
     _value = 'new_value'
     _config.set(section=_section, key=_key, value=_value)
     self.assertTrue(_config.has(_section, _key),
                     "Config option was not added")
     self.assertEqual(_value, _config.get(_section, _key))
Example #8
 def test_wrap_with_quotes(self):
     _pc = Pig(config=Configuration.create(),
               job_name=None,
               command_executor=None)
     self.assertEqual("", _pc._wrap_with_quotes_(""))
     self.assertEqual(None, _pc._wrap_with_quotes_(None))
     self.assertEqual('"test"', _pc._wrap_with_quotes_("test"))
     self.assertEqual("'test'", _pc._wrap_with_quotes_("'test'"))
     self.assertEqual("'te\"st'", _pc._wrap_with_quotes_('te"st'))
     self.assertEqual('"te\'st"', _pc._wrap_with_quotes_("te'st"))
Example #9
    def __init__(self, agent=None, conf_file=None, config=None, executor=execute_shell_command):
        """
        Creates wrapper for Flume command line utility
        :param executor: custom executor
        :type executor:
        """

        self.name = agent if agent else "FLUME_AGENT_{0}".format(uuid.uuid4())
        self._executor = executor
        self._config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
        self.__set_attr__(TaskOptions.CONFIG_KEY_AGENT_NAME, agent)
        self.__set_attr__(TaskOptions.CONFIG_KEY_CONF_FILE, conf_file)
Example #10
    def __init__(self, name=None, config=None, executor=execute_shell_command):
        """
        Creates wrapper for Hive command line utility
        :param executor: custom executor
        :type executor:
        """

        super(Hive, self).__init__()
        self.name = name if name else "HIVE_TASK_{0}".format(uuid.uuid4())
        self.__executor = executor
        self._config = config if config else Configuration.create(
            readonly=False, accepts_nulls=True)
Example #11
    def __init__(self, config=None, name=None, executor=execute_shell_command):
        """

        :param config: configurations
        :param name: name of the config section containing specific application configurations
        :param executor: he interface used by the client to launch Spark Application.
        """
        super(SparkApplication, self).__init__()
        self.executor = executor
        self._configs = config if config else Configuration.create()
        self.name = name if name \
            else "SPARK_JOB_{0}".format(uuid.uuid4())
Example #12
    def test_fsimage_from_config_without_acls(self):
        config = Configuration.create(readonly=False, accepts_nulls=True)
        config.set(section=CONFIG_HDFS_DIRS_KEY, key='/raw/sales', value=None)
        snapshot = FsSnapshot.load_from_config(config=config,
                                               fs_section=CONFIG_HDFS_DIRS_KEY)
        files = snapshot.files

        self.assertTrue('/raw/sales' in files,
                        'File was not added to fs snapshot')
        self.assertTrue(
            len(files['/raw/sales']) == 0,
            'ACL should be ignored for current configuration')
Example #13
    def test_fsimage_from_config_without_acls(self):
        config = Configuration.create(readonly=False, accepts_nulls=True)
        config.set(section=CONFIG_HDFS_DIRS_KEY,
                   key='/raw/sales',
                   value=None)
        snapshot = FsSnapshot.load_from_config(config=config,
                                               fs_section=CONFIG_HDFS_DIRS_KEY)
        files = snapshot.files

        self.assertTrue('/raw/sales' in files,
                        'File was not added to fs snapshot')
        self.assertTrue(len(files['/raw/sales']) == 0,
                        'ACL should be ignored for current configuration')
Example #14
 def spark_app_config_template(self, master, name=None):
     # avoid a shared uuid4() default argument; create a fresh name per call
     name = name or str(uuid.uuid4())
     _config = Configuration.create()
     _config.set(section=name,
                 key=TaskOptions.SPARK_APP_CONFIG_MASTER,
                 value=master)
     _config.set(section=name,
                 key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                 value=os.path.join(os.path.dirname(__file__), "resources",
                                    "spark", "SparkExample.jar"))
     _config.set(section=name,
                 key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS,
                 value="example.spark.WordCounter")
     return _config
Example #15
    def __init__(self, name=None, config=None, executor=execute_shell_command):
        """
        Creates wrapper for Hive command line utility
        :param executor: custom executor
        :type executor:
        """

        super(Hive, self).__init__()
        self.name = name if name else "HIVE_TASK_{0}".format(uuid.uuid4())
        self.__executor = executor
        self._config = config if config else Configuration.create(
            readonly=False,
            accepts_nulls=True
        )
Example #16
 def __init__(self,
              name,
              config,
              executable,
              executor,
              main_class=None,
              shell_command="hadoop jar"):
     self.executor = executor
     self.executable = executable
     self._config = config if config else Configuration.create(
         readonly=False, accepts_nulls=True)
     self.name = name if name else "MR_TASK_{0}".format(uuid.uuid4())
     self.main_class = main_class
     self._shell_command = shell_command
     self._process = None
Example #17
    def __init__(self, agent=None, conf_file=None, config=None,
                 executor=execute_shell_command):
        """
        Creates wrapper for Flume command line utility
        :param executor: custom executor
        :type executor:
        """

        self.name = agent if agent else "FLUME_AGENT_{0}".format(uuid.uuid4())
        self._executor = executor
        self._config = config if config else Configuration.create(
            readonly=False,
            accepts_nulls=True
        )
        self.__set_attr__(TaskOptions.CONFIG_KEY_AGENT_NAME, agent)
        self.__set_attr__(TaskOptions.CONFIG_KEY_CONF_FILE, conf_file)
Example #18
 def prepare_streaming_job(config=None, name=None, jar="hadoop-streaming.jar", executor=execute_shell_command):
     """
     Creates instance of StreamingJob
     :param name: name of job
     :param jar: executing jar
     :param executor: interface used by the client to run command.
     :return: StreamingJob template
     :rtype : StreamingJob
     """
     MapReduce.LOG.info("MapReduce streaming job")
     config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
     MapReduce.__validate_configs(config, name, "StreamingJob", TaskOptions.KEYS_FOR_MAPREDUCE)
     return StreamingJob(
         config=config,
         name=name if name else "MR_STREAMING_JOB_{0}".format(uuid.uuid4()),
         jar=jar,
         executor=executor,
     )
Example #19
    def load_commands_from_string(commands,
                                  command_executor=execute_shell_command):
        """
         Creates an instance of Pig client.
         Configures Pig client to parse and run commands from string.
         :param commands: Commands to execute (within quotes)
         :param command_executor:  The interface used by the client to run command.

         :type commands: str
         :rtype: Pig
         """
        _config = Configuration.create(readonly=False, accepts_nulls=True)
        _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
        _pig = Pig(config=_config,
                   job_name=_job_name,
                   command_executor=command_executor)
        _pig.execute_commands(commands=commands)
        return _pig
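
Combined with the run/result API used in Examples #2 and #3, a typical call chain would look like this (the Pig Latin command string is illustrative):

    # hedged usage sketch; the command string is an illustrative value
    _pig = Pig.load_commands_from_string("A = load '/tmp/in' using PigStorage(',');")
    _result = _pig.run()
    _result.if_failed_raise(AssertionError("pig job failed"))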
Example #20
    def load_commands_from_file(path,
                                command_executor=execute_shell_command):
        """
        Creates an instance of Pig client.
        Configures Pig client to run commands from specified script file.
        :param path: path to the script to execute
        :param command_executor:  The interface used by the client to run command.

        :type path: str
        :rtype: Pig
        """
        Pig.LOG.info("Loading Pig script from file : {0}".format(path))
        _config = Configuration.create(readonly=False, accepts_nulls=True)
        _job_name = "PIG_TASK_{0}".format(uuid.uuid4())
        _pig = Pig(config=_config,
                   job_name=_job_name,
                   command_executor=command_executor)
        _pig.execute_script(path=path)
        return _pig
Example #21
 def prepare_mapreduce_job(jar, main_class=None, config=None, name=None, executor=execute_shell_command):
     """
     Creates instance of MapReduceJob
     :param name: name of job
     :param jar: executing jar
     :param executor: interface used by the client to run command.
     :return: MapReduceJob template
     :rtype : MapReduceJob
     """
     MapReduce.LOG.info("MapReduce job")
     config = config if config else Configuration.create(readonly=False, accepts_nulls=True)
     MapReduce.__validate_configs(config, name, "MapReduceJob", TaskOptions.KEYS_FOR_STREAMING_JOB)
     return MapReduceJob(
         name=name if name else "MR_JOB_{0}".format(uuid.uuid4()),
         config=config,
         jar=jar,
         main_class=main_class,
         executor=executor,
     )
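
A call might look like the sketch below; the jar path, main class, and job name are illustrative, and the final run() assumes the returned MapReduceJob follows the same run/result pattern as the Pig client, which these examples do not show directly.

    # hedged sketch: jar path, main class, and job name are illustrative
    _job = MapReduce.prepare_mapreduce_job(jar="/tmp/wordcount.jar",
                                           main_class="example.WordCount",
                                           name="MR_JOB_wordcount")
    _job.run()  # assumption: MapReduceJob exposes run() like the Pig client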
Example #22
 def prepare_streaming_job(config=None,
                           name=None,
                           jar="hadoop-streaming.jar",
                           executor=execute_shell_command):
     """
     Creates instance of StreamingJob
     :param name: name of job
     :param jar: executing jar
     :param executor: interface used by the client to run command.
     :return: StreamingJob template
     :rtype : StreamingJob
     """
     MapReduce.LOG.info("MapReduce streaming job")
     config = config if config else Configuration.create(readonly=False,
                                                         accepts_nulls=True)
     MapReduce.__validate_configs(config, name, "StreamingJob",
                                  TaskOptions.KEYS_FOR_MAPREDUCE)
     return StreamingJob(
         config=config,
         name=name if name else "MR_STREAMING_JOB_{0}".format(uuid.uuid4()),
         jar=jar,
         executor=executor)
Example #23
 def prepare_mapreduce_job(jar,
                           main_class=None,
                           config=None,
                           name=None,
                           executor=execute_shell_command):
     """
     Creates instance of MapReduceJob
     :param name: name of job
     :param jar: executing jar
     :param executor: interface used by the client to run command.
     :return: MapReduceJob template
     :rtype : MapReduceJob
     """
     MapReduce.LOG.info("MapReduce job")
     config = config if config else Configuration.create(readonly=False,
                                                         accepts_nulls=True)
     MapReduce.__validate_configs(config, name, "MapReduceJob",
                                  TaskOptions.KEYS_FOR_STREAMING_JOB)
     return MapReduceJob(
         name=name if name else "MR_JOB_{0}".format(uuid.uuid4()),
         config=config,
         jar=jar,
         main_class=main_class,
         executor=executor)
Example #24
    def test_fsimage_from_config(self):
        config = Configuration.create(readonly=False, accepts_nulls=True)
        config.set(section=CONFIG_ACLS_KEY,
                   key='confidential',
                   value='user:su:rwx')
        config.set(section=CONFIG_ACLS_KEY,
                   key='sales',
                   value='group:sales:r-x')
        config.set(section=CONFIG_HDFS_DIRS_KEY,
                   key='/raw/sales',
                   value='confidential,sales')
        snapshot = FsSnapshot.load_from_config(config=config,
                                               fs_section=CONFIG_HDFS_DIRS_KEY,
                                               acl_section=CONFIG_ACLS_KEY)
        files = snapshot.files

        self.assertTrue('/raw/sales' in files,
                        'File was not added to fs snapshot')
        self.assertTrue('user:su:rwx' in files['/raw/sales'],
                        '\'confidential\' access lvl was not mapped to file')
        self.assertTrue('group:sales:r-x' in files['/raw/sales'],
                        '\'sales\' access lvl was not mapped to file')
        self.assertFalse('default:fake:r-x' in files['/raw/sales'],
                         'Error in access lvl mapping')
Example #25
 def test_try_execute_empty_command(self):
     self.assertRaises(PigCommandError, Pig(
         config=Configuration.create(),
         job_name=None,
         command_executor=None).run)
Example #26
 def test_try_execute_empty_command(self):
     self.assertRaises(
         PigCommandError,
         Pig(config=Configuration.create(),
             job_name=None,
             command_executor=None).run)