    def test_spark_process_runcmd(self, mock_popen):
        # Given
        mock_popen.return_value.stdout = six.StringIO(
            'Spark-sql communicates using stdout')
        mock_popen.return_value.stderr = six.StringIO('stderr')
        mock_popen.return_value.wait.return_value = 0

        # When
        hook = SparkSqlHook(conn_id='spark_default', sql='SELECT 1')
        with patch.object(hook.log, 'debug') as mock_debug:
            with patch.object(hook.log, 'info') as mock_info:
                hook.run_query()
                mock_debug.assert_called_with('Spark-Sql cmd: %s', [
                    'spark-sql', '-e', 'SELECT 1', '--master', 'yarn',
                    '--name', 'default-name', '--verbose', '--queue', 'default'
                ])
                mock_info.assert_called_with(
                    'Spark-sql communicates using stdout')

        # Then
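        # (stdout=-1 is subprocess.PIPE and stderr=-2 is subprocess.STDOUT)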
        self.assertEqual(
            mock_popen.mock_calls[0],
            call([
                'spark-sql', '-e', 'SELECT 1', '--master', 'yarn', '--name',
                'default-name', '--verbose', '--queue', 'default'
            ],
                 stderr=-2,
                 stdout=-1))
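
    # A minimal sketch of the Popen patching these tests rely on: `mock_popen` is injected
    # by a @patch decorator on the test method. The patch target below assumes SparkSqlHook
    # imports subprocess in 'airflow.contrib.hooks.spark_sql_hook'; that module path is an
    # assumption, adjust it to wherever the hook actually lives.
    @patch('airflow.contrib.hooks.spark_sql_hook.subprocess.Popen')
    def test_spark_process_runcmd_sketch(self, mock_popen):
        mock_popen.return_value.stdout = six.StringIO('')
        mock_popen.return_value.stderr = six.StringIO('')
        mock_popen.return_value.wait.return_value = 0
        SparkSqlHook(conn_id='spark_default', sql='SELECT 1').run_query()
        self.assertEqual(mock_popen.call_count, 1)
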
    def test_build_command(self):
        hook = SparkSqlHook(**self._config)

        # _prepare_command() returns a list because subprocess needs one; join it into a
        # single string here so the substring assertions below are simpler to write
        cmd = ' '.join(hook._prepare_command(""))

        # Check all the parameters
        assert "--executor-cores {}".format(
            self._config['executor_cores']) in cmd
        assert "--executor-memory {}".format(
            self._config['executor_memory']) in cmd
        assert "--keytab {}".format(self._config['keytab']) in cmd
        assert "--name {}".format(self._config['name']) in cmd
        assert "--num-executors {}".format(
            self._config['num_executors']) in cmd
        sql_path = get_after('-f', hook._prepare_command(""))
        assert self._config['sql'].strip() == sql_path

        # Check that all conf settings are present
        for kv in self._config['conf'].split(","):
            k, v = kv.split('=')
            assert "--conf {0}={1}".format(k, v) in cmd

        if self._config['verbose']:
            assert "--verbose" in cmd
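
    # `get_after` used above is a small test helper that is not shown in these snippets;
    # a plausible sketch (an assumption, not the verbatim helper) that returns the element
    # immediately following a sentinel value in an iterable:
    #
    #     from itertools import dropwhile
    #
    #     def get_after(sentinel, iterable):
    #         """Return the value that follows `sentinel` in `iterable`."""
    #         truncated = dropwhile(lambda el: el != sentinel, iterable)
    #         next(truncated)          # drop the sentinel itself
    #         return next(truncated)   # the element right after it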
    def execute(self, context):
        """
        Call the SparkSqlHook to run the provided sql query
        """
        self._hook = SparkSqlHook(sql=self._sql,
                                  conf=self._conf,
                                  conn_id=self._conn_id,
                                  executor_cores=self._executor_cores,
                                  executor_memory=self._executor_memory,
                                  keytab=self._keytab,
                                  name=self._name,
                                  num_executors=self._num_executors,
                                  master=self._master,
                                  yarn_queue=self._yarn_queue)
        self._hook.run_query()
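
    # The operator's execute() above simply delegates to the hook. A minimal sketch of
    # driving the hook directly; the connection id and query are illustrative only and
    # assume a 'spark_default' connection exists:
    #
    #     hook = SparkSqlHook(conn_id='spark_default', sql='SELECT 1')
    #     hook.run_query()                              # base spark-sql command
    #     hook.run_query(['--deploy-mode', 'cluster'])  # append extra CLI flags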
    def test_spark_process_runcmd_with_list(self, mock_popen):
        # Given
        mock_popen.return_value.stdout = six.StringIO('Spark-sql communicates using stdout')
        mock_popen.return_value.stderr = six.StringIO('stderr')
        mock_popen.return_value.wait.return_value = 0

        # When
        hook = SparkSqlHook(
            conn_id='spark_default',
            sql='SELECT 1'
        )
        hook.run_query(['--deploy-mode', 'cluster'])

        # Then
        self.assertEqual(
            mock_popen.mock_calls[0],
            call(['spark-sql', '-e', 'SELECT 1', '--master', 'yarn', '--name', 'default-name', '--verbose',
                  '--queue', 'default', '--deploy-mode', 'cluster'], stderr=-2, stdout=-1)
        )
class SparkSqlOperator(BaseOperator):
    """
    Execute Spark SQL query

    :param sql: The SQL query to execute. (templated)
    :type sql: str
    :param conf: arbitrary Spark configuration property
    :type conf: str (format: PROP=VALUE)
    :param conn_id: connection_id string
    :type conn_id: str
    :param total_executor_cores: (Standalone & Mesos only) Total cores for all
        executors (Default: all the available cores on the worker)
    :type total_executor_cores: int
    :param executor_cores: (Standalone & YARN only) Number of cores per
        executor (Default: 2)
    :type executor_cores: int
    :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G)
    :type executor_memory: str
    :param keytab: Full path to the file that contains the keytab
    :type keytab: str
    :param principal: The name of the kerberos principal used for keytab
    :type principal: str
    :param master: spark://host:port, mesos://host:port, yarn, or local
    :type master: str
    :param name: Name of the job
    :type name: str
    :param num_executors: Number of executors to launch
    :type num_executors: int
    :param verbose: Whether to pass the verbose flag to spark-sql
    :type verbose: bool
    :param yarn_queue: The YARN queue to submit to (Default: "default")
    :type yarn_queue: str
    """

    template_fields = ["_sql"]
    template_ext = [".sql", ".hql"]

    @apply_defaults
    def __init__(self,
                 sql,
                 conf=None,
                 conn_id='spark_sql_default',
                 total_executor_cores=None,
                 executor_cores=None,
                 executor_memory=None,
                 keytab=None,
                 principal=None,
                 master='yarn',
                 name='default-name',
                 num_executors=None,
                 verbose=True,
                 yarn_queue='default',
                 *args,
                 **kwargs):
        super(SparkSqlOperator, self).__init__(*args, **kwargs)
        self._sql = sql
        self._conf = conf
        self._conn_id = conn_id
        self._total_executor_cores = total_executor_cores
        self._executor_cores = executor_cores
        self._executor_memory = executor_memory
        self._keytab = keytab
        self._principal = principal
        self._master = master
        self._name = name
        self._num_executors = num_executors
        self._verbose = verbose
        self._yarn_queue = yarn_queue
        self._hook = None

    def execute(self, context):
        """
        Call the SparkSqlHook to run the provided sql query
        """
        self._hook = SparkSqlHook(sql=self._sql,
                                  conf=self._conf,
                                  conn_id=self._conn_id,
                                  total_executor_cores=self._total_executor_cores,
                                  executor_cores=self._executor_cores,
                                  executor_memory=self._executor_memory,
                                  keytab=self._keytab,
                                  principal=self._principal,
                                  name=self._name,
                                  num_executors=self._num_executors,
                                  master=self._master,
                                  verbose=self._verbose,
                                  yarn_queue=self._yarn_queue
                                  )
        self._hook.run_query()

    def on_kill(self):
        self._hook.kill()
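
# A minimal sketch of wiring the operator into a DAG. The dag_id, task_id, schedule and
# connection id below are illustrative assumptions; the constructor arguments mirror the
# parameters documented in the class docstring above.
from datetime import datetime

from airflow import DAG

with DAG(dag_id='example_spark_sql',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    count_rows = SparkSqlOperator(
        task_id='count_rows',
        sql='SELECT COUNT(1) FROM some_table',
        master='yarn',
        conn_id='spark_sql_default',
        executor_cores=2,
        executor_memory='2g',
    )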