def test_spark_process_runcmd(self, mock_popen):
    # Given
    mock_popen.return_value.stdout = six.StringIO(
        'Spark-sql communicates using stdout')
    mock_popen.return_value.stderr = six.StringIO('stderr')
    mock_popen.return_value.wait.return_value = 0

    # When
    hook = SparkSqlHook(conn_id='spark_default', sql='SELECT 1')
    with patch.object(hook.log, 'debug') as mock_debug:
        with patch.object(hook.log, 'info') as mock_info:
            hook.run_query()
            mock_debug.assert_called_with(
                'Spark-Sql cmd: %s',
                ['spark-sql', '-e', 'SELECT 1',
                 '--master', 'yarn',
                 '--name', 'default-name',
                 '--verbose', '--queue', 'default']
            )
            mock_info.assert_called_with(
                'Spark-sql communicates using stdout')

    # Then
    self.assertEqual(
        mock_popen.mock_calls[0],
        call(['spark-sql', '-e', 'SELECT 1',
              '--master', 'yarn',
              '--name', 'default-name',
              '--verbose', '--queue', 'default'],
             stderr=-2, stdout=-1)
    )
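# The mock_popen parameter in test methods such as the one above (and in
# test_spark_process_runcmd_with_list further down) implies each test is
# decorated with a patch on subprocess.Popen inside the hook's module. A
# minimal sketch of the assumed harness follows; the patch target and import
# paths are assumptions and depend on the Airflow version in use.
import unittest

import six
from mock import call, patch

from airflow.contrib.hooks.spark_sql_hook import SparkSqlHook  # assumed path


class TestSparkSqlHook(unittest.TestCase):
    @patch('airflow.contrib.hooks.spark_sql_hook.subprocess.Popen')  # assumed target
    def test_spark_process_runcmd(self, mock_popen):
        pass  # body as in the test above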
def test_build_command(self):
    hook = SparkSqlHook(**self._config)

    # The subprocess requires an array but we build the cmd by joining on a space
    cmd = ' '.join(hook._prepare_command(""))

    # Check all the parameters
    assert "--executor-cores {}".format(self._config['executor_cores']) in cmd
    assert "--executor-memory {}".format(self._config['executor_memory']) in cmd
    assert "--keytab {}".format(self._config['keytab']) in cmd
    assert "--name {}".format(self._config['name']) in cmd
    assert "--num-executors {}".format(self._config['num_executors']) in cmd
    sql_path = get_after('-f', hook._prepare_command(""))
    assert self._config['sql'].strip() == sql_path

    # Check if all config settings are there
    for kv in self._config['conf'].split(","):
        k, v = kv.split('=')
        assert "--conf {0}={1}".format(k, v) in cmd

    if self._config['verbose']:
        assert "--verbose" in cmd
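# test_build_command relies on a get_after helper that returns the value
# following a given flag in the prepared command list. Its definition is not
# shown here; a plausible sketch, assuming the helper simply scans the list
# for the sentinel and returns the next element:
from itertools import dropwhile


def get_after(sentinel, iterable):
    """Return the element that follows `sentinel` in `iterable`."""
    truncated = dropwhile(lambda el: el != sentinel, iterable)
    next(truncated)          # skip the sentinel itself
    return next(truncated)   # the value right after it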
def execute(self, context):
    """
    Call the SparkSqlHook to run the provided sql query
    """
    self._hook = SparkSqlHook(sql=self._sql,
                              conf=self._conf,
                              conn_id=self._conn_id,
                              executor_cores=self._executor_cores,
                              executor_memory=self._executor_memory,
                              keytab=self._keytab,
                              name=self._name,
                              num_executors=self._num_executors,
                              master=self._master,
                              yarn_queue=self._yarn_queue)
    self._hook.run_query()
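# For context, a rough sketch of the pattern a run_query() like the one called
# above typically follows: build the spark-sql command, launch it with Popen,
# stream its output into the task log, and fail on a non-zero exit code. This
# is an illustration under those assumptions (the helper name is hypothetical),
# not the actual SparkSqlHook implementation.
import subprocess

from airflow.exceptions import AirflowException


def _run_spark_sql(spark_sql_cmd, log):
    """Launch the command, stream its output to the log, raise on failure."""
    sp = subprocess.Popen(spark_sql_cmd,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT)
    for line in iter(sp.stdout.readline, b''):
        log.info(line)
    returncode = sp.wait()
    if returncode:
        raise AirflowException(
            "spark-sql command failed with return code {}".format(returncode))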
def test_spark_process_runcmd_with_list(self, mock_popen):
    # Given
    mock_popen.return_value.stdout = six.StringIO(
        'Spark-sql communicates using stdout')
    mock_popen.return_value.stderr = six.StringIO('stderr')
    mock_popen.return_value.wait.return_value = 0

    # When
    hook = SparkSqlHook(conn_id='spark_default', sql='SELECT 1')
    hook.run_query(['--deploy-mode', 'cluster'])

    # Then
    self.assertEqual(
        mock_popen.mock_calls[0],
        call(['spark-sql', '-e', 'SELECT 1',
              '--master', 'yarn',
              '--name', 'default-name',
              '--verbose', '--queue', 'default',
              '--deploy-mode', 'cluster'],
             stderr=-2, stdout=-1)
    )
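# The stderr=-2 / stdout=-1 values in the Popen assertions above are simply the
# integer values of the standard subprocess constants, so the expected call
# could be spelled more readably as shown here.
import subprocess

assert subprocess.PIPE == -1
assert subprocess.STDOUT == -2
# e.g. call([...], stderr=subprocess.STDOUT, stdout=subprocess.PIPE)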
class SparkSqlOperator(BaseOperator):
    """
    Execute Spark SQL query

    :param sql: The SQL query to execute. (templated)
    :type sql: str
    :param conf: arbitrary Spark configuration property
    :type conf: str (format: PROP=VALUE)
    :param conn_id: connection_id string
    :type conn_id: str
    :param total_executor_cores: (Standalone & Mesos only) Total cores for all
        executors (Default: all the available cores on the worker)
    :type total_executor_cores: int
    :param executor_cores: (Standalone & YARN only) Number of cores per
        executor (Default: 2)
    :type executor_cores: int
    :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G)
    :type executor_memory: str
    :param keytab: Full path to the file that contains the keytab
    :type keytab: str
    :param master: spark://host:port, mesos://host:port, yarn, or local
    :type master: str
    :param name: Name of the job
    :type name: str
    :param num_executors: Number of executors to launch
    :type num_executors: int
    :param verbose: Whether to pass the verbose flag to spark-sql
    :type verbose: bool
    :param yarn_queue: The YARN queue to submit to (Default: "default")
    :type yarn_queue: str
    """

    template_fields = ["_sql"]
    template_ext = [".sql", ".hql"]

    @apply_defaults
    def __init__(self,
                 sql,
                 conf=None,
                 conn_id='spark_sql_default',
                 total_executor_cores=None,
                 executor_cores=None,
                 executor_memory=None,
                 keytab=None,
                 principal=None,
                 master='yarn',
                 name='default-name',
                 num_executors=None,
                 yarn_queue='default',
                 *args,
                 **kwargs):
        super(SparkSqlOperator, self).__init__(*args, **kwargs)
        self._sql = sql
        self._conf = conf
        self._conn_id = conn_id
        self._total_executor_cores = total_executor_cores
        self._executor_cores = executor_cores
        self._executor_memory = executor_memory
        self._keytab = keytab
        self._principal = principal
        self._master = master
        self._name = name
        self._num_executors = num_executors
        self._yarn_queue = yarn_queue
        self._hook = None

    def execute(self, context):
        """
        Call the SparkSqlHook to run the provided sql query
        """
        self._hook = SparkSqlHook(sql=self._sql,
                                  conf=self._conf,
                                  conn_id=self._conn_id,
                                  total_executor_cores=self._total_executor_cores,
                                  executor_cores=self._executor_cores,
                                  executor_memory=self._executor_memory,
                                  keytab=self._keytab,
                                  principal=self._principal,
                                  name=self._name,
                                  num_executors=self._num_executors,
                                  master=self._master,
                                  yarn_queue=self._yarn_queue)
        self._hook.run_query()

    def on_kill(self):
        self._hook.kill()
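# A minimal usage sketch for the operator above inside a DAG. The import path,
# connection id, and table name are assumptions and may differ by Airflow
# version and deployment.
from datetime import datetime

from airflow import DAG
from airflow.operators.spark_sql_operator import SparkSqlOperator  # assumed path

dag = DAG(
    dag_id='example_spark_sql',
    start_date=datetime(2018, 1, 1),
    schedule_interval=None,
)

count_rows = SparkSqlOperator(
    task_id='count_rows',
    sql='SELECT COUNT(*) FROM my_table',  # hypothetical table
    master='yarn',
    conn_id='spark_sql_default',
    dag=dag,
)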