def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
                                 self.parameters, self.template)
def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    hook.start_template_dataflow(self.job_name, self.dataflow_default_options,
                                 self.parameters, self.template)
def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)
    hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
def execute(self, context):
    bucket_helper = GoogleCloudBucketHelper(
        self.gcp_conn_id, self.delegate_to)
    self.jar = bucket_helper.google_cloud_to_local(self.jar)
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)
    hook.start_java_dataflow(self.task_id, dataflow_options, self.jar)
def execute(
    self,
    # Some context about the context: https://bcb.github.io/airflow/execute-context
    context: Dict[str, Any]  # pylint: disable=unused-argument
) -> None:
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    # In DataflowTemplateOperator, start_template_dataflow defaults
    # append_job_name to True, which appends a unique id to the job name.
    # Passing append_job_name=False overrides that default.
    hook.start_template_dataflow(self.task_id, self.dataflow_default_options,
                                 self.parameters, self.template,
                                 append_job_name=False)
class DataFlowTemplateHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_TEMPLATE,
            parameters=PARAMETERS, dataflow_template=TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, DATAFLOW_OPTIONS_TEMPLATE, PARAMETERS, TEMPLATE)
def execute(self, context): """Execute the python dataflow job.""" hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) dataflow_options = self.dataflow_default_options.copy() dataflow_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub( r'[A-Z]', lambda x: '_' + x.group(0).lower(), name) formatted_options = { camel_to_snake(key): dataflow_options[key] for key in dataflow_options } hook.start_python_dataflow(self.task_id, formatted_options, self.py_file, self.py_options)
class DataFlowTemplateHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_template_dataflow'))
    def test_start_template_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_TEMPLATE,
            parameters=PARAMETERS, dataflow_template=TEMPLATE)
        options_with_region = {'region': 'us-central1'}
        options_with_region.update(DATAFLOW_OPTIONS_TEMPLATE)
        internal_dataflow_mock.assert_called_once_with(
            mock.ANY, options_with_region, PARAMETERS, TEMPLATE)

    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_template_dataflow_with_runtime_env(self, mock_conn,
                                                      mock_dataflowjob):
        dataflow_options_template = copy.deepcopy(DATAFLOW_OPTIONS_TEMPLATE)
        options_with_runtime_env = copy.deepcopy(RUNTIME_ENV)
        options_with_runtime_env.update(dataflow_options_template)
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        method = (mock_conn.return_value
                  .projects.return_value
                  .locations.return_value
                  .templates.return_value
                  .launch)
        self.dataflow_hook.start_template_dataflow(
            job_name=JOB_NAME,
            variables=options_with_runtime_env,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE)
        body = {
            "jobName": mock.ANY,
            "parameters": PARAMETERS,
            "environment": RUNTIME_ENV,
        }
        method.assert_called_once_with(
            projectId=options_with_runtime_env['project'],
            location='us-central1',
            gcsPath=TEMPLATE,
            body=body,
        )
class DataFlowJobCtrl(ApacheBeamJobCtrl):
    def __init__(self, task_run):
        super(DataFlowJobCtrl, self).__init__(task_run=task_run)
        self.dataflow_config = task_run.task.beam_engine  # type: DataflowConfig
        gcp_conn_id = self.task_env.conn_id

        from airflow.contrib.hooks.gcp_dataflow_hook import DataFlowHook

        self._gcp_dataflow_hook = DataFlowHook(
            gcp_conn_id=gcp_conn_id, delegate_to=self.task_env.delegate_to)

        if self.dataflow_config.temp_location:
            # override sync location with temp_location
            self.remote_sync_root = self.dataflow_config.temp_location
        self.current_dataflow_job_id = None

    def _get_base_options(self):
        options = super(DataFlowJobCtrl, self)._get_base_options()
        dfc = self.dataflow_config
        options.update(dfc.options)
        options.setdefault("runner", dfc.runner)
        options.setdefault("region", dfc.region)
        options.setdefault("project", dfc.project)
        options.setdefault("tempLocation", dfc.temp_location)
        return options

    def _process_dataflow_log(self, msg):
        msg = msg.strip()
        if self.current_dataflow_job_id is None:
            matched_job = _DATAFLOW_ID_REGEXP.search(msg)
            if matched_job:
                self.current_dataflow_job_id = matched_job.group(1)
                logger.info("Found dataflow job id '%s'",
                            self.current_dataflow_job_id)
        logger.info(msg)

    def _run_cmd(self, cmd):
        dfc = self.dataflow_config

        from airflow.contrib.hooks.gcp_dataflow_hook import _DataflowJob

        run_cmd(
            cmd,
            name="dataflow %s" % self.task_run.job_name,
            stdout_handler=self._process_dataflow_log,
        )
        _DataflowJob(
            self._gcp_dataflow_hook.get_conn(),
            dfc.project,
            self.task_run.job_id,
            dfc.region,
            dfc.poll_sleep,
            self.current_dataflow_job_id,
        ).wait_for_done()
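`_DATAFLOW_ID_REGEXP` is referenced above but not defined in the snippet. A minimal sketch of a pattern in that spirit, assuming the job id is scraped from the Dataflow console URL that the Beam runner logs (the name matches the snippet; the exact regex here is an assumption, not the snippet's actual definition):

import re

# Assumed pattern: capture the job id segment of the printed console URL,
# e.g. https://console.cloud.google.com/dataflow/.../jobs/<job-id>
_DATAFLOW_ID_REGEXP = re.compile(
    r'console\.cloud\.google\.com/dataflow.*/jobs/([a-zA-Z0-9_\-]+)')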
def execute(self, context): """Execute the python dataflow job.""" bucket_helper = GoogleCloudBucketHelper( self.gcp_conn_id, self.delegate_to) self.py_file = bucket_helper.google_cloud_to_local(self.py_file) hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) dataflow_options = self.dataflow_default_options.copy() dataflow_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub( r'[A-Z]', lambda x: '_' + x.group(0).lower(), name) formatted_options = {camel_to_snake(key): dataflow_options[key] for key in dataflow_options} hook.start_python_dataflow( self.task_id, formatted_options, self.py_file, self.py_options)
def execute(self, context): """Execute the python dataflow job.""" bucket_helper = GoogleCloudBucketHelper( self.gcp_conn_id, self.delegate_to) self.py_file = bucket_helper.google_cloud_to_local(self.py_file) hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, poll_sleep=self.poll_sleep) dataflow_options = self.dataflow_default_options.copy() dataflow_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub( r'[A-Z]', lambda x: '_' + x.group(0).lower(), name) formatted_options = {camel_to_snake(key): dataflow_options[key] for key in dataflow_options} hook.start_python_dataflow( self.job_name, formatted_options, self.py_file, self.py_options)
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('DataFlowHook._start_dataflow'))
    def test_start_python_dataflow(self, internal_dataflow_mock):
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=OPTIONS,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        internal_dataflow_mock.assert_called_once_with(
            TASK_ID, OPTIONS, PY_FILE, mock.ANY, ['python'] + PY_OPTIONS)

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
def execute(self, context):
    bucket_helper = GoogleCloudBucketHelper(
        self.gcp_conn_id, self.delegate_to)
    self.jar = bucket_helper.google_cloud_to_local(self.jar)
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)

    # Legacy code for a single xcom key.
    if 'xcom_key' in dataflow_options:
        value = context['task_instance'].xcom_pull(
            key=dataflow_options['xcom_key'])
        dataflow_options['queryParameters'] = value
        del dataflow_options['xcom_key']

    # Newer path: xcom_element_list maps pulled XCom values to Dataflow
    # parameters.
    if self.xcom_element_list is not None:
        for xcom_element in self.xcom_element_list:
            # Sanity check: every element must carry all three fields.
            if all(key in xcom_element
                   for key in ['xcom_key', 'task_id', 'dataflow_par_name']):
                pulled_xcom_value = context['task_instance'].xcom_pull(
                    key=xcom_element['xcom_key'],
                    task_ids=xcom_element['task_id'])
                dataflow_options[xcom_element['dataflow_par_name']] = \
                    pulled_xcom_value
            else:
                raise Exception(
                    "ERROR: one of the fields "
                    "['xcom_key', 'task_id', 'dataflow_par_name'] is missing")

    self.log.info("dataflow_options: %s", dataflow_options)
    hook.start_java_dataflow(self.job_name, dataflow_options, self.jar,
                             self.job_class)
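The xcom_element_list consumed above is a list of dicts keyed by the three fields the sanity check requires; an illustrative value (field names from the snippet, the concrete task and parameter names are hypothetical):

xcom_element_list = [
    {
        'xcom_key': 'query_date',         # XCom key to pull
        'task_id': 'compute_query_date',  # upstream task that pushed it
        'dataflow_par_name': 'queryDate'  # Dataflow pipeline parameter to set
    },
]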
def execute(self, context):
    hook = DataFlowHook(gcp_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        poll_sleep=self.poll_sleep)
    dataflow_options = copy.copy(self.dataflow_default_options)
    dataflow_options.update(self.options)
    is_running = False
    if self.check_if_running != CheckJobRunning.IgnoreJob:
        is_running = hook.is_job_dataflow_running(self.job_name,
                                                  dataflow_options)
        while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
            is_running = hook.is_job_dataflow_running(self.job_name,
                                                      dataflow_options)
    if not is_running:
        bucket_helper = GoogleCloudBucketHelper(
            self.gcp_conn_id, self.delegate_to)
        self.jar = bucket_helper.google_cloud_to_local(self.jar)
        hook.start_java_dataflow(self.job_name, dataflow_options, self.jar,
                                 self.job_class, True, self.multiple_jobs)
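For reference, the CheckJobRunning enum consumed above lives in Airflow's Dataflow operator module; a sketch consistent with how the snippet uses it (the member values are incidental):

from enum import Enum

class CheckJobRunning(Enum):
    """Behaviour when a job with the same name is already running.

    IgnoreJob - do not check if a job is running
    FinishIfRunning - finish the current run with no action
    WaitForRun - wait for the running job to finish, then submit a new one
    """
    IgnoreJob = 1
    FinishIfRunning = 2
    WaitForRun = 3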
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn, mock_dataflow,
                                   mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_PY,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        EXPECTED_CMD = [
            'python2', '-m', PY_FILE,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--labels=foo=bar',
            '--staging_location=gs://test/staging',
            '--job_name={}-{}'.format(JOB_NAME, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn, mock_dataflow,
                                 mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE)
        EXPECTED_CMD = [
            'java', '-jar', JAR_FILE,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(JOB_NAME, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid4'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(self, mock_conn, mock_dataflow,
                                                mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            job_name=JOB_NAME, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE, job_class=JOB_CLASS)
        EXPECTED_CMD = [
            'java', '-cp', JAR_FILE, JOB_CLASS,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(JOB_NAME, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)

    def test_valid_dataflow_job_name(self):
        job_name = self.dataflow_hook._build_dataflow_job_name(
            job_name=JOB_NAME, append_job_name=False)
        self.assertEqual(job_name, JOB_NAME)

    def test_fix_underscore_in_job_name(self):
        job_name_with_underscore = 'test_example'
        fixed_job_name = job_name_with_underscore.replace('_', '-')
        job_name = self.dataflow_hook._build_dataflow_job_name(
            job_name=job_name_with_underscore, append_job_name=False)
        self.assertEqual(job_name, fixed_job_name)

    def test_invalid_dataflow_job_name(self):
        invalid_job_name = '9test_invalid_name'
        fixed_name = invalid_job_name.replace('_', '-')
        with self.assertRaises(ValueError) as e:
            self.dataflow_hook._build_dataflow_job_name(
                job_name=invalid_job_name, append_job_name=False)
        # Test whether the job_name is present in the error message.
        self.assertIn('Invalid job_name ({})'.format(fixed_name),
                      str(e.exception))

    def test_dataflow_job_regex_check(self):
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                job_name='df-job-1', append_job_name=False),
            'df-job-1')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                job_name='df-job', append_job_name=False),
            'df-job')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                job_name='dfjob', append_job_name=False),
            'dfjob')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                job_name='dfjob1', append_job_name=False),
            'dfjob1')
        self.assertRaises(
            ValueError,
            self.dataflow_hook._build_dataflow_job_name,
            job_name='1dfjob', append_job_name=False)
        self.assertRaises(
            ValueError,
            self.dataflow_hook._build_dataflow_job_name,
            job_name='dfjob@', append_job_name=False)
        self.assertRaises(
            ValueError,
            self.dataflow_hook._build_dataflow_job_name,
            job_name='df^jo', append_job_name=False)
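The tests above pin down most of `_build_dataflow_job_name`'s contract: underscores become hyphens, invalid names raise ValueError with the fixed name in the message, and append_job_name controls whether a unique suffix is added. A sketch of a helper satisfying those assertions, written here as a module-level function for brevity (in the hook it is a static method; the exact regex and eight-character uuid suffix are assumptions beyond what the tests require):

import re
import uuid

def _build_dataflow_job_name(job_name, append_job_name=True):
    # Dataflow job names must start with a letter and contain only
    # lowercase letters, digits and hyphens; normalize underscores first.
    base_job_name = str(job_name).replace('_', '-')
    if not re.match(r'^[a-z]([-a-z0-9]*[a-z0-9])?$', base_job_name):
        raise ValueError(
            'Invalid job_name ({}); the name must consist of only the '
            'characters [-a-z0-9], starting with a letter and ending with '
            'a letter or number'.format(base_job_name))
    if append_job_name:
        # Append a short unique suffix so repeated runs get distinct names.
        base_job_name += '-' + str(uuid.uuid4())[:8]
    return base_job_name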
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn, mock_dataflow,
                                   mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_PY,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        EXPECTED_CMD = [
            'python', '-m', PY_FILE,
            '--runner=DataflowRunner',
            '--project=test',
            '--labels=foo=bar',
            '--staging_location=gs://test/staging',
            '--job_name={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn, mock_dataflow,
                                 mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE)
        EXPECTED_CMD = [
            'java', '-jar', JAR_FILE,
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(self, mock_conn, mock_dataflow,
                                                mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE, job_class=JOB_CLASS)
        EXPECTED_CMD = [
            'java', '-cp', JAR_FILE, JOB_CLASS,
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])
class DataFlowHookTest(unittest.TestCase):
    def setUp(self):
        with mock.patch(BASE_STRING.format('GoogleCloudBaseHook.__init__'),
                        new=mock_init):
            self.dataflow_hook = DataFlowHook(gcp_conn_id='test')

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_python_dataflow(self, mock_conn, mock_dataflow,
                                   mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_python_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_PY,
            dataflow=PY_FILE, py_options=PY_OPTIONS)
        EXPECTED_CMD = [
            'python', '-m', PY_FILE,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--labels=foo=bar',
            '--staging_location=gs://test/staging',
            '--job_name={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow(self, mock_conn, mock_dataflow,
                                 mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE)
        EXPECTED_CMD = [
            'java', '-jar', JAR_FILE,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch(DATAFLOW_STRING.format('uuid.uuid1'))
    @mock.patch(DATAFLOW_STRING.format('_DataflowJob'))
    @mock.patch(DATAFLOW_STRING.format('_Dataflow'))
    @mock.patch(DATAFLOW_STRING.format('DataFlowHook.get_conn'))
    def test_start_java_dataflow_with_job_class(self, mock_conn, mock_dataflow,
                                                mock_dataflowjob, mock_uuid):
        mock_uuid.return_value = MOCK_UUID
        mock_conn.return_value = None
        dataflow_instance = mock_dataflow.return_value
        dataflow_instance.wait_for_done.return_value = None
        dataflowjob_instance = mock_dataflowjob.return_value
        dataflowjob_instance.wait_for_done.return_value = None
        self.dataflow_hook.start_java_dataflow(
            task_id=TASK_ID, variables=DATAFLOW_OPTIONS_JAVA,
            dataflow=JAR_FILE, job_class=JOB_CLASS)
        EXPECTED_CMD = [
            'java', '-cp', JAR_FILE, JOB_CLASS,
            '--region=us-central1',
            '--runner=DataflowRunner',
            '--project=test',
            '--stagingLocation=gs://test/staging',
            '--labels={"foo":"bar"}',
            '--jobName={}-{}'.format(TASK_ID, MOCK_UUID)]
        self.assertListEqual(sorted(mock_dataflow.call_args[0][0]),
                             sorted(EXPECTED_CMD))

    @mock.patch('airflow.contrib.hooks.gcp_dataflow_hook._Dataflow.log')
    @mock.patch('subprocess.Popen')
    @mock.patch('select.select')
    def test_dataflow_wait_for_done_logging(self, mock_select, mock_popen,
                                            mock_logging):
        mock_logging.info = MagicMock()
        mock_logging.warning = MagicMock()
        mock_proc = MagicMock()
        mock_proc.stderr = MagicMock()
        mock_proc.stderr.readlines = MagicMock(
            return_value=['test\n', 'error\n'])
        mock_stderr_fd = MagicMock()
        mock_proc.stderr.fileno = MagicMock(return_value=mock_stderr_fd)
        mock_proc_poll = MagicMock()
        mock_select.return_value = [[mock_stderr_fd]]

        def poll_resp_error():
            mock_proc.return_code = 1
            return True

        mock_proc_poll.side_effect = [None, poll_resp_error]
        mock_proc.poll = mock_proc_poll
        mock_popen.return_value = mock_proc
        dataflow = _Dataflow(['test', 'cmd'])
        mock_logging.info.assert_called_with('Running command: %s', 'test cmd')
        self.assertRaises(Exception, dataflow.wait_for_done)
        mock_logging.warning.assert_has_calls([call('test'), call('error')])

    def test_valid_dataflow_job_name(self):
        job_name = self.dataflow_hook._build_dataflow_job_name(
            task_id=TASK_ID, append_job_name=False)
        self.assertEqual(job_name, TASK_ID)

    def test_fix_underscore_in_task_id(self):
        task_id_with_underscore = 'test_example'
        fixed_job_name = task_id_with_underscore.replace('_', '-')
        job_name = self.dataflow_hook._build_dataflow_job_name(
            task_id=task_id_with_underscore, append_job_name=False)
        self.assertEqual(job_name, fixed_job_name)

    def test_invalid_dataflow_job_name(self):
        invalid_job_name = '9test_invalid_name'
        fixed_name = invalid_job_name.replace('_', '-')
        with self.assertRaises(AssertionError) as e:
            self.dataflow_hook._build_dataflow_job_name(
                task_id=invalid_job_name, append_job_name=False)
        # Test whether the job_name is present in the error message.
        self.assertIn('Invalid job_name ({})'.format(fixed_name),
                      str(e.exception))

    def test_dataflow_job_regex_check(self):
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                task_id='df-job-1', append_job_name=False),
            'df-job-1')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                task_id='df-job', append_job_name=False),
            'df-job')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                task_id='dfjob', append_job_name=False),
            'dfjob')
        self.assertEqual(
            self.dataflow_hook._build_dataflow_job_name(
                task_id='dfjob1', append_job_name=False),
            'dfjob1')
        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='1dfjob', append_job_name=False)
        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='dfjob@', append_job_name=False)
        self.assertRaises(
            AssertionError,
            self.dataflow_hook._build_dataflow_job_name,
            task_id='df^jo', append_job_name=False)