def test_missing_spark_arguments(self):
    task_config = {
        'application_source': 'my_app.py',
        'application_arguments': {
            '--query': "select * from my_table where"
                       " date_prt >= '{{yesterday_ds}}'",
            '--output': 'myoutputtable'
        }
    }

    expected = ['spark-submit',
                'my_app.py',
                '--query',
                "select * from my_table where "
                "date_prt >= '{{yesterday_ds}}'",
                '--output',
                'myoutputtable']

    actual = SparkTask(
        'my_spark_task',
        DummyDag('dag-id', 'my_spark_task'),
        [],
        trigger_rule='all_success',
        liminal_config={},
        pipeline_config={'pipeline': 'pipeline'},
        task_config=task_config
    ).get_runnable_command()

    # list.sort() returns None, so the original assertion compared None to
    # None and always passed; sorted() keeps the intended order-insensitive
    # comparison.
    self.assertEqual(sorted(actual), sorted(expected))
def test_partially_missing_spark_arguments(self):
    task_config = {
        'application_source': 'my_app.py',
        'class': 'org.apache.liminal.MySparkApp',
        'conf': {
            'spark.driver.memory': '1g',
            'spark.driver.maxResultSize': '1g',
            'spark.yarn.executor.memoryOverhead': '500M'
        },
        'application_arguments': {
            '--query': "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >= "
                       "'{{yesterday_ds}}'",
            '--output': 'mytable'
        }
    }

    expected = [
        'spark-submit',
        '--class',
        'org.apache.liminal.MySparkApp',
        '--conf',
        'spark.driver.memory=1g',
        '--conf',
        'spark.driver.maxResultSize=1g',
        '--conf',
        'spark.yarn.executor.memoryOverhead=500M',
        'my_app.py',
        '--query',
        'select * from dlk_visitor_funnel_dwh_staging.fact_events where '
        "unified_Date_prt >= '{{yesterday_ds}}'",
        '--output',
        'mytable'
    ]

    actual = SparkTask('my_spark_task',
                       DummyDag('dag-id', 'my_spark_task'),
                       [],
                       trigger_rule='all_success',
                       liminal_config={},
                       pipeline_config={},
                       task_config=task_config).get_runnable_command()

    # The original built expected with [...].sort(), which returns None and
    # made the assertion vacuous; compare sorted copies instead.
    self.assertEqual(sorted(actual), sorted(expected))
def test_missing_spark_arguments(self): task_config = { 'application_source': 'my_app.py', 'application_arguments': { '--query': "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >= " "'{{yesterday_ds}}'", '--output': 'mytable' } } expected = [ 'spark-submit', 'my_app.py', '--query', "select * from dlk_visitor_funnel_dwh_staging.fact_events where unified_Date_prt >=" " '{{yesterday_ds}}'", '--output', 'mytable' ] actual = SparkTask('my_spark_task', DummyDag('dag-id', 'my_spark_task'), [], trigger_rule='all_success', liminal_config={}, pipeline_config={}, task_config=task_config).get_runnable_command() self.assertEqual(actual, expected)
def test_apply_task_to_dag(self):
    dag = dag_test_utils.create_dag()

    task0 = self.__create_python_task(dag,
                                      'my_input_task',
                                      None,
                                      'my_python_task_img',
                                      'python -u write_inputs.py',
                                      env_vars={
                                          'NUM_FILES': 10,
                                          'NUM_SPLITS': 3
                                      })
    task0.apply_task_to_dag()

    task1 = self.__create_python_task(dag,
                                      'my_output_task',
                                      dag.tasks[0],
                                      'my_parallelized_python_task_img',
                                      'python -u write_outputs.py',
                                      executors=3)
    task1.apply_task_to_dag()

    for task in dag.tasks:
        print(f'Executing task {task.task_id}')
        task.execute(DummyDag('my_dag', task.task_id).context)

    inputs_dir = os.path.join(self.temp_dir, 'inputs')
    outputs_dir = os.path.join(self.temp_dir, 'outputs')

    self.assertListEqual(sorted(os.listdir(self.temp_dir)),
                         sorted(['outputs', 'inputs']))

    inputs_dir_contents = sorted(os.listdir(inputs_dir))
    self.assertListEqual(inputs_dir_contents, ['0', '1', '2'])

    self.assertListEqual(
        sorted(os.listdir(os.path.join(inputs_dir, '0'))),
        ['input0.json', 'input3.json', 'input6.json', 'input9.json'])

    self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '1'))),
                         ['input1.json', 'input4.json', 'input7.json'])

    self.assertListEqual(sorted(os.listdir(os.path.join(inputs_dir, '2'))),
                         ['input2.json', 'input5.json', 'input8.json'])

    self.assertListEqual(sorted(os.listdir(outputs_dir)), [
        'output0.txt', 'output1.txt', 'output2.txt', 'output3.txt',
        'output4.txt', 'output5.txt', 'output6.txt', 'output7.txt',
        'output8.txt', 'output9.txt'
    ])

    for filename in os.listdir(outputs_dir):
        with open(os.path.join(outputs_dir, filename)) as f:
            expected_file_content = filename.replace('output', 'myval').replace('.txt', '')
            self.assertEqual(f.read(), expected_file_content)
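# A minimal sketch of the round-robin split that the directory assertions above
# imply: input file i lands in split directory str(i % NUM_SPLITS). This is a
# hypothetical helper, not the real write_inputs.py app, and the JSON payload
# is an assumption.
import json
import os


def write_inputs_sketch(base_dir, num_files=10, num_splits=3):
    for i in range(num_files):
        split_dir = os.path.join(base_dir, 'inputs', str(i % num_splits))
        os.makedirs(split_dir, exist_ok=True)
        with open(os.path.join(split_dir, f'input{i}.json'), 'w') as f:
            json.dump({'mykey': f'myval{i}'}, f)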
def setUp(self) -> None:
    self.run_job_flow_args = dict(
        Instances={
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {"AvailabilityZone": "us-east-1"},
            "SlaveInstanceType": "c3.xlarge",
        },
        JobFlowRole="EMR_EC2_DefaultRole",
        LogUri="s3://liminal/log",
        Name="test-emr-cluster",
        ServiceRole="EMR_DefaultRole",
        VisibleToAllUsers=True)

    self.client = boto3.client("emr", region_name="us-east-1")

    args = deepcopy(self.run_job_flow_args)
    self.cluster_id = self.client.run_job_flow(**args)["JobFlowId"]

    self.dag = dag_test_utils.create_dag()
    self.dag.context = DummyDag(dag_id=self.dag.dag_id, task_id="").context

    self.executor_name = 'test-emr-cluster'
    executor_config = {
        'executor': self.executor_name,
        'cluster_name': self.executor_name,
        'aws_conn_id': 'us-east-1',
        'type': 'emr',
        'properties': {
            'ActionOnFailure': 'CONTINUE'
        }
    }

    self.hadoop_task = MagicMock(spec=hadoop.HadoopTask)
    self.hadoop_task.get_runnable_command.return_value = [
        'spark-submit', 'test', 'params', '--param'
    ]
    self.hadoop_task.task_id = 'spark-task'
    self.hadoop_task.dag = self.dag
    self.hadoop_task.trigger_rule = 'all_done'
    self.hadoop_task.parent = None

    self.emr = EMRExecutor(self.executor_name,
                           liminal_config={},
                           executor_config=executor_config)
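# The boto3 EMR client above would call AWS for real unless the suite runs
# against a mocked backend. A minimal, self-contained sketch of that setup
# using the moto library (an assumption -- the real fixture may differ, and
# moto>=5 exposes mock_aws instead of the per-service mock_emr decorator):
import boto3
from moto import mock_emr


@mock_emr
def sketch_run_job_flow():
    client = boto3.client("emr", region_name="us-east-1")
    response = client.run_job_flow(
        Name="test-emr-cluster",
        Instances={
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "SlaveInstanceType": "c3.xlarge",
        },
        JobFlowRole="EMR_EC2_DefaultRole",
        ServiceRole="EMR_DefaultRole",
        VisibleToAllUsers=True)
    return response["JobFlowId"]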
def test_get_runnable_command(self):
    task_config = {
        'application_source': 'my_app.py',
        'master': 'yarn',
        'class': 'org.apache.liminal.MySparkApp',
        'conf': {
            'spark.driver.memory': '1g',
            'spark.driver.maxResultSize': '1g',
            'spark.yarn.executor.memoryOverhead': '500M'
        },
        'application_arguments': {
            '--query': "select * from "
                       "my_table where date_prt >= "
                       "'{{yesterday_ds}}'",
            '--output': 'mytable'
        }
    }

    expected = ['spark-submit',
                '--master', 'yarn',
                '--class', 'org.apache.liminal.MySparkApp',
                '--conf', 'spark.driver.maxResultSize=1g',
                '--conf', 'spark.driver.memory=1g',
                '--conf', 'spark.yarn.executor.memoryOverhead=500M',
                'my_app.py',
                '--query',
                "select * from my_table where "
                "date_prt >= '{{yesterday_ds}}'",
                '--output', 'mytable']

    actual = SparkTask(
        'my_spark_task',
        DummyDag('dag-id', 'my_spark_task'),
        [],
        trigger_rule='all_success',
        liminal_config={},
        pipeline_config={'pipeline': 'pipeline'},
        task_config=task_config
    ).get_runnable_command()

    # list.sort() returns None, which made the original assertion compare
    # None to None; sorted() performs the intended order-insensitive check.
    self.assertEqual(sorted(actual), sorted(expected))
def test_spark_on_k8s(self):
    volume_util.delete_local_volume(self._VOLUME_NAME)
    os.environ['TMPDIR'] = '/tmp'
    self.temp_dir = tempfile.mkdtemp()
    self.liminal_config = {
        'volumes': [
            {
                'volume': self._VOLUME_NAME,
                'local': {
                    'path': self.temp_dir.replace("/var/folders",
                                                  "/private/var/folders")
                }
            }
        ]
    }
    volume_util.create_local_volumes(self.liminal_config, None)

    # build spark image
    liminal_apps_builder.build_liminal_apps(
        os.path.join(os.path.dirname(__file__), '../../apps/test_spark_app'))

    outputs_dir = os.path.join(self.temp_dir, 'outputs')

    task_config = {
        'task': "my_spark_task",
        'image': "my_spark_image",
        'application_source': 'wordcount.py',
        'application_arguments': ['words.txt', '/mnt/vol1/outputs/'],
        'env_vars': {},
        'mounts': [
            {
                'mount': 'mymount',
                'volume': self._VOLUME_NAME,
                'path': '/mnt/vol1'
            }
        ]
    }

    dag = dag_test_utils.create_dag()

    task1 = SparkTask(
        task_id="my_spark_task",
        dag=dag,
        liminal_config=self.liminal_config,
        pipeline_config={'pipeline': 'my_pipeline'},
        task_config=task_config,
        parent=None,
        trigger_rule='all_success')

    executor = KubernetesPodExecutor(
        task_id='k8s',
        liminal_config=self.liminal_config,
        executor_config={
            'executor': 'k8s',
            'name': 'mypod'
        }
    )
    executor.apply_task_to_dag(task=task1)

    for task in dag.tasks:
        print(f'Executing task {task.task_id}')
        task.execute(DummyDag('my_dag', task.task_id).context)

    expected_output = '{"word":"my","count":1}\n' \
                      '{"word":"first","count":1}\n' \
                      '{"word":"liminal","count":1}\n' \
                      '{"word":"spark","count":1}\n' \
                      '{"word":"task","count":1}\n'.split("\n")

    actual = ''
    for filename in os.listdir(outputs_dir):
        if filename.endswith(".json"):
            with open(os.path.join(outputs_dir, filename)) as f:
                actual = f.read()

    self.assertEqual(actual.split("\n"), expected_output)
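# Illustrative sketch of the kind of 'wordcount.py' application the
# Spark-on-k8s test above exercises (an assumption -- the real app lives under
# the test_spark_app fixture and may differ). Assuming one word per line in
# words.txt, it writes per-word counts as JSON lines, matching the
# {"word":...,"count":...} records asserted above.
import sys

from pyspark.sql import SparkSession


def wordcount(input_path, output_dir):
    spark = SparkSession.builder.appName('wordcount').getOrCreate()
    lines = spark.read.text(input_path)  # single column named 'value'
    counts = (lines.withColumnRenamed('value', 'word')
                   .groupBy('word')
                   .count())
    # one output file, JSON lines format
    counts.coalesce(1).write.mode('overwrite').json(output_dir)
    spark.stop()


if __name__ == '__main__':
    wordcount(sys.argv[1], sys.argv[2])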