Example #1
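The snippets below are pipeline functions taken from a unit-test suite: each my_pipeline() is defined inside a test method, so self.assert_res refers to the enclosing test case. A minimal sketch of the context they assume (the import path and the assert_res helper are assumptions, not the actual test code):

    import unittest
    from pathlib import Path  # used by Example #6

    from databricks import CreateJobOp  # assumed import path for the Azure Databricks component

    class TestCreateJobOp(unittest.TestCase):

        def assert_res(self, res, expected_spec):
            # Hypothetical helper: the real suite compares the op's generated
            # Databricks job spec against expected_spec; which attribute of
            # res holds the spec is an assumption here.
            self.assertEqual(getattr(res, "spec", None), expected_spec)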
        def my_pipeline():
            spec = {
                "name":
                "test-job",
                "new_cluster": {
                    "spark_version": "5.3.x-scala2.11",
                    "node_type_id": "Standard_D3_v2",
                    "num_workers": 2
                },
                "libraries": [{
                    "jar": "dbfs:/my-jar.jar"
                }, {
                    "maven": {
                        "coordinates": "org.jsoup:jsoup:1.7.2"
                    }
                }],
                "timeout_seconds":
                3600,
                "max_retries":
                1,
                "schedule": {
                    "quartz_cron_expression": "0 15 22 ? * *",
                    "timezone_id": "America/Los_Angeles"
                },
                "spark_jar_task": {
                    "main_class_name": "com.databricks.ComputeModels"
                }
            }

            res = CreateJobOp(name="createjob", spec=spec)

            self.assert_res(res, spec)
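A pipeline function like the one above is only a definition. In KFP v1 it would typically be decorated and compiled before it can run, roughly like this (the pipeline name, trimmed spec, and output path are illustrative):

    import kfp.dsl as dsl
    import kfp.compiler as compiler

    from databricks import CreateJobOp  # assumed import path, as in the sketch above

    @dsl.pipeline(name="databricks-create-job",
                  description="Creates a Databricks job from a full spec dict.")
    def my_pipeline():
        # Declare the op as a pipeline step; a full spec would go here.
        CreateJobOp(name="createjob",
                    spec={"name": "test-job", "timeout_seconds": 3600})

    compiler.Compiler().compile(my_pipeline, "my_pipeline.tar.gz")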
Example #2
        def my_pipeline():
            job_name = "test-job"
            existing_cluster_id = "1201-my-cluster"
            schedule = {
                "quartz_cron_expression": "0 15 22 ? * *",
                "timezone_id": "America/Los_Angeles"
            }
            notebook_task = {
                "notebook_path": "/Users/[email protected]/my-notebook"
            }
            timeout_seconds = 120

            expected_spec = {
                "name": job_name,
                "existing_cluster_id": existing_cluster_id,
                "schedule": schedule,
                "notebook_task": notebook_task,
                "timeout_seconds": timeout_seconds
            }

            res = CreateJobOp(name="createjob",
                              job_name=job_name,
                              existing_cluster_id=existing_cluster_id,
                              schedule=schedule,
                              notebook_task=notebook_task,
                              timeout_seconds=timeout_seconds)

            self.assert_res(res, expected_spec)
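Compared with Example #1, which hands CreateJobOp a complete spec dict, this example passes each field as its own keyword argument and lets the op assemble the expected spec. It also runs a notebook task on an existing interactive cluster (existing_cluster_id) instead of creating a new job cluster.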
Example #3
        def my_pipeline():
            job_name = "test-job"
            new_cluster = {
                "spark_version": "5.3.x-scala2.11",
                "node_type_id": "Standard_D3_v2",
                "num_workers": 2
            }
            schedule = {
                "quartz_cron_expression": "0 15 22 ? * *",
                "timezone_id": "America/Los_Angeles"
            }
            spark_submit_task = {
                "parameters": [
                    "--class", "org.apache.spark.examples.SparkPi",
                    "dbfs:/docs/sparkpi.jar", "10"
                ]
            }

            expected_spec = {
                "name": job_name,
                "new_cluster": new_cluster,
                "schedule": schedule,
                "spark_submit_task": spark_submit_task
            }

            res = CreateJobOp(name="createjob",
                              job_name=job_name,
                              new_cluster=new_cluster,
                              schedule=schedule,
                              spark_submit_task=spark_submit_task)

            self.assert_res(res, expected_spec)
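The parameters list of a spark_submit_task mirrors a spark-submit command line: the --class option naming the main class, the application JAR on DBFS, and then the application's own arguments (here a single "10").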
Example #4
        def my_pipeline():
            job_name = "test-job"
            new_cluster = {
                "spark_version": "5.3.x-scala2.11",
                "node_type_id": "Standard_D3_v2",
                "num_workers": 2
            }
            libraries = [{
                "jar": "dbfs:/my-jar.jar"
            }, {
                "maven": {
                    "coordinates": "org.jsoup:jsoup:1.7.2"
                }
            }]
            timeout_seconds = 3600
            max_retries = 1
            schedule = {
                "quartz_cron_expression": "0 15 22 ? * *",
                "timezone_id": "America/Los_Angeles"
            }
            spark_jar_task = {
                "main_class_name": "com.databricks.ComputeModels"
            }

            expected_spec = {
                "name": job_name,
                "new_cluster": new_cluster,
                "libraries": libraries,
                "timeout_seconds": timeout_seconds,
                "max_retries": max_retries,
                "schedule": schedule,
                "spark_jar_task": spark_jar_task
            }

            res = CreateJobOp(name="createjob",
                              job_name=job_name,
                              new_cluster=new_cluster,
                              libraries=libraries,
                              timeout_seconds=timeout_seconds,
                              max_retries=max_retries,
                              schedule=schedule,
                              spark_jar_task=spark_jar_task)

            self.assert_res(res, expected_spec)
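The libraries list follows the Databricks Jobs API library spec: each entry attaches one dependency to the job's cluster, either a JAR from DBFS or a Maven artifact resolved from its coordinates.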
Example #5
        def my_pipeline():
            job_name = "test-job"
            new_cluster = {
                "spark_version": "5.3.x-scala2.11",
                "node_type_id": "Standard_D3_v2",
                "num_workers": 2
            }
            timeout_seconds = 3600
            max_retries = 3
            min_retry_interval_millis = 3600
            retry_on_timeout = True
            schedule = {
                "quartz_cron_expression": "0 15 22 ? * *",
                "timezone_id": "America/Los_Angeles"
            }
            spark_python_task = {
                "python_file": "dbfs:/docs/pi.py",
                "parameters": ["10"]
            }

            expected_spec = {
                "name": job_name,
                "new_cluster": new_cluster,
                "timeout_seconds": timeout_seconds,
                "max_retries": max_retries,
                "min_retry_interval_millis": min_retry_interval_millis,
                "retry_on_timeout": retry_on_timeout,
                "schedule": schedule,
                "spark_python_task": spark_python_task
            }

            res = CreateJobOp(
                name="createjob",
                job_name=job_name,
                new_cluster=new_cluster,
                timeout_seconds=timeout_seconds,
                max_retries=max_retries,
                min_retry_interval_millis=min_retry_interval_millis,
                retry_on_timeout=retry_on_timeout,
                schedule=schedule,
                spark_python_task=spark_python_task)
            self.assert_res(res, expected_spec)
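This example exercises the Jobs API retry controls: the run may be retried up to max_retries times, with at least min_retry_interval_millis between attempts, and retry_on_timeout opts timed-out runs into retries as well.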
Example #6
        def my_pipeline():
            job_name = "test-job"
            current_path = Path(__file__).parent
            json_spec_file_name = current_path.joinpath("job_spec.json")

            expected_spec = {
                "name":
                job_name,
                "new_cluster": {
                    "spark_version": "5.3.x-scala2.11",
                    "node_type_id": "Standard_D3_v2",
                    "num_workers": 2
                },
                "libraries": [{
                    "jar": "dbfs:/my-jar.jar"
                }, {
                    "maven": {
                        "coordinates": "org.jsoup:jsoup:1.7.2"
                    }
                }],
                "timeout_seconds":
                3600,
                "max_retries":
                1,
                "schedule": {
                    "quartz_cron_expression": "0 15 22 ? * *",
                    "timezone_id": "America/Los_Angeles"
                },
                "spark_jar_task": {
                    "main_class_name": "com.databricks.ComputeModels"
                }
            }

            res = CreateJobOp.from_file_name(name="createjob",
                                             job_name=job_name,
                                             file_name=json_spec_file_name)

            self.assert_res(res, expected_spec)
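Here the spec is loaded from a JSON file through from_file_name instead of being built in code, while the job's name still comes in through the job_name argument. The expected spec implies a job_spec.json roughly like the following (whether the file also carries a "name" key is not visible from the test):

    {
        "new_cluster": {
            "spark_version": "5.3.x-scala2.11",
            "node_type_id": "Standard_D3_v2",
            "num_workers": 2
        },
        "libraries": [
            {"jar": "dbfs:/my-jar.jar"},
            {"maven": {"coordinates": "org.jsoup:jsoup:1.7.2"}}
        ],
        "timeout_seconds": 3600,
        "max_retries": 1,
        "schedule": {
            "quartz_cron_expression": "0 15 22 ? * *",
            "timezone_id": "America/Los_Angeles"
        },
        "spark_jar_task": {
            "main_class_name": "com.databricks.ComputeModels"
        }
    }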
Example #7
        def my_pipeline():
            CreateJobOp(name="createjob",
                        new_cluster={
                            "spark_version": "5.3.x-scala2.11",
                            "node_type_id": "Standard_D3_v2",
                            "num_workers": 2
                        },
                        libraries=[{
                            "jar": "dbfs:/my-jar.jar"
                        }, {
                            "maven": {
                                "coordinates": "org.jsoup:jsoup:1.7.2"
                            }
                        }],
                        timeout_seconds=3600,
                        max_retries=1,
                        schedule={
                            "quartz_cron_expression": "0 15 22 ? * *",
                            "timezone_id": "America/Los_Angeles"
                        },
                        spark_jar_task={
                            "main_class_name": "com.databricks.ComputeModels"
                        })
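Unlike the earlier examples, this one keeps no reference to the result, passes no job_name, and asserts nothing about the generated spec; it simply declares the op inline as a pipeline step, with every field supplied as a keyword argument.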