import os
import sys

from etl_manager.etl import GlueJob


def main():
    if len(sys.argv) == 1:
        print(f"Usage: {sys.argv[0]} YYYY-mm-dd")
        sys.exit(1)

    date = sys.argv[1]
    job_bucket = os.environ["PQ_FLATTENER_GLUE_JOB_BUCKET"]
    iam_role = os.environ["PQ_FLATTENER_JOB_IAM_ROLE"]
    source_path = os.environ["PQ_FLATTENER_SOURCE_PATH"]
    dest_path = os.environ["PQ_FLATTENER_DEST_PATH"]

    job = GlueJob(
        "v1/glue_jobs/pq_flattener",
        bucket=job_bucket,
        job_role=iam_role,
        job_arguments={
            "--date": date,
            "--s3_source": source_path,
            "--s3_dest": dest_path,
        },
    )
    job.job_name = "pq_flattener"

    # Run job on AWS Glue
    print(f'Starting job "{job.job_name}"...')
    try:
        job.run_job()
        job.wait_for_completion()
    finally:
        job.cleanup()

def test_glue_param_error(self):
    g = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        job_arguments={"--test_arg": "this is a test"},
    )
    with self.assertRaises(ValueError):
        g.job_arguments = "--bad_job_argument1"
    with self.assertRaises(ValueError):
        g.job_arguments = {"bad_job_argument2": "test"}
    with self.assertRaises(ValueError):
        g.job_arguments = {"--JOB_NAME": "new_job_name"}

import os
import zipfile

from etl_manager.etl import GlueJob


def main(job_role):
    # Bundle the gluejobutils package into a zip the Glue job can import
    package_name = 'gluejobutils'
    to_path = f'test/glue_test/glue_py_resources/{package_name}.zip'

    zf = zipfile.ZipFile(to_path, "w")
    zf.write(os.path.join(package_name, '__init__.py'))
    zf.write(os.path.join(package_name, 'datatypes.py'))
    zf.write(os.path.join(package_name, 'dates.py'))
    zf.write(os.path.join(package_name, 's3.py'))
    zf.write(os.path.join(package_name, 'utils.py'))
    zf.write(os.path.join(package_name, 'dea_record_datetimes.py'))
    zf.write(os.path.join(package_name, 'data/data_type_conversion.json'))
    zf.close()

    g = GlueJob('test/glue_test/', bucket='alpha-gluejobutils', job_role=job_role)
    g.job_name = 'gluejobutils_unit_test'
    g.run_job()

def test_init(self):
    g = GlueJob('example/glue_jobs/simple_etl_job/',
                bucket='alpha-everyone',
                job_role='alpha_user_isichei',
                job_arguments={'--test_arg': 'this is a test'})

    self.assertEqual(g.resources, [
        'example/glue_jobs/simple_etl_job/glue_resources/employees.json',
        'example/glue_jobs/shared_job_resources/glue_resources/teams.json'
    ])
    self.assertEqual(g.py_resources, [
        'example/glue_jobs/shared_job_resources/glue_py_resources/my_dummy_utils.zip'
    ])
    self.assertEqual(g.job_name, 'simple_etl_job')
    self.assertEqual(g.bucket, "alpha-everyone")
    self.assertEqual(g.job_role, 'alpha_user_isichei')
    self.assertEqual(g.github_zip_urls, [
        'https://github.com/moj-analytical-services/gluejobutils/archive/master.zip'
    ])
    self.assertEqual(g.job_arguments["--test_arg"], 'this is a test')
    self.assertEqual(g.github_py_resources, [])
    self.assertEqual(g.max_retries, 0)
    self.assertEqual(g.max_concurrent_runs, 1)
    self.assertEqual(g.allocated_capacity, 2)

    g2 = GlueJob('example/glue_jobs/simple_etl_job/',
                 bucket='alpha-everyone',
                 job_role='alpha_user_isichei',
                 include_shared_job_resources=False)

    self.assertEqual(
        g2.resources,
        ['example/glue_jobs/simple_etl_job/glue_resources/employees.json'])
    self.assertEqual(g2.py_resources, [])
    self.assertTrue(
        "_GlueJobs_" in g2.job_arguments['--metadata_base_path'])

import os

from etl_manager.etl import GlueJob


def main():
    iam_role = os.environ["IAM_ROLE"]
    github_tag = os.environ["GITHUB_TAG"]
    snapshot_date = os.environ["SNAPSHOT_DATE"]
    # Bucket used to stage job resources (env var name assumed)
    job_bucket = os.environ["JOB_BUCKET"]

    # Get job parameters for specific glue job
    job_args = {"--github_tag": github_tag, "--snapshot_date": snapshot_date}

    job = GlueJob(
        "glue_jobs/example_job/",
        bucket=job_bucket,
        job_role=iam_role,
        job_arguments=job_args,
    )

    print(f'Starting job "{job.job_name}"...')
    job.run_job()
    job.wait_for_completion(verbose=True)

    if job.job_run_state == 'SUCCEEDED':
        print('Job successful - cleaning up')
        job.cleanup()

import os
import zipfile

from etl_manager.etl import GlueJob


def main(job_role):
    package_name = "gluejobutils"
    to_path = f"test/glue_test/glue_py_resources/{package_name}.zip"

    zf = zipfile.ZipFile(to_path, "w")
    zf.write(os.path.join(package_name, "__init__.py"))
    zf.write(os.path.join(package_name, "datatypes.py"))
    zf.write(os.path.join(package_name, "s3.py"))
    zf.write(os.path.join(package_name, "utils.py"))
    zf.write(os.path.join(package_name, "record_datetimes.py"))
    zf.write(os.path.join(package_name, "df_transforms.py"))
    zf.write(os.path.join(package_name, "data/data_type_conversion.json"))
    zf.close()

    g = GlueJob("test/glue_test/", bucket="alpha-gluejobutils", job_role=job_role)
    g.job_name = "gluejobutils_unit_test"
    g.run_job()
    g.wait_for_completion(True)

    if g.job_run_state == "SUCCEEDED":
        print("cleaning up job...")
        g.cleanup()

def test_db_value_properties(self):
    g = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        job_arguments={"--test_arg": "this is a test"},
    )

    g.job_name = "changed_job_name"
    self.assertEqual(g.job_name, "changed_job_name")

    g.bucket = "new-bucket"
    self.assertEqual(g.bucket, "new-bucket")
    with self.assertRaises(ValueError):
        g.bucket = "s3://new-bucket"

    g.job_role = "alpha_new_user"
    self.assertEqual(g.job_role, "alpha_new_user")

    g.job_arguments = {"--new_args": "something"}
    self.assertEqual(g.job_arguments["--new_args"], "something")

import os

from etl_manager.etl import GlueJob

try:
    ROLE = os.environ["GLUE_ROLE"]
except KeyError:
    raise Exception("You must provide a role name")

bucket = 'alpha-data-linking'

job = GlueJob(
    'match/',
    bucket=bucket,
    job_role=ROLE,
    job_arguments={
        "--test_arg": 'some_string',
        "--conf": 'spark.jars.packages=graphframes:graphframes:0.6.0-spark2.3-s_2.11',
        '--enable-spark-ui': 'true',
        '--spark-event-logs-path': 's3://alpha-data-linking/glue_test_delete/logsdelete',
        '--enable-continuous-cloudwatch-log': 'true'
    })

job.job_name = '1m_p_50_e_6'
print(job._job_definition())

job.allocated_capacity = 2

try:
    job.run_job()
    job.wait_for_completion()
finally:
    job.cleanup()

from etl_manager.etl import GlueJob

job = GlueJob('gluejob/',
              bucket='alpha-mojap-curated-open-data',
              job_role='airflow_osrm_scraper',
              job_arguments={
                  '--test_arg': 'this is a test',
                  '--enable-metrics': ''
              })

job.allocated_capacity = 4

try:
    job.run_job()
    job.wait_for_completion()
finally:
    job.cleanup()

from etl_manager.etl import GlueJob
import os

ROLE = 'name_of_iam_role_for_glue_job_to_assume'
bucket = 's3_bucket_where_job_python_file_and_jar_will_go'

job = GlueJob('glue_job/',
              bucket=bucket,
              job_role=ROLE,
              job_arguments={'--enable-spark-ui': 'true',
                             '--spark-event-logs-path': 's3://path_to_logs',
                             '--enable-continuous-cloudwatch-log': 'true',
                             '--enable-metrics': ''})

job.job_name = 'my_job_name'
job.allocated_capacity = 4

try:
    job.run_job()
    job.wait_for_completion()
finally:
    job.cleanup()

def test_init(self):
    g = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        job_arguments={"--test_arg": "this is a test"},
    )

    self.assertEqual(
        g.resources,
        [
            "example/glue_jobs/simple_etl_job/glue_resources/employees.json",
            "example/glue_jobs/shared_job_resources/glue_resources/teams.json",
        ],
    )
    self.assertEqual(
        g.py_resources,
        [
            "example/glue_jobs/shared_job_resources/glue_py_resources/"
            "my_dummy_utils.zip"
        ],
    )
    self.assertEqual(
        set(g.jars),
        set([
            "example/glue_jobs/simple_etl_job/glue_jars/j1.jar",
            "example/glue_jobs/simple_etl_job/glue_jars/j2.jar",
        ]),
    )
    self.assertEqual(g.job_name, "simple_etl_job")
    self.assertEqual(g.bucket, "alpha-everyone")
    self.assertEqual(g.job_role, "alpha_user_isichei")
    self.assertEqual(
        g.github_zip_urls,
        [
            "https://github.com/moj-analytical-services/gluejobutils/archive/"
            "master.zip"
        ],
    )
    self.assertEqual(g.job_arguments["--test_arg"], "this is a test")
    self.assertEqual(g.github_py_resources, [])
    self.assertEqual(g.max_retries, 0)
    self.assertEqual(g.max_concurrent_runs, 1)
    self.assertEqual(g.allocated_capacity, 2)

    jobdef = g._job_definition()
    self.assertTrue("j2.jar" in jobdef["DefaultArguments"]["--extra-jars"])

    g2 = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        include_shared_job_resources=False,
    )
    self.assertEqual(
        g2.resources,
        ["example/glue_jobs/simple_etl_job/glue_resources/employees.json"],
    )
    self.assertEqual(g2.py_resources, [])
    self.assertTrue(
        "_GlueJobs_" in g2.job_arguments["--metadata_base_path"])

def test_timeout(self):
    g = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        job_arguments={"--test_arg": "this is a test"},
    )
    self.assertEqual(g._job_definition()["Timeout"], 1363)

    g.allocated_capacity = 10
    self.assertEqual(g._job_definition()["Timeout"], 272)

    g.allocated_capacity = 40
    self.assertEqual(g._job_definition()["Timeout"], 68)

    g = GlueJob(
        "example/glue_jobs/simple_etl_job/",
        bucket="alpha-everyone",
        job_role="alpha_user_isichei",
        job_arguments={"--test_arg": "this is a test"},
        timeout_override_minutes=2880,
    )
    g.allocated_capacity = 40
    self.assertEqual(g._job_definition()["Timeout"], 2880)