def main():
    """Run the ``pq_flattener`` Glue job for the date given on the command line.

    The date is taken verbatim from the first command-line argument and
    forwarded to the job as ``--date``. The job bucket, IAM role, source
    path and destination path are read from the ``PQ_FLATTENER_*``
    environment variables; a missing variable raises ``KeyError``.

    When no argument is supplied a usage line is printed and the process
    exits with status 1. ``job.cleanup()`` is always called once the job
    object exists, whether or not the run succeeds.
    """
    argv = sys.argv
    if len(argv) == 1:
        print(f"Usage: {argv[0]} YYYY-mm-dd")
        sys.exit(1)

    run_date = argv[1]

    env = os.environ
    bucket_name = env["PQ_FLATTENER_GLUE_JOB_BUCKET"]
    role_name = env["PQ_FLATTENER_JOB_IAM_ROLE"]
    s3_source = env["PQ_FLATTENER_SOURCE_PATH"]
    s3_dest = env["PQ_FLATTENER_DEST_PATH"]

    glue_arguments = {
        "--date": run_date,
        "--s3_source": s3_source,
        "--s3_dest": s3_dest,
    }
    job = GlueJob(
        "v1/glue_jobs/pq_flattener",
        bucket=bucket_name,
        job_role=role_name,
        job_arguments=glue_arguments,
    )
    job.job_name = "pq_flattener"

    # Run job on AWS Glue
    print(f'Starting job "{job.job_name}"...')

    try:
        job.run_job()
        job.wait_for_completion()
    finally:
        # Clean up regardless of how the run ended.
        job.cleanup()
Example #2
0
 def test_glue_param_error(self):
     """Assigning unacceptable ``job_arguments`` must raise ``ValueError``.

     The rejected values are a bare string, a dict whose key lacks the
     leading ``--``, and a dict that uses the ``--JOB_NAME`` key.
     """
     job = GlueJob(
         'example/glue_jobs/simple_etl_job/',
         bucket='alpha-everyone',
         job_role='alpha_user_isichei',
         job_arguments={'--test_arg': 'this is a test'},
     )

     rejected_values = (
         '--bad_job_argument1',
         {'bad_job_argument2': 'test'},
         {"--JOB_NAME": "new_job_name"},
     )
     # Checked in the listed order; the first value that is accepted
     # fails the test.
     for rejected in rejected_values:
         with self.assertRaises(ValueError):
             job.job_arguments = rejected
Example #3
0
 def test_glue_param_error(self):
     """Check that the ``job_arguments`` setter rejects bad input.

     Three assignments are attempted and each one is expected to raise
     ``ValueError``: a plain string, a dict keyed without the ``--``
     prefix, and a dict containing ``--JOB_NAME``.
     """
     glue_job = GlueJob(
         "example/glue_jobs/simple_etl_job/",
         bucket="alpha-everyone",
         job_role="alpha_user_isichei",
         job_arguments={"--test_arg": "this is a test"},
     )

     invalid_arguments = [
         "--bad_job_argument1",
         {"bad_job_argument2": "test"},
         {"--JOB_NAME": "new_job_name"},
     ]
     for candidate in invalid_arguments:
         with self.assertRaises(ValueError):
             glue_job.job_arguments = candidate
def main(job_role):
    """Zip the ``gluejobutils`` package and start the Glue unit-test job.

    The archive is written to
    ``test/glue_test/glue_py_resources/gluejobutils.zip`` (relative to the
    current working directory) and the member paths are likewise resolved
    relative to the current working directory.

    Args:
        job_role: Value passed to ``GlueJob`` as ``job_role``.

    Raises:
        FileNotFoundError: If the output directory or one of the package
            files does not exist.
    """
    package_name = 'gluejobutils'
    to_path = f'test/glue_test/glue_py_resources/{package_name}.zip'
    members = (
        '__init__.py',
        'datatypes.py',
        'dates.py',
        's3.py',
        'utils.py',
        'dea_record_datetimes.py',
        'data/data_type_conversion.json',
    )

    # The context manager closes the archive even when a member is missing,
    # so the file handle is never leaked on failure.
    with zipfile.ZipFile(to_path, "w") as zf:
        for member in members:
            zf.write(os.path.join(package_name, member))

    g = GlueJob('test/glue_test/',
                bucket='alpha-gluejobutils',
                job_role=job_role)
    g.job_name = 'gluejobutils_unit_test'
    g.run_job()
Example #5
0
    def test_init(self):
        """Check the attributes of a ``GlueJob`` built from the example folder.

        A first job is created with job arguments and its discovered
        resources, naming, role, GitHub URLs and default numeric settings
        are compared with fixed expectations. A second job is created with
        ``include_shared_job_resources=False`` and is expected to list only
        the job's own resources.
        """
        job_folder = 'example/glue_jobs/simple_etl_job/'
        shared_folder = 'example/glue_jobs/shared_job_resources/'

        g = GlueJob(
            job_folder,
            bucket='alpha-everyone',
            job_role='alpha_user_isichei',
            job_arguments={'--test_arg': 'this is a test'},
        )

        expected_resources = [
            job_folder + 'glue_resources/employees.json',
            shared_folder + 'glue_resources/teams.json',
        ]
        self.assertEqual(g.resources, expected_resources)

        expected_py_resources = [
            shared_folder + 'glue_py_resources/my_dummy_utils.zip',
        ]
        self.assertEqual(g.py_resources, expected_py_resources)

        self.assertEqual(g.job_name, 'simple_etl_job')
        self.assertEqual(g.bucket, "alpha-everyone")
        self.assertEqual(g.job_role, 'alpha_user_isichei')

        expected_zip_urls = [
            'https://github.com/moj-analytical-services/gluejobutils/archive/master.zip',
        ]
        self.assertEqual(g.github_zip_urls, expected_zip_urls)

        self.assertEqual(g.job_arguments["--test_arg"], 'this is a test')
        self.assertEqual(g.github_py_resources, [])
        self.assertEqual(g.max_retries, 0)
        self.assertEqual(g.max_concurrent_runs, 1)
        self.assertEqual(g.allocated_capacity, 2)

        # Second job: shared job resources are switched off.
        g2 = GlueJob(
            job_folder,
            bucket='alpha-everyone',
            job_role='alpha_user_isichei',
            include_shared_job_resources=False,
        )
        own_resources = [job_folder + 'glue_resources/employees.json']
        self.assertEqual(g2.resources, own_resources)
        self.assertEqual(g2.py_resources, [])

        metadata_base_path = g2.job_arguments['--metadata_base_path']
        self.assertTrue("_GlueJobs_" in metadata_base_path)
Example #6
0
def main():
    """Run the example Glue job using settings from the environment.

    Reads ``IAM_ROLE``, ``GITHUB_TAG`` and ``SNAPSHOT_DATE`` from the
    environment (a missing variable raises ``KeyError``), starts the job,
    waits for it verbosely, and cleans up only when the final run state is
    ``SUCCEEDED``.
    """
    env = os.environ
    iam_role = env["IAM_ROLE"]
    github_tag = env["GITHUB_TAG"]
    snapshot_date = env["SNAPSHOT_DATE"]

    # Get job parameters for specific glue job
    job_args = {
        "--github_tag": github_tag,
        "--snapshot_date": snapshot_date,
    }

    # NOTE(review): job_bucket is not defined inside this function;
    # presumably it is a module-level name - confirm.
    job = GlueJob(
        "glue_jobs/example_job/",
        bucket=job_bucket,
        job_role=iam_role,
        job_arguments=job_args,
    )

    print(f'Starting job "{job.job_name}"...')
    job.run_job()
    job.wait_for_completion(verbose=True)

    # Anything other than a successful run is left in place.
    if job.job_run_state != 'SUCCEEDED':
        return

    print('Job successful - cleaning up')
    job.cleanup()
def main(job_role):
    """Zip ``gluejobutils``, run the Glue unit-test job and tidy up on success.

    The archive is written to
    ``test/glue_test/glue_py_resources/gluejobutils.zip`` (relative to the
    current working directory). After the job has been started this waits
    for completion and calls ``cleanup()`` only when the run state is
    ``SUCCEEDED``.

    Args:
        job_role: Value passed to ``GlueJob`` as ``job_role``.

    Raises:
        FileNotFoundError: If the output directory or one of the package
            files does not exist.
    """
    package_name = "gluejobutils"
    to_path = f"test/glue_test/glue_py_resources/{package_name}.zip"
    members = (
        "__init__.py",
        "datatypes.py",
        "s3.py",
        "utils.py",
        "record_datetimes.py",
        "df_transforms.py",
        "data/data_type_conversion.json",
    )

    # Using the archive as a context manager guarantees it is closed even
    # if adding one of the members fails.
    with zipfile.ZipFile(to_path, "w") as zf:
        for member in members:
            zf.write(os.path.join(package_name, member))

    g = GlueJob("test/glue_test/",
                bucket="alpha-gluejobutils",
                job_role=job_role)
    g.job_name = "gluejobutils_unit_test"
    g.run_job()

    g.wait_for_completion(True)
    if g.job_run_state == "SUCCEEDED":
        print("cleaning up job...")
        g.cleanup()
Example #8
0
def main(job_role):
    """Package ``gluejobutils`` into a zip and run the Glue unit-test job.

    The zip is created at
    ``test/glue_test/glue_py_resources/gluejobutils.zip`` (relative to the
    current working directory). Once the job is started the function waits
    for it to finish and calls ``cleanup()`` only if the run state is
    ``SUCCEEDED``.

    Args:
        job_role: Value passed to ``GlueJob`` as ``job_role``.

    Raises:
        FileNotFoundError: If the output directory or one of the package
            files does not exist.
    """
    package_name = 'gluejobutils'
    to_path = f'test/glue_test/glue_py_resources/{package_name}.zip'
    members = (
        '__init__.py',
        'datatypes.py',
        's3.py',
        'utils.py',
        'record_datetimes.py',
        'df_transforms.py',
        'data/data_type_conversion.json',
    )

    # The with-block closes the archive on every exit path, so a missing
    # member cannot leave the file handle open.
    with zipfile.ZipFile(to_path, "w") as zf:
        for member in members:
            zf.write(os.path.join(package_name, member))

    g = GlueJob('test/glue_test/',
                bucket='alpha-gluejobutils',
                job_role=job_role)
    g.job_name = 'gluejobutils_unit_test'
    g.run_job()

    g.wait_for_completion(True)
    if g.job_run_state == 'SUCCEEDED':
        print("cleaning up job...")
        g.cleanup()
Example #9
0
    def test_db_value_properties(self):
        """Set several ``GlueJob`` properties and read each one back.

        Covers ``job_name``, ``bucket`` (including the expectation that an
        ``s3://``-prefixed value raises ``ValueError``), ``job_role`` and
        ``job_arguments``.
        """
        job = GlueJob(
            'example/glue_jobs/simple_etl_job/',
            bucket='alpha-everyone',
            job_role='alpha_user_isichei',
            job_arguments={'--test_arg': 'this is a test'},
        )

        # job_name round-trips.
        new_name = 'changed_job_name'
        job.job_name = new_name
        self.assertEqual(job.job_name, new_name)

        # bucket round-trips, and the s3:// form is expected to be refused.
        new_bucket = "new-bucket"
        job.bucket = new_bucket
        self.assertEqual(job.bucket, new_bucket)
        with self.assertRaises(ValueError):
            job.bucket = "s3://new-bucket"

        # job_role round-trips.
        new_role = 'alpha_new_user'
        job.job_role = new_role
        self.assertEqual(job.job_role, new_role)

        # A replacement arguments dict is readable by key afterwards.
        job.job_arguments = {"--new_args": "something"}
        self.assertEqual(job.job_arguments["--new_args"], "something")
Example #10
0
    def test_db_value_properties(self):
        """Verify that assignable ``GlueJob`` properties keep their new values.

        ``job_name``, ``bucket``, ``job_role`` and ``job_arguments`` are each
        reassigned and read back; assigning a bucket written as
        ``s3://new-bucket`` is expected to raise ``ValueError``.
        """
        glue_job = GlueJob(
            "example/glue_jobs/simple_etl_job/",
            bucket="alpha-everyone",
            job_role="alpha_user_isichei",
            job_arguments={"--test_arg": "this is a test"},
        )

        renamed = "changed_job_name"
        glue_job.job_name = renamed
        self.assertEqual(glue_job.job_name, renamed)

        plain_bucket = "new-bucket"
        glue_job.bucket = plain_bucket
        self.assertEqual(glue_job.bucket, plain_bucket)
        # The same bucket with an s3:// scheme is expected to be rejected.
        with self.assertRaises(ValueError):
            glue_job.bucket = "s3://new-bucket"

        other_role = "alpha_new_user"
        glue_job.job_role = other_role
        self.assertEqual(glue_job.job_role, other_role)

        replacement_arguments = {"--new_args": "something"}
        glue_job.job_arguments = replacement_arguments
        self.assertEqual(glue_job.job_arguments["--new_args"], "something")
Example #11
0
import os

# Script: configure and run the 'match/' Glue job, then clean it up.
#
# NOTE(review): GlueJob is used below but is not imported in this snippet;
# presumably `from etl_manager.etl import GlueJob` is intended - confirm.

# The IAM role comes from the GLUE_ROLE environment variable. Only the
# missing-variable case is translated into the friendlier error; the
# original KeyError is kept as the cause.
try:
    ROLE = os.environ["GLUE_ROLE"]
except KeyError as err:
    raise Exception("You must provide a role name") from err

bucket = 'alpha-data-linking'

job = GlueJob(
    'match/',
    bucket=bucket,
    job_role=ROLE,
    job_arguments={
        "--test_arg": 'some_string',
        "--conf":
        'spark.jars.packages=graphframes:graphframes:0.6.0-spark2.3-s_2.11',
        '--enable-spark-ui': 'true',
        '--spark-event-logs-path':
        's3://alpha-data-linking/glue_test_delete/logsdelete',
        '--enable-continuous-cloudwatch-log': 'true'
    })

job.job_name = '1m_p_50_e_6'
print(job._job_definition())

job.allocated_capacity = 2

# A `try` needs a handler to be valid Python; cleaning up in `finally`
# mirrors the other GlueJob run scripts, which always call cleanup().
try:
    job.run_job()
    job.wait_for_completion()
finally:
    job.cleanup()
from etl_manager.etl import GlueJob

# Script: build the 'gluejob/' Glue job, run it with an allocated capacity
# of 4 and always clean up afterwards.
job = GlueJob(
    'gluejob/',
    bucket='alpha-mojap-curated-open-data',
    job_role='airflow_osrm_scraper',
    job_arguments={
        '--test_arg': 'this is a test',
        '--enable-metrics': '',
    },
)
job.allocated_capacity = 4

try:
    # Start the run, then wait for it to complete.
    job.run_job()
    job.wait_for_completion()
finally:
    # Executed whether or not the run raised.
    job.cleanup()
Example #13
0
from etl_manager.etl import GlueJob
import os

# Template script: replace the placeholder role, bucket and log path before
# running. The job is cleaned up whether or not the run succeeds.
ROLE = 'name_of_iam_role_for_glue_job_to_assume'
bucket = 's3_bucket_where_job_python_file_and_jar_will_go'

job = GlueJob(
    'glue_job/',
    bucket=bucket,
    job_role=ROLE,
    job_arguments={
        '--enable-spark-ui': 'true',
        '--spark-event-logs-path': 's3://path_to_logs',
        '--enable-continuous-cloudwatch-log': 'true',
        '--enable-metrics': '',
    },
)
job.job_name = 'my_job_name'
job.allocated_capacity = 4

try:
    # Start the run, then wait for it to complete.
    job.run_job()
    job.wait_for_completion()
finally:
    job.cleanup()
Example #14
0
    def test_init(self):
        """Check the attributes of a ``GlueJob`` built from the example folder.

        The first job is expected to expose the example resources, Python
        resources and jars, the fixed name, bucket, role and GitHub URL, and
        the default retry, concurrency and capacity values; its job
        definition must mention ``j2.jar`` under ``--extra-jars``. A second
        job built with ``include_shared_job_resources=False`` is expected to
        list only the job's own resources.
        """
        job_folder = "example/glue_jobs/simple_etl_job/"
        shared_folder = "example/glue_jobs/shared_job_resources/"

        g = GlueJob(
            job_folder,
            bucket="alpha-everyone",
            job_role="alpha_user_isichei",
            job_arguments={"--test_arg": "this is a test"},
        )

        expected_resources = [
            job_folder + "glue_resources/employees.json",
            shared_folder + "glue_resources/teams.json",
        ]
        self.assertEqual(g.resources, expected_resources)

        expected_py_resources = [
            shared_folder + "glue_py_resources/my_dummy_utils.zip",
        ]
        self.assertEqual(g.py_resources, expected_py_resources)

        # Jars are compared as sets, so their order is not significant.
        expected_jars = {
            job_folder + "glue_jars/j1.jar",
            job_folder + "glue_jars/j2.jar",
        }
        self.assertEqual(set(g.jars), expected_jars)

        self.assertEqual(g.job_name, "simple_etl_job")
        self.assertEqual(g.bucket, "alpha-everyone")
        self.assertEqual(g.job_role, "alpha_user_isichei")

        expected_zip_urls = [
            "https://github.com/moj-analytical-services/gluejobutils/archive/"
            "master.zip",
        ]
        self.assertEqual(g.github_zip_urls, expected_zip_urls)

        self.assertEqual(g.job_arguments["--test_arg"], "this is a test")
        self.assertEqual(g.github_py_resources, [])
        self.assertEqual(g.max_retries, 0)
        self.assertEqual(g.max_concurrent_runs, 1)
        self.assertEqual(g.allocated_capacity, 2)

        extra_jars = g._job_definition()["DefaultArguments"]["--extra-jars"]
        self.assertTrue("j2.jar" in extra_jars)

        # Second job: shared job resources are switched off.
        g2 = GlueJob(
            job_folder,
            bucket="alpha-everyone",
            job_role="alpha_user_isichei",
            include_shared_job_resources=False,
        )
        own_resources = [job_folder + "glue_resources/employees.json"]
        self.assertEqual(g2.resources, own_resources)
        self.assertEqual(g2.py_resources, [])

        metadata_base_path = g2.job_arguments["--metadata_base_path"]
        self.assertTrue("_GlueJobs_" in metadata_base_path)
Example #15
0
    def test_timeout(self):
        """Check the ``Timeout`` entry of the job definition.

        A freshly built job is expected to report 1363; after setting
        ``allocated_capacity`` to 10 and then 40 the expected values are 272
        and 68. A job built with ``timeout_override_minutes=2880`` is
        expected to report 2880 even with ``allocated_capacity`` set to 40.
        """

        def build_job(**extra):
            # Fresh arguments on every call so the two jobs share no state.
            return GlueJob(
                "example/glue_jobs/simple_etl_job/",
                bucket="alpha-everyone",
                job_role="alpha_user_isichei",
                job_arguments={"--test_arg": "this is a test"},
                **extra,
            )

        g = build_job()
        self.assertEqual(g._job_definition()["Timeout"], 1363)

        for capacity, expected_timeout in ((10, 272), (40, 68)):
            g.allocated_capacity = capacity
            self.assertEqual(g._job_definition()["Timeout"], expected_timeout)

        g = build_job(timeout_override_minutes=2880)
        g.allocated_capacity = 40
        self.assertEqual(g._job_definition()["Timeout"], 2880)