from gluejobutils.s3 import read_json_from_s3
# read_json and create_spark_schema_from_metadata are defined elsewhere in gluejobutils


def create_spark_schema_from_metadata_file(filepath, drop_columns=[], non_nullable_cols=[]):
    """
    Creates a spark schema from a json file that is a metadata dictionary.
    If filepath starts with s3:// the function assumes it is an S3 file,
    otherwise it tries to read the file from the local directory.
    """
    if filepath.startswith('s3://'):
        metadata = read_json_from_s3(filepath)
    else:
        metadata = read_json(filepath)
    return create_spark_schema_from_metadata(
        metadata, drop_columns=drop_columns, non_nullable_cols=non_nullable_cols)
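For illustration (a hypothetical usage sketch, not taken from the library's docs — the bucket, file, and column names here are assumptions), the returned schema can be passed straight to a Spark reader:

# Hypothetical example: build a schema from a metadata file and use it to
# read a CSV with enforced types. Paths and column names are made up.
schema = create_spark_schema_from_metadata_file(
    's3://my-bucket/meta_data/employees.json',
    drop_columns=['internal_id'],        # assumed column to exclude
    non_nullable_cols=['employee_id'])   # assumed mandatory column
df = spark.read.csv('s3://my-bucket/employees/', schema=schema, header=True)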
import os
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

from gluejobutils.s3 import read_json_from_s3

args = getResolvedOptions(sys.argv, ["JOB_NAME", "metadata_path", "test_arg"])

print("JOB SPECS...")
print("JOB_NAME: ", args["JOB_NAME"])
print("test argument: ", args["test_arg"])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Read in the table metadata from S3
meta_employees = read_json_from_s3(os.path.join(args["metadata_path"], "employees.json"))
meta_teams = read_json_from_s3(os.path.join(args["metadata_path"], "teams.json"))

# Register the raw CSVs as temp views so they can be queried with SQL
spark.read.csv("s3://data_bucket/employees/", header=True).createOrReplaceTempView("emp")
spark.read.csv("s3://data_bucket/teams/", header=True).createOrReplaceTempView("team")

# USING (rather than ON) keeps a single employee_id column in the output
df = spark.sql("SELECT * FROM emp LEFT JOIN team USING (employee_id)")
df.write.parquet("s3://data_bucket/join/")

job.commit()
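A run of this job can be triggered from outside Glue; as a sketch (the job name is an assumption), boto3's start_job_run passes each "--key" in Arguments through to getResolvedOptions as "key":

import boto3

glue = boto3.client("glue")
glue.start_job_run(
    JobName="example-glue-job",  # assumed name; use whatever the job is registered as
    Arguments={
        "--metadata_path": "s3://data_bucket/meta_data/",
        "--test_arg": "some_value",
    })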
from gluejobutils import s3

b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data/diamonds_csv/diamonds.csv')
if b != 'alpha-gluejobutils' or o != 'testing/data/diamonds_csv/diamonds.csv':
    raise ValueError('s3_path_to_bucket_key FAILURE')

b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data')
if b != 'alpha-gluejobutils' or o != 'testing/data':
    raise ValueError('s3_path_to_bucket_key FAILURE')

b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data/')
if b != 'alpha-gluejobutils' or o != 'testing/data/':
    raise ValueError('s3_path_to_bucket_key FAILURE')
print("===> s3_path_to_bucket_key ===> OK")

### ### ### ### ### ### ### ###
### read_json_from_s3 ###
### ### ### ### ### ### ### ###
test_json = s3.read_json_from_s3('s3://alpha-gluejobutils/testing/meta_data/diamonds.json')
expected_keys = {'$schema', 'name', 'description', 'data_format',
                 'columns', 'partitions', 'location'}
if len(expected_keys.difference(test_json.keys())) != 0:
    raise ValueError('read_json_from_s3 FAILURE')
print("===> read_json_from_s3 ===> OK")

### ### ### ### ### ### ###
### write_json_to_s3 ###
### ### ### ### ### ### ###
json_data = {'a': 'dog', 'b': 14, 'c': [1, 2, 3], 'd': {'cat': 'alpha'}}
s3.write_json_to_s3(json_data, 's3://alpha-gluejobutils/testing/data_dump/test1.json')
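The tests above pin down the expected behaviour of s3_path_to_bucket_key; a minimal sketch consistent with them (not necessarily the library's actual implementation) is:

def s3_path_to_bucket_key(s3_path):
    # Illustrative only: drop the 's3://' scheme, then split on the first '/';
    # everything before it is the bucket, everything after (slashes kept) is the key.
    bucket, _, key = s3_path.replace('s3://', '', 1).partition('/')
    return bucket, key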
import os
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

from gluejobutils.s3 import read_json_from_s3

args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'metadata_base_path', 'github_tag', 'snapshot_date'])

# Good practice to print out arguments for debugging
print("JOB SPECS...")
print("JOB_NAME: ", args["JOB_NAME"])
print("metadata_base_path: ", args["metadata_base_path"])
print("GITHUB_TAG: ", args["github_tag"])
print("SNAPSHOT_DATE: ", args["snapshot_date"])

# Init your spark script
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read in meta data
database_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/database.json"))
random_postcodes_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/random_postcodes.json"))
calculated_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/calculated.json"))

# Read in the data
spark.read.json('s3://mojap-raw-hist/open_data/postcodes_example/').createOrReplaceTempView('postcodes')

# Do some spark transforms (not much to do here, so let's just add an extra field)
postcodes = spark.sql("""
SELECT *, '{}' AS dea_version
FROM postcodes
""".format(args['github_tag']))
postcodes.createOrReplaceTempView('postcodes')

print(postcodes.columns)
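A job like this would typically finish by writing the transformed table back to S3 and committing; a sketch under assumed output settings (the path and format are not from the original script):

# Hypothetical ending: persist the result and commit the Glue job.
postcodes.write.mode('overwrite').parquet('s3://data_bucket/curated/postcodes/')
job.commit()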