def conversion_actions(self, dynamic_frame):
    """Collapse the separate date and time fields into a single 'time' field.

    Each record's 'time' becomes "<date> <time>" and the now-redundant
    'date' field is removed.
    """
    from awsglue.transforms import Map

    def combine_datetime(rec):
        """Combine two date and time fields into one time field."""
        merged = "%s %s" % (rec['date'], rec['time'])
        rec['time'] = merged
        del rec['date']
        return rec

    return Map.apply(frame=dynamic_frame, f=combine_datetime)
def conversion_actions(self, dynamic_frame):
    """Normalise each record's 'time' field to an ISO-8601 string.

    The first ':' in the raw value is swapped for a space before parsing,
    working around the source's non-standard timestamp layout.
    """
    from awsglue.transforms import Map
    from dateutil import parser

    def combine_datetime(rec):
        """Parse the funky timestamp because python doesn't support %z"""
        cleaned = rec['time'].replace(':', ' ', 1)
        rec['time'] = parser.parse(cleaned).isoformat()
        return rec

    return Map.apply(frame=dynamic_frame, f=combine_datetime)
def _cast_timestamps(self, dynamic_frame):
    """Convert the epoch-second 'starttime'/'endtime' fields to ISO strings.

    Each record's value is read as an integer Unix timestamp (UTC) and
    rewritten as a naive ISO-8601 string, matching the output that
    ``datetime.utcfromtimestamp(...).isoformat()`` used to produce.
    """
    LOGGER.info("Performing vpc_flow custom conversion action: time conversions")
    from awsglue.transforms import Map
    from datetime import datetime, timezone

    # Note that this framework currently only supports string timestamps in the source
    def cast_timestamps(record):
        # datetime.utcfromtimestamp() is deprecated since Python 3.12; build a
        # timezone-aware UTC datetime instead, then drop the tzinfo so the
        # isoformat() result is unchanged (no "+00:00" suffix appears).
        for field in ('endtime', 'starttime'):
            ts = datetime.fromtimestamp(int(record[field]), tz=timezone.utc)
            record[field] = ts.replace(tzinfo=None).isoformat()
        return record

    mapped_dyf = Map.apply(frame=dynamic_frame, f=cast_timestamps)
    return mapped_dyf
def _remove_dashes(self, dynamic_frame):
    """Replace placeholder '-' values with None in selected flow-log fields."""
    LOGGER.info("Performing vpc_flow custom conversion action: removing dashes")
    from awsglue.transforms import Map

    dash_fields = ('sourceaddress', 'destinationaddress', 'action')

    def remove_dashes(rec):
        # A lone dash means "no value" in VPC flow logs; null it out.
        for name in dash_fields:
            if rec[name] == '-':
                rec[name] = None
        return rec

    return Map.apply(frame=dynamic_frame, f=remove_dashes)
def process_paintings(paintings: DynamicFrame) -> DynamicFrame:
    """Cast imported "bit" columns to booleans, then parse episode info.

    Every long-typed column other than the known non-bit ones is treated as
    a bit field and resolved to boolean; afterwards ``parse_episode`` is
    applied to each record. If no bit columns are found, the frame is
    returned untouched.
    """
    # It's easier to use a list of non-bit fields as the majority of fields imported are bit fields
    non_bit_fields = {"episode", "title"}

    bit_fields_specs = []
    for field in paintings.schema():
        if field.name in non_bit_fields:
            continue
        if field.dataType.typeName() == 'long':
            bit_fields_specs.append((field.name, "cast:boolean"))

    if not bit_fields_specs:
        return paintings

    paintings_with_bool_fields = ResolveChoice.apply(
        paintings,
        specs=bit_fields_specs,
        transformation_ctx="paintings_with_bool_fields",
    )
    return Map.apply(
        frame=paintings_with_bool_fields,
        f=parse_episode,
        transformation_ctx="paintings_with_parsed_episodes",
    )
transformation_ctx='datasource0')
## @type: Map
## @args: [f = map_function, transformation_ctx = "mapped"]
## @return: mapped
## @inputs: [frame = datasource0]
def map_function(dynamicRecord):
    # Scrub PII: records keyed as user-email items get their key mocked,
    # then any embedded e-mail addresses are mocked as well.
    # (mock_user_email_key / find_and_mock_emails are defined elsewhere
    # in this file — both appear to mutate the record in place.)
    if dynamicRecord['pk'].startswith('user-email|'):
        mock_user_email_key(dynamicRecord)
    find_and_mock_emails(dynamicRecord)
    return dynamicRecord


mapped = Map.apply(frame=datasource0, f=map_function, transformation_ctx='mapped')
## @type: DataSink
## @args: [connection_type="dynamodb", connection_options={"table_name": "destination_table"} ]
## @return: datasink0
## @inputs: [frame = mapped]
# NOTE(review): the @args annotation above mentions "destination_table" but the
# actual sink options come from OUTPUT_OPTIONS defined elsewhere — confirm they agree.
datasink0 = glueContext.write_dynamic_frame.from_options(
    frame=mapped,
    connection_type="dynamodb",
    connection_options=OUTPUT_OPTIONS,
    transformation_ctx='datasink0')
job.commit()
######################################### # Log starting time dt_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S") print("Start time:", dt_start) # read data from s3 directly dynamic_frame_read = glue_context.create_dynamic_frame.from_options( connection_type="s3", connection_options=s3_options, format="csv", format_options={"withHeader": True}, ) # format_options go by default # process columns mapped_dynamic_frame_read = Map.apply(frame=dynamic_frame_read, f=_process_record) # #Convert dynamic frame to data frame to use standard pyspark functions data_frame = mapped_dynamic_frame_read.toDF() ######################################### ### TRANSFORM (MODIFY DATA) ######################################### ### first, check column existence, if not, add dummy identifier columns listColumns = data_frame.columns expected_column_list = [ # user_data fields "email", "phone", "device_id", "client_ip_address",