Code Example #1
    def conversion_actions(self, dynamic_frame):
        from awsglue.transforms import Map

        def combine_datetime(record):
            """Combine two date and time fields into one time field"""
            record['time'] = "%s %s" % (record['date'], record['time'])
            del record['date']
            return record

        mapped_dyf = Map.apply(frame=dynamic_frame, f=combine_datetime)
        return mapped_dyf
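Since Map.apply simply calls the supplied function once per record (each record behaving like a plain dict), the inner combine_datetime can be sanity-checked locally before wiring it into a Glue job. A minimal sketch with made-up field values, assuming the function has been copied out of its enclosing method:

    # combine_datetime copied out of conversion_actions for a local check;
    # the field values are illustrative, not from the original project.
    def combine_datetime(record):
        record['time'] = "%s %s" % (record['date'], record['time'])
        del record['date']
        return record

    assert combine_datetime({'date': '2021-06-01', 'time': '14:30'}) == {'time': '2021-06-01 14:30'}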
Code Example #2
    def conversion_actions(self, dynamic_frame):
        from awsglue.transforms import Map
        from dateutil import parser

        def combine_datetime(record):
            """Parse the funky timestamp because python doesn't support %z"""
            parsed_timestamp = parser.parse(record['time'].replace(':', ' ', 1))
            record['time'] = parsed_timestamp.isoformat()
            return record

        mapped_dyf = Map.apply(frame=dynamic_frame, f=combine_datetime)
        return mapped_dyf
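The replace(':', ' ', 1) call swaps only the first colon for a space, so a timestamp whose date and time portions are joined by a colon becomes a form dateutil parses cleanly. A small sketch of the effect, using a hypothetical raw value since the actual source format is not shown:

    from dateutil import parser

    # Hypothetical raw timestamp with a colon joining the date and time.
    raw = '2018-06-01:13:45:00 -0700'
    parsed = parser.parse(raw.replace(':', ' ', 1))  # '2018-06-01 13:45:00 -0700'
    print(parsed.isoformat())                        # 2018-06-01T13:45:00-07:00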
Code Example #3
    def _cast_timestamps(self, dynamic_frame):
        LOGGER.info("Performing vpc_flow custom conversion action: time conversions")
        from awsglue.transforms import Map
        from datetime import datetime

        # Note that this framework currently only supports string timestamps in the source
        def cast_timestamps(record):
            record['endtime'] = datetime.utcfromtimestamp(int(record['endtime'])).isoformat()
            record['starttime'] = datetime.utcfromtimestamp(int(record['starttime'])).isoformat()
            return record

        mapped_dyf = Map.apply(frame=dynamic_frame, f=cast_timestamps)
        return mapped_dyf
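Note that datetime.utcfromtimestamp is deprecated as of Python 3.12. A timezone-aware variant of the same conversion (a sketch, not from the original project; the ISO output gains an explicit +00:00 offset):

    from datetime import datetime, timezone

    def cast_timestamps(record):
        # fromtimestamp with tz=timezone.utc replaces the deprecated
        # utcfromtimestamp; isoformat() now includes the +00:00 offset.
        for field in ('starttime', 'endtime'):
            record[field] = datetime.fromtimestamp(int(record[field]), tz=timezone.utc).isoformat()
        return record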
Code Example #4
    def _remove_dashes(self, dynamic_frame):
        LOGGER.info("Performing vpc_flow custom conversion action: removing dashes")
        from awsglue.transforms import Map

        def remove_dashes(record):
            for field in ['sourceaddress', 'destinationaddress', 'action']:
                if record[field] == '-':
                    record[field] = None
            
            return record
        
        mapped_dyf = Map.apply(frame=dynamic_frame, f=remove_dashes)
        return mapped_dyf
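If more fields than the three listed can carry the '-' placeholder, a hypothetical generalization (not from the original project) is to null every dash-valued field:

    def remove_dashes(record):
        # Hypothetical variant: null out '-' placeholders in every field.
        for field, value in list(record.items()):
            if value == '-':
                record[field] = None
        return record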
Code Example #5
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Map, ResolveChoice


def process_paintings(paintings: DynamicFrame) -> DynamicFrame:
  # Cast all "bit" fields (LongTypes) into booleans
  # It's easier to use a list of non-bit fields as the majority of fields imported are bit fields
  non_bit_fields = ["episode", "title"]
  bit_fields_specs = [
      (field.name, "cast:boolean")
      for field in paintings.schema()
      if field.name not in non_bit_fields and field.dataType.typeName() == 'long'
  ]
  if not bit_fields_specs:
    return paintings
  paintings_with_bool_fields = ResolveChoice.apply(paintings,
                                                   specs = bit_fields_specs,
                                                   transformation_ctx = "paintings_with_bool_fields")
  paintings_with_parsed_episodes = Map.apply(frame = paintings_with_bool_fields,
                                             f = parse_episode,
                                             transformation_ctx = "paintings_with_parsed_episodes")
  return paintings_with_parsed_episodes
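The parse_episode helper is defined elsewhere in the project and not shown here. As a purely hypothetical sketch, assuming episode values shaped like 'S01E13', it might split the identifier into numeric season and episode fields:

    import re

    # Hypothetical stand-in for the project's parse_episode; the 'S01E13'
    # format and the season/episode_number field names are assumptions.
    def parse_episode(record):
        match = re.match(r'S(\d+)E(\d+)', record['episode'])
        if match:
            record['season'] = int(match.group(1))
            record['episode_number'] = int(match.group(2))
        return record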
Code Example #6
File: mock_emails.py  Project: ustaxcourt/ef-cms
    transformation_ctx='datasource0')


## @type: Map
## @args: [f = map_function, transformation_ctx = "mapped"]
## @return: mapped
## @inputs: [frame = datasource0]
def map_function(dynamicRecord):
    if dynamicRecord['pk'].startswith('user-email|'):
        mock_user_email_key(dynamicRecord)

    find_and_mock_emails(dynamicRecord)
    return dynamicRecord


mapped = Map.apply(frame=datasource0,
                   f=map_function,
                   transformation_ctx='mapped')

## @type: DataSink
## @args: [connection_type="dynamodb", connection_options={"table_name": "destination_table"} ]
## @return: datasink0
## @inputs: [frame = mapped]
datasink0 = glueContext.write_dynamic_frame.from_options(
    frame=mapped,
    connection_type="dynamodb",
    connection_options=OUTPUT_OPTIONS,
    transformation_ctx='datasink0')

job.commit()
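The helpers mock_user_email_key and find_and_mock_emails come from elsewhere in the project and are not shown. A hypothetical sketch of what an email-mocking pass over a record might look like, purely for illustration:

    import re

    EMAIL_RE = re.compile(r'[^@\s]+@[^@\s]+\.[^@\s]+')

    # Hypothetical stand-in for find_and_mock_emails: replace anything that
    # looks like an email address in a string field with a fixed mock value.
    def find_and_mock_emails(record):
        for key, value in record.items():
            if isinstance(value, str) and EMAIL_RE.search(value):
                record[key] = EMAIL_RE.sub('mock.user@example.com', value)
        return record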
Code Example #7
File: glue_ETL.py  Project: facebookresearch/FBPCS
#########################################

# Log starting time
dt_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("Start time:", dt_start)

# read data from s3 directly
dynamic_frame_read = glue_context.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=s3_options,
    format="csv",
    format_options={"withHeader": True},
)  # remaining format_options use their defaults

# process columns
mapped_dynamic_frame_read = Map.apply(frame=dynamic_frame_read,
                                      f=_process_record)

# Convert the dynamic frame to a data frame to use standard PySpark functions
data_frame = mapped_dynamic_frame_read.toDF()

#########################################
### TRANSFORM (MODIFY DATA)
#########################################
### First, check that the expected columns exist; add dummy identifier columns for any that are missing
listColumns = data_frame.columns
expected_column_list = [
    # user_data fields
    "email",
    "phone",
    "device_id",
    "client_ip_address",