def mappingForAll(dynamicFrame, mapping):
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping

def mappingForAll(dynamicFrame, mapping):
    # for k, v in add_collum.items():
    #     df_communication = df_communication.withColumn(k, v)
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping

def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()
    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)
    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext, 'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2")
    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping

def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)
    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2")
    dyf_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_mapping

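The mappingForAll variants above all take a Glue mapping list of (source_name, source_type, target_name, target_type) tuples and return a cleaned DynamicFrame. The following is a minimal usage sketch, not part of the original jobs: the glueContext setup, the sample_db/sample_table catalog entry, the mapping tuples, and the s3://example-bucket/out/ path are hypothetical placeholders.

# Hypothetical usage sketch for mappingForAll; catalog names, mapping, and S3 path are assumptions.
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.transforms import ApplyMapping, ResolveChoice, DropNullFields
from awsglue.dynamicframe import DynamicFrame

glueContext = GlueContext(SparkContext.getOrCreate())

# Read the source table from the Glue Data Catalog
source_dyf = glueContext.create_dynamic_frame.from_catalog(
    database="sample_db",
    table_name="sample_table",
)

# Each tuple is (source_name, source_type, target_name, target_type)
mapping = [
    ("id", "string", "id", "string"),
    ("created_at", "string", "created_at", "timestamp"),
]

mapped_dyf = mappingForAll(source_dyf, mapping)

# Write the mapped, de-nulled frame out as parquet
glueContext.write_dynamic_frame.from_options(
    frame=mapped_dyf,
    connection_type="s3",
    connection_options={"path": "s3://example-bucket/out/"},
    format="parquet",
)
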
def run(self):
    """Extract data from the data catalog and convert it to parquet, partitioning it along the way"""
    from awsglue.transforms import DropNullFields
    from awsglue.dynamicframe import DynamicFrame

    # Retrieve the source data from the Glue catalog
    source_data = self.glue_context.create_dynamic_frame.from_catalog(
        database=self.data_catalog.get_database_name(),
        table_name=self.data_catalog.get_table_name(),
        push_down_predicate=self.data_catalog.get_push_down_predicate(),
        transformation_ctx="source_data"
    )

    # Perform any data-source-specific conversions
    optimized_transforms = self.optimized_catalog.conversion_actions(source_data)

    # Remove nulls and convert to dataframe - dataframe is only needed for replacing the date partitions.
    # It was previously used to repartition, but Glue supports that now.
    drop_nulls = DropNullFields.apply(frame=optimized_transforms, transformation_ctx="drop_nulls")
    data_frame = drop_nulls.toDF()

    # We might have no data - if that's the case, short-circuit
    if not data_frame.head(1):
        LOGGER.info("No data returned, skipping conversion.")
        return

    # Create Y-m-d partitions out of the optimized table's timestamp field
    df_partitions = self._replace_date_partitions(data_frame, self.data_catalog.timestamp_field())

    # DataFrame runs out of memory for large datasets.
    # Convert back to a DynamicFrame for further processing.
    partitioned_dynamicframe = DynamicFrame.fromDF(df_partitions, self.glue_context, "partitioned_dynamicframe")

    # Write out to partitioned parquet. We repartition to reduce the number of files to optimize Athena performance.
    # Athena queries will slow down even at 1,000 files, so we tradeoff having large files per partition rather
    # than many small files.
    self.glue_context.write_dynamic_frame.from_options(
        frame=partitioned_dynamicframe,
        connection_type="s3",
        connection_options={
            "path": self.optimized_catalog.get_s3_location(),
            "partitionKeys": ['region', 'year', 'month', 'day']
        },
        format="parquet")

def run(self):
    """Extract data from the data catalog and convert it to parquet, partitioning it along the way"""
    from awsglue.transforms import DropNullFields

    # Retrieve the source data from the Glue catalog
    source_data = self.glue_context.create_dynamic_frame.from_catalog(
        database=self.data_catalog.get_database_name(),
        table_name=self.data_catalog.get_table_name(),
        transformation_ctx="source_data"
    )

    # Perform any data-source-specific conversions
    optimized_transforms = self.optimized_catalog.conversion_actions(source_data)

    # Remove nulls and convert to dataframe - dataframe is only needed for replacing the date partitions.
    # It was previously used to repartition, but Glue supports that now.
    drop_nulls = DropNullFields.apply(frame=optimized_transforms, transformation_ctx="drop_nulls")
    data_frame = drop_nulls.toDF()

    # We might have no data - if that's the case, short-circuit
    if not data_frame.head(1):
        LOGGER.info("No data returned, skipping conversion.")
        return

    # Create Y-m-d partitions out of the optimized table's timestamp field
    df_partitions = self._replace_date_partitions(data_frame, self.data_catalog.timestamp_field())

    # Write out to partitioned parquet. We repartition to reduce the number of files to optimize Athena performance.
    # Athena queries will slow down even at 1,000 files, so we tradeoff having large files per partition rather
    # than many small files.
    (
        df_partitions
        .repartition(*self._partition_columns())
        .write
        .mode('append')
        .partitionBy(*self._partition_columns())
        .parquet(self.optimized_catalog.get_s3_location())
    )

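Both run() variants rely on helpers that are not shown here: self._replace_date_partitions() (described by the comment as creating Y-m-d partitions from the timestamp field) and, in the second variant, self._partition_columns(). Below is a plausible sketch of what those helpers might look like, written as standalone functions rather than methods; the derived column names, string formats, and the omission of the 'region' column (which the first variant also partitions on and presumably already exists in the source data) are assumptions, not the original implementation.

# Hypothetical sketch of the helpers referenced above; names and formats are assumptions.
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, date_format

def _partition_columns():
    # Partition columns used when repartitioning and writing the parquet output
    return ["year", "month", "day"]

def _replace_date_partitions(data_frame: DataFrame, timestamp_field: str) -> DataFrame:
    # Derive year/month/day string columns from the timestamp field so the writer
    # can lay the data out as year=YYYY/month=MM/day=DD prefixes
    return (
        data_frame
        .withColumn("year", date_format(col(timestamp_field), "yyyy"))
        .withColumn("month", date_format(col(timestamp_field), "MM"))
        .withColumn("day", date_format(col(timestamp_field), "dd"))
    )
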
("weight", "string", "weight", "string"), ], transformation_ctx="applymapping1", ) ## @type: ResolveChoice ## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"] ## @return: resolvechoice2 ## @inputs: [frame = applymapping1] resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2") ## @type: DropNullFields ## @args: [transformation_ctx = "dropnullfields3"] ## @return: dropnullfields3 ## @inputs: [frame = resolvechoice2] dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3") ## @type: DataSink ## @args: [catalog_connection = "dotz_connection", connection_options = {"dbtable": "comp_boss_csv", "database": "dotz_challenge"}, transformation_ctx = "datasink4"] ## @return: datasink4 ## @inputs: [frame = dropnullfields3] datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields3, catalog_connection="dotz_connection", connection_options={ "dbtable": "comp_boss", "database": "dotz_challenge" }, transformation_ctx="datasink4", ) job.commit()
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(database=output_database, table_name=tmp_table, transformation_ctx='dyf')

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(frame=dyf, choice='make_struct', transformation_ctx='resolvechoice')

# Drop null fields
dyf = DropNullFields.apply(frame=dyf, transformation_ctx='dropnullfields')

# Apply mapping into Timestamp based on specified timestamp column
if timestamp_column_name:
    # Add temp column from specified timestamp column
    tmp_timestamp_column_name = f'tmp_{timestamp_column_name}'
    df = dyf.toDF()
    # Assign the result of withColumn, otherwise the temp column is lost
    df = df.withColumn(tmp_timestamp_column_name, col(timestamp_column_name))
    dyf = DynamicFrame.fromDF(df, glue_context, "add_tmp_column")

    # Perform apply_mapping to convert the temp column to timestamp data type
    mapping = []
    for field in dyf.schema():
        if field.name == tmp_timestamp_column_name:
            mapping.append((field.name, field.dataType.typeName(), field.name, 'timestamp'))

col("unixtime")).repartition(1)) final_df = augmented_df ######################################### ### LOAD (WRITE DATA) ######################################### # Create just 1 partition, because there is so little data final_df = final_df.repartition(1) # Convert back to dynamic frame dynamic_frame_write = DynamicFrame.fromDF(final_df, glue_context, "dynamic_frame_write") # Drop columns with all NULL values dynamic_frame_write = DropNullFields.apply(frame=dynamic_frame_write) # Write data back to S3 glue_context.write_dynamic_frame.from_options( frame=dynamic_frame_write, connection_type="s3", connection_options={ "path": s3_write_path, # Here you could create S3 prefixes according to a values in specified columns "partitionKeys": ["year", "month", "day", "hour"], }, format="json", ) # Log end time dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S") print("End time:", dt_end)