Code example #1
def mappingForAll(dynamicFrame, mapping):
    # Rename and re-type columns according to the supplied mapping tuples
    applymapping2 = ApplyMapping.apply(frame=dynamicFrame,
                                       mappings=mapping)

    # Resolve ambiguous column types by splitting them into separate columns
    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    # Drop fields whose values are all null
    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")

    return dyf_communication_after_mapping
Code example #2
def mappingForAll(dynamicFrame, mapping):

    # for k, v in add_collum.items():
    #     df_communication = df_communication.withColumn(k, v)

    applymapping2 = ApplyMapping.apply(frame=dynamicFrame,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
Code example #3
def mappingForAll(dynamicFrame, mapping, add_collum):
    df_communication = dynamicFrame.toDF()
    df_communication = df_communication.dropDuplicates()

    for k, v in add_collum.items():
        df_communication = df_communication.withColumn(k, v)

    dyf_communication = DynamicFrame.fromDF(df_communication, glueContext, 'dyf_communication')
    applymapping2 = ApplyMapping.apply(frame=dyf_communication,
                                       mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_communication_after_mapping = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
    return dyf_communication_after_mapping
Code example #4
def mappingForAll(dynamicFrame, mapping):
    df = dynamicFrame.toDF()
    df = df.dropDuplicates()
    dyf = DynamicFrame.fromDF(df, glueContext, "dyf")
    print("-------------------------------------------------")
    dyf.printSchema()
    print(mapping)

    applymapping2 = ApplyMapping.apply(frame=dyf, mappings=mapping)

    resolvechoice2 = ResolveChoice.apply(frame=applymapping2,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")

    dyf_mapping = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields2")
    return dyf_mapping
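
Examples #1 through #4 are variants of the same helper and all expect an ApplyMapping-style list of (source, source_type, target, target_type) tuples. A minimal usage sketch is shown below; the catalog names, mapping columns, and literal value are hypothetical placeholders, not taken from the examples:

from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import lit

glueContext = GlueContext(SparkContext.getOrCreate())

# Hypothetical source table; replace with your own catalog entries.
dyf_source = glueContext.create_dynamic_frame.from_catalog(
    database="example_db", table_name="example_table")

# (source column, source type, target column, target type)
mapping = [
    ("id", "long", "id", "long"),
    ("name", "string", "name", "string"),
]

# Extra columns added before mapping (only used by the variant in example #3).
add_collum = {"load_date": lit("2020-01-01")}

dyf_mapped = mappingForAll(dyf_source, mapping)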
Code example #5
    def run(self):
        """Extract data from the data catalog and convert it to parquet, partitioning it along the way"""
        from awsglue.transforms import DropNullFields
        from awsglue.dynamicframe import DynamicFrame

        # Retrieve the source data from the Glue catalog
        source_data = self.glue_context.create_dynamic_frame.from_catalog(
            database=self.data_catalog.get_database_name(),
            table_name=self.data_catalog.get_table_name(),
            push_down_predicate=self.data_catalog.get_push_down_predicate(),
            transformation_ctx="source_data"
        )

        # Perform any data-source-specific conversions
        optimized_transforms = self.optimized_catalog.conversion_actions(source_data)

        # Remove nulls and convert to dataframe - dataframe is only needed for replacing the date partitions.
        # It was previously used to repartition, but Glue supports that now.
        drop_nulls = DropNullFields.apply(frame=optimized_transforms, transformation_ctx="drop_nulls")
        data_frame = drop_nulls.toDF()

        # We might have no data - if that's the case, short-circuit
        if not data_frame.head(1):
            LOGGER.info("No data returned, skipping conversion.")
            return

        # Create Y-m-d partitions out of the optimized table's timestamp field
        df_partitions = self._replace_date_partitions(data_frame, self.data_catalog.timestamp_field())

        # DataFrame runs out of memory for large datasets
        # Convert back to a DynamicFrame for further processing.
        partitioned_dynamicframe = DynamicFrame.fromDF(df_partitions, self.glue_context, "partitioned_dynamicframe")

        # Write out to partitioned parquet. We repartition to reduce the number of files to optimize Athena performance.
        # Athena queries will slow down even at 1,000 files, so we tradeoff having large files per partition rather
        # than many small files.
        self.glue_context.write_dynamic_frame.from_options(
            frame=partitioned_dynamicframe,
            connection_type="s3",
            connection_options={"path": self.optimized_catalog.get_s3_location(), "partitionKeys": ['region', 'year', 'month', 'day']},
            format="parquet")
Code example #6
    def run(self):
        """Extract data from the data catalog and convert it to parquet, partitioning it along the way"""
        from awsglue.transforms import DropNullFields

        # Retrieve the source data from the Glue catalog
        source_data = self.glue_context.create_dynamic_frame.from_catalog(
            database=self.data_catalog.get_database_name(),
            table_name=self.data_catalog.get_table_name(),
            transformation_ctx="source_data"
        )

        # Perform any data-source-specific conversions
        optimized_transforms = self.optimized_catalog.conversion_actions(source_data)

        # Remove nulls and convert to dataframe - dataframe is only needed for replacing the date partitions.
        # It was previously used to repartition, but Glue supports that now.
        drop_nulls = DropNullFields.apply(frame=optimized_transforms, transformation_ctx="drop_nulls")
        data_frame = drop_nulls.toDF()

        # We might have no data - if that's the case, short-circuit
        if not data_frame.head(1):
            LOGGER.info("No data returned, skipping conversion.")
            return

        # Create Y-m-d partitions out of the optimized table's timestamp field
        df_partitions = self._replace_date_partitions(data_frame, self.data_catalog.timestamp_field())

        # Write out to partitioned parquet. We repartition to reduce the number of files to optimize Athena performance.
        # Athena queries will slow down even at 1,000 files, so we tradeoff having large files per partition rather
        # than many small files.
        (
            df_partitions
            .repartition(*self._partition_columns())
            .write
            .mode('append')
            .partitionBy(*self._partition_columns())
            .parquet(self.optimized_catalog.get_s3_location())
        )
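
Both run() variants call private helpers, _replace_date_partitions and _partition_columns, that are not included in the excerpts. A minimal sketch of what they might look like, assuming the timestamp field is a Spark timestamp column and the partition scheme matches the partitionKeys used above (region/year/month/day); the column names and date formats here are assumptions:

from pyspark.sql.functions import col, date_format

    def _partition_columns(self):
        # Assumed partition scheme, mirroring the partitionKeys above.
        return ["region", "year", "month", "day"]

    def _replace_date_partitions(self, data_frame, timestamp_field):
        # Assumed: derive year/month/day string columns from the timestamp field;
        # "region" is expected to exist in the source data already.
        return (
            data_frame
            .withColumn("year", date_format(col(timestamp_field), "yyyy"))
            .withColumn("month", date_format(col(timestamp_field), "MM"))
            .withColumn("day", date_format(col(timestamp_field), "dd"))
        )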
Code example #7
        ("weight", "string", "weight", "string"),
    ],
    transformation_ctx="applymapping1",
)
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields3")
## @type: DataSink
## @args: [catalog_connection = "dotz_connection", connection_options = {"dbtable": "comp_boss_csv", "database": "dotz_challenge"}, transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=dropnullfields3,
    catalog_connection="dotz_connection",
    connection_options={
        "dbtable": "comp_boss",
        "database": "dotz_challenge"
    },
    transformation_ctx="datasink4",
)
job.commit()
Code example #8
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(database=output_database,
                                                     table_name=tmp_table,
                                                     transformation_ctx='dyf')

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(frame=dyf,
                          choice='make_struct',
                          transformation_ctx='resolvechoice')

# Drop null fields
dyf = DropNullFields.apply(frame=dyf, transformation_ctx='dropnullfields')

# Apply mapping into Timestamp based on specified timestamp column
if timestamp_column_name:
    # Add temp column from specified timestamp column
    tmp_timestamp_column_name = f'tmp_${timestamp_column_name}'
    df = dyf.toDF()
    df = df.withColumn(tmp_timestamp_column_name, col(timestamp_column_name))
    dyf = DynamicFrame.fromDF(df, glue_context, "add_tmp_column")

    # Perform apply_mapping to convert the temp column to timestamp data type
    mapping = []
    for field in dyf.schema():
        if field.name == tmp_timestamp_column_name:
            mapping.append((field.name, field.dataType.typeName(), field.name,
                            'timestamp'))
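
The excerpt breaks off inside the loop that builds the mapping list. A hedged guess at how it might continue is sketched below: the other fields are passed through unchanged and the mapping is then applied. The else branch and the transformation_ctx name are assumptions, not code from the original project:

        else:
            # Assumed: carry every other field through with its existing type.
            mapping.append((field.name, field.dataType.typeName(), field.name,
                            field.dataType.typeName()))

    # Assumed: apply the mapping so the temp column is re-typed to timestamp.
    dyf = ApplyMapping.apply(frame=dyf, mappings=mapping,
                             transformation_ctx='applymapping_timestamp')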
Code example #9
File: glue_ETL.py  Project: facebookresearch/FBPCS
                                col("unixtime")).repartition(1))

final_df = augmented_df

#########################################
### LOAD (WRITE DATA)
#########################################

# Create just 1 partition, because there is so little data
final_df = final_df.repartition(1)

# Convert back to dynamic frame
dynamic_frame_write = DynamicFrame.fromDF(final_df, glue_context,
                                          "dynamic_frame_write")
# Drop columns with all NULL values
dynamic_frame_write = DropNullFields.apply(frame=dynamic_frame_write)
# Write data back to S3
glue_context.write_dynamic_frame.from_options(
    frame=dynamic_frame_write,
    connection_type="s3",
    connection_options={
        "path": s3_write_path,
        # Here you could create S3 prefixes according to a values in specified columns
        "partitionKeys": ["year", "month", "day", "hour"],
    },
    format="json",
)

# Log end time
dt_end = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("End time:", dt_end)