Example #1
def create_catagory_and_relation(name, dataframe, gremlin_client):
    # upsert category vertices
    cateDF = dataframe.select(name).distinct()
    dynamic_df = DynamicFrame.fromDF(cateDF, glueContext, f'{name}DF')
    category_df = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~id', name, name)])
    logger.info(f'Upserting category \'{name}\' as vertices of graph...')
    category_df.toDF().foreachPartition(
        gremlin_client.upsert_vertices(name, batch_size=100))

    # upsert edge
    logger.info(
        f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...'
    )
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(
        relation, '~from', '~to')
    relation = SelectFields.apply(frame=relation,
                                  paths=["~id", '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    relation.toDF().foreachPartition(
        gremlin_client.upsert_edges('CATEGORY', batch_size=100))
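
A hedged usage sketch: in the surrounding script this helper would presumably be called once per categorical column of the transactions frame. The column names below are invented for illustration; transactions and gremlin_client come from the rest of the script.

# Hypothetical call site; 'ProductCD' and 'card4' are example column names only.
for cat_name in ['ProductCD', 'card4']:
    create_catagory_and_relation(cat_name, transactions.toDF(), gremlin_client)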
Example #2
def dump_edge_as_graph(name, dataframe):
    # upsert edge
    logger.info(f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...')
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(dynamic_df, [('~from', TRANSACTION_ID, 't'),('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(relation, '~from', '~to')
    relation = SelectFields.apply(frame = relation, paths = ["~id", '~from', '~to'], transformation_ctx = f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    dump_df_to_s3(relation.toDF(), f'relation_{name}_edgelist', graph = True)
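
dump_df_to_s3 is defined elsewhere in the source script. A minimal sketch of what such a helper might do, assuming the output location comes from the job arguments and that graph=True only switches the target prefix (both assumptions, not taken from the source):

def dump_df_to_s3(df, name, header=True, graph=False):
    # Assumed sketch: write a Spark DataFrame as CSV under an S3 prefix
    # (coalesced to one part file). 'output_bucket' is a placeholder argument name.
    prefix = 'graph' if graph else 'train'
    output_path = f"s3://{args['output_bucket']}/{prefix}/{name}"
    df.coalesce(1).write.mode('overwrite').csv(output_path, header=header)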
Example #3
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = "demo-mysql", table_name = "salesdb_product_category", transformation_ctx = "datasource1")

datasource3 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1, transformation_ctx="join")
print("Joined Rows : " + str(datasource3.count()))
datasource3.printSchema()
## @type: ApplyMapping
## @args: [mapping = [("name", "string", "name", "string"), ("unit_price", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("product_id", "double", "product_id", "int"), ("quantity_per_unit", "double", "quantity_per_unit", "int"), ("category_id", "double", "category_id", "int"), ("supplier_id", "double", "supplier_id", "int")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource3]
applymapping1 = ApplyMapping.apply(frame = datasource3, mappings = [("NAME", "string", "name", "string"), ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("PRODUCT_ID", "int", "product_id", "int"), ("QUANTITY_PER_UNIT", "int", "quantity_per_unit", "int"), ("CATEGORY_ID", "int", "category_id", "int"), ("SUPPLIER_ID", "int", "supplier_id", "int"), ("CATEGORY_NAME", "string", "category_name", "string"), ("DESCRIPTION", "string", "description", "string"), ("IMAGE_URL", "string", "image_url", "string")], transformation_ctx = "applymapping1")
#applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2")
print "selectfields2 Rows : "+str(selectfields2.count())
selectfields2.toDF().show()
## @type: DataSink
## @args: [database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"]
## @return: datasink3
## @inputs: [frame = selectfields2]
datasink3 = glueContext.write_dynamic_frame.from_catalog(frame = selectfields2, database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasink3")
job.commit()
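
Example #3 starts at job.init, so it assumes the standard AWS Glue job preamble has already run. For reference, that boilerplate typically looks like the following (TempDir is required here because the job stages data for Redshift):

import sys
from awsglue.transforms import ApplyMapping, SelectFields
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'TempDir'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
# job.init(args['JOB_NAME'], args) is then the first line shown in the example above.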
Example #4
              ("PROFIT", "decimal(10,2)", "profit", "decimal(10,2)"),
              ("EXTENDED_PRICE", "decimal(10,2)", "extended_price",
               "decimal(10,2)"), ("DATE_KEY", "string", "date_key", "string"),
              ("ORDER_ID", "int", "order_id", "int")],
    transformation_ctx="applymapping1")
print "After ApplyMapping :"
applymapping1.printSchema()
applymapping1.toDF().show(5, False)

#selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["quantity", "discount", "tax", "unit_price", "line_id", "date_key", "order_date", "extended_price", "ship_mode", "line_number", "product_id", "site_id", "supply_cost", "order_id", "profit"], transformation_ctx = "selectfields2")

selectfields2 = SelectFields.apply(frame=applymapping1,
                                   paths=[
                                       "order_id", "site_id", "order_date",
                                       "date_key", "ship_mode", "line_id",
                                       "line_number", "quantity", "product_id",
                                       "unit_price", "discount", "supply_cost",
                                       "tax", "extended_price", "profit"
                                   ],
                                   transformation_ctx="selectfields2")

print "After SelectFields :"
selectfields2.toDF().select("date_key").show(5, False)
#selectfields2.toDF().show(5,False)

jdbc_url = jdbc_url + '?user=' + user + '&password=' + password  # 'user' is a placeholder: the value is redacted ('******') in the original listing
postActionStmt = "vacuum SALES_ORDER_FACT"

selectfields2.toDF().write.format("com.databricks.spark.redshift") \
.option("url", jdbc_url) \
.option("dbtable", target_table_1) \
Example #5
get_fraud_frac = lambda series: 100 * sum(series)/len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids, 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

features_graph_df = features_df.withColumn('props_values:String', to_json(struct(list(filter(lambda x: (x != TRANSACTION_ID), features_df.schema.names)))))
features_graph_df = features_graph_df.select('TransactionID', 'props_values:String')

logger.info(f'Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(features_graph_df, glueContext, 'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(features_graph_dynamic_df,'Transaction')
features_graph_dynamic_df = SelectFields.apply(frame = features_graph_dynamic_df, paths = ["~id",'~label', 'props_values:String'])
logger.info(f'Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), f'transaction', graph = True)

relational_edges = get_relations_and_edgelist(transactions.toDF(), identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
        dump_df_to_s3(df, f'relation_{name}_edgelist')
        logger.info(f'Dumping edge {name} as graph data...')
        dump_edge_as_graph(name, df)
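
The props_values:String column above packs every non-ID feature into one JSON string per transaction. A toy illustration of that transform, with made-up column names and assuming a SparkSession named spark is available:

from pyspark.sql.functions import to_json, struct

# Toy frame: 'TransactionID' plus two invented feature columns.
demo = spark.createDataFrame([(1, 0.5, 3)], ['TransactionID', 'feat_a', 'feat_b'])
packed = demo.withColumn(
    'props_values:String',
    to_json(struct([c for c in demo.schema.names if c != 'TransactionID'])))
packed.select('TransactionID', 'props_values:String').show(truncate=False)
# The single row's props_values:String would be {"feat_a":0.5,"feat_b":3}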
Example #6
datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1')

print "Total orders         : " + str(datasource0.count())
print "Orders for checkpoint: " + str(datasource1.count())

applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"),
              ("SHIP_MODE", "string", "shipMode", "string"),
              ("SITE_ID", "double", "siteId", "int"),
              ("ORDER_ID", "int", "orderId", "int")],
    transformation_ctx="applymapping1")
applymapping1 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'orderId', 'o')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "orderDate", "shipMode"],
                                   transformation_ctx="selectfields1")

selectfields1.toDF().foreachPartition(neptune.add_vertices('Order'))

print "Creating OrderDetail vertices..."

datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=order_detail_table,
    transformation_ctx="datasource1")
datasource3 = datasource2.join(["ORDER_ID"], ["ORDER_ID"],
                               datasource1,
                               transformation_ctx="join")

print "Total order details         : " + str(datasource2.count())
Example #7
              ("ADDRESS", "string", "address", "string"),
              ("NAME", "string", "name", "string"),
              ("STATE", "string", "state", "string"),
              ("SUPPLIER_ID", "int", "supplier_id", "int"),
              ("CITY", "string", "city", "string"),
              ("PHONE", "string", "phone", "string")],
    transformation_ctx="applymapping1")
print "Source count datasource0: " + str(applymapping1.count())
applymapping1.toDF().show()
## @type: SelectFields
## @args: [paths = ["country", "address", "city", "phone", "name", "state", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame=applymapping1,
                                   paths=[
                                       "country", "address", "city", "phone",
                                       "name", "state", "supplier_id"
                                   ],
                                   transformation_ctx="selectfields2")
print "Source count datasource0: " + str(selectfields2.count())
selectfields2.toDF().show()
## @type: DataSink
## @args: [database = "demo-redshift", table_name = "redshiftdb_public_supplier_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"]
## @return: datasink3
## @inputs: [frame = selectfields2]
datasink3 = glueContext.write_dynamic_frame.from_catalog(
    frame=selectfields2,
    database="demo-redshift",
    table_name="redshiftdb_public_supplier_dim",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink3")
print "Redshift Rows inserted : " + str(datasink3.toDF().count())
Example #8
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(args['AWS_REGION'], args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices

print("Creating Product vertices...")

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_table, transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_category_table, transformation_ctx = "datasource1")
datasource2 = datasource0.join( ["CATEGORY_ID"],["CATEGORY_ID"], datasource1, transformation_ctx = "join")

applymapping1 = ApplyMapping.apply(frame = datasource2, mappings = [("NAME", "string", "name:String", "string"), ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"), ("PRODUCT_ID", "int", "productId", "int"), ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"), ("CATEGORY_ID", "int", "category_id", "int"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CATEGORY_NAME", "string", "category:String", "string"), ("DESCRIPTION", "string", "description:String", "string"), ("IMAGE_URL", "string", "imageUrl:String", "string")], transformation_ctx = "applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping1, [('~id', 'productId', 'p'),('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(frame = applymapping1, paths = ["~id", "name:String", "category:String", "description:String", "unitPrice", "quantityPerUnit:Int", "imageUrl:String"], transformation_ctx = "selectfields1")

selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices

print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = supplier_table, transformation_ctx = "datasource3")

applymapping2 = ApplyMapping.apply(frame = datasource3, mappings = [("COUNTRY", "string", "country:String", "string"), ("ADDRESS", "string", "address:String", "string"), ("NAME", "string", "name:String", "string"), ("STATE", "string", "state:String", "string"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CITY", "string", "city:String", "string"), ("PHONE", "string", "phone:String", "string")], transformation_ctx = "applymapping2")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(frame = applymapping2, paths = ["~id", "country:String", "address:String", "city:String", "phone:String", "name:String", "state:String"], transformation_ctx = "selectfields3")

selectfields3.toDF().foreachPartition(gremlin_client.upsert_vertices('Supplier', batch_size=100))
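
create_prefixed_columns is used throughout these examples to build the '~id', '~from' and '~to' columns expected by Neptune's Gremlin/CSV loading conventions. In plain Spark terms it is roughly equivalent to prefixing the value of a source column, something like the following approximation (the exact separator used by GlueGremlinCsvTransforms is an assumption):

from pyspark.sql.functions import concat, lit, col

# Rough equivalent of create_prefixed_columns(frame, [('~id', 'productId', 'p')]):
# derive '~id' from productId, e.g. 123 -> 'p-123'.
df_with_id = applymapping1.toDF().withColumn(
    '~id', concat(lit('p-'), col('productId').cast('string')))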
Example #9
rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
applymapping1 = ApplyMapping.apply(
    frame=rdf,
    mappings=[("`Reservations.val.Instances.val.InstanceId`", "string",
               "instanceId:String", "string"),
              ("`Reservations.val.Instances.val.InstanceType`", "string",
               "instanceType:String", "string")],
    transformation_ctx="applymapping1")

applymapping1.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across graph)
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selectfields1 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", 'instanceId:String', 'instanceType:String'],
    transformation_ctx="selectfields1")

# 5. Write to s3
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selectfields1, 'EC2'),
             nodes_folder)

job.commit()

print("Done")
Example #10
def main():
    """This script will load data from the supplied DynamoDB Table to S3 so it can be analyzed with Athena"""
    if (JOB_TYPE == "issues"):
        DDB_TABLE_NAME = DDB_ISSUES_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_ISSUES_TABLE_NAME
        FIELD_PATHS = [
            "eventid", "acknowledged", "created", "sitename", "issuesource",
            "priority",
            "areaname#status#processname#eventdescription#stationname#devicename#created",
            "version", "devicename", "devicename#eventid", "createdat",
            "areaname", "processname", "createddateutc", "eventdescription",
            "areaname#status#processname#stationname#devicename#created",
            "stationname", "id", "acknowledgedtime", "status", "updatedat",
            "closed", "resolutiontime", "createdby", "acknowledgedby",
            "closedby", "rejectedby", "additionaldetails"
        ]
    elif (JOB_TYPE == "hierarchy"):
        DDB_TABLE_NAME = DDB_DATA_HIERARCHY_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_DATA_HIERARCHY_TABLE_NAME
        FIELD_PATHS = [
            "createdat", "name", "description", "id", "devicestationid",
            "type", "version", "parentid", "updatedat", "areasiteid",
            "eventprocessid", "eventtype", "priority", "rootcauses", "sms",
            "eventimgkey", "email", "protocol", "endpoint", "filterpolicy",
            "subscriptionarn", "stationareaid", "processareaid", "alias"
        ]
    else:
        raise JobInputException(
            f"JOB_TYPE was invalid ({JOB_TYPE}). Expecting either \"issues\" or \"hierarchy\""
        )

    log_message([
        "Running with the following context:",
        f"DDB_TABLE_NAME: {DDB_TABLE_NAME}",
        f"GLUE_TABLE_NAME: {GLUE_TABLE_NAME}", f"GLUE_DB_NAME: {GLUE_DB_NAME}",
        f"GLUE_OUTPUT_BUCKET: {GLUE_OUTPUT_BUCKET}"
    ])

    DDB_TABLE_NAME_FORMATTED = DDB_TABLE_NAME.lower().replace('-', '_')

    log_message("Mapping columns")
    COLUMN_MAPPINGS = list(map(lambda x: get_column_mapping(x), FIELD_PATHS))

    log_message("Creating a Dynamic Frame from the DynamoDB table schema")
    datasource0 = glue_context.create_dynamic_frame.from_catalog(
        database=GLUE_DB_NAME,
        table_name=DDB_TABLE_NAME_FORMATTED,
        transformation_ctx="datasource0")

    log_message("Applying column mappings")
    applymapping1 = ApplyMapping.apply(frame=datasource0,
                                       mappings=COLUMN_MAPPINGS,
                                       transformation_ctx="applymapping1")

    log_message("Selecting fields")
    selectfields2 = SelectFields.apply(frame=applymapping1,
                                       paths=FIELD_PATHS,
                                       transformation_ctx="selectfields2")

    log_message("Resolving")
    resolvechoice3 = ResolveChoice.apply(frame=selectfields2,
                                         choice="MATCH_CATALOG",
                                         database=GLUE_DB_NAME,
                                         table_name=GLUE_TABLE_NAME,
                                         transformation_ctx="resolvechoice3")

    resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3,
                                         choice="make_struct",
                                         transformation_ctx="resolvechoice4")

    log_message("Persisting data in S3")
    glue_context.write_dynamic_frame.from_catalog(
        frame=resolvechoice4,
        database=GLUE_DB_NAME,
        table_name=GLUE_TABLE_NAME,
        transformation_ctx="datasink5")

    job.commit()
    log_message("Done")