Example no. 1
def create_catagory_and_relation(name, dataframe, gremlin_client):
    """Upsert the distinct values of column *name* as category vertices, then
    upsert one edge per row linking each transaction to its category value.

    (Function name kept as-is for backward compatibility; 'catagory' is a
    historical typo for 'category'.)

    Args:
        name: column holding the category value; also used as the vertex
            label and as the id prefix.
        dataframe: Spark DataFrame containing TRANSACTION_ID and *name*.
        gremlin_client: client whose upsert_* methods return per-partition
            writer callables.
    """
    # --- vertices: one per distinct category value -----------------------
    distinct_values = dataframe.select(name).distinct()
    vertex_dyf = GlueGremlinCsvTransforms.create_prefixed_columns(
        DynamicFrame.fromDF(distinct_values, glueContext, f'{name}DF'),
        [('~id', name, name)])
    logger.info(f'Upserting category \'{name}\' as vertices of graph...')
    vertex_dyf.toDF().foreachPartition(
        gremlin_client.upsert_vertices(name, batch_size=100))

    # --- edges: transaction -> category ----------------------------------
    logger.info(
        f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...'
    )
    edge_dyf = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    edge_dyf = GlueGremlinCsvTransforms.create_prefixed_columns(
        edge_dyf, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    edge_dyf = GlueGremlinCsvTransforms.create_edge_id_column(
        edge_dyf, '~from', '~to')
    edge_dyf = SelectFields.apply(frame=edge_dyf,
                                  paths=["~id", '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    # Edges are upserted under the fixed label 'CATEGORY' regardless of
    # *name* — matches the original behaviour.
    edge_dyf.toDF().foreachPartition(
        gremlin_client.upsert_edges('CATEGORY', batch_size=100))
Example no. 2
def dump_edge_as_graph(name, dataframe):
    """Build the transaction -> *name* edge list and dump it to S3 in the
    graph (bulk-load CSV) format.

    Args:
        name: relation column; used for the '~to' endpoint, its id prefix,
            and the output file name.
        dataframe: Spark DataFrame containing TRANSACTION_ID and *name*.
    """
    logger.info(f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...')
    edges = GlueGremlinCsvTransforms.create_prefixed_columns(
        DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF'),
        [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    edges = GlueGremlinCsvTransforms.create_edge_id_column(edges, '~from', '~to')
    # Keep only the three columns the edge CSV needs.
    edges = SelectFields.apply(frame=edges,
                               paths=["~id", '~from', '~to'],
                               transformation_ctx=f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    dump_df_to_s3(edges.toDF(), f'relation_{name}_edgelist', graph=True)
Example no. 3
# Keep the transaction ids aside; they are dumped below as the test split.
test_ids = transactions.select_fields(TRANSACTION_ID)
# Percentage of truthy entries — NOTE(review): defined but never called in
# this section; presumably intended for the "Percent fraud" log below.
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
# NOTE(review): also unused in this section — verify against later chunks.
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
# NOTE(review): message says "percent" but sum_col looks like a plain column
# sum — confirm whether get_fraud_frac was meant to be applied here.
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

# Column groupings come from the job arguments.
id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols,
                                                 cat_cols)

# Creating glue dynamic frame from spark dataframe
features_dynamic_df = DynamicFrame.fromDF(features_df, glueContext,
                                          'FeaturesDF')
# '~id' gets the 't' prefix so Transaction vertex ids are unique graph-wide.
features_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions as vertices of graph...')
features_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=50))
logger.info(f'Creating glue DF from labels dataframe')
labels_dynamic_df = DynamicFrame.fromDF(labels_df, glueContext, 'LabelsDF')
labels_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    labels_dynamic_df, [('~id', TRANSACTION_ID, 't')])
# Same vertex label: this pass adds the isFraud property onto the existing
# Transaction vertices. NOTE(review): batch sizes differ (50 vs 100) —
# presumably tuned per payload size; confirm.
logger.info(f'Upserting transactions with isFraud property...')
labels_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=100))

dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')
# One DataFrame per relation column, keyed by column name.
relational_edges = get_relations_and_edgelist(transactions.toDF(),
                                              identities.toDF(), id_cols)
Example no. 4
def get_fraud_frac(series):
    """Return the percentage (0-100) of truthy entries in *series*.

    Converted from an assigned lambda (PEP 8 E731) to a named function;
    call signature and result are unchanged.
    """
    return 100 * sum(series) / len(series)
# Label column kept for parity with the training pipeline.
# NOTE(review): not used further in this section — verify against later chunks.
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
# NOTE(review): message says "percent" but sum_col looks like a plain column
# sum — confirm whether get_fraud_frac should be applied here.
logger.info("Percent fraud for train transactions: {}".format(sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids, 'test', header=False)

# Column groupings come from the job arguments.
id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info('Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

# Pack every non-id feature column into one JSON string so a transaction's
# whole feature row becomes a single vertex property value.
# (Fixed local-name typo: was 'featurs_graph_df'.)
features_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(struct([c for c in features_df.schema.names if c != TRANSACTION_ID])))
# Use the TRANSACTION_ID constant instead of the hard-coded 'TransactionID'
# literal: the create_prefixed_columns call below already requires the
# selected id column to be named TRANSACTION_ID, so they must agree.
features_graph_df = features_graph_df.select(TRANSACTION_ID, 'props_values:String')

logger.info('Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(features_graph_df, glueContext, 'FeaturesDF')
# '~id' gets the 't' prefix so Transaction vertex ids are unique graph-wide.
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(frame=features_graph_dynamic_df, paths=["~id", '~label', 'props_values:String'])
logger.info('Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), 'transaction', graph=True)

# One edge list per relation column; skip the transaction id column itself.
relational_edges = get_relations_and_edgelist(transactions.toDF(), identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
        dump_df_to_s3(df, f'relation_{name}_edgelist')
        logger.info(f'Dumping edge {name} as graph data...')
        dump_edge_as_graph(name, df)
Example no. 5
    table_name=order_table,
    transformation_ctx="datasource0")
# Keep only orders for the current checkpoint date, then wrap the filtered
# rows back into a DynamicFrame so the Glue transforms below can use them.
df0 = datasource0.toDF().filter(col("ORDER_DATE") == checkpoint)
datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1')

print("Total orders         : " + str(datasource0.count()))
print("Orders for checkpoint: " + str(datasource1.count()))

# Rename/cast source columns to the property names used in the graph.
# NOTE(review): ORDER_DATE is cast timestamp -> string, presumably so it can
# be stored as a plain vertex property — confirm the expected format.
applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"),
              ("SHIP_MODE", "string", "shipMode", "string"),
              ("SITE_ID", "double", "siteId", "int"),
              ("ORDER_ID", "int", "orderId", "int")],
    transformation_ctx="applymapping1")
# Build '~id' from orderId with an 'o' prefix so Order vertex ids are
# unique across the graph.
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'orderId', 'o')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "orderDate", "shipMode"],
                                   transformation_ctx="selectfields1")

# add_vertices (not upsert_*) — writes Order vertices partition by partition.
selectfields1.toDF().foreachPartition(gremlin_client.add_vertices('Order'))

print("Creating OrderDetail vertices...")

# Load ORDER_DETAIL rows and join them to the checkpoint-filtered orders so
# only details belonging to the selected orders are processed further.
datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=order_detail_table,
    # Fixed: transformation_ctx was "datasource1", duplicating the context of
    # an earlier node. Glue job-bookmark contexts must be unique per node.
    transformation_ctx="datasource2")
datasource3 = datasource2.join(["ORDER_ID"], ["ORDER_ID"],
                               datasource1,
                               transformation_ctx="join")
# Fully-qualified catalog table names; the prefix comes from job arguments.
product_category_table = '{}salesdb_product_category'.format(args['TABLE_PREFIX'])
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

# Resolve the Neptune endpoint from the Glue connection and build a client.
gremlin_endpoints = GlueNeptuneConnectionInfo(args['AWS_REGION'], args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices

print("Creating Product vertices...")

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_table, transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = product_category_table, transformation_ctx = "datasource1")
# Join products to category rows on CATEGORY_ID so the category columns are
# available on each product record.
datasource2 = datasource0.join( ["CATEGORY_ID"],["CATEGORY_ID"], datasource1, transformation_ctx = "join")

# Map source columns to graph property names; ':String'/':Int' suffixes
# follow the Neptune bulk-load CSV header convention.
applymapping1 = ApplyMapping.apply(frame = datasource2, mappings = [("NAME", "string", "name:String", "string"), ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"), ("PRODUCT_ID", "int", "productId", "int"), ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"), ("CATEGORY_ID", "int", "category_id", "int"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CATEGORY_NAME", "string", "category:String", "string"), ("DESCRIPTION", "string", "description:String", "string"), ("IMAGE_URL", "string", "imageUrl:String", "string")], transformation_ctx = "applymapping1")
# '~id' from productId ('p' prefix); '~to' from supplierId ('s' prefix).
# NOTE(review): '~to' is not selected below — presumably reused by a later
# SUPPLIER-edge step; verify.
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping1, [('~id', 'productId', 'p'),('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(frame = applymapping1, paths = ["~id", "name:String", "category:String", "description:String", "unitPrice", "quantityPerUnit:Int", "imageUrl:String"], transformation_ctx = "selectfields1")

selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices

print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(database = database, table_name = supplier_table, transformation_ctx = "datasource3")

# Map supplier columns to graph property names (Neptune bulk-load headers).
# Fixed: transformation_ctx was "applymapping1", duplicating the Product
# mapping's context above; Glue job-bookmark contexts must be unique.
applymapping2 = ApplyMapping.apply(frame = datasource3, mappings = [("COUNTRY", "string", "country:String", "string"), ("ADDRESS", "string", "address:String", "string"), ("NAME", "string", "name:String", "string"), ("STATE", "string", "state:String", "string"), ("SUPPLIER_ID", "int", "supplierId", "int"), ("CITY", "string", "city:String", "string"), ("PHONE", "string", "phone:String", "string")], transformation_ctx = "applymapping2")
# '~id' from supplierId with an 's' prefix for graph-wide uniqueness.
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(frame = applymapping2, paths = ["~id", "country:String", "address:String", "city:String", "phone:String", "name:String", "state:String"], transformation_ctx = "selectfields3")

selectfields3.toDF().foreachPartition(gremlin_client.upsert_vertices('Supplier', batch_size=100))
                               datasource1,
                               transformation_ctx="join")

# Map joined product+category columns to graph property names; the
# ':String'/':Int' suffixes follow the Neptune bulk-load CSV header format.
applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=[("NAME", "string", "name:String", "string"),
              ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "decimal(10,2)"),
              ("PRODUCT_ID", "int", "productId", "int"),
              ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"),
              ("CATEGORY_ID", "int", "category_id", "int"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CATEGORY_NAME", "string", "category:String", "string"),
              ("DESCRIPTION", "string", "description:String", "string"),
              ("IMAGE_URL", "string", "imageUrl:String", "string")],
    transformation_ctx="applymapping1")
# '~id' from productId ('p' prefix); '~to' from supplierId ('s' prefix).
# NOTE(review): '~to' is not selected below — presumably used by the
# SUPPLIER-edge step announced at the end of this section; verify.
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=[
                                       "~id", "name:String", "category:String",
                                       "description:String", "unitPrice",
                                       "quantityPerUnit:Int", "imageUrl:String"
                                   ],
                                   transformation_ctx="selectfields1")

# Write Product vertices to the bulk-load CSV with a '~label' column added.
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selectfields1, 'Product'),
             nodes_path)

# SUPPLIER edges

print("Creating SUPPLIER edges...")
Example no. 8
# Preview one record of the raw reservations frame.
rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
mapped = ApplyMapping.apply(
    frame=rdf,
    mappings=[
        ("`Reservations.val.Instances.val.InstanceId`", "string",
         "instanceId:String", "string"),
        ("`Reservations.val.Instances.val.InstanceType`", "string",
         "instanceType:String", "string"),
    ],
    transformation_ctx="applymapping1")

mapped.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across graph)
prefixed = GlueGremlinCsvTransforms.create_prefixed_columns(
    mapped, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selected = SelectFields.apply(
    frame=prefixed,
    paths=["~id", 'instanceId:String', 'instanceType:String'],
    transformation_ctx="selectfields1")

# 5. Write to s3
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selected, 'EC2'), nodes_folder)

job.commit()

print("Done")