Code example #1
    def __init__(self, neptune_endpoint, elasticache_endpoint):
        GremlinUtils.init_statics(globals())
        gremlin_utils = GremlinUtils(
            Endpoints(neptune_endpoint=neptune_endpoint))
        self.vertex_metrics = VertexMetrics(elasticache_endpoint)
        self.neptune_connection = gremlin_utils.remote_connection()
        self.g = gremlin_utils.traversal_source(
            connection=self.neptune_connection)
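
The snippet above is only an __init__ body; the enclosing class declaration is not part of the example. A minimal usage sketch, assuming a hypothetical wrapper class named EventHandler and endpoints supplied through environment variables:

import os

# 'EventHandler' is a placeholder name for the class this __init__ belongs to.
handler = EventHandler(
    neptune_endpoint=os.environ['NEPTUNE_CLUSTER_ENDPOINT'],
    elasticache_endpoint=os.environ['ELASTICACHE_ENDPOINT'])

# Sanity-check the traversal source created in __init__.
print(handler.g.V().limit(1).toList())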
Code example #2
    def handle_records(self, stream_log):

        params = json.loads(os.environ['AdditionalParams'])

        neptune_endpoint = params['neptune_cluster_endpoint']
        neptune_port = params['neptune_port']

        GremlinUtils.init_statics(globals())

        endpoints = Endpoints(neptune_endpoint=neptune_endpoint,
                              neptune_port=neptune_port)
        gremlin_utils = GremlinUtils(endpoints)

        conn = gremlin_utils.remote_connection()
        g = gremlin_utils.traversal_source(connection=conn)

        records = stream_log[RECORDS_STR]

        last_op_num = None
        last_commit_num = None
        count = 0

        try:
            for record in records:

                # Process record
                op = record[OPERATION_STR]
                data = record[DATA_STR]
                entity_type = data['type']
                entity_id = data['id']

                if op == ADD_OPERATION:
                    # 'vl' marks a vertex-label record, 'e' an edge record
                    if entity_type == 'vl':
                        logger.info(g.V(entity_id).valueMap(True).toList())
                    if entity_type == 'e':
                        logger.info(g.E(entity_id).valueMap(True).toList())

                # Update local checkpoint info
                last_op_num = record[EVENT_ID_STR][OP_NUM_STR]
                last_commit_num = record[EVENT_ID_STR][COMMIT_NUM_STR]
                count += 1

        except Exception as e:
            logger.error('Error occurred - {}'.format(str(e)))
            raise
        finally:
            try:
                conn.close()
                yield HandlerResponse(last_op_num, last_commit_num, count)
            except Exception as e:
                logger.error('Error occurred - {}'.format(str(e)))
                raise
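
handle_records walks a batch of Neptune Streams change records. A sketch of the stream_log shape it expects, assuming the standard Neptune Streams JSON layout and that RECORDS_STR, OPERATION_STR, DATA_STR, EVENT_ID_STR, OP_NUM_STR and COMMIT_NUM_STR name the corresponding keys:

stream_log = {
    'records': [{
        'eventId': {'commitNum': 12, 'opNum': 1},  # checkpoint bookkeeping
        'op': 'ADD',                               # matched against ADD_OPERATION
        'data': {
            'id': 'v-1',
            'type': 'vl',                          # vertex-label record
            'key': 'label',
            'value': {'value': 'Product', 'dataType': 'String'}
        }
    }]
}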
Code example #3
class GlueGremlinClient:
    def __init__(self, endpoints):

        self.gremlin_utils = GremlinUtils(endpoints)

        GremlinUtils.init_statics(globals())

    def add_vertices(self, label):
        """Adds a vertex with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as user-supplied IDs for the new vertices.
        If the DataFrame does not have an '~id' column, Neptune will autogenerate a UUID for each vertex. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_vertices('Product'))
        """
        def add_vertices_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            try:
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    traversal = g.addV(label)
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key == '~id':
                            traversal.property(id, value)
                        elif key == '~label':
                            pass
                        else:
                            traversal.property(key, value)
                    traversal.next()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
            finally:
                conn.close()

        return add_vertices_for_label

    def upsert_vertices(self, label):
        """Conditionally adds vertices for the rows in a DataFrame partition using the Gremlin coalesce() idiom.
        The DataFrame must contain an '~id' column. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_vertices('Product'))
        """
        def upsert_vertices_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            try:
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    create_traversal = __.addV(label)
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key == '~id':
                            create_traversal.property(id, value)
                        elif key == '~label':
                            pass
                        else:
                            create_traversal.property(key, value)
                    g.V(entries['~id']).fold().coalesce(
                        __.unfold(), create_traversal).next()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
            finally:
                conn.close()

        return upsert_vertices_for_label

    def add_edges(self, label):
        """Adds an edge with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as user-supplied IDs for the new edges.
        If the DataFrame does not have an '~id' column, Neptune will autogenerate a UUID for each edge. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_edges('ORDER_DETAIL'))
        """
        def add_edges_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            try:
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    traversal = g.V(row['~from']).addE(label).to(V(
                        row['~to'])).property(id, row['~id'])
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key not in ['~id', '~from', '~to', '~label']:
                            traversal.property(key, value)
                    traversal.next()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
            finally:
                conn.close()

        return add_edges_for_label

    def upsert_edges(self, label):
        """Conditionally adds edges for the rows in a DataFrame partition using the Gremlin coalesce() idiom.
        The DataFrame must contain '~id', '~from', '~to' and '~label' columns. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_edges('ORDER_DETAIL'))
        """
        def upsert_edges_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            try:
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    create_traversal = __.V(row['~from']).addE(label).to(
                        V(row['~to'])).property(id, row['~id'])
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key not in ['~id', '~from', '~to', '~label']:
                            create_traversal.property(key, value)
                    g.E(entries['~id']).fold().coalesce(
                        __.unfold(), create_traversal).next()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
            finally:
                conn.close()

        return upsert_edges_for_label
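
Every upsert_* method above rests on the same Gremlin coalesce() idiom: fold the lookup into a list, then either unfold the existing element or run the creation traversal. A minimal sketch of the idiom in isolation, assuming g and the statics (__, id) are initialised by GremlinUtils as in the class above; 'p-1' is a placeholder id:

# If vertex 'p-1' exists return it, otherwise create it, in one round trip.
g.V('p-1').fold().coalesce(
    __.unfold(),                               # vertex already present
    __.addV('Product').property(id, 'p-1')     # create it otherwise
).next()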
Code example #4
class BatchUtils:
    def __init__(self,
                 endpoints,
                 job_name=None,
                 to_dict=lambda x: x,
                 pool_size=1,
                 **kwargs):

        self.gremlin_utils = GremlinUtils(endpoints)
        self.conn = None
        self.g = None
        self.region = endpoints.region
        self.job_name = job_name
        self.to_dict = to_dict
        self.pool_size = pool_size
        self.kwargs = kwargs

    def close(self):
        try:
            self.gremlin_utils.close()
        except Exception:
            pass

    def __execute_batch_internal(self, rows, operations, **kwargs):
        @backoff.on_exception(backoff.constant,
                              tuple(retriable_errors),
                              max_tries=5,
                              giveup=is_non_retriable_error,
                              on_backoff=reset_connection_if_connection_issue,
                              on_success=publish_metrics,
                              interval=2,
                              jitter=backoff.full_jitter)
        def execute(self, rows, operations, **kwargs):

            if not self.conn:
                self.conn = self.gremlin_utils.remote_connection(
                    pool_size=self.pool_size, **self.kwargs)
                self.g = self.gremlin_utils.traversal_source(
                    connection=self.conn)

            t = self.g
            for operation in operations:
                for row in rows:
                    t = operation(t, row, **kwargs)
            t.next()

        return execute(self, rows, operations, **kwargs)

    def execute_batch(self, rows, operations=None, batch_size=50, **kwargs):

        # Avoid the mutable-default-argument pitfall
        operations = operations if operations is not None else []

        if 'mappings' not in kwargs:
            kwargs['mappings'] = Mappings()

        rows_list = []

        for row in rows:
            rows_list.append(self.to_dict(row))
            if len(rows_list) == batch_size:
                self.__execute_batch_internal(rows_list, operations, **kwargs)
                rows_list = []

        if rows_list:
            self.__execute_batch_internal(rows_list, operations, **kwargs)

    def add_vertices(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows,
                               operations=[add_vertex],
                               batch_size=batch_size,
                               **kwargs)

        return batch_op(rows) if rows else batch_op

    def upsert_vertices(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            operations = [upsert_vertex]
            on_upsert = kwargs.get('on_upsert', None)
            if on_upsert == 'replaceAllProperties':
                operations.append(replace_vertex_properties)
            self.execute_batch(rows,
                               operations=operations,
                               batch_size=batch_size,
                               **kwargs)

        return batch_op(rows) if rows else batch_op

    def add_edges(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows,
                               operations=[add_edge],
                               batch_size=batch_size,
                               **kwargs)

        return batch_op(rows) if rows else batch_op

    def upsert_edges(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            operations = [upsert_edge]
            on_upsert = kwargs.get('on_upsert', None)
            if on_upsert == 'replaceAllProperties':
                operations.append(replace_edge_properties)
            self.execute_batch(rows,
                               operations=operations,
                               batch_size=batch_size,
                               **kwargs)

        return batch_op(rows) if rows else batch_op

    def add_edge_properties(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows,
                               operations=[add_properties_to_edge],
                               batch_size=batch_size,
                               **kwargs)

        return batch_op(rows) if rows else batch_op
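
The batch methods double as partition functions: called without rows they return a closure suitable for foreachPartition, and execute_batch folds up to batch_size rows into a single traversal before submitting it. A hedged usage sketch, assuming endpoints and the DataFrame df are set up as in the other examples and that the upsert_vertex operation is defined alongside BatchUtils:

batch_utils = BatchUtils(endpoints, to_dict=lambda row: row.asDict(), pool_size=2)

# Upsert vertices 100 rows per request, replacing all properties on a match.
df.foreachPartition(
    batch_utils.upsert_vertices(batch_size=100,
                                on_upsert='replaceAllProperties'))

batch_utils.close()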
Code example #5
class GraphModelClient:
    def __init__(self, endpoint):
        self.gremlin_utils = GremlinUtils(endpoint)
        GremlinUtils.init_statics(globals())

    def insert_new_transaction_vertex_and_edge(self,
                                               tr_dict,
                                               connected_node_dict,
                                               target_id,
                                               vertex_type='Transaction'):
        """Load transaction data: insert the transaction object and its related
        domain objects into the graph DB as vertices, with their properties as
        values, and insert their relations as edges.

        Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connected_node_dict, target_id, vertex_type='Transaction')
        """
        def insert_attr(graph_conn, attr_val_dict, target_id, node_id,
                        vertex_type):

            if not g.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                (g.inject(attr_val_dict).unfold().as_(vertex_type)
                 .addV(vertex_type).as_('v').property(id, node_id)
                 .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                             .property(__.select('kv').by(Column.keys),
                                       __.select('kv').by(Column.values)))
                 .iterate())

            # Insert_edge

            to_node = g.V().has(id, node_id).next()
            if (not g.E().has(id, target_id + '-' + node_id).hasNext()):
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(
                    id, target_id + '-' + node_id).iterate()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        if not g.V().has(id, target_id).hasNext():
            logger.info(f'Insert_Vertex: {target_id}.')
            (g.inject(tr_dict).unfold().as_(vertex_type)
             .addV(vertex_type).as_('v').property(id, target_id)
             .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                         .property(__.select('kv').by(Column.keys),
                                   __.select('kv').by(Column.values)))
             .iterate())

        # One zeroed feature slot per embedding dimension (val1..val390)
        empty_node_dict = {f'val{x}': 0.0 for x in range(1, 391)}

        for node_k, node_v in connected_node_dict[0].items():
            node_id = node_k + '-' + str(node_v)
            insert_attr(g, [empty_node_dict],
                        target_id,
                        node_id,
                        vertex_type=node_k)

        conn.close()

    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                              union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction and dump the
        data into a subgraph dict and an n_feats dict.
        subgraph_dict: related transactions' id lists and the values reached through edges
        n_feats dict: embedded element vectors of related 1st-degree vertices and transactions.
        Usually called right after inserting the new test sample's vertex and edges into the graph DB.

        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_list = []
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        ii = 0
        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            ii += 1
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(
                id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            neighbor_list += target_and_conn_node_list
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] *
                                             len(target_and_conn_node_list))
            subgraph_dict['target<>' +
                          feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
        )
        logger.info(
            f'subgraph_dict len: {len(subgraph_dict.keys())}  key: {subgraph_dict.keys()}'
        )
        logger.info(f'subgraph_dict: {subgraph_dict}')
        new_s_t = e_t

        # Anonymous (__) traversals are required for the union() branches below
        union_li = [
            __.V().has(id, target_id).both().hasLabel(label).both().limit(
                MAX_FEATURE_NODE) for label in union_id_cols
        ]
        logger.info(
            f'union_id_cols len: {len(union_id_cols)}  key: {union_id_cols}')
        logger.info(f'union_li len: {len(union_li)}  key: {union_li}')

        # The first union branch is pinned to the 'card1' label; the remaining
        # branches come from union_li (built from union_id_cols above).
        branch_count = 51 if len(union_id_cols) == 51 else 11
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:branch_count]).elementMap().toList()

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        logger.info(f'node_dict len: {len(node_dict)}  key: {node_dict}')

        for item in node_dict:
            node = item.get(list(item)[0])
            node_value = node[(node.find('-') + 1):]
            neighbor_dict[node_value] = [
                item.get(key) for key in transaction_value_cols
            ]

        target_value = target_id[(target_id.find('-') + 1):]
        neighbor_dict[target_value] = [
            tr_dict[0].get(key) for key in transaction_value_cols
        ]

        logger.info(
            f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        logger.info(
            f'neighbor_dict len: {len(neighbor_dict.keys())}  key: {neighbor_dict.keys()}'
        )
        logger.info(f'neighbor_dict: {neighbor_dict}')

        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            attr_dict = [attr_dict.get(key)[-1] for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        transaction_embed_value_dict['target'] = neighbor_dict

        conn.close()

        logger.info(
            f'transaction_embed_value_dict len: {len(transaction_embed_value_dict.keys())} key: {transaction_embed_value_dict.keys()}'
        )
        logger.info(
            f'transaction_embed_value_dict: {transaction_embed_value_dict}')

        return subgraph_dict, transaction_embed_value_dict
Code example #6
    def __init__(self):
        GremlinUtils.init_statics(globals())
        gremlin_utils = GremlinUtils()
        self.neptune_connection = gremlin_utils.remote_connection()
        self.g = gremlin_utils.traversal_source(
            connection=self.neptune_connection)
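
With no Endpoints argument, GremlinUtils falls back to its defaults; assuming the neptune-python-utils behaviour, these are resolved from environment variables, so something along these lines is expected to be set before the constructor runs:

import os

# Assumed to be provided by the execution environment (e.g. Lambda config).
os.environ['NEPTUNE_CLUSTER_ENDPOINT'] = 'my-cluster.cluster-xxxxxxxx.us-east-1.neptune.amazonaws.com'
os.environ['NEPTUNE_CLUSTER_PORT'] = '8182'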
Code example #7
# Simple way of creating endpoints, requires creating a dummy connection in Glue
endpoints = GlueNeptuneConnectionInfo(
    'us-east-1', args['neptune_connection_role']).neptune_endpoints(
        args['neptune_connection_name'])

# Complex way of creating endpoints - no connection required, but needs the Neptune URL
# sts = boto3.client('sts', region_name='us-east-1')
# role = sts.assume_role(RoleArn=role_arn, RoleSessionName='bananananame', DurationSeconds=3600)
# credentials = Credentials(
#     access_key=role['Credentials']['AccessKeyId'],
#     secret_key=role['Credentials']['SecretAccessKey'],
#     token=role['Credentials']['SessionToken'])

gremlin_utils = GremlinUtils(endpoints)
conn = gremlin_utils.remote_connection(show_endpoint=True)
g = gremlin_utils.traversal_source(connection=conn)

print("Endpoints created")

print(g.V().limit(10).valueMap().toList())

print("Sanity checked")

bulkload = BulkLoad(source='s3://co-resource-ingestion-bucket-dev/output-dir/',
                    role=args['neptune_to_s3_role'],
                    region='us-east-1',
                    endpoints=endpoints)

bulkload.load()
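
bulkload.load() blocks until the Neptune bulk loader has ingested the S3 prefix. If the BulkLoad class here follows neptune-python-utils, a non-blocking variant would look roughly like this (a sketch, not confirmed by the example itself):

# Start the load, then poll for completion separately.
load_status = bulkload.load_async()
load_status.wait()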
Code example #8
class GlueGremlinClient:
    def __init__(self, endpoints):

        self.gremlin_utils = GremlinUtils(endpoints)

        GremlinUtils.init_statics(globals())

    # Used as backoff's giveup predicate below: give up (do not retry) unless
    # the server error is a ConcurrentModificationException. Defined without
    # self so it can be referenced directly from the class body.
    def not_cme(e):
        return '"code":"ConcurrentModificationException"' not in str(e)

    @backoff.on_exception(backoff.expo,
                          GremlinServerError,
                          max_tries=5,
                          giveup=not_cme)
    def retry_query(self, query):
        query.next()

    def add_vertices(self, label, batch_size=1):
        """Adds a vertex with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as user-supplied IDs for the new vertices.
        If the DataFrame does not have an '~id' column, Neptune will autogenerate a UUID for each vertex. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_vertices('Product'))
        """
        def add_vertices_for_label(rows):

            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)

            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                t = t.addV(label)
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key == '~id':
                        t = t.property(id, value)
                    elif key == '~label':
                        pass
                    else:
                        t = t.property(key, value)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)

            conn.close()

        return add_vertices_for_label

    def upsert_vertices(self, label, batch_size=1):
        """Conditionally adds vertices for the rows in a DataFrame partition using the Gremlin coalesce() idiom.
        The DataFrame must contain an '~id' column. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_vertices('Product'))
        """
        def upsert_vertices_for_label(rows):

            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)

            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                create_traversal = __.addV(label)
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key == '~id':
                        create_traversal = create_traversal.property(id, value)
                    elif key == '~label':
                        pass
                    else:
                        create_traversal = create_traversal.property(
                            key, value)
                t = t.V(entries['~id']).fold().coalesce(
                    __.unfold(), create_traversal)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)

            conn.close()

        return upsert_vertices_for_label

    def add_edges(self, label, batch_size=1):
        """Adds an edge with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as user-supplied IDs for the new edges.
        If the DataFrame does not have an '~id' column, Neptune will autogenerate a UUID for each edge. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_edges('ORDER_DETAIL'))
        """
        def add_edges_for_label(rows):

            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)

            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                t = t.V(entries['~from']).addE(label).to(V(
                    entries['~to'])).property(id, entries['~id'])
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key not in ['~id', '~from', '~to', '~label']:
                        t = t.property(key, value)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)

            conn.close()

        return add_edges_for_label

    def upsert_edges(self, label, batch_size=1):
        """Conditionally adds edges for the rows in a DataFrame partition using the Gremlin coalesce() idiom.
        The DataFrame must contain '~id', '~from', '~to' and '~label' columns. 
        
        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_edges('ORDER_DETAIL'))
        """
        def upsert_edges_for_label(rows):

            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)

            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                create_traversal = __.V(entries['~from']).addE(label).to(
                    V(entries['~to'])).property(id, entries['~id'])
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key not in ['~id', '~from', '~to', '~label']:
                        create_traversal = create_traversal.property(
                            key, value)
                t = t.V(entries['~from']).outE(label).hasId(
                    entries['~id']).fold().coalesce(__.unfold(),
                                                    create_traversal)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)

            conn.close()

        return upsert_edges_for_label
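
Unlike Code example #3, this client chains up to batch_size rows into one traversal and submits it through retry_query, so ConcurrentModificationException conflicts are retried with exponential backoff. A hedged usage sketch in a Glue job, assuming endpoints and dynamicframe exist as in the other examples:

neptune = GlueGremlinClient(endpoints)

# Upsert 50 products per Gremlin request instead of one at a time.
dynamicframe.toDF().foreachPartition(
    neptune.upsert_vertices('Product', batch_size=50))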
Code example #9
class GraphModelClient:
    def __init__(self, endpoint):
        self.gremlin_utils = GremlinUtils(endpoint)
        GremlinUtils.init_statics(globals())

    def insert_new_transaction_vertex_and_edge(self,
                                               tr_dict,
                                               connected_node_dict,
                                               target_id,
                                               vertex_type='Transaction'):
        """Load transaction data: insert the transaction object and its related
        domain objects into the graph DB as vertices, with their properties as
        values, and insert their relations as edges.

        Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connected_node_dict, target_id, vertex_type='Transaction')
        """
        def insert_attr(graph_conn, attr_val_dict, target_id, node_id,
                        vertex_type):

            if not g.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                (g.inject(attr_val_dict).unfold().as_(vertex_type)
                 .addV(vertex_type).as_('v').property(id, node_id)
                 .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                             .property(Cardinality.single,
                                       __.select('kv').by(Column.keys),
                                       __.select('kv').by(Column.values)))
                 .iterate())
            else:
                logger.debug(
                    f'Ignore inserting existing Vertex with id {node_id}')

            # Insert_edge

            to_node = g.V().has(id, node_id).next()
            edgeId = target_id + '-' + node_id
            if (not g.E().has(id, edgeId).hasNext()):
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(
                    id, edgeId).iterate()
            else:
                logger.debug(
                    f'Ignore inserting existing edge with id {edgeId}')

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        if not g.V().has(id, target_id).hasNext():
            logger.info(f'Insert_Vertex: {target_id}.')
            (g.inject(tr_dict).unfold().as_(vertex_type)
             .addV(vertex_type).as_('v').property(id, target_id)
             .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                         .property(Cardinality.single,
                                   __.select('kv').by(Column.keys),
                                   __.select('kv').by(Column.values)))
             .iterate())

        cols = {'val' + str(i + 1): '0.0' for i in range(390)}
        for node_k, node_v in connected_node_dict[0].items():
            node_id = node_k + '-' + str(node_v)
            empty_node_dict = {}
            empty_node_dict[attr_version_key] = json.dumps(cols)
            empty_node_dict = [empty_node_dict]
            insert_attr(g,
                        empty_node_dict,
                        target_id,
                        node_id,
                        vertex_type=node_k)

        conn.close()

    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                              union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction and dump the
        data into a subgraph dict and an n_feats dict.
        subgraph_dict: related transactions' id lists and the values reached through edges
        n_feats dict: embedded element vectors of related 1st-degree vertices and transactions.
        Usually called right after inserting the new test sample's vertex and edges into the graph DB.

        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_list = []
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        ii = 0
        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            ii += 1
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(
                id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            neighbor_list += target_and_conn_node_list
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] *
                                             len(target_and_conn_node_list))
            subgraph_dict['target<>' +
                          feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
        )
        new_s_t = e_t

        union_li = [
            __.V().has(id, target_id).both().hasLabel(label).both().limit(
                MAX_FEATURE_NODE) for label in union_id_cols
        ]

        # The first union branch is pinned to the 'card1' label; the remaining
        # branches come from union_li (built from union_id_cols above).
        branch_count = 51 if len(union_id_cols) == 51 else 11
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:branch_count]).elementMap().toList()

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        logger.debug(f'Found {len(node_dict)} nodes from graph dbs...')

        class Item:
            """Hashable wrapper so elementMap() rows can be de-duplicated with set()."""
            def __init__(self, item):
                self.item = item

            def __hash__(self):
                return hash(self.item.get(list(self.item)[0]))

            def __eq__(self, other):
                if isinstance(other, self.__class__):
                    return self.__hash__() == other.__hash__()
                else:
                    return NotImplemented

            def __repr__(self):
                return "Item(%s)" % (self.item)

        node_dict = list(set([Item(node) for node in node_dict]))
        logger.debug(f'Found {len(node_dict)} nodes without duplication')
        for item in node_dict:
            item = item.item
            node = item.get(list(item)[0])
            node_value = node[(node.find('-') + 1):]
            try:
                logger.debug(
                    f'the props of node {node} is {item.get(attr_version_key)}'
                )
                jsonVal = json.loads(item.get(attr_version_key))
                neighbor_dict[node_value] = [
                    jsonVal[key] for key in transaction_value_cols
                ]
                logger.debug(
                    f'neighbor pair is {node_value}, {neighbor_dict[node_value]}'
                )
            except json.JSONDecodeError:
                logger.warning(
                    f'Malformed node value: {node} is {item.get(attr_version_key)}, run the command below to remove it'
                )
                logger.info(f'g.V(\'{node}\').drop()')

        target_value = target_id[(target_id.find('-') + 1):]
        jsonVal = json.loads(tr_dict[0].get(attr_version_key))
        neighbor_dict[target_value] = [
            jsonVal[key] for key in transaction_value_cols
        ]

        logger.info(
            f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )

        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            logger.debug(f'attr is {attr}, dict is {attr_dict}')
            jsonVal = json.loads(attr_dict.get(attr_version_key)[0])
            attr_dict = [float(jsonVal[key]) for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        transaction_embed_value_dict['target'] = neighbor_dict

        conn.close()

        return subgraph_dict, transaction_embed_value_dict
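
A hedged end-to-end sketch of how the two methods fit together; the endpoint value, tr_dict, connected_node_dict and the column lists are placeholders, and MAX_FEATURE_NODE and attr_version_key are assumed to be module-level constants:

client = GraphModelClient(Endpoints(neptune_endpoint='<cluster-endpoint>'))

# 1. Insert the incoming transaction plus its related feature vertices/edges.
client.insert_new_transaction_vertex_and_edge(
    tr_dict, connected_node_dict, target_id='t-3661635')

# 2. Pull the 2nd-degree subgraph and feature vectors back out for inference.
subgraph_dict, n_feats = client.query_target_subgraph(
    't-3661635', tr_dict, transaction_value_cols, union_id_cols, dummied_col)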