Example #1
0
    def get_neighbors(self, entity):
        """
        Return every vertex adjacent to the node identified by `entity`.
            :param entity: the entity id of a vertex
            :returns: a list of neighbors attributes
        """
        # (projected key, underlying vertex property) pairs; note the
        # projection key 'prenom_nom' reads the stored property 'prenomnom'
        projection = [
            ('entity', 'entity'),
            ('prenom', 'prenom'),
            ('nom', 'nom'),
            ('prenom_nom', 'prenomnom'),
            ('date_naissance', 'date_naissance'),
            ('pays_code', 'pays_code'),
            ('code_postal', 'code_postal'),
            ('numero_piece_identite', 'numero_piece_identite'),
            ('star', 'star'),
            ('degree', 'degree'),
            ('in_degree_weighted', 'in_degree_weighted'),
            ('out_degree_weighted', 'out_degree_weighted'),
        ]

        # locate the vertex identified by `entity`, then collect it together
        # with every vertex reachable through any incident edge
        traversal = self.g.V().has('entity', entity).bothE().bothV().dedup()

        # enrich each vertex with computed degree attributes
        traversal = traversal \
            .property('degree', __.both().dedup().count()) \
            .property('in_degree_weighted',
                      __.inE().values('valeur_euro').sum()) \
            .property('out_degree_weighted',
                      __.outE().values('valeur_euro').sum())

        # keep only the attributes listed in the projection table
        traversal = traversal.project(*[key for key, _ in projection])
        for _, prop in projection:
            traversal = traversal.by(prop)

        return traversal.toList()
Example #2
0
 def search(self, prenom_nom):
     """
     Search all nodes whose attribute `prenomnom` matches the query.
         :param prenom_nom: the text query
         :returns: a list of dicts with the projected vertex attributes
             (empty when the index query matches nothing)
     """
     tokens = prenom_nom.strip().split()
     str_query = es_fuzzy_string_query(tokens)
     # Use JanusGraph direct index query
     # https://docs.janusgraph.org/latest/direct-index-query.html
     # NOTE(review): str_query is interpolated into a server-side Groovy
     # script -- es_fuzzy_string_query must escape quotes/metacharacters;
     # confirm it sanitizes user input.
     # Implicit string concatenation (instead of a backslash continuation
     # inside the literal) avoids embedding stray indentation spaces in
     # the submitted script.
     query = ("graph.indexQuery('vertexByPrenomNom', 'v.prenomnom:%s')"
              ".limit(10).vertices()" % str_query)
     client = self.connection._client
     vertices_and_score = client.submit(query).all().result()
     vertices = [v["element"] for v in vertices_and_score]
     search_results = []
     if vertices:
         # find all the matching vertices
         traversal = self.g.V(vertices)
         # add the attribute `degree` on each vertex
         traversal = traversal.property('degree', __.both().dedup().count())
         # select attributes; 'prenom_nom' is projected from the stored
         # property 'prenomnom'
         traversal = traversal \
             .project(
                 'entity',
                 'prenom_nom',
                 'prenom',
                 'nom',
                 'code_postal',
                 'pays_code',
                 'numero_piece_identite',
                 'degree') \
             .by('entity') \
             .by('prenomnom') \
             .by('prenom') \
             .by('nom') \
             .by('code_postal') \
             .by('pays_code') \
             .by('numero_piece_identite') \
             .by('degree')
         search_results.extend(traversal.toList())
     return search_results
Example #3
0
    def test_traversals(self, remote_connection):
        """Exercise core traversal steps against the remote 'modern' toy graph.

        Assumes the server hosts the TinkerPop 'modern' sample graph
        (6 vertices: marko, vadas, lop, josh, ripple, peter).
        """
        # inject gremlin statics (long, P, T, Bindings, ...) into globals
        statics.load_statics(globals())
        g = traversal().withRemote(remote_connection)

        # vertex count and basic V() lookups / terminal steps
        assert long(6) == g.V().count().toList()[0]
        # #
        assert Vertex(1) == g.V(1).next()
        assert Vertex(1) == g.V(Vertex(1)).next()
        assert 1 == g.V(1).id_().next()
        assert Traverser(Vertex(1)) == g.V(1).nextTraverser()
        assert 1 == len(g.V(1).toList())
        assert isinstance(g.V(1).toList(), list)
        # repeat/times plus attribute-style values access (.name)
        results = g.V().repeat(__.out()).times(2).name
        results = results.toList()
        assert 2 == len(results)
        assert "lop" in results
        assert "ripple" in results
        # # slicing syntax translates to range()
        assert 10 == g.V().repeat(__.both()).times(5)[0:10].count().next()
        assert 1 == g.V().repeat(__.both()).times(5)[0:1].count().next()
        assert 0 == g.V().repeat(__.both()).times(5)[0:0].count().next()
        assert 4 == g.V()[2:].count().next()
        assert 2 == g.V()[:2].count().next()
        # # side-effects referenced from within a where() predicate
        results = g.withSideEffect(
            'a', ['josh', 'peter'
                  ]).V(1).out('created').in_('created').values('name').where(
                      P.within('a')).toList()
        assert 2 == len(results)
        assert 'josh' in results
        assert 'peter' in results
        # # profile() returns a single metrics result
        results = g.V().out().profile().toList()
        assert 1 == len(results)
        assert 'metrics' in results[0]
        assert 'dur' in results[0]
        # # select() with by(valueMap()) modulation
        results = g.V().has('name',
                            'peter').as_('a').out('created').as_('b').select(
                                'a', 'b').by(__.valueMap()).toList()
        assert 1 == len(results)
        assert 'peter' == results[0]['a']['name'][0]
        assert 35 == results[0]['a']['age'][0]
        assert 'lop' == results[0]['b']['name'][0]
        assert 'java' == results[0]['b']['lang'][0]
        assert 2 == len(results[0]['a'])
        assert 2 == len(results[0]['b'])
        # # inject() a detached vertex into a running traversal
        results = g.V(1).inject(g.V(2).next()).values('name').toList()
        assert 2 == len(results)
        assert 'marko' in results
        assert 'vadas' in results
        # # lambda sent to the server as a gremlin-groovy closure
        results = g.V().has('person', 'name', 'marko').map(
            lambda: ("it.get().value('name')", "gremlin-groovy")).toList()
        assert 1 == len(results)
        assert 'marko' in results
        # #
        # this test just validates that the underscored versions of steps conflicting with Gremlin work
        # properly and can be removed when the old steps are removed - TINKERPOP-2272
        results = g.V().filter_(
            __.values('age').sum_().and_(__.max_().is_(P.gt(0)),
                                         __.min_().is_(P.gt(0)))).range_(
                                             0, 1).id_().next()
        assert 1 == results
        # #
        # test binding in P
        results = g.V().has('person', 'age',
                            Bindings.of('x', P.lt(30))).count().next()
        assert 2 == results
        # #
        # test dict keys which can only work on GraphBinary and GraphSON3 which include specific serialization
        # types for dict
        if not isinstance(remote_connection._client._message_serializer,
                          GraphSONSerializersV2d0):
            results = g.V().has(
                'person', 'name',
                'marko').elementMap("name").groupCount().next()
            assert {
                HashableDict.of({
                    T.id: 1,
                    T.label: 'person',
                    'name': 'marko'
                }): 1
            } == results
        if not isinstance(remote_connection._client._message_serializer,
                          GraphSONSerializersV2d0):
            results = g.V().has('person', 'name',
                                'marko').both('knows').groupCount().by(
                                    __.values('name').fold()).next()
            assert {tuple(['vadas']): 1, tuple(['josh']): 1} == results
Example #4
0
    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                              union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction.

        Dumps data into a subgraph dict and an n_feats dict:
        subgraph_dict: related transactions' id lists and edge feature values
        n_feats dict (transaction_embed_value_dict): related 1-degree vertices
            and transactions' embedded element vectors.
        Usually called after the new test sample's vertex and edges have been
        inserted into the graph DB.

        :param target_id: target vertex id, formatted '<feature>-<value>'
        :param tr_dict: list whose first element maps attribute keys to values
            for the target transaction
        :param transaction_value_cols: attribute keys to extract per neighbor
        :param union_id_cols: vertex labels through which 2nd-degree
            neighbors are collected
        :param dummied_col: unused here; kept for interface compatibility
        :returns: tuple (subgraph_dict, transaction_embed_value_dict)

        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        # vertex ids are '<feature-name>-<value>'; keep only the value part
        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(
                id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            # target plus its connected transaction ids, de-duplicated
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] *
                                             len(target_and_conn_node_list))
            subgraph_dict['target<>' +
                          feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
        )
        logger.info(
            f'subgraph_dict len: {len(subgraph_dict.keys())}  key: {subgraph_dict.keys()}'
        )
        logger.info(f'subgraph_dict: {subgraph_dict}')
        new_s_t = e_t

        # anonymous child traversals (one per id column) fanning out to the
        # 2nd-degree neighborhood; anonymous __.V() (not a second live
        # traversal source) is the correct form for union() children
        union_li = [
            __.V().has(id, target_id).both().hasLabel(label).both().limit(
                MAX_FEATURE_NODE) for label in union_id_cols
        ]
        logger.info(
            f'union_id_cols len: {len(union_id_cols)}  key: {union_id_cols}')
        logger.info(f'union_li len: {len(union_li)}  key: {union_li}')

        # the first column ('card1') is spelled out explicitly; the remaining
        # branches are passed by unpacking, which replaces the former
        # hand-written 51/11-argument union and no longer raises IndexError
        # when fewer than 11 columns are supplied
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:]).elementMap().toList()

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        logger.info(f'node_dict len: {len(node_dict)}  key: {node_dict}')

        # first value of each element map is the vertex id '<feature>-<value>'
        for item in node_dict:
            node = item.get(list(item)[0])
            node_value = node[(node.find('-') + 1):]
            neighbor_dict[node_value] = [
                item.get(key) for key in transaction_value_cols
            ]

        target_value = target_id[(target_id.find('-') + 1):]
        neighbor_dict[target_value] = [
            tr_dict[0].get(key) for key in transaction_value_cols
        ]

        # take a fresh timestamp: the previous log reused a stale e_t with
        # new_s_t == e_t (always ~0 s) and was mislabeled 'node_dict'
        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t
        logger.info(
            f'neighbor_dict len: {len(neighbor_dict.keys())}  key: {neighbor_dict.keys()}'
        )
        logger.info(f'neighbor_dict: {neighbor_dict}')

        # embedded feature vectors: properties val1..val390 of each 1-degree
        # feature vertex
        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            attr_dict = [attr_dict.get(key)[-1] for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        transaction_embed_value_dict['target'] = neighbor_dict

        conn.close()

        logger.info(
            f'transaction_embed_value_dict len: {len(transaction_embed_value_dict.keys())} key: {transaction_embed_value_dict.keys()}'
        )
        logger.info(
            f'transaction_embed_value_dict: {transaction_embed_value_dict}')

        return subgraph_dict, transaction_embed_value_dict
    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                              union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction.

        Dumps data into a subgraph dict and an n_feats dict:
        subgraph_dict: related transactions' id lists and edge feature values
        n_feats dict (transaction_embed_value_dict): related 1-degree vertices
            and transactions' embedded element vectors.
        Usually called after the new test sample's vertex and edges have been
        inserted into the graph DB. Unlike the plain-property variant, this
        revision reads each vertex's features from a JSON blob stored under
        the module-level `attr_version_key` property.

        :param target_id: target vertex id, formatted '<feature>-<value>'
        :param tr_dict: list whose first element maps attribute keys
            (including attr_version_key) to values for the target transaction
        :param transaction_value_cols: attribute keys to extract per neighbor
        :param union_id_cols: vertex labels through which 2nd-degree
            neighbors are collected
        :param dummied_col: unused here; kept for interface compatibility
        :returns: tuple (subgraph_dict, transaction_embed_value_dict)

        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        # vertex ids are '<feature-name>-<value>'; keep only the value part
        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(
                id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            # target plus its connected transaction ids, de-duplicated
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] *
                                             len(target_and_conn_node_list))
            subgraph_dict['target<>' +
                          feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
        )
        new_s_t = e_t

        # anonymous child traversals (one per id column) fanning out to the
        # 2nd-degree neighborhood
        union_li = [
            __.V().has(id, target_id).both().hasLabel(label).both().limit(
                MAX_FEATURE_NODE) for label in union_id_cols
        ]

        # the first column ('card1') is spelled out explicitly; the remaining
        # branches are passed by unpacking, which replaces the former
        # hand-written 51/11-argument union and no longer raises IndexError
        # when fewer than 11 columns are supplied
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:]).elementMap().toList()

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        logger.debug(f'Found {len(node_dict)} nodes from graph dbs...')

        # De-duplicate on each element map's first value (the vertex id).
        # A dict keyed on that id keeps the first occurrence; this replaces
        # the former nested Item wrapper whose __eq__ compared hashes and
        # could therefore merge distinct vertices on a hash collision.
        unique_nodes = {}
        for elem_map in node_dict:
            unique_nodes.setdefault(elem_map.get(list(elem_map)[0]), elem_map)
        logger.debug(f'Found {len(unique_nodes)} nodes without duplication')
        for node, item in unique_nodes.items():
            node_value = node[(node.find('-') + 1):]
            try:
                logger.debug(
                    f'the props of node {node} is {item.get(attr_version_key)}'
                )
                jsonVal = json.loads(item.get(attr_version_key))
                neighbor_dict[node_value] = [
                    jsonVal[key] for key in transaction_value_cols
                ]
                logger.debug(
                    f'neighbor pair is {node_value}, {neighbor_dict[node_value]}'
                )
            except json.JSONDecodeError:
                # best-effort: skip malformed vertices but tell the operator
                # how to remove them (logger.warn is deprecated -> warning)
                logger.warning(
                    f'Malform node value {node} is {item.get(attr_version_key)}, run below cmd to remove it'
                )
                logger.info(f'g.V(\'{node}\').drop()')

        target_value = target_id[(target_id.find('-') + 1):]
        jsonVal = json.loads(tr_dict[0].get(attr_version_key))
        neighbor_dict[target_value] = [
            jsonVal[key] for key in transaction_value_cols
        ]

        # take a fresh timestamp: the previous log reused a stale e_t with
        # new_s_t == e_t, so it always reported ~0 seconds
        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
        )
        new_s_t = e_t

        # embedded feature vectors: keys val1..val390 of the JSON blob stored
        # on each 1-degree feature vertex
        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            logger.debug(f'attr is {attr}, dict is {attr_dict}')
            jsonVal = json.loads(attr_dict.get(attr_version_key)[0])
            attr_dict = [float(jsonVal[key]) for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(
            f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
        )

        transaction_embed_value_dict['target'] = neighbor_dict

        conn.close()

        return subgraph_dict, transaction_embed_value_dict