def get_neighbors(self, entity):
    """
    Returns all the neighbors of a node
    :param entity: the entity id of a vertex
    :returns: a list of neighbor attributes
    """
    # find the node identified by `entity`
    traversal = self.g.V().has('entity', entity)
    # find the neighbors of this node
    traversal = traversal.bothE().bothV().dedup()
    # compute extra attributes
    traversal = traversal \
        .property('degree', __.both().dedup().count()) \
        .property('in_degree_weighted', __.inE().values('valeur_euro').sum()) \
        .property('out_degree_weighted', __.outE().values('valeur_euro').sum())
    # select only specific attributes
    traversal = traversal.project(
        'entity', 'prenom', 'nom', 'prenom_nom', 'date_naissance',
        'pays_code', 'code_postal', 'numero_piece_identite', 'star',
        'degree', 'in_degree_weighted', 'out_degree_weighted') \
        .by('entity') \
        .by('prenom') \
        .by('nom') \
        .by('prenomnom') \
        .by('date_naissance') \
        .by('pays_code') \
        .by('code_postal') \
        .by('numero_piece_identite') \
        .by('star') \
        .by('degree') \
        .by('in_degree_weighted') \
        .by('out_degree_weighted')
    neighbors = traversal.toList()
    return neighbors
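
# Minimal usage sketch (hypothetical: `graph` stands in for an instance of the class that
# owns get_neighbors, with `graph.g` bound to a JanusGraph traversal source; the entity id
# is made up).
#
#   neighbors = graph.get_neighbors(entity='some-entity-id')
#   for n in neighbors:
#       print(n['prenom_nom'], n['degree'],
#             n['in_degree_weighted'], n['out_degree_weighted'])
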
def search(self, prenom_nom):
    """
    Search all nodes whose attribute `prenomnom` matches the query
    :param prenom_nom: the text query
    """
    tokens = prenom_nom.strip().split()
    str_query = es_fuzzy_string_query(tokens)
    # Use JanusGraph direct index query
    # https://docs.janusgraph.org/latest/direct-index-query.html
    query = ("graph.indexQuery('vertexByPrenomNom', 'v.prenomnom:%s')"
             ".limit(10).vertices()") % str_query
    client = self.connection._client
    vertices_and_score = client.submit(query).all().result()
    vertices = [v["element"] for v in vertices_and_score]
    search_results = []
    if vertices:
        # find all the matching vertices
        traversal = self.g.V(vertices)
        # add the attribute `degree` on each vertex
        traversal = traversal.property('degree', __.both().dedup().count())
        # select attributes
        traversal = traversal \
            .project(
                'entity', 'prenom_nom', 'prenom', 'nom', 'code_postal',
                'pays_code', 'numero_piece_identite', 'degree') \
            .by('entity') \
            .by('prenomnom') \
            .by('prenom') \
            .by('nom') \
            .by('code_postal') \
            .by('pays_code') \
            .by('numero_piece_identite') \
            .by('degree')
        search_results.extend(traversal.toList())
    return search_results
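
# The helper `es_fuzzy_string_query` is not shown above. A minimal sketch of what such a
# helper might do, assuming it builds a Lucene-style fuzzy query string for the direct
# index query (this is an assumption about its behaviour, not the original implementation):

def es_fuzzy_string_query(tokens):
    # e.g. ['jean', 'dupont'] -> 'jean~ dupont~' (fuzzy match on every token)
    return ' '.join('%s~' % token for token in tokens)

# Usage sketch (hypothetical `graph` instance of the owning class):
#   results = graph.search('jean dupont')
#   for r in results:
#       print(r['entity'], r['prenom_nom'], r['degree'])
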
def test_traversals(self, remote_connection):
    statics.load_statics(globals())
    g = traversal().withRemote(remote_connection)

    assert long(6) == g.V().count().toList()[0]
    # #
    assert Vertex(1) == g.V(1).next()
    assert Vertex(1) == g.V(Vertex(1)).next()
    assert 1 == g.V(1).id_().next()
    assert Traverser(Vertex(1)) == g.V(1).nextTraverser()
    assert 1 == len(g.V(1).toList())
    assert isinstance(g.V(1).toList(), list)
    results = g.V().repeat(__.out()).times(2).name
    results = results.toList()
    assert 2 == len(results)
    assert "lop" in results
    assert "ripple" in results
    # #
    assert 10 == g.V().repeat(__.both()).times(5)[0:10].count().next()
    assert 1 == g.V().repeat(__.both()).times(5)[0:1].count().next()
    assert 0 == g.V().repeat(__.both()).times(5)[0:0].count().next()
    assert 4 == g.V()[2:].count().next()
    assert 2 == g.V()[:2].count().next()
    # #
    results = g.withSideEffect(
        'a', ['josh', 'peter']).V(1).out('created').in_('created').values('name').where(
            P.within('a')).toList()
    assert 2 == len(results)
    assert 'josh' in results
    assert 'peter' in results
    # #
    results = g.V().out().profile().toList()
    assert 1 == len(results)
    assert 'metrics' in results[0]
    assert 'dur' in results[0]
    # #
    results = g.V().has('name', 'peter').as_('a').out('created').as_('b').select(
        'a', 'b').by(__.valueMap()).toList()
    assert 1 == len(results)
    assert 'peter' == results[0]['a']['name'][0]
    assert 35 == results[0]['a']['age'][0]
    assert 'lop' == results[0]['b']['name'][0]
    assert 'java' == results[0]['b']['lang'][0]
    assert 2 == len(results[0]['a'])
    assert 2 == len(results[0]['b'])
    # #
    results = g.V(1).inject(g.V(2).next()).values('name').toList()
    assert 2 == len(results)
    assert 'marko' in results
    assert 'vadas' in results
    # #
    results = g.V().has('person', 'name', 'marko').map(
        lambda: ("it.get().value('name')", "gremlin-groovy")).toList()
    assert 1 == len(results)
    assert 'marko' in results
    # #
    # this test just validates that the underscored versions of steps conflicting with Gremlin work
    # properly and can be removed when the old steps are removed - TINKERPOP-2272
    results = g.V().filter_(
        __.values('age').sum_().and_(__.max_().is_(P.gt(0)),
                                     __.min_().is_(P.gt(0)))).range_(0, 1).id_().next()
    assert 1 == results
    # #
    # test binding in P
    results = g.V().has('person', 'age', Bindings.of('x', P.lt(30))).count().next()
    assert 2 == results
    # #
    # test dict keys which can only work on GraphBinary and GraphSON3 which include specific serialization
    # types for dict
    if not isinstance(remote_connection._client._message_serializer, GraphSONSerializersV2d0):
        results = g.V().has(
            'person', 'name', 'marko').elementMap("name").groupCount().next()
        assert {HashableDict.of({T.id: 1, T.label: 'person', 'name': 'marko'}): 1} == results
    if not isinstance(remote_connection._client._message_serializer, GraphSONSerializersV2d0):
        results = g.V().has('person', 'name', 'marko').both('knows').groupCount().by(
            __.values('name').fold()).next()
        assert {tuple(['vadas']): 1, tuple(['josh']): 1} == results
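
# A minimal sketch of the `remote_connection` fixture this test expects: a gremlin-python
# DriverRemoteConnection pointing at a Gremlin Server that hosts the TinkerPop "modern" toy
# graph (6 vertices: marko, vadas, lop, josh, ripple, peter). The URL and traversal source
# name below are assumptions, not taken from the original test module.

import pytest
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection


@pytest.fixture
def remote_connection():
    # adjust the URL and traversal source name to match your own Gremlin Server setup
    conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'gmodern')
    yield conn
    conn.close()
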
def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                          union_id_cols, dummied_col):
    """Extract the 2nd-degree subgraph of the target transaction and dump the data
    into a subgraph dict and an n_feats dict.

    subgraph_dict: related transactions' id lists and the feature values carried through edges
    n_feats dict: embedded element vectors of related 1st-degree vertices and transactions

    Usually called after the new test sample's vertex and edges have been inserted
    into the graph DB.

    Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
    """
    subgraph_dict = {}
    neighbor_list = []
    neighbor_dict = {}
    transaction_embed_value_dict = {}

    ii = 0
    s_t = dt.now()

    conn = self.gremlin_utils.remote_connection()
    g = self.gremlin_utils.traversal_source(connection=conn)
    t1 = self.gremlin_utils.traversal_source(connection=conn)

    # vertex ids follow the '<name>-<value>' convention; strip the prefix
    target_name = target_id[(target_id.find('-') + 1):]
    feature_list = g.V().has(id, target_id).out().id().toList()
    for feat in feature_list:
        ii += 1
        feat_name = feat[:feat.find('-')]
        feat_value = feat[(feat.find('-') + 1):]
        node_list = g.V().has(
            id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
        target_and_conn_node_list = [int(target_name)] + [
            int(target_conn_node[(target_conn_node.find('-') + 1):])
            for target_conn_node in node_list
        ]
        target_and_conn_node_list = list(set(target_and_conn_node_list))
        neighbor_list += target_and_conn_node_list
        nodes_and_feature_value_array = (target_and_conn_node_list,
                                         [feat_value] * len(target_and_conn_node_list))
        subgraph_dict['target<>' + feat_name] = nodes_and_feature_value_array

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
    )
    logger.info(
        f'subgraph_dict len: {len(subgraph_dict.keys())} key: {subgraph_dict.keys()}'
    )
    logger.info(f'subgraph_dict: {subgraph_dict}')
    new_s_t = e_t

    union_li = [
        t1.V().has(id, target_id).both().hasLabel(label).both().limit(
            MAX_FEATURE_NODE) for label in union_id_cols
    ]
    logger.info(
        f'union_id_cols len: {len(union_id_cols)} key: {union_id_cols}')
    logger.info(f'union_li len: {len(union_li)} key: {union_li}')

    # union the 'card1' branch with the branches built for the remaining id columns
    if len(union_id_cols) == 51:
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:51]).elementMap().toList()
    else:
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:11]).elementMap().toList()

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
    )
    new_s_t = e_t
    logger.info(f'node_dict len: {len(node_dict)} key: {node_dict}')

    for item in node_dict:
        node = item.get(list(item)[0])
        node_value = node[(node.find('-') + 1):]
        neighbor_dict[node_value] = [
            item.get(key) for key in transaction_value_cols
        ]
    target_value = target_id[(target_id.find('-') + 1):]
    neighbor_dict[target_value] = [
        tr_dict[0].get(key) for key in transaction_value_cols
    ]
    logger.info(
        f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
    )
    logger.info(
        f'neighbor_dict len: {len(neighbor_dict.keys())} key: {neighbor_dict.keys()}'
    )
    logger.info(f'neighbor_dict: {neighbor_dict}')

    attr_cols = ['val' + str(x) for x in range(1, 391)]
    for attr in feature_list:
        attr_name = attr[:attr.find('-')]
        attr_value = attr[(attr.find('-') + 1):]
        attr_dict = g.V().has(id, attr).valueMap().toList()[0]
        attr_dict = [attr_dict.get(key)[-1] for key in attr_cols]
        attr_input_dict = {}
        attr_input_dict[attr_value] = attr_dict
        transaction_embed_value_dict[attr_name] = attr_input_dict

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
    )
    new_s_t = e_t
    transaction_embed_value_dict['target'] = neighbor_dict

    conn.close()
    logger.info(
        f'transaction_embed_value_dict len: {len(transaction_embed_value_dict.keys())} key: {transaction_embed_value_dict.keys()}'
    )
    logger.info(
        f'transaction_embed_value_dict: {transaction_embed_value_dict}')
    return subgraph_dict, transaction_embed_value_dict
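
# The slicing with find('-') above relies on vertex ids of the form '<property>-<value>'.
# A small illustrative helper, not part of the original module (example ids are made up):

def split_vertex_id(vertex_id):
    """Split a '<name>-<value>' vertex id at the first dash."""
    sep = vertex_id.find('-')
    return vertex_id[:sep], vertex_id[sep + 1:]

# split_vertex_id('card1-9500')              -> ('card1', '9500')
# split_vertex_id('TransactionID-3661635')   -> ('TransactionID', '3661635')
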
def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols,
                          union_id_cols, dummied_col):
    """Extract the 2nd-degree subgraph of the target transaction and dump the data
    into a subgraph dict and an n_feats dict.

    subgraph_dict: related transactions' id lists and the feature values carried through edges
    n_feats dict: embedded element vectors of related 1st-degree vertices and transactions

    Usually called after the new test sample's vertex and edges have been inserted
    into the graph DB.

    Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
    """
    subgraph_dict = {}
    neighbor_list = []
    neighbor_dict = {}
    transaction_embed_value_dict = {}

    ii = 0
    s_t = dt.now()

    conn = self.gremlin_utils.remote_connection()
    g = self.gremlin_utils.traversal_source(connection=conn)

    # vertex ids follow the '<name>-<value>' convention; strip the prefix
    target_name = target_id[(target_id.find('-') + 1):]
    feature_list = g.V().has(id, target_id).out().id().toList()
    for feat in feature_list:
        ii += 1
        feat_name = feat[:feat.find('-')]
        feat_value = feat[(feat.find('-') + 1):]
        node_list = g.V().has(
            id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
        target_and_conn_node_list = [int(target_name)] + [
            int(target_conn_node[(target_conn_node.find('-') + 1):])
            for target_conn_node in node_list
        ]
        target_and_conn_node_list = list(set(target_and_conn_node_list))
        neighbor_list += target_and_conn_node_list
        nodes_and_feature_value_array = (target_and_conn_node_list,
                                         [feat_value] * len(target_and_conn_node_list))
        subgraph_dict['target<>' + feat_name] = nodes_and_feature_value_array

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds'
    )
    new_s_t = e_t

    union_li = [
        __.V().has(id, target_id).both().hasLabel(label).both().limit(
            MAX_FEATURE_NODE) for label in union_id_cols
    ]

    # union the 'card1' branch with the branches built for the remaining id columns
    if len(union_id_cols) == 51:
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:51]).elementMap().toList()
    else:
        node_dict = g.V().has(id, target_id).union(
            __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
            *union_li[1:11]).elementMap().toList()

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.'
    )
    new_s_t = e_t
    logger.debug(f'Found {len(node_dict)} nodes from graph dbs...')

    class Item():
        """Hashable wrapper used to deduplicate elementMap() results by vertex id."""

        def __init__(self, item):
            self.item = item

        def __hash__(self):
            return hash(self.item.get(list(self.item)[0]))

        def __eq__(self, other):
            if isinstance(other, self.__class__):
                return self.__hash__() == other.__hash__()
            else:
                return NotImplemented

        def __repr__(self):
            return "Item(%s)" % (self.item)

    node_dict = list(set([Item(node) for node in node_dict]))
    logger.debug(f'Found {len(node_dict)} nodes without duplication')
    for item in node_dict:
        item = item.item
        node = item.get(list(item)[0])
        node_value = node[(node.find('-') + 1):]
        try:
            logger.debug(
                f'the props of node {node} is {item.get(attr_version_key)}'
            )
            jsonVal = json.loads(item.get(attr_version_key))
            neighbor_dict[node_value] = [
                jsonVal[key] for key in transaction_value_cols
            ]
            logger.debug(
                f'neighbor pair is {node_value}, {neighbor_dict[node_value]}'
            )
        except json.JSONDecodeError:
            logger.warning(
                f'Malformed node value {node} is {item.get(attr_version_key)}, run the command below to remove it'
            )
            logger.info(f'g.V(\'{node}\').drop()')

    target_value = target_id[(target_id.find('-') + 1):]
    jsonVal = json.loads(tr_dict[0].get(attr_version_key))
    neighbor_dict[target_value] = [
        jsonVal[key] for key in transaction_value_cols
    ]
    logger.info(
        f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.'
    )

    attr_cols = ['val' + str(x) for x in range(1, 391)]
    for attr in feature_list:
        attr_name = attr[:attr.find('-')]
        attr_value = attr[(attr.find('-') + 1):]
        attr_dict = g.V().has(id, attr).valueMap().toList()[0]
        logger.debug(f'attr is {attr}, dict is {attr_dict}')
        jsonVal = json.loads(attr_dict.get(attr_version_key)[0])
        attr_dict = [float(jsonVal[key]) for key in attr_cols]
        attr_input_dict = {}
        attr_input_dict[attr_value] = attr_dict
        transaction_embed_value_dict[attr_name] = attr_input_dict

    e_t = dt.now()
    logger.info(
        f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.'
    )
    new_s_t = e_t
    transaction_embed_value_dict['target'] = neighbor_dict

    conn.close()
    return subgraph_dict, transaction_embed_value_dict
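
# Hedged illustration of the property layout the try/except block above expects: each vertex
# carries its features as a JSON string stored under `attr_version_key`, mapping column names
# to values. The sample columns below come from the docstring example; the values are made up.

import json

sample_props = json.dumps({'M2_T': 1, 'M3_F': 0, 'M3_T': 1})
sample_cols = ['M2_T', 'M3_F', 'M3_T']
features = [json.loads(sample_props)[key] for key in sample_cols]
# features -> [1, 0, 1]; a malformed string would raise json.JSONDecodeError and be skipped.
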