def test_append_traversal(self) -> None:
    """append_traversal should splice a partial traversal onto a base one."""
    base = __.V().hasLabel('Foo')
    suffix = __.where(__.inE().outV().hasLabel('Bar'))
    result = append_traversal(base, suffix)
    # The appended result must equal the equivalent hand-built chain.
    expected = __.V().hasLabel('Foo').where(
        __.inE().outV().hasLabel('Bar'))
    self.assertEqual(result, expected)
def _table_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    """Build and execute the table search projection query.

    Starts from all Table vertices (optionally restricted to those whose
    'published_tag' matches ``tag_filter``), then projects one flat dict per
    table with database/cluster/schema context, descriptions, column info,
    usage counts, tags, badges and programmatic descriptions, ordered by
    table name ascending.

    :param graph: source to spawn the traversal from
    :param tag_filter: if truthy, only tables with this 'published_tag'
    :return: list of projected dicts, one per matching table
    """
    traversal = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    # Each .by() below supplies the value for the project() keys IN ORDER;
    # do not reorder these modulators independently of the key list.
    traversal = traversal.project('database', 'cluster', 'schema',
                                  'schema_description', 'name', 'key',
                                  'description', 'last_updated_timestamp',
                                  'column_names', 'column_descriptions',
                                  'total_usage', 'unique_usage', 'tags',
                                  'badges', 'programmatic_descriptions')
    # table -> schema -> cluster -> database, taking the database's name
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).out(
                CLUSTER_REVERSE_RELATION_TYPE).values('name'))  # database
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.out(
            TableMetadata.TABLE_SCHEMA_RELATION_TYPE).values('name'))  # schema
    # coalesce() falls back to '' when the schema has no description vertex
    traversal = traversal.by(
        __.coalesce(
            __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
                DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                    'description'), __.constant('')))  # schema_description
    traversal = traversal.by('name')  # name
    traversal = traversal.by(T.id)  # key
    traversal = traversal.by(
        __.coalesce(
            __.out(DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description'), __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
            __.constant('')))  # last_updated_timestamp
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).values(
            'name').fold())  # column_names
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).out(
            DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description').fold())  # column_descriptions
    # sum of read counts over all inbound read edges (0 when none)
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_usage
    # number of distinct read edges, i.e. distinct readers
    traversal = traversal.by(
        __.outE(READ_REVERSE_RELATION_TYPE).count())  # unique_usage
    traversal = traversal.by(
        __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
            METADATA_KEY_PROPERTY_NAME).fold())  # tags
    # NOTE(review): property name 'keys' (plural) looks unusual for a badge
    # key — confirm against the badge vertex schema.
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    # NOTE(review): PROGRAMMATIC_DESCRIPTION_NODE_LABEL (a node label) is
    # used here as an edge label for out() — verify this constant.
    traversal = traversal.by(
        __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL).values(
            'description').fold())  # programmatic_descriptions
    traversal = traversal.order().by(__.select('name'), Order.asc)
    return traversal.toList()
def get_inbound(self, id, search_depth=DEFAULT_SEARCH_DEPTH):
    """Return formatted results of a depth-limited walk along the
    incoming edges of the object identified by ``id``.

    Raises ResourceNotFoundException when ``id`` cannot be resolved.
    """
    resolved = self._validated_vertex_id(id)
    # Guard clause: fail fast on an unresolvable id.
    if resolved is None:
        raise ResourceNotFoundException(f"Unable to resolve Object {id}")
    return _format_graph_results(
        self._depth_search(resolved, __.inE(), search_depth))
def get_neighbors(self, entity):
    """
    Returns all the neighbors of a node

    :param entity: the entity id of a vertex
    :returns: a list of neighbors attributes
    """
    # Locate the vertex, then take both endpoints of every incident edge
    # (this includes the starting vertex itself), de-duplicated.
    t = self.g.V().has('entity', entity).bothE().bothV().dedup()
    # Persist degree metrics as vertex properties so they can be projected.
    t = (t.property('degree', __.both().dedup().count())
          .property('in_degree_weighted',
                    __.inE().values('valeur_euro').sum())
          .property('out_degree_weighted',
                    __.outE().values('valeur_euro').sum()))
    # Project a fixed set of attributes per neighbor.
    t = t.project(
        'entity', 'prenom', 'nom', 'prenom_nom', 'date_naissance',
        'pays_code', 'code_postal', 'numero_piece_identite', 'star',
        'degree', 'in_degree_weighted', 'out_degree_weighted')
    # NOTE(review): the projection key is 'prenom_nom' but the property read
    # is 'prenomnom' — presumably the stored property name; confirm schema.
    for prop in ('entity', 'prenom', 'nom', 'prenomnom', 'date_naissance',
                 'pays_code', 'code_postal', 'numero_piece_identite', 'star',
                 'degree', 'in_degree_weighted', 'out_degree_weighted'):
        t = t.by(prop)
    return t.toList()
def get_or_create_edge(self, v1, v2, label: str):
    """Return the ``label`` edge from v1 to v2, creating it if absent.

    Uses coalesce(): the first branch matches an existing inbound edge on
    v2 whose out-vertex is v1; the fallback adds a new edge from v1.
    """
    match_existing = __.inE(label).where(__.outV().as_('v1'))
    create_missing = __.addE(label).from_('v1')
    return self.g.V(v1).as_('v1').V(v2).coalesce(match_existing,
                                                 create_missing)
def _get_or_created_edge_from(self, node: GraphTraversal, other: int, label: str):
    """Continue ``node`` with the ``label`` edge arriving from vertex
    ``other``, creating that edge when it does not yet exist."""
    # First coalesce branch: an existing inbound edge whose source is `other`.
    existing_edge = __.inE(label).filter(__.outV().hasId(other))
    # Fallback: add the edge from `other`.
    added_edge = __.addE(label).from_(self.g.V(other))
    return node.coalesce(existing_edge, added_edge)
def convert_record(biorecord, taxrecord, g):
    """Convert a BioSample record and a Taxonomy record into graph vertices
    and edges capturing the sample, its taxonomy lineage, and its names.

    Upserts (via fold/coalesce/unfold) a 'metadata::sample' vertex with the
    record's attributes, a chain of 'taxonomy::taxon' vertices linked by
    IS_A edges, and NAMED_IN edges to 'metadata::namespace' vertices.

    :param biorecord: parsed BioSample XML element tree
    :param taxrecord: parsed Taxonomy XML element tree
    :param g: graph traversal source to write through
    :return: the BioSample element's 'id' attribute
    """
    primary_id = biorecord.find('./BioSample').attrib['accession']
    #create the sample (get-or-create keyed on sample_id)
    s = (
        g.V().has('sample', 'sample_id', primary_id).fold()
        .coalesce(
            unfold(),
            addV('metadata::sample').property('sample_id', primary_id))
    )
    s = s.property('package', biorecord.find('.//Package').text)
    # copy every BioSample attribute onto the sample vertex
    for prop in biorecord.findall('.//Attribute'):
        s = s.property(prop.attrib['attribute_name'], prop.text)
    s = s.next() #commit
    #return s
    #print(s)
    #traverse the taxonomic tree and create nodes if they don't exist
    #most everything points outward from the sample, to put the
    #sample at the top of a DAG.
    r = None
    try:
        # NOTE(review): assumes the Taxon element's first, second and fifth
        # children are taxid, scientific name and rank — confirm against
        # the NCBI Taxonomy XML schema.
        taxid, sci_name, _, _, rank, *_ = taxrecord.find('./Taxon')
    except TypeError: #root taxon not found
        pass
    else:
        # get-or-create the species-level taxon keyed on taxid
        sp = (
            g.V().has('taxon', 'taxid', taxid.text).fold()
            .coalesce(unfold(),
                      addV('taxonomy::taxon')
                      .property('name', sci_name.text.split()[-1])
                      .property('rank', rank.text)
                      .property('taxid', taxid.text))
        ).next() #commit
        #traverse the LineageEx and update our taxonomy tree
        for taxid, name, rank in taxrecord.findall(".//LineageEx/Taxon"):
            taxid = taxid.text
            name = name.text.split(' ')[-1] #tokenize and get the last token
            rank = rank.text
            t = (
                g.V().has('taxon', 'taxid', taxid).fold()
                .coalesce(unfold(),
                          addV('taxonomy::taxon')
                          .property('name', name)
                          .property('rank', rank)
                          .property('taxid', taxid))
            ).next() #commit
            #print(t)
            # link each taxon to the previous (broader) one in the lineage
            if r:
                #t -[is_a]-> r
                (
                    g.V(t).as_('t')
                    .V(r).coalesce(
                        __.inE('IS_A').where(outV().as_('t')),
                        addE('IS_A').from_('t')
                    )
                ).next() #commit
            r = t
            #print(r)
        #some tax records don't extend the lineage all the way to the parent taxon
        #this captures any enclosing terminal taxon from the other part of the
        #xml record.
        (
            g.V(sp).as_('sp')
            .V(t).coalesce(
                __.inE('IS_A').where(outV().as_('sp')),
                addE('IS_A').from_('sp')
            )
        ).next() #commit
        #lastly, connect the sample itself to its taxonomy.
        (
            g.V(s).as_('s')
            .V(sp).coalesce(
                __.inE('IS_A').where(outV().as_('s')),
                addE('IS_A').from_('s')
            )
        ).next()
    #load sample names: one NAMED_IN edge per (namespace, name) pair
    for name_element in biorecord.find('.//Ids'):
        name = name_element.text
        namespace = name_element.attrib['db']
        ns = (
            g.V().has('namespace', 'name', namespace).fold()
            .coalesce(
                unfold(),
                addV('metadata::namespace').property('name', namespace)
            ).V().has('namespace', 'name', namespace).as_('ns')
            .V(s)
            .coalesce(
                __.inE('NAMED_IN').where(inV().as_('ns')
                                         .and_()
                                         .values('name').is_(name)),
                addE('NAMED_IN')
                .to('ns')
                .property('name', name)
            )
        ).next()
        # Earlier alternative formulation, kept for reference:
        # (
        #     g.V(s).outE('NAMED_IN').where('name', name).to(
        #         coalesce(
        #             g.V().has('namespace', 'namespace', namespace),
        #             g.addV('metadata::namespace', 'namespace', namespace)
        #         ).as_('ns')
        #     ).fold()
        #     .coalesce(
        #         unfold(),
        #         addE('NAMED_IN').to('ns').property('name', name)
        #     )
        # ).next() #commit
    return biorecord.find('./BioSample').attrib['id']
(clargs.input_server, clargs.input_source)) graph = anonymous_traversal.traversal().withRemote( DriverRemoteConnection(clargs.input_server, clargs.input_source)) def percent_of(value, total): return (value / total) * 100 def nice_percent(value, total): return '{} ({:.3f}%)'.format(value, percent_of(value, total)) event_count = graph.V().hasLabel('event').count().next() non_root_logger().info( "checking broken event chain: event with either incoming or outgoing follow edge" ) no_prev = graph.V().hasLabel('event').where(__.outE('follow')).not_( __.inE('follow')).count().next() no_next = graph.V().hasLabel('event').where(__.inE('follow')).not_( __.outE('follow')).count().next() non_root_logger().info(" (event)-[follow]-> {}".format( nice_percent(no_prev, event_count), nice_percent(no_next, event_count))) non_root_logger().info(" -[follow]->(event) {}".format( nice_percent(no_prev, event_count), nice_percent(no_next, event_count))) non_root_logger().info( "checking unconnected events: event without follow edge") unconnected_event = graph.V().hasLabel('event').not_( __.inE('follow')).not_(__.outE('follow')).count().next() non_root_logger().info(" X-[follow]->(event)-[follow]-X {}".format( nice_percent(unconnected_event, event_count)))
def table_entities(cls, *, _g: GraphTraversalSource, table_data: List[Table], existing: EXISTING) -> None:
    """Fetch the existing graph state for a batch of tables into ``existing``.

    Computes the vertex ids for the given tables, their owners (users) and
    writer applications, then pulls — in chunks — the edges linking each
    table to its database/cluster/schema context plus the table's other
    inbound/outbound edges, recording what was found via
    ``cls._into_existing``. Columns are handled by ``cls._column_entities``.

    :param _g: traversal source for the graph
    :param table_data: tables whose existing state should be loaded
    :param existing: accumulator updated in place (via _into_existing)
    """
    all_tables_ids = list(
        set([
            VertexTypes.Table.value.id(
                key=TableUris.get(database=t.database,
                                  cluster=t.cluster,
                                  schema=t.schema,
                                  table=t.name).table) for t in table_data
        ]))
    all_owner_ids = list(
        set([
            VertexTypes.User.value.id(key=key) for key in [
                t.table_writer.id for t in table_data
                if t.table_writer is not None
            ]
        ]))
    all_application_ids = list(
        set(
            list(
                possible_vertex_ids_for_application_key(*[
                    t.table_writer.id for t in table_data
                    if t.table_writer is not None
                ]))))
    # chunk these since 100,000s seems to choke
    for tables_ids in chunk(all_tables_ids, 1000):
        LOGGER.info(f'fetching for tables: {tables_ids}')
        # fetch database -> cluster -> schema -> table links
        g = _g.V(tuple(tables_ids)).as_('tables')
        # each coalesce folds the matched edges and labels the fold with the
        # edge-type name so it can be select()ed later via `aliases`
        g = g.coalesce(__.inE(
            EdgeTypes.Table.value.label).dedup().fold()).as_(
                EdgeTypes.Table.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Schema.value.label).inE(
                EdgeTypes.Schema.value.label).dedup().fold()).as_(
                    EdgeTypes.Schema.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Cluster.value.label).inE(
                EdgeTypes.Cluster.value.label).dedup().fold()).as_(
                    EdgeTypes.Cluster.name)
        # fetch table <- links
        for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                  EdgeTypes.Tag):
            g = g.coalesce(__.select('tables').inE(
                t.value.label).fold()).as_(t.name)
        # fetch table -> column et al links
        for t in (EdgeTypes.Column, EdgeTypes.Description,
                  EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                  EdgeTypes.Stat):
            g = g.coalesce(__.select('tables').outE(
                t.value.label).fold()).as_(t.name)
        # TODO: add owners, watermarks, last timestamp existing, source
        aliases = set([
            t.name for t in (EdgeTypes.Table, EdgeTypes.Schema,
                             EdgeTypes.Cluster, EdgeTypes.BelongToTable,
                             EdgeTypes.Generates, EdgeTypes.Tag,
                             EdgeTypes.Column, EdgeTypes.Description,
                             EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                             EdgeTypes.Stat)
        ])
        # flatten all the folded edge lists, then emit for each edge its
        # out-vertex id, its property map (with ids/labels), and in-vertex id
        g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
        g = g.local(
            __.union(__.outV().id(), __.valueMap(True),
                     __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
        cls._column_entities(_g=_g,
                             tables_ids=tables_ids,
                             existing=existing)
    # fetch Application, User
    for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
        LOGGER.info(f'fetching for application/owners: {ids}')
        g = _g.V(ids).valueMap(True)
        cls._into_existing(g.toList(), existing)