Example 1
    def test_append_traversal(self) -> None:
        g = __.V().hasLabel('Foo')
        w = __.where(__.inE().outV().hasLabel('Bar'))
        actual = append_traversal(g, w)
        expected = __.V().hasLabel('Foo').where(
            __.inE().outV().hasLabel('Bar'))
        self.assertEqual(actual, expected)
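
The append_traversal helper exercised by this test is not shown here. A minimal sketch, assuming gremlinpython's Bytecode API, could replay the step instructions of both traversals onto a fresh anonymous traversal (the real helper may differ):

from gremlin_python.process.graph_traversal import GraphTraversal
from gremlin_python.process.traversal import Bytecode

def append_traversal(base: GraphTraversal, suffix: GraphTraversal) -> GraphTraversal:
    # Hypothetical sketch: build a fresh anonymous traversal and replay the
    # step instructions of both inputs onto it, so the result compares equal
    # to writing the same steps fluently in one chain.
    result = GraphTraversal(None, None, Bytecode())
    for traversal in (base, suffix):
        for step in traversal.bytecode.step_instructions:
            result.bytecode.add_step(step[0], *step[1:])
    return result
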
Example 2
def _table_search_query(graph: GraphTraversalSource,
                        tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('database', 'cluster', 'schema',
                                  'schema_description', 'name', 'key',
                                  'description', 'last_updated_timestamp',
                                  'column_names', 'column_descriptions',
                                  'total_usage', 'unique_usage', 'tags',
                                  'badges', 'programmatic_descriptions')
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).out(
                CLUSTER_REVERSE_RELATION_TYPE).values('name'))  # database
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.out(
            TableMetadata.TABLE_SCHEMA_RELATION_TYPE).values('name'))  # schema
    traversal = traversal.by(
        __.coalesce(
            __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
                DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                    'description'), __.constant('')))  # schema_description
    traversal = traversal.by('name')  # name
    traversal = traversal.by(T.id)  # key
    traversal = traversal.by(
        __.coalesce(
            __.out(DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description'), __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
            __.constant('')))  # last_updated_timestamp
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).values(
            'name').fold())  # column_names
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).out(
            DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description').fold())  # column_descriptions
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.outE(READ_REVERSE_RELATION_TYPE).count())  # unique_usage
    traversal = traversal.by(
        __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
            METADATA_KEY_PROPERTY_NAME).fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.by(
        __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL).values(
            'description').fold())  # programmatic_descriptions
    traversal = traversal.order().by(__.select('name'), Order.asc)
    return traversal.toList()
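
A hedged usage sketch of how this query could be wired to a running Gremlin server; the websocket URL and the 'g' traversal source name are assumptions, not part of the original code:

from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal

# Assumed endpoint and traversal source name.
connection = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')
graph = traversal().withRemote(connection)
try:
    for row in _table_search_query(graph, tag_filter=''):
        print(row['key'], row['name'])
finally:
    connection.close()
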
Example 3
    def get_inbound(self, id, search_depth=DEFAULT_SEARCH_DEPTH):
        this_object = self._validated_vertex_id(id)

        if this_object is None:
            raise ResourceNotFoundException(f"Unable to resolve Object {id}")
        else:
            return _format_graph_results(
                self._depth_search(this_object, __.inE(), search_depth))
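
The _depth_search helper is not included in this example. One plausible sketch, assuming it follows the supplied edge step for up to search_depth hops, is shown below; the method body is an assumption, not the original implementation:

    def _depth_search(self, start_vertex, edge_step, depth):
        # Hypothetical sketch: walk up to `depth` hops from the start vertex
        # along the supplied edge step (e.g. __.inE() for inbound edges),
        # emitting every path found along the way.
        return (self.g.V(start_vertex)
                .repeat(edge_step.otherV().simplePath())
                .emit()
                .times(depth)
                .path()
                .toList())
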
Example 4
    def get_neighbors(self, entity):
        """
        Returns all the neighbors of a node
            :param entity: the entity id of a vertex
            :returns: a list of neighbors attributes
        """

        # find the node identified by `entity`
        traversal = self.g.V().has('entity', entity)

        # find the neighbors of this node (bothV() also returns the start vertex itself)
        traversal = traversal.bothE().bothV().dedup()

        # compute extra attributes (property() writes them back onto the vertices)
        traversal = traversal \
            .property('degree', __.both().dedup().count()) \
            .property('in_degree_weighted',
                      __.inE().values('valeur_euro').sum()) \
            .property('out_degree_weighted',
                      __.outE().values('valeur_euro').sum())

        # select only specific attributes
        traversal = traversal.project(
            'entity',
            'prenom',
            'nom',
            'prenom_nom',
            'date_naissance',
            'pays_code',
            'code_postal',
            'numero_piece_identite',
            'star',
            'degree',
            'in_degree_weighted',
            'out_degree_weighted') \
            .by('entity') \
            .by('prenom') \
            .by('nom') \
            .by('prenomnom') \
            .by('date_naissance') \
            .by('pays_code') \
            .by('code_postal') \
            .by('numero_piece_identite') \
            .by('star') \
            .by('degree') \
            .by('in_degree_weighted') \
            .by('out_degree_weighted')

        neighbors = traversal.toList()
        return neighbors
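
The property() calls in this example persist the computed metrics into the graph on every read. A read-only alternative folds the same numbers into the projection; the sketch below is illustrative (the function name, the free-standing g parameter, and the trimmed projection are not from the original, and otherV() excludes the start vertex itself):

from gremlin_python.process.graph_traversal import __

def get_neighbors_readonly(g, entity):
    # Illustrative sketch: same degree metrics as get_neighbors, computed
    # inside the projection instead of being written back with property().
    return (g.V().has('entity', entity)
            .bothE().otherV().dedup()
            .project('entity', 'degree',
                     'in_degree_weighted', 'out_degree_weighted')
            .by('entity')
            .by(__.both().dedup().count())
            .by(__.coalesce(__.inE().values('valeur_euro').sum(),
                            __.constant(0)))
            .by(__.coalesce(__.outE().values('valeur_euro').sum(),
                            __.constant(0)))
            .toList())
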
Example 5
    def get_or_create_edge(self, v1, v2, label: str):
        return self.g.V(v1).as_('v1').V(v2).coalesce(
            __.inE(label).where(__.outV().as_('v1')),
            __.addE(label).from_('v1'))
Example 6
    def _get_or_created_edge_from(self, node: GraphTraversal, other: int,
                                  label: str):
        return node.coalesce(
            __.inE(label).filter(__.outV().hasId(other)),
            __.addE(label).from_(self.g.V(other)))
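
Examples 5 and 6 implement the same get-or-create edge idiom. A standalone sketch of that pattern, with function and parameter names that are illustrative rather than taken from either snippet:

from gremlin_python.process.graph_traversal import GraphTraversalSource, __

def upsert_edge(g: GraphTraversalSource, from_id, to_id, label: str):
    # Illustrative sketch: reuse an existing `label` edge between the two
    # vertices if one exists, otherwise create it; next() executes the
    # traversal and returns the edge either way.
    return (g.V(from_id).as_('a').V(to_id)
            .coalesce(__.inE(label).where(__.outV().as_('a')),
                      __.addE(label).from_('a'))
            .next())
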
Example 7
def convert_record(biorecord, taxrecord, g):
    """Convert Biosample record and Taxonomy record into a series of associated
       records that capture taxonomy, sample collection, and a lot of names"""
    primary_id = biorecord.find('./BioSample').attrib['accession']
    #create the sample
    s = ( g.V().has('sample', 'sample_id', primary_id).fold()
          .coalesce(
            unfold(),
            addV('metadata::sample').property('sample_id', primary_id))
        )
    s = s.property('package', biorecord.find('.//Package').text)
    for prop in biorecord.findall('.//Attribute'):
        s = s.property(prop.attrib['attribute_name'], prop.text)
    s = s.next() #commit
    #return s
    #print(s)
    
    #traverse the taxonomic tree and create nodes if they don't exist
    #most everything points outward from the sample, to put the
    #sample at the top of a DAG.
    r = None
    sp = None  # set below if the root taxon is found
    try:
        taxid, sci_name, _, _, rank, *_ = taxrecord.find('./Taxon')
    except TypeError: #root taxon not found
        pass
    else:
        sp = (
                g.V().has('taxon', 'taxid', taxid.text).fold()
                 .coalesce(unfold(),
                           addV('taxonomy::taxon')
                           .property('name', sci_name.text.split()[-1])
                           .property('rank', rank.text)
                           .property('taxid', taxid.text))
             ).next() #commit

    #traverse the LineageEx and update our taxonomy tree
    
    for taxid, name, rank in taxrecord.findall(".//LineageEx/Taxon"):
        taxid = taxid.text
        name = name.text.split(' ')[-1] #tokenize and get the last token
        rank = rank.text
        t = (
            g.V().has('taxon', 'taxid', taxid).fold() 
            .coalesce(unfold(),                       
                      addV('taxonomy::taxon')                   
                      .property('name', name)         
                      .property('rank', rank)         
                      .property('taxid', taxid))
            ).next() #commit
        #print(t)
        if r:
            #t -[is_a]-> r
            (
             g.V(t).as_('t')
              .V(r).coalesce(
                    __.inE('IS_A').where(outV().as_('t')),
                    addE('IS_A').from_('t')
                 )
            ).next() #commit
        r = t
    #print(r)
    #some tax records don't extend the lineage all the way to the parent taxon
    #this captures any enclosing terminal taxon from the other part of the
    #xml record.
    # r now holds the most specific lineage taxon (the last t), or None if the
    # lineage was empty; guard so a missing piece doesn't raise NameError.
    if sp is not None and r is not None:
        (
         g.V(sp).as_('sp')
          .V(r).coalesce(
                __.inE('IS_A').where(outV().as_('sp')),
                addE('IS_A').from_('sp')
             )
        ).next() #commit
    
    #lastly, connect the sample itself to its taxonomy.
    if sp is not None:
        (
         g.V(s).as_('s')
          .V(sp).coalesce(
               __.inE('IS_A').where(outV().as_('s')),
               addE('IS_A').from_('s')
              )
        ).next()
     
    #load sample names
    for name_element in biorecord.find('.//Ids'):
        name = name_element.text
        namespace = name_element.attrib['db']
        ns = (
          g.V().has('namespace', 'name', namespace).fold()
            .coalesce(
               unfold(),
               addV('metadata::namespace').property('name', namespace)
            ).V().has('namespace', 'name', namespace).as_('ns')
             .V(s)
            .coalesce(
                # the NAMED_IN edge is created from the sample to the
                # namespace, so an existing one is among the sample's
                # outgoing edges
                __.outE('NAMED_IN').where(inV().as_('ns')
                                          .and_()
                                          .values('name').is_(name)),
                addE('NAMED_IN')
                    .to('ns')
                    .property('name', name)
            )
        ).next()
        # (
        #  g.V(s).outE('NAMED_IN').where('name', name).to(
        #      coalesce(
        #          g.V().has('namespace', 'namespace', namespace),
        #          g.addV('metadata::namespace', 'namespace', namespace)
        #          ).as_('ns')
        #     ).fold()
        #   .coalesce(
        #         unfold(),
        #         addE('NAMED_IN').to('ns').property('name', name)
        #     )
        # ).next()  #commit
    return biorecord.find('./BioSample').attrib['id']
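
A hedged driver sketch for convert_record; the file names, the element layout, and the one-to-one pairing of sample and taxonomy records are assumptions about the input data:

import xml.etree.ElementTree as ET
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal

# Assumed endpoint and input files.
connection = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')
g = traversal().withRemote(connection)

bio_root = ET.parse('biosample_set.xml').getroot()
tax_root = ET.parse('taxonomy_set.xml').getroot()

for biorecord, taxrecord in zip(bio_root, tax_root):
    sample_id = convert_record(biorecord, taxrecord, g)
    print('loaded BioSample', sample_id)

connection.close()
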
Example 8
    graph = anonymous_traversal.traversal().withRemote(
        DriverRemoteConnection(clargs.input_server, clargs.input_source))

    def percent_of(value, total):
        return (value / total) * 100

    def nice_percent(value, total):
        return '{} ({:.3f}%)'.format(value, percent_of(value, total))

    event_count = graph.V().hasLabel('event').count().next()
    non_root_logger().info(
        "checking broken event chain: event with either incoming or outgoing follow edge"
    )
    no_prev = graph.V().hasLabel('event').where(__.outE('follow')).not_(
        __.inE('follow')).count().next()
    no_next = graph.V().hasLabel('event').where(__.inE('follow')).not_(
        __.outE('follow')).count().next()
    non_root_logger().info("  (event)-[follow]-> {}".format(
        nice_percent(no_prev, event_count), nice_percent(no_next,
                                                         event_count)))
    non_root_logger().info("  -[follow]->(event) {}".format(
        nice_percent(no_prev, event_count), nice_percent(no_next,
                                                         event_count)))

    non_root_logger().info(
        "checking unconnected events: event without follow edge")
    unconnected_event = graph.V().hasLabel('event').not_(
        __.inE('follow')).not_(__.outE('follow')).count().next()
    non_root_logger().info("  X-[follow]->(event)-[follow]-X {}".format(
        nice_percent(unconnected_event, event_count)))
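
The non_root_logger() helper used throughout this snippet is not shown. A minimal sketch consistent with its name; the logger name below is a placeholder:

import logging

def non_root_logger() -> logging.Logger:
    # Return a named, non-root logger so these checks don't reconfigure the
    # root logger; the name is a placeholder.
    return logging.getLogger('graph_consistency_check')
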
    def table_entities(cls, *, _g: GraphTraversalSource,
                       table_data: List[Table], existing: EXISTING) -> None:

        all_tables_ids = list(
            set([
                VertexTypes.Table.value.id(
                    key=TableUris.get(database=t.database,
                                      cluster=t.cluster,
                                      schema=t.schema,
                                      table=t.name).table) for t in table_data
            ]))

        all_owner_ids = list(
            set([
                VertexTypes.User.value.id(key=key) for key in [
                    t.table_writer.id for t in table_data
                    if t.table_writer is not None
                ]
            ]))
        all_application_ids = list(
            set(
                list(
                    possible_vertex_ids_for_application_key(*[
                        t.table_writer.id for t in table_data
                        if t.table_writer is not None
                    ]))))

        # chunk these since batches in the 100,000s seem to choke
        for tables_ids in chunk(all_tables_ids, 1000):
            LOGGER.info(f'fetching for tables: {tables_ids}')
            # fetch database -> cluster -> schema -> table links
            g = _g.V(tuple(tables_ids)).as_('tables')
            g = g.coalesce(__.inE(
                EdgeTypes.Table.value.label).dedup().fold()).as_(
                    EdgeTypes.Table.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Schema.value.label).inE(
                    EdgeTypes.Schema.value.label).dedup().fold()).as_(
                        EdgeTypes.Schema.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Cluster.value.label).inE(
                    EdgeTypes.Cluster.value.label).dedup().fold()).as_(
                        EdgeTypes.Cluster.name)

            # fetch table <- links
            for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                      EdgeTypes.Tag):
                g = g.coalesce(__.select('tables').inE(
                    t.value.label).fold()).as_(t.name)

            # fetch table -> column et al links
            for t in (EdgeTypes.Column, EdgeTypes.Description,
                      EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                      EdgeTypes.Stat):
                g = g.coalesce(__.select('tables').outE(
                    t.value.label).fold()).as_(t.name)

            # TODO: add owners, watermarks, last timestamp existing, source
            aliases = set([
                t.name
                for t in (EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                          EdgeTypes.BelongToTable, EdgeTypes.Generates,
                          EdgeTypes.Tag, EdgeTypes.Column,
                          EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                          EdgeTypes.Source, EdgeTypes.Stat)
            ])
            g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)

            cls._column_entities(_g=_g,
                                 tables_ids=tables_ids,
                                 existing=existing)

        # fetch Application, User
        for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
            LOGGER.info(f'fetching for application/owners: {ids}')
            g = _g.V(ids).valueMap(True)
            cls._into_existing(g.toList(), existing)
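
The chunk() helper used above is not part of the snippet. A minimal sketch with the same call shape, an iterable plus a batch size yielding lists:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')

def chunk(items: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield successive lists of at most `size` items from the input iterable.
    it = iter(items)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch
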