Example #1
def _calculate_outputs(
    topo_traversal: GraphTraversalSource,
    source_vertex: Vertex,
    arrival_rates: ARRIVAL_RATES,
    output_rates: DefaultDict[int, Dict[str, float]],
    coefficients: pd.Series,
) -> DefaultDict[int, Dict[str, float]]:

    source_task: int = (
        topo_traversal.V(source_vertex).properties("task_id").value().next())

    in_streams: List[Dict[str, str]] = \
        (topo_traversal.V(source_vertex).inE("logically_connected")
         .project("stream_name", "source_component")
         .by(properties("stream").value())
         .by(outV().properties("component").value())
         .dedup()
         .toList())

    out_streams: List[str] = \
        (topo_traversal.V(source_vertex)
         .outE("logically_connected").values("stream")
         .dedup().toList())

    for out_stream in out_streams:
        output_rate: float = 0.0
        for in_stream in in_streams:
            in_stream_name: str = in_stream["stream_name"]
            source_component: str = in_stream["source_component"]

            stream_arrivals: float = \
                arrival_rates[source_task][(in_stream_name,
                                            source_component)]

            try:
                coefficient: float = float(coefficients.loc[source_task,
                                                            out_stream,
                                                            in_stream_name,
                                                            source_component])
            except KeyError:
                LOG.debug(
                    "No coefficient available for source task %d, "
                    "out stream %s, in stream %s from component %s",
                    source_task, out_stream, in_stream_name, source_component)
            else:
                output_rate += (stream_arrivals * coefficient)

        # Some of the IO coefficients may be negative, implying that the more
        # is received on an input stream the less is emitted on a given output
        # stream. If a large arrival rate is predicted on such a negative
        # input stream and low rates on the positive ones, the predicted
        # output rate can come out negative, which is meaningless, so clamp
        # it to zero.
        if output_rate < 0.0:
            output_rate = 0.0

        output_rates[source_task][out_stream] = output_rate

    return output_rates
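# A minimal sketch of the coefficient lookup used above, assuming
# `coefficients` is a pandas Series with a four-level MultiIndex of
# (source_task, out_stream, in_stream, source_component) -- the layout implied
# by the .loc call, not confirmed by this snippet alone. It also shows why the
# clamp to zero is needed when a coefficient is negative.
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [(1, "out-a", "in-x", "comp-1"), (1, "out-a", "in-y", "comp-2")],
    names=["task", "out_stream", "in_stream", "component"])
coefficients = pd.Series([0.5, -0.2], index=index)

arrivals = {("in-x", "comp-1"): 100.0, ("in-y", "comp-2"): 400.0}
output_rate = sum(
    rate * float(coefficients.loc[1, "out-a", in_stream, component])
    for (in_stream, component), rate in arrivals.items())
# 100 * 0.5 + 400 * -0.2 = -30.0, so clamp as _calculate_outputs does
output_rate = max(output_rate, 0.0)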
Example #2
def _user_search_query(graph: GraphTraversalSource,
                       tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(User.USER_NODE_LABEL)
    traversal = traversal.has(User.USER_NODE_FULL_NAME)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('email', 'first_name', 'last_name',
                                  'full_name', 'github_username', 'team_name',
                                  'employee_type', 'manager_email', 'slack_id',
                                  'is_active', 'role_name', 'total_read',
                                  'total_own', 'total_follow')
    traversal = traversal.by('email')  # email
    traversal = traversal.by('first_name')  # first_name
    traversal = traversal.by('last_name')  # last_name
    traversal = traversal.by('full_name')  # full_name
    traversal = traversal.by('github_username')  # github_username
    traversal = traversal.by('team_name')  # team_name
    traversal = traversal.by('employee_type')  # employee_type
    traversal = traversal.by(
        __.coalesce(
            __.out(User.USER_MANAGER_RELATION_TYPE).values('email'),
            __.constant('')))  # manager_email
    traversal = traversal.by('slack_id')  # slack_id
    traversal = traversal.by('is_active')  # is_active
    traversal = traversal.by('role_name')  # role_name
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_read
    traversal = traversal.by(
        __.outE(OWNER_OF_OBJECT_RELATION_TYPE).fold().count(
            Scope.local))  # total_own
    traversal = traversal.by(
        __.outE('FOLLOWED_BY').fold().count(Scope.local))  # total_follow
    traversal = traversal.order().by(__.select('email'), Order.asc)
    return traversal.toList()
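# The coalesce/constant pairs above are the usual Gremlin way to supply a
# default when an optional edge or property is absent, so every projected row
# has a value for each key. A standalone sketch (the 'MANAGED_BY' label and
# server URL are placeholders, not taken from the code above):
from gremlin_python.driver.driver_remote_connection import \
    DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __

g = traversal().withRemote(
    DriverRemoteConnection('ws://localhost:8182/gremlin', 'g'))
rows = (g.V().hasLabel('User')
        .project('email', 'manager_email')
        .by('email')
        .by(__.coalesce(__.out('MANAGED_BY').values('email'),
                        __.constant('')))  # '' when there is no manager
        .toList())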
Example #3
def get_comp_links_by_grouping(graph_traversal: GraphTraversalSource,
                               grouping: str) -> List[Dict[str, str]]:
    """ Gets a list of component connection dictionaries. These describe all
    source->stream->destination connections with the specified grouping value
    in the topology available via the supplied graph traversal source.

    Arguments:
        graph_traversal (GraphTraversalSource): A GraphTraversalSource instance
                                                linked to the topology subgraph
                                                whose connections are to be
                                                queried.
        grouping (str): The stream grouping of the connections to be returned.

    Returns:
        A list of dictionaries, each containing "source", "stream" and
        "destination" keys mapping to the source component name, the stream
        name and the destination component name respectively.
    """

    component_connections: List[Dict[str, str]] = \
        (graph_traversal.V().hasLabel(P.within("bolt", "spout")).as_("source")
         .outE("logically_connected").has("grouping", grouping).as_("stream")
         .inV().as_("destination").select("source", "stream", "destination")
         .by("component").by("stream").by("component").dedup().toList())

    return component_connections
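# Example usage; a sketch assuming the topology subgraph is served by a
# Gremlin Server at the URL below and that "SHUFFLE" is a grouping value
# actually present in the graph:
from gremlin_python.driver.driver_remote_connection import \
    DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal

g = traversal().withRemote(
    DriverRemoteConnection("ws://localhost:8182/gremlin", "g"))
for link in get_comp_links_by_grouping(g, grouping="SHUFFLE"):
    print(f"{link['source']} -[{link['stream']}]-> {link['destination']}")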
Example #4
def _table_search_query(graph: GraphTraversalSource,
                        tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('database', 'cluster', 'schema',
                                  'schema_description', 'name', 'key',
                                  'description', 'last_updated_timestamp',
                                  'column_names', 'column_descriptions',
                                  'total_usage', 'unique_usage', 'tags',
                                  'badges', 'programmatic_descriptions')
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).out(
                CLUSTER_REVERSE_RELATION_TYPE).values('name'))  # database
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.out(
            TableMetadata.TABLE_SCHEMA_RELATION_TYPE).values('name'))  # schema
    traversal = traversal.by(
        __.coalesce(
            __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
                DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                    'description'), __.constant('')))  # schema_description
    traversal = traversal.by('name')  # name
    traversal = traversal.by(T.id)  # key
    traversal = traversal.by(
        __.coalesce(
            __.out(DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description'), __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
            __.constant('')))  # last_updated_timestamp
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).values(
            'name').fold())  # column_names
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).out(
            DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description').fold())  # column_descriptions
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.outE(READ_REVERSE_RELATION_TYPE).count())  # unique_usage
    traversal = traversal.by(
        __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
            METADATA_KEY_PROPERTY_NAME).fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.by(
        __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL).values(
            'description').fold())  # programmatic_descriptions
    traversal = traversal.order().by(__.select('name'), Order.asc)
    return traversal.toList()
Example #5
def _build_gremlin_vertices(g: GraphTraversalSource,
                            row: Any) -> GraphTraversalSource:
    g = g.V(str(row["~id"])).fold().coalesce(
        __.unfold(),
        __.addV(row["~label"]).property(T.id, str(row["~id"])))
    g = _build_gremlin_properties(g, row)

    return g
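# The V().fold().coalesce(unfold(), addV()) chain above is the standard
# Gremlin "get or create" (upsert) idiom: fold() collapses the match into a
# possibly-empty list so the traversal always continues, unfold() re-emits the
# vertex when it already exists, and addV() runs only when the list was empty.
# A minimal sketch of the idiom on its own:
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.traversal import T

def upsert_vertex(g, vertex_id: str, label: str):
    """Return a traversal yielding the existing vertex, creating it if absent."""
    return g.V(vertex_id).fold().coalesce(
        __.unfold(),                                # vertex already exists
        __.addV(label).property(T.id, vertex_id))   # otherwise create it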
Example #6
def _build_gremlin_insert_vertices(
        g: GraphTraversalSource,
        row: Any,
        use_header_cardinality: bool = False) -> GraphTraversalSource:
    g = g.V(str(row["~id"])).fold().coalesce(
        __.unfold(),
        __.addV(row["~label"]).property(T.id, str(row["~id"])))
    g = _set_properties(g, use_header_cardinality, row)
    return g
Example #7
    def expire_connections_for_other(cls, *, _g: GraphTraversalSource,
                                     vertex_type: VertexType,
                                     keys: FrozenSet[str],
                                     existing: EXISTING) -> None:
        # V().has(label, 'key', P.without(keys)) is more intuitive but doesn't
        # scale, so instead fetch all connected keys and diff them in memory
        g = _g.V().hasLabel(vertex_type.label).where(__.bothE())
        g = g.values(WellKnownProperties.Key.value.name)
        all_to_expire_keys = set(g.toList()).difference(keys)

        # TODO: revisit when any vertex ids need something besides the key
        all_to_expire = set(
            vertex_type.id(key=key) for key in all_to_expire_keys)

        for to_expire in chunk(all_to_expire, 1000):
            g = _g.V(tuple(to_expire)).bothE()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)
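# `chunk` is not defined in this snippet; a minimal sketch of a compatible
# helper, assuming it only needs to yield successive fixed-size batches:
from itertools import islice
from typing import Iterable, Iterator, Tuple, TypeVar

_T = TypeVar("_T")

def chunk(iterable: Iterable[_T], size: int) -> Iterator[Tuple[_T, ...]]:
    """Yield tuples of at most `size` items until the iterable is exhausted."""
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, size)):
        yield batch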
Example #8
def _build_gremlin_edges(g: GraphTraversalSource,
                         row: pd.Series) -> GraphTraversalSource:
    g = (g.V(str(row["~from"])).fold().coalesce(
        __.unfold(),
        _build_gremlin_vertices(__, {
            "~id": row["~from"],
            "~label": "Vertex"
        })).addE(row["~label"]).to(
            __.V(str(row["~to"])).fold().coalesce(
                __.unfold(),
                _build_gremlin_vertices(__, {
                    "~id": row["~to"],
                    "~label": "Vertex"
                }))))
    g = _build_gremlin_properties(g, row)

    return g
Example #9
    def _column_entities(cls, *, _g: GraphTraversalSource,
                         tables_ids: Iterable[str],
                         existing: EXISTING) -> None:
        # fetch table -> column links
        g = _g.V(tuple(tables_ids))
        g = g.outE(EdgeTypes.Column.value.label)
        g = g.inV().hasLabel(VertexTypes.Column.value.label).as_('columns')

        # fetch column -> links (no Stat)
        for t in [EdgeTypes.Description]:
            g = g.coalesce(__.select('columns').outE(
                t.value.label).fold()).as_(t.name)

        g = g.select(EdgeTypes.Description.name).unfold()
        g = g.local(
            __.union(__.outV().id(), __.valueMap(True),
                     __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
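# The local(union(outV().id(), valueMap(True), inV().id()).fold()) step above
# packages each edge as a three-element list: [out-vertex id, edge valueMap
# (including T.id and T.label), in-vertex id]. A hedged sketch of consuming
# that shape, with `results` standing in for g.toList():
results = g.toList()
for out_vertex_id, edge_value_map, in_vertex_id in results:
    print(out_vertex_id, edge_value_map, in_vertex_id)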
Example #10
def _build_gremlin_insert_edges(
        g: GraphTraversalSource, row: pd.Series,
        use_header_cardinality: bool) -> GraphTraversalSource:
    g = (g.V(str(row["~from"])).fold().coalesce(
        __.unfold(),
        _build_gremlin_insert_vertices(__, {
            "~id": row["~from"],
            "~label": "Vertex"
        })).addE(row["~label"]).property(T.id, str(row["~id"])).to(
            __.V(str(row["~to"])).fold().coalesce(
                __.unfold(),
                _build_gremlin_insert_vertices(__, {
                    "~id": row["~to"],
                    "~label": "Vertex"
                }))))
    g = _set_properties(g, use_header_cardinality, row)

    return g
Example #11
def get_levels(topo_traversal: GraphTraversalSource) -> List[List[Vertex]]:
    """ Gets the levels of the logical graph. The traversal starts with the
    source spouts and performs a breadth-first search through the logically
    connected vertices.

    Arguments:
        topo_traversal (GraphTraversalSource):  A traversal source instance
                                                mapped to the topology subgraph
                                                whose levels are to be
                                                calculated.

    Returns:
        A list where each entry is a list of Vertex instances representing a
        level within the logical graph. The first level will be the spout
        instances.
    """

    # Only load the static enums we need so we don't pollute the globals dict
    keys = statics.staticEnums["keys"]
    values = statics.staticEnums["values"]
    local_scope = statics.staticEnums["local"]

    # Repeatedly traverse the tree defined by the logical connections,
    # grouping each set (the traversal de-duplicates) of vertices by their
    # depth in the tree. The depth is based on the number of times the repeat
    # step has run (loops). This yields a map from integer depth to list of
    # vertices, which is emitted by the cap step. The map is then put into
    # key order (ascending) and only the values (the lists of vertices) are
    # taken and unfolded into a list.
    # The first group().by(constant(-1)) step ensures the spout vertices are
    # included at the top of the list.
    levels: List[List[Vertex]] = (
        topo_traversal.V().hasLabel("spout").group("m").by(
            constant(-1)).repeat(
                out("logically_connected").dedup().group("m").by(
                    loops())).until(not_(outE("logically_connected"))).
        cap("m").order(local_scope).by(keys).select(values).unfold().toList())

    return levels
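# Example usage; a sketch assuming the topology subgraph is reachable at the
# URL below:
from gremlin_python.driver.driver_remote_connection import \
    DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal

topo = traversal().withRemote(
    DriverRemoteConnection("ws://localhost:8182/gremlin", "g"))
for depth, level in enumerate(get_levels(topo)):
    print(f"level {depth}: {len(level)} vertices")  # level 0 is the spouts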
Example #12
def _calculate_arrivals(topo_traversal: GraphTraversalSource,
                        source_vertex: Vertex, arrival_rates: ARRIVAL_RATES,
                        output_rates: DefaultDict[int, Dict[str, float]],
                        i2i_rps: pd.DataFrame) -> ARRIVAL_RATES:

    # Get all downstream edges and vertices for this source vertex
    out_edges: List[Dict[str, Union[str, int, float]]] = \
        (topo_traversal.V(source_vertex).outE("logically_connected")
         .project("source_task", "source_component", "stream_name",
                  "destination_task", "destination_component")
         .by(outV().properties("task_id").value())
         .by(outV().properties("component").value())
         .by(properties("stream").value())
         .by(inV().properties("task_id").value())
         .by(inV().properties("component").value())
         .toList())

    if not out_edges:
        return arrival_rates

    source_task: int = cast(int, out_edges[0]["source_task"])
    source_component: str = cast(str, out_edges[0]["source_component"])

    LOG.debug("Processing output from source instance %s_%d", source_component,
              source_task)

    for out_edge in out_edges:
        stream: str = cast(str, out_edge["stream_name"])
        try:
            stream_output: float = cast(float,
                                        output_rates[source_task][stream])
        except KeyError:
            LOG.debug(
                "No output rate information for source task %d on "
                "stream %s. Skipping the outgoing edge", source_task, stream)
            continue

        destination_task: int = cast(int, out_edge["destination_task"])

        try:
            r_prob: float = float(
                i2i_rps.loc(axis=0)[source_task, destination_task, stream])
        except KeyError:
            LOG.debug(
                "Unable to find routing probability for connection from "
                "task %d to %d on stream %s", source_task, destination_task,
                stream)

            edge_output: float = 0.0
        else:

            edge_output = (stream_output * r_prob)

            LOG.debug(
                "Output from %s-%d to %s-%d on stream %s is "
                "calculated as %f * %f = %f", source_component, source_task,
                out_edge["destination_component"], destination_task, stream,
                stream_output, r_prob, edge_output)

        arrival_rates[destination_task][(stream,
                                         source_component)] += edge_output

    return arrival_rates
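# A minimal sketch of the routing-probability lookup above, assuming i2i_rps
# is indexed by (source_task, destination_task, stream); the column name is a
# placeholder, not taken from the code:
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [(1, 7, "default"), (1, 8, "default")],
    names=["source_task", "destination_task", "stream"])
i2i_rps = pd.DataFrame({"routing_probability": [0.25, 0.75]}, index=index)

row = i2i_rps.loc(axis=0)[1, 7, "default"]  # one row once all levels are given
r_prob = float(row["routing_probability"])  # 0.25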
Example #13
def _dashboard_search_query(graph: GraphTraversalSource,
                            tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(DashboardMetadata.DASHBOARD_NODE_LABEL)
    traversal = traversal.has('name')
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)

    traversal = traversal.project('group_name', 'name', 'cluster',
                                  'description', 'group_description',
                                  'group_url', 'url', 'uri',
                                  'last_successful_run_timestamp',
                                  'query_names', 'chart_names', 'total_usage',
                                  'tags', 'badges')
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
                'name'))  # group_name
    traversal = traversal.by('name')  # name
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
            DashboardMetadata.DASHBOARD_GROUP_CLUSTER_RELATION_TYPE).values(
                'name'))  # cluster
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).values(
                    'description'), __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
                    DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).
            values('description'), __.constant('')))  # group_description
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
                'dashboard_group_url'))  # group_url
    traversal = traversal.by('dashboard_url')  # url
    traversal = traversal.by('key')  # uri

    traversal = traversal.by(
        __.coalesce(
            __.out('EXECUTED').has(
                'key', TextP.endingWith(
                    '_last_successful_execution')).values('timestamp'),
            __.constant('')))  # last_successful_run_timestamp
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).values(
            'name').dedup().fold())  # query_names
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).out(
            DashboardChart.CHART_RELATION_TYPE).values(
                'name').dedup().fold())  # chart_names
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values(
                READ_RELATION_COUNT_PROPERTY),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.out('TAGGED_BY').has(
            'tag_type', 'default').values('keys').dedup().fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges

    traversal = traversal.order().by(__.select('name'), Order.asc)

    dashboards = traversal.toList()
    for dashboard in dashboards:
        dashboard['product'] = dashboard['uri'].split('_')[0]

    return dashboards
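# The post-processing loop above assumes each dashboard URI starts with the
# product name followed by an underscore, e.g. (illustrative value only):
uri = "superset_dashboard://cluster.group/name"
product = uri.split('_')[0]  # "superset"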
Example #14
def _build_gremlin_update(g: GraphTraversalSource,
                          row: Any) -> GraphTraversalSource:
    g = g.V(str(row["~id"]))
    g = _build_gremlin_properties(g, row)

    return g
Example #15
def delete_graph_for_shard_only(g: GraphTraversalSource) -> None:
    shard = get_shard()
    assert shard, ('expected shard to exist! Surely you are only using this '
                   'in development or test?')
    # TODO: do something better than not using WellKnownProperties.TestShard here (since that makes a circular
    # dependency)
    g.V().has('shard', shard).drop().iterate()
Example #16
def _build_gremlin_update(
        g: GraphTraversalSource, row: Any,
        use_header_cardinality: bool) -> GraphTraversalSource:
    g = g.V(str(row["~id"]))
    g = _set_properties(g, use_header_cardinality, row)
    return g
Example #17
def delete_everything(traversal: GraphTraversalSource):
    return traversal.V().drop().toList()
Example #18
    def table_entities(cls, *, _g: GraphTraversalSource,
                       table_data: List[Table], existing: EXISTING) -> None:

        all_tables_ids = list(
            set([
                VertexTypes.Table.value.id(
                    key=TableUris.get(database=t.database,
                                      cluster=t.cluster,
                                      schema=t.schema,
                                      table=t.name).table) for t in table_data
            ]))

        all_owner_ids = list(
            set([
                VertexTypes.User.value.id(key=key) for key in [
                    t.table_writer.id for t in table_data
                    if t.table_writer is not None
                ]
            ]))
        all_application_ids = list(
            set(
                list(
                    possible_vertex_ids_for_application_key(*[
                        t.table_writer.id for t in table_data
                        if t.table_writer is not None
                    ]))))

        # chunk these since batches in the 100,000s seem to choke the server
        for tables_ids in chunk(all_tables_ids, 1000):
            LOGGER.info(f'fetching for tables: {tables_ids}')
            # fetch database -> cluster -> schema -> table links
            g = _g.V(tuple(tables_ids)).as_('tables')
            g = g.coalesce(__.inE(
                EdgeTypes.Table.value.label).dedup().fold()).as_(
                    EdgeTypes.Table.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Schema.value.label).inE(
                    EdgeTypes.Schema.value.label).dedup().fold()).as_(
                        EdgeTypes.Schema.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Cluster.value.label).inE(
                    EdgeTypes.Cluster.value.label).dedup().fold()).as_(
                        EdgeTypes.Cluster.name)

            # fetch table <- links
            for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                      EdgeTypes.Tag):
                g = g.coalesce(__.select('tables').inE(
                    t.value.label).fold()).as_(t.name)

            # fetch table -> column et al links
            for t in (EdgeTypes.Column, EdgeTypes.Description,
                      EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                      EdgeTypes.Stat):
                g = g.coalesce(__.select('tables').outE(
                    t.value.label).fold()).as_(t.name)

            # TODO: add owners, watermarks, last timestamp existing, source
            aliases = set([
                t.name
                for t in (EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                          EdgeTypes.BelongToTable, EdgeTypes.Generates,
                          EdgeTypes.Tag, EdgeTypes.Column,
                          EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                          EdgeTypes.Source, EdgeTypes.Stat)
            ])
            g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)

            cls._column_entities(_g=_g,
                                 tables_ids=tables_ids,
                                 existing=existing)

        # fetch Application, User
        for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
            LOGGER.info(f'fetching for application/owners: {ids}')
            g = _g.V(ids).valueMap(True)
            cls._into_existing(g.toList(), existing)
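# The select(*aliases).unfold().select(MapColumn.values).unfold() chain above
# flattens the map of alias -> folded edge list into a plain stream of edges.
# MapColumn is presumably gremlin_python's Column enum imported under another
# name; the equivalent with the stock import would be:
#
#     from gremlin_python.process.traversal import Column
#     g = g.select(*aliases).unfold().select(Column.values).unfold()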