Beispiel #1
0
def plot_emit_complete_latency(metrics_client: HeronMetricsClient,
                               topology_id: str, cluster: str, environ: str,
                               start: dt.datetime, end: dt.datetime,
                               **kwargs: Union[str, int, float]):
    """ Plots the complete latency and the emit count against time, on a
    shared x-axis, for every (task, stream) combination found in the merged
    spout metrics. One figure is shown per combination.

    Arguments:
        metrics_client (HeronMetricsClient):    The client instance for the
                                                metrics database.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster name, passed on to the metrics client.
        environ (str):  The environment name, passed on to the metrics
                        client.
        start (dt.datetime):    The datetime object for the start of the
                                metric gathering period.
        end (dt.datetime):  The datetime object for the end of the metric
                            gathering period.
        **kwargs:   Additional keyword arguments that will be passed to the
                    metrics client methods.
    """

    emit_counts: pd.DataFrame = metrics_client.get_emit_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    complete_latencies: pd.DataFrame = metrics_client.get_complete_latencies(
        topology_id, cluster, environ, start, end, **kwargs)

    # Every component that reports a complete latency is treated as a spout
    spouts: np.ndarray = complete_latencies.component.unique()

    emit_counts = emit_counts[emit_counts.component.isin(spouts)]

    # The figures below are produced per (task, stream), so the stream column
    # has to survive the merge. If both frames carry it, it must be a merge
    # key, otherwise pandas would suffix it (stream_x / stream_y).
    # NOTE(review): assumes "stream" names the same spout output stream in
    # both metrics - confirm against the metrics client.
    merge_keys = ["task", "timestamp"]
    if "stream" in complete_latencies.columns:
        merge_keys.append("stream")

    combined: pd.DataFrame = emit_counts.merge(
        complete_latencies, on=merge_keys)[["task", "timestamp", "stream",
                                            "emit_count", "latency_ms"]]

    for (task, stream), data in combined.groupby(["task", "stream"]):
        fig, ax1 = plt.subplots()
        ax1.set_title(f"Task {task} - stream {stream}")

        color = 'tab:red'
        ax1.set_xlabel('timestamp')
        ax1.set_ylabel('latency (ms)', color=color)
        ax1.plot(data.timestamp, data.latency_ms, color=color)
        ax1.tick_params(axis='y', labelcolor=color)

        # instantiate a second axes that shares the same x-axis
        ax2 = ax1.twinx()

        color = 'tab:blue'
        # we already handled the x-label with ax1
        ax2.set_ylabel('count', color=color)
        ax2.plot(data.timestamp, data.emit_count, color=color)
        ax2.tick_params(axis='y', labelcolor=color)

        fig.tight_layout()  # otherwise the right y-label is slightly clipped
        plt.show()
Beispiel #2
0
    def __init__(self, metrics_client: HeronMetricsClient, graph_client: GremlinClient, topology_id: str,
                 cluster: str, environ: str, start: [dt.datetime], end: [dt.datetime],
                 traffic_config: Dict[str, Any], **other_kwargs) -> None:
        """ Stores the supplied clients and query window and loads the metrics
        needed to describe spout traffic for the given topology.

        Arguments:
            metrics_client (HeronMetricsClient):    The client used to fetch
                                                    all metrics below.
            graph_client (GremlinClient):   The client whose graph traversal
                                            is used to look up the spout
                                            component names.
            topology_id (str):  The topology identification string.
            cluster (str):  The cluster name, passed to the metrics client.
            environ (str):  The environment name, passed to the metrics
                            client.
            start:  Start of the metrics window, passed to the metrics client.
            end:    End of the metrics window, passed to the metrics client.
            traffic_config (dict):  Accepted but not read by this
                                    constructor.
            **other_kwargs: Stored on the instance and forwarded only to the
                            tuple arrivals query.
        """
        self.metrics_client: HeronMetricsClient = metrics_client
        self.graph_client = graph_client
        self.topology = topology_id
        self.cluster = cluster
        self.environ = environ
        self.start = start
        self.end = end
        self.kwargs = other_kwargs

        self.tuples = self.metrics_client.get_tuple_arrivals_at_stmgr(
            self.topology, cluster, environ, start, end, **other_kwargs)

        # Distinct component names of the spout vertices of this topology
        # that have at least one outgoing "logically_connected" edge
        spout_names = (
            graph_client.graph_traversal.V()
            .has("topology_id", self.topology)
            .hasLabel("spout")
            .where(outE("logically_connected"))
            .properties('component')
            .value()
            .dedup()
            .toList())

        def spouts_only(frame):
            # Keep only the rows whose component is one of the spouts
            return frame.loc[frame['component'].isin(spout_names)]

        self.spout_queue_processing_rate = spouts_only(
            metrics_client.get_outgoing_queue_processing_rate(
                topology_id, cluster, environ, start, end))

        self.num_tuples_added_to_spout_gateway_queue = spouts_only(
            metrics_client.get_out_going_queue_arrival_rate(
                self.topology, cluster, environ, start, end))

        self.spout_tuple_set_size = spouts_only(
            metrics_client.get_average_tuple_set_size_added_to_outgoing_queue(
                self.topology, cluster, environ, start, end))

        # Arrival rate = queue additions multiplied by the tuple set size,
        # matched per task, component, container and timestamp
        queue_additions = self.num_tuples_added_to_spout_gateway_queue.rename(
            index=str, columns={"tuples-added-to-queue": "num-tuples"})
        arrival_rates = queue_additions.merge(
            self.spout_tuple_set_size,
            on=["task", "component", "container", "timestamp"])
        arrival_rates["num-tuples"] = (arrival_rates["num-tuples"] *
                                       arrival_rates["tuple-set-size"])
        self.spout_arrival_rates = arrival_rates
Beispiel #3
0
def get_spout_emissions(metric_client: HeronMetricsClient, tracker_url: str,
                        topology_id: str, cluster: str, environ: str,
                        start: dt.datetime, end: dt.datetime) -> pd.DataFrame:
    """ Fetches the emit counts for the topology and keeps only the rows whose
    component is listed under "spouts" in the logical plan returned by the
    tracker.

    Arguments:
        metric_client (HeronMetricsClient): The client used to fetch the emit
                                            counts.
        tracker_url (str):  The URL passed to the tracker logical plan lookup.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster name.
        environ (str):  The environment name.
        start (dt.datetime):    Start of the metrics gathering period.
        end (dt.datetime):  End of the metrics gathering period.

    Returns:
        pandas.DataFrame:   The emit count rows belonging to spout components.
    """

    all_emits: pd.DataFrame = metric_client.get_emit_counts(
        topology_id, cluster, environ, start, end)

    logical_plan: Dict[str, Any] = tracker.get_logical_plan(
        tracker_url, cluster, environ, topology_id)

    spout_names = logical_plan["spouts"].keys()
    from_spout = all_emits.component.isin(spout_names)

    return all_emits[from_spout]
Beispiel #4
0
def run(config: Dict[str, Any], metrics_client: HeronMetricsClient,
        total_hours: int, period_length_secs: int,
        topology_model: QTTopologyModel, topology_id: str, cluster: str,
        environ: str, metric_bucket_length: int, **kwargs: Any):
    """ Compares, period by period, the supplied topology model against the
    arrival rates actually measured for the topology and collects the
    comparison results into one DataFrame.

    Arguments:
        config (dict):  The configuration dictionary. The keys
                        "heron.topology.models.config" (containing
                        "heron.statemgr.connection.string",
                        "heron.statemgr.root.path" and "zk.time.offset") and
                        "heron.tracker.url" are read.
        metrics_client (HeronMetricsClient):    The client instance for the
                                                metrics database.
        total_hours (int):  How many hours before now the evaluation window
                            starts.
        period_length_secs (int):   The length of each evaluation period,
                                    passed to the period list helper.
        topology_model (QTTopologyModel):   The model passed on to compare.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster the topology is running on.
        environ (str):  The environment the topology is running in.
        metric_bucket_length (int): Passed on unchanged to compare.
        **kwargs:   Additional keyword arguments passed to the metrics client
                    and to compare.

    Returns:
        pandas.DataFrame:   The results of every successfully processed
        period, each with added "traffic_start" and "traffic_end" columns, or
        None if no period produced a result.

    Raises:
        RuntimeError:   If the start of the window is before the last update
                        of the topology's physical plan.
        Exception:  Any error other than the logged connection errors is
                    logged and re-raised.
    """

    start: dt.datetime = (dt.datetime.now(dt.timezone.utc)
                          - dt.timedelta(hours=total_hours))

    zk_config = config["heron.topology.models.config"]
    last_updated: dt.datetime = zookeeper.last_topo_update_ts_html(
        zk_config["heron.statemgr.connection.string"],
        zk_config["heron.statemgr.root.path"], topology_id,
        zk_config["zk.time.offset"]).astimezone(dt.timezone.utc)

    if start < last_updated:
        update_err: str = (f"The provided total hours ({total_hours}) will "
                           f"result in a start time ({start.isoformat()}) "
                           f"which is before the last update to "
                           f"{topology_id}'s physical plan "
                           f"({last_updated.isoformat()})")
        LOG.error(update_err)
        raise RuntimeError(update_err)

    periods: List[Tuple[dt.datetime, dt.datetime]] = \
        validation_helper.create_start_end_list(total_hours,
                                                period_length_secs)

    output: pd.DataFrame = None

    for j, (traffic_start, traffic_end) in enumerate(periods):

        LOG.info("Using metrics sourced from %s to %s",
                 traffic_start.isoformat(), traffic_end.isoformat())

        LOG.info("\n\nComparing prediction, using metrics from period "
                 "%d and traffic from period %d, to actual performance"
                 " during period %d\n", j, j, j)
        try:

            # NOTE(review): 60 is passed as the metrics sample period and is
            # also the divisor used below to turn counts into per second
            # rates - presumably one minute metric buckets, confirm.
            spout_state = heron_helper.get_spout_state(
                metrics_client, topology_id, cluster, environ,
                config["heron.tracker.url"], traffic_start, traffic_end,
                60, "mean")

            # Get the actual arrival rates at all instances
            actual_arrs: pd.DataFrame = \
                metrics_client.get_tuple_arrivals_at_stmgr(topology_id, cluster,
                                                            environ, traffic_start,
                                                            traffic_end, **kwargs)

            actual_arrs = \
                (actual_arrs.groupby(["task", "component", "timestamp"]).sum().reset_index())

            actual_arrs["arrival_rate_tps"] = (actual_arrs["num-tuples"] / 60)

            # Average arrival rate per instance over the whole period
            actual_instance_arrs: pd.DataFrame = \
                (actual_arrs.groupby(["component", "task"])
                 ["arrival_rate_tps"].mean().reset_index()
                 .rename(index=str, columns={"arrival_rate_tps":
                                             "actual_arrival_rates_tps"}))

            results: pd.DataFrame = compare(
                metrics_client, spout_state, actual_instance_arrs,
                topology_model, topology_id,
                cluster, environ, traffic_start, traffic_end,
                metric_bucket_length, **kwargs)

        # Connection problems only cost the current period: they are logged
        # and the loop moves on to the next one
        except ConnectionRefusedError as cr_err:
            LOG.error("Connection was refused with message: %s",
                      str(cr_err))
        except ConnectionResetError as cre_err:
            LOG.error("Connection was reset with message: %s",
                      str(cre_err))
        except requests.exceptions.ConnectionError as req_err:
            LOG.error("Connection error with message: %s", str(req_err))
        except Exception as err:
            LOG.error("Error (%s) with message: %s", str(type(err)),
                      str(err))
            # Bare raise keeps the original traceback intact
            raise
        else:
            results["traffic_start"] = traffic_start
            results["traffic_end"] = traffic_end

            # DataFrame.append was removed in pandas 2.0, concat is the
            # equivalent that works on every pandas version
            if output is not None:
                output = pd.concat([output, results], ignore_index=True)
            else:
                output = results

    return output
Beispiel #5
0
def lstsq_io_ratios(metrics_client: HeronMetricsClient,
                    graph_client: GremlinClient, topology_id: str,
                    cluster: str, environ: str,
                    start: dt.datetime, end: dt.datetime, bucket_length: int,
                    **kwargs: Union[str, int, float]) -> pd.DataFrame:
    """ This method will calculate the input/output ratio for each instance in
    the supplied topology using data aggregated from the defined period. The
    method uses least squares regression to calculate a coefficient for each
    input stream into an instance such that the total output amount for a
    given output stream is the sum of all input stream arrival amounts times
    their coefficient.

    *NOTE*: This method assumes that there is an (approximately) linear
    relationship between the inputs and outputs of a given component.

    Arguments:
        metrics_client (HeronMetricsClient):    The client instance for the
                                                metrics database.
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster name, passed to the metrics client.
        environ (str):  The environment name, passed to the metrics client.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metric gathering period.
        end (dt.datetime):  The UTC datetime object for the end of the metric
                            gathering period.
        bucket_length (int):    The length in seconds that the metrics should
                                be aggregated into. *NOTE*: For the least
                                squares regression to work the number of
                                buckets must exceed the highest number of input
                                streams into the component of the topology.
        **kwargs:   Additional keyword arguments that will be passed to the
                    metrics client object. Consult the documentation for the
                    specific metrics client being used.
    Returns:
        pandas.DataFrame:   A DataFrame with the following columns:

        * task: Task ID integer.
        * output_stream: The output stream name.
        * input_stream: The input stream name.
        * source_component: The name of the source component for the input
          stream.
        * coefficient: The value of the input amount coefficient for this
          output stream, input stream, source component combination.

    Raises:
        Exception:  If no coefficient rows could be calculated.
    """

    LOG.info("Calculating instance input/output ratios using least squares "
             "regression for topology %s over a %d second window between %s "
             "and %s", topology_id, (end-start).total_seconds(),
             start.isoformat(), end.isoformat())

    emit_counts: pd.DataFrame = metrics_client.get_emit_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    arrived_tuples: pd.DataFrame = metrics_client.get_tuple_arrivals_at_stmgr(
        topology_id, cluster, environ, start, end, **kwargs)

    execute_counts: pd.DataFrame = metrics_client.get_execute_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    # The inner merge keeps only the arrival rows that have a matching
    # execute count row; the execute count itself is not needed afterwards
    arrived_tuples = arrived_tuples.merge(execute_counts, on=["task", "component", "container", "timestamp"])

    arrived_tuples.drop("execute_count", axis=1, inplace=True)
    # Limit the count DataFrames to only those component with both incoming and
    # outgoing streams
    in_out_comps: List[str] = get_in_out_components(graph_client, topology_id)

    # rename returns a new frame here instead of mutating the filtered slice
    # in place, which avoids pandas' chained assignment warnings
    emit_counts = (emit_counts[emit_counts["component"].isin(in_out_comps)]
                   .rename(index=str, columns={"stream": "outgoing_stream"}))

    arrived_tuples = (arrived_tuples[arrived_tuples["component"]
                                     .isin(in_out_comps)]
                      .rename(index=str,
                              columns={"stream": "incoming_stream"}))
    # Re-sample the counts into equal length time buckets and group by task id,
    # time bucket and stream. This aligns the two DataFrames with timestamps of
    # equal length and start point so they can be merged later. The lower case
    # "s" alias is used as the upper case form is deprecated in recent pandas.
    emit_counts_ts: pd.DataFrame = \
        (emit_counts.set_index(["task", "timestamp"])
         .groupby([pd.Grouper(level="task"),
                   pd.Grouper(freq=f"{bucket_length}s", level='timestamp'),
                   "component", "outgoing_stream"])
         ["emit_count"]
         .sum().reset_index())

    arrived_tuples_ts: pd.DataFrame = \
        (arrived_tuples.set_index(["task", "timestamp"])
         .groupby([pd.Grouper(level="task"),
                   pd.Grouper(freq=f"{bucket_length}s", level='timestamp'),
                   "component", "incoming_stream", "source_component"])
         ["num-tuples"]
         .sum().reset_index())

    rows: List[Dict[str, Union[str, float]]] = []

    # Now we loop through each component and munge the data until we have an
    # output total for each output stream for each task on the same row (one
    # row per time bucket) as the input total for each input stream
    component: str
    in_data: pd.DataFrame
    # Group by the plain column name: a one element list makes pandas >= 2.0
    # yield one element tuples as keys, which would break the equality filter
    # on emit_counts_ts below
    for component, in_data in arrived_tuples_ts.groupby("component"):
        in_stream_counts: pd.DataFrame = \
            (in_data.set_index(["task", "timestamp", "incoming_stream",
                                "source_component"])
             ["num-tuples"].unstack(level=["incoming_stream",
                                           "source_component"])
             .reset_index())

        out_stream_counts: pd.DataFrame = \
            emit_counts_ts[emit_counts_ts.component == component]

        merged: pd.DataFrame = out_stream_counts.merge(in_stream_counts,
                                                       on=["task",
                                                           "timestamp"])
        task: int
        out_stream: str
        data: pd.DataFrame
        for (task, out_stream), data in merged.groupby(["task",
                                                        "outgoing_stream"]):

            LOG.debug("Processing instance %d output stream %s", task,
                      out_stream)

            # Get a series of the output counts for this output stream, these
            # are the dependent variables (b) of the least squares regression
            # a x = b
            output_counts: pd.Series = data.emit_count

            # If this instance's component has output stream registered that
            # nothing else subscribes too then the emit count will be zero and
            # we can skip this output stream
            if output_counts.sum() <= 0.0:
                LOG.debug("No emissions from instance %d on stream %s, "
                          "skipping this stream...", task, out_stream)
                continue

            # Get just the input stream counts for each time bucket. This is
            # the coefficients matrix (a) of the least squares regression
            # a x = b
            # NOTE(review): relies on the first five merged columns being the
            # emit side ones (task, timestamp, component, outgoing_stream,
            # emit_count) - confirm if the merge layout ever changes.
            cols: List[Tuple[str, str]] = data.columns[5:]
            input_counts: pd.DataFrame = data[cols]

            coeffs: List[float]
            coeffs, _, _, _ = np.linalg.lstsq(input_counts, output_counts,
                                              rcond=None)
            i: int
            in_stream: str
            source: str
            for i, (in_stream, source) in enumerate(cols):
                row: Dict[str, Union[str, float]] = {
                    "task": task,
                    "output_stream": out_stream,
                    "input_stream": in_stream,
                    "source_component": source,
                    "coefficient": coeffs[i]}
                rows.append(row)
    result = pd.DataFrame(rows)

    if result.empty:
        raise Exception("lstsq_io_ratios returns an empty dataframe")

    return result
def calculate_inter_instance_rps(metrics_client: HeronMetricsClient,
                                 topology_id: str, cluster: str, environ: str,
                                 start: dt.datetime,
                                 end: dt.datetime) -> pd.DataFrame:
    """ Get a DataFrame with the instance to instance routing probabilities for
    each source instance's output streams.

    Arguments:
        metrics_client (HeronMetricsClient):    The metrics client from which
                                                to extract the receive count
                                                data.
        topology_id (str):  The topology identification string.
        cluster (str): The cluster the topology is running on.
        environ (str): The environment the topology is running in.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metrics gathering window.
        end (dt.datetime):  The UTC datetime object for the end of the metrics
                            gathering window.

    Returns:
        pandas.DataFrame: A DataFrame with the following columns:

        * source_component: The source instance's component name.
        * source_task: The source instance's task ID.
        * stream: The stream ID string for the outgoing stream from the source.
        * destination_component: The destination instance's component name.
        * destination_task: The destination instance's task ID.
        * routing_probability: The proportion (between 0 and 1) of the tuples
          sent by the source instance on the specified stream to the
          destination component that were received by the destination
          instance.
    """

    LOG.info(
        "Calculating instance to instance routing probabilities for "
        "topology %s for period from %s to %s", topology_id, start.isoformat(),
        end.isoformat())

    # Get the receive counts for the topology
    rec_counts: pd.DataFrame = metrics_client.get_receive_counts(
        topology_id, cluster, environ, start, end)

    # Get the instance to instance transfers
    transfer_counts: pd.DataFrame = (
        rec_counts.groupby(
            ["source_component", "source_task", "stream", "component",
             "task"])["receive_count"].sum().reset_index()
        .rename(index=str, columns={"receive_count": "transfer_count"}))

    # Get the total sent by each source instance, on each stream, to each
    # destination component
    total_emissions: pd.DataFrame = (
        rec_counts.groupby(
            ["source_component", "source_task", "stream",
             "component"])["receive_count"].sum().reset_index()
        .rename(index=str, columns={"receive_count": "total_emitted"}))

    # Merge the total emissions from each instance and the total transferred
    # between instances into a single DataFrame
    merged_counts: pd.DataFrame = total_emissions.merge(
        transfer_counts,
        on=["source_component", "source_task", "stream", "component"])

    # Calculate the routing probability. A zero total gives 0 / 0 = NaN which
    # is mapped to a probability of zero. The result is assigned back rather
    # than filled in place on the column selection, as the latter does not
    # update the frame when pandas Copy-on-Write is active.
    merged_counts["routing_probability"] = (
        merged_counts["transfer_count"] /
        merged_counts["total_emitted"]).fillna(0)

    # index=str is kept from the original implementation so that the index
    # labels of the returned frame are unchanged for callers
    merged_counts = merged_counts.rename(
        index=str,
        columns={
            "component": "destination_component",
            "task": "destination_task"
        })

    return merged_counts[[
        "source_component", "source_task", "stream", "destination_component",
        "destination_task", "routing_probability"
    ]]
def calculate_ISAP(metrics_client: HeronMetricsClient, topology_id: str,
                   cluster: str, environ: str, start: dt.datetime,
                   end: dt.datetime, **kwargs: Union[str, int,
                                                     float]) -> pd.DataFrame:
    """ Calculates the Instance Stream Activation Proportion (ISAP) for each
    instance in the specified topology. This is the proportion, relative to the
    total activation of all instances of the same component, that each instance
    is active for each (stream, source component) combination.

    Under certain situations the ISAP can be used as a proxy for the routing
    probability of incoming connections to the instance.

    Arguments:
        metrics_client (HeronMetricsClient):    The metrics client from which
                                                to extract the execute count
                                                data.
        topology_id (str):  The topology identification string.
        cluster (str): The cluster the topology is running in.
        environ (str): The environment the topology is running in.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metrics gathering window.
        end (dt.datetime):  The UTC datetime object for the end of the metrics
                            gathering window.
        **kwargs:   Additional keyword arguments required by the methods of
                    the supplied metrics client instance.

    Returns:
        pandas.DataFrame:   The execute counts returned by the metrics client
        with the following two columns added:

        * component_total: The sum of the execute counts of all rows sharing
          the same component, stream, source component and timestamp.
        * ISAP: The row's execute count divided by its component_total, or 0
          where that division is undefined.
    """

    LOG.info(
        "Calculating ISAP for topology %s over a %d second period from "
        "%s to %s", topology_id, (end - start).total_seconds(),
        start.isoformat(), end.isoformat())

    execute_counts: pd.DataFrame = metrics_client.get_execute_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    group_keys = ["component", "stream", "source_component", "timestamp"]

    # Total executed per component for every stream, source component and
    # timestamp combination
    component_totals: pd.DataFrame = (
        execute_counts.groupby(group_keys).execute_count.sum().reset_index()
        .rename(index=str, columns={"execute_count": "component_total"}))

    ex_counts_totals: pd.DataFrame = execute_counts.merge(component_totals,
                                                          on=group_keys)

    # A zero total gives 0 / 0 = NaN which is mapped to an ISAP of zero. The
    # result is assigned back rather than filled in place on the column
    # selection, as the latter does not update the frame when pandas
    # Copy-on-Write is active.
    ex_counts_totals["ISAP"] = (ex_counts_totals["execute_count"] /
                                ex_counts_totals["component_total"]).fillna(0)

    return ex_counts_totals
Beispiel #8
0
def get_spout_state(
        metrics_client: HeronMetricsClient,
        topology_id: str,
        cluster: str,
        environ: str,
        tracker_url: str,
        start: dt.datetime,
        end: dt.datetime,
        metrics_sample_period: float,
        summary_method: str = "median",
        **kwargs: Union[str, int, float]) -> Dict[int, Dict[str, float]]:
    """ Helper script that will fetch the median or mean spout emission rates
    and format them into the dictionary structure expected by the topology
    performance prediction methods.

    Arguments:
        metrics_client (HeronMetricsClient):    The client for the metrics
                                                database.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster that the topology is running on.
        environ (str): The environment that the topology is running in.
        tracker_url (str):  The URL for the Heron Tracker API.
        start (datetime):   The UTC datetime for the start of the metrics
                            gathering period.
        end (datetime): The UTC datetime for the end of the metrics
                        gathering period.
        metrics_sample_period (float):  The period that metrics are sampled
                                        into. eg 60 secs (1 min), 300 secs
                                        (5 mins).
        summary_method (str):   The method to use to summarise the emit counts.
                                Either "mean" or "median". Defaults to median.
        **kwargs:   Any additional keyword arguments required by the metrics
                    client.

    Returns:
        Dict[int, Dict[str, float]]:    A dictionary mapping from task ID to a
        dict that maps from output stream name to an emission rate in tuples
        per second.

    Raises:
        RuntimeError:   If the summary method is neither "median" nor "mean".
    """

    LOG.info(
        "Getting spout emission state dictionary for topology %s over a "
        "period of %d seconds from %s to %s", topology_id,
        (end - start).total_seconds(), start.isoformat(), end.isoformat())

    lplan: Dict[str, Any] = tracker.get_logical_plan(tracker_url, cluster,
                                                     environ, topology_id)

    emit_counts: pd.DataFrame = metrics_client.get_emit_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    # Only the emissions of the spout components are of interest, summarised
    # per instance output stream
    spout_groups: pd.core.groupby.DataFrameGroupBy = \
        (emit_counts[emit_counts["component"].isin(lplan["spouts"])]
         .groupby(["task", "stream"]))

    if summary_method == "median":

        spout_emits: pd.Series = spout_groups.emit_count.median()

    elif summary_method == "mean":

        spout_emits = spout_groups.emit_count.mean()

    else:
        msg: str = f"Unknown summary method: {summary_method}"
        LOG.error(msg)
        raise RuntimeError(msg)

    output: DefaultDict[int, Dict[str, float]] = defaultdict(dict)

    # Series.iteritems was removed in pandas 2.0, items is the equivalent
    # that works on every pandas version
    for (task_id, stream), emit_count in spout_emits.items():

        # Convert the per sample period count into a per second rate
        output[task_id][stream] = emit_count / metrics_sample_period

    return dict(output)