Esempio n. 1
0
def _build_graph(
    graph_client: GremlinClient,
    tracker_url: str,
    cluster: str,
    environ: str,
    topology_id: str,
    ref_prefix: str = "heron",
) -> str:
    """ Builds the physical graph for a topology and returns the reference
    it was stored under.

    Arguments:
        graph_client (GremlinClient):   The client handed to the graph
                                        builder.
        tracker_url (str):  The URL passed to the tracker plan queries.
        cluster (str):  The cluster the topology is running in.
        environ (str):  The environment the topology is running in.
        topology_id (str):  The topology identification string.
        ref_prefix (str):   Prefix forwarded to create_graph_ref when the
                            reference is generated. Defaults to "heron".

    Returns:
        str:    The topology reference produced by create_graph_ref and
        passed to the graph builder.
    """
    # The reference is generated before any tracker request is issued.
    graph_ref: str = create_graph_ref(cluster, environ, topology_id, ref_prefix)

    # Both plan queries address the same topology, so share the arguments.
    plan_query = (tracker_url, cluster, environ, topology_id)
    lplan: Dict[str, Any] = tracker.get_logical_plan(*plan_query)
    pplan: Dict[str, Any] = tracker.get_physical_plan(*plan_query)

    builder.create_physical_graph(graph_client, topology_id, graph_ref, lplan, pplan)

    return graph_ref
Esempio n. 2
0
def add_pplan_info(tracker_url: str,
                   topologies: pd.DataFrame = None) -> pd.DataFrame:
    """ Combines information from the topology summary DataFrame with
    information from the physical plan of each topology.

    Topologies whose physical plan cannot be fetched (the tracker call raises
    requests.HTTPError) are skipped and do not appear in the output.

    Arguments:
        tracker_url (str):  The URL for the Heron Tracker API
        topologies (pd.DataFrame):  The topologies summary from the heron
                                    tracker can be supplied, if not it will be
                                    fetched fresh from the Tracker API. It
                                    must provide "cluster", "environ", "user"
                                    and "topology" columns.

    Returns:
        pandas.DataFrame:   A new DataFrame with one row per topology whose
        physical plan could be fetched. Each row holds the topology, cluster,
        environ and user values, the scalar entries of the plan's "config"
        dictionary (numeric where they can be converted to float), the total
        instance count and the list of instance counts per stream manager.
        The supplied DataFrame is not modified.
    """
    if topologies is None:
        topologies = tracker.get_topologies(tracker_url)

    output: List[Dict[str, Union[str, float, List[int]]]] = []

    for (cluster, environ,
         user), data in topologies.groupby(["cluster", "environ", "user"]):
        for topology_id in data.topology:

            try:
                pplan: Dict[str, Any] = tracker.get_physical_plan(
                    tracker_url, cluster, environ, topology_id)
            except requests.HTTPError:
                # If we cannot fetch the plan, skip this topology but leave a
                # trace so the missing row can be explained later.
                LOG.warning(
                    "Could not fetch the physical plan for topology %s "
                    "(cluster: %s, environ: %s), skipping it",
                    topology_id,
                    cluster,
                    environ,
                )
                continue

            row: Dict[str, Union[str, float, List[int]]] = {
                "topology": topology_id,
                "cluster": cluster,
                "environ": environ,
                "user": user,
            }

            # Add information from the configuration dictionary
            config: Dict[str, str] = pplan["config"]
            for key, value in config.items():

                # Some of the custom config values are large dictionaries or
                # lists so we will skip them
                if isinstance(value, (dict, list)):
                    continue

                # Drop the first dotted segment of the key and join the
                # remaining segments with "_" so the column can be accessed
                # as an attribute of the DataFrame.
                new_key: str = "_".join(key.split(".")[1:])

                # Try to convert any values that are numeric so we can do
                # summary stats
                try:
                    new_value: Union[str, float] = float(value)
                except ValueError:
                    # Not a numeric string, keep the raw value
                    new_value = value
                except TypeError:
                    LOG.error(
                        "Value of key: %s was not a string or number it"
                        " was a %s",
                        key,
                        str(type(value)),
                    )
                    # There is no usable value for this key. Falling through
                    # would either fail on an unbound name or store the
                    # previous key's value under this column.
                    continue

                row[new_key] = new_value

            # Add instances stats for this topology
            row["total_instances"] = len(pplan["instances"])
            row["instances_per_container_dist"] = [
                len(stmgr["instance_ids"])
                for stmgr in pplan["stmgrs"].values()
            ]

            output.append(row)

    return pd.DataFrame(output)
Esempio n. 3
0
    def get_complete_latencies(
        self,
        topology_id: str,
        cluster: str,
        environ: str,
        start: dt.datetime,
        end: dt.datetime,
        **kwargs: Union[str, int, float],
    ) -> pd.DataFrame:
        """ Gets the complete latencies, as a timeseries, for every instance
        of all the spout components of the specified topology. The start
        and end times define the window over which to gather the metrics. The
        window duration should be less than 3 hours as this is the limit of
        what the Topology master stores.

        Spout components whose metrics request fails with an HTTPError are
        logged and left out of the result.

        Arguments:
            topology_id (str):    The topology identification string.
            cluster (str):  The cluster the topology is running in.
            environ (str):  The environment the topology is running in (eg.
                            prod, devel, test, etc).
            start (datetime):    utc datetime instance for the start of the
                                    metrics gathering period.
            end (datetime):  utc datetime instance for the end of the
                                metrics gathering period.
            **kwargs:   Accepted for interface compatibility; not used by
                        this method.

        Returns:
            pandas.DataFrame: A DataFrame containing the complete latency
            measurements as a timeseries. Each row represents a measurement
            (aggregated over one minute) with the following columns:

            * timestamp:  The UTC timestamp for the metric,
            * component: The component this metric comes from,
            * task: The instance ID number for the instance that the metric
              comes from,
            * container:  The ID for the container this metric comes from,
            * stream: The name of the incoming stream from which the tuples
              that lead to this metric came from,
            * latency_ms: The average complete latency measurement in
              milliseconds for that metric time period.

            The per-spout results are concatenated with a fresh index. An
            empty DataFrame is returned if the topology has no spouts, if
            every spout request failed, or if the reliability mode does not
            provide complete latencies.

        Warns:
            RuntimeWarning: If the specified topology has a reliability mode
                            that does not enable complete latency.
        """
        LOG.info(
            "Getting complete latencies for topology %s over a %d second "
            "period from %s to %s",
            topology_id,
            (end - start).total_seconds(),
            start.isoformat(),
            end.isoformat(),
        )

        logical_plan, start_time, end_time = self._query_setup(
            topology_id, cluster, environ, start, end
        )

        # First we need to check that the supplied topology will actually have
        # complete latencies. Only ATLEAST_ONCE and EXACTLY_ONCE will have
        # complete latency values as acking is disabled for ATMOST_ONCE.
        physical_plan: Dict[str, Any] = tracker.get_physical_plan(
            self.tracker_url, cluster, environ, topology_id
        )
        if physical_plan["config"]["topology.reliability.mode"] == "ATMOST_ONCE":
            rm_msg: str = (
                f"Topology {topology_id} reliability mode is set "
                f"to ATMOST_ONCE. Complete latency is not "
                f"available for these types of topologies"
            )
            LOG.warning(rm_msg)
            warnings.warn(rm_msg, RuntimeWarning)
            return pd.DataFrame()

        # One DataFrame per spout component that was fetched successfully
        frames = []

        spouts: Dict[str, Any] = logical_plan["spouts"]
        for spout_component in spouts:

            try:
                spout_complete_latencies: pd.DataFrame = self.get_spout_complete_latencies(
                    topology_id,
                    cluster,
                    environ,
                    spout_component,
                    start_time,
                    end_time,
                    logical_plan,
                )
            except HTTPError as http_error:
                LOG.warning(
                    "Fetching complete latencies for component %s "
                    "failed with status code %s",
                    spout_component,
                    str(http_error.response.status_code),
                )
                # Nothing was fetched for this spout. Without skipping, the
                # code below would use an unbound name (first spout) or add
                # the previous spout's rows a second time.
                continue

            frames.append(spout_complete_latencies)

        if not frames:
            # Keep the declared return type even when there is no data
            return pd.DataFrame()

        # A single concat replaces repeated DataFrame.append calls, which
        # are no longer available in pandas 2.x.
        return pd.concat(frames, ignore_index=True)
Esempio n. 4
0
            print(MSG2)
        else:
            LOG.error(MSG2)

        sys.exit(1)

    # Timestamp taken before any tracker or graph work starts.
    TIMER_START = dt.datetime.now()

    # The Heron Tracker URL is read from the loaded configuration; cast() only
    # informs the type checker and does not convert the value.
    TRACKER_URL: str = cast(str, CONFIG[ConfKeys.HERON_TRACKER_URL.value])

    # Fetch the logical and physical plans of the topology named on the
    # command line from the tracker.
    LPLAN: Dict[str, Any] = tracker.get_logical_plan(
        TRACKER_URL, ARGS.zone, ARGS.environment, ARGS.topology
    )

    PPLAN: Dict[str, Any] = tracker.get_physical_plan(
        TRACKER_URL, ARGS.zone, ARGS.environment, ARGS.topology
    )

    # The Gremlin client is configured from the "graph.client.config" entry.
    GRAPH_CLIENT: GremlinClient = GremlinClient(CONFIG["graph.client.config"])

    # Build the physical graph for the topology under the reference supplied
    # in ARGS.reference, using the two plans fetched above.
    builder.create_physical_graph(
        GRAPH_CLIENT, ARGS.topology, ARGS.reference, LPLAN, PPLAN
    )

    # The metrics client is only created when both ARGS.populate and
    # ARGS.duration are truthy.
    if ARGS.populate and ARGS.duration:

        # The client implementation is selected through configuration.
        # NOTE(review): presumably "heron.metrics.client" holds an importable
        # class path that loader.get_class resolves -- confirm.
        METRIC_CLIENT_CLASS: Type = loader.get_class(CONFIG["heron.metrics.client"])

        METRICS_CLIENT: HeronMetricsClient = METRIC_CLIENT_CLASS(
            CONFIG["heron.metrics.client.config"]
        )
Esempio n. 5
0
    def get_complete_latencies(
        self,
        topology_id: str,
        cluster: str,
        environ: str,
        start: dt.datetime,
        end: dt.datetime,
        **kwargs: Union[str, int, float],
    ) -> pd.DataFrame:
        """ Gets the complete latencies, as a timeseries, for every instance
        of all the spout components of the specified topology. The start
        and end times define the window over which to gather the metrics.

        Arguments:
            topology_id (str):    The topology identification string.
            cluster (str):  The cluster the topology is running in.
            environ (str):  The environment the topology is running in (eg.
                              prod, devel, test, etc).
            start (datetime):    utc datetime instance for the start of the
                                    metrics gathering period.
            end (datetime):  utc datetime instance for the end of the
                                metrics gathering period.
            **kwargs:   Accepted for interface compatibility; not used by
                        this method.

        Returns:
            pandas.DataFrame: A DataFrame containing the complete latency
            measurements as a timeseries. Each row represents a measurement
            with the following columns:

            * timestamp:  The UTC timestamp for the metric,
            * component: The component this metric comes from,
            * task: The instance ID number for the instance that the metric
              comes from,
            * container:  The ID for the container this metric comes from,
            * stream: The name of the incoming stream from which the tuples
              that lead to this metric came from,
            * latency_ms: The complete latency measurement in milliseconds
              for that metric time period.

            Points whose instance name cannot be parsed are logged and
            skipped. An empty DataFrame is returned when the topology's
            reliability mode is ATMOST_ONCE.

        Warns:
            RuntimeWarning: If the specified topology has a reliability mode
                            that does not enable complete latency.

        Raises:
            ConnectionError: If the physical plan cannot be extracted from the
                             Heron Tracker API.
        """

        # First we need to check that the supplied topology will actually have
        # complete latencies. Only ATLEAST_ONCE and EXACTLY_ONCE will have
        # complete latency values as acking is disabled for ATMOST_ONCE.
        try:
            physical_plan: Dict[str, Any] = tracker.get_physical_plan(
                self.tracker_url, cluster, environ, topology_id)
        except ConnectionError as conn_err:
            conn_msg: str = (f"Unable to connect to Heron Tracker API at: "
                             f"{self.tracker_url}. Cannot retrieve physical "
                             f"plan for topology: {topology_id}")
            LOG.error(conn_msg)
            # Chain the original error so the root cause stays visible in
            # the traceback.
            raise ConnectionError(conn_msg) from conn_err

        if physical_plan["config"][
                "topology.reliability.mode"] == "ATMOST_ONCE":
            rm_msg: str = (f"Topology {topology_id} reliability mode is set "
                           f"to ATMOST_ONCE. Complete latency is not "
                           f"available for these types of topologies")
            LOG.warning(rm_msg)
            warnings.warn(rm_msg, RuntimeWarning)
            return pd.DataFrame()

        start_time: str = convert_datetime_to_rfc3339(start)
        end_time: str = convert_datetime_to_rfc3339(end)

        database: str = create_db_name(self.database_prefix, topology_id,
                                       cluster, environ)

        LOG.info(
            "Fetching complete latencies for topology: %s on cluster: %s "
            "in environment: %s for a %s second time period between %s "
            "and %s",
            topology_id,
            cluster,
            environ,
            (end - start).total_seconds(),
            start_time,
            end_time,
        )

        self.client.switch_database(database)

        metric_name: str = "complete-latency"
        # Raw string: the pattern text is unchanged, but the backslashes are
        # no longer treated as (invalid) Python escape sequences.
        metric_regex: str = r"/complete\-latency\/+.*/"

        measurement_names: List[str] = self.get_metric_measurement_names(
            database, metric_name, metric_regex)

        output: List[Dict[str, Union[str, int, float, dt.datetime]]] = []

        for measurement_name in measurement_names:

            # Measurement names have the form "<metric>/<stream>". Split only
            # once so a "/" inside the stream part does not break unpacking.
            _, stream = measurement_name.split("/", 1)

            query_str: str = (f"SELECT Component, Instance, value "
                              f'FROM "{measurement_name}" '
                              f"WHERE time >= '{start_time}' "
                              f"AND time <= '{end_time}'")

            LOG.debug(
                "Querying %s measurements with influx QL statement: %s",
                metric_name,
                query_str,
            )

            results: ResultSet = self.client.query(query_str)

            for point in results.get_points():

                instance: Optional[re.Match] = re.search(
                    INSTANCE_NAME_RE, point["Instance"])

                if not instance:
                    LOG.warning("Could not parse instance name: %s",
                                point["Instance"])
                    continue

                instance_dict: Dict[str, str] = instance.groupdict()

                row: Dict[str, Union[str, int, float, dt.datetime]] = {
                    "timestamp": convert_rfc339_to_datetime(point["time"]),
                    "component": point["Component"],
                    "task": int(instance_dict["task"]),
                    "container": int(instance_dict["container"]),
                    "stream": stream,
                    "latency_ms": float(point["value"]),
                }

                output.append(row)

        return pd.DataFrame(output)