def register_watch(self, callback):
    """
    Register 'callback' as a watch and return the UUID it is stored under.

    The callback must accept exactly one argument, the topology (this
    object). It is invoked once right away, before being stored; if that
    first invocation raises any Exception the watch is not registered.

    Returns the UUID (usable later to unregister the watch), or None when
    the initial callback invocation failed or no unused UUID was produced
    within the retry budget.
    """
    max_attempts = 5

    # A collision with an already registered UUID is extremely unlikely,
    # but retry a few times just in case.
    for _attempt in range(max_attempts):
        candidate = uuid.uuid4()
        if candidate in self.watches:
            continue

        Log.info("Registering a watch with uid: " + str(candidate))
        try:
            # Fire the callback at registration time.
            callback(self)
        except Exception as err:
            Log.error("Caught exception while triggering callback: " + str(err))
            Log.debug(traceback.format_exc())
            return None

        self.watches[candidate] = callback
        return candidate

    return None
def get(self):
    """
    Fetch a slice of a file from the heron-shell of a container.

    Reads cluster, role, environ, topology, container, path, offset and
    length from the request, looks up the stream manager of the container
    in the topology's physical plan, and requests
    /filedata/<path>?offset=..&length=.. from that host's shell port.
    The JSON-decoded response body is written as the success response.

    Written as a generator-style coroutine: it yields the HTTP fetch.
    Any exception is logged at debug level and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        name = self.get_argument_topology()
        container = self.get_argument(constants.PARAM_CONTAINER)
        path = self.get_argument(constants.PARAM_PATH)
        offset = self.get_argument_offset()
        length = self.get_argument_length()

        info = self.tracker.getTopologyInfo(name, cluster, role, environ)

        # Stream managers are keyed as "stmgr-<container>" in the physical plan.
        stmgr_key = "stmgr-" + container
        stmgr = info["physical_plan"]["stmgrs"][stmgr_key]
        host, shell_port = stmgr["host"], stmgr["shell_port"]

        url_parts = (host, shell_port, path, offset, length)
        file_data_url = "http://%s:%d/filedata/%s?offset=%s&length=%s" % url_parts

        client = tornado.httpclient.AsyncHTTPClient()
        reply = yield client.fetch(file_data_url)
        self.write_success_response(json.loads(reply.body))
        self.finish()
    except Exception as err:
        Log.debug(traceback.format_exc())
        self.write_error_response(err)
def get_logical_plan(cluster, env, topology, role):
    """
    Synchronously fetch the logical plan of a topology.

    Runs API.get_logical_plan(cluster, env, topology, role) to completion
    on the IOLoop instance and returns its result. Any exception is logged
    at debug level (with traceback) and re-raised.
    """
    def fetch():
        return API.get_logical_plan(cluster, env, topology, role)

    io_loop = tornado.ioloop.IOLoop.instance()
    try:
        return io_loop.run_sync(fetch)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get_topology_metrics(*args):
    """
    Synchronously fetch topology metrics.

    Forwards all positional arguments unchanged to API.get_comp_metrics,
    runs that call to completion on the IOLoop instance and returns its
    result. Any exception is logged at debug level (with traceback) and
    re-raised.
    """
    def fetch():
        return API.get_comp_metrics(*args)

    io_loop = tornado.ioloop.IOLoop.instance()
    try:
        return io_loop.run_sync(fetch)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get_cluster_topologies(cluster):
    """
    Synchronously fetch the topologies under a cluster.

    Runs API.get_cluster_topologies(cluster) to completion on the IOLoop
    instance and returns its result. Any exception is logged at debug level
    (with traceback) and re-raised.
    """
    def fetch():
        return API.get_cluster_topologies(cluster)

    io_loop = tornado.ioloop.IOLoop.instance()
    try:
        return io_loop.run_sync(fetch)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get_cluster_role_env_topologies(cluster, role, env):
    """
    Synchronously fetch the topologies under a cluster that were submitted
    by a role under an env.

    Runs API.get_cluster_role_env_topologies(cluster, role, env) to
    completion on the IOLoop instance and returns its result. Any exception
    is logged at debug level (with traceback) and re-raised.
    """
    def fetch():
        return API.get_cluster_role_env_topologies(cluster, role, env)

    io_loop = tornado.ioloop.IOLoop.instance()
    try:
        return io_loop.run_sync(fetch)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get_clusters():
    """
    Synchronously fetch all cluster names.

    Runs API.get_clusters() to completion on the IOLoop instance and
    returns its result. Any exception is logged at debug level (with
    traceback) and re-raised.
    """
    def fetch():
        return API.get_clusters()

    io_loop = tornado.ioloop.IOLoop.instance()
    try:
        return io_loop.run_sync(fetch)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get_component_metrics(component, cluster, env, topology, role):
    """
    Synchronously fetch the metrics of one component.

    Queries get_topology_metrics for every query returned by
    metric_queries(), with an empty instance list and the time range
    [0, -1], and returns the "metrics" entry of the result. Any exception
    raised by the lookup is logged at debug level and re-raised.
    """
    queries = metric_queries()
    no_instances = []
    time_range = [0, -1]
    try:
        response = get_topology_metrics(
            cluster, env, topology, component,
            no_instances, queries, time_range, role)
        return response["metrics"]
    except Exception:
        Log.debug(traceback.format_exc())
        raise
def get(self):
    """
    Write the topology info of the requested topology as the response.

    Reads cluster, role, environ and topology name from the request, asks
    the tracker for the matching topology info and writes it as the success
    response. Any exception is logged at debug level and written as an
    error response.
    """
    try:
        # Tuple elements are evaluated left to right, so the arguments are
        # read in the order cluster, role, environ, topology.
        cluster, role, environ, name = (
            self.get_argument_cluster(),
            self.get_argument_role(),
            self.get_argument_environ(),
            self.get_argument_topology(),
        )
        info = self.tracker.getTopologyInfo(name, cluster, role, environ)
        self.write_success_response(info)
    except Exception as err:
        Log.debug(traceback.format_exc())
        self.write_error_response(err)
def get(self):
    """
    Write the memory histogram of an instance as the response.

    Reads cluster, role, environ, topology name and instance from the
    request, looks up the topology info through the tracker, and yields
    self.getInstanceMemoryHistogram(topology_info, instance). Whatever is
    sent back into the generator for that yield is written as the success
    response.

    Any exception is logged at debug level and written as an error response.
    """
    try:
        # Tuple elements are evaluated left to right, preserving the order
        # in which the request arguments are read.
        cluster, role, environ, name, instance = (
            self.get_argument_cluster(),
            self.get_argument_role(),
            self.get_argument_environ(),
            self.get_argument_topology(),
            self.get_argument_instance(),
        )
        info = self.tracker.getTopologyInfo(name, cluster, role, environ)
        histogram = yield self.getInstanceMemoryHistogram(info, instance)
        self.write_success_response(histogram)
    except Exception as err:
        Log.debug(traceback.format_exc())
        self.write_error_response(err)
def getInstancePid(topology_info, instance_id):
    """
    Fetch the pid of an instance from heron-shell.

    Module-level (not a method) because other modules use it as well.

    Written as a generator-style Tornado coroutine: it yields the HTTP
    fetch of "<shell endpoint>/pid/<instance_id>" and delivers the raw
    response body through tornado.gen.Return.

    A tornado.httpclient.HTTPError is converted into a plain Exception
    carrying the same message.
    """
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        shell_endpoint = utils.make_shell_endpoint(topology_info, instance_id)
        pid_url = "%s/pid/%s" % (shell_endpoint, instance_id)
        Log.debug("HTTP call for url: %s", pid_url)
        reply = yield client.fetch(pid_url)
    except tornado.httpclient.HTTPError as err:
        raise Exception(str(err))
    else:
        raise tornado.gen.Return(reply.body)
def on_topologies_watch(state_manager, topologies):
    """
    Watch callback that reconciles the tracked topologies of one state
    manager with the current list of topology names.

    Topologies that are tracked for state_manager.name but no longer in
    'topologies' are removed; names in 'topologies' that are not tracked
    yet are added.

    Note: 'self' is not a parameter. It is resolved from the enclosing
    scope, which is not visible in this block.
    """
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: " + str(topologies))
    existingTopologies = self.getTopologiesForStateLocation(state_manager.name)
    # Build a real list: the names are logged, iterated and then used for
    # membership tests. A map() object is a one-shot iterator on Python 3
    # and would be exhausted after the first loop, so every topology would
    # look new and be added again.
    existingTopNames = [t.name for t in existingTopologies]
    Log.debug("Existing topologies: " + str(existingTopNames))

    # Drop topologies that disappeared from the state manager.
    for name in existingTopNames:
        if name not in topologies:
            Log.info("Removing topology: %s in rootpath: %s",
                     name, state_manager.rootpath)
            self.removeTopology(name, state_manager.name)

    # Start tracking topologies that are new.
    for name in topologies:
        if name not in existingTopNames:
            self.addNewTopology(state_manager, name)
def trigger_watches(self):
    """
    Call every registered watch callback with this object.

    If a callback raises an Exception, the error is logged and the
    corresponding watch is unregistered once all callbacks have run.
    """
    to_remove = []
    # Iterate over a snapshot of the watches: dict.items() exists on both
    # Python 2 and 3 (iteritems() is Python 2 only), and copying it into a
    # list keeps the loop valid even if a callback registers or unregisters
    # a watch while it runs.
    for uid, callback in list(self.watches.items()):
        try:
            callback(self)
        except Exception as e:
            Log.error("Caught exception while triggering callback: " + str(e))
            Log.debug(traceback.format_exc())
            to_remove.append(uid)

    # Unregister after the loop so that removal never interferes with it.
    for uid in to_remove:
        self.unregister_watch(uid)
def getInstanceMemoryHistogram(self, topology_info, instance_id):
    """
    Fetch the memory histogram of an instance from heron-shell.

    Written as a generator-style Tornado coroutine. It first yields
    getInstancePid(topology_info, instance_id), parses the reply as JSON
    and takes the stripped 'stdout' field as the pid. It then yields the
    HTTP fetch of "<shell endpoint>/histo/<pid>" and delivers the raw
    response body through tornado.gen.Return.

    Raises Exception('Failed to get pid') when the pid is empty. A
    tornado.httpclient.HTTPError from the histogram request is converted
    into a plain Exception carrying the same message.
    """
    pid_response = yield getInstancePid(topology_info, instance_id)
    try:
        http_client = tornado.httpclient.AsyncHTTPClient()
        pid_json = json.loads(pid_response)
        pid = pid_json['stdout'].strip()
        if pid == '':
            raise Exception('Failed to get pid')
        endpoint = utils.make_shell_endpoint(topology_info, instance_id)
        url = "%s/histo/%s" % (endpoint, pid)
        # Log before issuing the request (as getInstancePid does) so the
        # URL is visible even when the fetch itself fails.
        Log.debug("HTTP call for url: %s", url)
        response = yield http_client.fetch(url)
        raise tornado.gen.Return(response.body)
    except tornado.httpclient.HTTPError as e:
        raise Exception(str(e))
def get(self):
    """
    Run a metrics query against a topology and write the result.

    Reads cluster, role, environ and topology name from the request and
    resolves the topology through the tracker. The start and end time are
    read and checked with validateInterval, then the query is read and
    self.executeMetricsQuery is run through tornado.gen.Task with the
    topology's tmaster, the query and the integer start/end times.
    The yielded result is written as the success response.

    Any exception is logged at debug level and written as an error response.
    """
    try:
        # Tuple elements are evaluated left to right, preserving the order
        # in which the request arguments are read.
        cluster, role, environ, name = (
            self.get_argument_cluster(),
            self.get_argument_role(),
            self.get_argument_environ(),
            self.get_argument_topology(),
        )
        topology = self.tracker.getTopologyByClusterRoleEnvironAndName(
            cluster, role, environ, name)

        begin = self.get_argument_starttime()
        finish = self.get_argument_endtime()
        self.validateInterval(begin, finish)

        query = self.get_argument_query()
        result = yield tornado.gen.Task(
            self.executeMetricsQuery,
            topology.tmaster, query, int(begin), int(finish))
        self.write_success_response(result)
    except Exception as err:
        Log.debug(traceback.format_exc())
        self.write_error_response(err)
def get(self):
    """
    Fetch file stats for a path from the heron-shell of a container.

    Reads cluster, role, environ, topology and container from the request,
    plus an optional path that defaults to ".". The stream manager of the
    container is looked up in the topology's physical plan and the file
    stats URL for its host and shell port is fetched. The JSON-decoded
    response body is written as the success response.

    Written as a generator-style coroutine: it yields the HTTP fetch.
    Any exception is logged at debug level and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        name = self.get_argument_topology()
        container = self.get_argument(constants.PARAM_CONTAINER)
        path = self.get_argument(constants.PARAM_PATH, default=".")

        info = self.tracker.getTopologyInfo(name, cluster, role, environ)

        # Stream managers are keyed as "stmgr-<container>" in the physical plan.
        stmgr_key = "stmgr-" + str(container)
        stmgr = info["physical_plan"]["stmgrs"][stmgr_key]
        host, shell_port = stmgr["host"], stmgr["shell_port"]

        stats_url = utils.make_shell_filestats_url(host, shell_port, path)
        client = tornado.httpclient.AsyncHTTPClient()
        reply = yield client.fetch(stats_url)
        self.write_success_response(json.loads(reply.body))
        self.finish()
    except Exception as err:
        Log.debug(traceback.format_exc())
        self.write_error_response(err)
def getMetricsTimeline(tmaster, component_name, metric_names, instances,
                       start_time, end_time, callback=None):
    """
    Get the specified metrics for the given component name of this topology.

    Written as a generator-style Tornado coroutine: it yields the HTTP
    request to the tmaster stats endpoint and delivers its result through
    tornado.gen.Return.

    Arguments:
      tmaster        - tmaster proto object; must have 'host' and 'stats_port'.
      component_name - component whose metrics are requested.
      metric_names   - iterable of metric names to request.
      instances      - iterable of instance ids; when empty, no instance
                       filter is sent and metrics for all instances are
                       fetched by default.
      start_time     - start of the explicit interval.
      end_time       - end of the explicit interval.
      callback       - not used by this function; kept for interface
                       compatibility (presumably for callback-style
                       callers - confirm against call sites).

    Result on success:
    {
      "timeline": {
        <metricname>: {
          <instance>: {
            <start_time> : <numeric value>,
            <start_time> : <numeric value>,
            ...
          }
          ...
        }, ...
      },
      "starttime": <numeric value>,
      "endtime": <numeric value>,
      "component": "..."
    }

    Raises Exception when no usable tmaster is given, when the HTTP request
    fails, or when the response code is 400 or above.
    """
    # Tmaster is the proto object and must have host and port for stats.
    if not tmaster or not tmaster.host or not tmaster.stats_port:
        raise Exception("No Tmaster found")

    host = tmaster.host
    port = tmaster.stats_port

    # Create the proto request object to get metrics.
    metricRequest = tmaster_pb2.MetricRequest()
    metricRequest.component_name = component_name

    # If no instances are given, nothing is appended and metrics for all
    # instances are fetched by default.
    for instance in instances:
        metricRequest.instance_id.append(instance)

    for metricName in metric_names:
        metricRequest.metric.append(metricName)

    metricRequest.explicit_interval.start = start_time
    metricRequest.explicit_interval.end = end_time
    metricRequest.minutely = True

    # Serialize the metricRequest to send as a payload
    # with the HTTP request.
    metricRequestString = metricRequest.SerializeToString()

    # Form and send the http request.
    url = "http://{0}:{1}/stats".format(host, port)
    request = tornado.httpclient.HTTPRequest(url,
                                             body=metricRequestString,
                                             method='POST',
                                             request_timeout=5)

    Log.debug("Making HTTP call to fetch metrics")
    Log.debug("url: " + url)
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        result = yield client.fetch(request)
        Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
        raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
        # The code is numeric; convert it explicitly, since concatenating
        # it to a str directly would raise TypeError instead of reporting
        # the intended error.
        message = "Error in getting metrics from Tmaster, code: " + str(responseCode)
        Log.error(message)
        raise Exception(message)

    # Parse the response from tmaster.
    metricResponse = tmaster_pb2.MetricResponse()
    metricResponse.ParseFromString(result.body)

    # A NOTOK status is only logged; whatever metrics are in the response
    # are still returned.
    if metricResponse.status.status == common_pb2.NOTOK:
        if metricResponse.status.HasField("message"):
            Log.error(metricResponse.status.message)

    # Form the response.
    timeline = {}
    ret = {
        "starttime": start_time,
        "endtime": end_time,
        "component": component_name,
        "timeline": timeline,
    }

    # Loop through all the metrics
    # One instance corresponds to one metric, which can have
    # multiple IndividualMetrics for each metricname requested.
    for metric in metricResponse.metric:
        instance = metric.instance_id

        # Loop through all individual metrics.
        for im in metric.metric:
            # Create the per-metric and per-instance dicts on first use.
            series = timeline.setdefault(im.name, {}).setdefault(instance, {})

            # We get minutely metrics.
            # Interval-values correspond to the minutely mark for which
            # this metric value corresponds to.
            for interval_value in im.interval_values:
                series[interval_value.interval.start] = interval_value.value

    raise tornado.gen.Return(ret)
def on_topology_tmaster(data):
    """
    Watch callback: store the new tmaster data on the topology.

    'topology' and 'topologyName' are resolved from the enclosing scope,
    which is not visible in this block. The data is passed to
    topology.set_tmaster even when it is empty; an empty value is
    additionally noted in the debug log.
    """
    Log.info("Watch triggered for topology tmaster: " + topologyName)
    topology.set_tmaster(data)
    if data:
        return
    Log.debug("No data to be set")
def on_topology_scheduler_location(data):
    """
    Watch callback: store the new scheduler location on the topology.

    'topology' and 'topologyName' are resolved from the enclosing scope,
    which is not visible in this block. The data is passed to
    topology.set_scheduler_location even when it is empty; an empty value
    is additionally noted in the debug log.
    """
    Log.info("Watch triggered for topology scheduler location: " + topologyName)
    topology.set_scheduler_location(data)
    if data:
        return
    Log.debug("No data to be set")
def on_topology_execution_state(data):
    """
    Watch callback: store the new execution state on the topology.

    'topology' and 'topologyName' are resolved from the enclosing scope,
    which is not visible in this block. The data is passed to
    topology.set_execution_state even when it is empty; an empty value is
    additionally noted in the debug log.
    """
    Log.info("Watch triggered for topology execution state: " + topologyName)
    topology.set_execution_state(data)
    if data:
        return
    Log.debug("No data to be set")
def log_debug(message, ident=0):
    """
    Log 'message' at debug level, indented by 'ident' levels.

    The message is converted with str() and prefixed with padding built
    by repeating the pad string (ident * 2) times.
    """
    padding = " " * (ident * 2)
    text = padding + str(message)
    Log.debug(text)
def on_topology_pplan(data):
    """
    Watch callback: store the new physical plan on the topology.

    'topology' and 'topologyName' are resolved from the enclosing scope,
    which is not visible in this block. The data is passed to
    topology.set_physical_plan even when it is empty; an empty value is
    additionally noted in the debug log.
    """
    Log.info("Watch triggered for topology pplan: " + topologyName)
    topology.set_physical_plan(data)
    if data:
        return
    Log.debug("No data to be set")