Ejemplo n.º 1
0
  def register_watch(self, callback):
    """
    Register 'callback' as a watch on this object.

    The callback must accept exactly one argument: the topology
    on which the watch was triggered. It is invoked once right
    away, as part of registration; if that first invocation
    raises any Exception, the watch is not registered.

    Returns the UUID under which the watch is stored (it can be
    used later to unregister the watch), or None if the watch
    could not be registered.
    """
    max_attempts = 5
    attempt = 0
    # A fresh UUID colliding with a stored one is very unlikely,
    # but retry a bounded number of times just in case.
    while attempt < max_attempts:
      attempt += 1
      candidate = uuid.uuid4()
      if candidate in self.watches:
        continue

      Log.info("Registering a watch with uid: " + str(candidate))
      try:
        callback(self)
      except Exception as err:
        Log.error("Caught exception while triggering callback: " + str(err))
        Log.debug(traceback.format_exc())
        return None

      self.watches[candidate] = callback
      return candidate
    return None
Ejemplo n.º 2
0
  def get(self):
    """
    Handle a GET request for a slice of a file in a container.

    Reads the cluster, role, environ, topology, container, path,
    offset and length request arguments, looks up the stream
    manager "stmgr-<container>" in the topology's physical plan,
    fetches "/filedata/<path>" from that stream manager's host and
    shell port, and writes the decoded JSON body as the success
    response. Any exception is logged at debug level and written
    as an error response.

    NOTE(review): this is a generator (it yields the fetch future),
    so it is presumably wrapped by a Tornado coroutine decorator
    outside this view -- confirm.
    """
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      name = self.get_argument_topology()
      container = self.get_argument(constants.PARAM_CONTAINER)
      path = self.get_argument(constants.PARAM_PATH)
      offset = self.get_argument_offset()
      length = self.get_argument_length()
      info = self.tracker.getTopologyInfo(name, cluster, role, environ)

      stmgr_key = "stmgr-" + container
      stmgrs = info["physical_plan"]["stmgrs"]
      stmgr = stmgrs[stmgr_key]
      url_fields = (stmgr["host"], stmgr["shell_port"], path, offset, length)
      file_data_url = "http://%s:%d/filedata/%s?offset=%s&length=%s" % url_fields

      client = tornado.httpclient.AsyncHTTPClient()
      reply = yield client.fetch(file_data_url)
      payload = json.loads(reply.body)
      self.write_success_response(payload)
      self.finish()
    except Exception as err:
      Log.debug(traceback.format_exc())
      self.write_error_response(err)
0
def get_logical_plan(cluster, env, topology, role):
  """
  Blocking wrapper around API.get_logical_plan.

  Runs the call to completion on the IOLoop instance and returns
  its result. On any exception the traceback is logged at debug
  level and the exception is re-raised.
  """
  io_loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_logical_plan(cluster, env, topology, role)

  try:
    return io_loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Ejemplo n.º 4
0
def get_topology_metrics(*args):
  """
  Blocking wrapper around API.get_comp_metrics.

  All positional arguments are forwarded unchanged. The call is
  run to completion on the IOLoop instance and its result is
  returned. On any exception the traceback is logged at debug
  level and the exception is re-raised.
  """
  io_loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_comp_metrics(*args)

  try:
    return io_loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Ejemplo n.º 5
0
def get_cluster_topologies(cluster):
  """
  Blocking wrapper around API.get_cluster_topologies.

  Runs the call for 'cluster' to completion on the IOLoop
  instance and returns its result. On any exception the
  traceback is logged at debug level and the exception is
  re-raised.
  """
  io_loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_cluster_topologies(cluster)

  try:
    return io_loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Ejemplo n.º 6
0
def get_cluster_role_env_topologies(cluster, role, env):
  """
  Blocking wrapper around API.get_cluster_role_env_topologies.

  Runs the call for the given cluster, role and env to
  completion on the IOLoop instance and returns its result. On
  any exception the traceback is logged at debug level and the
  exception is re-raised.
  """
  io_loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_cluster_role_env_topologies(cluster, role, env)

  try:
    return io_loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Ejemplo n.º 7
0
def get_clusters():
  """
  Blocking wrapper around API.get_clusters.

  Runs the call to completion on the IOLoop instance and returns
  its result. On any exception the traceback is logged at debug
  level and the exception is re-raised.
  """
  io_loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_clusters()

  try:
    return io_loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Ejemplo n.º 8
0
def get_component_metrics(component, cluster, env, topology, role):
  """
  Blocking call that returns the "metrics" entry for a component.

  Builds the query list with metric_queries(), passes it to
  get_topology_metrics together with an empty instance list and
  the interval [0, -1], and returns the "metrics" item of the
  result. On any exception the traceback is logged at debug
  level and the exception is re-raised.
  """
  queries = metric_queries()
  try:
    response = get_topology_metrics(
        cluster, env, topology, component, [], queries, [0, -1], role)
    metrics = response["metrics"]
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return metrics
Ejemplo n.º 9
0
 def get(self):
   """
   Handle a GET request for a topology's info.

   Reads the cluster, role, environ and topology request
   arguments, asks the tracker for the matching topology info and
   writes it as the success response. Any exception is logged at
   debug level and written as an error response.
   """
   try:
     cluster = self.get_argument_cluster()
     role = self.get_argument_role()
     environ = self.get_argument_environ()
     name = self.get_argument_topology()
     self.write_success_response(
         self.tracker.getTopologyInfo(name, cluster, role, environ))
   except Exception as err:
     Log.debug(traceback.format_exc())
     self.write_error_response(err)
Ejemplo n.º 10
0
 def get(self):
   """
   Handle a GET request for an instance's memory histogram.

   Reads the cluster, role, environ, topology and instance
   request arguments, looks up the topology info through the
   tracker, waits for getInstanceMemoryHistogram and writes its
   result as the success response. Any exception is logged at
   debug level and written as an error response.

   NOTE(review): this is a generator, so it is presumably wrapped
   by a Tornado coroutine decorator outside this view -- confirm.
   """
   try:
     cluster = self.get_argument_cluster()
     role = self.get_argument_role()
     environ = self.get_argument_environ()
     name = self.get_argument_topology()
     instance_id = self.get_argument_instance()
     info = self.tracker.getTopologyInfo(name, cluster, role, environ)
     histogram = yield self.getInstanceMemoryHistogram(info, instance_id)
     self.write_success_response(histogram)
   except Exception as err:
     Log.debug(traceback.format_exc())
     self.write_error_response(err)
Ejemplo n.º 11
0
def getInstancePid(topology_info, instance_id):
  """
  Fetch the pid of an instance from its shell endpoint.

  This is a module-level function rather than a method because
  other modules use it as well.

  Requests "<endpoint>/pid/<instance_id>" and returns the raw
  response body through tornado.gen.Return. A tornado HTTPError
  is re-raised as a plain Exception carrying its message.
  """
  try:
    client = tornado.httpclient.AsyncHTTPClient()
    shell_endpoint = utils.make_shell_endpoint(topology_info, instance_id)
    pid_url = "%s/pid/%s" % (shell_endpoint, instance_id)
    Log.debug("HTTP call for url: %s", pid_url)
    reply = yield client.fetch(pid_url)
    raise tornado.gen.Return(reply.body)
  except tornado.httpclient.HTTPError as err:
    raise Exception(str(err))
Ejemplo n.º 12
0
    def on_topologies_watch(state_manager, topologies):
      """
      Reconcile the tracked topologies with a new topology listing.

      'topologies' is the collection of topology names reported by
      the watch for 'state_manager'. Every currently tracked
      topology of that state manager whose name is missing from
      'topologies' is removed, and every name in 'topologies' that
      is not yet tracked is added.
      """
      Log.info("State watch triggered for topologies.")
      Log.debug("Topologies: " + str(topologies))
      existingTopologies = self.getTopologiesForStateLocation(state_manager.name)
      # Build a real list: the names are logged, iterated and then
      # used for repeated membership tests. A lazy map() (Python 3)
      # would be exhausted by the first loop and every topology
      # would be re-added by the second one.
      existingTopNames = [t.name for t in existingTopologies]
      Log.debug("Existing topologies: " + str(existingTopNames))
      for name in existingTopNames:
        if name not in topologies:
          Log.info("Removing topology: %s in rootpath: %s",
                   name, state_manager.rootpath)
          self.removeTopology(name, state_manager.name)

      for name in topologies:
        if name not in existingTopNames:
          self.addNewTopology(state_manager, name)
Ejemplo n.º 13
0
  def trigger_watches(self):
    """
    Call every registered watch callback with this object.

    A callback that raises an Exception is logged, and its watch
    is unregistered once all callbacks have been called.
    """
    to_remove = []
    # Iterate over a snapshot of the registry: dict.iteritems()
    # exists only on Python 2, and a snapshot also keeps the loop
    # valid if a callback registers or unregisters a watch.
    for uid, callback in list(self.watches.items()):
      try:
        callback(self)
      except Exception as e:
        Log.error("Caught exception while triggering callback: " + str(e))
        Log.debug(traceback.format_exc())
        to_remove.append(uid)

    # Unregister only after the loop so the failures of this round
    # are handled together.
    for uid in to_remove:
      self.unregister_watch(uid)
Ejemplo n.º 14
0
 def getInstanceMemoryHistogram(self, topology_info, instance_id):
   """
   Fetch the memory histogram of an instance from its shell endpoint.

   First obtains the pid response of the instance through
   getInstancePid, then requests "<endpoint>/histo/<pid>" and
   returns the raw response body through tornado.gen.Return.

   Raises Exception when the 'stdout' field of the pid response is
   empty after stripping, or when the HTTP request fails (a tornado
   HTTPError is re-raised as a plain Exception with its message).
   """
   pid_response = yield getInstancePid(topology_info, instance_id)
   try:
     http_client = tornado.httpclient.AsyncHTTPClient()
     pid_json = json.loads(pid_response)
     pid = pid_json['stdout'].strip()
     if not pid:
       raise Exception('Failed to get pid')
     endpoint = utils.make_shell_endpoint(topology_info, instance_id)
     url = "%s/histo/%s" % (endpoint, pid)
     # Log before issuing the request so the URL is recorded even
     # when the fetch fails (same order as getInstancePid).
     Log.debug("HTTP call for url: %s", url)
     response = yield http_client.fetch(url)
     raise tornado.gen.Return(response.body)
   except tornado.httpclient.HTTPError as e:
     raise Exception(str(e))
Ejemplo n.º 15
0
  def get(self):
    """
    Handle a GET request that runs a metrics query for a topology.

    Reads the cluster, role, environ and topology request
    arguments and resolves the topology through the tracker, then
    reads and validates the start/end time arguments, reads the
    query argument and waits for executeMetricsQuery to produce
    the metrics, which are written as the success response. Any
    exception is logged at debug level and written as an error
    response.

    NOTE(review): this is a generator, so it is presumably wrapped
    by a Tornado coroutine decorator outside this view -- confirm.
    """
    try:
      cluster = self.get_argument_cluster()

      role = self.get_argument_role()
      environ = self.get_argument_environ()
      name = self.get_argument_topology()
      topology = self.tracker.getTopologyByClusterRoleEnvironAndName(
          cluster, role, environ, name)

      interval_start = self.get_argument_starttime()
      interval_end = self.get_argument_endtime()
      self.validateInterval(interval_start, interval_end)

      query = self.get_argument_query()
      pending = tornado.gen.Task(self.executeMetricsQuery,
                                 topology.tmaster, query,
                                 int(interval_start), int(interval_end))
      metrics = yield pending
      self.write_success_response(metrics)
    except Exception as err:
      Log.debug(traceback.format_exc())
      self.write_error_response(err)
Ejemplo n.º 16
0
  def get(self):
    """
    Handle a GET request for file stats in a container.

    Reads the cluster, role, environ, topology and container
    request arguments plus the path argument (default "."), looks
    up the stream manager "stmgr-<container>" in the topology's
    physical plan, fetches the file stats URL built from its host
    and shell port, and writes the decoded JSON body as the
    success response. Any exception is logged at debug level and
    written as an error response.

    NOTE(review): this is a generator, so it is presumably wrapped
    by a Tornado coroutine decorator outside this view -- confirm.
    """
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      name = self.get_argument_topology()
      container = self.get_argument(constants.PARAM_CONTAINER)
      path = self.get_argument(constants.PARAM_PATH, default=".")
      info = self.tracker.getTopologyInfo(name, cluster, role, environ)

      stmgr_key = "stmgr-" + str(container)
      stmgrs = info["physical_plan"]["stmgrs"]
      stmgr = stmgrs[stmgr_key]
      host = stmgr["host"]
      port = stmgr["shell_port"]
      stats_url = utils.make_shell_filestats_url(host, port, path)

      client = tornado.httpclient.AsyncHTTPClient()
      reply = yield client.fetch(stats_url)
      payload = json.loads(reply.body)
      self.write_success_response(payload)
      self.finish()
    except Exception as err:
      Log.debug(traceback.format_exc())
      self.write_error_response(err)
Ejemplo n.º 17
0
def getMetricsTimeline(tmaster,
                       component_name,
                       metric_names,
                       instances,
                       start_time,
                       end_time,
                       callback=None):
  """
  Get the specified metrics for the given component name of this topology.

  A MetricRequest for 'component_name' is POSTed to the stats
  endpoint of 'tmaster' (http://<host>:<stats_port>/stats) and the
  MetricResponse is folded into a timeline dict.

  Arguments:
    tmaster: proto object; must have a host and a stats_port.
    component_name: component whose metrics are requested.
    metric_names: iterable of metric names to request.
    instances: iterable of instance ids; when empty, no instance
      filter is put in the request.
    start_time, end_time: bounds of the explicit interval.
    callback: not used by this function.

  Returns (through tornado.gen.Return) the following dict:
  {
    "timeline": {
      <metricname>: {
        <instance>: {
          <start_time> : <numeric value>,
          <start_time> : <numeric value>,
          ...
        }
        ...
      }, ...
    },
    "starttime": <numeric value>,
    "endtime": <numeric value>,
    "component": "..."
  }

  Raises Exception when no usable tmaster is given, when the HTTP
  request fails, or when the response code is 400 or above. A NOTOK
  status in the response is only logged; the metrics present in the
  response are still returned.

  NOTE(review): this is a generator that finishes with
  tornado.gen.Return, so it is presumably wrapped by a Tornado
  coroutine decorator outside this view -- confirm.
  """
  # Tmaster is the proto object and must have host and port for stats.
  if not tmaster or not tmaster.host or not tmaster.stats_port:
    raise Exception("No Tmaster found")

  host = tmaster.host
  port = tmaster.stats_port

  # Create the proto request object to get metrics.
  metricRequest = tmaster_pb2.MetricRequest()
  metricRequest.component_name = component_name

  # If no instances are given, metrics for all instances
  # are fetched by default.
  for instance in instances:
    metricRequest.instance_id.append(instance)

  for metricName in metric_names:
    metricRequest.metric.append(metricName)

  metricRequest.explicit_interval.start = start_time
  metricRequest.explicit_interval.end = end_time
  metricRequest.minutely = True

  # Serialize the metricRequest to send as a payload
  # with the HTTP request.
  metricRequestString = metricRequest.SerializeToString()

  # Form and send the http request.
  url = "http://{0}:{1}/stats".format(host, port)
  request = tornado.httpclient.HTTPRequest(url,
                                           body=metricRequestString,
                                           method='POST',
                                           request_timeout=5)

  Log.debug("Making HTTP call to fetch metrics")
  Log.debug("url: " + url)
  try:
    client = tornado.httpclient.AsyncHTTPClient()
    result = yield client.fetch(request)
    Log.debug("HTTP call complete.")
  except tornado.httpclient.HTTPError as e:
    raise Exception(str(e))

  # Check the response code - error if it is in 400s or 500s
  responseCode = result.code
  if responseCode >= 400:
    # The code is an int; convert it explicitly, otherwise the
    # concatenation itself raises TypeError and hides the real error.
    message = "Error in getting metrics from Tmaster, code: " + str(responseCode)
    Log.error(message)
    raise Exception(message)

  # Parse the response from tmaster.
  metricResponse = tmaster_pb2.MetricResponse()
  metricResponse.ParseFromString(result.body)

  if metricResponse.status.status == common_pb2.NOTOK:
    if metricResponse.status.HasField("message"):
      Log.error(metricResponse.status.message)

  # Form the response.
  timeline = {}
  ret = {
      "starttime": start_time,
      "endtime": end_time,
      "component": component_name,
      "timeline": timeline,
  }

  # Loop through all the metrics
  # One instance corresponds to one metric, which can have
  # multiple IndividualMetrics for each metricname requested.
  for metric in metricResponse.metric:
    instance = metric.instance_id

    # Loop through all individual metrics.
    for im in metric.metric:
      # Create the per-metric and per-instance dicts on first use.
      values = timeline.setdefault(im.name, {}).setdefault(instance, {})

      # We get minutely metrics.
      # Interval-values correspond to the minutely mark for which
      # this metric value corresponds to.
      for interval_value in im.interval_values:
        values[interval_value.interval.start] = interval_value.value

  raise tornado.gen.Return(ret)
Ejemplo n.º 18
0
 def on_topology_tmaster(data):
   """
   Watch callback: store new tmaster data on the topology.

   'topology' and 'topologyName' come from the enclosing scope.
   The data is always handed to topology.set_tmaster; a debug
   message is logged afterwards when the data is falsy.
   """
   message = "Watch triggered for topology tmaster: " + topologyName
   Log.info(message)
   topology.set_tmaster(data)
   if data:
     return
   Log.debug("No data to be set")
Ejemplo n.º 19
0
 def on_topology_scheduler_location(data):
   """
   Watch callback: store new scheduler location data on the topology.

   'topology' and 'topologyName' come from the enclosing scope.
   The data is always handed to topology.set_scheduler_location;
   a debug message is logged afterwards when the data is falsy.
   """
   message = "Watch triggered for topology scheduler location: " + topologyName
   Log.info(message)
   topology.set_scheduler_location(data)
   if data:
     return
   Log.debug("No data to be set")
Ejemplo n.º 20
0
 def on_topology_execution_state(data):
   """
   Watch callback: store new execution state data on the topology.

   'topology' and 'topologyName' come from the enclosing scope.
   The data is always handed to topology.set_execution_state; a
   debug message is logged afterwards when the data is falsy.
   """
   message = "Watch triggered for topology execution state: " + topologyName
   Log.info(message)
   topology.set_execution_state(data)
   if data:
     return
   Log.debug("No data to be set")
Ejemplo n.º 21
0
def log_debug(message, ident=0):
  """
  Log 'message' at debug level, indented by two spaces per
  'ident' level. The message is converted with str() first.
  """
  padding = " " * (ident * 2)
  Log.debug(padding + str(message))
Ejemplo n.º 22
0
 def on_topology_pplan(data):
   """
   Watch callback: store new physical plan data on the topology.

   'topology' and 'topologyName' come from the enclosing scope.
   The data is always handed to topology.set_physical_plan; a
   debug message is logged afterwards when the data is falsy.
   """
   message = "Watch triggered for topology pplan: " + topologyName
   Log.info(message)
   topology.set_physical_plan(data)
   if data:
     return
   Log.debug("No data to be set")