Example #1
0
def start_api_server(masters, cl_args):
  '''
  Start the Heron API server by submitting its job file with ``nomad run``.

  :param masters: collection of master hosts; only the first one is used
  :param cl_args: parsed command line arguments, passed through to the helpers
  :return: None. The process exits with -1 if the launch command fails.
  '''
  # make sure nomad cluster is up
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
  Log.info("Starting Heron API Server on %s" % single_master)

  # wrap the command in ssh when the master is another machine
  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  # communicate() drains both pipes while it waits for the child. Calling
  # wait() first can deadlock once the child fills a pipe buffer.
  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start apiserver on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "apiserver")
  Log.info("Done starting Heron API Server")
Example #2
0
def start_heron_tools(masters, cl_args):
  '''
  Start Heron tracker and UI by submitting the tools job file with ``nomad run``.

  :param masters: collection of master hosts; only the first one is used
  :param cl_args: parsed command line arguments, passed through to the helpers
  :return: None. The process exits with -1 if the launch command fails.
  '''
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/heron_tools_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_heron_tools_job_file(cl_args))
  Log.info("Starting Heron Tools on %s" % single_master)

  # wrap the command in ssh when the master is another machine
  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  # communicate() drains both pipes while it waits for the child. Calling
  # wait() first can deadlock once the child fills a pipe buffer.
  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start Heron Tools on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "heron-tools")
  Log.info("Done starting Heron Tools")
Example #3
0
def get_jobs(cl_args, nomad_addr):
  """Fetch the list of jobs from the HTTP endpoint at ``nomad_addr`` on port 4646.

  Returns the decoded JSON body. On any non-200 status the failure is logged
  and the process exits with -1. ``cl_args`` is accepted but not used here.
  """
  response = requests.get("http://%s:4646/v1/jobs" % nomad_addr)
  if response.status_code == 200:
    return response.json()

  Log.error("Failed to get list of jobs")
  Log.debug("Response: %s" % response)
  sys.exit(-1)
Example #4
0
def start_slave_nodes(slaves, cl_args):
  '''
  Start slave nodes

  Launches the agent command for every slave first, then collects the results,
  so the launches overlap instead of running one after another.

  :param slaves: iterable of slave hosts
  :param cl_args: parsed command line arguments, passed through to the helpers
  :return: None. The process exits with -1 if any launch command fails.
  '''
  pids = []
  for slave in slaves:
    Log.info("Starting slave on %s" % slave)
    cmd = "%s agent -config %s >> /tmp/nomad_client.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_nomad_slave_config_file(cl_args))
    # wrap the command in ssh when the slave is another machine
    if not is_self(slave):
      cmd = ssh_remote_execute(cmd, slave, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    pids.append({"pid": pid, "dest": slave})

  errors = []
  for entry in pids:
    pid = entry["pid"]
    # communicate() drains both pipes while it waits for the child. Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
      errors.append("Failed to start slave on %s with error:\n%s" % (entry["dest"], output[1]))

  # report every failure before bailing out
  if errors:
    for error in errors:
      Log.error(error)
    sys.exit(-1)

  Log.info("Done starting slaves")
Example #5
0
  def _read_tuples_and_execute(self):
    """Drain the input stream, passing each data tuple to ``_handle_data_tuple``.

    The loop ends when the stream is empty, when ``poll()`` raises
    ``Queue.Empty``, or once the configured batch time or batch size
    (measured in emitted bytes) has been exceeded. A tuple set carrying
    control information raises ``RuntimeError``.
    """
    cycle_started_at = time.time()
    emitted_bytes_at_start = self.get_total_data_emitted_in_bytes()
    batch_time_limit = \
      self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_TIME_MS] * system_constants.MS_TO_SEC
    batch_size_limit = self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_SIZE_BYTES]

    while not self.in_stream.is_empty():
      try:
        tuple_set = self.in_stream.poll()
      except Queue.Empty:
        break

      if not isinstance(tuple_set, tuple_pb2.HeronTupleSet):
        Log.error("Received tuple not instance of HeronTupleSet")
      elif tuple_set.HasField("control"):
        raise RuntimeError("Bolt cannot get acks/fails from other components")
      elif tuple_set.HasField("data"):
        stream = tuple_set.data.stream
        for data_tuple in tuple_set.data.tuples:
          self._handle_data_tuple(data_tuple, stream)
      else:
        Log.error("Received tuple neither data nor control")

      # stop as soon as either the time budget or the size budget is used up
      out_of_time = time.time() - cycle_started_at - batch_time_limit > 0
      if out_of_time or \
          (self.get_total_data_emitted_in_bytes() - emitted_bytes_at_start > batch_size_limit):
        break
Example #6
0
  def _get_dict_from_config(topology_config):
    """Build a python dictionary from the ``kvs`` of a Config protobuf message

    Conversion rules for each entry:

    - A string that looks like a number (e.g. "12" or "1.2") becomes ``int`` or ``float``
    - A boolean string ("true"/"false", compared case-insensitively) becomes ``True`` or ``False``
    - Any other string is stored unchanged
    - A Python-serialized value is deserialized and stored as the resulting object

    Entries matching none of the above are logged as unsupported and left out.
    """
    result = {}
    for entry in topology_config.kvs:
      if entry.HasField("value"):
        assert entry.type == topology_pb2.ConfigValueType.Value("STRING_VALUE")
        # plain string value: try number first, then boolean, else keep as is
        if PhysicalPlanHelper._is_number(entry.value):
          converted = PhysicalPlanHelper._get_number(entry.value)
        elif entry.value.lower() in ("true", "false"):
          converted = entry.value.lower() == "true"
        else:
          converted = entry.value
        result[entry.key] = converted
        continue

      is_python_serialized = entry.HasField("serialized_value") and \
        entry.type == topology_pb2.ConfigValueType.Value("PYTHON_SERIALIZED_VALUE")
      if is_python_serialized:
        result[entry.key] = default_serializer.deserialize(entry.serialized_value)
      else:
        assert entry.HasField("type")
        Log.error("Unsupported config <key:value> found: %s, with type: %s"
                  % (str(entry), str(entry.type)))

    return result
Example #7
0
 def update_cpu_time(self):
   """Record the process's system and user CPU times in their metrics.

   Any failure is logged with its traceback and swallowed.
   """
   try:
     t = self.process.cpu_times()
     self.sys_cpu_time.update(t.system)
     self.user_cpu_time.update(t.user)
   except Exception:
     # format_exc() formats the exception being handled; its first positional
     # parameter is ``limit``, so the exception object must not be passed in.
     Log.error(traceback.format_exc())
Example #8
0
 def update_memory_usage(self):
   """Record the process's ``rss`` and ``vms`` memory figures in their metrics.

   Any failure is logged with its traceback and swallowed.
   """
   try:
     m = self.process.memory_info()
     self.physical_memory.update(m.rss)
     self.virtual_memory.update(m.vms)
   except Exception:
     # format_exc() formats the exception being handled; its first positional
     # parameter is ``limit``, so the exception object must not be passed in.
     Log.error(traceback.format_exc())
Example #9
0
 def on_connect(self, status):
   """Handle the result of a connection attempt to the Metrics Manager.

   On failure a reconnect is scheduled after the configured interval and no
   register request is sent; on success the register request is sent.
   """
   Log.debug("In on_connect of MetricsManagerClient")
   if status != StatusCode.OK:
     Log.error("Error connecting to Metrics Manager with status: %s" % str(status))
     retry_interval = float(self.sys_config[constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC])
     self.looper.register_timer_task_in_sec(self.start_connect, retry_interval)
     # not connected: do not fall through and try to register
     return
   self._send_register_req()
Example #10
0
  def _read_tuples_and_execute(self):
    """Drain the input stream, passing acks and fails to ``_handle_ack_tuple``.

    The loop ends when the stream is empty, when ``poll()`` raises
    ``Queue.Empty``, or once the configured ack batch time has been exceeded.
    A tuple set carrying data raises ``RuntimeError``.
    """
    cycle_started_at = time.time()
    ack_time_limit = self.sys_config[system_constants.INSTANCE_ACK_BATCH_TIME_MS] * \
                     system_constants.MS_TO_SEC

    while not self.in_stream.is_empty():
      try:
        tuple_set = self.in_stream.poll()
      except Queue.Empty:
        break

      if not isinstance(tuple_set, tuple_pb2.HeronTupleSet):
        Log.error("Received tuple not instance of HeronTupleSet")
      elif tuple_set.HasField("data"):
        raise RuntimeError("Spout cannot get incoming data tuples from other components")
      elif tuple_set.HasField("control"):
        # acks are handled before fails, each flagged with its outcome
        for acked in tuple_set.control.acks:
          self._handle_ack_tuple(acked, True)
        for failed in tuple_set.control.fails:
          self._handle_ack_tuple(failed, False)
      else:
        Log.error("Received tuple neither data nor control")

      # avoid spending too much time here
      if time.time() - cycle_started_at - ack_time_limit > 0:
        break
Example #11
0
def import_and_get_class(path_to_pex, python_class_name):
  """Imports and load a class from a given pex file path and python class name

  For example, if you want to get a class called `Sample` in
  /some-path/sample.pex/heron/examples/src/python/sample.py,
  ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
  ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``

  For names starting with ``heron.`` a special resolution is tried first; if
  that fails the error is logged and the regular import is attempted.
  """
  abs_path_to_pex = os.path.abspath(path_to_pex)

  Log.debug("Add a pex to the path: %s" % abs_path_to_pex)
  Log.debug("In import_and_get_class with cls_name: %s" % python_class_name)
  # everything before the last dot is the module path, the rest is the class
  from_path, _, import_name = python_class_name.rpartition('.')

  Log.debug("From path: %s, import name: %s" % (from_path, import_name))

  # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
  if python_class_name.startswith("heron."):
    try:
      mod = resolve_heron_suffix_issue(abs_path_to_pex, python_class_name)
      return getattr(mod, import_name)
    except Exception:
      # Exception rather than a bare except, so SystemExit and
      # KeyboardInterrupt are not swallowed by the fallback
      Log.error("Could not resolve class %s with special handling" % python_class_name)

  mod = __import__(from_path, fromlist=[import_name], level=-1)
  Log.debug("Imported module: %s" % str(mod))
  return getattr(mod, import_name)
Example #12
0
  def register_watch(self, callback):
    """
    Register 'callback' as a watch and return the UUID
    it was stored under. The UUID can later be used
    to unregister the watch.
    Returns None if the watch could not be registered.

    'callback' is invoked once, with this object as its
    only argument, at registration time. If that call
    raises any Exception the watch is not stored and
    None is returned.
    """
    max_attempts = 5
    for _ in range(max_attempts):
      watch_id = uuid.uuid4()
      if watch_id in self.watches:
        # id already taken, draw another one
        continue

      Log.info("Registering a watch with uid: " + str(watch_id))
      try:
        callback(self)
      except Exception as e:
        Log.error("Caught exception while triggering callback: " + str(e))
        Log.debug(traceback.format_exc())
        return None

      self.watches[watch_id] = callback
      return watch_id

    # every attempt produced an id that was already in use
    return None
Example #13
0
  def getComponentException(self, tmaster, component_name, instances=[], callback=None):
    """
    Get all (last 1000) exceptions for 'component_name' of the topology.
    Returns an Array of exception logs on success.
    Returns json with message on failure.

    This is a generator that yields the HTTP fetch and delivers its result
    through ``tornado.gen.Return`` (presumably wrapped as a tornado coroutine
    by a decorator outside this block). It finishes without a result when
    'tmaster' is missing a host or stats port. 'callback' is not used here.
    Raises Exception when the HTTP client reports an HTTPError.
    """
    if not tmaster or not tmaster.host or not tmaster.stats_port:
      return

    exception_request = tmaster_pb2.ExceptionLogRequest()
    exception_request.component_name = component_name
    if instances:
      exception_request.instances.extend(instances)
    request_str = exception_request.SerializeToString()
    port = str(tmaster.stats_port)
    host = tmaster.host
    url = "http://{0}:{1}/exceptions".format(host, port)
    request = tornado.httpclient.HTTPRequest(url,
                                             body=request_str,
                                             method='POST',
                                             request_timeout=5)
    Log.debug('Making HTTP call to fetch exceptions url: %s', url)
    try:
      client = tornado.httpclient.AsyncHTTPClient()
      result = yield client.fetch(request)
      Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
      raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
      # the code is an int, so it has to be converted before concatenation
      message = "Error in getting exceptions from Tmaster, code: " + str(responseCode)
      Log.error(message)
      raise tornado.gen.Return({
          "message": message
      })

    # Parse the response from tmaster.
    exception_response = tmaster_pb2.ExceptionLogResponse()
    exception_response.ParseFromString(result.body)

    if exception_response.status.status == common_pb2.NOTOK:
      if exception_response.status.HasField("message"):
        raise tornado.gen.Return({
            "message": exception_response.status.message
        })

    # Send response
    ret = []
    for exception_log in exception_response.exceptions:
      ret.append({'hostname': exception_log.hostname,
                  'instance_id': exception_log.instance_id,
                  'stack_trace': exception_log.stacktrace,
                  'lasttime': exception_log.lasttime,
                  'firsttime': exception_log.firsttime,
                  'count': str(exception_log.count),
                  'logging': exception_log.logging})
    raise tornado.gen.Return(ret)
Example #14
0
 def on_response(self, status, context, response):
   """Handle a response from the Metrics Manager.

   Raises RuntimeError when the status is not OK or when the response is not
   a ``MetricPublisherRegisterResponse``; otherwise hands the response to
   ``_handle_register_response``.
   """
   Log.debug("In on_response with status: %s, with context: %s" % (str(status), str(context)))
   if status != StatusCode.OK:
     raise RuntimeError("Response from Metrics Manager not OK")

   if not isinstance(response, metrics_pb2.MetricPublisherRegisterResponse):
     Log.error("Unknown kind of response received: %s" % response.DESCRIPTOR.full_name)
     raise RuntimeError("Unknown kind of response received from Metrics Manager")

   self._handle_register_response(response)
Example #15
0
 def update_cpu_and_memory_metrics(self):
   """Record CPU times and max RSS from ``resource.getrusage`` in their metrics.

   Only system CPU time, user CPU time and physical memory are updated; the
   virtual memory metric is not touched here. Any failure is logged with its
   traceback and swallowed.
   """
   try:
     r = resource.getrusage(resource.RUSAGE_SELF)
     self.sys_cpu_time.update(r.ru_stime)
     self.user_cpu_time.update(r.ru_utime)
     self.physical_memory.update(r.ru_maxrss)
   except Exception:
     # format_exc() formats the exception being handled; its first positional
     # parameter is ``limit``, so the exception object must not be passed in.
     Log.error(traceback.format_exc())
Example #16
0
 def on_connect(self, status):
   """Handle the result of a connection attempt to the Stream Manager.

   On failure a reconnect is scheduled after the configured interval. On
   success the message handlers are registered and the register request sent.
   """
   Log.debug("In on_connect of STStmgrClient")
   if status != StatusCode.OK:
     Log.error("Error connecting to Stream Manager with status: %s", str(status))
     interval_key = constants.INSTANCE_RECONNECT_STREAMMGR_INTERVAL_SEC
     reconnect_delay = float(self.sys_config[interval_key])
     self.looper.register_timer_task_in_sec(self.start_connect, reconnect_delay)
     return

   self._register_msg_to_handle()
   self._send_register_req()
Example #17
0
 def _run_once(self):
   """Perform a single iteration: wait, run wakeup tasks, then fire timers.

   Should be called only from loop(). Any exception is logged together with
   its traceback and sets ``should_exit``.
   """
   try:
     self.do_wait()
     self._execute_wakeup_tasks()
     self._trigger_timers()
   except Exception as err:
     failure = "Error occured during _run_once(): " + str(err)
     Log.error(failure)
     Log.error(traceback.format_exc())
     self.should_exit = True
Example #18
0
def check_release_file_exists():
  """Return True when the Heron release file is present, otherwise log and return False"""
  release_file = get_heron_release_file()
  if os.path.isfile(release_file):
    return True

  # the path is either missing or not a regular file
  Log.error("Required file not found: %s" % release_file)
  return False
  def getComponentExceptionSummary(self, tmaster, component_name, instances=[], callback=None):
    """
    Get the summary of exceptions for component_name and list of instances.
    Empty instance list will fetch all exceptions.

    This is a generator that yields the HTTP fetch and delivers its result
    through ``tornado.gen.Return`` (presumably wrapped as a tornado coroutine
    by a decorator outside this block). The result is a list of summary dicts
    on success, or a dict with a "message" key on failure. It finishes without
    a result when 'tmaster' is missing a host or stats port. 'callback' is not
    used here. Raises Exception when the HTTP client reports an HTTPError.
    """
    if not tmaster or not tmaster.host or not tmaster.stats_port:
      return
    exception_request = tmaster_pb2.ExceptionLogRequest()
    exception_request.component_name = component_name
    if instances:
      exception_request.instances.extend(instances)
    request_str = exception_request.SerializeToString()
    port = str(tmaster.stats_port)
    host = tmaster.host
    url = "http://{0}:{1}/exceptionsummary".format(host, port)
    Log.debug("Creating request object.")
    request = tornado.httpclient.HTTPRequest(url,
                                             body=request_str,
                                             method='POST',
                                             request_timeout=5)
    Log.debug('Making HTTP call to fetch exceptionsummary url: %s', url)
    try:
      client = tornado.httpclient.AsyncHTTPClient()
      result = yield client.fetch(request)
      Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
      raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
      # the code is an int, so it has to be converted before concatenation
      message = "Error in getting exceptions from Tmaster, code: " + str(responseCode)
      Log.error(message)
      raise tornado.gen.Return({
          "message": message
      })

    # Parse the response from tmaster.
    exception_response = tmaster_pb2.ExceptionLogResponse()
    exception_response.ParseFromString(result.body)

    if exception_response.status.status == common_pb2.NOTOK:
      if exception_response.status.HasField("message"):
        raise tornado.gen.Return({
            "message": exception_response.status.message
        })

    # Send response
    ret = []
    for exception_log in exception_response.exceptions:
      ret.append({'class_name': exception_log.stacktrace,
                  'lasttime': exception_log.lasttime,
                  'firsttime': exception_log.firsttime,
                  'count': str(exception_log.count)})
    raise tornado.gen.Return(ret)
Example #20
0
def run(command, parser, cl_args, unknown_args):
  '''
  Submits the topology to the scheduler
    * The topology file name extension decides how the file is treated:
      .jar as a fatjar, .tar/.tar.gz as a tar file, .pex as a pex file.
    * A temporary directory for the topology definition file and the initial
      topology state are recorded in the global options before submitting.
  :param command:
  :param parser:
  :param cl_args:
  :param unknown_args:
  :return: the result of the matching submit function, or False when the
           file does not exist or has an unknown extension
  '''
  topology_file = cl_args['topology-file-name']

  # the topology file has to exist
  if not os.path.isfile(topology_file):
    Log.error("Topology jar|tar|pex file %s does not exist" % topology_file)
    return False

  # and it has to be one of the supported file types
  is_jar = topology_file.endswith(".jar")
  is_tar = topology_file.endswith((".tar", ".tar.gz"))
  is_pex = topology_file.endswith(".pex")
  if not (is_jar or is_tar or is_pex):
    Log.error("Unknown file type. Please use .tar or .tar.gz or .jar or .pex file")
    return False

  # temporary directory for the topology definition file
  tmp_dir = tempfile.mkdtemp()

  # launch paused when a deactivated deployment was requested
  if cl_args['deploy_deactivated']:
    state_value = topology_pb2.PAUSED
  else:
    state_value = topology_pb2.RUNNING
  initial_state = topology_pb2.TopologyState.Name(state_value)

  # publish the tmp dir and the initial state through the global options
  opts.set_config('cmdline.topologydefn.tmpdirectory', tmp_dir)
  opts.set_config('cmdline.topology.initial.state', initial_state)

  # exactly one submitter applies; the checks above rule out everything else
  if is_jar:
    submitter = submit_fatjar
  elif is_tar:
    submitter = submit_tar
  else:
    submitter = submit_pex
  return submitter(cl_args, unknown_args, tmp_dir)
Example #21
0
def parse_topo_loc(cl_args):
  """ parse topology location

  Returns ``[cluster, role, env, topology-name]`` built from the
  'cluster/[role]/[env]' and 'topology-name' entries of ``cl_args``.
  Any failure is logged and re-raised: ValueError when the location does not
  have exactly three '/'-separated parts, or whatever the lookup raised
  (e.g. KeyError for a missing entry).
  """
  try:
    topo_loc = cl_args['cluster/[role]/[env]'].split('/')
    topo_loc.append(cl_args['topology-name'])
    if len(topo_loc) != 4:
      # a bare ``raise`` has no active exception to re-raise at this point,
      # so signal the problem with an explicit exception
      raise ValueError("expected cluster/role/env, got '%s'" % cl_args['cluster/[role]/[env]'])
    return topo_loc
  except Exception:
    Log.error('Error: invalid topology location')
    raise
Example #22
0
def run(command, parser, cl_args, unknown_args):
  """ run command

  Prints the clusters reported by the tracker, one per line.
  Returns True on success, or False (after logging) when fetching the
  cluster list fails.
  """
  try:
    clusters = tracker_access.get_clusters()
  except Exception:
    # Exception rather than a bare except, so SystemExit and
    # KeyboardInterrupt are not reported as a tracker failure
    Log.error("Fail to connect to tracker: \'%s\'", cl_args["tracker_url"])
    return False
  # single-argument print(...) behaves the same as a statement and as a function
  print('Available clusters:')
  for cluster in clusters:
    print('  %s' % cluster)
  return True
Example #23
0
def parse_topo_loc(cl_args):
    """ parse topology location

    Returns ``[cluster, role, env, topology-name]`` built from the
    "cluster/[role]/[env]" and "topology-name" entries of ``cl_args``.
    Any failure is logged and re-raised: ValueError when the location does not
    have exactly three "/"-separated parts, or whatever the lookup raised
    (e.g. KeyError for a missing entry).
    """
    try:
        topo_loc = cl_args["cluster/[role]/[env]"].split("/")
        topo_name = cl_args["topology-name"]
        topo_loc.append(topo_name)
        if len(topo_loc) != 4:
            # a bare ``raise`` has no active exception to re-raise at this
            # point, so signal the problem with an explicit exception
            raise ValueError("expected cluster/role/env, got '%s'" % cl_args["cluster/[role]/[env]"])
        return topo_loc
    except Exception:
        Log.error("Invalid topology location")
        raise
Example #24
0
def run(command, parser, cl_args, unknown_args):
  """ run command

  Dispatches on how many '/'-separated parts 'cluster/[role]/[env]' has:
  one, two or three parts go to the matching show function; anything else is
  logged as invalid and False is returned.
  """
  location = cl_args['cluster/[role]/[env]'].split('/')
  depth = len(location)
  if depth == 1:
    return show_cluster(cl_args, *location)
  if depth == 2:
    return show_cluster_role(cl_args, *location)
  if depth == 3:
    return show_cluster_role_env(cl_args, *location)

  Log.error('Invalid topologies selection')
  return False
Example #25
0
 def update_gc_stat(self):
   """Record the garbage collector's counts and thresholds in their metrics.

   ``gc.get_count()`` and ``gc.get_threshold()`` each return three values,
   which are stored in the g1/g2/g3 count and threshold metrics respectively.
   Any failure is logged with its traceback and swallowed.
   """
   try:
     c1, c2, c3 = gc.get_count()
     t1, t2, t3 = gc.get_threshold()
     self.g1_count.update(c1)
     self.g2_count.update(c2)
     self.g3_count.update(c3)
     self.g1_threshold.update(t1)
     self.g2_threshold.update(t2)
     self.g3_threshold.update(t3)
   except Exception:
     # format_exc() formats the exception being handled; its first positional
     # parameter is ``limit``, so the exception object must not be passed in.
     Log.error(traceback.format_exc())
Example #26
0
def launch_topology_server(cl_args, topology_file, topology_defn_file, topology_name):
  '''
  Launch a topology given topology jar, its definition file and configurations
  by posting both files to the 'submit' route of the API server.
  :param cl_args: parsed command line arguments (service url, cluster, role, ...)
  :param topology_file: path of the topology file to upload
  :param topology_defn_file: path of the topology definition file to upload
  :param topology_name: name of the topology
  :return: a SimpleResult; its status is Ok for an ok/created response and
           HeronError otherwise, including connection and HTTP errors
  '''
  service_apiurl = cl_args['service_url'] + rest.ROUTE_SIGNATURES['submit'][1]
  service_method = rest.ROUTE_SIGNATURES['submit'][0]
  data = dict(
      name=topology_name,
      cluster=cl_args['cluster'],
      role=cl_args['role'],
      environment=cl_args['environ'],
      user=cl_args['submit_user'],
  )

  Log.info(str(cl_args))
  overrides = dict()
  if 'config_property' in cl_args:
    overrides = config.parse_override_config(cl_args['config_property'])

  if overrides:
    data.update(overrides)

  if cl_args['dry_run']:
    data["dry_run"] = True

  err_ctxt = "Failed to launch topology '%s' %s" % (topology_name, launch_mode_msg(cl_args))
  succ_ctxt = "Successfully launched topology '%s' %s" % (topology_name, launch_mode_msg(cl_args))

  # the with-block guarantees both uploads are closed on every exit path
  with open(topology_defn_file, 'rb') as defn_fp, open(topology_file, 'rb') as topology_fp:
    files = dict(
        definition=defn_fp,
        topology=topology_fp,
    )
    try:
      r = service_method(service_apiurl, data=data, files=files)
      # status codes are plain integers: compare by value, not identity
      ok = r.status_code == requests.codes.ok
      created = r.status_code == requests.codes.created
      s = Status.Ok if created or ok else Status.HeronError
      if s is Status.HeronError:
        Log.error(r.json().get('message', "Unknown error from API server %d" % r.status_code))
      elif ok:
        # this case happens when we request a dry_run
        print(r.json().get("response"))
    except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as err:
      Log.error(err)
      return SimpleResult(Status.HeronError, err_ctxt, succ_ctxt)
  return SimpleResult(s, err_ctxt, succ_ctxt)
Example #27
0
def parse_cluster_role_env(cluster_role_env, config_path):
  """Split cluster/[role]/[environ] into a 3-tuple, filling in missing parts.

  A missing role defaults to the current user and a missing environ to
  ENVIRON, unless the client config file under ``config_path`` marks that part
  as required, in which case an Exception is raised. An Exception is also
  raised when ``config_path`` is not a directory. If any resulting part is
  empty, a message is printed and the process exits with status 1.
  """
  parts = cluster_role_env.split('/')[:3]
  Log.info("Using config file under %s" % config_path)
  if not os.path.isdir(config_path):
    Log.error("Config path cluster directory does not exist: %s" % config_path)
    raise Exception("Invalid config path")

  # only consult the client config when role and/or environ were left out
  if len(parts) < 3:
    # stays empty when the client config is absent or empty, so that nothing
    # is treated as required and the defaults below apply
    cli_confs = {}
    cli_conf_file = os.path.join(config_path, CLIENT_YAML)
    if os.path.isfile(cli_conf_file):
      with open(cli_conf_file, 'r') as conf_file:
        # NOTE(review): yaml.load is called without an explicit Loader;
        # confirm this file is always trusted or consider yaml.safe_load
        loaded = yaml.load(conf_file)
      # yaml.load returns None for an empty file
      if loaded is None:
        print("Failed to read: %s due to it is empty" % (CLIENT_YAML))
      else:
        cli_confs = loaded

    # role left out: fail if it is required, else default to the current user
    if len(parts) == 1:
      if (IS_ROLE_REQUIRED in cli_confs) and (cli_confs[IS_ROLE_REQUIRED] is True):
        raise Exception("role required but not provided (cluster/role/env = %s). See %s in %s"
                        % (cluster_role_env, IS_ROLE_REQUIRED, CLIENT_YAML))
      parts.append(getpass.getuser())

    # environ left out: fail if it is required, else use the default environ
    if len(parts) == 2:
      if (IS_ENV_REQUIRED in cli_confs) and (cli_confs[IS_ENV_REQUIRED] is True):
        raise Exception("environ required but not provided (cluster/role/env = %s). See %s in %s"
                        % (cluster_role_env, IS_ENV_REQUIRED, CLIENT_YAML))
      parts.append(ENVIRON)

  cluster, role, environ = parts[0], parts[1], parts[2]
  # none of cluster, role and environ may be empty
  if not cluster or not role or not environ:
    print("Failed to parse")
    sys.exit(1)

  return (cluster, role, environ)
Example #28
0
def check_java_home_set():
  """Return True when JAVA_HOME is set and the java path is an executable file.

  Otherwise the reason is logged and False is returned.
  """
  if "JAVA_HOME" not in os.environ:
    Log.error("JAVA_HOME not set")
    return False

  # the variable is set; make sure it leads to a runnable java binary
  java_path = get_java_path()
  java_is_runnable = os.path.isfile(java_path) and os.access(java_path, os.X_OK)
  if not java_is_runnable:
    Log.error("JAVA_HOME/bin/java either does not exist or not an executable")
    return False

  return True
Example #29
0
 def get(self):
   """ get method

   Looks up the topology identified by the request arguments and writes the
   "metadata" entry of its info as the success response. Any exception is
   logged and written back as an error response.
   """
   try:
     cluster = self.get_argument_cluster()
     role = self.get_argument_role()
     environ = self.get_argument_environ()
     topology_name = self.get_argument_topology()
     info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)
     self.write_success_response(info["metadata"])
   except Exception as err:
     Log.error("Exception when handling GET request '/topologies/metadata'")
     Log.debug(traceback.format_exc())
     self.write_error_response(err)
Example #30
0
def template_file(src, dest, replacements_dict):
  """Write ``src`` to ``dest`` with every key of ``replacements_dict``
  replaced by its value.

  Replacement is plain substring substitution, applied once per dictionary
  entry. If the resulting text is empty the error is logged and the process
  exits with -1 without writing ``dest``.
  """
  Log.debug("Templating %s - > %s with %s" % (src, dest, replacements_dict))

  with open(src, 'r') as tf:
    file_contents = tf.read()

  # items() exists on both Python 2 and Python 3 dicts; iteritems() does not
  for key, value in replacements_dict.items():
    file_contents = file_contents.replace(key, value)

  if not file_contents:
    Log.error("File contents after templating is empty")
    sys.exit(-1)

  # mode 'w' already truncates the file, so a single write is enough
  with open(dest, 'w') as tf:
    tf.write(file_contents)
Example #31
0
def run_server(command, cl_args, action, extra_args=dict()):
  '''
  helper function to take action on topologies using REST API
  :param command:       key into rest.ROUTE_SIGNATURES selecting method and route
  :param cl_args:       parsed command line arguments
  :param action:        description of action taken
  :param extra_args:    extra arguments, flattened and sent as request data
  :return:              a SimpleResult; its status is Ok only for an ok response
  '''
  topology_name = cl_args['topology-name']
  service_endpoint = cl_args['service_url']

  # each signature holds the request method first and the route template second
  signature = rest.ROUTE_SIGNATURES[command]
  route_params = (
      cl_args['cluster'],
      cl_args['role'],
      cl_args['environ'],
      topology_name
  )
  service_apiurl = service_endpoint + (signature[1] % route_params)
  service_method = signature[0]

  # convert the dictionary to a list of tuples
  data = flatten_args(extra_args)

  err_msg = "Failed to %s: %s" % (action, topology_name)
  succ_msg = "Successfully %s: %s" % (action, topology_name)

  try:
    r = service_method(service_apiurl, data=data)
    if r.status_code == requests.codes.ok:
      s = Status.Ok
    else:
      s = Status.HeronError
      Log.error(r.json().get('message', "Unknown error from api server %d" % r.status_code))
  except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as err:
    Log.error(err)
    return SimpleResult(Status.HeronError, err_msg, succ_msg)

  return SimpleResult(s, err_msg, succ_msg)
Example #32
0
    def _read_tuples_and_execute(self):
        """Drain the input stream, handling data tuples and checkpoint requests.

        Data tuples go to ``_handle_data_tuple``; an
        ``InitiateStatefulCheckpoint`` message goes to
        ``handle_initiate_stateful_checkpoint``. The loop ends when the stream
        is empty, when ``poll()`` raises ``Queue.Empty``, or once the
        configured batch time or batch size (measured in emitted bytes) has
        been exceeded. A tuple set carrying control information raises
        ``RuntimeError``.
        """
        cycle_started_at = time.time()
        emitted_bytes_at_start = self.get_total_data_emitted_in_bytes()
        batch_time_limit = \
          self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_TIME_MS] * system_constants.MS_TO_SEC
        batch_size_limit = self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_SIZE_BYTES]

        while not self.in_stream.is_empty():
            try:
                incoming = self.in_stream.poll()
            except Queue.Empty:
                break

            is_tuple_set = isinstance(incoming, tuple_pb2.HeronTupleSet)
            if is_tuple_set and incoming.HasField("control"):
                raise RuntimeError("Bolt cannot get acks/fails from other components")
            elif is_tuple_set and incoming.HasField("data"):
                stream = incoming.data.stream
                for data_tuple in incoming.data.tuples:
                    self._handle_data_tuple(data_tuple, stream)
            elif is_tuple_set:
                Log.error("Received tuple neither data nor control")
            elif isinstance(incoming, ckptmgr_pb2.InitiateStatefulCheckpoint):
                self.handle_initiate_stateful_checkpoint(incoming, self.bolt_impl)
            else:
                Log.error("Received tuple not instance of HeronTupleSet")

            # stop as soon as either the time budget or the size budget is used up
            out_of_time = time.time() - cycle_started_at - batch_time_limit > 0
            if out_of_time or \
                (self.get_total_data_emitted_in_bytes() - emitted_bytes_at_start
                 > batch_size_limit):
                break
Example #33
0
  def synch_topologies(self):
    """
    Sync the topologies with the statemgrs.

    Starts every configured state manager (exiting the process with status 1
    if any of them fails to start), then registers a topologies watch on each
    one. When a watch fires, topologies that are no longer reported are
    removed and newly reported ones are added.
    """
    self.state_managers = statemanagerfactory.get_all_state_managers(self.config.statemgr_config)
    try:
      for state_manager in self.state_managers:
        state_manager.start()
    except Exception as ex:
      Log.error("Found exception while initializing state managers: %s. Bailing out..." % ex)
      traceback.print_exc()
      sys.exit(1)

    def on_topologies_watch(state_manager, topologies):
      """watch topologies"""
      Log.info("State watch triggered for topologies.")
      Log.debug("Topologies: " + str(topologies))
      existingTopologies = self.getTopologiesForStateLocation(state_manager.name)
      # A real list is needed: the names are iterated once and then used for
      # repeated membership tests, which a lazy map object cannot support.
      existingTopNames = [t.name for t in existingTopologies]
      Log.debug("Existing topologies: " + str(existingTopNames))
      for name in existingTopNames:
        if name not in topologies:
          Log.info("Removing topology: %s in rootpath: %s",
                   name, state_manager.rootpath)
          self.removeTopology(name, state_manager.name)

      for name in topologies:
        if name not in existingTopNames:
          self.addNewTopology(state_manager, name)

    for state_manager in self.state_managers:
      # The callback function with the bound
      # state_manager as first variable.
      onTopologiesWatch = partial(on_topologies_watch, state_manager)
      state_manager.get_topologies(onTopologiesWatch)
Example #34
0
def run(command, parser, cl_args, unknown_args):
  """ run the update command

  Builds the argument list for the runtime manager and invokes it.
  Returns True on success, or False (after logging the traceback) when
  anything raises.
  """
  topology_name = cl_args['topology-name']
  try:
    new_args = [
        "--cluster", cl_args['cluster'],
        "--role", cl_args['role'],
        "--environment", cl_args['environ'],
        "--heron_home", config.get_heron_dir(),
        "--config_path", cl_args['config_path'],
        "--override_config_file", cl_args['override_config_file'],
        "--release_file", config.get_heron_release_file(),
        "--topology_name", topology_name,
        "--command", command,
        "--component_parallelism", ','.join(cl_args['component_parallelism']),
    ]

    if Log.getEffectiveLevel() == logging.DEBUG:
      new_args.append("--verbose")

    lib_jars = config.get_heron_libs(jars.scheduler_jars() + jars.statemgr_jars())

    # invoke the runtime manager to update the topology
    execute.heron_class(
        'com.twitter.heron.scheduler.RuntimeManagerMain',
        lib_jars,
        extra_jars=[],
        args=new_args
    )

  except Exception:
    # format_exc() formats the exception being handled; its first positional
    # parameter is ``limit``, so the exception object must not be passed in.
    Log.error('Failed to update topology \'%s\': %s', topology_name, traceback.format_exc())
    return False

  Log.info('Successfully updated topology \'%s\'' % topology_name)
  return True
Example #35
0
def run(command, parser, args, unknown_args):
    """Print the main help or the help of one subcommand.

    :param command: not used in this function
    :param parser: top-level argument parser; its help is printed when the
      requested help target is 'help'
    :param args: parsed arguments; only 'help-command' is read
    :param unknown_args: not used in this function
    :return: SimpleResult(Status.Ok) when help was printed,
      SimpleResult(Status.InvocationError) when the subcommand is unknown
    """
    # get the command for detailed help
    command_help = args['help-command']

    # if no command is provided, just print main help
    if command_help == 'help':
        parser.print_help()
        return SimpleResult(Status.Ok)

    # get the subparser for the specific command
    subparser = config.get_subparser(parser, command_help)
    if subparser:
        # print() call form: valid on Python 3 and prints the same single
        # argument on Python 2.
        print(subparser.format_help())
        return SimpleResult(Status.Ok)

    Log.error("Unknown subcommand \'%s\'", command_help)
    return SimpleResult(Status.InvocationError)
Example #36
0
def render(resp):
    """Log a response, or every response in a (possibly nested) list.

    :param resp: a ``Response`` or a list whose items are rendered recursively
    :raises RuntimeError: when ``resp`` is neither a list nor a ``Response``,
      or when its status is not one of Ok, HeronError or InvocationError
    """
    if isinstance(resp, list):
        for r in resp:
            render(r)
    elif isinstance(resp, Response):
        if resp.status == Status.Ok:
            if resp.msg:
                Log.info(resp.msg)
            if resp.detailed_msg:
                Log.debug(resp.detailed_msg)
        elif resp.status == Status.HeronError:
            if resp.msg:
                Log.error(resp.msg)
            if resp.detailed_msg:
                Log.debug(resp.detailed_msg)
        # If status code is InvocationError, invocation of shelled-out program fails. The error
        # message will be in stderr, so we log.error detailed message(stderr) only
        elif resp.status == Status.InvocationError:
            Log.error(resp.detailed_msg)
        else:
            # Exceptions do not interpolate extra arguments the way logging
            # calls do, so format the message explicitly. %s is used because
            # the status value is not guaranteed to be an int.
            raise RuntimeError("Unknown status type of value %s" % (resp.status,))
    else:
        raise RuntimeError("Unknown response instance: %s" % (resp.__class__,))
Example #37
0
def run(cl_args, compo_type):
    """Print the component table of a topology, optionally filtered.

    :param cl_args: parsed arguments; reads 'cluster', 'role', 'environ',
      'topology-name', 'spout', 'bolt' and, on failure, 'tracker_url'
    :param compo_type: not used in this function
    :return: True when the table was printed, False when anything in the
      fetch/format/print sequence raised
    """
    cluster, role, env = cl_args['cluster'], cl_args['role'], cl_args[
        'environ']
    topology = cl_args['topology-name']
    spouts_only, bolts_only = cl_args['spout'], cl_args['bolt']
    try:
        components = tracker_access.get_logical_plan(cluster, env, topology,
                                                     role)
        topo_info = tracker_access.get_topology_info(cluster, env, topology,
                                                     role)
        table, header = to_table(components, topo_info)
        # Equal flags (neither or both requested) mean "show everything".
        if spouts_only != bolts_only:
            keep = filter_spouts if spouts_only else filter_bolts
            table, header = keep(table, header)
        print(tabulate(table, headers=header))
        return True
    except Exception:
        # `except Exception` rather than a bare `except:` so that Ctrl-C and
        # SystemExit are not reported as a tracker failure.
        Log.error("Fail to connect to tracker: \'%s\'", cl_args["tracker_url"])
        return False
Example #38
0
    def update_reduced_metric(self, name, value, key=None):
        """Update the value of ReducedMetric or MultiReducedMetric

        An unregistered ``name`` or a metric of an unsupported kind is logged
        as an error and otherwise ignored.

        :type name: str
        :param name: name of the registered metric to be updated.
        :param value: specifies a value to be reduced.
        :type key: str or None
        :param key: specifies a key for MultiReducedMetric. Needs to be `None` for updating
                    ReducedMetric.
        """
        if name not in self.metrics:
            Log.error(
                "In update_reduced_metric(): %s is not registered in the metric",
                name)
            # Nothing to update; falling through would raise KeyError below.
            return

        metric = self.metrics[name]
        if key is None and isinstance(metric, ReducedMetric):
            metric.update(value)
        elif key is not None and isinstance(metric, MultiReducedMetric):
            metric.update(key, value)
        else:
            Log.error(
                "In update_reduced_metric(): %s is registered but not supported with this method",
                name)
Example #39
0
def run(command, parser, cl_args, unknown_args):
  '''
  Without a cluster, print the local build info. With a cluster, call the API
  server route registered for `command` in rest.ROUTE_SIGNATURES and print the
  key/value pairs of its JSON reply, sorted by key.

  :param command: key into rest.ROUTE_SIGNATURES
  :param parser: not used in this function
  :param cl_args: parsed arguments; reads 'cluster' and 'service_url'
  :param unknown_args: not used in this function
  :return: SimpleResult(Status.Ok) on success, SimpleResult(Status.HeronError)
    on a missing cluster definition, a connection/HTTP error or a non-OK reply.
    A cluster definition without 'service_url' exits the process with status 1.
  '''
  cluster = cl_args['cluster']

  # direct mode: no cluster given
  if not cluster:
    config.print_build_info()
    return SimpleResult(Status.Ok)

  # server mode
  config_file = config.heron_rc_file()

  # Read the cluster definition
  client_confs = cdefs.read_server_mode_cluster_definition(cluster, cl_args, config_file)

  if not client_confs[cluster]:
    Log.error('Neither service url nor %s cluster definition in %s file', cluster, config_file)
    return SimpleResult(Status.HeronError)

  # if cluster definition exists, but service_url is not set, it is an error
  if 'service_url' not in client_confs[cluster]:
    Log.error('No service url for %s cluster in %s', cluster, config_file)
    sys.exit(1)

  service_endpoint = cl_args['service_url']
  service_method, service_route = rest.ROUTE_SIGNATURES[command][0:2]
  service_apiurl = service_endpoint + service_route

  try:
    r = service_method(service_apiurl)
    if r.status_code != requests.codes.ok:
      Log.error(r.json().get('message', "Unknown error from API server %d" % r.status_code))
      # A failed request must not be reported as success, and its error
      # payload is not version information to print.
      return SimpleResult(Status.HeronError)
    for key, value in sorted(r.json().items(), key=lambda tup: tup[0]):
      print("%s : %s" % (key, value))
  except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as err:
    Log.error(err)
    return SimpleResult(Status.HeronError)

  return SimpleResult(Status.Ok)
Example #40
0
    def _handle_packet(self, packet):
        """Decode a complete packet and dispatch it by request id.

        * request id present in ``self.context_map``: the packet answers an
          earlier request; the stored response message is parsed and handed to
          ``on_response``. A parse failure or an uninitialized message closes
          the connection and calls ``on_error``.
        * request id is zero: the packet is a plain message; it is built from
          ``self.registered_message_map`` and handed to ``on_incoming_message``.
        * anything else is only logged.

        :param packet: the packet to decode with ``HeronProtocol.decode_packet``
        :raises RuntimeError: when a plain message cannot be built or parsed
        """
        # only called when packet.is_complete is True
        typename, reqid, serialized_msg = HeronProtocol.decode_packet(packet)
        # `in` replaces dict.has_key(), which no longer exists on Python 3
        if reqid in self.context_map:
            # this incoming packet has the response of a request
            context = self.context_map.pop(reqid)
            response_msg = self.response_message_map.pop(reqid)

            try:
                response_msg.ParseFromString(serialized_msg)
            except Exception as e:
                # str(e) via %s: exceptions have no `.message` on Python 3
                Log.error("Invalid Packet Error: %s", e)
                self._handle_close()
                self.on_error()
                return

            if response_msg.IsInitialized():
                self.on_response(StatusCode.OK, context, response_msg)
            else:
                Log.error("Response not initialized")
                self._handle_close()
                self.on_error()
        elif reqid.is_zero():
            # this is a Message -- no need to send back response
            try:
                if typename not in self.registered_message_map:
                    raise ValueError("%s is not registered in message map" %
                                     typename)
                msg_builder = self.registered_message_map[typename]
                message = msg_builder()
                message.ParseFromString(serialized_msg)
                if message.IsInitialized():
                    self.on_incoming_message(message)
                else:
                    raise RuntimeError("Message not initialized")
            except Exception as e:
                Log.error("Error when handling message packet: %s", e)
                Log.error(traceback.format_exc())
                raise RuntimeError("Problem reading message")
        else:
            # might be a timeout response
            Log.info(
                "In handle_packet(): Received message whose REQID is not registered: %s"
                % str(reqid))
Example #41
0
def run_metrics(
    cluster: str,
    role: str,
    environment: str,
    topology: str,
    component: Optional[str],
) -> None:
    """Print one metrics table per component of a topology.

    The component names come from the topology's physical plan, sorted. When
    ``component`` is given, only that component is shown; an unknown name is
    logged and the process exits with status 1. A tracker connection error is
    handled the same way.
    """
    try:
        topo_info = tracker.get_topology_info(cluster, environment, topology,
                                              role)
    except requests.ConnectionError as e:
        Log.error(f"Fail to connect to tracker: {e}")
        sys.exit(1)

    known = sorted(topo_info['physical_plan']['components'].keys())
    if not component:
        selected = known
    elif component in known:
        selected = [component]
    else:
        Log.error(f"Unknown component: {component!r}")
        sys.exit(1)
    queries = tracker.metric_queries()

    for position, comp in enumerate(selected):
        try:
            comp_metrics = tracker.get_comp_metrics(
                cluster, environment, topology, comp, [], queries, [0, -1],
                role)
        except requests.ConnectionError as e:
            Log.error(f"Fail to connect to tracker: {e}")
            sys.exit(1)
        stat, header = to_table(comp_metrics["metrics"])
        # blank line between consecutive tables, none before the first
        if position > 0:
            print('')
        print(f"{comp!r} metrics:")
        print(tabulate(stat, headers=header))
Example #42
0
def run_metrics(
    cluster: str,
    role: str,
    environment: str,
    topology: str,
    component: str,
) -> None:
    """Render a table of metrics.

    Metrics are fetched for every spout and bolt of the topology's physical
    plan, or only for ``component`` when it is non-empty. All tables are
    fetched before any is printed. A failed tracker call or an unknown
    component is logged and the process exits with status 1.
    """
    try:
        result = tracker_access.get_topology_info(cluster, environment,
                                                  topology, role)
    except Exception:
        Log.error("Fail to connect to tracker")
        sys.exit(1)
    physical_plan = result['physical_plan']
    components = list(physical_plan['spouts'].keys()) + list(
        physical_plan['bolts'].keys())
    if component:
        if component not in components:
            Log.error(f"Unknown component: {component!r}")
            sys.exit(1)
        components = [component]
    cresult = []
    for comp in components:
        try:
            metrics = tracker_access.get_component_metrics(
                comp, cluster, environment, topology, role)
        except Exception:
            # not a bare `except:`, which would also trap KeyboardInterrupt
            # and report it as a tracker failure
            Log.error("Fail to connect to tracker")
            sys.exit(1)
        stat, header = to_table(metrics)
        cresult.append((comp, stat, header))
    for i, (c, stat, header) in enumerate(cresult):
        if i != 0:
            print('')
        print(f"{c!r} metrics:")
        print(tabulate(stat, headers=header))
Example #43
0
def run_metrics(command, parser, cl_args, unknown_args):
    """Run the metrics subcommand: print one metrics table per component.

    :param command: not used in this function
    :param parser: not used in this function
    :param cl_args: parsed arguments; reads 'cluster', 'role', 'environ',
      'topology-name', 'component' and, on failure, 'tracker_url'
    :param unknown_args: not used in this function
    :return: True when all tables were printed, False on a tracker failure or
      an unknown component
    """
    cluster, role, env = cl_args['cluster'], cl_args['role'], cl_args[
        'environ']
    topology = cl_args['topology-name']
    try:
        result = tracker_access.get_topology_info(cluster, env, topology, role)
        # list() so the concatenation also works with Python 3 key views
        spouts = list(result['physical_plan']['spouts'].keys())
        bolts = list(result['physical_plan']['bolts'].keys())
        components = spouts + bolts
        cname = cl_args['component']
        if cname:
            if cname not in components:
                # Report only the real problem; a bare `raise` here has no
                # active exception and used to be relabelled as a tracker
                # connection failure.
                Log.error('Unknown component: \'%s\'' % cname)
                return False
            components = [cname]
    except Exception:
        Log.error("Fail to connect to tracker: \'%s\'", cl_args["tracker_url"])
        return False
    cresult = []
    for comp in components:
        try:
            metrics = tracker_access.get_component_metrics(
                comp, cluster, env, topology, role)
        except Exception:
            Log.error("Fail to connect to tracker: \'%s\'",
                      cl_args["tracker_url"])
            return False
        stat, header = to_table(metrics)
        cresult.append((comp, stat, header))
    for i, (comp, stat, header) in enumerate(cresult):
        if i != 0:
            print('')
        print('\'%s\' metrics:' % comp)
        print(tabulate(stat, headers=header))
    return True
Example #44
0
 def on_error(self):
     """Log the Stream Manager disconnect, clear ``_pplan_helper`` and call
     ``on_connect`` with ``StatusCode.CONNECT_ERROR``."""
     Log.error("Disconnected from Stream Manager")
     # clean-up step: forget the current helper object
     self._pplan_helper = None
     # going back through on_connect with an error status is the retry path
     status = StatusCode.CONNECT_ERROR
     self.on_connect(status)
Example #45
0
 def update_threads_time(self):
     """Record (user_time, system_time) for every thread of ``self.process``.

     Each thread reported by ``self.process.threads()`` is stored in
     ``self.threads`` under its id. Any failure is logged with its traceback
     and otherwise ignored.
     """
     try:
         for t in self.process.threads():
             self.threads.update(t.id, (t.user_time, t.system_time))
     except Exception:
         # format_exc() reports the exception being handled by itself; its
         # positional argument is a frame limit, and passing the exception
         # there raises TypeError on Python 3.
         Log.error(traceback.format_exc())
Example #46
0
 def on_error(self):
     """Log the Metrics Manager disconnect and call ``on_connect`` with
     ``StatusCode.CONNECT_ERROR``."""
     Log.error("Disconnected from Metrics Manager")
     status = StatusCode.CONNECT_ERROR
     self.on_connect(status)
Example #47
0
def log_error(message, ident=0):
    """Log ``message`` at error level, prefixed by two spaces per ``ident`` level."""
    padding = " " * (2 * ident)
    text = padding + str(message)
    Log.error(text)
Example #48
0
def getMetricsTimeline(tmaster,
                       component_name,
                       metric_names,
                       instances,
                       start_time,
                       end_time,
                       callback=None):
    """
    Get the specified metrics for the given component name of this topology.

    This is a generator-style coroutine: the HTTP fetch is yielded and the
    result is delivered by raising ``tornado.gen.Return``. The result is:
    {
      "timeline": {
        <metricname>: {
          <instance>: {
            <start_time> : <numeric value>,
            <start_time> : <numeric value>,
            ...
          }
          ...
        }, ...
      },
      "starttime": <numeric value>,
      "endtime": <numeric value>,
      "component": "..."
    }

    :param tmaster: object with ``host`` and ``stats_port`` attributes
    :param component_name: component whose metrics are requested
    :param metric_names: iterable of metric names to request
    :param instances: instance ids to restrict the request to; when empty, no
      instance id is put in the request
    :param start_time: start of the requested interval
    :param end_time: end of the requested interval
    :param callback: not used in this function
    :raises Exception: when ``tmaster`` (or its host/stats port) is missing,
      when the HTTP call fails, or when the response code is 400 or above
    """
    # Tmaster is the proto object and must have host and port for stats.
    if not tmaster or not tmaster.host or not tmaster.stats_port:
        raise Exception("No Tmaster found")

    host = tmaster.host
    port = tmaster.stats_port

    # Create the proto request object to get metrics.
    metricRequest = tmaster_pb2.MetricRequest()
    metricRequest.component_name = component_name

    # If no instances are given, metrics for all instances
    # are fetched by default.
    if instances:
        metricRequest.instance_id.extend(instances)

    metricRequest.metric.extend(metric_names)

    metricRequest.explicit_interval.start = start_time
    metricRequest.explicit_interval.end = end_time
    metricRequest.minutely = True

    # Serialize the metricRequest to send as a payload
    # with the HTTP request.
    metricRequestString = metricRequest.SerializeToString()

    # Form and send the http request.
    url = "http://{0}:{1}/stats".format(host, port)
    request = tornado.httpclient.HTTPRequest(url,
                                             body=metricRequestString,
                                             method='POST',
                                             request_timeout=5)

    Log.debug("Making HTTP call to fetch metrics")
    Log.debug("url: " + url)
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        result = yield client.fetch(request)
        Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
        raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
        # The code is an int; concatenating it to a str directly would raise
        # TypeError and hide the real error.
        message = "Error in getting metrics from Tmaster, code: " + str(responseCode)
        Log.error(message)
        raise Exception(message)

    # Parse the response from tmaster.
    metricResponse = tmaster_pb2.MetricResponse()
    metricResponse.ParseFromString(result.body)

    if metricResponse.status.status == common_pb2.NOTOK:
        if metricResponse.status.HasField("message"):
            Log.error(metricResponse.status.message)

    # Form the response.
    timeline = {}
    ret = {
        "starttime": start_time,
        "endtime": end_time,
        "component": component_name,
        "timeline": timeline,
    }

    # Loop through all the metrics
    # One instance corresponds to one metric, which can have
    # multiple IndividualMetrics for each metricname requested.
    for metric in metricResponse.metric:
        instance = metric.instance_id

        # Loop through all individual metrics.
        for im in metric.metric:
            per_instance = timeline.setdefault(im.name, {}).setdefault(instance, {})

            # We get minutely metrics.
            # Interval-values correspond to the minutely mark for which
            # this metric value corresponds to.
            for interval_value in im.interval_values:
                per_instance[interval_value.interval.start] = interval_value.value

    raise tornado.gen.Return(ret)
Example #49
0
def run(command, parser, cl_args, unknown_args):
    '''
    Submit a topology file to the scheduler.

    The file named by cl_args['topology-file-name'] must exist and end in
    .jar, .tar, .tar.gz or .pex. A non-empty cl_args['extra_launch_classpath']
    must pass classpath.valid_java_classpath. A temporary directory for the
    topology definition and the initial topology state (PAUSED when
    cl_args['deploy_deactivated'] is set, RUNNING otherwise) are stored in the
    global options, then the submit helper matching the file type is called.

    :param command: not used in this function
    :param parser: not used in this function
    :param cl_args: parsed command-line arguments (keys as described above)
    :param unknown_args: forwarded to the submit helper
    :return: False when a validation step fails, otherwise whatever the submit
      helper returns
    '''
    topology_file = cl_args['topology-file-name']

    # the topology file has to exist ...
    if not os.path.isfile(topology_file):
        Log.error("Topology jar|tar|pex file %s does not exist" %
                  topology_file)
        return False

    # ... and has to be of a supported type
    jar_type = topology_file.endswith(".jar")
    tar_type = topology_file.endswith((".tar", ".tar.gz"))
    pex_type = topology_file.endswith(".pex")
    if not (jar_type or tar_type or pex_type):
        Log.error(
            "Unknown file type. Please use .tar or .tar.gz or .jar or .pex file"
        )
        return False

    # an extra launch classpath, when given, has to be valid
    if cl_args['extra_launch_classpath'] and not classpath.valid_java_classpath(
            cl_args['extra_launch_classpath']):
        Log.error(
            "One of jar or directory in extra launch classpath does not exist"
        )
        return False

    # temporary directory for the topology definition file
    tmp_dir = tempfile.mkdtemp()

    # launch deactivated (PAUSED) when asked to, RUNNING otherwise
    if cl_args['deploy_deactivated']:
        state_value = topology_pb2.PAUSED
    else:
        state_value = topology_pb2.RUNNING
    initial_state = topology_pb2.TopologyState.Name(state_value)

    # publish both through the global options
    opts.set_config('cmdline.topologydefn.tmpdirectory', tmp_dir)
    opts.set_config('cmdline.topology.initial.state', initial_state)

    # dispatch on the file type established above
    if jar_type:
        return submit_fatjar(cl_args, unknown_args, tmp_dir)
    if tar_type:
        return submit_tar(cl_args, unknown_args, tmp_dir)
    # neither jar nor tar: the type check above guarantees this is a pex
    return submit_pex(cl_args, unknown_args, tmp_dir)
Example #50
0
def delete_job(cl_args, job_id, nomad_addr):
  '''
  Delete a Nomad job, asking for it to be purged, via the HTTP API on port 4646.
  Logs an error and exits the process with -1 on any non-200 reply.

  :param cl_args: not used in this function
  :param job_id: id of the job to delete
  :param nomad_addr: host name or address of the Nomad server
  '''
  url = "http://%s:4646/v1/job/%s" % (nomad_addr, job_id)
  # Nomad takes `purge` as a query-string parameter; sent as a form body
  # (data=...) it is not part of the URL and the purge request is lost.
  r = requests.delete(url, params={'purge': 'true'})
  if r.status_code != 200:
    Log.error("Failed to delete job %s" % job_id)
    Log.debug("Response: %s" % r)
    sys.exit(-1)
Example #51
0
    def getComponentException(self,
                              tmaster,
                              component_name,
                              instances=None,
                              callback=None):
        """
        Get all (last 1000) exceptions for 'component_name' of the topology.

        This is a generator-style coroutine: the HTTP fetch is yielded and the
        result is delivered by raising ``tornado.gen.Return``. The result is a
        list of exception-log dicts on success, or ``{"message": ...}`` when
        the response code is 400 or above or Tmaster reports NOTOK with a
        message. Without a usable ``tmaster`` it finishes with no result.

        :param tmaster: object with ``host`` and ``stats_port`` attributes
        :param component_name: component whose exceptions are requested
        :param instances: optional instance ids to restrict the request to
        :param callback: not used in this function
        :raises Exception: when the HTTP call itself fails
        """
        if not tmaster or not tmaster.host or not tmaster.stats_port:
            return

        exception_request = tmaster_pb2.ExceptionLogRequest()
        exception_request.component_name = component_name
        # `instances` defaults to None rather than a shared mutable list
        if instances:
            exception_request.instances.extend(instances)
        request_str = exception_request.SerializeToString()
        port = str(tmaster.stats_port)
        host = tmaster.host
        url = "http://{0}:{1}/exceptions".format(host, port)
        request = tornado.httpclient.HTTPRequest(url,
                                                 body=request_str,
                                                 method='POST',
                                                 request_timeout=5)
        Log.debug('Making HTTP call to fetch exceptions url: %s', url)
        try:
            client = tornado.httpclient.AsyncHTTPClient()
            result = yield client.fetch(request)
            Log.debug("HTTP call complete.")
        except tornado.httpclient.HTTPError as e:
            raise Exception(str(e))

        # Check the response code - error if it is in 400s or 500s
        responseCode = result.code
        if responseCode >= 400:
            # The code is an int; str + int would raise TypeError here.
            message = "Error in getting exceptions from Tmaster, code: " + str(responseCode)
            Log.error(message)
            raise tornado.gen.Return({"message": message})

        # Parse the response from tmaster.
        exception_response = tmaster_pb2.ExceptionLogResponse()
        exception_response.ParseFromString(result.body)

        if exception_response.status.status == common_pb2.NOTOK:
            if exception_response.status.HasField("message"):
                raise tornado.gen.Return(
                    {"message": exception_response.status.message})

        # Send response
        ret = [{
            'hostname': exception_log.hostname,
            'instance_id': exception_log.instance_id,
            'stack_trace': exception_log.stacktrace,
            'lasttime': exception_log.lasttime,
            'firsttime': exception_log.firsttime,
            'count': str(exception_log.count),
            'logging': exception_log.logging
        } for exception_log in exception_response.exceptions]
        raise tornado.gen.Return(ret)
Example #52
0
    def getComponentMetrics(self,
                            tmaster,
                            componentName,
                            metricNames,
                            instances,
                            interval,
                            callback=None):
        """
        Get the specified metrics for the given component name of this topology.

        This is a generator-style coroutine: the HTTP fetch is yielded and the
        result is delivered by raising ``tornado.gen.Return``. The result is:
        {
          "metrics": {
            <metricname>: {
              <instance>: <numeric value>,
              <instance>: <numeric value>,
              ...
            }, ...
          },
          "interval": <numeric value>,
          "component": "..."
        }

        :param tmaster: object with ``host`` and ``stats_port`` attributes
        :param componentName: component whose metrics are requested
        :param metricNames: iterable of metric names to request
        :param instances: instance ids to restrict the request to; when empty,
          no instance id is put in the request
        :param interval: value assigned to the request's ``interval`` field
        :param callback: not used in this function
        :raises Exception: when ``tmaster`` (or its host/stats port) is
          missing, when the HTTP call fails, or when the response code is 400
          or above
        """
        if not tmaster or not tmaster.host or not tmaster.stats_port:
            raise Exception("No Tmaster found")

        host = tmaster.host
        port = tmaster.stats_port

        metricRequest = tmaster_pb2.MetricRequest()
        metricRequest.component_name = componentName
        if instances:
            metricRequest.instance_id.extend(instances)
        metricRequest.metric.extend(metricNames)
        metricRequest.interval = interval

        # Serialize the metricRequest to send as a payload
        # with the HTTP request.
        metricRequestString = metricRequest.SerializeToString()

        url = "http://{0}:{1}/stats".format(host, port)
        request = tornado.httpclient.HTTPRequest(url,
                                                 body=metricRequestString,
                                                 method='POST',
                                                 request_timeout=5)

        Log.debug("Making HTTP call to fetch metrics")
        Log.debug("url: " + url)
        try:
            client = tornado.httpclient.AsyncHTTPClient()
            result = yield client.fetch(request)
            Log.debug("HTTP call complete.")
        except tornado.httpclient.HTTPError as e:
            raise Exception(str(e))

        # Check the response code - error if it is in 400s or 500s
        responseCode = result.code
        if responseCode >= 400:
            # The code is an int; str + int would raise TypeError and hide
            # the real error.
            message = "Error in getting metrics from Tmaster, code: " + str(responseCode)
            Log.error(message)
            raise Exception(message)

        # Parse the response from tmaster.
        metricResponse = tmaster_pb2.MetricResponse()
        metricResponse.ParseFromString(result.body)

        if metricResponse.status.status == common_pb2.NOTOK:
            if metricResponse.status.HasField("message"):
                # warning(), not the deprecated warn() alias
                Log.warning("Received response from Tmaster: %s",
                            metricResponse.status.message)

        # Form the response.
        metrics = {}
        ret = {
            "interval": metricResponse.interval,
            "component": componentName,
            "metrics": metrics,
        }
        for metric in metricResponse.metric:
            instance = metric.instance_id
            for im in metric.metric:
                metrics.setdefault(im.name, {})[instance] = im.value

        raise tornado.gen.Return(ret)
Example #53
0
 def update_fds(self):
     """Feed ``self.process.num_fds()`` into ``self.fd_nums``.

     Any failure is logged with its traceback and otherwise ignored.
     """
     try:
         self.fd_nums.update(self.process.num_fds())
     except Exception:
         # format_exc() reports the exception being handled by itself; its
         # positional argument is a frame limit, and passing the exception
         # there raises TypeError on Python 3.
         Log.error(traceback.format_exc())
Example #54
0
 def on_error(self):
     """Log the Stream Manager disconnect and call ``on_connect`` with
     ``StatusCode.CONNECT_ERROR``."""
     Log.error("Disconnected from Stream Manager")
     # going back through on_connect with an error status is the retry path
     status = StatusCode.CONNECT_ERROR
     self.on_connect(status)