Example #1
def _convert_java_value(kv, include_non_primitives=True):
  try:
    pobj = javaobj.loads(kv.serialized_value)
    if pyutils.is_str_instance(pobj):
      return pobj

    if pobj.is_primitive():
      return pobj.value

    if include_non_primitives:
      # Java objects that are not strings return both the JSON-encoded value
      # and the raw serialized bytes as a hex-escaped string
      return {
          'value' : json.dumps(pobj,
                               default=lambda custom_field: custom_field.__dict__,
                               sort_keys=True,
                               indent=2),
          'raw' : utils.hex_escape(kv.serialized_value)}

    return None
  except Exception:
    Log.exception("Failed to parse data as java object")
    if include_non_primitives:
      return _raw_value(kv)
    else:
      return None
Example #2
 def prepare(self, context, component, stream, target_tasks):
   Log.info("In prepare of SampleCustomGrouping, "
            "with src component: %s, "
            "with stream id: %s, "
            "with target tasks: %s"
            % (component, stream, str(target_tasks)))
   self.target_tasks = target_tasks
Example #3
  def get(self):
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      topology_name = self.get_argument_topology()
      container = self.get_argument(constants.PARAM_CONTAINER)
      path = self.get_argument(constants.PARAM_PATH)
      topology_info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)

      stmgr_id = "stmgr-" + container
      stmgr = topology_info["physical_plan"]["stmgrs"][stmgr_id]
      host = stmgr["host"]
      shell_port = stmgr["shell_port"]
      file_download_url = "http://%s:%d/download/%s" % (host, shell_port, path)
      Log.debug("download file url: %s", file_download_url)

      filename = path.split("/")[-1]
      self.set_header("Content-Disposition", "attachment; filename=%s" % filename)

      def streaming_callback(chunk):
        self.write(chunk)
        self.flush()

      http_client = tornado.httpclient.AsyncHTTPClient()
      yield http_client.fetch(file_download_url, streaming_callback=streaming_callback)
      self.finish()
    except Exception as e:
      Log.debug(traceback.format_exc())
      self.write_error_response(e)
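A minimal standalone sketch of the streaming-fetch pattern used above, assuming Tornado 5+ (awaitable fetch); the URL is illustrative:

import tornado.httpclient
import tornado.ioloop

# With streaming_callback set, chunks are delivered as they arrive and the
# final response body stays empty -- the handler above forwards each chunk
# to the client instead of buffering the whole file in memory.
async def stream(url):
    client = tornado.httpclient.AsyncHTTPClient()
    chunks = []
    await client.fetch(url, streaming_callback=chunks.append)
    return b"".join(chunks)

body = tornado.ioloop.IOLoop.current().run_sync(lambda: stream("http://example.com/"))
print("downloaded %d bytes" % len(body))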
Example #4
def launch_topologies(cl_args, topology_file, tmp_dir):
  '''
  Launch topologies
  :param cl_args:
  :param topology_file:
  :param tmp_dir:
  :return: list(Responses)
  '''
  # the submitter would have written the .defn file to the tmp_dir
  defn_files = glob.glob(tmp_dir + '/*.defn')

  if len(defn_files) == 0:
    return SimpleResult(Status.HeronError, "No topologies found under %s" % tmp_dir)

  results = []
  for defn_file in defn_files:
    # load the topology definition from the file
    topology_defn = topology_pb2.Topology()
    try:
      with open(defn_file, "rb") as handle:
        topology_defn.ParseFromString(handle.read())
    except Exception as e:
      err_context = "Cannot load topology definition '%s': %s" % (defn_file, e)
      return SimpleResult(Status.HeronError, err_context)
    # launch the topology
    Log.info("Launching topology: \'%s\'", topology_defn.name)
    res = launch_a_topology(
        cl_args, tmp_dir, topology_file, defn_file, topology_defn.name)
    results.append(res)
  return results
Example #5
 def fail(self, tup):
     Log.info(
         "Trying to do a fail. tuples processed: %d, received: %d" % (self.tuples_processed, self.tuple_received)
     )
     if self.tuples_processed < self.tuple_received:
         super(IntegrationTestBolt, self).fail(tup)
         self.tuples_processed += 1
Example #6
  def get(self):
    """ get method """
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      topology_name = self.get_argument_topology()
      container = self.get_argument(constants.PARAM_CONTAINER)
      path = self.get_argument(constants.PARAM_PATH)
      offset = self.get_argument_offset()
      length = self.get_argument_length()
      topology_info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)

      stmgr_id = "stmgr-" + container
      stmgr = topology_info["physical_plan"]["stmgrs"][stmgr_id]
      host = stmgr["host"]
      shell_port = stmgr["shell_port"]
      file_data_url = "http://%s:%d/filedata/%s?offset=%s&length=%s" % \
        (host, shell_port, path, offset, length)

      http_client = tornado.httpclient.AsyncHTTPClient()
      response = yield http_client.fetch(file_data_url)
      self.write_success_response(json.loads(response.body))
      self.finish()
    except Exception as e:
      Log.debug(traceback.format_exc())
      self.write_error_response(e)
Example #7
 def update_cpu_time(self):
   try:
     t = self.process.cpu_times()
     self.sys_cpu_time.update(t.system)
     self.user_cpu_time.update(t.user)
   except Exception:
     Log.error(traceback.format_exc())
Example #8
def get_jobs(cl_args, nomad_addr):
  r = requests.get("http://%s:4646/v1/jobs" % nomad_addr)
  if r.status_code != 200:
    Log.error("Failed to get list of jobs")
    Log.debug("Response: %s" % r)
    sys.exit(-1)
  return r.json()
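A hedged usage sketch: "127.0.0.1" assumes a Nomad agent on localhost, and the "ID"/"Status" keys follow Nomad's /v1/jobs response format:

# cl_args is unused by get_jobs, so an empty dict suffices here
jobs = get_jobs(cl_args={}, nomad_addr="127.0.0.1")
for job in jobs:
    print(job["ID"], job["Status"])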
Example #9
  def _add_spout_task(self):
    Log.info("Adding spout task...")
    def spout_task():
      # don't do anything when topology is paused
      if not self._is_topology_running():
        return

      if self._should_produce_tuple():
        self._produce_tuple()
        self.output_helper.send_out_tuples()
        self.looper.wake_up() # so emitted tuples would be added to buffer now
      else:
        self.spout_metrics.update_out_queue_full_count()

      if self.acking_enabled:
        self._read_tuples_and_execute()
        self.spout_metrics.update_pending_tuples_count(len(self.in_flight_tuples))
      else:
        self._do_immediate_acks()

      if self._is_continue_to_work():
        self.looper.wake_up()

    self.looper.add_wakeup_task(spout_task)

    # look for the timeout's tuples
    if self.enable_message_timeouts:
      self._look_for_timeouts()
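A toy stand-in for the looper contract the spout task relies on; ToyLooper is illustrative only and far simpler than Heron's real EventLooper:

class ToyLooper:
    def __init__(self):
        self.tasks = []
        self._awake = True

    def add_wakeup_task(self, task):
        # registered tasks run once per wake-up pass
        self.tasks.append(task)

    def wake_up(self):
        # request another pass; a task may call this to reschedule itself
        self._awake = True

    def run_once(self):
        if self._awake:
            self._awake = False
            for task in self.tasks:
                task()

looper = ToyLooper()
looper.add_wakeup_task(lambda: print("spout task ran"))
looper.run_once()  # -> spout task ran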
Example #10
  def _read_tuples_and_execute(self):
    start_cycle_time = time.time()
    total_data_emitted_bytes_before = self.get_total_data_emitted_in_bytes()
    exec_batch_time = \
      self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_TIME_MS] * system_constants.MS_TO_SEC
    exec_batch_size = self.sys_config[system_constants.INSTANCE_EXECUTE_BATCH_SIZE_BYTES]
    while not self.in_stream.is_empty():
      try:
        tuples = self.in_stream.poll()
      except Queue.Empty:
        break

      if isinstance(tuples, tuple_pb2.HeronTupleSet):
        if tuples.HasField("control"):
          raise RuntimeError("Bolt cannot get acks/fails from other components")
        elif tuples.HasField("data"):
          stream = tuples.data.stream

          for data_tuple in tuples.data.tuples:
            self._handle_data_tuple(data_tuple, stream)
        else:
          Log.error("Received tuple neither data nor control")
      else:
        Log.error("Received tuple not instance of HeronTupleSet")

      if (time.time() - start_cycle_time - exec_batch_time > 0) or \
          (self.get_total_data_emitted_in_bytes() - total_data_emitted_bytes_before
           > exec_batch_size):
        # batch reached
        break
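A compact sketch of the same batch-bounded drain pattern, with hypothetical time and byte budgets standing in for the sys_config values:

import time
from queue import Queue, Empty

def drain(q, batch_time_sec=0.016, batch_size_bytes=32768):
    """Pull items until the queue, the time budget, or the byte budget runs out."""
    start = time.time()
    drained_bytes = 0
    while True:
        try:
            item = q.get_nowait()
        except Empty:
            break
        drained_bytes += len(item)  # items are assumed to be byte strings
        # ... handle the item here ...
        if (time.time() - start > batch_time_sec
                or drained_bytes > batch_size_bytes):
            break  # batch budget reached; yield control back to the caller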
Example #11
 def on_connect(self, status):
   Log.debug("In on_connect of MetricsManagerClient")
   if status != StatusCode.OK:
     Log.error("Error connecting to Metrics Manager with status: %s" % str(status))
     retry_interval = float(self.sys_config[constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC])
     self.looper.register_timer_task_in_sec(self.start_connect, retry_interval)
     return
   self._send_register_req()
Example #12
  def __init__(self, pplan_helper, in_stream, out_stream, looper):
    super(SpoutInstance, self).__init__(pplan_helper, in_stream, out_stream, looper)
    self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

    if not self.pplan_helper.is_spout:
      raise RuntimeError("No spout in physical plan")

    context = self.pplan_helper.context
    self.spout_metrics = SpoutMetrics(self.pplan_helper)
    self.serializer = SerializerHelper.get_serializer(context)

    # acking related
    self.acking_enabled = context.get_cluster_config().get(api_constants.TOPOLOGY_ENABLE_ACKING,
                                                           False)
    self.enable_message_timeouts = \
      context.get_cluster_config().get(api_constants.TOPOLOGY_ENABLE_MESSAGE_TIMEOUTS)
    Log.info("Enable ACK: %s" % str(self.acking_enabled))
    Log.info("Enable Message Timeouts: %s" % str(self.enable_message_timeouts))

    # map <tuple_info.key -> tuple_info>, ordered by insertion time
    self.in_flight_tuples = collections.OrderedDict()
    self.immediate_acks = collections.deque()
    self.total_tuples_emitted = 0

    # load user's spout class
    spout_impl_class = super(SpoutInstance, self).load_py_instance(is_spout=True)
    self.spout_impl = spout_impl_class(delegate=self)
Example #13
def start_api_server(masters, cl_args):
  '''
  Start the Heron API server
  '''
  # make sure nomad cluster is up
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
  Log.info("Starting Heron API Server on %s" % single_master)

  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start apiserver on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "apiserver")
  Log.info("Done starting Heron API Server")
Example #14
def start_heron_tools(masters, cl_args):
  '''
  Start Heron tracker and UI
  '''
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/heron_tools_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_heron_tools_job_file(cl_args))
  Log.info("Starting Heron Tools on %s" % single_master)

  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start Heron Tools on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "heron-tools")
  Log.info("Done starting Heron Tools")
Example #15
def start_slave_nodes(slaves, cl_args):
  '''
  Start slave nodes
  '''
  pids = []
  for slave in slaves:
    Log.info("Starting slave on %s" % slave)
    cmd = "%s agent -config %s >> /tmp/nomad_client.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_nomad_slave_config_file(cl_args))
    if not is_self(slave):
      cmd = ssh_remote_execute(cmd, slave, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    pids.append({"pid": pid, "dest": slave})

  errors = []
  for entry in pids:
    pid = entry["pid"]
    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
      errors.append("Failed to start slave on %s with error:\n%s" % (entry["dest"], output[1]))

  if errors:
    for error in errors:
      Log.error(error)
    sys.exit(-1)

  Log.info("Done starting slaves")
Example #16
 def _emit_terminal_if_needed(self):
     Log.info("is_done: %s, tuples_to_complete: %s" % (self.is_done, self.tuples_to_complete))
     if self.is_done and self.tuples_to_complete == 0:
         Log.info("Emitting terminals to downstream")
         super(IntegrationTestSpout, self).emit(
             [integ_const.INTEGRATION_TEST_TERMINAL], stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID
         )
Example #17
  def _read_tuples_and_execute(self):
    start_cycle_time = time.time()
    ack_batch_time = self.sys_config[system_constants.INSTANCE_ACK_BATCH_TIME_MS] * \
                     system_constants.MS_TO_SEC
    while not self.in_stream.is_empty():
      try:
        tuples = self.in_stream.poll()
      except Queue.Empty:
        break

      if isinstance(tuples, tuple_pb2.HeronTupleSet):
        if tuples.HasField("data"):
          raise RuntimeError("Spout cannot get incoming data tuples from other components")
        elif tuples.HasField("control"):
          for ack_tuple in tuples.control.acks:
            self._handle_ack_tuple(ack_tuple, True)
          for fail_tuple in tuples.control.fails:
            self._handle_ack_tuple(fail_tuple, False)
        else:
          Log.error("Received tuple neither data nor control")
      else:
        Log.error("Received tuple not instance of HeronTupleSet")

      # avoid spending too much time here
      if time.time() - start_cycle_time - ack_batch_time > 0:
        break
Example #18
  def _get_dict_from_config(topology_config):
    """Converts Config protobuf message to python dictionary

    Values are converted according to the rules below:

    - Number string (e.g. "12" or "1.2") is appropriately converted to ``int`` or ``float``
    - Boolean string ("true", "True", "false" or "False") is converted to built-in boolean type
      (i.e. ``True`` or ``False``)
    - Normal string is inserted to dict as is
    - Serialized value is deserialized and inserted as a corresponding Python object
    """
    config = {}
    for kv in topology_config.kvs:
      if kv.HasField("value"):
        assert kv.type == topology_pb2.ConfigValueType.Value("STRING_VALUE")
        # value is string
        if PhysicalPlanHelper._is_number(kv.value):
          config[kv.key] = PhysicalPlanHelper._get_number(kv.value)
        elif kv.value.lower() in ("true", "false"):
          config[kv.key] = kv.value.lower() == "true"
        else:
          config[kv.key] = kv.value
      elif kv.HasField("serialized_value") and \
        kv.type == topology_pb2.ConfigValueType.Value("PYTHON_SERIALIZED_VALUE"):
        # deserialize that
        config[kv.key] = default_serializer.deserialize(kv.serialized_value)
      else:
        assert kv.HasField("type")
        Log.error("Unsupported config <key:value> found: %s, with type: %s"
                  % (str(kv), str(kv.type)))
        continue

    return config
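The conversion rules above can be exercised in isolation; parse_config_value is a hypothetical helper mirroring the number and boolean rules (serialized values aside):

def parse_config_value(value):
    """Convert a config string to int, float, or bool, else return it unchanged."""
    try:
        return int(value)               # "12"  -> 12
    except ValueError:
        pass
    try:
        return float(value)             # "1.2" -> 1.2
    except ValueError:
        pass
    if value.lower() in ("true", "false"):
        return value.lower() == "true"  # "True" -> True
    return value                        # plain strings pass through

assert parse_config_value("12") == 12
assert parse_config_value("1.2") == 1.2
assert parse_config_value("False") is False
assert parse_config_value("topology.name") == "topology.name"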
Example #19
def import_and_get_class(path_to_pex, python_class_name):
  """Imports and load a class from a given pex file path and python class name

  For example, if you want to get a class called `Sample` in
  /some-path/sample.pex/heron/examples/src/python/sample.py,
  ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
  ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``
  """
  abs_path_to_pex = os.path.abspath(path_to_pex)

  Log.debug("Add a pex to the path: %s" % abs_path_to_pex)
  Log.debug("In import_and_get_class with cls_name: %s" % python_class_name)
  split = python_class_name.split('.')
  from_path = '.'.join(split[:-1])
  import_name = python_class_name.split('.')[-1]

  Log.debug("From path: %s, import name: %s" % (from_path, import_name))

  # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
  if python_class_name.startswith("heron."):
    try:
      mod = resolve_heron_suffix_issue(abs_path_to_pex, python_class_name)
      return getattr(mod, import_name)
    except:
      Log.error("Could not resolve class %s with special handling" % python_class_name)

  mod = __import__(from_path, fromlist=[import_name], level=-1)
  Log.debug("Imported module: %s" % str(mod))
  return getattr(mod, import_name)
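Hypothetical usage, following the path and class-name convention from the docstring; both arguments are illustrative:

spout_class = import_and_get_class(
    "/some-path/sample.pex",
    "heron.examples.src.python.sample.Sample")
spout = spout_class()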
Example #20
 def update_memory_usage(self):
   try:
     m = self.process.memory_info()
     self.physical_memory.update(m.rss)
     self.virtual_memory.update(m.vms)
   except Exception:
     Log.error(traceback.format_exc())
Example #21
    def _handle_assignment_message(self, pplan):
        """Called when new NewInstanceAssignmentMessage arrives"""
        Log.debug("In handle_assignment_message() of STStmgrClient, Physical Plan: \n%s", str(pplan))
        new_helper = PhysicalPlanHelper(
            pplan, self.instance.instance_id, self.heron_instance_cls.topo_pex_file_abs_path
        )

        if self._pplan_helper is not None and (
            self._pplan_helper.my_component_name != new_helper.my_component_name
            or self._pplan_helper.my_task_id != new_helper.my_task_id
        ):
            raise RuntimeError("Our Assignment has changed. We will die to pick it.")

        if self._pplan_helper is None:
            Log.info("Received a new Physical Plan")
            Log.info("Push the new pplan_helper to Heron Instance")
            self.heron_instance_cls.handle_assignment_msg(new_helper)
        else:
            Log.info("Received a new Physical Plan with the same assignment -- State Change")
            Log.info(
                "Old state: %s, new state: %s.",
                self._pplan_helper.get_topology_state(),
                new_helper.get_topology_state(),
            )
            self.heron_instance_cls.handle_state_change_msg(new_helper)

        self._pplan_helper = new_helper
Example #22
 def _send_metrics_messages(self):
   if self.connected:
     while not self.out_queue.is_empty():
       message = self.out_queue.poll()
       assert isinstance(message, metrics_pb2.MetricPublisherPublishMessage)
       Log.debug("Sending metric message: %s" % str(message))
       self.send_message(message)
Example #23
  def poll(self, timeout=0.0):
    """Modified version of poll() from asyncore module"""
    if self.sock_map is None:
      Log.warning("Socket map is not registered to Gateway Looper")
    readable_lst = []
    writable_lst = []
    error_lst = []

    if self.sock_map is not None:
      for fd, obj in self.sock_map.items():
        is_r = obj.readable()
        is_w = obj.writable()
        if is_r:
          readable_lst.append(fd)
        if is_w and not obj.accepting:
          writable_lst.append(fd)
        if is_r or is_w:
          error_lst.append(fd)

    # Add wakeup fd
    readable_lst.append(self.pipe_r)

    Log.debug("Will select() with timeout: " + str(timeout) + ", with map: " + str(self.sock_map))
    try:
      readable_lst, writable_lst, error_lst = \
        select.select(readable_lst, writable_lst, error_lst, timeout)
    except select.error as err:
      Log.debug("Trivial error: " + str(err))
      if err.args[0] != errno.EINTR:
        raise
      else:
        return
    Log.debug("Selected [r]: " + str(readable_lst) +
              " [w]: " + str(writable_lst) + " [e]: " + str(error_lst))

    if self.pipe_r in readable_lst:
      Log.debug("Read from pipe")
      os.read(self.pipe_r, 1024)
      readable_lst.remove(self.pipe_r)

    if self.sock_map is not None:
      for fd in readable_lst:
        obj = self.sock_map.get(fd)
        if obj is None:
          continue
        asyncore.read(obj)

      for fd in writable_lst:
        obj = self.sock_map.get(fd)
        if obj is None:
          continue
        asyncore.write(obj)

      for fd in error_lst:
        obj = self.sock_map.get(fd)
        if obj is None:
          continue
        # pylint: disable=W0212
        asyncore._exception(obj)
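A minimal demonstration of the self-pipe trick poll() relies on (POSIX only): writing a byte to the write end makes select() return immediately, even with a long timeout.

import os
import select

pipe_r, pipe_w = os.pipe()
os.write(pipe_w, b"\n")                        # what wake_up() does
readable, _, _ = select.select([pipe_r], [], [], 5.0)
assert pipe_r in readable                      # returns at once, not after 5s
os.read(pipe_r, 1024)                          # drain so the next poll blocks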
Example #24
def get_topology_metrics(*args):
  """Synced API call to get topology metrics"""
  instance = tornado.ioloop.IOLoop.instance()
  try:
    return instance.run_sync(lambda: API.get_comp_metrics(*args))
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Example #25
 def unregister_watch(self, uid):
   """
   Unregister the watch with the given UUID.
   """
   # Do not raise an error if UUID is
   # not present in the watches.
   Log.info("Unregister a watch with uid: " + str(uid))
   self.watches.pop(uid, None)
Example #26
def get_cluster_role_env_topologies(cluster, role, env):
  """Synced API call to get topologies under a cluster submitted by a role under env"""
  instance = tornado.ioloop.IOLoop.instance()
  try:
    return instance.run_sync(lambda: API.get_cluster_role_env_topologies(cluster, role, env))
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Example #27
def get_logical_plan(cluster, env, topology, role):
  """Synced API call to get logical plans"""
  instance = tornado.ioloop.IOLoop.instance()
  try:
    return instance.run_sync(lambda: API.get_logical_plan(cluster, env, topology, role))
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Example #28
 def set_topology_context(self, metrics_collector):
   """Sets a new topology context"""
   Log.debug("Setting topology context")
   cluster_config = self.get_topology_config()
   cluster_config.update(self._get_dict_from_config(self.my_component.config))
   task_to_component_map = self._get_task_to_comp_map()
   self.context = TopologyContext(cluster_config, self.pplan.topology, task_to_component_map,
                                  self.my_task_id, metrics_collector)
Example #29
def get_cluster_topologies(cluster):
  """Synced API call to get topologies under a cluster"""
  instance = tornado.ioloop.IOLoop.instance()
  try:
    return instance.run_sync(lambda: API.get_cluster_topologies(cluster))
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Example #30
  def register_on_message(self, msg_builder):
    """Registers protobuf message builders that this client wants to receive

    :param msg_builder: callable to create a protobuf message that this client wants to receive
    """
    message = msg_builder()
    Log.debug("In register_on_message(): %s" % message.DESCRIPTOR.full_name)
    self.registered_message_map[message.DESCRIPTOR.full_name] = msg_builder
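Hypothetical usage, where client stands for a connected client instance and tuple_pb2 for Heron's generated protobuf module; generated message classes are callable and return fresh messages, so the class itself can serve as the builder:

client.register_on_message(tuple_pb2.HeronTupleSet)

# an incoming packet's full protobuf name now maps back to its builder
probe = tuple_pb2.HeronTupleSet()
builder = client.registered_message_map[probe.DESCRIPTOR.full_name]
assert builder().DESCRIPTOR.full_name == probe.DESCRIPTOR.full_name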
Example #31
    def setTopologyInfo(self, topology):
        """
    Extracts info from the stored proto states and
    convert it into representation that is exposed using
    the API.
    This method is called on any change for the topology.
    For example, when a container moves and its host or some
    port changes. All the information is parsed all over
    again and cache is updated.
    """
        # Execution state is the most basic info.
        # If there is no execution state, just return
        # as the rest of the things don't matter.
        if not topology.execution_state:
            Log.info("No execution state found for: " + topology.name)
            return

        Log.info("Setting topology info for topology: " + topology.name)
        has_physical_plan = True
        if not topology.physical_plan:
            has_physical_plan = False

        Log.info("Setting topology info for topology: " + topology.name)
        has_packing_plan = True
        if not topology.packing_plan:
            has_packing_plan = False

        has_tmaster_location = True
        if not topology.tmaster:
            has_tmaster_location = False

        has_scheduler_location = True
        if not topology.scheduler_location:
            has_scheduler_location = False

        topologyInfo = {
            "name": topology.name,
            "id": topology.id,
            "logical_plan": None,
            "physical_plan": None,
            "packing_plan": None,
            "execution_state": None,
            "tmaster_location": None,
            "scheduler_location": None,
        }

        executionState = self.extract_execution_state(topology)
        executionState["has_physical_plan"] = has_physical_plan
        executionState["has_packing_plan"] = has_packing_plan
        executionState["has_tmaster_location"] = has_tmaster_location
        executionState["has_scheduler_location"] = has_scheduler_location
        executionState["status"] = topology.get_status()

        topologyInfo["metadata"] = self.extract_metadata(topology)
        topologyInfo["runtime_state"] = self.extract_runtime_state(topology)

        topologyInfo["execution_state"] = executionState
        topologyInfo["logical_plan"] = self.extract_logical_plan(topology)
        topologyInfo["physical_plan"] = self.extract_physical_plan(topology)
        topologyInfo["packing_plan"] = self.extract_packing_plan(topology)
        topologyInfo["tmaster_location"] = self.extract_tmaster(topology)
        topologyInfo["scheduler_location"] = self.extract_scheduler_location(
            topology)

        self.topologyInfos[(topology.name,
                            topology.state_manager_name)] = topologyInfo
Example #32
def log_error(message, ident=0):
    """log error info"""
    Log.error(" " * (ident * 2) + str(message))
Example #33
 def invoke_deactivate(self):
     Log.info("Deactivating Bolt")
     self.topology_state = topology_pb2.TopologyState.Value("PAUSED")
Example #34
 def handle_close(self):
     Log.info(f"{self._get_classname()}: handle_close() called")
     self._handle_close()
     self.on_error()
Example #35
def run(command, parser, cl_args, unknown_args):
    '''
    Submits the topology to the scheduler
    * Depending on the topology file name extension, we treat the file as a
      fatjar (if the ext is .jar) or a tar file (if the ext is .tar/.tar.gz).
    * We upload the topology file to the packer, update zookeeper and launch
      scheduler jobs representing that topology
    * You can see your topology in Heron UI
    :param command:
    :param parser:
    :param cl_args:
    :param unknown_args:
    :return:
    '''
    Log.debug("Submit Args %s", cl_args)

    # get the topology file name
    topology_file = cl_args['topology-file-name']

    if urlparse(topology_file).scheme:
        cl_args['topology-file-name'] = download(topology_file,
                                                 cl_args['cluster'])
        topology_file = cl_args['topology-file-name']
        Log.debug("download uri to local file: %s", topology_file)

    # check to see if the topology file exists
    if not os.path.isfile(topology_file):
        err_context = "Topology file '%s' does not exist" % topology_file
        return SimpleResult(Status.InvocationError, err_context)

    # check if it is a valid file type
    jar_type = topology_file.endswith(".jar")
    tar_type = topology_file.endswith(".tar") or topology_file.endswith(
        ".tar.gz")
    pex_type = topology_file.endswith(".pex")
    cpp_type = topology_file.endswith(".dylib") or topology_file.endswith(
        ".so")
    if not (jar_type or tar_type or pex_type or cpp_type):
        _, ext_name = os.path.splitext(topology_file)
        err_context = "Unknown file type '%s'. Please use .tar "\
                      "or .tar.gz or .jar or .pex or .dylib or .so file"\
                      % ext_name
        return SimpleResult(Status.InvocationError, err_context)

    # check if extra launch classpath is provided and if it is validate
    if cl_args['extra_launch_classpath']:
        valid_classpath = classpath.valid_java_classpath(
            cl_args['extra_launch_classpath'])
        if not valid_classpath:
            err_context = "One of jar or directory in extra launch classpath does not exist: %s" % \
              cl_args['extra_launch_classpath']
            return SimpleResult(Status.InvocationError, err_context)

    # create a temporary directory for topology definition file
    tmp_dir = tempfile.mkdtemp()
    opts.cleaned_up_files.append(tmp_dir)

    # if the topology needs to be launched in a deactivated state, do so
    if cl_args['deploy_deactivated']:
        initial_state = topology_pb2.TopologyState.Name(topology_pb2.PAUSED)
    else:
        initial_state = topology_pb2.TopologyState.Name(topology_pb2.RUNNING)

    # set the tmp dir and deactivated state in global options
    opts.set_config('cmdline.topologydefn.tmpdirectory', tmp_dir)
    opts.set_config('cmdline.topology.initial.state', initial_state)
    opts.set_config('cmdline.topology.role', cl_args['role'])
    opts.set_config('cmdline.topology.environment', cl_args['environ'])
    opts.set_config('cmdline.topology.cluster', cl_args['cluster'])
    opts.set_config('cmdline.topology.file_name',
                    cl_args['topology-file-name'])
    opts.set_config('cmdline.topology.class_name',
                    cl_args['topology-class-name'])
    opts.set_config('cmdline.topology.submit_user', cl_args['submit_user'])

    # Use CLI release yaml file if the release_yaml_file config is empty
    if not cl_args['release_yaml_file']:
        cl_args['release_yaml_file'] = config.get_heron_release_file()

    # check the extension of the file name to see if it is tar/jar file.
    if jar_type:
        return submit_fatjar(cl_args, unknown_args, tmp_dir)
    if tar_type:
        return submit_tar(cl_args, unknown_args, tmp_dir)
    if cpp_type:
        return submit_cpp(cl_args, unknown_args, tmp_dir)
    return submit_pex(cl_args, unknown_args, tmp_dir)
Example #36
 def fail(self, tup):
     Log.info("Trying to do a fail. tuples processed: %d, received: %d" %
              (self.tuples_processed, self.tuple_received))
     if self.tuples_processed < self.tuple_received:
         super(IntegrationTestBolt, self).fail(tup)
         self.tuples_processed += 1
Example #37
def launch_a_topology(cl_args, tmp_dir, topology_file, topology_defn_file,
                      topology_name):
    '''
    Launch a topology given the topology jar, its definition file and configurations
    :param cl_args:
    :param tmp_dir:
    :param topology_file:
    :param topology_defn_file:
    :param topology_name:
    :return:
    '''
    # get the normalized path for topology.tar.gz
    topology_pkg_path = config.normalized_class_path(
        os.path.join(tmp_dir, 'topology.tar.gz'))

    # get the release yaml file
    release_yaml_file = config.get_heron_release_file()

    # create a tar package with the cluster configuration and generated config files
    config_path = cl_args['config_path']
    tar_pkg_files = [topology_file, topology_defn_file]
    generated_config_files = [
        release_yaml_file, cl_args['override_config_file']
    ]

    config.create_tar(topology_pkg_path, tar_pkg_files, config_path,
                      generated_config_files)

    # pass the args to submitter main
    args = [
        "--cluster",
        cl_args['cluster'],
        "--role",
        cl_args['role'],
        "--environment",
        cl_args['environ'],
        "--heron_home",
        config.get_heron_dir(),
        "--config_path",
        config_path,
        "--override_config_file",
        cl_args['override_config_file'],
        "--release_file",
        release_yaml_file,
        "--topology_package",
        topology_pkg_path,
        "--topology_defn",
        topology_defn_file,
        "--topology_bin",
        topology_file  # pex file if pex specified
    ]

    if Log.getEffectiveLevel() == logging.DEBUG:
        args.append("--verbose")

    if cl_args["dry_run"]:
        args.append("--dry_run")
        if "dry_run_format" in cl_args:
            args += ["--dry_run_format", cl_args["dry_run_format"]]

    lib_jars = config.get_heron_libs(jars.scheduler_jars() +
                                     jars.uploader_jars() +
                                     jars.statemgr_jars() +
                                     jars.packing_jars())
    extra_jars = cl_args['extra_launch_classpath'].split(':')

    # invoke the submitter to submit and launch the topology
    main_class = 'com.twitter.heron.scheduler.SubmitterMain'
    res = execute.heron_class(class_name=main_class,
                              lib_jars=lib_jars,
                              extra_jars=extra_jars,
                              args=args,
                              java_defines=[])
    err_context = "Failed to launch topology '%s'" % topology_name
    if cl_args["dry_run"]:
        err_context += " in dry-run mode"
    succ_context = "Successfully launched topology '%s'" % topology_name
    if cl_args["dry_run"]:
        succ_context += " in dry-run mode"
    res.add_context(err_context, succ_context)
    return res
Example #38
 def _handle_register_response(self, response):
   if response.status.status != common_pb2.StatusCode.Value("OK"):
     raise RuntimeError("Metrics Manager returned a not OK response for register")
   Log.info("We registered ourselves to the Metrics Manager")
Example #39
def start_cluster(cl_args):
    '''
    Start a Heron standalone cluster
    '''
    roles = read_and_parse_roles(cl_args)
    masters = roles[SET.MASTERS]
    slaves = roles[SET.SLAVES]
    zookeepers = roles[SET.ZOOKEEPERS]
    Log.info("Roles:")
    Log.info(" - Master Servers: %s" % list(masters))
    Log.info(" - Slave Servers: %s" % list(slaves))
    Log.info(" - Zookeeper Servers: %s" % list(zookeepers))
    if not masters:
        Log.error("No master servers specified!")
        sys.exit(-1)
    if not slaves:
        Log.error("No slave servers specified!")
        sys.exit(-1)
    if not zookeepers:
        Log.error("No zookeeper servers specified!")
        sys.exit(-1)
    # make sure configs are templated
    update_zookeeper_config_files(cl_args)
    update_master_config_files(cl_args)
    update_slave_config_files(cl_args)

    dist_nodes = list(masters.union(slaves))
    # if just local deployment
    if not (len(dist_nodes) == 1 and is_self(dist_nodes[0])):
        distribute_package(roles, cl_args)
    start_master_nodes(masters, cl_args)
    start_slave_nodes(slaves, cl_args)
    start_api_server(masters, cl_args)
    Log.info("Heron standalone cluster complete!")
Example #40
class GatewayLooper(EventLooper):
    """A GatewayLooper, inheriting EventLooper

  It is a class wrapping Python's asyncore module (and selector) to dispatch events.
  This class can be used as a looper for an ``asyncore.dispatcher`` class, instead of calling
  ``asyncore.loop()``.

  As it is a subclass of EventLooper, it will execute in a while loop until
  the ``exit_loop()`` is called.

  In order to use this class, users first need to specify a socket map that maps from
  a file descriptor to ``asyncore.dispatcher`` class, using ``prepare_map()`` method.
  The GatewayLooper will dispatch ready events that are in the specified map.
  """
    def __init__(self, socket_map):
        """Initializes a GatewayLooper instance

    :param socket_map: socket map used for asyncore.dispatcher
    """
        super(GatewayLooper, self).__init__()
        self.sock_map = socket_map

        # Pipe used for wake up select
        self.pipe_r, self.pipe_w = os.pipe()

        self.started = time.time()
        Log.debug("Gateway Looper started time: " + str(time.asctime()))

    def do_wait(self):
        next_timeout = self._get_next_timeout_interval()
        if next_timeout > 0:
            self.poll(timeout=next_timeout)
        else:
            self.poll(timeout=0.0)

    def wake_up(self):
        os.write(self.pipe_w, b"\n")
        Log.debug("Wake up called")

    def on_exit(self):
        super(GatewayLooper, self).on_exit()
        os.close(self.pipe_r)
        os.close(self.pipe_w)

    # pylint: disable=too-many-branches
    def poll(self, timeout=0.0):
        """Modified version of poll() from asyncore module"""
        if self.sock_map is None:
            Log.warning("Socket map is not registered to Gateway Looper")
        readable_lst = []
        writable_lst = []
        error_lst = []

        if self.sock_map is not None:
            for fd, obj in self.sock_map.items():
                is_r = obj.readable()
                is_w = obj.writable()
                if is_r:
                    readable_lst.append(fd)
                if is_w and not obj.accepting:
                    writable_lst.append(fd)
                if is_r or is_w:
                    error_lst.append(fd)

        # Add wakeup fd
        readable_lst.append(self.pipe_r)

        Log.debug("Will select() with timeout: " + str(timeout) +
                  ", with map: " + str(self.sock_map))
        try:
            readable_lst, writable_lst, error_lst = \
              select.select(readable_lst, writable_lst, error_lst, timeout)
        except select.error as err:
            Log.debug("Trivial error: " + str(err))
            if err.args[0] != errno.EINTR:
                raise
            else:
                return
        Log.debug("Selected [r]: " + str(readable_lst) + " [w]: " +
                  str(writable_lst) + " [e]: " + str(error_lst))

        if self.pipe_r in readable_lst:
            Log.debug("Read from pipe")
            os.read(self.pipe_r, 1024)
            readable_lst.remove(self.pipe_r)

        if self.sock_map is not None:
            for fd in readable_lst:
                obj = self.sock_map.get(fd)
                if obj is None:
                    continue
                asyncore.read(obj)

            for fd in writable_lst:
                obj = self.sock_map.get(fd)
                if obj is None:
                    continue
                asyncore.write(obj)

            for fd in error_lst:
                obj = self.sock_map.get(fd)
                if obj is None:
                    continue
                # pylint: disable=W0212
                asyncore._exception(obj)
Example #41
 def on_exit(self):
     """Called when exiting"""
     Log.info("In on_exit() of event_looper")
     for task in self.exit_tasks:
         task()
Example #42
 def send_message(self, message):
     """Sends a message (REQID is zero)"""
     Log.debug(f"In send_message() of {self._get_classname()}")
     outgoing_pkt = OutgoingPacket.create_packet(REQID.generate_zero(),
                                                 message)
     self._send_packet(outgoing_pkt)
Example #43
 def on_topology_pplan(data):
   """watch physical plan"""
   Log.info("Watch triggered for topology pplan: " + topologyName)
   topology.set_physical_plan(data)
   if not data:
     Log.debug("No data to be set")
Example #44
 def handle_close(self):
   Log.info("%s: handle_close() called" % self._get_classname())
   self._handle_close()
   self.on_error()
Example #45
 def invoke_deactivate(self):
   Log.info("Spout is deactivated")
   self.spout_impl.deactivate()
   self.topology_state = topology_pb2.TopologyState.Value("PAUSED")
Example #46
 def handle_connect(self):
     Log.info(f"Connected to {self.hostname}:{self.port}")
     self._connecting = False
     self.on_connect(StatusCode.OK)
Example #47
    def getComponentException(self,
                              tmaster,
                              component_name,
                              instances=[],
                              callback=None):
        """
    Get all (last 1000) exceptions for 'component_name' of the topology.
    Returns an Array of exception logs on success.
    Returns json with message on failure.
    """
        if not tmaster or not tmaster.host or not tmaster.stats_port:
            return

        exception_request = tmaster_pb2.ExceptionLogRequest()
        exception_request.component_name = component_name
        if len(instances) > 0:
            exception_request.instances.extend(instances)
        request_str = exception_request.SerializeToString()
        port = str(tmaster.stats_port)
        host = tmaster.host
        url = "http://{0}:{1}/exceptions".format(host, port)
        request = tornado.httpclient.HTTPRequest(url,
                                                 body=request_str,
                                                 method='POST',
                                                 request_timeout=5)
        Log.debug('Making HTTP call to fetch exceptions url: %s', url)
        try:
            client = tornado.httpclient.AsyncHTTPClient()
            result = yield client.fetch(request)
            Log.debug("HTTP call complete.")
        except tornado.httpclient.HTTPError as e:
            raise Exception(str(e))

        # Check the response code - error if it is in 400s or 500s
        responseCode = result.code
        if responseCode >= 400:
            message = "Error in getting exceptions from Tmaster, code: " + responseCode
            Log.error(message)
            raise tornado.gen.Return({"message": message})

        # Parse the response from tmaster.
        exception_response = tmaster_pb2.ExceptionLogResponse()
        exception_response.ParseFromString(result.body)

        if exception_response.status.status == common_pb2.NOTOK:
            if exception_response.status.HasField("message"):
                raise tornado.gen.Return(
                    {"message": exception_response.status.message})

        # Send response
        ret = []
        for exception_log in exception_response.exceptions:
            ret.append({
                'hostname': exception_log.hostname,
                'instance_id': exception_log.instance_id,
                'stack_trace': exception_log.stacktrace,
                'lasttime': exception_log.lasttime,
                'firsttime': exception_log.firsttime,
                'count': str(exception_log.count),
                'logging': exception_log.logging
            })
        raise tornado.gen.Return(ret)
Example #48
 def invoke_activate(self):
   Log.info("Spout is activated")
   self.spout_impl.activate()
   self.topology_state = topology_pb2.TopologyState.Value("RUNNING")
Example #49
 def handle_connect(self):
   Log.info("Connected to %s:%d" % (self.hostname, self.port))
   self._connecting = False
   self.on_connect(StatusCode.OK)
Example #50
 def _invoke_fail(self, tuple_id, stream_id, fail_latency_ns):
   Log.debug("In invoke_fail(): Failing %s from stream: %s" % (str(tuple_id), stream_id))
   self.spout_impl.fail(tuple_id)
   self.pplan_helper.context.invoke_hook_spout_fail(tuple_id, fail_latency_ns)
   self.spout_metrics.failed_tuple(stream_id, fail_latency_ns)
Example #51
 def invoke_activate(self):
     Log.info("Activating Bolt")
     self.topology_state = topology_pb2.TopologyState.Value("RUNNING")
Example #52
 def _invoke_ack(self, tuple_id, stream_id, complete_latency_ns):
   Log.debug("In invoke_ack(): Acking %s from stream: %s" % (str(tuple_id), stream_id))
   self.spout_impl.ack(tuple_id)
   self.pplan_helper.context.invoke_hook_spout_ack(tuple_id, complete_latency_ns)
   self.spout_metrics.acked_tuple(stream_id, complete_latency_ns)
Example #53
def log_debug(message, ident=0):
    """log debugging info"""
    Log.debug(" " * (ident * 2) + str(message))
Example #54
def get_metrics_timeline(
    tmanager: tmanager_pb2.TManagerLocation,
    component_name: str,
    metric_names: List[str],
    instances: List[str],
    start_time: int,
    end_time: int,
    callback=None,
) -> dict:
    """
  Get the specified metrics for the given component name of this topology.
  Returns the following dict on success:
  {
    "timeline": {
      <metricname>: {
        <instance>: {
          <start_time> : <numeric value>,
          <start_time> : <numeric value>,
          ...
        }
        ...
      }, ...
    },
    "starttime": <numeric value>,
    "endtime": <numeric value>,
    "component": "..."
  }

  Returns the following dict on failure:
  {
    "message": "..."
  }
  """
    # Tmanager is the proto object and must have host and port for stats.
    if not tmanager or not tmanager.host or not tmanager.stats_port:
        raise Exception("No Tmanager found")

    host = tmanager.host
    port = tmanager.stats_port

    # Create the proto request object to get metrics.

    request_parameters = tmanager_pb2.MetricRequest()
    request_parameters.component_name = component_name

    # If no instances are given, metrics for all instances
    # are fetched by default.
    request_parameters.instance_id.extend(instances)
    request_parameters.metric.extend(metric_names)

    request_parameters.explicit_interval.start = start_time
    request_parameters.explicit_interval.end = end_time
    request_parameters.minutely = True

    # Form and send the http request.
    url = f"http://{host}:{port}/stats"
    request = tornado.httpclient.HTTPRequest(
        url,
        body=request_parameters.SerializeToString(),
        method='POST',
        request_timeout=5)

    Log.debug("Making HTTP call to fetch metrics")
    Log.debug("url: " + url)
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        result = yield client.fetch(request)
        Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
        raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    if result.code >= 400:
        message = f"Error in getting metrics from Tmanager, code: {result.code}"
        raise Exception(message)

    # Parse the response from tmanager.
    response_data = tmanager_pb2.MetricResponse()
    response_data.ParseFromString(result.body)

    if response_data.status.status == common_pb2.NOTOK:
        if response_data.status.HasField("message"):
            Log.warn("Received response from Tmanager: %s",
                     response_data.status.message)

    # Form the response.
    ret = {}
    ret["starttime"] = start_time
    ret["endtime"] = end_time
    ret["component"] = component_name
    ret["timeline"] = {}

    # Loop through all the metrics
    # One instance corresponds to one metric, which can have
    # multiple IndividualMetrics for each metricname requested.
    for metric in response_data.metric:
        instance = metric.instance_id

        # Loop through all individual metrics.
        for im in metric.metric:
            metricname = im.name
            if metricname not in ret["timeline"]:
                ret["timeline"][metricname] = {}
            if instance not in ret["timeline"][metricname]:
                ret["timeline"][metricname][instance] = {}

            # We get minutely metrics.
            # Interval-values correspond to the minutely mark for which
            # this metric value corresponds to.
            for interval_value in im.interval_values:
                ret["timeline"][metricname][instance][
                    interval_value.interval.start] = interval_value.value

    raise tornado.gen.Return(ret)
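Since this function yields on the HTTP fetch and returns via tornado.gen.Return, a caller must drive it on an IOLoop; a hedged sketch where the tmanager object, component name, metric names, and times are all illustrative:

import tornado.gen
import tornado.ioloop

# wrap the generator-style coroutine so the IOLoop can drive it
fetch_timeline = tornado.gen.coroutine(get_metrics_timeline)
timeline = tornado.ioloop.IOLoop.current().run_sync(
    lambda: fetch_timeline(tmanager, "word-spout", ["__emit-count/default"],
                           instances=[], start_time=1600000000,
                           end_time=1600000060))
print(timeline["component"], sorted(timeline["timeline"].keys()))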
Example #55
def start_api_server(masters, cl_args):
    '''
    Start the Heron API server
    '''
    # make sure nomad cluster is up
    single_master = list(masters)[0]

    for i in range(10):
        try:
            r = requests.get("http://%s:4646/v1/status/leader" % single_master)
            if r.status_code == 200:
                break
        except:
            Log.debug(sys.exc_info()[0])
        Log.info("Waiting for cluster to come up... %s" % i)
        time.sleep(1)


    cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
    Log.info("Starting Heron API Server on %s" % single_master)

    if not is_self(single_master):
        cmd = ssh_remote_execute(cmd, single_master, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
        Log.error("Failed to start apiserver on %s with error:\n%s" %
                  (single_master, output[1]))
        sys.exit(-1)

    Log.info("Done starting Heron API Server")
Example #56
 def on_topology_scheduler_location(data):
   """set scheduler location"""
   Log.info("Watch triggered for topology scheduler location: " + topologyName)
   topology.set_scheduler_location(data)
   if not data:
     Log.debug("No data to be set")
Example #57
 def on_topology_tmaster(data):
   """set tmaster"""
   Log.info("Watch triggered for topology tmaster: " + topologyName)
   topology.set_tmaster(data)
   if not data:
     Log.debug("No data to be set")
Example #58
 def on_topology_execution_state(data):
   """watch execution state"""
   Log.info("Watch triggered for topology execution state: " + topologyName)
   topology.set_execution_state(data)
   if not data:
     Log.debug("No data to be set")
Example #59
 def wake_up(self):
      os.write(self.pipe_w, b"\n")
     Log.debug("Wake up called")
Example #60
 def on_error(self):
     Log.error("Disconnected from Metrics Manager")
     self.on_connect(StatusCode.CONNECT_ERROR)