Example #1
0
    def _handle_assignment_message(self, pplan):
        """Process a newly arrived NewInstanceAssignmentMessage.

        Wraps ``pplan`` in a ``PhysicalPlanHelper`` and hands it to the Heron
        instance: as a new assignment when no helper is currently held, or as
        a state change when one is held and the component name and task id
        are unchanged. The new helper is stored on ``self._pplan_helper``.

        :param pplan: physical plan carried by the message
        :raises RuntimeError: when a helper is already held and the new plan
                              has a different component name or task id
        """
        Log.debug("In handle_assignment_message() of STStmgrClient, Physical Plan: \n%s", str(pplan))
        incoming = PhysicalPlanHelper(
            pplan, self.instance.instance_id, self.heron_instance_cls.topo_pex_file_abs_path
        )

        have_previous = self._pplan_helper is not None
        if have_previous:
            # A held helper must describe the same component/task as the new one
            if (self._pplan_helper.my_component_name != incoming.my_component_name
                    or self._pplan_helper.my_task_id != incoming.my_task_id):
                raise RuntimeError("Our Assignment has changed. We will die to pick it.")

        if not have_previous:
            Log.info("Received a new Physical Plan")
            Log.info("Push the new pplan_helper to Heron Instance")
            self.heron_instance_cls.handle_assignment_msg(incoming)
        else:
            Log.info("Received a new Physical Plan with the same assignment -- State Change")
            Log.info(
                "Old state: %s, new state: %s.",
                self._pplan_helper.get_topology_state(),
                incoming.get_topology_state(),
            )
            self.heron_instance_cls.handle_state_change_msg(incoming)

        self._pplan_helper = incoming
Example #2
0
  def poll(self, timeout=0.0):
    """Modified version of poll() from asyncore module

    Builds the read / write / error descriptor lists from ``self.sock_map``
    (when one is registered), always adds the wake-up pipe ``self.pipe_r`` to
    the read list, and waits on them with ``select.select()``.

    :param timeout: timeout in seconds handed to ``select.select()``
    :raises select.error: re-raised when ``select()`` fails with anything
                          other than ``EINTR``
    """
    if self.sock_map is None:
      Log.warning("Socket map is not registered to Gateway Looper")
    readable_lst = []
    writable_lst = []
    error_lst = []

    if self.sock_map is not None:
      for fd, obj in self.sock_map.items():
        is_r = obj.readable()
        is_w = obj.writable()
        if is_r:
          readable_lst.append(fd)
        # accepting sockets are never selected for writability
        if is_w and not obj.accepting:
          writable_lst.append(fd)
        if is_r or is_w:
          error_lst.append(fd)

    # Add wakeup fd
    readable_lst.append(self.pipe_r)

    Log.debug("Will select() with timeout: " + str(timeout) + ", with map: " + str(self.sock_map))
    try:
      readable_lst, writable_lst, error_lst = \
        select.select(readable_lst, writable_lst, error_lst, timeout)
    except select.error as err:
      # ``except X as e`` is valid on Python 2.6+ and 3; str(err) replaces
      # ``err.message``, which does not exist on Python 3 exceptions.
      Log.debug("Trivial error: " + str(err))
      if err.args[0] != errno.EINTR:
        raise
      else:
        # interrupted by a signal: nothing to do for this round
        return
Example #3
0
  def get(self):
    """Fetch a slice of a container file and write it back as the response.

    Reads the cluster/role/environ/topology, container, path, offset and
    length request arguments, looks up the container's stream manager in the
    topology's physical plan, requests its ``/filedata`` endpoint and writes
    the decoded JSON body as a success response. Any exception is logged at
    debug level and written as an error response.
    """
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      topology_name = self.get_argument_topology()
      container_id = self.get_argument(constants.PARAM_CONTAINER)
      file_path = self.get_argument(constants.PARAM_PATH)
      offset = self.get_argument_offset()
      length = self.get_argument_length()
      info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)

      # Stream managers are keyed as "stmgr-<container>" in the physical plan
      stmgr_entry = info["physical_plan"]["stmgrs"]["stmgr-" + container_id]
      url_parts = (stmgr_entry["host"], stmgr_entry["shell_port"], file_path, offset, length)
      file_data_url = "http://%s:%d/filedata/%s?offset=%s&length=%s" % url_parts

      client = tornado.httpclient.AsyncHTTPClient()
      fetched = yield client.fetch(file_data_url)
      payload = json.loads(fetched.body)
      self.write_success_response(payload)
      self.finish()
    except Exception as e:
      Log.debug(traceback.format_exc())
      self.write_error_response(e)
Example #4
0
def start_heron_tools(masters, cl_args):
  '''
  Start Heron tracker and UI

  Waits for the first master to come up, runs the heron-tools nomad job
  (over ssh when that master is not this host) and then waits for the
  "heron-tools" job to start. Exits the process with status -1 when the
  launch command returns a non-zero code.

  :param masters: collection of master hosts; the first one is used
  :param cl_args: parsed command line arguments
  '''
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/heron_tools_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_heron_tools_job_file(cl_args))
  Log.info("Starting Heron Tools on %s" % single_master)

  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  # communicate() drains both pipes while waiting for exit; calling wait()
  # first can deadlock once the child fills an OS pipe buffer.
  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start Heron Tools on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "heron-tools")
  Log.info("Done starting Heron Tools")
Example #5
0
 def _send_metrics_messages(self):
   """Send every queued metric message, but only while connected."""
   if not self.connected:
     return
   while True:
     if self.out_queue.is_empty():
       break
     outgoing = self.out_queue.poll()
     assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
     Log.debug("Sending metric message: %s" % str(outgoing))
     self.send_message(outgoing)
Example #6
0
  def register_watch(self, callback):
    """
    Register 'callback' as a watch and return the UUID
    it was stored under in 'self.watches'. That UUID
    identifies the watch afterwards.
    Returns None if the watch could not be registered.

    'callback' is invoked once with this object at
    registration time. If that first call raises any
    Exception, the error is logged and the watch is
    not stored.
    """
    attempts_left = 5
    # A freshly generated UUID colliding with a stored
    # one is unlikely, but retry a few times anyway.
    while attempts_left > 0:
      attempts_left -= 1
      candidate = uuid.uuid4()
      if candidate in self.watches:
        continue
      Log.info("Registering a watch with uid: " + str(candidate))
      try:
        callback(self)
      except Exception as e:
        Log.error("Caught exception while triggering callback: " + str(e))
        Log.debug(traceback.format_exc())
        return None
      self.watches[candidate] = callback
      return candidate
    return None
Example #7
0
def get_jobs(cl_args, nomad_addr):
  """Return the decoded JSON body of ``/v1/jobs`` on ``nomad_addr`` (port 4646).

  Logs an error and exits the process with status -1 on any non-200 reply.
  ``cl_args`` is not used by this function.
  """
  jobs_url = "http://%s:4646/v1/jobs" % nomad_addr
  response = requests.get(jobs_url)
  request_ok = response.status_code == 200
  if not request_ok:
    Log.error("Failed to get list of jobs")
    Log.debug("Response: %s" % response)
    sys.exit(-1)
  return response.json()
Example #8
0
def start_slave_nodes(slaves, cl_args):
  '''
  Start slave nodes

  Launches the nomad agent command for every slave (over ssh when the slave
  is not this host), then collects each command's result. All failures are
  logged before the process exits with status -1.

  :param slaves: iterable of slave hosts
  :param cl_args: parsed command line arguments
  '''
  pids = []
  for slave in slaves:
    Log.info("Starting slave on %s" % slave)
    cmd = "%s agent -config %s >> /tmp/nomad_client.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_nomad_slave_config_file(cl_args))
    if not is_self(slave):
      cmd = ssh_remote_execute(cmd, slave, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    pids.append({"pid": pid, "dest": slave})

  errors = []
  for entry in pids:
    pid = entry["pid"]
    # communicate() drains both pipes while waiting for exit; calling wait()
    # first can deadlock once the child fills an OS pipe buffer.
    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
      errors.append("Failed to start slave on %s with error:\n%s" % (entry["dest"], output[1]))

  if errors:
    for error in errors:
      Log.error(error)
    sys.exit(-1)

  Log.info("Done starting slaves")
Example #9
0
  def get(self):
    """Stream a container file to the client as an attachment.

    Looks up the container's stream manager in the topology's physical plan,
    sets a ``Content-Disposition`` header named after the last segment of the
    ``path`` argument, and relays every chunk fetched from the stream
    manager's ``/download`` endpoint. Any exception is logged at debug level
    and written as an error response.
    """
    try:
      cluster = self.get_argument_cluster()
      role = self.get_argument_role()
      environ = self.get_argument_environ()
      topology_name = self.get_argument_topology()
      container_id = self.get_argument(constants.PARAM_CONTAINER)
      requested_path = self.get_argument(constants.PARAM_PATH)
      info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)

      # Stream managers are keyed as "stmgr-<container>" in the physical plan
      stmgr_entry = info["physical_plan"]["stmgrs"]["stmgr-" + container_id]
      target = (stmgr_entry["host"], stmgr_entry["shell_port"], requested_path)
      file_download_url = "http://%s:%d/download/%s" % target
      Log.debug("download file url: %s", file_download_url)

      requested_path = self.get_argument("path")
      filename = requested_path.split("/")[-1]
      self.set_header("Content-Disposition", "attachment; filename=%s" % filename)

      def relay_chunk(chunk):
        # push each chunk to the client as soon as it arrives
        self.write(chunk)
        self.flush()

      client = tornado.httpclient.AsyncHTTPClient()
      yield client.fetch(file_download_url, streaming_callback=relay_chunk)
      self.finish()
    except Exception as e:
      Log.debug(traceback.format_exc())
      self.write_error_response(e)
Example #10
0
 def on_connect(self, status):
   """Handle the outcome of a connection attempt to the Metrics Manager.

   On a non-OK status, logs the error and schedules ``start_connect`` again
   after the configured reconnect interval. Otherwise sends the register
   request.

   :param status: connection status, compared against ``StatusCode.OK``
   """
   Log.debug("In on_connect of MetricsManagerClient")
   if status != StatusCode.OK:
     Log.error("Error connecting to Metrics Manager with status: %s" % str(status))
     retry_interval = float(self.sys_config[constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC])
     self.looper.register_timer_task_in_sec(self.start_connect, retry_interval)
     # Not connected: do not fall through and send a register request
     return
   self._send_register_req()
Example #11
0
def start_api_server(masters, cl_args):
  '''
  Start the Heron API server

  Waits for the first master to come up, runs the apiserver nomad job (over
  ssh when that master is not this host) and then waits for the "apiserver"
  job to start. Exits the process with status -1 when the launch command
  returns a non-zero code.

  :param masters: collection of master hosts; the first one is used
  :param cl_args: parsed command line arguments
  '''
  # make sure nomad cluster is up
  single_master = list(masters)[0]
  wait_for_master_to_start(single_master)

  cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
        % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
  Log.info("Starting Heron API Server on %s" % single_master)

  if not is_self(single_master):
    cmd = ssh_remote_execute(cmd, single_master, cl_args)
  Log.debug(cmd)
  pid = subprocess.Popen(cmd,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

  # communicate() drains both pipes while waiting for exit; calling wait()
  # first can deadlock once the child fills an OS pipe buffer.
  output = pid.communicate()
  return_code = pid.returncode
  Log.debug("return code: %s output: %s" % (return_code, output))
  if return_code != 0:
    Log.error("Failed to start apiserver on %s with error:\n%s" % (single_master, output[1]))
    sys.exit(-1)

  wait_for_job_to_start(single_master, "apiserver")
  Log.info("Done starting Heron API Server")
Example #12
0
  def getComponentException(self, tmaster, component_name, instances=[], callback=None):
    """
    Get all (last 1000) exceptions for 'component_name' of the topology.
    Returns an Array of exception logs on success.
    Returns json with message on failure.

    Results are delivered through ``tornado.gen.Return``. Nothing is
    requested when 'tmaster' is missing or has no host / stats port.
    'instances' optionally restricts the request to those instances (the
    default list is never mutated here); 'callback' is not used by this body.
    An HTTP client error is re-raised as a plain Exception carrying its text.
    """
    if not tmaster or not tmaster.host or not tmaster.stats_port:
      return

    exception_request = tmaster_pb2.ExceptionLogRequest()
    exception_request.component_name = component_name
    if instances:
      exception_request.instances.extend(instances)
    request_str = exception_request.SerializeToString()
    port = str(tmaster.stats_port)
    host = tmaster.host
    url = "http://{0}:{1}/exceptions".format(host, port)
    request = tornado.httpclient.HTTPRequest(url,
                                             body=request_str,
                                             method='POST',
                                             request_timeout=5)
    Log.debug('Making HTTP call to fetch exceptions url: %s', url)
    try:
      client = tornado.httpclient.AsyncHTTPClient()
      result = yield client.fetch(request)
      Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
      raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
      # result.code is an int: convert it before concatenating, otherwise
      # building the message itself raises TypeError
      message = "Error in getting exceptions from Tmaster, code: " + str(responseCode)
      Log.error(message)
      raise tornado.gen.Return({
          "message": message
      })

    # Parse the response from tmaster.
    exception_response = tmaster_pb2.ExceptionLogResponse()
    exception_response.ParseFromString(result.body)

    if exception_response.status.status == common_pb2.NOTOK:
      if exception_response.status.HasField("message"):
        raise tornado.gen.Return({
            "message": exception_response.status.message
        })

    # Send response
    ret = []
    for exception_log in exception_response.exceptions:
      ret.append({'hostname': exception_log.hostname,
                  'instance_id': exception_log.instance_id,
                  'stack_trace': exception_log.stacktrace,
                  'lasttime': exception_log.lasttime,
                  'firsttime': exception_log.firsttime,
                  'count': str(exception_log.count),
                  'logging': exception_log.logging})
    raise tornado.gen.Return(ret)
Example #13
0
def get_logical_plan(cluster, env, topology, role):
  """Blocking wrapper around ``API.get_logical_plan``.

  Runs the call to completion on the tornado IOLoop instance and returns its
  result. Exceptions are logged at debug level and re-raised.
  """
  loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_logical_plan(cluster, env, topology, role)

  try:
    outcome = loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return outcome
Example #14
0
def get_cluster_topologies(cluster):
  """Blocking wrapper around ``API.get_cluster_topologies``.

  Runs the call to completion on the tornado IOLoop instance and returns its
  result. Exceptions are logged at debug level and re-raised.
  """
  loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_cluster_topologies(cluster)

  try:
    outcome = loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return outcome
Example #15
0
def get_cluster_role_env_topologies(cluster, role, env):
  """Blocking wrapper around ``API.get_cluster_role_env_topologies``.

  Runs the call to completion on the tornado IOLoop instance and returns its
  result. Exceptions are logged at debug level and re-raised.
  """
  loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_cluster_role_env_topologies(cluster, role, env)

  try:
    outcome = loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return outcome
Example #16
0
def get_topology_metrics(*args):
  """Blocking wrapper around ``API.get_comp_metrics``.

  Forwards ``args`` unchanged, runs the call to completion on the tornado
  IOLoop instance and returns its result. Exceptions are logged at debug
  level and re-raised.
  """
  loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_comp_metrics(*args)

  try:
    outcome = loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return outcome
Example #17
0
 def set_topology_context(self, metrics_collector):
   """Build a new ``TopologyContext`` and store it on ``self.context``.

   The topology config is updated with this component's own config before
   being handed to the context together with the topology, the
   task-to-component map, this task's id and ``metrics_collector``.
   """
   Log.debug("Setting topology context")
   merged_config = self.get_topology_config()
   component_overrides = self._get_dict_from_config(self.my_component.config)
   merged_config.update(component_overrides)
   task_map = self._get_task_to_comp_map()
   self.context = TopologyContext(
       merged_config, self.pplan.topology, task_map, self.my_task_id, metrics_collector)
Example #18
0
  def register_on_message(self, msg_builder):
    """Remember ``msg_builder`` under the full name of the message it builds

    The builder is called once to learn ``DESCRIPTOR.full_name`` of the
    message it produces; that name is the key in ``registered_message_map``.

    :param msg_builder: callable to create a protobuf message that this client wants to receive
    """
    full_name = msg_builder().DESCRIPTOR.full_name
    Log.debug("In register_on_message(): %s" % full_name)
    self.registered_message_map[full_name] = msg_builder
Example #19
0
def get_clusters():
  """Blocking wrapper around ``API.get_clusters``.

  Runs the call to completion on the tornado IOLoop instance and returns its
  result. Exceptions are logged at debug level and re-raised.
  """
  loop = tornado.ioloop.IOLoop.instance()

  def fetch():
    return API.get_clusters()

  try:
    outcome = loop.run_sync(fetch)
  except Exception:
    Log.debug(traceback.format_exc())
    raise
  return outcome
Example #20
0
 def on_connect(self, status):
   """Handle the outcome of a connection attempt to the Stream Manager.

   On a non-OK status, logs the error and schedules ``start_connect`` again
   after the configured reconnect interval; otherwise registers the message
   handlers and sends the register request.
   """
   Log.debug("In on_connect of STStmgrClient")
   if status == StatusCode.OK:
     self._register_msg_to_handle()
     self._send_register_req()
     return
   Log.error("Error connecting to Stream Manager with status: %s", str(status))
   delay_sec = float(self.sys_config[constants.INSTANCE_RECONNECT_STREAMMGR_INTERVAL_SEC])
   self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
Example #21
0
 def _send_metrics_messages(self):
   """Send every queued metric message while connected, updating gateway metrics.

   After each send, the message's byte size and its metric / exception counts
   are reported to ``self.gateway_metrics``.
   """
   if not self.connected:
     return
   while True:
     if self.out_queue.is_empty():
       break
     outgoing = self.out_queue.poll()
     assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
     Log.debug("Sending metric message: %s" % str(outgoing))
     self.send_message(outgoing)
     sent_bytes = outgoing.ByteSize()
     self.gateway_metrics.update_sent_metrics_size(sent_bytes)
     self.gateway_metrics.update_sent_metrics(len(outgoing.metrics), len(outgoing.exceptions))
Example #22
0
 def on_response(self, status, context, response):
   """Dispatch a response received from the Metrics Manager.

   A ``MetricPublisherRegisterResponse`` goes to ``_handle_register_response``.

   :raises RuntimeError: when ``status`` is not OK, or the response is of
                         any other kind
   """
   Log.debug("In on_response with status: %s, with context: %s" % (str(status), str(context)))
   if status != StatusCode.OK:
     raise RuntimeError("Response from Metrics Manager not OK")
   if not isinstance(response, metrics_pb2.MetricPublisherRegisterResponse):
     Log.error("Unknown kind of response received: %s" % response.DESCRIPTOR.full_name)
     raise RuntimeError("Unknown kind of response received from Metrics Manager")
   self._handle_register_response(response)
Example #23
0
def heron_pex(topology_pex, topology_class_name, tmp_dir):
  """Load ``topology_class_name`` from ``topology_pex`` and call its ``write(tmp_dir)``.

  :raises RuntimeError: when loading, importing or writing fails; the
                        original traceback is printed first
  """
  Log.debug("Importing %s from %s" % (topology_class_name, topology_pex))
  try:
    pex_loader.load_pex(topology_pex)
    loaded_cls = pex_loader.import_and_get_class(topology_pex, topology_class_name)
    loaded_cls.write(tmp_dir)
  except Exception:
    traceback.print_exc()
    raise RuntimeError("Topology pex failed to be loaded. Bailing out...")
Example #24
0
def run(command, parser, cl_args, unknown_args):
  '''
  Delegate the command to ``cli_helper.run`` with the "activate topology" action.

  :param command: forwarded to ``cli_helper.run``
  :param parser: not used by this function
  :param cl_args: parsed arguments; logged and forwarded to ``cli_helper.run``
  :param unknown_args: not used by this function
  :return: whatever ``cli_helper.run`` returns
  '''
  Log.debug("Activate Args: %s", cl_args)
  action = "activate topology"
  return cli_helper.run(command, cl_args, action)
Example #25
0
 def get_positional_args(self):
   """Map ``'--<dest>'`` to ``<dest>`` for every positional store action.

   An action counts as positional when it is an ``argparse._StoreAction``
   with no option strings. The ``'cluster/[role]/[env]'`` positional is
   left out of the map.
   """
   positional_args_map = collections.defaultdict(dict)
   for action in self._actions:
     # pylint: disable=protected-access
     is_positional = isinstance(action, argparse._StoreAction) and not action.option_strings
     if not is_positional:
       continue
     if action.dest == 'cluster/[role]/[env]':
       continue
     positional_args_map['--'+action.dest] = action.dest
     Log.debug("get_positional_args : key: %s, dest : %s", action, action.dest)
   return positional_args_map
Example #26
0
def get_component_metrics(component, cluster, env, topology, role):
  """Return the ``"metrics"`` entry of the topology metrics for ``component``.

  Uses every query from ``metric_queries()`` with an empty instance list and
  the ``[0, -1]`` time range. Exceptions are logged at debug level and
  re-raised.
  """
  queries = metric_queries()
  try:
    response = get_topology_metrics(
        cluster, env, topology, component, [], queries, [0, -1], role)
    return response["metrics"]
  except Exception:
    Log.debug(traceback.format_exc())
    raise
Example #27
0
  def _handle_register_response(self, response):
    """Handle an arriving register response (RegisterInstanceResponse)

    A physical plan carried by the response is passed on to
    ``_handle_assignment_message``.

    :raises RuntimeError: when the response status is not OK
    """
    received_status = response.status.status
    if received_status != common_pb2.StatusCode.Value("OK"):
      raise RuntimeError("Stream Manager returned a not OK response for register")
    Log.info("We registered ourselves to the Stream Manager")

    if not response.HasField("pplan"):
      Log.debug("Received a register response with no pplan")
      return
    Log.info("Handling assignment message from response")
    self._handle_assignment_message(response.pplan)
Example #28
0
  def start_connect(self):
    """Create an IPv4 stream socket and connect it to ``self.endpoint``

    ``loop()`` method needs to be called after this.
    """
    Log.debug("In start_connect() of %s" % self._get_classname())
    # TODO: specify buffer size, exception handling
    family, kind = socket.AF_INET, socket.SOCK_STREAM
    self.create_socket(family, kind)

    # when ready, handle_connect is called
    self.connect(self.endpoint)
Example #29
0
  def _gather_metrics(self, time_bucket_in_sec):
    """Publish one message with all metrics registered for this time bucket.

    Does nothing for an unknown bucket. Otherwise gathers each registered
    metric into a single publish message, offers it to ``self.out_metrics``
    and re-registers the timer task for the same bucket.
    """
    if time_bucket_in_sec not in self.time_bucket_in_sec_to_metrics_name:
      return

    publish_msg = metrics_pb2.MetricPublisherPublishMessage()
    for metric_name in self.time_bucket_in_sec_to_metrics_name[time_bucket_in_sec]:
      Log.debug("Will call gather_one_metric with %s", metric_name)
      self._gather_one_metric(metric_name, publish_msg)

    assert publish_msg.IsInitialized()
    self.out_metrics.offer(publish_msg)

    # schedule ourselves again
    self._register_timer_task(time_bucket_in_sec)
Example #30
0
 def get(self):
   """Write the tracker's info for the requested topology as the response.

   Any exception is logged at debug level and written as an error response.
   """
   try:
     cluster_arg = self.get_argument_cluster()
     role_arg = self.get_argument_role()
     environ_arg = self.get_argument_environ()
     topology_arg = self.get_argument_topology()
     info = self.tracker.getTopologyInfo(topology_arg, cluster_arg, role_arg, environ_arg)
     self.write_success_response(info)
   except Exception as e:
     Log.debug(traceback.format_exc())
     self.write_error_response(e)
Example #31
0
 def on_connect(self, status):
     """Handle the outcome of a connection attempt to the Stream Manager.

     On a non-OK status, logs the error and schedules ``start_connect`` again
     after the configured reconnect interval; otherwise registers the message
     handlers and sends the register request.
     """
     Log.debug("In on_connect of STStmgrClient")
     if status == StatusCode.OK:
         self._register_msg_to_handle()
         self._send_register_req()
         return
     Log.error("Error connecting to Stream Manager with status: %s" % str(status))
     interval_key = constants.INSTANCE_RECONNECT_STREAMMGR_INTERVAL_SEC
     delay_sec = float(self.sys_config[interval_key])
     self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
Example #32
0
    def _handle_register_response(self, response):
        """Handle an arriving register response (RegisterInstanceResponse)

        A physical plan carried by the response is passed on to
        ``_handle_assignment_message``.

        :raises RuntimeError: when the response status is not OK
        """
        received_status = response.status.status
        if received_status != common_pb2.StatusCode.Value("OK"):
            raise RuntimeError("Stream Manager returned a not OK response for register")
        Log.info("We registered ourselves to the Stream Manager")

        if not response.HasField("pplan"):
            Log.debug("Received a register response with no pplan")
            return
        Log.info("Handling assignment message from response")
        self._handle_assignment_message(response.pplan)
Example #33
0
    def start_connect(self):
        """Create an IPv4 stream socket and connect it to ``self.endpoint``

        Sets ``self._connecting`` to True just before connecting.
        ``loop()`` method needs to be called after this.
        """
        Log.debug("In start_connect() of %s" % self._get_classname())
        # TODO: specify buffer size, exception handling
        family, kind = socket.AF_INET, socket.SOCK_STREAM
        self.create_socket(family, kind)

        # when ready, handle_connect is called
        self._connecting = True
        self.connect(self.endpoint)
  def _gather_metrics(self, time_bucket_in_sec):
    """Publish one message with all metrics registered for this time bucket.

    Does nothing for an unknown bucket. Otherwise gathers each registered
    metric into a single publish message, offers it to ``self.out_metrics``
    and re-registers the timer task for the same bucket.
    """
    if time_bucket_in_sec not in self.time_bucket_in_sec_to_metrics_name:
      return

    publish_msg = metrics_pb2.MetricPublisherPublishMessage()
    for metric_name in self.time_bucket_in_sec_to_metrics_name[time_bucket_in_sec]:
      Log.debug("Will call gather_one_metric with %s", metric_name)
      self._gather_one_metric(metric_name, publish_msg)

    assert publish_msg.IsInitialized()
    self.out_metrics.offer(publish_msg)

    # schedule ourselves again
    self._register_timer_task(time_bucket_in_sec)
Example #35
0
 def on_response(self, status, context, response):
     """Dispatch a response received from the Metrics Manager.

     A ``MetricPublisherRegisterResponse`` goes to
     ``_handle_register_response``.

     :raises RuntimeError: when ``status`` is not OK, or the response is of
                           any other kind
     """
     Log.debug("In on_response with status: %s, with context: %s" %
               (str(status), str(context)))
     if status != StatusCode.OK:
         raise RuntimeError("Response from Metrics Manager not OK")
     if not isinstance(response, metrics_pb2.MetricPublisherRegisterResponse):
         Log.error("Unknown kind of response received: %s" %
                   response.DESCRIPTOR.full_name)
         raise RuntimeError(
             "Unknown kind of response received from Metrics Manager")
     self._handle_register_response(response)
Example #36
0
 def _send_metrics_messages(self):
     """Send every queued metric message while connected, updating gateway metrics.

     After each send, the message's byte size and its metric / exception
     counts are reported to ``self.gateway_metrics``.
     """
     if not self.connected:
         return
     while True:
         if self.out_queue.is_empty():
             break
         outgoing = self.out_queue.poll()
         assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
         Log.debug("Sending metric message: %s" % str(outgoing))
         self.send_message(outgoing)
         sent_bytes = outgoing.ByteSize()
         self.gateway_metrics.update_sent_metrics_size(sent_bytes)
         self.gateway_metrics.update_sent_metrics(
             len(outgoing.metrics), len(outgoing.exceptions))
Example #37
0
 def get(self):
   """Write the tracker's info for the requested topology as the response.

   Any exception is logged at debug level and written as an error response.
   """
   try:
     cluster_arg = self.get_argument_cluster()
     role_arg = self.get_argument_role()
     environ_arg = self.get_argument_environ()
     topology_arg = self.get_argument_topology()
     info = self.tracker.getTopologyInfo(topology_arg, cluster_arg, role_arg, environ_arg)
     self.write_success_response(info)
   except Exception as e:
     Log.debug(traceback.format_exc())
     self.write_error_response(e)
Example #38
0
 def get_positional_args(self):
     """Map ``'--<dest>'`` to ``<dest>`` for every positional store action.

     An action counts as positional when it is an ``argparse._StoreAction``
     with no option strings. The ``'cluster/[role]/[env]'`` positional is
     left out of the map.
     """
     positional_args_map = collections.defaultdict(dict)
     for action in self._actions:
         # pylint: disable=protected-access
         is_positional = isinstance(action, argparse._StoreAction) and not action.option_strings
         if not is_positional:
             continue
         if action.dest == 'cluster/[role]/[env]':
             continue
         positional_args_map['--' + action.dest] = action.dest
         Log.debug("get_positional_args : key: %s, dest : %s", action,
                   action.dest)
     return positional_args_map
Example #39
0
 def on_connect(self, status):
     """Handle the outcome of a connection attempt to the Metrics Manager.

     On a non-OK status, logs the error and schedules ``start_connect`` again
     after the configured reconnect interval; otherwise sends the register
     request.
     """
     Log.debug("In on_connect of MetricsManagerClient")
     if status == StatusCode.OK:
         self._send_register_req()
         return
     Log.error(
         f"Error connecting to Metrics Manager with status: {str(status)}"
     )
     interval_key = constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC
     delay_sec = float(self.sys_config[interval_key])
     self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
Example #40
0
def heron_class(class_name,
                lib_jars,
                extra_jars=None,
                args=None,
                java_defines=None):
    '''
    Launch a heron Java class in a subprocess and wrap it in a ProcessResult.

    :param class_name: Java class to run
    :param lib_jars: jars placed on the class path after ``extra_jars``
    :param extra_jars: additional jars for the class path (default: none)
    :param args: arguments passed to the Java class (default: none)
    :param java_defines: values passed to java as ``-D<value>`` (default: none)
    :return: ``ProcessResult`` wrapping the started process
    '''
    # Missing optional collections behave as empty lists
    extra_jars = [] if extra_jars is None else extra_jars
    args = [] if args is None else args
    java_defines = [] if java_defines is None else java_defines

    # Every java define becomes a -D option on the command line
    define_flags = ['-D' + define for define in java_defines]

    # The command is handed to Popen as a list, so each option (java opts
    # included) has to be its own list element.
    command = [config.get_java_path(), "-client", "-Xmx1g"]
    command.extend(define_flags)
    command.extend(["-cp", config.get_classpath(extra_jars + lib_jars)])
    command.append(class_name)
    command.extend(args)

    # The child inherits this environment plus HERON_OPTIONS
    child_env = os.environ.copy()
    child_env['HERON_OPTIONS'] = opts.get_heron_config()

    Log.debug("Invoking class using command: ``%s''", ' '.join(command))
    Log.debug("Heron options: {%s}", str(child_env["HERON_OPTIONS"]))

    # stdout carries what the Java program sends back; stderr carries extra
    # information such as debugging messages.
    process = subprocess.Popen(command,
                               env=child_env,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               bufsize=1)
    return ProcessResult(process)
Example #41
0
  def __init__(self, socket_map):
    """Set up a GatewayLooper

    Stores the socket map, opens the pipe used to wake up select, and
    records the start time.

    :param socket_map: socket map used for asyncore.dispatcher
    """
    super().__init__()
    self.sock_map = socket_map

    # Pipe used for wake up select
    read_end, write_end = os.pipe()
    self.pipe_r = read_end
    self.pipe_w = write_end

    self.started = time.time()
    Log.debug("Gateway Looper started time: " + str(time.asctime()))
Example #42
0
def update_config_files(cl_args):
    """Render every templated config file from the parsed roles.

    Reads the roles via ``read_and_parse_roles`` and passes the master and
    zookeeper host lists to the individual ``template_*`` functions.
    """
    Log.info("Updating config files...")
    roles = read_and_parse_roles(cl_args)
    Log.debug("roles: %s" % roles)
    master_nodes = list(roles[Role.MASTERS])
    zookeeper_nodes = list(roles[Role.ZOOKEEPERS])

    # Templates that only need the master list
    for render in (template_slave_hcl, template_scheduler_yaml, template_uploader_yaml):
        render(cl_args, master_nodes)

    template_apiserver_hcl(cl_args, master_nodes, zookeeper_nodes)
    template_statemgr_yaml(cl_args, zookeeper_nodes)
    template_heron_tools_hcl(cl_args, master_nodes, zookeeper_nodes)
Example #43
0
def update_slave_config_files(cl_args):
    '''
    Render the uploader and apiserver config files from their templates.

    Returns without writing anything when no slaves are configured. The first
    slave is used as the uploader URI host and as the apiserver host.
    '''

    roles = read_and_parse_roles(cl_args)
    slaves = list(roles[SET.SLAVES])
    if not slaves:
        return
    Log.debug("Templating files for slaves...")

    # update apiserver location

    first_slave = slaves[0]
    uploader_template_path = "%s/standalone/templates/uploader.template.yaml" \
                             % cl_args["config_path"]
    with open(uploader_template_path, 'r') as tf:
        template_text = tf.read()
        upload_uri = "http://%s:9000/api/v1/file/upload" % first_slave
        rendered = template_text.replace("<http_uploader_uri>", upload_uri)

    uploader_output_path = "%s/standalone/uploader.yaml" % cl_args["config_path"]
    with open(uploader_output_path, 'w') as tf:
        tf.write(rendered)
        tf.truncate()

    # Api server nomad job def
    apiserver_template_path = "%s/standalone/templates/apiserver.template.hcl" \
                              % cl_args["config_path"]
    with open(apiserver_template_path, 'r') as tf:
        template_text = tf.read()
        quoted_hostname = '"%s"' % get_hostname(first_slave, cl_args)
        rendered = template_text.replace("<heron_apiserver_hostname>", quoted_hostname)
        # The executable lives in the local bin dir on this host, and under
        # the remote home's .heron/bin otherwise.
        if is_self(first_slave):
            quoted_executable = '"%s/heron-apiserver"' % config.get_heron_bin_dir()
        else:
            quoted_executable = '"%s/.heron/bin/heron-apiserver"' % \
                get_remote_home(first_slave, cl_args)
        rendered = rendered.replace("<heron_apiserver_executable>", quoted_executable)

    apiserver_output_path = "%s/standalone/resources/apiserver.hcl" % cl_args["config_path"]
    with open(apiserver_output_path, 'w') as tf:
        tf.write(rendered)
        tf.truncate()
Example #44
0
def import_and_get_class(path_to_pex, python_class_name):
    """Imports and load a class from a given pex file path and python class name

    For example, if you want to get a class called `Sample` in
    /some-path/sample.pex/heron/examples/src/python/sample.py,
    ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
    ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``

    Names starting with ``heron.`` are first resolved through
    ``resolve_heron_suffix_issue``; if that fails, the error is logged and
    the regular absolute import is tried.

    :param path_to_pex: path to the pex file (made absolute for the resolver)
    :param python_class_name: fully qualified, dot-separated class name
    :return: the attribute named by the last component of ``python_class_name``
    """
    abs_path_to_pex = os.path.abspath(path_to_pex)

    Log.debug(f"Add a pex to the path: {abs_path_to_pex}")
    Log.debug(f"In import_and_get_class with cls_name: {python_class_name}")
    # Split "pkg.module.Class" into the module path and the class name
    from_path, _, import_name = python_class_name.rpartition('.')

    Log.debug(f"From path: {from_path}, import name: {import_name}")

    # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
    if python_class_name.startswith("heron."):
        try:
            mod = resolve_heron_suffix_issue(abs_path_to_pex,
                                             python_class_name)
            return getattr(mod, import_name)
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit are not swallowed by this fallback.
            Log.error(
                f"Could not resolve class {python_class_name} with special handling"
            )

    mod = __import__(from_path, fromlist=[import_name], level=0)
    Log.debug(f"Imported module: {str(mod)}")
    return getattr(mod, import_name)
Example #45
0
def distribute_package(roles, cl_args):
  '''
  distribute Heron packages to all nodes

  Packs ``cl_args["heron_dir"]`` into a temporary tar file and passes it to
  ``scp_package`` for the union of master and slave nodes.

  :param roles: mapping holding the ``Role.MASTERS`` and ``Role.SLAVES`` sets
  :param cl_args: parsed command line arguments
  '''
  Log.info("Distributing heron package to nodes (this might take a while)...")
  masters = roles[Role.MASTERS]
  slaves = roles[Role.SLAVES]

  # delete=False keeps the reserved file on disk. Taking ``.name`` from a
  # throwaway NamedTemporaryFile removes the file as soon as the object is
  # collected, so the path could be claimed by someone else before it is used.
  with tempfile.NamedTemporaryFile(suffix=".tmp", delete=False) as reserved:
    tar_file = reserved.name
  Log.debug("TAR file %s to %s" % (cl_args["heron_dir"], tar_file))
  make_tarfile(tar_file, cl_args["heron_dir"])
  dist_nodes = masters.union(slaves)

  scp_package(tar_file, dist_nodes, cl_args)
Example #46
0
  def __init__(self, looper, metrics_host, port, instance,
               out_metrics, in_stream, out_stream, sock_map, socket_options,
               gateway_metrics, sys_config):
    """Initialize the client and register its metrics tasks.

    Connection parameters go to ``HeronClient.__init__``; the rest is stored
    on the instance (``out_metrics`` as ``self.out_queue``) before the
    metrics client tasks and the in/out stream metrics tasks are set up.
    """
    HeronClient.__init__(self, looper, metrics_host, port, sock_map, socket_options)
    self.instance, self.out_queue = instance, out_metrics
    self.in_stream, self.out_stream = in_stream, out_stream
    self.gateway_metrics, self.sys_config = gateway_metrics, sys_config

    self._add_metrics_client_tasks()
    Log.debug('start updating in and out stream metrics')
    self._update_in_out_stream_metrics_tasks()
Example #47
0
def import_and_get_class(path_to_pex, python_class_name):
    """Import a module by its dotted path and return the named class from it.

    For example, if you want to get a class called `Sample` in
    /some-path/sample.pex/heron/examples/src/python/sample.py,
    ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
    ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``

    This function only logs the absolute pex path; it does not touch
    ``sys.path`` itself.

    :param path_to_pex: path to the pex file (made absolute before use)
    :param python_class_name: fully qualified, dot-separated class name
    :return: the attribute of the imported module named by the last
             component of ``python_class_name``
    :raises ValueError: if ``python_class_name`` has no module part
    """
    abs_path_to_pex = os.path.abspath(path_to_pex)

    Log.debug("Add a pex to the path: %s", abs_path_to_pex)
    Log.debug("In import_and_get_class with cls_name: %s", python_class_name)
    # Everything before the last dot is the module, the rest is the class.
    from_path, _, import_name = python_class_name.rpartition('.')

    Log.debug("From path: %s, import name: %s", from_path, import_name)

    if not from_path:
        raise ValueError(
            "Expected a fully qualified class name (module.Class), got: %r"
            % (python_class_name,))

    # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
    if python_class_name.startswith("heron."):
        mod = resolve_heron_suffix_issue(abs_path_to_pex, python_class_name)
        return getattr(mod, import_name)

    # level=0 requests an absolute import; Python 3 rejects negative levels
    # with ValueError, so the former level=-1 could never succeed there.
    mod = __import__(from_path, fromlist=[import_name], level=0)
    Log.debug("Imported module: %s", mod)
    return getattr(mod, import_name)
Example #48
0
    def on_topologies_watch(state_manager: StateManager, topologies: List[str]) -> None:
      """Reconcile the cached topologies with the names reported by a watch.

      Cached topologies of ``state_manager`` that are absent from
      ``topologies`` are removed; names in ``topologies`` that are not cached
      yet are added.
      """
      watched = set(topologies)
      Log.info("State watch triggered for topologies.")
      Log.debug("Topologies: %s", watched)
      known = {topology.name for topology in self.get_stmgr_topologies(state_manager.name)}
      Log.debug("Existing topologies: %s", known)

      # Both differences are taken from the same snapshot of the two sets.
      stale = known - watched
      fresh = watched - known

      for name in stale:
        Log.info("Removing topology: %s in rootpath: %s",
                 name, state_manager.rootpath)
        self.remove_topology(name, state_manager.name)

      for name in fresh:
        self.add_new_topology(state_manager, name)
Example #49
0
    def poll(self, timeout=0.0):
        """Run one select()-based I/O pass (modified asyncore.poll()).

        Builds the read/write/error fd lists from ``self.sock_map`` (when one
        is registered), always adds the wakeup pipe ``self.pipe_r`` to the
        read list, waits in ``select.select`` for at most ``timeout`` seconds
        and then dispatches the ready objects to the asyncore handlers.

        :param timeout: timeout passed straight to ``select.select``
        :return: None
        :raises select.error: any select failure other than ``EINTR``; an
                              interrupted call just ends this pass
        """
        if self.sock_map is None:
            Log.warning("Socket map is not registered to Gateway Looper")
        readable_lst = []
        writable_lst = []
        error_lst = []

        if self.sock_map is not None:
            for fd, obj in self.sock_map.items():
                is_r = obj.readable()
                is_w = obj.writable()
                if is_r:
                    readable_lst.append(fd)
                if is_w and not obj.accepting:
                    writable_lst.append(fd)
                if is_r or is_w:
                    error_lst.append(fd)

        # Add wakeup fd
        readable_lst.append(self.pipe_r)

        # Logging arguments are passed lazily so the (potentially large) socket
        # map is only rendered when debug logging is actually enabled.
        Log.debug("Will select() with timeout: %s, with map: %s", timeout, self.sock_map)
        try:
            readable_lst, writable_lst, error_lst = \
              select.select(readable_lst, writable_lst, error_lst, timeout)
        except select.error as err:
            Log.debug("Trivial error: %s", err)
            if err.args[0] != errno.EINTR:
                raise
            # Interrupted by a signal: nothing is ready, give up on this pass.
            return
        Log.debug("Selected [r]: %s [w]: %s [e]: %s", readable_lst, writable_lst, error_lst)

        if self.pipe_r in readable_lst:
            Log.debug("Read from pipe")
            # Drain the wakeup pipe; it is not part of the socket map.
            os.read(self.pipe_r, 1024)
            readable_lst.remove(self.pipe_r)

        if self.sock_map is not None:
            # pylint: disable=W0212
            dispatch = (
                (readable_lst, asyncore.read),
                (writable_lst, asyncore.write),
                (error_lst, asyncore._exception),
            )
            for ready_fds, handler in dispatch:
                for fd in ready_fds:
                    # Look the object up at dispatch time: an earlier handler
                    # may have dropped it from the map.
                    obj = self.sock_map.get(fd)
                    if obj is None:
                        continue
                    handler(obj)
Example #50
0
 def get(self):
   """Respond with the ``metadata`` entry of the requested topology.

   The cluster, role, environ and topology name come from the handler's
   ``get_argument_*`` helpers. Any exception is logged and reported through
   ``write_error_response`` instead of propagating.
   """
   try:
     cluster, role, environ = (
         self.get_argument_cluster(),
         self.get_argument_role(),
         self.get_argument_environ(),
     )
     name = self.get_argument_topology()
     info = self.tracker.getTopologyInfo(name, cluster, role, environ)
     payload = info["metadata"]
     self.write_success_response(payload)
   except Exception as err:
     Log.error("Exception when handling GET request '/topologies/metadata'")
     # The full traceback is only emitted at debug level.
     Log.debug(traceback.format_exc())
     self.write_error_response(err)
Example #51
0
    def offer(self, item):
        """Try to add ``item`` to the buffer without blocking.

        After a successful put the consumer callback, if one is set, is
        invoked. A ``Queue.Full`` raised along the way is logged at debug
        level and reported through the return value rather than propagated.

        :return: True if the item was stored, False if the buffer was full
        """
        try:
            # block=False: fail immediately instead of waiting for a free slot
            self._buffer.put(item, block=False)
            notify_consumer = self._consumer_callback
            if notify_consumer is not None:
                notify_consumer()
        except Queue.Full:
            Log.debug("%s: Full in offer()" % str(self))
            return False
        return True
Example #52
0
    def poll(self):
        """Take the next item from the buffer without blocking.

        After a successful get the producer callback, if one is set, is
        invoked before the item is returned.

        :return: the item taken from the buffer
        :raises Queue.Empty: if the buffer holds no item
        """
        try:
            # block=False: fail immediately instead of waiting for an item
            item = self._buffer.get(block=False)
            notify_producer = self._producer_callback
            if notify_producer is not None:
                notify_producer()
        except Queue.Empty:
            Log.debug("%s: Empty in poll()" % str(self))
            raise Queue.Empty
        return item
Example #53
0
def getInstancePid(topology_info, instance_id):
    """
    Fetch the pid of an instance from heron-shell.

    This is a module-level function rather than a method because other
    modules use it as well. It is a generator: it yields the pending HTTP
    fetch and hands the response body back through ``tornado.gen.Return``.

    :param topology_info: passed to ``utils.make_shell_endpoint``
    :param instance_id: instance whose pid is requested
    :raises Exception: carrying the message of any tornado ``HTTPError``
    """
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        shell_endpoint = utils.make_shell_endpoint(topology_info, instance_id)
        pid_url = "%s/pid/%s" % (shell_endpoint, instance_id)
        Log.debug("HTTP call for url: %s", pid_url)
        reply = yield client.fetch(pid_url)
    except tornado.httpclient.HTTPError as http_err:
        # Callers only ever see a plain Exception with the HTTP error text.
        raise Exception(str(http_err))
    raise tornado.gen.Return(reply.body)
    def _flush_remaining(self):
        """Push any pending data and control tuple sets to the stream.

        Each pending set is copied into a fresh tuple set obtained from
        ``make_tuple_set()``, pushed with ``_push_tuple_to_stream`` and then
        cleared. The data set is handled before the control set; the data
        byte counter is reset together with the data set.
        """
        if self.current_data_tuple_set is not None:
            Log.debug("In flush_remaining() - flush data tuple set")
            outgoing = self.make_tuple_set()
            outgoing.data.CopyFrom(self.current_data_tuple_set)
            self._push_tuple_to_stream(outgoing)
            self.current_data_tuple_set, self.current_data_tuple_size_in_bytes = None, 0

        if self.current_control_tuple_set is not None:
            Log.debug("In flush_remaining() - flush control tuple set")
            outgoing = self.make_tuple_set()
            outgoing.control.CopyFrom(self.current_control_tuple_set)
            self._push_tuple_to_stream(outgoing)
            self.current_control_tuple_set = None
Example #55
0
 def get(self):
     """Respond with the pid of the requested instance.

     The request arguments come from the handler's ``get_argument_*``
     helpers. This is a generator: it yields the ``getInstancePid`` call and
     writes the value sent back as the success response. Any exception is
     logged at debug level and reported through ``write_error_response``.
     """
     try:
         cluster, role, environ = (
             self.get_argument_cluster(),
             self.get_argument_role(),
             self.get_argument_environ(),
         )
         name = self.get_argument_topology()
         instance = self.get_argument_instance()
         info = self.tracker.get_topology_info(name, cluster, role, environ)
         pid = yield getInstancePid(info, instance)
         self.write_success_response(pid)
     except Exception as err:
         Log.debug(traceback.format_exc())
         self.write_error_response(err)
Example #56
0
def _save_or_remove(config, cluster):
    """Persist ``config`` for ``cluster`` as YAML, or drop the stored file.

    A truthy ``config`` is dumped to the cluster config file (its directory
    is created when missing). A falsy ``config`` removes the file if it
    exists; a failure to remove it is deliberately ignored.

    :param config: configuration object handed to ``yaml.dump``
    :param cluster: cluster used to derive the config file and directory
    """
    cluster_config_file = get_cluster_config_file(cluster)
    if config:
        Log.debug("saving config file: %s", cluster_config_file)
        config_directory = get_config_directory(cluster)
        # exist_ok avoids the check-then-create race of a separate isdir test.
        os.makedirs(config_directory, exist_ok=True)
        # Text mode: on Python 3 yaml.dump writes str to the stream unless an
        # encoding is given, so a binary-mode file raises TypeError.
        with open(cluster_config_file, 'w') as cf:
            yaml.dump(config, cf, default_flow_style=False)
    elif os.path.isfile(cluster_config_file):
        try:
            os.remove(cluster_config_file)
        except OSError:
            # Best effort: a file that cannot be removed is left in place.
            pass
Example #57
0
    def on_topologies_watch(state_manager, topologies) -> None:
      """Reconcile the cached topologies with the names reported by a watch.

      Cached topologies of ``state_manager`` whose name is absent from
      ``topologies`` are removed, then every name in ``topologies`` that is
      not cached is added. Both passes keep the order (and any duplicates)
      of the sequence they iterate.
      """
      Log.info("State watch triggered for topologies.")
      Log.debug("Topologies: %s", topologies)
      cached_names = [t.name for t in self.get_stmgr_topologies(state_manager.name)]
      Log.debug("Existing topologies: %s", cached_names)

      # Sets are used for membership only, so each test is O(1) instead of a
      # scan of the other list; iteration still runs over the original lists.
      watched = set(topologies)
      for name in cached_names:
        if name not in watched:
          Log.info("Removing topology: %s in rootpath: %s",
                   name, state_manager.rootpath)
          self.remove_topology(name, state_manager.name)

      known = set(cached_names)
      for name in topologies:
        if name not in known:
          self.add_new_topology(state_manager, name)
Example #58
0
def download(uri, cluster):
  """Run heron-downloader.sh for ``uri`` and return the downloaded file.

  The downloader is pointed at a fresh temporary directory. The first entry
  of that directory whose name ends with a known package suffix is returned
  as a full path; None is returned when there is no such entry. The exit
  status of the downloader is not inspected.
  """
  staging_dir = tempfile.mkdtemp()
  cmd = [
      config.get_heron_bin_dir() + "/heron-downloader.sh",
      "-u " + uri,
      "-f " + staging_dir,
      "-d " + config.get_heron_dir(),
      "-p " + config.get_heron_cluster_conf_dir(cluster, config.get_heron_conf_dir()),
      "-m local",
  ]
  Log.debug("download uri command: %s", cmd)
  subprocess.call(cmd)

  package_suffixes = (".jar", ".tar", ".tar.gz", ".pex", ".dylib", ".so")
  matches = (
      os.path.join(staging_dir, entry)
      for entry in os.listdir(staging_dir)
      if entry.endswith(package_suffixes)
  )
  return next(matches, None)
Example #59
0
def heron_class(class_name,
                lib_jars,
                extra_jars=None,
                args=None,
                java_defines=None):
    '''
    Run a Heron Java class in a subprocess with the given class path.

    :param class_name: class handed to the java command
    :param lib_jars: jars for the class path (concatenated with extra_jars)
    :param extra_jars: additional jars; None means none
    :param args: arguments placed after the class name; None means none
    :param java_defines: entries turned into ``-D<entry>`` options; None means none
    :return: None
    :raises RuntimeError: if the subprocess exits with a non-zero status
    '''
    # Only None is replaced; any other value the caller passed is used as is.
    extra_jars = [] if extra_jars is None else extra_jars
    args = [] if args is None else args
    java_defines = [] if java_defines is None else java_defines

    # One -D option per define, passed while running the class locally.
    define_flags = ['-D' + define for define in java_defines]

    # The command is an argument list, so the -D options have to be separate
    # elements of that list.
    command = [
        config.get_java_path(), "-client", "-Xmx1g",
        *define_flags,
        "-cp", config.get_classpath(lib_jars + extra_jars),
        class_name,
        *args,
    ]

    # The child sees the current environment plus HERON_OPTIONS.
    child_env = os.environ.copy()
    child_env['HERON_OPTIONS'] = opts.get_heron_config()

    Log.debug('$> %s' % ' '.join(command))
    Log.debug('Heron options: %s' % str(child_env["HERON_OPTIONS"]))

    exit_status = subprocess.call(command, env=child_env)
    if exit_status != 0:
        raise RuntimeError(
            "User main failed with status %d. Bailing out..." % exit_status)
Example #60
0
 def parse_known_args(self, args=None, namespace=None):
   """Parse known args, rejecting leftovers that name a positional argument.

   Parsing itself is delegated to the parent class. For every parser whose
   ``prog`` is not 'heron', each leftover argument is checked against the
   keys of ``get_positional_args()``.

   :return: the ``(namespace, args)`` pair produced by the parent parser
   :raises ValueError: if a leftover argument is a key of the positional map
   """
   namespace, args = super(HeronArgumentParser,
                           self).parse_known_args(args, namespace)
   positional_args_map = self.get_positional_args()
   if self.prog == 'heron':
     # No sub parser specific validation for the top-level parser.
     return namespace, args

   Log.debug('sub parser expansion  %s %s', self.prog, args)
   # A leftover such as "--topology-name xyz" is the optional spelling of the
   # positional argument topology-name; that is not allowed for a sub parser.
   for leftover in args:
     if leftover not in positional_args_map:
       continue
     raise ValueError(
         'positional argument for  command {} : {} specified in heronrc'.format(
             self.prog, positional_args_map[leftover]))
   return namespace, args