Example #1
class PingPongServer(Observable):
    PING_DELAY = Amount(1, Time.SECONDS)

    def __init__(self, target_host, target_port, clock=time):
        self._clock = clock
        self._target = (target_host, target_port)
        self._pings = AtomicGauge('pings')
        self.metrics.register(self._pings)

    def send_request(self, endpoint, message, ttl):
        url_base = 'http://%s:%d' % self._target
        try:
            urllib2.urlopen('%s/%s/%s/%d' %
                            (url_base, endpoint, message, ttl)).read()
        except Exception as e:
            log.error('Failed to query %s: %s' % (url_base, e))

    @HttpServer.route('/ping/:message')
    @HttpServer.route('/ping/:message/:ttl')
    def ping(self, message, ttl=60):
        self._pings.increment()
        log.info('Got ping (ttl=%s): %s' % (ttl, message))
        ttl = int(ttl) - 1
        if ttl > 0:
            defer(partial(self.send_request, 'ping', message, ttl),
                  delay=self.PING_DELAY,
                  clock=self._clock)
Example #2
    def from_target(cls, config, target, conn_timeout=None):
        from twitter.common.python.fetcher import PyPIFetcher, Fetcher
        from twitter.common.python.resolver import Resolver
        from twitter.common.python.http import Crawler
        from twitter.common.quantity import Amount, Time

        conn_timeout_amount = Amount(
            conn_timeout, Time.SECONDS) if conn_timeout is not None else None

        crawler = Crawler(cache=config.get('python-setup', 'download_cache'),
                          conn_timeout=conn_timeout_amount)

        fetchers = []
        fetchers.extend(
            Fetcher([url])
            for url in config.getlist('python-repos', 'repos', []))
        fetchers.extend(
            PyPIFetcher(url)
            for url in config.getlist('python-repos', 'indices', []))

        platforms = config.getlist('python-setup', 'platforms', ['current'])
        if isinstance(target, PythonBinary) and target.platforms:
            platforms = target.platforms

        return cls(platforms=get_platforms(platforms),
                   resolver=Resolver(cache=config.get('python-setup',
                                                      'install_cache'),
                                     crawler=crawler,
                                     fetchers=fetchers,
                                     install_cache=config.get(
                                         'python-setup', 'install_cache'),
                                     conn_timeout=conn_timeout_amount))
Example #3
  def test_iter_content_error(self):
    self.requests.get('http://foo', stream=True, timeout=60).AndReturn(self.response)
    self.response.status_code = 200
    self.response.headers = {}
    self.listener.status(200, content_length=None)

    self.response.iter_content(chunk_size=1024).AndRaise(requests.Timeout)
    self.response.close()

    self.mox.ReplayAll()

    with pytest.raises(self.fetcher.TransientError):
      self.fetcher.fetch('http://foo',
                         self.listener,
                         chunk_size=Amount(1, Data.KB),
                         timeout=Amount(1, Time.MINUTES))
Example #4
    def test_killTask(self):  # noqa
        proxy_driver = ProxyDriver()

        class ProvidedThermosRunnerMatcher(object):
            """Matcher that ensures a bound method 'stop' from 'ProvidedThermosTaskRunner' is called."""
            def __eq__(self, other):
                return (type(
                    other.im_self).__name__ == 'ProvidedThermosTaskRunner'
                        and other.__name__ == 'stop')

        with contextlib.nested(
                temporary_dir(),
                mock.patch(
                    'apache.aurora.executor.aurora_executor.propagate_deadline',
                    wraps=propagate_deadline)) as (checkpoint_root,
                                                   mock_propagate_deadline):

            _, executor = make_executor(proxy_driver,
                                        checkpoint_root,
                                        SLEEP60_MTI,
                                        stop_timeout_in_secs=123)
            # send two, expect at most one delivered
            executor.killTask(proxy_driver,
                              mesos_pb2.TaskID(value='sleep60-001'))
            executor.killTask(proxy_driver,
                              mesos_pb2.TaskID(value='sleep60-001'))
            executor.terminated.wait()

            updates = proxy_driver.method_calls['sendStatusUpdate']

            mock_propagate_deadline.assert_called_with(  # Ensure 'stop' is called with custom timeout.
                ProvidedThermosRunnerMatcher(),
                timeout=Amount(123, Time.SECONDS))
            assert len(updates) == 3
            assert updates[-1][0][0].state == mesos_pb2.TASK_KILLED
Example #5
def test_sampler_base():
    class TestSampler(SamplerBase):
        def __init__(self, period, clock):
            self.count = 0
            SamplerBase.__init__(self, period, clock)

        def iterate(self):
            self.count += 1

    test_clock = ThreadedClock()
    sampler = TestSampler(Amount(1, Time.SECONDS), clock=test_clock)
    sampler.start()

    assert test_clock.converge(threads=[sampler])
    test_clock.assert_waiting(sampler, 1)

    test_clock.tick(0.5)
    assert test_clock.converge(threads=[sampler])
    assert sampler.count == 0

    test_clock.tick(0.5)
    assert test_clock.converge(threads=[sampler])
    assert sampler.count == 1

    test_clock.tick(5)
    assert test_clock.converge(threads=[sampler])
    assert sampler.count == 6

    assert not sampler.is_stopped()
    sampler.stop()

    # make sure that stopping the sampler short circuits any sampling
    test_clock.tick(5)
    assert test_clock.converge(threads=[sampler])
    assert sampler.count == 6
Example #6
  def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts, mock_maintenance_status, mock_log):
    fake_maintenance_status_response = Response(
        responseCode=ResponseCode.OK,
        result=Result(maintenanceStatusResult=MaintenanceStatusResult(set([
          HostStatus(host=TEST_HOSTNAMES[0], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[1], mode=MaintenanceMode.SCHEDULED),
          HostStatus(host=TEST_HOSTNAMES[2], mode=MaintenanceMode.SCHEDULED)
        ]))))

    mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
    mock_maintenance_status.return_value = fake_maintenance_status_response
    test_hosts = Hosts(set(TEST_HOSTNAMES))
    maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
    maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)

    not_drained_hostnames = maintenance._drain_hosts(test_hosts)
    assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
    assert mock_maintenance_status.call_count == 1
    mock_drain_hosts.assert_called_once_with(test_hosts)
    mock_maintenance_status.assert_called_once_with((Hosts(set(TEST_HOSTNAMES))))
    assert mock_log.mock_calls == [mock.call(textwrap.dedent("""\
        Failed to move all hosts into DRAINED within 1 ms:
        \tHost:us-west-001.example.com\tStatus:SCHEDULED
        \tHost:us-west-002.example.com\tStatus:SCHEDULED
        \tHost:us-west-003.example.com\tStatus:SCHEDULED"""))]
Example #7
    def test_drain_hosts_timed_out_wait(self, _, mock_drain_hosts,
                                        mock_maintenance_status):
        fake_maintenance_status_response = Response(
            responseCode=ResponseCode.OK,
            result=Result(maintenanceStatusResult=MaintenanceStatusResult(
                set([
                    HostStatus(host=TEST_HOSTNAMES[0],
                               mode=MaintenanceMode.SCHEDULED),
                    HostStatus(host=TEST_HOSTNAMES[1],
                               mode=MaintenanceMode.SCHEDULED),
                    HostStatus(host=TEST_HOSTNAMES[2],
                               mode=MaintenanceMode.SCHEDULED)
                ]))))

        mock_drain_hosts.return_value = Response(responseCode=ResponseCode.OK)
        mock_maintenance_status.return_value = fake_maintenance_status_response
        test_hosts = Hosts(set(TEST_HOSTNAMES))
        maintenance = HostMaintenance(DEFAULT_CLUSTER, 'quiet')
        maintenance.MAX_STATUS_WAIT = Amount(1, Time.MILLISECONDS)

        not_drained_hostnames = maintenance._drain_hosts(test_hosts)
        assert TEST_HOSTNAMES == sorted(not_drained_hostnames)
        assert mock_maintenance_status.call_count == 1
        mock_drain_hosts.assert_called_once_with(test_hosts)
        mock_maintenance_status.assert_called_once_with(
            (Hosts(set(TEST_HOSTNAMES))))
Example #8
  def from_task(self, task, sandbox):
    data = json.loads(task.data)
    task_mem = None
    for resource in task.resources:
      if resource.name == 'mem':
        task_mem = resource.scalar.value
        break

    assert task_mem, "Task resources should always include 'mem'"

    buffer_pool_size = int(
        Amount(int(task_mem), Data.MB).as_(Data.BYTES) * MEM_FRACTION_FOR_BUFFER_POOL)
    log.info("Allocating %s bytes of memory to MySQL buffer pool" % buffer_pool_size)

    # TODO(jyx): Use an ephemeral sandbox for now. Will change when Mesos adds persistent resources
    # support: MESOS-1554.
    return MySQLTaskControl(
        sandbox,
        data['framework_user'],
        data['host'],
        data['port'],
        data['cluster'],
        data['cluster_user'],
        data['cluster_password'],
        data['server_id'],
        data['admin_keypath'],
        buffer_pool_size)
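The sizing arithmetic above is just a unit conversion followed by a scalar multiply. A small worked sketch with illustrative values (the 512 MB figure and the 0.5 fraction are assumptions, not the real MEM_FRACTION_FOR_BUFFER_POOL):

from twitter.common.quantity import Amount, Data

task_mem_mb = 512            # illustrative task 'mem' resource, in MB
buffer_pool_fraction = 0.5   # stand-in for MEM_FRACTION_FOR_BUFFER_POOL

# Convert the task's memory grant to bytes, then take the configured fraction.
buffer_pool_size = int(
    Amount(task_mem_mb, Data.MB).as_(Data.BYTES) * buffer_pool_fraction)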
Example #9
def test_gc_lifetime():
  with run_gc_with_timeout(maximum_executor_lifetime=Amount(500, Time.MILLISECONDS)) as (
      proxy_driver, executor):
    executor._clock.tick(1)
    proxy_driver.stopped.wait(timeout=EVENT_WAIT_TIMEOUT_SECS)
    assert proxy_driver.stopped.is_set()
    assert not executor._stop_event.is_set()
Example #10
  def test_run(self):
    event = Event()
    mock_driver = mock.Mock(spec=ExecutorDriver)
    event.set()
    executor_timeout = ExecutorTimeout(event, mock_driver, timeout=Amount(0, Time.SECONDS))
    executor_timeout.run()
    assert mock_driver.stop.call_count == 0
Example #11
  def test_incompatible_resource_role(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        gen_encryption_key(),
        framework_role='mysos')  # Require 'mysos' but the resources are in '*'.

    RootMetrics().register_observable('scheduler', scheduler1)

    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    assert "declineOffer" in self._driver.method_calls
    assert len(self._driver.method_calls["declineOffer"]) == 1
    # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is
    # a 'Filters' object.
    assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds ==
        INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS))

    sample = RootMetrics().sample()
    assert sample['scheduler.offers_incompatible_role'] == 1
Example #12
def initialize(options):
  path_detector = ChainedPathDetector(
      FixedPathDetector(options.root),
      MesosPathDetector(options.mesos_root),
  )
  polling_interval = Amount(options.polling_interval_secs, Time.SECONDS)
  return TaskObserver(path_detector, interval=polling_interval)
Example #13
  def __init__(self,
               checkpoint_root,
               disk_collector=DiskCollector,
               disk_collection_interval=Amount(1, Time.MINUTES)):
    self._checkpoint_root = checkpoint_root
    self._disk_collector = disk_collector
    self._disk_collection_interval = disk_collection_interval
Example #14
class StatusManager(ExceptionalThread):
  """
    An agent that periodically checks the health of a task via StatusCheckers that
    provide HTTP health checking, resource consumption, etc.

    If any of the status interfaces return a status, the Status Manager
    invokes the user-supplied callback with the status.
  """
  POLL_WAIT = Amount(500, Time.MILLISECONDS)

  def __init__(self, status_checker, callback, clock=time):
    if not isinstance(status_checker, StatusChecker):
      raise TypeError('status_checker must be a StatusChecker, got %s' % type(status_checker))
    if not callable(callback):
      raise TypeError('callback needs to be callable!')
    self._status_checker = status_checker
    self._callback = callback
    self._clock = clock
    super(StatusManager, self).__init__()
    self.daemon = True

  def run(self):
    while True:
      status_result = self._status_checker.status
      if status_result is not None:
        log.info('Status manager got %s' % status_result)
        self._callback(status_result)
        break
      else:
        self._clock.sleep(self.POLL_WAIT.as_(Time.SECONDS))
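A minimal wiring sketch for the class above. FakeChecker and the callback are hypothetical stand-ins; the sketch assumes a StatusChecker subclass only needs to expose the `status` property that run() polls.

class FakeChecker(StatusChecker):
  """Hypothetical checker: reports None until told otherwise (assumption for illustration)."""
  def __init__(self):
    self._result = None

  @property
  def status(self):
    return self._result

checker = FakeChecker()
manager = StatusManager(checker, callback=lambda result: log.info('Terminal status: %s' % result))
manager.start()  # polls every POLL_WAIT (500 ms) until the checker reports a status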
Example #15
class DefaultAnnouncerCheckerProvider(AnnouncerCheckerProvider):
    DEFAULT_RETRY_MAX_DELAY = Amount(5, Time.MINUTES)
    DEFAULT_RETRY_POLICY = KazooRetry(
        max_tries=None,
        ignore_expire=True,
        max_delay=DEFAULT_RETRY_MAX_DELAY.as_(Time.SECONDS),
    )

    def __init__(self,
                 ensemble,
                 root='/aurora',
                 allow_custom_serverset_path=False,
                 hostname=None):
        self.__ensemble = ensemble
        self.__root = root
        super(DefaultAnnouncerCheckerProvider,
              self).__init__(allow_custom_serverset_path, hostname)

    def make_zk_client(self):
        return KazooClient(self.__ensemble,
                           connection_retry=self.DEFAULT_RETRY_POLICY)

    def make_zk_path(self, assigned_task):
        config = assigned_task.task
        role, environment, name = (config.job.role, config.job.environment,
                                   config.job.name)
        return posixpath.join(self.__root, role, environment, name)
Example #16
  def _wait_for_control(self):
    """Wait for control of the checkpoint stream: must be run in the child."""
    total_wait_time = Amount(0, Time.SECONDS)

    with open(self.ckpt_file(), 'r') as fp:
      fp.seek(self._ckpt_head)
      rr = ThriftRecordReader(fp, RunnerCkpt)
      while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
        ckpt_tail = os.path.getsize(self.ckpt_file())
        if ckpt_tail == self._ckpt_head:
          self._platform.clock().sleep(self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
          total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
          continue
        checkpoint = rr.try_read()
        if checkpoint:
          if not checkpoint.process_status:
            raise self.CheckpointError('No process status in checkpoint!')
          if (checkpoint.process_status.process != self.name() or
              checkpoint.process_status.state != ProcessState.FORKED or
              checkpoint.process_status.fork_time != self._fork_time or
              checkpoint.process_status.coordinator_pid != self._pid):
            self._log('Losing control of the checkpoint stream:')
            self._log('   fork_time [%s] vs self._fork_time [%s]' % (
                checkpoint.process_status.fork_time, self._fork_time))
            self._log('   coordinator_pid [%s] vs self._pid [%s]' % (
                checkpoint.process_status.coordinator_pid, self._pid))
            raise self.CheckpointError('Lost control of the checkpoint stream!')
          self._log('Taking control of the checkpoint stream at record: %s' %
            checkpoint.process_status)
          self._seq = checkpoint.process_status.seq + 1
          return True
    raise self.CheckpointError('Timed out waiting for checkpoint stream!')
Example #17
    def test_invalid_status_update(self):
        """Launcher raises an exception when an invalid status is received."""
        self._cluster.num_nodes = 1
        launcher = MySQLClusterLauncher(self._driver, self._cluster,
                                        self._state_provider, self._zk_url,
                                        self._zk_client, self._framework_user,
                                        "./executor.pex", "cmd.sh",
                                        Amount(5, Time.SECONDS),
                                        "/etc/mysos/admin_keyfile.yml")
        self._launchers.append(launcher)

        resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000]))
        self._offer.resources.extend(resources)

        task_id, _ = launcher.launch(self._offer)
        assert task_id == "mysos-cluster0-0"

        tasks = self._driver.method_calls["launchTasks"]
        assert len(tasks) == self._cluster.num_nodes

        status = mesos_pb2.TaskStatus()
        status.task_id.value = task_id
        status.state = mesos_pb2.TASK_RUNNING  # Valid state.
        launcher.status_update(status)

        status.state = mesos_pb2.TASK_FINISHED  # An invalid state.

        with pytest.raises(MySQLClusterLauncher.Error):
            launcher.status_update(status)
Example #18
  def test_initialize(self):
    expected_interval = Amount(15, Time.SECONDS)
    mock_options = Mock(spec_set=['root', 'mesos_root', 'polling_interval_secs'])
    mock_options.root = ''
    mock_options.mesos_root = os.path.abspath('.')
    mock_options.polling_interval_secs = int(expected_interval.as_(Time.SECONDS))
    mock_task_observer = create_autospec(spec=TaskObserver)
    with patch(
        'apache.aurora.tools.thermos_observer.TaskObserver',
        return_value=mock_task_observer) as mock_observer:

      initialize(mock_options)

      assert len(mock_observer.mock_calls) == 1
      args = mock_observer.mock_calls[0][2]
      assert expected_interval == args['interval']
Example #19
  def wait_start(self, timeout=MAX_WAIT):
    log.debug('Waiting for task to start.')

    def is_started():
      return self._monitor and (self._monitor.active or self._monitor.finished)

    waited = Amount(0, Time.SECONDS)

    while waited < timeout:
      if not is_started():
        log.debug('  - sleeping...')
        self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
        waited += self.POLL_INTERVAL
      else:
        break

      if not self.is_alive:
        if self._popen_rc != 0:
          raise TaskError('Task failed: %s' % self.compute_status().reason)
        else:
          # We can end up here if the process exited between the call to Popen and
          # waitpid (in is_alive), which is fine.
          log.info('Task runner exited: %s' % self.compute_status().reason)
          break

    if not is_started():
      log.error('Task did not start within deadline, forcing loss.')
      self.lose()
      raise TaskError('Task did not start within deadline.')
Example #20
  def stop(self, timeout=MAX_WAIT):
    """Stop the runner.  If it's already completed, no-op.  If it's still running, issue a kill."""
    log.info('ThermosTaskRunner is shutting down.')

    if not self.forking.is_set():
      raise TaskError('Failed to call TaskRunner.start.')

    log.info('Invoking runner HTTP teardown.')
    self._terminate_http()

    log.info('Invoking runner.kill')
    self.kill()

    waited = Amount(0, Time.SECONDS)
    while self.is_alive and waited < timeout:
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL

    if not self.is_alive and self.task_state() != TaskState.ACTIVE:
      return

    log.info('Thermos task did not shut down cleanly, rebinding to kill.')
    self.quitquitquit()

    while not self._monitor.finished and waited < timeout:
      self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
      waited += self.POLL_INTERVAL

    if not self._monitor.finished:
      raise TaskError('Task did not stop within deadline.')
Example #21
  def lose(self, force=False):
    """Mark a task as LOST and kill any straggling processes."""
    self.kill(force,
              preemption_wait=Amount(0, Time.SECONDS),
              terminal_status=TaskState.LOST)
Example #22
class TaskRunnerStage(object):
    """
    A stage of the task runner pipeline.
  """
    MAX_ITERATION_WAIT = Amount(1, Time.SECONDS)

    def __init__(self, runner):
        self.runner = runner
        self.clock = runner._clock

    def run(self):
        """
      Perform any work necessary at this stage of the task.

      If there is no more work to be done, return None. [This will invoke a state transition.]

      If there is still work to be done, return the number of seconds from now in which you'd like
      to be called to re-run the plan.
    """
        return None

    def transition_to(self):
        """
      The stage to which we should transition.
    """
        raise NotImplementedError
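A sketch of a concrete stage honoring the contract described in the docstrings above: return a wait (in seconds) while there is still work to do, and None once the stage is finished. The `task_started` flag and the stage label are illustrative, not part of the real pipeline.

class WaitingStage(TaskRunnerStage):
    """Illustrative stage that waits for a hypothetical runner.task_started flag."""

    def run(self):
        if not self.runner.task_started:
            # Not done yet: ask to be re-run after at most MAX_ITERATION_WAIT.
            return self.MAX_ITERATION_WAIT.as_(Time.SECONDS)
        return None  # Done: the runner will call transition_to() and move on.

    def transition_to(self):
        return 'RUNNING'  # In the real pipeline this would identify the next stage.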
Example #23
  def wait_for_accept(cls, port, tunnel_popen, timeout):
    total_time = Amount(0, Time.SECONDS)
    sleep = cls.MIN_RETRY
    warned = False  # Did we log a warning that shows we're waiting for the tunnel?

    while total_time < timeout and tunnel_popen.returncode is None:
      try:
        accepted_socket = socket.create_connection(('localhost', port), timeout=5.0)
        accepted_socket.close()
        return True
      except socket.error:
        total_time += sleep
        time.sleep(sleep.as_(Time.SECONDS))

        # Increase sleep exponentially until MAX_INTERVAL is reached
        sleep = min(sleep * 2, cls.MAX_INTERVAL)

        if total_time > cls.WARN_THRESHOLD and not warned:
          log.warn('Still waiting for tunnel to be established after %s (timeout is %s)' % (
              total_time, cls.DEFAULT_TIMEOUT))
          warned = True

        tunnel_popen.poll()  # needed to update tunnel_popen.returncode
    if tunnel_popen.returncode is not None:
      cls.log('SSH returned prematurely with code %s' % str(tunnel_popen.returncode))
    else:
      cls.log('timed out initializing tunnel')
    return False
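The backoff in the loop above relies on Amount supporting scalar multiplication and comparison. A small standalone sketch of that doubling-with-a-cap schedule, using illustrative values for MIN_RETRY and MAX_INTERVAL:

from twitter.common.quantity import Amount, Time

MIN_RETRY = Amount(1, Time.SECONDS)     # illustrative
MAX_INTERVAL = Amount(8, Time.SECONDS)  # illustrative

sleep = MIN_RETRY
schedule = []
for _ in range(5):
  schedule.append(sleep.as_(Time.SECONDS))
  sleep = min(sleep * 2, MAX_INTERVAL)
# schedule doubles each retry (1, 2, 4, ...) and is capped at MAX_INTERVAL.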
Example #24
  def stats_uploader_daemon(self, stats):
    """Call the stats uploader synchronously."""
    self._su = StatsUploader("localhost", "80", "buildtime.json",
                             Amount(6, Time.HOURS), self._file, "dummy")
    self._su.upload_sync(stats)
Example #25
  def test_launch_cluster_all_nodes_successful(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first node is elected master.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2
Example #26
  def test_demote(self):
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    self_member = manager.add_member(self._self_instance)

    # 'self_instance' becomes the master.
    manager.promote_member(self_member)

    runner.promoted.wait(1)

    another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

    # This demotes 'self_instance', which should cause runner to stop.
    manager.promote_member(another_member)

    assert deadline(runner.join, Amount(1, Time.SECONDS))
Example #27
def test_announcer_under_abnormal_circumstances():
    mock_serverset = create_autospec(spec=ServerSet, instance=True)
    mock_serverset.join = MagicMock()
    mock_serverset.join.side_effect = [
        KazooException('Whoops the ensemble is down!'),
        'member0001',
    ]
    mock_serverset.cancel = MagicMock()

    endpoint = Endpoint('localhost', 12345)
    clock = ThreadedClock(31337.0)

    announcer = Announcer(mock_serverset,
                          endpoint,
                          clock=clock,
                          exception_wait=Amount(2, Time.SECONDS))
    announcer.start()

    try:
        clock.tick(1.0)
        assert announcer.disconnected_time() == 1.0
        clock.tick(2.0)
        assert announcer.disconnected_time() == 0.0, (
            'Announcer should recover after an exception thrown internally.')
        assert announcer._membership == 'member0001'
    finally:
        announcer.stop()
Example #28
class ServerSetJoinThread(ExceptionalThread):
    """Background thread to reconnect to Serverset on session expiration."""

    LOOP_WAIT = Amount(1, Time.SECONDS)

    def __init__(self, event, joiner, loop_wait=LOOP_WAIT):
        self._event = event
        self._joiner = joiner
        self._stopped = threading.Event()
        self._loop_wait = loop_wait
        super(ServerSetJoinThread, self).__init__()
        self.daemon = True

    def run(self):
        while True:
            if self._stopped.is_set():
                break
            self._event.wait(timeout=self._loop_wait.as_(Time.SECONDS))
            if not self._event.is_set():
                continue
            log.debug('Join event triggered, joining serverset.')
            self._event.clear()
            self._joiner()

    def stop(self):
        self._stopped.set()
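A minimal usage sketch for the thread above: the join event is set by whatever detects session expiration, and the thread invokes the joiner and clears the event. The counter and the short loop_wait are illustrative.

import threading

rejoin_event = threading.Event()
joins = []

joiner_thread = ServerSetJoinThread(rejoin_event,
                                    joiner=lambda: joins.append(1),
                                    loop_wait=Amount(50, Time.MILLISECONDS))
joiner_thread.start()

rejoin_event.set()  # e.g. fired from a ZooKeeper session-expiration watcher
# ...shortly afterwards the joiner runs once and the event is cleared...
joiner_thread.stop()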
Example #29
    def __init__(self,
                 driver,
                 cluster_name,
                 epoch,
                 master_callback,
                 election_timeout,
                 query_interval=Amount(1, Time.SECONDS)):
        """
      :param driver: The SchedulerDriver for querying the slaves.
      :param cluster_name: The name of the MySQL cluster.
      :param epoch: The master epoch that identifies this election.
      :param master_callback: The callback function with one argument: the 'task_id' of the elected
                              master which could be None if no one is electable.
      :param election_timeout: The amount of time the elector waits for all slaves to respond. If
                               not all slaves have responded within the timeout, then the master is
                               elected from the ones who have.
      :param query_interval: The timeout before the elector re-sends queries for positions.

      :type epoch: int
      :type query_interval: Amount
      :type election_timeout: Amount
      :type master_callback: function
    """
        super(MySQLMasterElector, self).__init__()

        if not isinstance(epoch, int):
            raise TypeError("'epoch' should be an int")
        if not isinstance(query_interval, Amount) or not isinstance(
                query_interval.unit(), Time):
            raise ValueError("'query_interval' must be an Amount of Time")
        if not isinstance(election_timeout, Amount) or not isinstance(
                election_timeout.unit(), Time):
            raise ValueError("'election_timeout' must be an Amount of Time")
        if not hasattr(master_callback, '__call__'):
            raise TypeError("master_callback must be a function")

        self._query_interval = query_interval.as_(Time.SECONDS)

        self._election_deadline = (
            datetime.utcnow() +
            timedelta(seconds=election_timeout.as_(Time.SECONDS)))

        self._driver = driver
        self._cluster_name = cluster_name  # For logging.
        self._epoch = epoch
        self._master_callback = master_callback

        # Slave {Task ID: Position} mappings. Use OrderedDict so we can easily locate the first
        # added slave.
        self._positions = OrderedDict()
        self._mesos_slaves = {}  # Slave {Task ID: Mesos slave ID} mappings.
        # Elected master (its Task ID); initially None and can still be None after the election
        # has timed out and there are no slaves to elect from.
        self._master = None

        self._lock = threading.Lock()
        self._aborted = threading.Event()    # Elector thread aborted (don't invoke callback).
        self._completed = threading.Event()  # Election process completed (invoke callback).
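Construction follows directly from the parameters documented in the docstring above. A hedged sketch: the driver stub and the callback are placeholders, and it relies only on the fact that the __init__ shown above merely stores the driver.

def on_master_elected(task_id):
  # task_id may be None if no slave was electable within the timeout.
  print('Elected master: %s' % task_id)

elector = MySQLMasterElector(
    driver=object(),  # stand-in; the constructor above only stores it
    cluster_name='cluster0',
    epoch=0,
    master_callback=on_master_elected,
    election_timeout=Amount(30, Time.SECONDS),
    query_interval=Amount(1, Time.SECONDS))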
Example #30
  def __init__(self,
               pex_location,
               checkpoint_root=DEFAULT_CHECKPOINT_ROOT,
               artifact_dir=None,
               task_runner_class=ThermosTaskRunner,
               max_wait=Amount(1, Time.MINUTES),
               preemption_wait=Amount(1, Time.MINUTES),
               poll_interval=Amount(500, Time.MILLISECONDS),
               clock=time):
    self._artifact_dir = artifact_dir or safe_mkdtemp()
    self._checkpoint_root = checkpoint_root
    self._clock = clock
    self._max_wait = max_wait
    self._pex_location = pex_location
    self._poll_interval = poll_interval
    self._preemption_wait = preemption_wait
    self._task_runner_class = task_runner_class
Example #31
def test_waiting_executor():
  proxy_driver = ProxyDriver()
  with temporary_dir() as checkpoint_root:
    te = AuroraExecutor(
        runner_provider=make_provider(checkpoint_root),
        sandbox_provider=DefaultTestSandboxProvider())
    ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
    proxy_driver.wait_stopped()
Example #32
  def run(self):
    tasks = []
    now = time.time()

    TaskTuple = namedtuple('TaskTuple', 'task_id age metadata_size log_size data_size')
    for task_id in self.collector.get_finished_tasks():
      age = Amount(int(now - self.collector.get_age(task_id)), Time.SECONDS)
      self.log('Analyzing task %s (age: %s)... ' % (task_id, age))
      metadata_size = Amount(sum(sz for _, sz in self.collector.get_metadata(task_id)), Data.BYTES)
      self.log('  metadata %.1fKB ' % metadata_size.as_(Data.KB))
      log_size = Amount(sum(sz for _, sz in self.collector.get_logs(task_id)), Data.BYTES)
      self.log('  logs %.1fKB ' % log_size.as_(Data.KB))
      data_size = Amount(sum(sz for _, sz in self.collector.get_data(task_id)), Data.BYTES)
      self.log('  data %.1fMB ' % data_size.as_(Data.MB))
      tasks.append(TaskTuple(task_id, age, metadata_size, log_size, data_size))

    gc_tasks = set()
    gc_tasks.update(task for task in tasks if task.age > self._max_age)
    self.log('After age filter: %s tasks' % len(gc_tasks))

    def total_gc_size(task):
      return sum([task.data_size,
                  task.metadata_size if self._include_metadata else Amount(0, Data.BYTES),
                  task.log_size if self._include_logs else Amount(0, Data.BYTES)],
                  Amount(0, Data.BYTES))

    total_used = Amount(0, Data.BYTES)
    for task in sorted(tasks, key=lambda tsk: tsk.age, reverse=True):
      if task not in gc_tasks:
        total_used += total_gc_size(task)
        if total_used > self._max_space:
          gc_tasks.add(task)
    self.log('After size filter: %s tasks' % len(gc_tasks))

    for task in sorted(tasks, key=lambda tsk: tsk.age, reverse=True):
      if task not in gc_tasks and len(tasks) - len(gc_tasks) > self._max_tasks:
        gc_tasks.add(task)
    self.log('After total task filter: %s tasks' % len(gc_tasks))

    self.log('Deciding to garbage collect the following tasks:')
    if gc_tasks:
      for task in gc_tasks:
        self.log('   %s' % repr(task))
    else:
      self.log('   None.')

    return gc_tasks
Example #33
  def run_to_completion(self, runner, max_wait=Amount(10, Time.SECONDS)):
    poll_interval = Amount(100, Time.MILLISECONDS)
    total_time = Amount(0, Time.SECONDS)
    while runner.status is None and total_time < max_wait:
      total_time += poll_interval
      time.sleep(poll_interval.as_(Time.SECONDS))