Example #1
    def test_no_hang_when_group_id_cycled(self):

        def my_state():
            return State(next_group_id=99999,
                         active_commands=LinkedList(),
                         current_group_id=99999,
                         next_group_commands=set(),
                         is_shutdown=False)

        planner = WorkPlanner(init_state_factory=my_state)

        tracker = GroupTracker()
        thread_count = 1
        for i in range(10):
            planner.add_command(process_event())

        for j in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        groups_processed = tracker.get_tracks()
        self.assertEqual([99999, 10**5, 0, 1, 2, 3, 4,
                          5, 6, 7], groups_processed)
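
The GroupTracker helper this test relies on is not shown on this page. A minimal sketch of what it could look like, assuming it merely records the group id of every processed command in arrival order (the class name and its log/get_tracks methods come from the test; the implementation is a guess):

import threading

class GroupTracker:
    """Records the group id of every command it sees, in order."""

    def __init__(self):
        self._lock = threading.Lock()
        self._groups = []

    def log(self, cmd):
        # cmd.group is assumed to have been stamped by WorkPlanner.
        with self._lock:
            self._groups.append(cmd.group)

    def get_tracks(self):
        with self._lock:
            return list(self._groups)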
Example #2
    def test_ha_nvec_get_shares_group_always(self):
        planner = WorkPlanner()

        assign = planner._assign_group

        msgs_after_bc = [
            assign(broadcast()),
            assign(nvec_get()),
            assign(broadcast()),
            assign(entrypoint())
        ]
        msgs_after_ep = [
            assign(entrypoint()),
            assign(nvec_get()),
            assign(broadcast()),
            assign(entrypoint())
        ]
        msgs_after_nvec = [
            assign(entrypoint()),
            assign(nvec_get()),
            assign(nvec_get()),
            assign(entrypoint())
        ]
        self.assertEqual([0, 0, 1, 2], [m.group for m in msgs_after_bc])
        self.assertEqual([2, 2, 3, 4], [m.group for m in msgs_after_ep])
        self.assertEqual([4, 4, 4, 4], [m.group for m in msgs_after_nvec])
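
Together with Examples #4, #5 and #7 below, these assertions pin down the grouping rules: nvec_get always shares the current group, a broadcast opens a new group (except when it is the very first message), and an entrypoint opens a new group only right after a broadcast, so the two are never parallelized. A hypothetical standalone reconstruction of such a rule, consistent with this example's assertions (the real logic lives inside WorkPlanner and its signature differs - a later example mocks a one-argument _should_increase_group; the HaNvecGetEvent name and the stub classes are assumptions):

class BroadcastHAStates: ...   # stand-in stubs for the real message types
class EntrypointRequest: ...
class HaNvecGetEvent: ...      # assumed name for the nvec_get message

def should_increase_group(prev, new) -> bool:
    # Hypothetical reconstruction of the observable rules:
    #  - nvec_get never opens a new group (it always shares);
    #  - broadcast opens a new group unless it is the very first message;
    #  - entrypoint opens a new group only right after a broadcast, so an
    #    entrypoint is never parallelized with a broadcast.
    if isinstance(new, HaNvecGetEvent):
        return False
    if isinstance(new, BroadcastHAStates):
        return prev is not None
    # new is an EntrypointRequest
    return isinstance(prev, BroadcastHAStates)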
Example #3
def main():
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by a Python thread created by the
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    util: ConsulUtil = ConsulUtil()
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that the consumer threads must be started before we invoke
    # motr.start(..). Reason: the hax process will send an entrypoint
    # request and somebody needs to reply to it.

    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i) for i in range(4)
    ]

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] This is a blocking call. It will run until the program is
        # terminated by a signal.

        server = ServerRunner(planner, herald, consul_util=util)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, rconfc_starter, event_poller
        ])
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
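
_run_qconsumer_thread itself is not shown on this page. Judging by the worker loops in the tests above, each consumer thread presumably runs a loop of this shape (a sketch only; the handle() dispatch call is a placeholder, not the real API):

def qconsumer_loop(planner, motr):
    # Hypothetical consumer loop, modeled on the worker threads in the
    # tests; the real code lives in _run_qconsumer_thread (not shown).
    while True:
        cmd = planner.get_next_command()
        if isinstance(cmd, Die):
            planner.notify_finished(cmd)
            break
        try:
            handle(cmd, motr)  # placeholder; the real dispatch is not shown
        finally:
            # Without this call the command's group would never complete
            # and later groups would be blocked forever.
            planner.notify_finished(cmd)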
Example #4
    def test_entrypoint_not_paralleled_with_broadcast(self):
        planner = WorkPlanner()
        bcast = broadcast()
        ep1 = entrypoint()

        bcast = planner._assign_group(bcast)
        ep1 = planner._assign_group(ep1)
        self.assertEqual([0, 1], [bcast.group, ep1.group])
Example #5
    def test_entrypoint_requests_share_same_group(self):
        planner = WorkPlanner()
        ep1 = entrypoint()
        ep2 = entrypoint()

        ep1 = planner._assign_group(ep1)
        ep2 = planner._assign_group(ep2)
        self.assertEqual([0, 0], [ep1.group, ep2.group])
Example #6
    def test_entrypoint_executed_asap(self):
        planner = WorkPlanner()

        a = planner._assign_group
        self.assertEqual(
            [0, 1, 0],
            [a(msg)[0].group
             for msg in [broadcast(), broadcast(), entrypoint()]])
Example #7
    def test_broadcast_starts_new_group(self):
        planner = WorkPlanner()

        assign = planner._assign_group

        msgs = [
            assign(broadcast()),
            assign(broadcast()),
            assign(broadcast()),
            assign(entrypoint())
        ]
        self.assertEqual([0, 1, 2, 3], [m.group for m in msgs])
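
The broadcast(), entrypoint() and nvec_get() helpers used throughout these tests are not shown either. They presumably just build the corresponding message objects with dummy payloads, roughly like this (only the helper names come from the tests; every constructor argument below is invented for illustration):

def broadcast():
    # Hypothetical payload; the real constructor arguments are not shown.
    return BroadcastHAStates(states=[], reply_to=None)

def entrypoint():
    # Hypothetical payload as well.
    return EntrypointRequest(reply_context=None)

def nvec_get():
    # HaNvecGetEvent is an assumed class name for the nvec_get message.
    return HaNvecGetEvent(nvec=[])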
Example #8
    def test_broadcast_does_not_start_new_group(self):
        planner = WorkPlanner()

        assign = planner._assign_group

        msgs = [
            assign(broadcast()),
            assign(broadcast()),
            assign(broadcast()),
            assign(nvec_get())
        ]
        self.assertEqual([0, 1, 2, 0], [m.group for (m, _) in msgs])
Example #9
    def test_parallelism_is_possible(self):
        planner = WorkPlanner()
        for i in range(40):
            planner.add_command(entrypoint())

        for j in range(4):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received")
                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        break

                    sleep(0.5)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [Thread(target=fn, args=(planner,)) for _ in range(4)]
        time_1 = time.time()
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        time_2 = time.time()
        logging.info('Processing time %s', time_2 - time_1)
        if exc:
            raise exc
        self.assertTrue(planner.is_empty(), 'Not all commands were read out')
        # Every command takes 500 ms to process, so 40 commands * 0.5 s
        # gives 20 seconds if the commands were executed sequentially.
        self.assertLess(time_2 - time_1, 19, 'Suspiciously slow')
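
The shutdown idiom here is the classic poison pill: one Die per worker is queued after the real work, so every worker consumes exactly one pill and exits, and join() then returns. The same idea in miniature, with a plain queue instead of WorkPlanner:

import queue
import threading

q = queue.Queue()
DIE = object()  # the poison pill

def worker():
    while True:
        item = q.get()
        if item is DIE:
            break  # exactly one pill per worker
        # ... process the item here ...

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for _ in threads:
    q.put(DIE)  # one pill per worker
for t in threads:
    t.join()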
Example #10
    def test_group_id_cycled(self):
        def my_state():
            return State(next_group_id=99999,
                         active_commands=LinkedList(),
                         current_group_id=99999,
                         next_group_commands=set(),
                         is_shutdown=False)

        planner = WorkPlanner(init_state_factory=my_state)
        assign = planner._assign_group

        msgs = [
            assign(broadcast()),
            assign(broadcast()),
            assign(broadcast()),
            assign(broadcast()),
        ]
        self.assertEqual([99999, 10**5, 0, 1], [m.group for (m, _) in msgs])
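
The assertion pins down the wrap-around behavior: starting from 99999, the group ids go 99999, 10**5, and then cycle back to 0. A sketch of the pieces involved, with the State field names taken verbatim from the test (the field types and the 10**5 maximum are inferred from the values used, not taken from the source):

from dataclasses import dataclass
from typing import Any, Set

MAX_GROUP_ID = 10**5  # inferred from the assertion

@dataclass
class State:
    # Field names come from the test; types are guessed from the values
    # passed in (LinkedList(), set(), ints, bool).
    next_group_id: int
    active_commands: Any
    current_group_id: int
    next_group_commands: Set[Any]
    is_shutdown: bool

def next_group_id(current: int) -> int:
    # Wraps to 0 once the maximum group id has been handed out:
    # 99999 -> 100000 -> 0 -> 1, matching the assertion above.
    return 0 if current >= MAX_GROUP_ID else current + 1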
Example #11
def main():
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    state = inject.instance(HaxGlobalState)

    # [KN] The elements in the work planner will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python threads created by the
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    def handle_signal(sig, frame):
        state.set_stopping()
        planner.shutdown()

    # This is necessary to allow hax to exit early if Consul is not available
    # (otherwise _get_motr_fids() may be retrying forever even if the hax
    # process needs to shutdown).
    signal.signal(signal.SIGINT, handle_signal)

    util: ConsulUtil = ConsulUtil()
    # Avoid removing the session on hax start, as this would happen on
    # every node; leader election would then keep re-triggering until the
    # final hax node starts, delaying further bootstrapping operations.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    hax_http_port = util.get_hax_http_port()
    util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # Note that the consumer threads must be started before we invoke
    # motr.start(..). Reason: the hax process will send an entrypoint
    # request and somebody needs to reply to it.

    # TODO make the number of threads configurable
    consumer_threads = [
        _run_qconsumer_thread(planner, motr, herald, util, i)
        for i in range(32)
    ]

    try:
        # [KN] We use just the first profile for Spiel API for now.
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=cfg.profiles[0])
        LOG.info('Motr API has been started')
        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)

        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))
        # [KN] This is a blocking call. It will run until the program is
        # terminated by a signal.

        server = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=state)
        server.run(threads_to_wait=[
            *consumer_threads, stats_updater, bc_updater, rconfc_starter,
            event_poller
        ],
                   port=hax_http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
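
Only SIGINT is wired to handle_signal in this revision. If a service manager that stops hax with SIGTERM should trigger the same early exit (an assumption about the desired behavior, not something this code states), the same handler can simply be registered twice:

    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)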
Example #12
    def test_groups_processed_sequentially_4_threads(self):
        planner = WorkPlanner()
        group_idx = 0

        def ret_values(cmd: BaseMessage) -> bool:
            nonlocal group_idx
            # We don't care about the group distribution logic in this
            # test. Instead, we concentrate on how different group numbers
            # are processed by the workers and on the order in which they
            # are allowed to process the messages.
            #
            # _assign_group is invoked with the lock held, so this
            # increment is thread-safe.
            values = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
            ret = bool(values[group_idx])
            group_idx += 1
            return ret

        setattr(planner, '_should_increase_group',
                Mock(side_effect=ret_values))
        tracker = GroupTracker()
        thread_count = 4
        for i in range(10):
            planner.add_command(entrypoint())

        for j in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        groups_processed = tracker.get_tracks()
        self.assertEqual([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], groups_processed)
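
The setattr(planner, '_should_increase_group', Mock(side_effect=ret_values)) trick permanently replaces the method on this planner instance. An equivalent that also restores the original attribute when the block exits, using the standard unittest.mock API:

from unittest import mock

with mock.patch.object(planner, '_should_increase_group',
                       side_effect=ret_values):
    ...  # enqueue the commands and run the workers inside this block

Since each test builds its own planner, the leaked mock is harmless here; patch.object is simply the more conventional spelling.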
Example #13
    def test_workers_not_blocked_by_future_work(self):
        planner = WorkPlanner()

        tracker = TimeTracker()
        thread_count = 2
        # We add far more commands than we have workers
        for i in range(8):
            planner.add_command(broadcast())

        planner.add_command(entrypoint())

        for j in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, BroadcastHAStates):
                        time.sleep(1)

                    if isinstance(cmd, EntrypointRequest):
                        planner.shutdown()

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]

        t0 = time.time()
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        tracks = tracker.get_tracks()

        idx, (cmd, ts) = self.find(
            tracks, lambda a: isinstance(a[0], EntrypointRequest),
            'EntrypointRequest not processed')
        self.assertLess(ts - t0, 3)
        self.assertLess(len(tracks), 4)
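
Like GroupTracker earlier, TimeTracker and self.find are helpers this page never defines. A plausible minimal version of each, with the names and call shapes taken from the test and the implementations guessed:

import threading
import time

class TimeTracker:
    """Records (command, timestamp) pairs in processing order."""

    def __init__(self):
        self._lock = threading.Lock()
        self._tracks = []

    def log(self, cmd):
        with self._lock:
            self._tracks.append((cmd, time.time()))

    def get_tracks(self):
        with self._lock:
            return list(self._tracks)

def find(items, predicate, fail_msg):
    # Counterpart of the test's self.find helper (guessed): returns
    # (index, item) for the first match, or fails the test otherwise.
    for idx, item in enumerate(items):
        if predicate(item):
            return idx, item
    raise AssertionError(fail_msg)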
Example #14
    def test_entrypoint_request_processed_asap(self):
        planner = WorkPlanner()
        group_idx = 0

        def ret_values(cmd: BaseMessage) -> bool:
            nonlocal group_idx
            # We don't care about the group distribution logic in this
            # test. Instead, we concentrate on how different group numbers
            # are processed by the workers and on the order in which they
            # are allowed to process the messages.
            #
            # _assign_group is invoked with the lock held, so this
            # increment is thread-safe.
            values = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
            ret = bool(values[group_idx])
            group_idx += 1
            return ret

        tracker = TimeTracker()
        thread_count = 4
        planner.add_command(broadcast())
        planner.add_command(broadcast())
        planner.add_command(broadcast())
        planner.add_command(entrypoint())

        for j in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)
                    if isinstance(cmd, BroadcastHAStates):
                        time.sleep(1.5)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        tracks = tracker.get_tracks()
        (cmd, ts) = tracks[0]
        self.assertIsInstance(cmd, EntrypointRequest)