Example #1
    def _create_fn_schedule(self,
                            dag,
                            arg,
                            target,
                            fnames,
                            consistency=NORMAL):
        schedule = DagSchedule()
        schedule.id = 'id'
        schedule.dag.CopyFrom(dag)
        schedule.target_function = target
        schedule.consistency = consistency

        # The BEGIN trigger is sent by the scheduler.
        schedule.triggers.append('BEGIN')

        # We set all locations as thread ID 0.
        for fname in fnames:
            schedule.locations[fname] = self.ip + ':0'
        val = schedule.arguments[target].values.add()

        # Set the target function's argument.
        serializer.dump(arg, val, False)
        schedule.start_time = time.time()

        # Create a trigger corresponding to this DAG.
        trigger = DagTrigger()
        trigger.id = schedule.id
        trigger.target_function = schedule.target_function
        trigger.source = 'BEGIN'

        return schedule, {'BEGIN': trigger}
Example #2
    def test_dag_call_no_refs(self):
        '''
        Tests a DAG call without any references. We do not currently have a
        test for selecting a DAG call with references because the reference
        logic in the default policy is the same for individual functions
        (tested above) and for DAGs.
        '''

        # Create a simple DAG.
        source = 'source'
        sink = 'sink'
        dag, source_address, sink_address = self._construct_dag_with_locations(
            source, sink)

        # Create a DAG call that corresponds to this new DAG.
        call = DagCall()
        call.name = dag.name
        call.consistency = NORMAL
        call.output_key = 'output_key'
        call.client_id = '0'

        # Execute the scheduling policy.
        call_dag(call, self.pusher_cache, {dag.name: (dag, {source})},
                 self.policy)

        # Check that the correct number of messages were sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 3)

        # Extract each of the two schedules and ensure that they are correct.
        source_schedule = DagSchedule()
        source_schedule.ParseFromString(self.pusher_cache.socket.outbox[0])
        self._verify_dag_schedule(source, 'BEGIN', source_schedule, dag, call)

        sink_schedule = DagSchedule()
        sink_schedule.ParseFromString(self.pusher_cache.socket.outbox[1])
        self._verify_dag_schedule(sink, source, sink_schedule, dag, call)

        # Make sure only one trigger was sent, and it was for the DAG source.
        trigger = DagTrigger()
        trigger.ParseFromString(self.pusher_cache.socket.outbox[2])
        self.assertEqual(trigger.id, source_schedule.id)
        self.assertEqual(trigger.target_function, source)
        self.assertEqual(trigger.source, 'BEGIN')
        self.assertEqual(len(trigger.version_locations), 0)
        self.assertEqual(len(trigger.dependencies), 0)

        # Ensure that all of the destination addresses match the addresses we
        # expect.
        self.assertEqual(len(self.pusher_cache.addresses), 3)
        self.assertEqual(self.pusher_cache.addresses[0],
                         utils.get_queue_address(*source_address))
        self.assertEqual(self.pusher_cache.addresses[1],
                         utils.get_queue_address(*sink_address))
        self.assertEqual(
            self.pusher_cache.addresses[2],
            sutils.get_dag_trigger_address(':'.join(
                map(lambda s: str(s), source_address))))
Example #3
def _construct_trigger(sid, fname, result):
    trigger = DagTrigger()
    trigger.id = sid
    trigger.source = fname

    if type(result) != tuple:
        result = (result, )

    trigger.arguments.values.extend(
        list(map(lambda v: serializer.dump(v, None, False), result)))
    return trigger
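
A minimal usage sketch for _construct_trigger, relying only on the DagTrigger proto and serializer shown above; the schedule id and function name are placeholder values. The tuple check means a single return value and a multi-value result both end up as a flat list of serialized arguments.

# Sketch only: placeholder ids and names; relies only on the behavior visible
# in _construct_trigger above.
single = _construct_trigger('example-schedule-id', 'incr', 2)
assert len(single.arguments.values) == 1

multi = _construct_trigger('example-schedule-id', 'incr', (2, 3))
assert len(multi.arguments.values) == 2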
Example #4
    def test_exec_causal_dag_non_sink(self):
        '''
        Creates and executes a non-sink function in a causal-mode DAG. This
        should be exactly the same as the non-causal version of the test,
        except we ensure that the causal metadata is empty, because we don't
        have any KVS accesses.
        '''

        # Create two functions intended to be used in sequence.
        def incr(_, x):
            return x + 1

        iname = 'incr'

        def square(_, x):
            return x * x

        sname = 'square'
        arg = 1

        # Create a DAG and a trigger for the first function in the DAG.
        dag = create_linear_dag([incr, square], [iname, sname],
                                self.kvs_client, 'dag', MultiKeyCausalLattice)
        schedule, triggers = self._create_fn_schedule(dag, arg, iname,
                                                      [iname, sname], MULTI)

        exec_dag_function(self.pusher_cache, self.kvs_client, triggers, incr,
                          schedule, self.user_library, {}, {})

        # Assert that there has been a message sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)

        # Extract that message and check its contents.
        trigger = DagTrigger()
        trigger.ParseFromString(self.pusher_cache.socket.outbox[0])
        self.assertEqual(trigger.id, schedule.id)
        self.assertEqual(trigger.target_function, sname)
        self.assertEqual(trigger.source, iname)
        self.assertEqual(len(trigger.arguments.values), 1)
        self.assertEqual(len(trigger.version_locations), 0)
        self.assertEqual(len(trigger.dependencies), 0)

        val = serializer.load(trigger.arguments.values[0])
        self.assertEqual(val, incr('', arg))
Example #5
    def test_exec_dag_non_sink(self):
        '''
        Executes a non-sink function in a DAG and ensures that the correct
        downstream trigger was sent with a correct execution of the function.
        '''

        # Create two functions intended to be used in sequence.
        def incr(_, x):
            return x + 1

        iname = 'incr'

        def square(_, x):
            return x * x

        sname = 'square'
        arg = 1

        # Create a DAG and a trigger for the first function in the DAG.
        dag = create_linear_dag([incr, square], [iname, sname],
                                self.kvs_client, 'dag')
        schedule, triggers = self._create_fn_schedule(dag, arg, iname,
                                                      [iname, sname])

        exec_dag_function(self.pusher_cache, self.kvs_client, triggers, incr,
                          schedule, self.user_library, {}, {})

        # Assert that there has been a message sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)

        # Extract that message and check its contents.
        trigger = DagTrigger()
        trigger.ParseFromString(self.pusher_cache.socket.outbox[0])
        self.assertEqual(trigger.id, schedule.id)
        self.assertEqual(trigger.target_function, sname)
        self.assertEqual(trigger.source, iname)
        self.assertEqual(len(trigger.arguments.values), 1)

        val = serializer.load(trigger.arguments.values[0])
        self.assertEqual(val, incr('', arg))
Example #6
def call_dag(call, pusher_cache, dags, policy):
    dag, sources = dags[call.name]

    schedule = DagSchedule()
    schedule.id = str(uuid.uuid4())
    schedule.dag.CopyFrom(dag)
    schedule.start_time = time.time()
    schedule.consistency = call.consistency

    if call.response_address:
        schedule.response_address = call.response_address

    if call.output_key:
        schedule.output_key = call.output_key

    if call.client_id:
        schedule.client_id = call.client_id

    for fref in dag.functions:
        args = call.function_args[fref.name].values

        refs = list(
            filter(lambda arg: type(arg) == CloudburstReference,
                   map(lambda arg: serializer.load(arg), args)))

        result = policy.pick_executor(refs, fref.name)
        if result is None:
            response = GenericResponse()
            response.success = False
            response.error = NO_RESOURCES
            return response

        ip, tid = result
        schedule.locations[fref.name] = ip + ':' + str(tid)

        # copy over arguments into the dag schedule
        arg_list = schedule.arguments[fref.name]
        arg_list.values.extend(args)

    for fref in dag.functions:
        loc = schedule.locations[fref.name].split(':')
        ip = utils.get_queue_address(loc[0], loc[1])
        schedule.target_function = fref.name

        triggers = sutils.get_dag_predecessors(dag, fref.name)
        if len(triggers) == 0:
            triggers.append('BEGIN')

        schedule.ClearField('triggers')
        schedule.triggers.extend(triggers)

        sckt = pusher_cache.get(ip)
        sckt.send(schedule.SerializeToString())

    for source in sources:
        trigger = DagTrigger()
        trigger.id = schedule.id
        trigger.source = 'BEGIN'
        trigger.target_function = source

        ip = sutils.get_dag_trigger_address(schedule.locations[source])
        sckt = pusher_cache.get(ip)
        sckt.send(trigger.SerializeToString())

    response = GenericResponse()
    response.success = True
    if schedule.output_key:
        response.response_id = schedule.output_key
    else:
        response.response_id = schedule.id

    return response
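
For context, a hedged sketch of how a scheduler-side caller might drive call_dag, mirroring the test in Example #2; dag, pusher_cache, and policy are assumed to already exist, and the dags map value is the (dag, sources) pair unpacked at the top of the function.

# Sketch only: mirrors the test in Example #2; dag, pusher_cache, and policy
# are assumed to exist already.
call = DagCall()
call.name = dag.name
call.consistency = NORMAL
call.output_key = 'output_key'
call.client_id = '0'

response = call_dag(call, pusher_cache, {dag.name: (dag, {'source'})}, policy)
if response.success:
    # response_id is the output key if one was set, otherwise the schedule id.
    result_key = response.response_id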
Example #7
def executor(ip, mgmt_ip, schedulers, thread_id):
    # logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s %(message)s')
    logging.basicConfig(filename='log_executor.txt',
                        level=logging.INFO,
                        filemode="w",
                        format='%(asctime)s %(message)s')

    # Check what resources we have access to, set as an environment variable.
    if os.getenv('EXECUTOR_TYPE', 'CPU') == 'GPU':
        exec_type = GPU
    else:
        exec_type = CPU

    context = zmq.Context(1)
    poller = zmq.Poller()

    pin_socket = context.socket(zmq.PULL)
    pin_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.PIN_PORT + thread_id))

    unpin_socket = context.socket(zmq.PULL)
    unpin_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                      (sutils.UNPIN_PORT + thread_id))

    exec_socket = context.socket(zmq.PULL)
    exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                     (sutils.FUNC_EXEC_PORT + thread_id))

    dag_queue_socket = context.socket(zmq.PULL)
    dag_queue_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                          (sutils.DAG_QUEUE_PORT + thread_id))

    dag_exec_socket = context.socket(zmq.PULL)
    dag_exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                         (sutils.DAG_EXEC_PORT + thread_id))

    self_depart_socket = context.socket(zmq.PULL)
    self_depart_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                            (sutils.SELF_DEPART_PORT + thread_id))

    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(pin_socket, zmq.POLLIN)
    poller.register(unpin_socket, zmq.POLLIN)
    poller.register(exec_socket, zmq.POLLIN)
    poller.register(dag_queue_socket, zmq.POLLIN)
    poller.register(dag_exec_socket, zmq.POLLIN)
    poller.register(self_depart_socket, zmq.POLLIN)

    # If the management IP is set to None, we are running in local mode and use
    # a local AnnaTcpClient. Otherwise, we use either a TCP client or an IPC
    # client, depending on the storage configuration.
    has_ephe = False
    if mgmt_ip:
        if 'STORAGE_OR_DEFAULT' in os.environ and os.environ[
                'STORAGE_OR_DEFAULT'] == '0':
            client = AnnaTcpClient(os.environ['ROUTE_ADDR'],
                                   ip,
                                   local=False,
                                   offset=thread_id)
            has_ephe = True
        else:
            client = AnnaIpcClient(thread_id, context)
        # force_remote_anna = 1
        # if 'FORCE_REMOTE' in os.environ:
        #     force_remote_anna = int(os.environ['FORCE_REMOTE'])

        # if force_remote_anna == 0: # remote anna only
        #     client = AnnaTcpClient(os.environ['ROUTE_ADDR'], ip, local=False, offset=thread_id)
        # elif force_remote_anna == 1: # anna cache
        #     client = AnnaIpcClient(thread_id, context)
        # elif force_remote_anna == 2: # control both cache and remote anna
        #     remote_client = AnnaTcpClient(os.environ['ROUTE_ADDR'], ip, local=False, offset=thread_id)
        #     cache_client = AnnaIpcClient(thread_id, context)
        #     client = cache_client
        #     user_library = CloudburstUserLibrary(context, pusher_cache, ip, thread_id, (cache_client, remote_client))

        local = False
    else:
        client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=1)
        local = True

    user_library = CloudburstUserLibrary(context,
                                         pusher_cache,
                                         ip,
                                         thread_id,
                                         client,
                                         has_ephe=has_ephe)

    status = ThreadStatus()
    status.ip = ip
    status.tid = thread_id
    status.running = True
    status.type = exec_type
    utils.push_status(schedulers, pusher_cache, status)

    departing = False

    # Maintains a request queue for each function pinned on this executor. Each
    # function will have a set of request IDs mapped to it, and this map stores
    # a schedule for each request ID.
    queue = {}

    # Tracks the actual function objects that are pinned to this executor.
    function_cache = {}

    # Tracks the runtime cost of executing a DAG function.
    runtimes = {}

    # If multiple triggers are necessary for a function, track the triggers as
    # we receive them. This is also used if a trigger arrives before its
    # corresponding schedule.
    received_triggers = {}

    # Tracks when we received a function request, so we can report end-to-end
    # latency for the whole execution.
    receive_times = {}

    # Tracks the number of requests we are finishing for each function pinned
    # here.
    exec_counts = {}

    # Tracks the end-to-end runtime of each DAG request for which we are the
    # sink function.
    dag_runtimes = {}

    # A map with KVS keys and their corresponding deserialized payloads.
    cache = {}

    # A map which tracks the most recent DAGs for which we have finished our
    # work.
    finished_executions = {}

    # Whether the function pinned on this executor supports batching. In
    # cluster mode, there is only one pinned function per executor, so a single
    # flag is sufficient.
    batching = False

    # Internal metadata to track thread utilization.
    report_start = time.time()
    event_occupancy = {
        'pin': 0.0,
        'unpin': 0.0,
        'func_exec': 0.0,
        'dag_queue': 0.0,
        'dag_exec': 0.0
    }
    total_occupancy = 0.0

    while True:
        socks = dict(poller.poll(timeout=1000))

        if pin_socket in socks and socks[pin_socket] == zmq.POLLIN:
            work_start = time.time()
            batching = pin(pin_socket, pusher_cache, client, status,
                           function_cache, runtimes, exec_counts, user_library,
                           local, batching)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['pin'] += elapsed
            total_occupancy += elapsed

        if unpin_socket in socks and socks[unpin_socket] == zmq.POLLIN:
            work_start = time.time()
            unpin(unpin_socket, status, function_cache, runtimes, exec_counts)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['unpin'] += elapsed
            total_occupancy += elapsed

        if exec_socket in socks and socks[exec_socket] == zmq.POLLIN:
            work_start = time.time()
            # logging.info(f'Executor timer. exec_socket recv: {work_start}')
            exec_function(exec_socket,
                          client,
                          user_library,
                          cache,
                          function_cache,
                          has_ephe=has_ephe)
            user_library.close()

            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['func_exec'] += elapsed
            total_occupancy += elapsed

        if dag_queue_socket in socks and socks[dag_queue_socket] == zmq.POLLIN:
            work_start = time.time()
            logging.info(
                f'Executor timer. dag_queue_socket recv: {work_start}')
            # In order to effectively support batching, we have to make sure we
            # dequeue lots of schedules in addition to lots of triggers. Right
            # now, we're not going to worry about supporting batching here,
            # just on the trigger dequeue side, but we still have to dequeue
            # all schedules we've received. We just process them one at a time.
            while True:
                schedule = DagSchedule()
                try:
                    msg = dag_queue_socket.recv(zmq.DONTWAIT)
                except zmq.ZMQError as e:
                    if e.errno == zmq.EAGAIN:
                        break  # There are no more messages.
                    else:
                        raise e  # Unexpected error.

                schedule.ParseFromString(msg)
                fname = schedule.target_function

                logging.info(
                    'Received a schedule for DAG %s (%s), function %s.' %
                    (schedule.dag.name, schedule.id, fname))

                if fname not in queue:
                    queue[fname] = {}

                queue[fname][schedule.id] = schedule

                if (schedule.id, fname) not in receive_times:
                    receive_times[(schedule.id, fname)] = time.time()

                # In case we receive the trigger before we receive the schedule, we
                # can trigger from this operation as well.
                trkey = (schedule.id, fname)
                fref = None

                # Check to see what type of execution this function is.
                for ref in schedule.dag.functions:
                    if ref.name == fname:
                        fref = ref

                if (trkey in received_triggers and
                    ((len(received_triggers[trkey]) == len(schedule.triggers))
                     or (fref.type == MULTIEXEC))):

                    triggers = list(received_triggers[trkey].values())

                    if fname not in function_cache:
                        logging.error('%s not in function cache', fname)
                        utils.generate_error_response(schedule, client, fname)
                        continue
                    exec_start = time.time()
                    # logging.info(f'Executor timer. dag_queue_socket exec_dag: {exec_start}')
                    # We don't support batching on this path (where the
                    # triggers arrive before their schedule), so everything is
                    # just a batch of size 1.
                    success = exec_dag_function(pusher_cache, client,
                                                [triggers],
                                                function_cache[fname],
                                                [schedule], user_library,
                                                dag_runtimes, cache,
                                                schedulers, batching)[0]
                    user_library.close()

                    del received_triggers[trkey]
                    if success:
                        del queue[fname][schedule.id]

                        fend = time.time()
                        fstart = receive_times[(schedule.id, fname)]
                        runtimes[fname].append(fend - work_start)
                        exec_counts[fname] += 1

                        finished_executions[(schedule.id, fname)] = time.time()

            elapsed = time.time() - work_start
            event_occupancy['dag_queue'] += elapsed
            total_occupancy += elapsed

        if dag_exec_socket in socks and socks[dag_exec_socket] == zmq.POLLIN:
            work_start = time.time()
            # logging.info(f'Executor timer. dag_exec_socket recv: {work_start}')

            # How many messages to dequeue -- BATCH_SIZE_MAX or 1 depending on
            # the function configuration.
            if batching:
                count = BATCH_SIZE_MAX
            else:
                count = 1

            trigger_keys = set()

            for _ in range(count):  # Dequeue count number of messages.
                trigger = DagTrigger()

                try:
                    msg = dag_exec_socket.recv(zmq.DONTWAIT)
                except zmq.ZMQError as e:
                    if e.errno == zmq.EAGAIN:  # There are no more messages.
                        break
                    else:
                        raise e  # Unexpected error.

                trigger.ParseFromString(msg)

                # We have received a repeated trigger for a function that has
                # already finished executing.
                if (trigger.id, trigger.target_function) in finished_executions:
                    continue

                fname = trigger.target_function
                logging.info(
                    'Received a trigger for schedule %s, function %s.' %
                    (trigger.id, fname))

                key = (trigger.id, fname)
                trigger_keys.add(key)
                if key not in received_triggers:
                    received_triggers[key] = {}

                if (trigger.id, fname) not in receive_times:
                    receive_times[(trigger.id, fname)] = time.time()

                received_triggers[key][trigger.source] = trigger

            # Only execute the functions for which we have received a schedule.
            # Everything else will wait.
            for tid, fname in list(trigger_keys):
                if fname not in queue or tid not in queue[fname]:
                    trigger_keys.remove((tid, fname))

            if len(trigger_keys) == 0:
                continue

            fref = None
            # Pick an arbitrary schedule to check; key[0] is the schedule id.
            schedule = queue[fname][list(trigger_keys)[0][0]]
            # Check to see what type of execution this function is.
            for ref in schedule.dag.functions:
                if ref.name == fname:
                    fref = ref
                    break

            # Compile a list of all the trigger sets for which we have
            # enough triggers.
            trigger_sets = []
            schedules = []
            for key in trigger_keys:
                if (len(received_triggers[key]) == len(schedule.triggers)) or \
                        fref.type == MULTIEXEC:

                    if fref.type == MULTIEXEC:
                        triggers = [trigger]
                    else:
                        triggers = list(received_triggers[key].values())

                    if fname not in function_cache:
                        logging.error('%s not in function cache', fname)
                        utils.generate_error_response(schedule, client, fname)
                        continue

                    trigger_sets.append(triggers)
                    schedule = queue[fname][key[0]]
                    schedules.append(schedule)

            exec_start = time.time()
            # logging.info(f'Executor timer. dag_exec_socket exec_dag: {exec_start}')
            # Pass all of the trigger_sets into exec_dag_function at once.
            # We also include the batching variable so that we know whether to
            # pass lists into the function or not.
            if len(trigger_sets) > 0:
                successes = exec_dag_function(pusher_cache, client,
                                              trigger_sets,
                                              function_cache[fname], schedules,
                                              user_library, dag_runtimes,
                                              cache, schedulers, batching)
                user_library.close()
                del received_triggers[key]

                for key, success in zip(trigger_keys, successes):
                    if success:
                        del queue[fname][key[0]]  # key[0] is trigger.id.

                        fend = time.time()
                        fstart = receive_times[key]

                        average_time = (fend - work_start) / len(trigger_keys)

                        runtimes[fname].append(average_time)
                        exec_counts[fname] += 1

                        finished_executions[(schedule.id, fname)] = time.time()

            elapsed = time.time() - work_start
            event_occupancy['dag_exec'] += elapsed
            total_occupancy += elapsed

        if self_depart_socket in socks and socks[self_depart_socket] == \
                zmq.POLLIN:
            # This message does not matter.
            self_depart_socket.recv()

            logging.info('Preparing to depart. No longer accepting requests ' +
                         'and clearing all queues.')

            status.ClearField('functions')
            status.running = False
            utils.push_status(schedulers, pusher_cache, status)

            departing = True

        # periodically report function occupancy
        report_end = time.time()
        if report_end - report_start > REPORT_THRESH:
            if len(cache) > 100:
                extra_keys = list(cache.keys())[:len(cache) - 100]
                for key in extra_keys:
                    del cache[key]

            utilization = total_occupancy / (report_end - report_start)
            status.utilization = utilization

            # Periodically report my status to schedulers with the utilization
            # set.
            utils.push_status(schedulers, pusher_cache, status)

            logging.debug('Total thread occupancy: %.6f' % (utilization))

            for event in event_occupancy:
                occ = event_occupancy[event] / (report_end - report_start)
                logging.debug('\tEvent %s occupancy: %.6f' % (event, occ))
                event_occupancy[event] = 0.0

            stats = ExecutorStatistics()
            for fname in runtimes:
                if exec_counts[fname] > 0:
                    fstats = stats.functions.add()
                    fstats.name = fname
                    fstats.call_count = exec_counts[fname]
                    fstats.runtime.extend(runtimes[fname])

                runtimes[fname].clear()
                exec_counts[fname] = 0

            for dname in dag_runtimes:
                dstats = stats.dags.add()
                dstats.name = dname

                dstats.runtimes.extend(dag_runtimes[dname])

                dag_runtimes[dname].clear()

            # If we are running in cluster mode, mgmt_ip will be set, and we
            # will report our status and statistics to it. Otherwise, we just
            # log the statistics locally.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

                sckt = pusher_cache.get(utils.get_util_report_address(mgmt_ip))
                sckt.send(status.SerializeToString())
            else:
                logging.info(stats)

            status.ClearField('utilization')
            report_start = time.time()
            total_occupancy = 0.0

            # Periodically clear any old functions we have cached that we are
            # no longer accepting requests for.
            del_list = []
            for fname in queue:
                if len(queue[fname]) == 0 and fname not in status.functions:
                    del_list.append(fname)
                    del function_cache[fname]
                    del runtimes[fname]
                    del exec_counts[fname]

            for fname in del_list:
                del queue[fname]

            del_list = []
            for tid in finished_executions:
                if (time.time() - finished_executions[tid]) > 10:
                    del_list.append(tid)

            for tid in del_list:
                del finished_executions[tid]

            # If we are departing and have cleared our queues, let the
            # management server know, and exit the process.
            if departing and len(queue) == 0:
                sckt = pusher_cache.get(utils.get_depart_done_addr(mgmt_ip))
                sckt.send_string(ip)

                # We specifically pass 1 as the exit code when ending our
                # process so that the wrapper script does not restart us.
                sys.exit(1)
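
The dag_exec_socket branch above drains up to BATCH_SIZE_MAX messages per poll with non-blocking receives. Below is a stripped-down sketch of that dequeue pattern in isolation, assuming only a zmq.PULL socket; it is illustrative and not part of the executor.

import zmq

def drain(sock, count):
    # Dequeue up to `count` messages without blocking, mirroring the
    # DONTWAIT/EAGAIN loop in the executor above.
    messages = []
    for _ in range(count):
        try:
            msg = sock.recv(zmq.DONTWAIT)
        except zmq.ZMQError as e:
            if e.errno == zmq.EAGAIN:
                break  # No more messages queued.
            raise  # Unexpected error.
        messages.append(msg)
    return messages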
Example #8
    def test_exec_causal_dag_non_sink_with_ref(self):
        '''
        Creates and executes a non-sink function in a causal-mode DAG. This
        version accesses a KVS key, so we ensure that data is appropriately
        cached and the metadata is passed downstream.
        '''

        # Create two functions intended to be used in sequence.
        def incr(_, x):
            return x + 1

        iname = 'incr'

        def square(_, x):
            return x * x

        sname = 'square'

        # Put the argument into the KVS.
        arg_name = 'arg'
        arg_value = 1
        arg = serializer.dump_lattice(arg_value, MultiKeyCausalLattice)
        self.kvs_client.put(arg_name, arg)

        # Create a DAG and a trigger for the first function in the DAG.
        dag = create_linear_dag([incr, square], [iname, sname],
                                self.kvs_client, 'dag', MultiKeyCausalLattice)
        schedule, triggers = self._create_fn_schedule(
            dag, CloudburstReference(arg_name, True), iname, [iname, sname],
            MULTI)

        exec_dag_function(self.pusher_cache, self.kvs_client, triggers, incr,
                          schedule, self.user_library, {}, {})

        # Assert that there has been a message sent.
        self.assertEqual(len(self.pusher_cache.socket.outbox), 1)

        # Extract that message and check its contents.
        trigger = DagTrigger()
        trigger.ParseFromString(self.pusher_cache.socket.outbox[0])
        self.assertEqual(trigger.id, schedule.id)
        self.assertEqual(trigger.target_function, sname)
        self.assertEqual(trigger.source, iname)
        self.assertEqual(len(trigger.arguments.values), 1)

        # Check the metadata of the key that is cached here after execution.
        locs = trigger.version_locations
        self.assertEqual(len(locs), 1)
        self.assertTrue(self.ip in locs.keys())
        self.assertEqual(len(locs[self.ip].keys), 1)
        kv = locs[self.ip].keys[0]
        self.assertEqual(kv.key, arg_name)
        self.assertEqual(VectorClock(dict(kv.vector_clock), True),
                         arg.vector_clock)

        # Check the metadata of the causal dependency passed downstream.
        self.assertEqual(len(trigger.dependencies), 1)
        kv = trigger.dependencies[0]
        self.assertEqual(kv.key, arg_name)
        self.assertEqual(VectorClock(dict(kv.vector_clock), True),
                         arg.vector_clock)

        val = serializer.load(trigger.arguments.values[0])
        self.assertEqual(val, incr('', arg_value))
Example #9
def executor(ip, mgmt_ip, schedulers, thread_id):
    logging.basicConfig(filename='log_executor.txt',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s')

    context = zmq.Context(1)
    poller = zmq.Poller()

    pin_socket = context.socket(zmq.PULL)
    pin_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.PIN_PORT + thread_id))

    unpin_socket = context.socket(zmq.PULL)
    unpin_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                      (sutils.UNPIN_PORT + thread_id))

    exec_socket = context.socket(zmq.PULL)
    exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                     (sutils.FUNC_EXEC_PORT + thread_id))

    dag_queue_socket = context.socket(zmq.PULL)
    dag_queue_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                          (sutils.DAG_QUEUE_PORT + thread_id))

    dag_exec_socket = context.socket(zmq.PULL)
    dag_exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                         (sutils.DAG_EXEC_PORT + thread_id))

    self_depart_socket = context.socket(zmq.PULL)
    self_depart_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                            (sutils.SELF_DEPART_PORT + thread_id))

    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(pin_socket, zmq.POLLIN)
    poller.register(unpin_socket, zmq.POLLIN)
    poller.register(exec_socket, zmq.POLLIN)
    poller.register(dag_queue_socket, zmq.POLLIN)
    poller.register(dag_exec_socket, zmq.POLLIN)
    poller.register(self_depart_socket, zmq.POLLIN)

    # If the management IP is set to None, that means that we are running in
    # local mode, so we use a regular AnnaTcpClient rather than an IPC client.
    if mgmt_ip:
        client = AnnaIpcClient(thread_id, context)
    else:
        client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=1)

    user_library = CloudburstUserLibrary(context, pusher_cache, ip, thread_id,
                                         client)

    status = ThreadStatus()
    status.ip = ip
    status.tid = thread_id
    status.running = True
    utils.push_status(schedulers, pusher_cache, status)

    departing = False

    # Maintains a request queue for each function pinned on this executor. Each
    # function will have a set of request IDs mapped to it, and this map stores
    # a schedule for each request ID.
    queue = {}

    # Tracks the actual function objects that are pinned to this executor.
    function_cache = {}

    # Tracks the runtime cost of executing a DAG function.
    runtimes = {}

    # If multiple triggers are necessary for a function, track the triggers as
    # we receive them. This is also used if a trigger arrives before its
    # corresponding schedule.
    received_triggers = {}

    # Tracks when we received a function request, so we can report end-to-end
    # latency for the whole execution.
    receive_times = {}

    # Tracks the number of requests we are finishing for each function pinned
    # here.
    exec_counts = {}

    # Tracks the end-to-end runtime of each DAG request for which we are the
    # sink function.
    dag_runtimes = {}

    # A map with KVS keys and their corresponding deserialized payloads.
    cache = {}

    # Internal metadata to track thread utilization.
    report_start = time.time()
    event_occupancy = {
        'pin': 0.0,
        'unpin': 0.0,
        'func_exec': 0.0,
        'dag_queue': 0.0,
        'dag_exec': 0.0
    }
    total_occupancy = 0.0

    while True:
        socks = dict(poller.poll(timeout=1000))

        if pin_socket in socks and socks[pin_socket] == zmq.POLLIN:
            work_start = time.time()
            pin(pin_socket, pusher_cache, client, status, function_cache,
                runtimes, exec_counts, user_library)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['pin'] += elapsed
            total_occupancy += elapsed

        if unpin_socket in socks and socks[unpin_socket] == zmq.POLLIN:
            work_start = time.time()
            unpin(unpin_socket, status, function_cache, runtimes, exec_counts)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['unpin'] += elapsed
            total_occupancy += elapsed

        if exec_socket in socks and socks[exec_socket] == zmq.POLLIN:
            work_start = time.time()
            exec_function(exec_socket, client, user_library, cache,
                          function_cache)
            user_library.close()

            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['func_exec'] += elapsed
            total_occupancy += elapsed

        if dag_queue_socket in socks and socks[dag_queue_socket] == zmq.POLLIN:
            work_start = time.time()

            schedule = DagSchedule()
            schedule.ParseFromString(dag_queue_socket.recv())
            fname = schedule.target_function

            logging.info('Received a schedule for DAG %s (%s), function %s.' %
                         (schedule.dag.name, schedule.id, fname))

            if fname not in queue:
                queue[fname] = {}

            queue[fname][schedule.id] = schedule

            if (schedule.id, fname) not in receive_times:
                receive_times[(schedule.id, fname)] = time.time()

            # In case we receive the trigger before we receive the schedule, we
            # can trigger from this operation as well.
            trkey = (schedule.id, fname)
            if (trkey in received_triggers and
                (len(received_triggers[trkey]) == len(schedule.triggers))):

                exec_dag_function(pusher_cache, client,
                                  received_triggers[trkey],
                                  function_cache[fname], schedule,
                                  user_library, dag_runtimes, cache)
                user_library.close()

                del received_triggers[trkey]
                del queue[fname][schedule.id]

                fend = time.time()
                fstart = receive_times[(schedule.id, fname)]
                runtimes[fname].append(fend - fstart)
                exec_counts[fname] += 1

            elapsed = time.time() - work_start
            event_occupancy['dag_queue'] += elapsed
            total_occupancy += elapsed

        if dag_exec_socket in socks and socks[dag_exec_socket] == zmq.POLLIN:
            work_start = time.time()
            trigger = DagTrigger()
            trigger.ParseFromString(dag_exec_socket.recv())

            fname = trigger.target_function
            logging.info('Received a trigger for schedule %s, function %s.' %
                         (trigger.id, fname))

            key = (trigger.id, fname)
            if key not in received_triggers:
                received_triggers[key] = {}

            if (trigger.id, fname) not in receive_times:
                receive_times[(trigger.id, fname)] = time.time()

            received_triggers[key][trigger.source] = trigger
            if fname in queue and trigger.id in queue[fname]:
                schedule = queue[fname][trigger.id]
                if len(received_triggers[key]) == len(schedule.triggers):
                    exec_dag_function(pusher_cache, client,
                                      received_triggers[key],
                                      function_cache[fname], schedule,
                                      user_library, dag_runtimes, cache)
                    user_library.close()

                    del received_triggers[key]
                    del queue[fname][trigger.id]

                    fend = time.time()
                    fstart = receive_times[(trigger.id, fname)]
                    runtimes[fname].append(fend - fstart)
                    exec_counts[fname] += 1

            elapsed = time.time() - work_start
            event_occupancy['dag_exec'] += elapsed
            total_occupancy += elapsed

        if self_depart_socket in socks and socks[self_depart_socket] == \
                zmq.POLLIN:
            # This message does not matter.
            self_depart_socket.recv()

            logging.info('Preparing to depart. No longer accepting requests ' +
                         'and clearing all queues.')

            status.ClearField('functions')
            status.running = False
            utils.push_status(schedulers, pusher_cache, status)

            departing = True

        # periodically report function occupancy
        report_end = time.time()
        if report_end - report_start > REPORT_THRESH:
            cache.clear()

            utilization = total_occupancy / (report_end - report_start)
            status.utilization = utilization

            # Periodically report my status to schedulers with the utilization
            # set.
            utils.push_status(schedulers, pusher_cache, status)

            logging.info('Total thread occupancy: %.6f' % (utilization))

            for event in event_occupancy:
                occ = event_occupancy[event] / (report_end - report_start)
                logging.info('\tEvent %s occupancy: %.6f' % (event, occ))
                event_occupancy[event] = 0.0

            stats = ExecutorStatistics()
            for fname in runtimes:
                if exec_counts[fname] > 0:
                    fstats = stats.functions.add()
                    fstats.name = fname
                    fstats.call_count = exec_counts[fname]
                    fstats.runtime.extend(runtimes[fname])

                runtimes[fname].clear()
                exec_counts[fname] = 0

            for dname in dag_runtimes:
                dstats = stats.dags.add()
                dstats.name = dname

                dstats.runtimes.extend(dag_runtimes[dname])

                dag_runtimes[dname].clear()

            # If we are running in cluster mode, mgmt_ip will be set, and we
            # will report our status and statistics to it. Otherwise, we just
            # log the statistics locally.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

                sckt = pusher_cache.get(utils.get_util_report_address(mgmt_ip))
                sckt.send(status.SerializeToString())
            else:
                logging.info(stats)

            status.ClearField('utilization')
            report_start = time.time()
            total_occupancy = 0.0

            # Periodically clear any old functions we have cached that we are
            # no longer accepting requests for.
            del_list = []
            for fname in queue:
                if len(queue[fname]) == 0 and fname not in status.functions:
                    del_list.append(fname)
                    del function_cache[fname]
                    del runtimes[fname]
                    del exec_counts[fname]

            for fname in del_list:
                del queue[fname]

            # If we are departing and have cleared our queues, let the
            # management server know, and exit the process.
            if departing and len(queue) == 0:
                sckt = pusher_cache.get(utils.get_depart_done_addr(mgmt_ip))
                sckt.send_string(ip)

                # We specifically pass 1 as the exit code when ending our
                # process so that the wrapper script does not restart us.
                os._exit(1)
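
Both executor variants rendezvous schedules and triggers on the (schedule id, function name) key: whichever side arrives last fires the execution once one trigger per entry in schedule.triggers is present. The sketch below condenses that bookkeeping with plain dicts and a stand-in for exec_dag_function; it is illustrative only and omits batching, error handling, and cleanup timers.

# Illustrative sketch of the schedule/trigger rendezvous used by the executor
# loops above. `queue` and `received_triggers` mirror the maps of the same
# names; `execute` stands in for exec_dag_function.

def execute(schedule, triggers):
    print('executing %s with %d triggers' %
          (schedule.target_function, len(triggers)))

def on_schedule(queue, received_triggers, schedule):
    queue.setdefault(schedule.target_function, {})[schedule.id] = schedule
    _maybe_execute(queue, received_triggers, schedule.id,
                   schedule.target_function)

def on_trigger(queue, received_triggers, trigger):
    key = (trigger.id, trigger.target_function)
    received_triggers.setdefault(key, {})[trigger.source] = trigger
    _maybe_execute(queue, received_triggers, trigger.id,
                   trigger.target_function)

def _maybe_execute(queue, received_triggers, sid, fname):
    schedule = queue.get(fname, {}).get(sid)
    triggers = received_triggers.get((sid, fname), {})

    # Fire only when both the schedule and one trigger per upstream source
    # listed in schedule.triggers have arrived.
    if schedule is not None and len(triggers) == len(schedule.triggers):
        execute(schedule, list(triggers.values()))
        del received_triggers[(sid, fname)]
        del queue[fname][sid]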