Exemple #1
0
    def __init__(self, func_addr, ip, tid=0, local=False):
        '''
        Sets up the client's KVS client and ZeroMQ sockets.

        func_addr: The address of the Cloudburst interface, either localhost or
        the address of an AWS ELB in cluster mode.
        ip: The IP address of the client machine -- used to send and receive
        responses.
        tid: If multiple clients are running on the same machine, they will
        need to use unique IDs.
        local: A boolean representing whether the client is interacting with
        the cluster in local or cluster mode.
        '''

        self.service_addr = 'tcp://' + func_addr + ':%d'
        self.context = zmq.Context(1)

        # Keep calling _connect() until it hands back a non-empty KVS address.
        while True:
            kvs_addr = self._connect()
            if kvs_addr:
                break

            logging.info('Connection timed out, retrying')
            print('Connection timed out, retrying')

        # The KVS client is offset by tid + 10, mostly to alleviate port
        # conflicts when running in local mode.
        self.kvs_client = AnnaTcpClient(kvs_addr, ip, local=local,
                                        offset=tid + 10)

        # One REQ socket per service endpoint, created in a fixed order and
        # stored under the attribute name the rest of the client expects.
        request_endpoints = (
            ('func_create_sock', FUNC_CREATE_PORT),
            ('func_call_sock', FUNC_CALL_PORT),
            ('list_sock', LIST_PORT),
            ('dag_create_sock', DAG_CREATE_PORT),
            ('dag_call_sock', DAG_CALL_PORT),
            ('dag_delete_sock', DAG_DELETE_PORT),
        )
        for attr_name, port in request_endpoints:
            req_sock = self.context.socket(zmq.REQ)
            setattr(self, attr_name, req_sock)
            req_sock.connect(self.service_addr % port)

        # PULL socket on which direct responses are received; receives give up
        # after RCVTIMEO (1000 ms).
        response_port = 9000 + tid
        self.response_sock = self.context.socket(zmq.PULL)
        self.response_sock.setsockopt(zmq.RCVTIMEO, 1000)
        self.response_sock.bind('tcp://*:' + str(response_port))

        # Address handed to the cluster so it can push responses back to us.
        self.response_address = 'tcp://' + ip + ':' + str(response_port)

        # Request ID counter, starting at zero.
        self.rid = 0
Exemple #2
0
class CloudburstConnection():
    '''
    Client-side handle to a Cloudburst deployment. Holds one ZeroMQ REQ socket
    per service endpoint, a PULL socket for direct responses, and a KVS client
    used to read and write objects.
    '''

    def __init__(self, func_addr, ip, tid=0, local=False):
        '''
        func_addr: The address of the Cloudburst interface, either localhost or
        the address of an AWS ELB in cluster mode.
        ip: The IP address of the client machine -- used to send and receive
        responses.
        tid: If multiple clients are running on the same machine, they will
        need to use unique IDs.
        local: A boolean representing whether the client is interacting with
        the cluster in local or cluster mode.
        '''

        self.service_addr = 'tcp://' + func_addr + ':%d'
        self.context = zmq.Context(1)
        kvs_addr = self._connect()

        # The KVS client is offset by tid + 10, mostly to alleviate port
        # conflicts when running in local mode.
        self.kvs_client = AnnaTcpClient(kvs_addr,
                                        ip,
                                        local=local,
                                        offset=tid + 10)

        self.func_create_sock = self.context.socket(zmq.REQ)
        self.func_create_sock.connect(self.service_addr % FUNC_CREATE_PORT)

        self.func_call_sock = self.context.socket(zmq.REQ)
        self.func_call_sock.connect(self.service_addr % FUNC_CALL_PORT)

        self.list_sock = self.context.socket(zmq.REQ)
        self.list_sock.connect(self.service_addr % LIST_PORT)

        self.dag_create_sock = self.context.socket(zmq.REQ)
        self.dag_create_sock.connect(self.service_addr % DAG_CREATE_PORT)

        self.dag_call_sock = self.context.socket(zmq.REQ)
        self.dag_call_sock.connect(self.service_addr % DAG_CALL_PORT)

        self.dag_delete_sock = self.context.socket(zmq.REQ)
        self.dag_delete_sock.connect(self.service_addr % DAG_DELETE_PORT)

        # Direct responses are pulled from this socket; a receive gives up
        # after RCVTIMEO (1000 ms).
        self.response_sock = self.context.socket(zmq.PULL)
        response_port = 9000 + tid
        self.response_sock.setsockopt(zmq.RCVTIMEO, 1000)
        self.response_sock.bind('tcp://*:' + str(response_port))

        self.response_address = 'tcp://' + ip + ':' + str(response_port)

        # Request ID attached to each exec_func call; incremented per call.
        self.rid = 0

    def list(self, prefix=None):
        '''
        Prints the names of all the functions registered in the system, one
        per line.

        prefix: An optional argument which, if specified, prunes the list of
        printed functions to match the provided prefix.
        '''

        for fname in self._get_func_list(prefix):
            print(fname)

    def get_function(self, name):
        '''
        Retrieves a handle for an individual function. Returns None if the
        function cannot be found in the system. The returned object can be
        called like a regular Python function, which returns a CloudburstFuture.

        name: The name of the function to retrieve.
        '''
        if name not in self._get_func_list():
            print(f'''No function found with name {name}. To view all
                  functions, use the `list` method.''')
            return None

        return CloudburstFunction(name, self, self.kvs_client)

    def register(self, function, name):
        '''
        Registers a new function or class with the system. The returned object
        can be called like a regular Python function, which returns a Cloudburst
        Future. If the input is a class, the class is expected to have a run
        method, which is what is invoked at runtime.

        function: The function object that we are registering.
        name: A unique name for the function to be stored with in the system.

        Raises a RuntimeError if the system reports that registration failed.
        '''

        func = Function()
        func.name = name
        func.body = serializer.dump(function)

        self.func_create_sock.send(func.SerializeToString())

        resp = GenericResponse()
        resp.ParseFromString(self.func_create_sock.recv())

        if not resp.success:
            raise RuntimeError(
                f'Unexpected error while registering function: {resp}.')

        return CloudburstFunction(name, self, self.kvs_client)

    def register_dag(self, name, functions, connections):
        '''
        Registers a new DAG with the system. This operation will fail if any of
        the functions provided cannot be identified in the system.

        name: A unique name for this DAG.
        functions: A list of functions to be included in this DAG. Each entry
        is either a function name or a tuple of (function name, list of invalid
        results); tuple entries are registered as MULTIEXEC functions.
        connections: A list of ordered pairs of function names that represent
        the edges in this DAG.

        Returns a (success, error) pair taken from the system's response.
        Raises a RuntimeError if any function is not registered.
        '''

        known_functions = set(self._get_func_list())

        # Normalize every entry once so that validation and construction agree
        # on what counts as a tuple entry.
        entries = []
        for function in functions:
            is_multi = isinstance(function, tuple)
            fname = function[0] if is_multi else function

            if fname not in known_functions:
                raise RuntimeError(
                    f'Function {fname} not registered. Please register before '
                    + 'including it in a DAG.')

            entries.append((function, fname, is_multi))

        dag = Dag()
        dag.name = name
        for function, fname, is_multi in entries:
            ref = dag.functions.add()

            if is_multi:
                invalids = function[1]
                ref.type = MULTIEXEC
            else:
                invalids = []

            ref.name = fname
            for invalid in invalids:
                ref.invalid_results.append(serializer.dump(invalid))

        for pair in connections:
            conn = dag.connections.add()
            conn.source = pair[0]
            conn.sink = pair[1]

        self.dag_create_sock.send(dag.SerializeToString())

        r = GenericResponse()
        r.ParseFromString(self.dag_create_sock.recv())

        return r.success, r.error

    def call_dag(self,
                 dname,
                 arg_map,
                 direct_response=False,
                 consistency=NORMAL,
                 output_key=None,
                 client_id=None):
        '''
        Issues a new request to execute the DAG.

        With direct_response=False, returns a CloudburstFuture for the result
        if the request was accepted and None otherwise. With
        direct_response=True, waits on the response socket and returns the
        deserialized result, or None if nothing arrives before the socket's
        receive timeout.

        dname: The name of the DAG to execute.
        arg_map: A map from function names to lists of arguments for each of
        the functions in the DAG. A value that is not a list is treated as a
        single argument.
        direct_response: If True, the response will be synchronously received
        by the client; otherwise, the result will be stored in the KVS.
        consistency: The consistency mode to use with this function: either
        NORMAL or MULTI.
        output_key: The KVS key in which to store the result of this DAG.
        client_id: An optional ID associated with an individual client across
        requests; this is used for causal metadata.
        '''
        dc = DagCall()
        dc.name = dname
        dc.consistency = consistency

        if output_key:
            dc.output_key = output_key

        if client_id:
            dc.client_id = client_id

        for fname, fname_args in arg_map.items():
            if not isinstance(fname_args, list):
                fname_args = [fname_args]
            args = [
                serializer.dump(arg, serialize=False) for arg in fname_args
            ]
            al = dc.function_args[fname]
            al.values.extend(args)

        if direct_response:
            dc.response_address = self.response_address

        self.dag_call_sock.send(dc.SerializeToString())

        r = GenericResponse()
        r.ParseFromString(self.dag_call_sock.recv())

        if direct_response:
            try:
                result = self.response_sock.recv()
            except zmq.ZMQError as e:
                # EAGAIN means the receive timed out without a response.
                if e.errno == zmq.EAGAIN:
                    return None
                raise

            return serializer.load(result)

        if r.success:
            return CloudburstFuture(r.response_id, self.kvs_client,
                                    serializer)

        return None

    def delete_dag(self, dname):
        '''
        Removes the specified DAG from the system. Returns a (success, error)
        pair taken from the system's response.

        dname: The name of the DAG to delete.
        '''
        self.dag_delete_sock.send_string(dname)

        r = GenericResponse()
        r.ParseFromString(self.dag_delete_sock.recv())

        return r.success, r.error

    def get_object(self, key):
        '''
        Retrieves an arbitrary key from the KVS, automatically deserializes it,
        and returns the value to the user.
        '''
        lattice = self.kvs_client.get(key)[key]
        return serializer.load_lattice(lattice)

    def put_object(self, key, value):
        '''
        Automatically wraps an object in a lattice and puts it into the
        key-value store at the desired key.
        '''
        lattice = serializer.dump_lattice(value)
        return self.kvs_client.put(key, lattice)

    def exec_func(self, name, args):
        '''
        Sends a request to execute a single function and returns the response
        ID reported by the system.

        name: The name of the function to call.
        args: An iterable of arguments; each one is serialized into the call.
        '''
        call = FunctionCall()
        call.name = name
        call.request_id = self.rid

        for arg in args:
            argobj = call.arguments.values.add()
            serializer.dump(arg, argobj)

        self.func_call_sock.send(call.SerializeToString())

        r = GenericResponse()
        r.ParseFromString(self.func_call_sock.recv())

        self.rid += 1
        return r.response_id

    def _connect(self):
        '''
        Sends an empty request to the connect endpoint and returns the string
        it replies with (used by __init__ as the KVS address).
        '''
        sckt = self.context.socket(zmq.REQ)
        try:
            sckt.connect(self.service_addr % CONNECT_PORT)
            sckt.send_string('')

            return sckt.recv_string()
        finally:
            # This socket is only needed for a single exchange; close it so
            # that repeated calls do not leak sockets.
            sckt.close()

    def _get_func_list(self, prefix=None):
        '''
        Asks the system for the registered function names, optionally limited
        to those matching prefix, and returns them.
        '''
        msg = prefix if prefix else ''
        self.list_sock.send_string(msg)

        flist = StringSet()
        flist.ParseFromString(self.list_sock.recv())
        return flist.keys
Exemple #3
0
def executor(ip, mgmt_ip, schedulers, thread_id):
    '''
    Runs the event loop of one executor thread. The loop polls the pin, unpin,
    function-exec, DAG-schedule, DAG-trigger and self-depart sockets, handles
    whatever arrived, and periodically reports status and statistics.

    ip: The IP address of this machine; reported in the thread status and
    passed to the KVS client and user library.
    mgmt_ip: The address of the management server. If it is not set, we are
    running in local mode.
    schedulers: The schedulers to which this executor pushes its status.
    thread_id: This executor's thread ID; added to each base port to pick the
    ports this thread binds to.

    Does not return; the process exits via sys.exit(1) once the executor has
    been asked to depart and its queues are empty.
    '''
    logging.basicConfig(filename='log_executor.txt',
                        level=logging.INFO,
                        filemode="w",
                        format='%(asctime)s %(message)s')

    # Check what resources we have access to, set as an environment variable.
    if os.getenv('EXECUTOR_TYPE', 'CPU') == 'GPU':
        exec_type = GPU
    else:
        exec_type = CPU

    context = zmq.Context(1)

    pin_socket = context.socket(zmq.PULL)
    pin_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.PIN_PORT + thread_id))

    unpin_socket = context.socket(zmq.PULL)
    unpin_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                      (sutils.UNPIN_PORT + thread_id))

    exec_socket = context.socket(zmq.PULL)
    exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                     (sutils.FUNC_EXEC_PORT + thread_id))

    dag_queue_socket = context.socket(zmq.PULL)
    dag_queue_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                          (sutils.DAG_QUEUE_PORT + thread_id))

    dag_exec_socket = context.socket(zmq.PULL)
    dag_exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                         (sutils.DAG_EXEC_PORT + thread_id))

    self_depart_socket = context.socket(zmq.PULL)
    self_depart_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                            (sutils.SELF_DEPART_PORT + thread_id))

    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(pin_socket, zmq.POLLIN)
    poller.register(unpin_socket, zmq.POLLIN)
    poller.register(exec_socket, zmq.POLLIN)
    poller.register(dag_queue_socket, zmq.POLLIN)
    poller.register(dag_exec_socket, zmq.POLLIN)
    poller.register(self_depart_socket, zmq.POLLIN)

    # If the management IP is set to None, that means that we are running in
    # local mode, so we use a regular AnnaTcpClient rather than an IPC client.
    has_ephe = False
    if mgmt_ip:
        # STORAGE_OR_DEFAULT == '0' selects a TCP client that talks to the
        # address in ROUTE_ADDR instead of the IPC client.
        if os.environ.get('STORAGE_OR_DEFAULT') == '0':
            client = AnnaTcpClient(os.environ['ROUTE_ADDR'],
                                   ip,
                                   local=False,
                                   offset=thread_id)
            has_ephe = True
        else:
            client = AnnaIpcClient(thread_id, context)

        local = False
    else:
        client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=1)
        local = True

    user_library = CloudburstUserLibrary(context,
                                         pusher_cache,
                                         ip,
                                         thread_id,
                                         client,
                                         has_ephe=has_ephe)

    status = ThreadStatus()
    status.ip = ip
    status.tid = thread_id
    status.running = True
    status.type = exec_type
    utils.push_status(schedulers, pusher_cache, status)

    departing = False

    # Maintains a request queue for each function pinned on this executor. Each
    # function will have a set of request IDs mapped to it, and this map stores
    # a schedule for each request ID.
    queue = {}

    # Tracks the actual function objects that are pinned to this executor.
    function_cache = {}

    # Tracks runtime cost of executing a DAG function.
    runtimes = {}

    # If multiple triggers are necessary for a function, track the triggers as
    # we receive them. This is also used if a trigger arrives before its
    # corresponding schedule. Keyed by (schedule ID, function name).
    received_triggers = {}

    # Tracks when we first received a schedule or trigger for each
    # (schedule ID, function name) pair. Entries are dropped once the request
    # has been executed successfully.
    receive_times = {}

    # Tracks the number of requests we are finishing for each function pinned
    # here.
    exec_counts = {}

    # Tracks the end-to-end runtime of each DAG request for which we are the
    # sink function.
    dag_runtimes = {}

    # A map with KVS keys and their corresponding deserialized payloads.
    cache = {}

    # A map which tracks the most recent DAGs for which we have finished our
    # work. Keyed by (schedule ID, function name); the value is the finish
    # time.
    finished_executions = {}

    # Batching flag returned by pin(); when set, up to BATCH_SIZE_MAX triggers
    # are dequeued at once.
    batching = False

    # Internal metadata to track thread utilization.
    report_start = time.time()
    event_occupancy = {
        'pin': 0.0,
        'unpin': 0.0,
        'func_exec': 0.0,
        'dag_queue': 0.0,
        'dag_exec': 0.0
    }
    total_occupancy = 0.0

    while True:
        socks = dict(poller.poll(timeout=1000))

        if pin_socket in socks and socks[pin_socket] == zmq.POLLIN:
            work_start = time.time()
            batching = pin(pin_socket, pusher_cache, client, status,
                           function_cache, runtimes, exec_counts, user_library,
                           local, batching)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['pin'] += elapsed
            total_occupancy += elapsed

        if unpin_socket in socks and socks[unpin_socket] == zmq.POLLIN:
            work_start = time.time()
            unpin(unpin_socket, status, function_cache, runtimes, exec_counts)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['unpin'] += elapsed
            total_occupancy += elapsed

        if exec_socket in socks and socks[exec_socket] == zmq.POLLIN:
            work_start = time.time()
            exec_function(exec_socket,
                          client,
                          user_library,
                          cache,
                          function_cache,
                          has_ephe=has_ephe)
            user_library.close()

            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['func_exec'] += elapsed
            total_occupancy += elapsed

        if dag_queue_socket in socks and socks[dag_queue_socket] == zmq.POLLIN:
            work_start = time.time()
            logging.info(
                f'Executor timer. dag_queue_socket recv: {work_start}')
            # In order to effectively support batching, we have to make sure we
            # dequeue lots of schedules in addition to lots of triggers. Right
            # now, we're not going to worry about supporting batching here,
            # just on the trigger dequeue side, but we still have to dequeue
            # all schedules we've received. We just process them one at a time.
            while True:
                schedule = DagSchedule()
                try:
                    msg = dag_queue_socket.recv(zmq.DONTWAIT)
                except zmq.ZMQError as e:
                    if e.errno == zmq.EAGAIN:
                        break  # There are no more messages.
                    raise  # Unexpected error.

                schedule.ParseFromString(msg)
                fname = schedule.target_function

                logging.info(
                    'Received a schedule for DAG %s (%s), function %s.' %
                    (schedule.dag.name, schedule.id, fname))

                if fname not in queue:
                    queue[fname] = {}

                queue[fname][schedule.id] = schedule

                trkey = (schedule.id, fname)
                if trkey not in receive_times:
                    receive_times[trkey] = time.time()

                # Check to see what type of execution this function is.
                fref = None
                for ref in schedule.dag.functions:
                    if ref.name == fname:
                        fref = ref

                # Guard against the target function being absent from the
                # DAG's function list.
                multiexec = fref is not None and fref.type == MULTIEXEC

                # In case we receive the trigger before we receive the
                # schedule, we can trigger from this operation as well.
                if (trkey in received_triggers and
                        (len(received_triggers[trkey]) == len(
                            schedule.triggers) or multiexec)):

                    triggers = list(received_triggers[trkey].values())

                    if fname not in function_cache:
                        logging.error('%s not in function cache', fname)
                        utils.generate_error_response(schedule, client, fname)
                        continue

                    # We don't support actual batching for when we receive a
                    # schedule before a trigger, so everything is just a batch
                    # of size 1 if anything.
                    success = exec_dag_function(pusher_cache, client,
                                                [triggers],
                                                function_cache[fname],
                                                [schedule], user_library,
                                                dag_runtimes, cache,
                                                schedulers, batching)[0]
                    user_library.close()

                    del received_triggers[trkey]
                    if success:
                        del queue[fname][schedule.id]

                        fend = time.time()
                        runtimes[fname].append(fend - work_start)
                        exec_counts[fname] += 1

                        finished_executions[trkey] = time.time()
                        receive_times.pop(trkey, None)

            elapsed = time.time() - work_start
            event_occupancy['dag_queue'] += elapsed
            total_occupancy += elapsed

        if dag_exec_socket in socks and socks[dag_exec_socket] == zmq.POLLIN:
            work_start = time.time()

            # How many messages to dequeue -- BATCH_SIZE_MAX or 1 depending on
            # the function configuration.
            count = BATCH_SIZE_MAX if batching else 1

            trigger_keys = set()

            # The most recent trigger dequeued in this round for each key;
            # MULTIEXEC functions are run with just that trigger.
            latest_triggers = {}

            for _ in range(count):  # Dequeue count number of messages.
                trigger = DagTrigger()

                try:
                    msg = dag_exec_socket.recv(zmq.DONTWAIT)
                except zmq.ZMQError as e:
                    if e.errno == zmq.EAGAIN:  # There are no more messages.
                        break
                    raise  # Unexpected error.

                trigger.ParseFromString(msg)

                fname = trigger.target_function
                key = (trigger.id, fname)

                # We have received a repeated trigger for a function that has
                # already finished executing. finished_executions is keyed by
                # (schedule ID, function name), so look up the full key.
                if key in finished_executions:
                    continue

                logging.info(
                    'Received a trigger for schedule %s, function %s.' %
                    (trigger.id, fname))

                trigger_keys.add(key)
                if key not in received_triggers:
                    received_triggers[key] = {}

                if key not in receive_times:
                    receive_times[key] = time.time()

                received_triggers[key][trigger.source] = trigger
                latest_triggers[key] = trigger

            # Only execute the functions for which we have received a schedule.
            # Everything else will wait. Ready keys are grouped by target
            # function so that each batch is run against its own function.
            ready = {}
            for key in trigger_keys:
                sched_id, fname = key
                if fname in queue and sched_id in queue[fname]:
                    ready.setdefault(fname, []).append(key)

            for fname, keys in ready.items():
                # Compile a list of all the trigger sets for which we have
                # enough triggers, remembering which key each one belongs to.
                trigger_sets = []
                schedules = []
                executed_keys = []

                for key in keys:
                    schedule = queue[fname][key[0]]  # key[0] is trigger.id.

                    # Check to see what type of execution this function is.
                    fref = None
                    for ref in schedule.dag.functions:
                        if ref.name == fname:
                            fref = ref
                            break

                    multiexec = fref is not None and fref.type == MULTIEXEC

                    if not multiexec and (len(received_triggers[key]) != len(
                            schedule.triggers)):
                        continue  # Still waiting on more triggers.

                    if fname not in function_cache:
                        logging.error('%s not in function cache', fname)
                        utils.generate_error_response(schedule, client, fname)
                        continue

                    if multiexec:
                        triggers = [latest_triggers[key]]
                    else:
                        triggers = list(received_triggers[key].values())

                    trigger_sets.append(triggers)
                    schedules.append(schedule)
                    executed_keys.append(key)

                if not trigger_sets:
                    continue

                # Pass all of the trigger_sets into exec_dag_function at once.
                # We also include the batching variable to make sure we know
                # whether to pass lists into the fn or not.
                successes = exec_dag_function(pusher_cache, client,
                                              trigger_sets,
                                              function_cache[fname], schedules,
                                              user_library, dag_runtimes,
                                              cache, schedulers, batching)
                user_library.close()

                # Drop the triggers of every request that was run, and only of
                # those; requests still waiting keep what they have received.
                for key in executed_keys:
                    del received_triggers[key]

                for key, success in zip(executed_keys, successes):
                    if success:
                        del queue[fname][key[0]]  # key[0] is trigger.id.

                        fend = time.time()

                        # Spread the elapsed time evenly over the requests
                        # that were executed in this batch.
                        average_time = ((fend - work_start) /
                                        len(executed_keys))

                        runtimes[fname].append(average_time)
                        exec_counts[fname] += 1

                        finished_executions[key] = time.time()
                        receive_times.pop(key, None)

            elapsed = time.time() - work_start
            event_occupancy['dag_exec'] += elapsed
            total_occupancy += elapsed

        if self_depart_socket in socks and socks[self_depart_socket] == \
                zmq.POLLIN:
            # This message does not matter.
            self_depart_socket.recv()

            logging.info('Preparing to depart. No longer accepting requests ' +
                         'and clearing all queues.')

            status.ClearField('functions')
            status.running = False
            utils.push_status(schedulers, pusher_cache, status)

            departing = True

        # periodically report function occupancy
        report_end = time.time()
        if report_end - report_start > REPORT_THRESH:
            # Keep the payload cache at no more than 100 entries by dropping
            # the keys that come first in iteration order.
            if len(cache) > 100:
                extra_keys = list(cache.keys())[:len(cache) - 100]
                for key in extra_keys:
                    del cache[key]

            utilization = total_occupancy / (report_end - report_start)
            status.utilization = utilization

            # Periodically report my status to schedulers with the utilization
            # set.
            utils.push_status(schedulers, pusher_cache, status)

            logging.debug('Total thread occupancy: %.6f' % (utilization))

            for event in event_occupancy:
                occ = event_occupancy[event] / (report_end - report_start)
                logging.debug('\tEvent %s occupancy: %.6f' % (event, occ))
                event_occupancy[event] = 0.0

            stats = ExecutorStatistics()
            for fname in runtimes:
                if exec_counts[fname] > 0:
                    fstats = stats.functions.add()
                    fstats.name = fname
                    fstats.call_count = exec_counts[fname]
                    fstats.runtime.extend(runtimes[fname])

                runtimes[fname].clear()
                exec_counts[fname] = 0

            for dname in dag_runtimes:
                dstats = stats.dags.add()
                dstats.name = dname

                dstats.runtimes.extend(dag_runtimes[dname])

                dag_runtimes[dname].clear()

            # If we are running in cluster mode, mgmt_ip will be set, and we
            # will report our status and statistics to it. Otherwise, we will
            # write the statistics to the log.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

                sckt = pusher_cache.get(utils.get_util_report_address(mgmt_ip))
                sckt.send(status.SerializeToString())
            else:
                logging.info(stats)

            status.ClearField('utilization')
            report_start = time.time()
            total_occupancy = 0.0

            # Periodically clear any old functions we have cached that we are
            # no longer accepting requests for. pop() is used because an entry
            # may already be missing from any of these maps.
            del_list = []
            for fname in queue:
                if len(queue[fname]) == 0 and fname not in status.functions:
                    del_list.append(fname)
                    function_cache.pop(fname, None)
                    runtimes.pop(fname, None)
                    exec_counts.pop(fname, None)

            for fname in del_list:
                del queue[fname]

            # Forget finished executions that are more than 10 seconds old.
            del_list = []
            for tid in finished_executions:
                if (time.time() - finished_executions[tid]) > 10:
                    del_list.append(tid)

            for tid in del_list:
                del finished_executions[tid]

            # If we are departing and have cleared our queues, let the
            # management server know, and exit the process.
            if departing and len(queue) == 0:
                sckt = pusher_cache.get(utils.get_depart_done_addr(mgmt_ip))
                sckt.send_string(ip)

                # We specifically pass 1 as the exit code when ending our
                # process so that the wrapper script does not restart us.
                sys.exit(1)
Exemple #4
0
def scheduler(ip, mgmt_ip, route_addr):
    '''
    Runs the scheduler's main event loop. This function binds one socket per
    request type and then polls them forever; it never returns.

    ip: The IP address of this scheduler -- handed to the KVS client and the
    scheduling policy, and used to avoid sending peer updates to ourselves.
    mgmt_ip: The IP address of the management server. If it is None, we are
    running in local mode.
    route_addr: The KVS routing address -- used to create the KVS client and
    returned as-is to any client that connects.
    '''

    # If the management IP is not set, we are running in local mode.
    local = (mgmt_ip is None)
    kvs = AnnaTcpClient(route_addr, ip, local=local)

    scheduler_id = str(uuid.uuid4())

    context = zmq.Context(1)

    # A mapping from a DAG's name to a tuple of its protobuf representation
    # and the result of sched_utils.find_dag_source for that DAG.
    dags = {}

    # Tracks how often a request for each function is received. Keys are
    # always function *names* (strings).
    call_frequency = {}

    # Tracks the time interval between successive requests for a particular
    # DAG.
    interarrivals = {}

    # Tracks the most recent arrival for each DAG -- used to calculate
    # interarrival times.
    last_arrivals = {}

    # Maintains a list of all other schedulers in the system, so we can
    # propagate metadata to them.
    schedulers = []

    # REP sockets: each request received on these must be answered with
    # exactly one reply.
    connect_socket = context.socket(zmq.REP)
    connect_socket.bind(sutils.BIND_ADDR_TEMPLATE % (CONNECT_PORT))

    func_create_socket = context.socket(zmq.REP)
    func_create_socket.bind(sutils.BIND_ADDR_TEMPLATE % (FUNC_CREATE_PORT))

    func_call_socket = context.socket(zmq.REP)
    func_call_socket.bind(sutils.BIND_ADDR_TEMPLATE % (FUNC_CALL_PORT))

    dag_create_socket = context.socket(zmq.REP)
    dag_create_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_CREATE_PORT))

    dag_call_socket = context.socket(zmq.REP)
    dag_call_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_CALL_PORT))

    dag_delete_socket = context.socket(zmq.REP)
    dag_delete_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_DELETE_PORT))

    list_socket = context.socket(zmq.REP)
    list_socket.bind(sutils.BIND_ADDR_TEMPLATE % (LIST_PORT))

    # PULL sockets: one-way messages from executors and other schedulers.
    exec_status_socket = context.socket(zmq.PULL)
    exec_status_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.STATUS_PORT))

    sched_update_socket = context.socket(zmq.PULL)
    sched_update_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                             (sutils.SCHED_UPDATE_PORT))

    # This socket is not registered with the poller below; it is handed to
    # the policy engine instead, with a receive timeout set on it.
    pin_accept_socket = context.socket(zmq.PULL)
    pin_accept_socket.setsockopt(zmq.RCVTIMEO, 500)
    pin_accept_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                           (sutils.PIN_ACCEPT_PORT))

    requestor_cache = SocketCache(context, zmq.REQ)
    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(connect_socket, zmq.POLLIN)
    poller.register(func_create_socket, zmq.POLLIN)
    poller.register(func_call_socket, zmq.POLLIN)
    poller.register(dag_create_socket, zmq.POLLIN)
    poller.register(dag_call_socket, zmq.POLLIN)
    poller.register(dag_delete_socket, zmq.POLLIN)
    poller.register(list_socket, zmq.POLLIN)
    poller.register(exec_status_socket, zmq.POLLIN)
    poller.register(sched_update_socket, zmq.POLLIN)

    # Start the policy engine.
    policy = DefaultCloudburstSchedulerPolicy(pin_accept_socket,
                                              pusher_cache,
                                              kvs,
                                              ip,
                                              local=local)
    policy.update()

    start = time.time()

    while True:
        socks = dict(poller.poll(timeout=1000))

        if connect_socket in socks and socks[connect_socket] == zmq.POLLIN:
            # The request body is ignored, but it still has to be received
            # before we are allowed to reply on a REP socket.
            connect_socket.recv_string()
            connect_socket.send_string(route_addr)

        if (func_create_socket in socks
                and socks[func_create_socket] == zmq.POLLIN):
            create_function(func_create_socket, kvs)

        if func_call_socket in socks and socks[func_call_socket] == zmq.POLLIN:
            call_function(func_call_socket, pusher_cache, policy)

        if (dag_create_socket in socks
                and socks[dag_create_socket] == zmq.POLLIN):
            create_dag(dag_create_socket, pusher_cache, kvs, dags, policy,
                       call_frequency)

        if dag_call_socket in socks and socks[dag_call_socket] == zmq.POLLIN:
            call = DagCall()
            call.ParseFromString(dag_call_socket.recv())

            name = call.name

            # Interarrival times are recorded before the DAG is validated, so
            # requests for unknown DAGs are counted here as well.
            t = time.time()
            if name in last_arrivals:
                interarrivals.setdefault(name, []).append(
                    t - last_arrivals[name])

            last_arrivals[name] = t

            if name not in dags:
                resp = GenericResponse()
                resp.success = False
                resp.error = NO_SUCH_DAG

                # We deliberately do not `continue` after replying: the other
                # ready sockets and the periodic reporting below must still
                # be serviced in this iteration.
                dag_call_socket.send(resp.SerializeToString())
            else:
                dag = dags[name]
                for fname in dag[0].functions:
                    call_frequency[fname.name] += 1

                response = call_dag(call, pusher_cache, dags, policy)
                dag_call_socket.send(response.SerializeToString())

        if (dag_delete_socket in socks
                and socks[dag_delete_socket] == zmq.POLLIN):
            delete_dag(dag_delete_socket, dags, policy, call_frequency)

        if list_socket in socks and socks[list_socket] == zmq.POLLIN:
            msg = list_socket.recv_string()
            prefix = msg if msg else ''

            resp = StringSet()
            resp.keys.extend(sched_utils.get_func_list(kvs, prefix))

            list_socket.send(resp.SerializeToString())

        if exec_status_socket in socks and socks[exec_status_socket] == \
                zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(exec_status_socket.recv())

            policy.process_status(status)

        if sched_update_socket in socks and socks[sched_update_socket] == \
                zmq.POLLIN:
            status = SchedulerStatus()
            status.ParseFromString(sched_update_socket.recv())

            # Retrieve any DAGs that some other scheduler knows about that we
            # do not yet know about.
            for dname in status.dags:
                if dname not in dags:
                    # NOTE(review): this retries without any backoff and
                    # blocks the event loop until the KVS answers. Also, if
                    # kvs.get returns a dict, `None in payload` tests its
                    # keys rather than its values -- confirm against the KVS
                    # client.
                    payload = kvs.get(dname)
                    while None in payload:
                        payload = kvs.get(dname)

                    dag = Dag()
                    dag.ParseFromString(payload[dname].reveal())
                    dags[dag.name] = (dag, sched_utils.find_dag_source(dag))

                    # call_frequency is keyed by function name everywhere
                    # else (see the DAG call handler above), so we must use
                    # the name here too rather than the function message
                    # itself.
                    for fname in dag.functions:
                        if fname.name not in call_frequency:
                            call_frequency[fname.name] = 0

            policy.update_function_locations(status.function_locations)

        end = time.time()

        # NOTE(review): `start` is only reset at the end of the report block
        # below, so if METADATA_THRESHOLD is smaller than REPORT_THRESHOLD
        # this block runs on every iteration in between -- confirm that this
        # is intended.
        if end - start > METADATA_THRESHOLD:
            # Update the scheduler policy-related metadata.
            policy.update()

            # If the management IP is None, that means we are running in
            # local mode, so there is no need to deal with caches and other
            # schedulers.
            if mgmt_ip:
                schedulers = sched_utils.get_ip_set(
                    sched_utils.get_scheduler_list_address(mgmt_ip),
                    requestor_cache, False)

        if end - start > REPORT_THRESHOLD:
            # NOTE(review): `key` and `data` are built but never used. The
            # call to get_unique_executors is kept because it may have side
            # effects inside the policy -- confirm before removing.
            num_unique_executors = policy.get_unique_executors()
            key = scheduler_id + ':' + str(time.time())
            data = {'key': key, 'count': num_unique_executors}

            # Tell every other scheduler which DAGs we know about and where
            # each function is currently located.
            status = SchedulerStatus()
            for name in dags:
                status.dags.append(name)

            for fname in policy.function_locations:
                for loc in policy.function_locations[fname]:
                    floc = status.function_locations.add()
                    floc.name = fname
                    floc.ip = loc[0]
                    floc.tid = loc[1]

            msg = status.SerializeToString()

            for sched_ip in schedulers:
                if sched_ip != ip:
                    sckt = pusher_cache.get(
                        sched_utils.get_scheduler_update_address(sched_ip))
                    sckt.send(msg)

            # Report per-function call counts and per-DAG interarrival times,
            # resetting each counter once it has been copied into the report.
            stats = ExecutorStatistics()
            for fname in call_frequency:
                fstats = stats.functions.add()
                fstats.name = fname
                fstats.call_count = call_frequency[fname]
                logging.info('Reporting %d calls for function %s.' %
                             (call_frequency[fname], fname))

                call_frequency[fname] = 0

            for dname in interarrivals:
                dstats = stats.dags.add()
                dstats.name = dname
                # N interarrival gaps correspond to N + 1 calls.
                dstats.call_count = len(interarrivals[dname]) + 1
                dstats.interarrival.extend(interarrivals[dname])

                interarrivals[dname].clear()

            # We only attempt to send the statistics if we are running in
            # cluster mode. In local mode, the only output is the per-function
            # log lines emitted above.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

            start = time.time()
Exemple #5
0
def executor(ip, mgmt_ip, schedulers, thread_id):
    '''
    Runs a single executor thread's event loop. The loop only ends when the
    executor has been asked to depart and its queues are empty, at which
    point the whole process exits.

    ip: The IP address of this executor -- reported in its status messages
    and sent to the management server on departure.
    mgmt_ip: The IP address of the management server. If it is None, we are
    running in local mode.
    schedulers: The schedulers to which this executor pushes its status.
    thread_id: The ID of this executor thread -- added to each base port so
    that several executors can bind on the same machine.
    '''
    logging.basicConfig(filename='log_executor.txt',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s')

    context = zmq.Context(1)

    # Every inbound socket is a PULL socket bound at its base port offset by
    # our thread ID.
    pin_socket = context.socket(zmq.PULL)
    pin_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.PIN_PORT + thread_id))

    unpin_socket = context.socket(zmq.PULL)
    unpin_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                      (sutils.UNPIN_PORT + thread_id))

    exec_socket = context.socket(zmq.PULL)
    exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                     (sutils.FUNC_EXEC_PORT + thread_id))

    dag_queue_socket = context.socket(zmq.PULL)
    dag_queue_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                          (sutils.DAG_QUEUE_PORT + thread_id))

    dag_exec_socket = context.socket(zmq.PULL)
    dag_exec_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                         (sutils.DAG_EXEC_PORT + thread_id))

    self_depart_socket = context.socket(zmq.PULL)
    self_depart_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                            (sutils.SELF_DEPART_PORT + thread_id))

    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(pin_socket, zmq.POLLIN)
    poller.register(unpin_socket, zmq.POLLIN)
    poller.register(exec_socket, zmq.POLLIN)
    poller.register(dag_queue_socket, zmq.POLLIN)
    poller.register(dag_exec_socket, zmq.POLLIN)
    poller.register(self_depart_socket, zmq.POLLIN)

    # If the management IP is set to None, that means that we are running in
    # local mode, so we use a regular AnnaTcpClient rather than an IPC client.
    if mgmt_ip:
        client = AnnaIpcClient(thread_id, context)
    else:
        client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=1)

    user_library = CloudburstUserLibrary(context, pusher_cache, ip, thread_id,
                                         client)

    # Announce ourselves to the schedulers as a running thread.
    status = ThreadStatus()
    status.ip = ip
    status.tid = thread_id
    status.running = True
    utils.push_status(schedulers, pusher_cache, status)

    departing = False

    # Maintains a request queue for each function pinned on this executor. Each
    # function will have a set of request IDs mapped to it, and this map stores
    # a schedule for each request ID.
    queue = {}

    # Tracks the actual function objects that are pinned to this executor.
    function_cache = {}

    # Tracks runtime cost of executing a DAG function.
    runtimes = {}

    # If multiple triggers are necessary for a function, track the triggers as
    # we receive them. This is also used if a trigger arrives before its
    # corresponding schedule.
    received_triggers = {}

    # Tracks when we first heard about a (request ID, function) pair, so we
    # can report end-to-end latency for the whole execution. Entries are
    # removed as soon as the function has run.
    receive_times = {}

    # Tracks the number of requests we are finishing for each function pinned
    # here.
    exec_counts = {}

    # Tracks the end-to-end runtime of each DAG request for which we are the
    # sink function.
    dag_runtimes = {}

    # A map with KVS keys and their corresponding deserialized payloads.
    cache = {}

    # Internal metadata to track thread utilization.
    report_start = time.time()
    event_occupancy = {
        'pin': 0.0,
        'unpin': 0.0,
        'func_exec': 0.0,
        'dag_queue': 0.0,
        'dag_exec': 0.0
    }
    total_occupancy = 0.0

    while True:
        socks = dict(poller.poll(timeout=1000))

        if pin_socket in socks and socks[pin_socket] == zmq.POLLIN:
            work_start = time.time()
            pin(pin_socket, pusher_cache, client, status, function_cache,
                runtimes, exec_counts, user_library)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['pin'] += elapsed
            total_occupancy += elapsed

        if unpin_socket in socks and socks[unpin_socket] == zmq.POLLIN:
            work_start = time.time()
            unpin(unpin_socket, status, function_cache, runtimes, exec_counts)
            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['unpin'] += elapsed
            total_occupancy += elapsed

        if exec_socket in socks and socks[exec_socket] == zmq.POLLIN:
            work_start = time.time()
            exec_function(exec_socket, client, user_library, cache,
                          function_cache)
            user_library.close()

            utils.push_status(schedulers, pusher_cache, status)

            elapsed = time.time() - work_start
            event_occupancy['func_exec'] += elapsed
            total_occupancy += elapsed

        if dag_queue_socket in socks and socks[dag_queue_socket] == zmq.POLLIN:
            work_start = time.time()

            schedule = DagSchedule()
            schedule.ParseFromString(dag_queue_socket.recv())
            fname = schedule.target_function

            logging.info('Received a schedule for DAG %s (%s), function %s.' %
                         (schedule.dag.name, schedule.id, fname))

            if fname not in queue:
                queue[fname] = {}

            queue[fname][schedule.id] = schedule

            # Only the first arrival (schedule or trigger) for this request
            # sets the receive time.
            trkey = (schedule.id, fname)
            if trkey not in receive_times:
                receive_times[trkey] = time.time()

            # In case we receive the trigger before we receive the schedule, we
            # can trigger from this operation as well.
            if (trkey in received_triggers and
                    len(received_triggers[trkey]) == len(schedule.triggers)):

                exec_dag_function(pusher_cache, client,
                                  received_triggers[trkey],
                                  function_cache[fname], schedule,
                                  user_library, dag_runtimes, cache)
                user_library.close()

                del received_triggers[trkey]
                del queue[fname][schedule.id]

                # Pop rather than read the receive time: the request is
                # finished, and leaving the entry behind would make this map
                # grow for as long as the executor runs.
                fend = time.time()
                fstart = receive_times.pop(trkey)
                runtimes[fname].append(fend - fstart)
                exec_counts[fname] += 1

            elapsed = time.time() - work_start
            event_occupancy['dag_queue'] += elapsed
            total_occupancy += elapsed

        if dag_exec_socket in socks and socks[dag_exec_socket] == zmq.POLLIN:
            work_start = time.time()
            trigger = DagTrigger()
            trigger.ParseFromString(dag_exec_socket.recv())

            fname = trigger.target_function
            logging.info('Received a trigger for schedule %s, function %s.' %
                         (trigger.id, fname))

            key = (trigger.id, fname)
            if key not in received_triggers:
                received_triggers[key] = {}

            if key not in receive_times:
                receive_times[key] = time.time()

            # Triggers are stored per source; the function only runs once we
            # hold as many triggers as the schedule lists.
            received_triggers[key][trigger.source] = trigger
            if fname in queue and trigger.id in queue[fname]:
                schedule = queue[fname][trigger.id]
                if len(received_triggers[key]) == len(schedule.triggers):
                    exec_dag_function(pusher_cache, client,
                                      received_triggers[key],
                                      function_cache[fname], schedule,
                                      user_library, dag_runtimes, cache)
                    user_library.close()

                    del received_triggers[key]
                    del queue[fname][trigger.id]

                    # As above, drop the receive time once it has been used
                    # so that finished requests do not accumulate.
                    fend = time.time()
                    fstart = receive_times.pop(key)
                    runtimes[fname].append(fend - fstart)
                    exec_counts[fname] += 1

            elapsed = time.time() - work_start
            event_occupancy['dag_exec'] += elapsed
            total_occupancy += elapsed

        if self_depart_socket in socks and socks[self_depart_socket] == \
                zmq.POLLIN:
            # This message does not matter.
            self_depart_socket.recv()

            logging.info('Preparing to depart. No longer accepting requests ' +
                         'and clearing all queues.')

            status.ClearField('functions')
            status.running = False
            utils.push_status(schedulers, pusher_cache, status)

            departing = True

        # periodically report function occupancy
        report_end = time.time()
        if report_end - report_start > REPORT_THRESH:
            cache.clear()

            utilization = total_occupancy / (report_end - report_start)
            status.utilization = utilization

            # Periodically report my status to schedulers with the utilization
            # set.
            utils.push_status(schedulers, pusher_cache, status)

            logging.info('Total thread occupancy: %.6f' % (utilization))

            for event in event_occupancy:
                occ = event_occupancy[event] / (report_end - report_start)
                logging.info('\tEvent %s occupancy: %.6f' % (event, occ))
                event_occupancy[event] = 0.0

            # Functions with no executions in this period are left out of the
            # report, but their counters are reset all the same.
            stats = ExecutorStatistics()
            for fname in runtimes:
                if exec_counts[fname] > 0:
                    fstats = stats.functions.add()
                    fstats.name = fname
                    fstats.call_count = exec_counts[fname]
                    fstats.runtime.extend(runtimes[fname])

                runtimes[fname].clear()
                exec_counts[fname] = 0

            for dname in dag_runtimes:
                dstats = stats.dags.add()
                dstats.name = dname

                dstats.runtimes.extend(dag_runtimes[dname])

                dag_runtimes[dname].clear()

            # If we are running in cluster mode, mgmt_ip will be set, and we
            # will report our status and statistics to it. Otherwise, we will
            # write the statistics to the local log.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

                sckt = pusher_cache.get(utils.get_util_report_address(mgmt_ip))
                sckt.send(status.SerializeToString())
            else:
                logging.info(stats)

            status.ClearField('utilization')
            report_start = time.time()
            total_occupancy = 0.0

            # Periodically clear any old functions we have cached that we are
            # no longer accepting requests for. The queue entries themselves
            # are removed in a second pass, because we cannot delete from
            # `queue` while iterating over it.
            del_list = []
            for fname in queue:
                if len(queue[fname]) == 0 and fname not in status.functions:
                    del_list.append(fname)
                    del function_cache[fname]
                    del runtimes[fname]
                    del exec_counts[fname]

            for fname in del_list:
                del queue[fname]

            # If we are departing and have cleared our queues, let the
            # management server know, and exit the process.
            if departing and len(queue) == 0:
                sckt = pusher_cache.get(utils.get_depart_done_addr(mgmt_ip))
                sckt.send_string(ip)

                # We specifically pass 1 as the exit code when ending our
                # process so that the wrapper script does not restart us.
                os._exit(1)
Exemple #6
0
def scheduler(ip, mgmt_ip, route_addr, policy_type):
    '''
    Runs the scheduler's main event loop. This function binds one socket per
    request type and then polls them forever; it never returns.

    ip: The IP address of this scheduler -- handed to the KVS client and the
    scheduling policy, and used to avoid sending peer updates to ourselves.
    mgmt_ip: The IP address of the management server. If it is None, we are
    running in local mode.
    route_addr: The KVS routing address -- used to create the KVS client and
    returned as-is to any client that connects.
    policy_type: Forwarded unchanged to DefaultCloudburstSchedulerPolicy.
    '''

    # If the management IP is not set, we are running in local mode.
    local = (mgmt_ip is None)
    kvs = AnnaTcpClient(route_addr, ip, local=local)

    scheduler_id = str(uuid.uuid4())

    context = zmq.Context(1)
    context.set(zmq.MAX_SOCKETS, 10000)

    # A mapping from a DAG's name to a tuple of its protobuf representation
    # and the result of sched_utils.find_dag_source for that DAG.
    dags = {}

    # Tracks how often a request for each function is received.
    call_frequency = {}

    # Tracks the time interval between successive requests for a particular
    # DAG.
    interarrivals = {}

    # Tracks the most recent arrival for each DAG -- used to calculate
    # interarrival times.
    last_arrivals = {}

    # Maintains the set of all other schedulers in the system, so we can
    # propagate metadata to them.
    schedulers = set()

    # REP sockets: each request received on these must be answered with
    # exactly one reply.
    connect_socket = context.socket(zmq.REP)
    connect_socket.bind(sutils.BIND_ADDR_TEMPLATE % (CONNECT_PORT))

    func_create_socket = context.socket(zmq.REP)
    func_create_socket.bind(sutils.BIND_ADDR_TEMPLATE % (FUNC_CREATE_PORT))

    func_call_socket = context.socket(zmq.REP)
    func_call_socket.bind(sutils.BIND_ADDR_TEMPLATE % (FUNC_CALL_PORT))

    # This handles invocations that arrive from a queue, mainly for storage
    # events.
    func_call_queue_socket = context.socket(zmq.PULL)
    func_call_queue_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                                (FUNC_CALL_QUEUE_PORT))

    dag_create_socket = context.socket(zmq.REP)
    dag_create_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_CREATE_PORT))

    dag_call_socket = context.socket(zmq.REP)
    dag_call_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_CALL_PORT))

    dag_delete_socket = context.socket(zmq.REP)
    dag_delete_socket.bind(sutils.BIND_ADDR_TEMPLATE % (DAG_DELETE_PORT))

    list_socket = context.socket(zmq.REP)
    list_socket.bind(sutils.BIND_ADDR_TEMPLATE % (LIST_PORT))

    exec_status_socket = context.socket(zmq.PULL)
    exec_status_socket.bind(sutils.BIND_ADDR_TEMPLATE % (sutils.STATUS_PORT))

    sched_update_socket = context.socket(zmq.PULL)
    sched_update_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                             (sutils.SCHED_UPDATE_PORT))

    # This socket is not registered with the poller below; it is handed to
    # the policy engine instead.
    pin_accept_socket = context.socket(zmq.PULL)
    pin_accept_socket.setsockopt(zmq.RCVTIMEO, 10000)  # 10 seconds.
    pin_accept_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                           (sutils.PIN_ACCEPT_PORT))

    continuation_socket = context.socket(zmq.PULL)
    continuation_socket.bind(sutils.BIND_ADDR_TEMPLATE %
                             (sutils.CONTINUATION_PORT))

    # The management socket only exists in cluster mode; every use of it
    # below is guarded by the same `not local` check.
    if not local:
        management_request_socket = context.socket(zmq.REQ)
        management_request_socket.setsockopt(zmq.RCVTIMEO, 500)
        # By setting this flag, zmq matches replies with requests.
        management_request_socket.setsockopt(zmq.REQ_CORRELATE, 1)
        # Relax strict alternation between request and reply.
        # For detailed explanation, see here: http://api.zeromq.org/4-1:zmq-setsockopt
        management_request_socket.setsockopt(zmq.REQ_RELAXED, 1)
        management_request_socket.connect(
            sched_utils.get_scheduler_list_address(mgmt_ip))

    pusher_cache = SocketCache(context, zmq.PUSH)

    poller = zmq.Poller()
    poller.register(connect_socket, zmq.POLLIN)
    poller.register(func_create_socket, zmq.POLLIN)
    poller.register(func_call_socket, zmq.POLLIN)
    poller.register(func_call_queue_socket, zmq.POLLIN)
    poller.register(dag_create_socket, zmq.POLLIN)
    poller.register(dag_call_socket, zmq.POLLIN)
    poller.register(dag_delete_socket, zmq.POLLIN)
    poller.register(list_socket, zmq.POLLIN)
    poller.register(exec_status_socket, zmq.POLLIN)
    poller.register(sched_update_socket, zmq.POLLIN)
    poller.register(continuation_socket, zmq.POLLIN)

    # Start the policy engine.
    policy = DefaultCloudburstSchedulerPolicy(pin_accept_socket,
                                              pusher_cache,
                                              kvs,
                                              ip,
                                              policy_type,
                                              local=local)
    policy.update()

    start = time.time()

    while True:
        socks = dict(poller.poll(timeout=1000))

        if connect_socket in socks and socks[connect_socket] == zmq.POLLIN:
            # The request body is ignored, but it still has to be received
            # before we are allowed to reply on a REP socket.
            connect_socket.recv_string()
            connect_socket.send_string(route_addr)

        if (func_create_socket in socks
                and socks[func_create_socket] == zmq.POLLIN):
            create_function(func_create_socket, kvs)

        if func_call_socket in socks and socks[func_call_socket] == zmq.POLLIN:
            call_function(func_call_socket, pusher_cache, policy)

        if (func_call_queue_socket in socks
                and socks[func_call_queue_socket] == zmq.POLLIN):
            call_function_from_queue(func_call_queue_socket, pusher_cache,
                                     policy)

        if (dag_create_socket in socks
                and socks[dag_create_socket] == zmq.POLLIN):
            create_dag(dag_create_socket, pusher_cache, kvs, dags, policy,
                       call_frequency)

        if dag_call_socket in socks and socks[dag_call_socket] == zmq.POLLIN:
            # Timestamps in this handler are in microseconds.
            start_t = int(time.time() * 1000000)
            call = DagCall()
            call.ParseFromString(dag_call_socket.recv())

            name = call.name

            # Interarrival times are recorded before the DAG is validated, so
            # requests for unknown DAGs are counted here as well.
            t = time.time()
            if name in last_arrivals:
                interarrivals.setdefault(name, []).append(
                    t - last_arrivals[name])

            last_arrivals[name] = t

            if name not in dags:
                resp = GenericResponse()
                resp.success = False
                resp.error = NO_SUCH_DAG

                # We deliberately do not `continue` after replying: the other
                # ready sockets and the periodic reporting below must still
                # be serviced in this iteration.
                dag_call_socket.send(resp.SerializeToString())
            else:
                dag = dags[name]
                for fname in dag[0].functions:
                    call_frequency[fname.name] += 1

                response = call_dag(call, pusher_cache, dags, policy)
                sched_t = int(time.time() * 1000000)
                logging.info(
                    f'App function {name} recv: {start_t}, scheduled: {sched_t}')
                dag_call_socket.send(response.SerializeToString())

        if (dag_delete_socket in socks
                and socks[dag_delete_socket] == zmq.POLLIN):
            delete_dag(dag_delete_socket, dags, policy, call_frequency)

        if list_socket in socks and socks[list_socket] == zmq.POLLIN:
            msg = list_socket.recv_string()
            prefix = msg if msg else ''

            resp = StringSet()
            resp.keys.extend(sched_utils.get_func_list(kvs, prefix))

            list_socket.send(resp.SerializeToString())

        if exec_status_socket in socks and socks[exec_status_socket] == \
                zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(exec_status_socket.recv())

            policy.process_status(status)

        if sched_update_socket in socks and socks[sched_update_socket] == \
                zmq.POLLIN:
            status = SchedulerStatus()
            status.ParseFromString(sched_update_socket.recv())

            # Retrieve any DAGs that some other scheduler knows about that we
            # do not yet know about.
            for dname in status.dags:
                if dname not in dags:
                    # NOTE(review): this retries without any backoff and
                    # blocks the event loop until the KVS answers. Also, if
                    # kvs.get returns a dict, `None in payload` tests its
                    # keys rather than its values -- confirm against the KVS
                    # client.
                    payload = kvs.get(dname)
                    while None in payload:
                        payload = kvs.get(dname)

                    dag = Dag()
                    dag.ParseFromString(payload[dname].reveal())
                    dags[dag.name] = (dag, sched_utils.find_dag_source(dag))

                    for fname in dag.functions:
                        if fname.name not in call_frequency:
                            call_frequency[fname.name] = 0

            policy.update_function_locations(status.function_locations)

        if continuation_socket in socks and socks[continuation_socket] == \
                zmq.POLLIN:
            start_t = int(time.time() * 1000000)

            continuation = Continuation()
            continuation.ParseFromString(continuation_socket.recv())

            call = continuation.call
            call.name = continuation.name

            result = Value()
            result.ParseFromString(continuation.result)

            if call.name not in dags:
                # Mirror the DAG call handler's guard. There is nobody to
                # reply to on a PULL socket, so we log and drop the message
                # rather than let a KeyError take down the scheduler.
                logging.error('Dropping continuation for unknown DAG %s.' %
                              (call.name))
            else:
                # Feed the result carried by the continuation to every source
                # function of the DAG as an extra argument.
                dag, sources = dags[call.name]
                for source in sources:
                    call.function_args[source].values.extend([result])

                call_dag(call, pusher_cache, dags, policy, continuation.id)
                sched_t = int(time.time() * 1000000)
                print(
                    f'App function {call.name} recv: {start_t}, scheduled: {sched_t}'
                )
                for fname in dag.functions:
                    call_frequency[fname.name] += 1

        end = time.time()

        # NOTE(review): `start` is only reset at the end of the report block
        # below, so if METADATA_THRESHOLD is smaller than REPORT_THRESHOLD
        # this block runs on every iteration in between -- confirm that this
        # is intended.
        if end - start > METADATA_THRESHOLD:
            # Update the scheduler policy-related metadata.
            policy.update()

            # If the management IP is None, that means we are running in
            # local mode, so there is no need to deal with caches and other
            # schedulers.
            if not local:
                latest_schedulers = sched_utils.get_ip_set(
                    management_request_socket, False)
                # Keep the previous set if the lookup came back empty.
                if latest_schedulers:
                    schedulers = latest_schedulers

        if end - start > REPORT_THRESHOLD:
            # Tell every other scheduler which DAGs we know about and where
            # each function is currently located.
            status = SchedulerStatus()
            for name in dags:
                status.dags.append(name)

            for fname in policy.function_locations:
                for loc in policy.function_locations[fname]:
                    floc = status.function_locations.add()
                    floc.name = fname
                    floc.ip = loc[0]
                    floc.tid = loc[1]

            msg = status.SerializeToString()

            for sched_ip in schedulers:
                if sched_ip != ip:
                    sckt = pusher_cache.get(
                        sched_utils.get_scheduler_update_address(sched_ip))
                    sckt.send(msg)

            # Report per-function call counts and per-DAG interarrival times,
            # resetting each counter once it has been copied into the report.
            stats = ExecutorStatistics()
            for fname in call_frequency:
                fstats = stats.functions.add()
                fstats.name = fname
                fstats.call_count = call_frequency[fname]
                logging.debug('Reporting %d calls for function %s.' %
                              (call_frequency[fname], fname))

                call_frequency[fname] = 0

            for dname in interarrivals:
                dstats = stats.dags.add()
                dstats.name = dname
                # N interarrival gaps correspond to N + 1 calls.
                dstats.call_count = len(interarrivals[dname]) + 1
                dstats.interarrival.extend(interarrivals[dname])

                interarrivals[dname].clear()

            # We only attempt to send the statistics if we are running in
            # cluster mode. In local mode, the only output is the per-function
            # debug log lines emitted above.
            if mgmt_ip:
                sckt = pusher_cache.get(
                    sutils.get_statistics_report_address(mgmt_ip))
                sckt.send(stats.SerializeToString())

            start = time.time()
Exemple #7
0
def lambda_handler(event, context):
    """Benchmark routing lookups, writes and reads against Anna from Lambda.

    For each of ``num_txns`` transactions the handler performs
    ``num_lookups`` routing lookups, ``num_writes`` PUTs of a 4096-byte random
    payload and ``num_reads`` GETs. Keys are ``prefix`` followed by an integer
    drawn from ``1 .. N-1`` with probability proportional to
    ``rank ** -zipf``. Every operation is timed individually.

    The result is pushed as one string to ``tcp://<benchmark_ip>:6600`` in the
    form ``throughput;reads;writes;lookups``, where each of the last three
    fields is a comma-separated list of latencies in milliseconds.

    Args:
        event: Mapping with the keys ``num_txns``, ``num_reads``,
            ``num_writes``, ``num_lookups``, ``benchmark_ip``, ``elb``,
            ``zipf``, ``prefix`` and ``N``.
        context: Lambda context object; unused.

    Returns:
        The string ``"Success"``.
    """
    print('Lambda started')
    num_txns = int(event["num_txns"])
    num_reads = int(event["num_reads"])
    num_writes = int(event["num_writes"])
    num_lookups = int(event["num_lookups"])
    benchmark_server = event["benchmark_ip"]
    elb = event["elb"]
    zipf = float(event["zipf"])
    prefix = event["prefix"]
    N = int(event["N"])

    # Bounded Zipf distribution over the key suffixes 1 .. N-1.
    x = np.arange(1, N)
    weights = x**(-zipf)
    weights /= weights.sum()
    bounded_zipf = stats.rv_discrete(name='bounded_zipf', values=(x, weights))

    def next_key():
        # One draw per operation, so key popularity follows the distribution.
        return prefix + str(bounded_zipf.rvs(size=1)[0])

    read_times = []
    write_times = []
    lookup_times = []
    throughput_time = 0

    ip = requests.get('http://checkip.amazonaws.com').text.rstrip()
    sip = socket.gethostname()

    print('AWS IP Got {}'.format(ip))
    print('Socket IP Got {}'.format(sip))

    dumb_client = AnnaTcpClient(elb, ip)

    # Routing ports that write and read requests are spread across.
    routing_ports = [6450, 6451, 6452, 6453]

    for i in range(num_txns):
        print('*** Starting Transaction ' + str(i) + ' ! ***')

        # Perform routing lookups (always against the first routing port).
        for _ in range(num_lookups):
            key = next_key()
            port = 6450
            start = time.time()
            dumb_client._query_routing(key, port)
            end = time.time()

            lookup_times.append(end - start)
            throughput_time += (end - start)

        # Perform writes
        for _ in range(num_writes):
            key = next_key()

            port = random.choice(routing_ports)
            addresses = dumb_client._query_routing(key, port)
            send_sock = dumb_client.pusher_cache.get(addresses[0])

            data = os.urandom(4096)
            lww = LWW(time.time_ns(), data)
            req, tup = dumb_client._prepare_data_request([key])
            req.type = PUT

            rids = [req.request_id]
            tup = tup[0]
            # Serialize the lattice built above. The original passed an
            # undefined name here, which raised NameError on every write.
            tup.payload, tup.lattice_type = dumb_client._serialize(lww)

            # Only the request/response round trip is timed, not the routing
            # lookup or the payload construction.
            start = time.time()
            send_request(req, send_sock)
            recv_response(rids, dumb_client.response_puller, KeyResponse)
            end = time.time()

            write_times.append(end - start)
            throughput_time += (end - start)

        # Perform reads
        for _ in range(num_reads):
            key = next_key()

            port = random.choice(routing_ports)
            addresses = dumb_client._query_routing(key, port)
            send_sock = dumb_client.pusher_cache.get(addresses[0])

            req, _unused_tuples = dumb_client._prepare_data_request([key])
            req.type = GET
            rids = [req.request_id]

            start = time.time()
            send_request(req, send_sock)
            # Wait for all responses to return.
            recv_response(rids, dumb_client.response_puller, KeyResponse)
            end = time.time()

            read_times.append(end - start)
            throughput_time += (end - start)

    # Throughput counts reads and writes only, while the elapsed time also
    # includes the timed routing lookups. Guard against a run that performed
    # no timed work at all instead of raising ZeroDivisionError.
    total_ops = num_txns * (num_writes + num_reads)
    throughput = total_ops / throughput_time if throughput_time > 0 else 0.0

    # time.time() deltas are in seconds; report latencies in milliseconds.
    read_msg = ",".join(str(t * 1000) for t in read_times)
    write_msg = ",".join(str(t * 1000) for t in write_times)
    lookup_msg = ",".join(str(t * 1000) for t in lookup_times)

    ctx = zmq.Context(1)
    sckt = ctx.socket(zmq.PUSH)
    sckt.connect('tcp://%s:6600' % benchmark_server)

    message = ";".join([str(throughput), read_msg, write_msg, lookup_msg])

    sckt.send_string(message)
    return "Success"
Exemple #8
0
import codecs
import json
import pickle
import sys  # required for sys.argv below; was missing and caused a NameError

import numpy as np
import pandas as pd
import torch
import torchvision
from PIL import Image
from torch.nn import Parameter

import anna
from anna.client import AnnaTcpClient
from anna.lattices import LWWPairLattice
# import droplet
# from anna.shared.serializer import Serializer

# Anna KVS client pointed at a deployment on this machine (local mode).
# NOTE(review): offset=2 presumably keeps this client's ports from clashing
# with other local clients -- confirm against AnnaTcpClient.
client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=2)

# Model names accepted as the first command-line argument.
model_options = ['ResNet50', 'ResNet18', 'ResNet152']

# Example put/get round trip against the KVS, kept for reference:
# value_string = "World"
# value = LWWPairLattice(1, value_string.encode())
# client.put("Hello", value)
# ret = (((client.get("Hello"))["Hello"]).reveal()).decode()
# print(ret)

# The requested model name is the first command-line argument.
model_str = str(sys.argv[1])
# Path of the weights file; stays empty unless the name matches an option.
load_str = ''
if (model_str == model_options[0]):
    load_str = '/hydro_test/model_weights/resnet50-19c8e357.pth'
elif (model_str == model_options[1]):
    load_str = '/hydro_test/model_weights/resnet18-5c106cde.pth'
elif (model_str == model_options[2]):
Exemple #9
0
import codecs
import json
import pickle
import sys  # required for sys.argv below; was missing and caused a NameError

import numpy as np
import pandas as pd
import torch
import torchvision
from PIL import Image
from torch.nn import Parameter

import anna
from anna.client import AnnaTcpClient
from anna.lattices import LWWPairLattice
# import droplet
# from anna.shared.serializer import Serializer

# Anna KVS client pointed at a deployment on this machine (local mode).
# NOTE(review): offset=2 presumably keeps this client's ports from clashing
# with other local clients -- confirm against AnnaTcpClient.
client = AnnaTcpClient('127.0.0.1', '127.0.0.1', local=True, offset=2)

# Model names accepted as the first command-line argument.
model_options = ['ResNet50', 'ResNet18', 'ResNet152']

# Example put/get round trip against the KVS, kept for reference:
# value_string = "World"
# value = LWWPairLattice(1, value_string.encode())
# client.put("Hello", value)
# ret = (((client.get("Hello"))["Hello"]).reveal()).decode()
# print(ret)

# The requested model name is the first command-line argument.
model_str = str(sys.argv[1])
# Path of the weights file; stays empty unless the name matches an option.
load_str = ''
if(model_str == model_options[0]):
	load_str = '/hydro_test/model_weights/resnet50-19c8e357.pth'
elif(model_str == model_options[1]):
	load_str = '/hydro_test/model_weights/resnet18-5c106cde.pth'
elif(model_str == model_options[2]):