Beispiel #1
0
    def pin_function(self, dag_name, function_ref):
        """Try to pin ``function_ref`` onto one of the unpinned executors.

        A ``PinFunction`` request is sent to randomly chosen executors until
        one of them accepts. On success, ``(function name, (node, tid))`` is
        appended to ``self.pending_dags[dag_name]``.

        Args:
            dag_name: Key into ``self.pending_dags`` under which the
                successful placement is recorded.
            function_ref: Object whose ``name`` attribute identifies the
                function to pin.

        Returns:
            True if an executor accepted the pin; False if there were no
            unpinned executors to begin with, or if every candidate rejected
            the request.
        """
        # Nothing to choose from: report that we ran out of resources.
        if not self.unpinned_executors:
            return False

        if dag_name not in self.pending_dags:
            self.pending_dags[dag_name] = []

        # Work on a copy so that choosing candidates does not, by itself,
        # modify the system's metadata.
        candidates = set(self.unpinned_executors)

        # Build the PinFunction message once; it is identical for every
        # executor we try.
        pin_msg = PinFunction()
        pin_msg.name = function_ref.name
        pin_msg.response_address = self.ip
        serialized = pin_msg.SerializeToString()

        # Loop only while there is something left to try. Sampling from an
        # empty collection raises ValueError, so exhaustion must be handled
        # explicitly rather than by looping forever.
        while candidates:
            # Pick a random executor. The set is converted to a list because
            # random.sample() no longer accepts sets on Python >= 3.11.
            node, tid = sys_random.sample(list(candidates), 1)[0]

            sckt = self.pusher_cache.get(get_pin_address(node, tid))
            sckt.send(serialized)

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                # The candidate stays in the pool, so it may be picked again
                # on the next iteration.
                logging.error('Pin operation to %s:%d timed out. Retrying.',
                              node, tid)
                continue

            # Do not use this executor again either way: if it rejected, it
            # has something else pinned, and if it accepted, it has pinned
            # what we just asked for. In local mode executors may have
            # multiple functions pinned, so they stay available.
            if not self.local:
                self.unpinned_executors.discard((node, tid))
                candidates.discard((node, tid))

            if response.success:
                # Record the placement for the caller.
                self.pending_dags[dag_name].append(
                    (function_ref.name, (node, tid)))
                return True

            # The pin operation was rejected; try another executor.
            logging.error('Node %s:%d rejected pin for %s. Retrying.',
                          node, tid, function_ref.name)

        # Every candidate rejected the request.
        return False
Beispiel #2
0
    def pin_function(self, dag_name, function_ref, colocated):
        """Try to pin ``function_ref`` onto an unpinned executor.

        GPU functions are placed on ``self.unpinned_gpu_executors``; all
        other functions on ``self.unpinned_cpu_executors``. For CPU functions
        with a non-empty ``colocated`` list, placement is first restricted to
        nodes that already host a colocated function of this DAG or, if none
        is pinned yet, to nodes whose executor threads are all unpinned. If
        that restricted attempt runs out of candidates, pinning is retried
        once without the colocation constraint.

        On success, ``(function name, (node, tid))`` is appended to
        ``self.pending_dags[dag_name]``.

        Args:
            dag_name: Key into ``self.pending_dags`` for this DAG.
            function_ref: Object providing ``name``, ``gpu`` and
                ``batching`` attributes for the function to pin.
            colocated: Collection of function names that should share a node
                with this function; empty for no constraint.

        Returns:
            True if an executor accepted the pin; False if no suitable
            executor exists or every candidate rejected the request.
        """
        # If there are no executors of the required kind left, report that
        # we ran out of resources.
        if function_ref.gpu:
            if not self.unpinned_gpu_executors:
                return False
        elif not self.unpinned_cpu_executors:
            return False

        if dag_name not in self.pending_dags:
            self.pending_dags[dag_name] = []

        # Build the candidate set as a copy so that selecting candidates
        # does not, by itself, modify the system's metadata.
        if function_ref.gpu:
            candidates = set(self.unpinned_gpu_executors)
        elif not colocated:
            # Unconstrained CPU function: every unpinned executor qualifies.
            candidates = set(self.unpinned_cpu_executors)
        else:
            # Nodes (first element of each recorded thread tuple) on which a
            # colocated function of this DAG is already pinned.
            colocated_nodes = {
                thread[0]
                for fn, thread in self.pending_dags[dag_name]
                if fn in colocated
            }

            if colocated_nodes:
                candidates = {
                    (node, tid)
                    for node, tid in self.unpinned_cpu_executors
                    if node in colocated_nodes
                }
            else:
                # First colocated function to be pinned: prefer a node on
                # which every executor thread is still unpinned.
                free_threads = {}
                for node, _ in self.unpinned_cpu_executors:
                    free_threads[node] = free_threads.get(node, 0) + 1

                # NOTE(review): assumes thread IDs on a node are
                # 0..NUM_EXECUTOR_THREADS-1 -- confirm against the executor
                # registration code.
                candidates = {
                    (node, i)
                    for node, count in free_threads.items()
                    if count == NUM_EXECUTOR_THREADS
                    for i in range(NUM_EXECUTOR_THREADS)
                }

        # Construct the PinFunction message once; it is identical for every
        # executor we try.
        pin_msg = PinFunction()
        pin_msg.name = function_ref.name
        pin_msg.batching = function_ref.batching
        pin_msg.response_address = self.ip
        serialized = pin_msg.SerializeToString()

        # Loop only while there is something left to try; exhaustion is
        # handled after the loop.
        while candidates:
            # Pick a random executor. The set is converted to a list because
            # random.sample() no longer accepts sets on Python >= 3.11.
            node, tid = sys_random.sample(list(candidates), 1)[0]

            sckt = self.pusher_cache.get(get_pin_address(node, tid))
            sckt.send(serialized)

            response = GenericResponse()
            try:
                response.ParseFromString(self.pin_accept_socket.recv())
            except zmq.ZMQError:
                # The candidate stays in the pool, so it may be picked again
                # on the next iteration.
                logging.error('Pin operation to %s:%d timed out. Retrying.',
                              node, tid)
                continue

            # Do not use this executor again either way: if it rejected, it
            # has something else pinned, and if it accepted, it has pinned
            # what we just asked for. In local mode, however, executors may
            # have multiple functions pinned, so they stay available.
            if not self.local:
                if function_ref.gpu:
                    pool = self.unpinned_gpu_executors
                else:
                    pool = self.unpinned_cpu_executors
                pool.discard((node, tid))
                candidates.discard((node, tid))

            if response.success:
                # Record the placement for the caller.
                self.pending_dags[dag_name].append(
                    (function_ref.name, (node, tid)))
                return True

            # The pin operation was rejected; try another executor.
            logging.error('Node %s:%d rejected pin for %s. Retrying.',
                          node, tid, function_ref.name)

        # No candidates (left). If a colocation constraint was in effect,
        # try again without it; the retry passes an empty list, so it cannot
        # recurse a second time.
        if colocated:
            return self.pin_function(dag_name, function_ref, [])

        return False