def pin_function(self, dag_name, function_ref):
    """Try to pin ``function_ref`` onto one currently unpinned executor.

    A ``PinFunction`` request is sent to randomly chosen executors from
    ``self.unpinned_executors`` until one of them accepts. On success the
    assignment ``(function_ref.name, (node, tid))`` is appended to
    ``self.pending_dags[dag_name]``.

    Args:
        dag_name: Key into ``self.pending_dags`` under which the assignment
            is recorded (the entry is created if missing).
        function_ref: Object exposing at least a ``name`` attribute.

    Returns:
        ``True`` if an executor accepted the pin; ``False`` if there were no
        unpinned executors to begin with or every candidate rejected the
        request.
    """
    # Nothing to choose from: report that we ran out of resources.
    if not self.unpinned_executors:
        return False

    if dag_name not in self.pending_dags:
        self.pending_dags[dag_name] = []

    # Work on a copy of the executor set so that pruning the candidates does
    # not, by itself, modify the system's metadata.
    candidates = set(self.unpinned_executors)

    # Construct the PinFunction message once; the same bytes are sent to
    # every executor we try.
    pin_msg = PinFunction()
    pin_msg.name = function_ref.name
    pin_msg.response_address = self.ip
    serialized = pin_msg.SerializeToString()

    # Loop only while there is something left to try. Sampling from an empty
    # population would raise instead of reporting failure to the caller.
    while candidates:
        # Pick a random executor and attempt to pin the function there.
        # The set is materialised as a list because the stdlib
        # ``random.sample`` API only accepts sequences (sets are rejected
        # from Python 3.11 on) -- assumes sys_random follows that API.
        node, tid = sys_random.sample(list(candidates), 1)[0]

        sckt = self.pusher_cache.get(get_pin_address(node, tid))
        sckt.send(serialized)

        response = GenericResponse()
        try:
            response.ParseFromString(self.pin_accept_socket.recv())
        except zmq.ZMQError:
            # NOTE(review): the executor stays in ``candidates`` here, so an
            # unresponsive executor can be picked again on a later iteration.
            logging.error('Pin operation to %s:%d timed out. Retrying.',
                          node, tid)
            continue

        # Do not use this executor again either way: if it rejected, it has
        # something else pinned, and if it accepted, it has pinned what we
        # just asked for. In local mode executors may have multiple functions
        # pinned, so nothing is discarded there (and a rejecting executor
        # can be picked again).
        if not self.local:
            self.unpinned_executors.discard((node, tid))
            candidates.discard((node, tid))

        if response.success:
            # Record which executor thread now hosts this function.
            self.pending_dags[dag_name].append(
                (function_ref.name, (node, tid)))
            return True

        # The pin operation was rejected; try another candidate.
        logging.error('Node %s:%d rejected pin for %s. Retrying.',
                      node, tid, function_ref.name)

    # Every candidate rejected the request.
    return False
def pin_function(self, dag_name, function_ref, colocated):
    """Try to pin ``function_ref`` onto one unpinned CPU or GPU executor.

    Candidate executors are chosen as follows:

    * GPU functions (``function_ref.gpu``) use ``self.unpinned_gpu_executors``.
    * CPU functions with an empty ``colocated`` use every entry of
      ``self.unpinned_cpu_executors``.
    * CPU functions with colocation constraints prefer unpinned executors on
      nodes that already host a pending function of this DAG named in
      ``colocated``. If no such function is pending yet, they prefer nodes on
      which all ``NUM_EXECUTOR_THREADS`` threads are still unpinned.

    If colocation yields no candidates, or all colocation candidates reject
    the request, the pin is retried once without colocation constraints.

    Args:
        dag_name: Key into ``self.pending_dags`` under which the assignment
            is recorded (the entry is created if missing).
        function_ref: Object exposing ``name``, ``gpu`` and ``batching``.
        colocated: Collection of function names this function should share a
            node with; may be empty.

    Returns:
        ``True`` if an executor accepted the pin (the assignment is appended
        to ``self.pending_dags[dag_name]``); ``False`` if no suitable
        executor is available or all of them rejected the request.
    """
    # Nothing to choose from: report that we ran out of resources.
    if function_ref.gpu and not self.unpinned_gpu_executors:
        return False
    elif not function_ref.gpu and not self.unpinned_cpu_executors:
        return False

    if dag_name not in self.pending_dags:
        self.pending_dags[dag_name] = []

    # Candidate sets are always fresh copies, so pruning them does not, by
    # itself, modify the system's metadata.
    if function_ref.gpu:
        candidates = set(self.unpinned_gpu_executors)
    elif len(colocated) == 0:
        # Not a GPU function and no constraints: consider every unpinned
        # CPU executor.
        candidates = set(self.unpinned_cpu_executors)
    else:
        # Nodes already hosting a pending function we should colocate with.
        # Each pending entry is (function name, (node, tid)); index 0 of the
        # thread pair is the node's IP.
        colocated_nodes = {
            thread[0]
            for fn, thread in self.pending_dags[dag_name]
            if fn in colocated
        }

        if colocated_nodes:
            candidates = {
                (node, tid)
                for node, tid in self.unpinned_cpu_executors
                if node in colocated_nodes
            }
        else:
            # This is the first colocated function to be pinned: try to
            # assign it to an empty node, i.e. one whose threads are all
            # still unpinned.
            free_threads = {}
            for node, _ in self.unpinned_cpu_executors:
                free_threads[node] = free_threads.get(node, 0) + 1

            candidates = {
                (node, i)
                for node, count in free_threads.items()
                if count == NUM_EXECUTOR_THREADS
                for i in range(NUM_EXECUTOR_THREADS)
            }

        if not candidates:
            # There are no valid executors to colocate on; fall back to an
            # unconstrained pin. The recursive call takes the branch above
            # (empty ``colocated``), so it cannot recurse again from here.
            return self.pin_function(dag_name, function_ref, [])

    # Construct the PinFunction message once; the same bytes are sent to
    # every executor we try.
    pin_msg = PinFunction()
    pin_msg.name = function_ref.name
    pin_msg.batching = function_ref.batching
    pin_msg.response_address = self.ip
    serialized = pin_msg.SerializeToString()

    # Loop only while there is something left to try. Sampling from an empty
    # population would raise instead of reaching the fallback below.
    while candidates:
        # Pick a random executor and attempt to pin the function there.
        # The set is materialised as a list because the stdlib
        # ``random.sample`` API only accepts sequences (sets are rejected
        # from Python 3.11 on) -- assumes sys_random follows that API.
        node, tid = sys_random.sample(list(candidates), 1)[0]

        sckt = self.pusher_cache.get(get_pin_address(node, tid))
        sckt.send(serialized)

        response = GenericResponse()
        try:
            response.ParseFromString(self.pin_accept_socket.recv())
        except zmq.ZMQError:
            # NOTE(review): the executor stays in ``candidates`` here, so an
            # unresponsive executor can be picked again on a later iteration.
            logging.error('Pin operation to %s:%d timed out. Retrying.',
                          node, tid)
            continue

        # Do not use this executor again either way: if it rejected, it has
        # something else pinned, and if it accepted, it has pinned what we
        # just asked for. In local mode, however, executors may have multiple
        # functions pinned, so nothing is discarded there.
        if not self.local:
            if function_ref.gpu:
                self.unpinned_gpu_executors.discard((node, tid))
            else:
                self.unpinned_cpu_executors.discard((node, tid))
            candidates.discard((node, tid))

        if response.success:
            # Record which executor thread now hosts this function.
            self.pending_dags[dag_name].append(
                (function_ref.name, (node, tid)))
            return True

        # The pin operation was rejected; try another candidate.
        logging.error('Node %s:%d rejected pin for %s. Retrying.',
                      node, tid, function_ref.name)

    # Every candidate rejected the request.
    if len(colocated) > 0:
        # Try again without colocation. The retry passes an empty
        # ``colocated``, so this fallback happens at most once.
        return self.pin_function(dag_name, function_ref, [])
    return False