def run_on_box( self, options, first_worker ): """ :param cgcloud.core.box.Box first_worker: """ log.info( '=== Binding to leader ===' ) leader = self.cluster.leader_role( self.cluster.ctx ) leader.bind( cluster_name=options.cluster_name, ordinal=options.ordinal, wait_ready=False ) log.info( '=== Creating workers ===' ) workers = first_worker.list( leader_instance_id=leader.instance_id ) used_cluster_ordinals = set( w.cluster_ordinal for w in workers ) assert len( used_cluster_ordinals ) == len( workers ) # check for collisions assert 0 not in used_cluster_ordinals # master has 0 used_cluster_ordinals.add( 0 ) # to make the math easier cluster_ordinal = allocate_cluster_ordinals( num=options.num_workers, used=used_cluster_ordinals ) first_worker.unbind( ) # list() bound it spec = first_worker.prepare( leader_instance_id=leader.instance_id, cluster_name=leader.cluster_name, **self.preparation_kwargs( options, first_worker ) ) with thread_pool( min( options.num_threads, options.num_workers ) ) as pool: workers = first_worker.create( spec, cluster_ordinal=cluster_ordinal, executor=pool.apply_async, **self.creation_kwargs( options, first_worker ) ) if options.list: self.list( workers )
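# The ordinal arithmetic above leans on allocate_cluster_ordinals( ) to hand out
# worker ordinals that don't collide with ones already in use. A minimal sketch of
# what such a helper could look like, assuming it fills the lowest free ordinals
# first and counts upward from there; the real helper in cgcloud may differ.
from itertools import count, islice

def allocate_cluster_ordinals( num, used ):
    """
    Yield `num` ordinals, skipping any that appear in `used`.

    >>> list( allocate_cluster_ordinals( num=3, used={ 0, 2 } ) )
    [1, 3, 4]
    """
    return islice( ( i for i in count( ) if i not in used ), num )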
def clone( self, worker_role, num_workers, worker_instance_type, pool_size,
           wait_ready=True ):
    """
    Create a number of worker boxes that are connected to this leader.
    """
    first_worker = worker_role( self.ctx )
    args = self.preparation_args
    kwargs = dict( self.preparation_kwargs,
                   instance_type=worker_instance_type,
                   leader_instance_id=self.instance_id,
                   num_instances=num_workers )
    spec = first_worker.prepare( *args, **kwargs )
    with thread_pool( pool_size ) as pool:
        first_worker.create( spec,
                             wait_ready=wait_ready,
                             cluster_ordinal=self.cluster_ordinal + 1,
                             executor=pool.apply_async )
def clone( self, worker_role, num_workers, worker_instance_type, pool_size,
           wait_ready=True ):
    """
    Create a number of worker boxes that are connected to this leader.
    """
    first_worker = worker_role( self.ctx )
    args = self.preparation_args
    kwargs = dict( self.preparation_kwargs,
                   instance_type=worker_instance_type,
                   leader_instance_id=self.instance_id,
                   num_instances=num_workers )
    spec = first_worker.prepare( *args, **kwargs )
    with thread_pool( pool_size ) as pool:
        return first_worker.create( spec,
                                    wait_ready=wait_ready,
                                    cluster_ordinal=self.cluster_ordinal + 1,
                                    executor=pool.apply_async )
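# All of these methods submit per-instance work through thread_pool( ), a context
# manager yielding a pool whose apply_async( ) schedules tasks on worker threads.
# A minimal sketch, assuming it wraps multiprocessing.pool.ThreadPool and joins
# the pool on a clean exit; the actual utility may differ.
from contextlib import contextmanager
from multiprocessing.pool import ThreadPool

@contextmanager
def thread_pool( size ):
    pool = ThreadPool( processes=size )
    try:
        yield pool
    except:
        pool.terminate( )  # abandon outstanding tasks on failure
        raise
    else:
        pool.close( )  # no new tasks may be submitted
        pool.join( )  # block until all submitted tasks have finished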
def run_on_box( self, options, leader ):
    """
    :type leader: cgcloud.core.box.Box
    """
    log.info( '=== Creating leader ===' )
    preparation_kwargs = self.preparation_kwargs( options, leader )
    if options.leader_on_demand:
        preparation_kwargs = { k: v
                               for k, v in preparation_kwargs.iteritems( )
                               if not k.startswith( 'spot_' ) }
    spec = leader.prepare( **preparation_kwargs )
    creation_kwargs = dict( self.creation_kwargs( options, leader ),
                            num_instances=1,
                            # We must always wait for the leader since workers depend on it.
                            wait_ready=True )
    leader.create( spec, **creation_kwargs )
    try:
        self.run_on_creation( leader, options )
    except:
        if options.terminate is not False:
            with panic( log ):
                leader.terminate( wait=False )
        raise
    # The leader is now fully set up. Even if the code below fails to add workers,
    # the GrowClusterCommand can be used to recover from that failure.
    if options.num_workers:
        log.info( '=== Creating workers ===' )
        first_worker = self.cluster.worker_role( leader.ctx )
        preparation_kwargs = dict( self.preparation_kwargs( options, first_worker ),
                                   leader_instance_id=leader.instance_id,
                                   instance_type=options.worker_instance_type )
        spec = first_worker.prepare( **preparation_kwargs )
        with thread_pool( min( options.num_threads, options.num_workers ) ) as pool:
            workers = first_worker.create( spec,
                                           cluster_ordinal=leader.cluster_ordinal + 1,
                                           executor=pool.apply_async,
                                           **self.creation_kwargs( options, first_worker ) )
    else:
        workers = [ ]
    if options.list:
        self.list( [ leader ] )
        self.list( workers, print_headers=False )
    self.log_ssh_hint( options )
def run_on_box( self, options, first_worker ):
    log.info( '=== Binding to leader ===' )
    leader = self.cluster.leader_role( self.cluster.ctx )
    leader.bind( cluster_name=options.cluster_name,
                 ordinal=options.ordinal,
                 wait_ready=False )
    log.info( '=== Creating workers ===' )
    workers = first_worker.list( leader_instance_id=leader.instance_id )
    used_cluster_ordinals = set( w.cluster_ordinal for w in workers )
    assert len( used_cluster_ordinals ) == len( workers )  # check for collisions
    assert 0 not in used_cluster_ordinals  # master has 0
    used_cluster_ordinals.add( 0 )  # to make the math easier
    cluster_ordinal = self.allocate_cluster_ordinals( num=options.num_workers,
                                                      used=used_cluster_ordinals )
    spec = first_worker.prepare( leader_instance_id=leader.instance_id,
                                 **self.instance_options( options, first_worker ) )
    with thread_pool( min( options.num_threads, options.num_workers ) ) as pool:
        first_worker.create( spec,
                             wait_ready=True,
                             cluster_ordinal=cluster_ordinal,
                             executor=pool.apply_async )
def _addNodes(self, instances, numNodes, preemptable=False):
    deadline = time.time() + provisioning_timeout
    spec = dict(key_name=self._keyName,
                user_data=self._userData(),
                instance_type=self.instanceType[preemptable].name,
                instance_profile_arn=self._instanceProfileArn,
                security_group_ids=self._securityGroupIds,
                ebs_optimized=self.ebsOptimized,
                dry_run=False)
    # Offset the ordinals of the preemptable nodes to be disjoint from the non-preemptable
    # ones. Without this, the two scaler threads would inevitably allocate colliding ordinals.
    offset = 1000 if preemptable else 0
    used_ordinals = {int(i.tags['cluster_ordinal']) - offset for i in instances}
    # Since the leader is absent from the instances iterable, we need to explicitly reserve
    # its ordinal unless we're allocating offset ordinals reserved for preemptable instances:
    assert len(used_ordinals) == len(instances)  # check for collisions
    if not preemptable:
        used_ordinals.add(0)
    ordinals = (ordinal + offset for ordinal in
                allocate_cluster_ordinals(num=numNodes, used=used_ordinals))

    def createInstances():
        """
        :rtype: Iterable[list[Instance]]
        """
        if preemptable:
            for batch in create_spot_instances(self._ec2, self.spotBid, self.imageId, spec,
                                               # Don't insist on spot requests and don't
                                               # raise if no requests were fulfilled:
                                               tentative=True,
                                               num_instances=numNodes,
                                               timeout=deadline - time.time()):
                yield batch
        else:
            yield create_ondemand_instances(self._ec2, self.imageId, spec,
                                            num_instances=numNodes)

    instancesByAddress = {}

    def handleInstance(instance):
        log.debug('Tagging instance %s.', instance.id)
        leader_tags = self._instance.tags
        name = leader_tags['Name'].replace('toil-leader', 'toil-worker')
        tag_object_persistently(instance, dict(leader_tags,
                                               Name=name,
                                               cluster_ordinal=next(ordinals)))
        assert instance.private_ip_address
        instancesByAddress[instance.private_ip_address] = instance

    # Each instance gets a different ordinal, so we can't tag an entire batch at once but
    # have to tag each instance individually. It needs to be done quickly because the tags
    # are crucial for the boot code running inside the instance to join the cluster. Hence
    # we do it in a thread pool. If the pool is too large, we'll hit the EC2 limit on the
    # number of concurrent requests. If it is too small, we won't be able to tag all
    # instances in time.
    with thread_pool(min(numNodes, 32)) as pool:
        for batch in createInstances():
            log.debug('Got a batch of %i instance(s).', len(batch))
            for instance in batch:
                log.debug('Submitting instance %s to thread pool for tagging.', instance.id)
                pool.apply_async(handleInstance, (instance,))
    numInstancesAdded = len(instancesByAddress)
    log.info('Created and tagged %i instance(s).', numInstancesAdded)

    if preemptable:
        # Reset the deadline such that slow spot creation does not take away from the time
        # allotted for instance boot-up.
        deadline = time.time() + provisioning_timeout
    if isinstance(self.batchSystem, AbstractScalableBatchSystem):
        while instancesByAddress and time.time() < deadline:
            with throttle(10):
                log.debug('Waiting for batch system to report back %i node(s).',
                          len(instancesByAddress))
                # Get all nodes to be safe, not just the ones whose preemptability matches,
                # in case there's a problem with a node determining its own preemptability.
                nodes = self.batchSystem.getNodes()
                for nodeAddress in nodes.iterkeys():
                    instancesByAddress.pop(nodeAddress, None)
        if instancesByAddress:
            log.warn('%i instance(s) out of %i did not join the cluster as worker nodes. '
                     'They will be terminated.', len(instancesByAddress), numInstancesAdded)
            instanceIds = [i.id for i in instancesByAddress.itervalues()]
            self._logAndTerminate(instanceIds)
            numInstancesAdded -= len(instanceIds)
        else:
            log.info('All %i node(s) joined the cluster.', numInstancesAdded)
    else:
        log.warn('Batch system is not scalable. Assuming all instances joined the cluster.')
    return numInstancesAdded
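# Tagging a freshly created instance can fail transiently because EC2 is eventually
# consistent: the instance may not yet be visible to the tagging API. A hedged sketch
# of what tag_object_persistently might do, assuming simple bounded retries around
# boto's add_tags(); the retries and delay parameters are illustrative, and the real
# helper may differ.
import time

def tag_object_persistently(tagged_ec2_object, tags, retries=5, delay=5):
    for attempt in range(retries):
        try:
            # boto's TaggedEC2Object.add_tags() accepts a dict of tag names to values
            tagged_ec2_object.add_tags(tags)
            return
        except Exception:
            if attempt + 1 == retries:
                raise  # out of retries, propagate the last failure
            time.sleep(delay)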