Code example #1
File: cluster_commands.py  Project: dleehr/cgcloud
 def run_on_box( self, options, first_worker ):
     """
     :param cgcloud.core.box.Box first_worker:
     """
     log.info( '=== Binding to leader ===' )
     leader = self.cluster.leader_role( self.cluster.ctx )
     leader.bind( cluster_name=options.cluster_name,
                  ordinal=options.ordinal,
                  wait_ready=False )
     log.info( '=== Creating workers  ===' )
     workers = first_worker.list( leader_instance_id=leader.instance_id )
     used_cluster_ordinals = set( w.cluster_ordinal for w in workers )
     assert len( used_cluster_ordinals ) == len( workers )  # check for collisions
     assert 0 not in used_cluster_ordinals  # master has 0
     used_cluster_ordinals.add( 0 )  # to make the math easier
     cluster_ordinal = allocate_cluster_ordinals( num=options.num_workers,
                                                  used=used_cluster_ordinals )
     first_worker.unbind( )  # list() bound it
     spec = first_worker.prepare( leader_instance_id=leader.instance_id,
                                  cluster_name=leader.cluster_name,
                                  **self.preparation_kwargs( options, first_worker ) )
     with thread_pool( min( options.num_threads, options.num_workers ) ) as pool:
         workers = first_worker.create( spec,
                                        cluster_ordinal=cluster_ordinal,
                                        executor=pool.apply_async,
                                        **self.creation_kwargs( options, first_worker ) )
     if options.list:
         self.list( workers )
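Example #1 fills ordinal gaps with a module-level allocate_cluster_ordinals( num, used ) helper (the later examples call an equivalent method on the command class). Its implementation is not shown on this page; a minimal sketch, assuming it simply yields num non-negative integers that are absent from used, reusing gaps before counting past the maximum:

    from itertools import count, islice

    def allocate_cluster_ordinals( num, used ):
        """
        Yield `num` ordinals that do not collide with those in `used`,
        filling gaps first and then counting upwards.
        (Hypothetical sketch, not the actual cgcloud implementation.)
        """
        candidates = ( i for i in count( ) if i not in used )
        return islice( candidates, num )

Under those assumptions, allocate_cluster_ordinals( num=3, used={0, 2} ) yields 1, 3 and 4; adding the leader's ordinal 0 to the used set beforehand is what keeps workers from ever being assigned ordinal 0.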
Code example #2
 def clone( self, worker_role, num_workers, worker_instance_type, pool_size, wait_ready=True):
     """
     Create a number of worker boxes that are connected to this leader.
     """
     first_worker = worker_role( self.ctx )
     args = self.preparation_args
     kwargs = dict( self.preparation_kwargs,
                    instance_type=worker_instance_type,
                    leader_instance_id=self.instance_id,
                    num_instances=num_workers )
     spec = first_worker.prepare( *args, **kwargs )
     with thread_pool( pool_size ) as pool:
         first_worker.create( spec,
                              wait_ready=wait_ready,
                              cluster_ordinal=self.cluster_ordinal + 1,
                              executor=pool.apply_async )
Code example #3
File: cluster.py  Project: arkal/cgcloud
 def clone( self, worker_role, num_workers, worker_instance_type, pool_size, wait_ready=True ):
     """
     Create a number of worker boxes that are connected to this leader.
     """
     first_worker = worker_role( self.ctx )
     args = self.preparation_args
     kwargs = dict( self.preparation_kwargs,
                    instance_type=worker_instance_type,
                    leader_instance_id=self.instance_id,
                    num_instances=num_workers )
     spec = first_worker.prepare( *args, **kwargs )
     with thread_pool( pool_size ) as pool:
         return first_worker.create( spec,
                                     wait_ready=wait_ready,
                                     cluster_ordinal=self.cluster_ordinal + 1,
                                     executor=pool.apply_async )
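Examples #2 and #3 are near-identical clone( ) implementations; the variant in #3 returns the created worker boxes while #2 discards them. A hedged usage sketch, where `leader` is an already bound leader box and WorkerRole is some Box subclass (both names are hypothetical):

    # Hypothetical names: `leader` is a bound leader box, WorkerRole a Box subclass.
    workers = leader.clone( worker_role=WorkerRole,
                            num_workers=4,
                            worker_instance_type='m3.large',
                            pool_size=4,
                            wait_ready=True )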
Code example #4
File: cluster_commands.py  Project: dleehr/cgcloud
 def run_on_box( self, options, leader ):
     """
     :type leader: cgcloud.core.box.Box
     """
     log.info( '=== Creating leader ===' )
     preparation_kwargs = self.preparation_kwargs( options, leader )
     if options.leader_on_demand:
         preparation_kwargs = { k: v for k, v in preparation_kwargs.iteritems( )
             if not k.startswith( 'spot_' ) }
     spec = leader.prepare( **preparation_kwargs )
     creation_kwargs = dict( self.creation_kwargs( options, leader ),
                             num_instances=1,
                             # We must always wait for the leader since workers depend on it.
                             wait_ready=True )
     leader.create( spec, **creation_kwargs )
     try:
         self.run_on_creation( leader, options )
     except:
         if options.terminate is not False:
             with panic( log ):
                 leader.terminate( wait=False )
         raise
     # The leader is now fully set up. Even if the code below fails to add workers, the
     # GrowClusterCommand can be used to recover from that failure.
     if options.num_workers:
         log.info( '=== Creating workers ===' )
         first_worker = self.cluster.worker_role( leader.ctx )
         preparation_kwargs = dict( self.preparation_kwargs( options, first_worker ),
                                    leader_instance_id=leader.instance_id,
                                    instance_type=options.worker_instance_type )
         spec = first_worker.prepare( **preparation_kwargs )
         with thread_pool( min( options.num_threads, options.num_workers ) ) as pool:
             workers = first_worker.create( spec,
                                            cluster_ordinal=leader.cluster_ordinal + 1,
                                            executor=pool.apply_async,
                                            **self.creation_kwargs( options, first_worker ) )
     else:
         workers = [ ]
     if options.list:
         self.list( [ leader ] )
         self.list( workers, print_headers=False )
     self.log_ssh_hint( options )
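Example #4 wraps the emergency termination in `with panic( log )` so that a failure while tearing down the leader cannot mask the original exception that triggered the clean-up; the bare `raise` afterwards re-raises that original exception. The actual panic helper is not shown on this page; a minimal sketch with that assumed behaviour:

    from contextlib import contextmanager

    @contextmanager
    def panic( log ):
        """
        Run clean-up code while another exception is propagating. If the
        clean-up itself fails, log that failure and swallow it so the
        surrounding `raise` can re-raise the original exception.
        (Hypothetical sketch, not the actual cgcloud implementation.)
        """
        try:
            yield
        except Exception:
            log.exception( 'Exception during clean-up, continuing to panic' )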
Code example #5
File: cluster_commands.py  Project: kushaldas/cgcloud
 def run_on_box( self, options, first_worker ):
     log.info( '=== Binding to leader ===' )
     leader = self.cluster.leader_role( self.cluster.ctx )
     leader.bind( cluster_name=options.cluster_name,
                  ordinal=options.ordinal,
                  wait_ready=False )
     log.info( '=== Creating workers  ===' )
     workers = first_worker.list( leader_instance_id=leader.instance_id )
     used_cluster_ordinals = set( w.cluster_ordinal for w in workers )
     assert len( used_cluster_ordinals ) == len( workers )  # check for collisions
     assert 0 not in used_cluster_ordinals  # master has 0
     used_cluster_ordinals.add( 0 )  # to make the math easier
     cluster_ordinal = self.allocate_cluster_ordinals( num=options.num_workers,
                                                       used=used_cluster_ordinals )
     spec = first_worker.prepare( leader_instance_id=leader.instance_id,
                                  **self.instance_options( options, first_worker ) )
     with thread_pool( min( options.num_threads, options.num_workers ) ) as pool:
         first_worker.create( spec,
                              wait_ready=True,
                              cluster_ordinal=cluster_ordinal,
                              executor=pool.apply_async )
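All of the examples submit worker creation through a thread_pool( size ) context manager and pass pool.apply_async to create( ) as the executor. A minimal sketch of such a helper, assuming it wraps multiprocessing.pool.ThreadPool and waits for all submitted tasks before leaving the with-block (the real cgcloud helper may differ):

    from contextlib import contextmanager
    from multiprocessing.pool import ThreadPool

    @contextmanager
    def thread_pool( size ):
        """
        Yield a ThreadPool of `size` worker threads. On normal exit, wait
        for all submitted tasks to finish; on error, terminate the pool
        without waiting. (Hypothetical sketch, not the cgcloud helper.)
        """
        pool = ThreadPool( processes=size )
        try:
            yield pool
            pool.close( )  # no further submissions
            pool.join( )   # block until outstanding tasks complete
        except:
            pool.terminate( )
            raise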
Code example #6
File: cluster_commands.py  Project: fnothaft/cgcloud
 def run_on_box(self, options, first_worker):
     log.info('=== Binding to leader ===')
     leader = self.cluster.leader_role(self.cluster.ctx)
     leader.bind(cluster_name=options.cluster_name,
                 ordinal=options.ordinal,
                 wait_ready=False)
     log.info('=== Creating workers  ===')
     workers = first_worker.list(leader_instance_id=leader.instance_id)
     used_cluster_ordinals = set(w.cluster_ordinal for w in workers)
     assert len(used_cluster_ordinals) == len(
         workers)  # check for collisions
     assert 0 not in used_cluster_ordinals  # master has 0
     used_cluster_ordinals.add(0)  # to make the math easier
     cluster_ordinal = self.allocate_cluster_ordinals(
         num=options.num_workers, used=used_cluster_ordinals)
     spec = first_worker.prepare(leader_instance_id=leader.instance_id,
                                 **self.instance_options(
                                     options, first_worker))
     with thread_pool(min(options.num_threads,
                          options.num_workers)) as pool:
         first_worker.create(spec,
                             wait_ready=True,
                             cluster_ordinal=cluster_ordinal,
                             executor=pool.apply_async)
Code example #7
    def _addNodes(self, instances, numNodes, preemptable=False):
        deadline = time.time() + provisioning_timeout
        spec = dict(key_name=self._keyName,
                    user_data=self._userData(),
                    instance_type=self.instanceType[preemptable].name,
                    instance_profile_arn=self._instanceProfileArn,
                    security_group_ids=self._securityGroupIds,
                    ebs_optimized=self.ebsOptimized,
                    dry_run=False)
        # Offset the ordinals of the preemptable nodes so they are disjoint from the non-preemptable
        # ones. Without this, the two scaler threads would inevitably allocate colliding ordinals.
        offset = 1000 if preemptable else 0
        used_ordinals = {
            int(i.tags['cluster_ordinal']) - offset
            for i in instances
        }
        # Since leader is absent from the instances iterable, we need to explicitly reserve its
        # ordinal unless we're allocating offset ordinals reserved for preemptable instances:
        assert len(used_ordinals) == len(instances)  # check for collisions
        if not preemptable:
            used_ordinals.add(0)
        ordinals = (ordinal + offset for ordinal in allocate_cluster_ordinals(
            num=numNodes, used=used_ordinals))

        def createInstances():
            """
            :rtype: Iterable[list[Instance]]
            """
            if preemptable:
                for batch in create_spot_instances(
                        self._ec2,
                        self.spotBid,
                        self.imageId,
                        spec,
                        # Don't insist on spot requests and don't raise
                        # if no requests were fulfilled:
                        tentative=True,
                        num_instances=numNodes,
                        timeout=deadline - time.time()):
                    yield batch
            else:
                yield create_ondemand_instances(self._ec2,
                                                self.imageId,
                                                spec,
                                                num_instances=numNodes)

        instancesByAddress = {}

        def handleInstance(instance):
            log.debug('Tagging instance %s.', instance.id)
            leader_tags = self._instance.tags
            name = leader_tags['Name'].replace('toil-leader', 'toil-worker')
            tag_object_persistently(
                instance,
                dict(leader_tags, Name=name, cluster_ordinal=next(ordinals)))
            assert instance.private_ip_address
            instancesByAddress[instance.private_ip_address] = instance

        # Each instance gets a different ordinal so we can't tag an entire batch at once but have
        # to tag each instance individually. It needs to be done quickly because the tags are
        # crucial for the boot code running inside the instance to join the cluster. Hence we do
        # it in a thread pool. If the pool is too large, we'll hit the EC2 limit on the number
        # of concurrent requests. If it is too small, we won't be able to tag all instances in
        # time.
        with thread_pool(min(numNodes, 32)) as pool:
            for batch in createInstances():
                log.debug('Got a batch of %i instance(s).', len(batch))
                for instance in batch:
                    log.debug(
                        'Submitting instance %s to thread pool for tagging.',
                        instance.id)
                    pool.apply_async(handleInstance, (instance, ))
        numInstancesAdded = len(instancesByAddress)
        log.info('Created and tagged %i instance(s).', numInstancesAdded)

        if preemptable:
            # Reset deadline such that slow spot creation does not take away from instance boot-up
            deadline = time.time() + provisioning_timeout
        if isinstance(self.batchSystem, AbstractScalableBatchSystem):
            while instancesByAddress and time.time() < deadline:
                with throttle(10):
                    log.debug(
                        'Waiting for batch system to report back %i node(s).',
                        len(instancesByAddress))
                    # Get all nodes to be safe, not just the ones whose preemptability matches,
                    # in case there's a problem with a node determining its own preemptability.
                    nodes = self.batchSystem.getNodes()
                    for nodeAddress in nodes.iterkeys():
                        instancesByAddress.pop(nodeAddress, None)
            if instancesByAddress:
                log.warn(
                    '%i instance(s) out of %i did not join the cluster as worker nodes. They '
                    'will be terminated.', len(instancesByAddress),
                    numInstancesAdded)
                instanceIds = [i.id for i in instancesByAddress.itervalues()]
                self._logAndTerminate(instanceIds)
                numInstancesAdded -= len(instanceIds)
            else:
                log.info('All %i node(s) joined the cluster.',
                         numInstancesAdded)
        else:
            log.warn(
                'Batch system is not scalable. Assuming all instances joined the cluster.'
            )
        return numInstancesAdded
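Example #7 attaches the cluster_ordinal tag through a tag_object_persistently( instance, tags ) helper. Tagging a freshly launched EC2 instance can fail transiently because of EC2's eventual consistency, so a retrying helper is needed; a minimal sketch, assuming boto 2's TaggedEC2Object.add_tag and a simple exponential back-off (the signature and retry policy here are assumptions, not the implementation used above):

    import time

    def tag_object_persistently(ec2_object, tags, max_attempts=5):
        """
        Apply `tags` (a dict) to a freshly created EC2 object, retrying to
        paper over EC2's eventual consistency. (Hypothetical sketch, not
        the helper actually used in example #7.)
        """
        for attempt in range(max_attempts):
            try:
                for key, value in tags.items():
                    ec2_object.add_tag(key, value)  # boto 2 TaggedEC2Object.add_tag
                return
            except Exception:
                if attempt == max_attempts - 1:
                    raise
                time.sleep(2 ** attempt)  # back off before retrying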