def try_and_nested_panic_with_secondary( self ):
    """
    Raise a primary ValueError and, while handling it, raise a secondary RuntimeError from
    within two nested panic() blocks.

    Stores in self.line_of_primary_exc the line number of the statement that raises the
    primary exception.
    """
    try:
        # f_lineno is the number of the line being executed, so "+ 1" designates the raise
        # statement, which therefore has to stay on the line directly below this one.
        self.line_of_primary_exc = inspect.currentframe( ).f_lineno + 1
        raise ValueError( "primary" )
    except:
        # The secondary exception is raised inside the inner of two nested panic() blocks.
        with panic( log ):
            with panic( log ):
                raise RuntimeError( "secondary" )
def multipartReader():
    """
    Read everything from `readable` and store it either inline or as an S3 multipart upload.

    If inlining is allowed and the first chunk fits within self._maxInlinedSize(), the chunk
    is stored in self.content. Otherwise the data is uploaded part by part and the resulting
    version ID is stored in self.version. If uploading a part fails, the multipart upload is
    cancelled inside a panic() block.
    """
    chunk = readable.read(store.partSize)
    if allowInlining and len(chunk) <= self._maxInlinedSize():
        self.content = chunk
        return

    headers = self._s3EncryptionHeaders()
    for attempt in retry_s3():
        with attempt:
            upload = store.filesBucket.initiate_multipart_upload(key_name=self.fileID,
                                                                 headers=headers)
    try:
        for index in itertools.count():
            exhausted = len(chunk) == 0
            # There must be at least one part, even if the file is empty, so an empty
            # chunk only ends the loop early when it is not the very first one.
            if exhausted and index > 0:
                break
            for attempt in retry_s3():
                with attempt:
                    # A new StringIO per attempt, so a retry starts reading the chunk from
                    # its beginning. Part numbers are 1-based.
                    upload.upload_part_from_file(fp=StringIO(chunk),
                                                 part_num=index + 1,
                                                 headers=headers)
            if exhausted:
                break
            chunk = readable.read(self.outer.partSize)
    except:
        with panic(log=log):
            for attempt in retry_s3():
                with attempt:
                    upload.cancel_upload()
    else:
        for attempt in retry_s3():
            with attempt:
                self.version = upload.complete_upload().version_id
def chunkedFileUpload(readable, bucket, fileID, file_size, headers=None, partSize=50 << 20):
    """
    Upload the content of a file object to S3 as a multipart upload.

    :param readable: file object positioned at offset 0; must support tell() and seek()
    :param bucket: bucket object on which initiate_multipart_upload() is called
    :param fileID: name of the key to create
    :param int file_size: total number of bytes to upload; with a size of 0 no part is
           uploaded at all
    :param headers: headers passed along with every request
    :param int partSize: maximum number of bytes per part
    :return: the version_id of the completed upload

    If uploading a part fails, the multipart upload is cancelled inside a panic() block.
    """
    for attempt in retry_s3():
        with attempt:
            upload = bucket.initiate_multipart_upload(key_name=fileID, headers=headers)
    try:
        start = 0
        part_number = 0
        while start < file_size:
            end = min(start + partSize, file_size)
            assert readable.tell() == start
            # The part number is fixed once per part, outside the retry loop. Drawing a new
            # number on every attempt would let a retried part be stored under two numbers
            # if the first attempt reached S3 but its response was lost.
            part_number += 1
            for attempt in retry_s3():
                with attempt:
                    # A failed attempt may already have consumed some of the part's bytes,
                    # so every attempt starts over from the part's first byte.
                    readable.seek(start)
                    upload.upload_part_from_file(fp=readable,
                                                 part_num=part_number,
                                                 size=end - start,
                                                 headers=headers)
            start = end
        assert readable.tell() == file_size == start
    except:
        with panic(log=log):
            for attempt in retry_s3():
                with attempt:
                    upload.cancel_upload()
    else:
        for attempt in retry_s3():
            with attempt:
                version = upload.complete_upload().version_id
        return version
def try_and_panic( self ):
    """
    Raise a primary ValueError and handle it with a panic() block whose body does nothing.

    Stores in self.line_of_primary_exc the line number of the statement that raises the
    primary exception.
    """
    try:
        # f_lineno is the number of the line being executed, so "+ 1" designates the raise
        # statement, which therefore has to stay on the line directly below this one.
        self.line_of_primary_exc = inspect.currentframe( ).f_lineno + 1
        raise ValueError( "primary" )
    except:
        with panic( log ):
            pass
def wait_for_spot_instances(ec2, requests):
    """
    Poll the given spot requests every 30 seconds until all of them are fulfilled.

    :param ec2: connection object used for all EC2 calls
    :param requests: the spot instance requests to wait for
    :return: the result of ec2.get_all_instances() for the instances of the fulfilled requests

    On any exception, instances of requests that were already fulfilled are terminated and
    the requests still pending are cancelled, each inside a panic() block.
    """
    # A list, not a set: get_all_spot_instance_requests wants a list, so it is either O(n)
    # for the removal or O(n) for converting a set to a list on every poll.
    pending_ids = [request.id for request in requests]
    fulfilled_instance_ids = []
    poll_interval = 30

    def spot_request_not_found(e):
        return (isinstance(e, EC2ResponseError)
                and e.error_code == 'InvalidSpotInstanceRequestID.NotFound')

    # noinspection PyBroadException
    try:
        try:
            while True:
                for attempt in retry_ec2(retry_while=spot_request_not_found):
                    with attempt:
                        requests = ec2.get_all_spot_instance_requests(pending_ids)
                for request in requests:
                    if request.status.code != 'fulfilled':
                        continue
                    log.info('Request %s was fulfilled.', request.id)
                    pending_ids.remove(request.id)
                    fulfilled_instance_ids.append(request.instance_id)
                if not pending_ids:
                    log.info('All spot market requests have been fulfilled.')
                    return ec2.get_all_instances(fulfilled_instance_ids)
                log.info('%d spot market requests still pending. Waiting for %ds',
                         len(pending_ids), poll_interval)
                time.sleep(poll_interval)
        except:
            # Inner cleanup: get rid of instances that were already started.
            with panic(log):
                if fulfilled_instance_ids:
                    log.warn('Terminating instances for already fulfilled requests.')
                    ec2.terminate_instances(fulfilled_instance_ids)
    except:
        # Outer cleanup: withdraw the requests that have not been fulfilled yet.
        with panic(log):
            if pending_ids:
                log.warn('Cancelling remaining spot requests.')
                ec2.cancel_spot_instance_requests(pending_ids)
def _createExternalStore(self):
    """
    Create and return a new, uniquely named S3 bucket in self.testRegion.

    If creating the bucket fails, the S3 connection is closed inside a panic() block. On
    success the connection is left open.
    """
    import boto.s3
    connection = boto.s3.connect_to_region(self.testRegion)
    try:
        bucket_name = 'import-export-test-%s' % uuid.uuid4()
        location = region_to_bucket_location(self.testRegion)
        return connection.create_bucket(bucket_name=bucket_name, location=location)
    except:
        with panic(log=logger):
            connection.close()
def _createExternalStore(self):
    """
    Create and return a new, uniquely named S3 bucket in the region given by self.awsRegion().

    If creating the bucket fails, the S3 connection is closed inside a panic() block. On
    success the connection is left open.
    """
    import boto.s3
    connection = boto.s3.connect_to_region(self.awsRegion())
    try:
        bucket_name = 'import-export-test-%s' % uuid.uuid4()
        location = region_to_bucket_location(self.awsRegion())
        return connection.create_bucket(bucket_name=bucket_name, location=location)
    except:
        with panic(log=logger):
            connection.close()
def wait_for_spot_instances(ec2, requests):
    """
    Poll the given spot requests every 30 seconds until all of them are fulfilled.

    :param ec2: connection object used for all EC2 calls
    :param requests: the spot instance requests to wait for
    :return: the result of ec2.get_all_instances() for the instances of the fulfilled requests

    On any exception, instances of requests that were already fulfilled are terminated and
    the requests still pending are cancelled, each inside a panic() block.
    """
    # A list, not a set: get_all_spot_instance_requests wants a list, so it is either O(n)
    # for the removal or O(n) for converting a set to a list on every poll.
    waiting = [request.id for request in requests]
    started = []
    nap = 30

    def spot_request_not_found(e):
        if not isinstance(e, EC2ResponseError):
            return False
        return e.error_code == 'InvalidSpotInstanceRequestID.NotFound'

    # noinspection PyBroadException
    try:
        try:
            while True:
                for attempt in retry_ec2(retry_while=spot_request_not_found):
                    with attempt:
                        requests = ec2.get_all_spot_instance_requests(waiting)
                for request in requests:
                    if request.status.code == 'fulfilled':
                        log.info('Request %s was fulfilled.', request.id)
                        waiting.remove(request.id)
                        started.append(request.instance_id)
                if not waiting:
                    log.info('All spot market requests have been fulfilled.')
                    return ec2.get_all_instances(started)
                log.info('%d spot market requests still pending. Waiting for %ds',
                         len(waiting), nap)
                time.sleep(nap)
        except:
            # First line of cleanup: terminate what has already been started.
            with panic(log):
                if started:
                    log.warn('Terminating instances for already fulfilled requests.')
                    ec2.terminate_instances(started)
    except:
        # Second line of cleanup: cancel what is still waiting.
        with panic(log):
            if waiting:
                log.warn('Cancelling remaining spot requests.')
                ec2.cancel_spot_instance_requests(waiting)
def upload(self, localFilePath):
    """
    Store the content of a local file, choosing the mechanism by file size.

    - Files no larger than self._maxInlinedSize() are read into self.content.
    - Files no larger than self.outer.partSize are uploaded with a single request.
    - Larger files are uploaded as a multipart upload in parts of self.outer.partSize bytes.

    For the two S3 paths, self.version is set to the version ID of the uploaded key and the
    size of that key is checked against the size of the local file.

    :param str localFilePath: path of the file to upload
    """
    file_size, file_time = self._fileSizeAndTime(localFilePath)
    if file_size <= self._maxInlinedSize():
        # Binary mode, like the multipart path below, so that the inlined content is the
        # exact bytes of the file on every platform.
        with open(localFilePath, 'rb') as f:
            self.content = f.read()
    else:
        headers = self._s3EncryptionHeaders()
        if file_size <= self.outer.partSize:
            key = self.outer.filesBucket.new_key(key_name=self.fileID)
            key.name = self.fileID
            for attempt in retry_s3():
                with attempt:
                    key.set_contents_from_filename(localFilePath, headers=headers)
            self.version = key.version_id
        else:
            with open(localFilePath, 'rb') as f:
                for attempt in retry_s3():
                    with attempt:
                        upload = self.outer.filesBucket.initiate_multipart_upload(
                            key_name=self.fileID,
                            headers=headers)
                try:
                    start = 0
                    part_number = 0
                    while start < file_size:
                        end = min(start + self.outer.partSize, file_size)
                        assert f.tell() == start
                        # Fixed once per part, outside the retry loop: a new number per
                        # attempt would let a retried part be stored under two numbers if
                        # the first attempt reached S3 but its response was lost.
                        part_number += 1
                        for attempt in retry_s3():
                            with attempt:
                                # A failed attempt may have consumed some of the part's
                                # bytes already, so every attempt rewinds to its start.
                                f.seek(start)
                                upload.upload_part_from_file(fp=f,
                                                             part_num=part_number,
                                                             size=end - start,
                                                             headers=headers)
                        start = end
                    assert f.tell() == file_size == start
                except:
                    with panic(log=log):
                        for attempt in retry_s3():
                            with attempt:
                                upload.cancel_upload()
                else:
                    for attempt in retry_s3():
                        with attempt:
                            self.version = upload.complete_upload().version_id
        for attempt in retry_s3():
            with attempt:
                key = self.outer.filesBucket.get_key(self.fileID,
                                                     headers=headers,
                                                     version_id=self.version)
        assert key.size == file_size
        # Make reasonably sure that the file wasn't touched during the upload
        assert self._fileSizeAndTime(localFilePath) == (file_size, file_time)
def readFrom(self, readable):
    """
    Upload everything read from `readable` as blocks of the blob named jobStoreFileID and
    commit them as the blob's block list.

    Blocks are read in chunks of at most maxBlockSize and encrypted first if `encrypted` is
    set. If reading or uploading a block fails, the blob is deleted inside a panic() block.

    When checkForModification is set and expectedVersion is given, the commit happens under
    a lease and only if the blob's etag still equals expectedVersion.

    :raise ConcurrentFileModificationException: if the etag no longer matches
    """
    uploaded = []
    try:
        while True:
            chunk = readable.read(maxBlockSize)
            if len(chunk) == 0:
                # Safe to stop here even if nothing was ever read, since putting an empty
                # block list creates an empty blob.
                break
            if encrypted:
                chunk = encryption.encrypt(chunk, store.keyPath)
            newID = store._newFileID()
            container.put_block(blob_name=jobStoreFileID, block=chunk, blockid=newID)
            uploaded.append(newID)
    except:
        with panic(log=logger):
            # This is guaranteed to delete any uncommitted blocks.
            container.delete_blob(blob_name=jobStoreFileID)

    if not (checkForModification and expectedVersion is not None):
        # No need to check for modification, just blindly write over whatever was there.
        container.put_block_list(blob_name=jobStoreFileID,
                                 block_list=uploaded,
                                 x_ms_meta_name_values=dict(encrypted=str(encrypted)))
        return

    # Acquire a (60-second) write lock,
    lease = container.lease_blob(blob_name=jobStoreFileID, x_ms_lease_action='acquire')
    leaseID = lease['x-ms-lease-id']
    # check for modification,
    properties = container.get_blob_properties(blob_name=jobStoreFileID)
    if properties['etag'] != expectedVersion:
        container.lease_blob(blob_name=jobStoreFileID,
                             x_ms_lease_action='release',
                             x_ms_lease_id=leaseID)
        raise ConcurrentFileModificationException(jobStoreFileID)
    # commit the file,
    container.put_block_list(blob_name=jobStoreFileID,
                             block_list=uploaded,
                             x_ms_lease_id=leaseID,
                             x_ms_meta_name_values=dict(encrypted=str(encrypted)))
    # then release the lock.
    container.lease_blob(blob_name=jobStoreFileID,
                         x_ms_lease_action='release',
                         x_ms_lease_id=leaseID)
def readFrom(self, readable):
    """
    Upload everything read from `readable` as blocks of the blob named after jobStoreFileID
    and commit them as the blob's block list.

    Blocks are read in chunks of at most maxBlockSize and encrypted first if `encrypted` is
    set. If reading or uploading a block fails, the blob is deleted inside a panic() block.

    When checkForModification is set and expectedVersion is given, the commit happens under
    a lease and only if the blob's etag still equals expectedVersion.

    :raise ConcurrentFileModificationException: if the etag no longer matches
    """
    # Every container call takes the blob name as bytes; convert once.
    blobName = bytes(jobStoreFileID)
    uploaded = []
    try:
        while True:
            chunk = readable.read(maxBlockSize)
            if len(chunk) == 0:
                # Safe to stop here even if nothing was ever read, since putting an empty
                # block list creates an empty blob.
                break
            if encrypted:
                chunk = encryption.encrypt(chunk, store.keyPath)
            newID = store._newFileID()
            container.put_block(blob_name=blobName, block=chunk, blockid=newID)
            uploaded.append(newID)
    except:
        with panic(log=logger):
            # This is guaranteed to delete any uncommitted blocks.
            container.delete_blob(blob_name=blobName)

    if not (checkForModification and expectedVersion is not None):
        # No need to check for modification, just blindly write over whatever was there.
        container.put_block_list(blob_name=blobName,
                                 block_list=uploaded,
                                 x_ms_meta_name_values=dict(encrypted=str(encrypted)))
        return

    # Acquire a (60-second) write lock,
    lease = container.lease_blob(blob_name=blobName, x_ms_lease_action='acquire')
    leaseID = lease['x-ms-lease-id']
    # check for modification,
    properties = container.get_blob_properties(blob_name=blobName)
    if properties['etag'] != expectedVersion:
        container.lease_blob(blob_name=blobName,
                             x_ms_lease_action='release',
                             x_ms_lease_id=leaseID)
        raise ConcurrentFileModificationException(jobStoreFileID)
    # commit the file,
    container.put_block_list(blob_name=blobName,
                             block_list=uploaded,
                             x_ms_lease_id=leaseID,
                             x_ms_meta_name_values=dict(encrypted=str(encrypted)))
    # then release the lock.
    container.lease_blob(blob_name=blobName,
                         x_ms_lease_action='release',
                         x_ms_lease_id=leaseID)
def run_on_box(self, options, box):
    """
    Prepare and create the given box, then invoke self.run_on_creation() on it.

    On failure the box is terminated (without waiting) inside a panic() block unless
    options.terminate is False, in which case the exception is simply re-raised. On success
    the box is terminated only if options.terminate is True; otherwise an SSH hint is logged.
    """
    try:
        box_spec = box.prepare(**self.instance_options(options, box))
        box.create(box_spec, wait_ready=True)
        self.run_on_creation(box, options)
    except:
        if options.terminate is False:
            raise
        else:
            with panic():
                box.terminate(wait=False)
    else:
        if options.terminate is not True:
            self.log_ssh_hint(options)
        else:
            box.terminate()
def run_on_box(self, options, box):
    """
    Prepare and create the given box, then invoke self.run_on_creation() on it.

    On failure the box is terminated (without waiting) inside a panic() block unless
    options.terminate is False, in which case the exception is simply re-raised. On success
    the box is terminated only if options.terminate is True; otherwise an SSH hint is logged.
    """
    try:
        prepared = box.prepare(**self.instance_options(options, box))
        box.create(prepared, wait_ready=True)
        self.run_on_creation(box, options)
    except:
        keep_box = options.terminate is False
        if keep_box:
            raise
        else:
            with panic():
                box.terminate(wait=False)
    else:
        if options.terminate is True:
            box.terminate()
            return
        self.log_ssh_hint(options)
def copyKeyMultipart(srcKey, dstBucketName, dstKeyName, headers=None):
    """
    Copies a key from a source key to a destination key in multiple parts. Note that if the
    destination key exists it will be overwritten implicitly, and if it does not exist a new
    key will be created.

    :param boto.s3.key.Key srcKey: The source key to be copied from.
    :param str dstBucketName: The name of the destination bucket for the copy.
    :param str dstKeyName: The name of the destination key that will be created or overwritten.
    :param dict headers: Any headers that should be passed.

    :rtype: boto.s3.multipart.CompletedMultiPartUpload
    :return: An object representing the completed upload.
    """
    partSize = defaultPartSize
    # We need a location-agnostic connection to S3 so we can't use the one that we
    # normally use for interacting with the job store bucket.
    with closing(boto.connect_s3()) as s3:
        headers = headers or {}
        totalSize = srcKey.size
        for attempt in retry_s3():
            with attempt:
                dstBucket = s3.get_bucket(dstBucketName)
                upload = dstBucket.initiate_multipart_upload(dstKeyName, headers=headers)
        try:
            start = 0
            partNumber = 0
            while start < totalSize:
                end = min(start + partSize, totalSize)
                # Fixed once per part, outside the retry loop: a new number per attempt
                # would let a retried part be stored under two numbers if the first attempt
                # reached S3 but its response was lost. Part numbers are 1-based.
                partNumber += 1
                for attempt in retry_s3():
                    with attempt:
                        # `end` is the exclusive bound of the part; the call is given the
                        # offset of the part's last byte instead.
                        upload.copy_part_from_key(src_bucket_name=srcKey.bucket.name,
                                                  src_key_name=srcKey.name,
                                                  src_version_id=srcKey.version_id,
                                                  part_num=partNumber,
                                                  start=start,
                                                  end=end - 1,
                                                  headers=headers)
                start += partSize
        except:
            with panic(log=log):
                # Retried like every other S3 request in this function, so that a transient
                # error does not leave the incomplete upload behind.
                for attempt in retry_s3():
                    with attempt:
                        upload.cancel_upload()
        else:
            for attempt in retry_s3():
                with attempt:
                    return upload.complete_upload()
def run_on_box(self, options, box):
    """
    Prepare and create the given box, then invoke self.run_on_creation() on it.

    The EC2 key pair names from the options are resolved via box.ctx.resolve_me with
    drop_hostname=False before being passed to box.prepare().

    On failure the box is terminated (without waiting) inside a panic() block unless
    options.terminate is False, in which case the exception is simply re-raised. On success
    the box is terminated only if options.terminate is True.
    """
    try:
        resolve = functools.partial(box.ctx.resolve_me, drop_hostname=False)
        keypair_globs = map(resolve, options.ec2_keypair_names)
        box.prepare(ec2_keypair_globs=keypair_globs,
                    instance_type=options.instance_type,
                    virtualization_type=options.virtualization_type,
                    **self.instance_options(options))
        box.create(wait_ready=True)
        self.run_on_creation(box, options)
    except:
        if options.terminate is False:
            raise
        else:
            with panic():
                box.terminate(wait=False)
    else:
        if options.terminate is True:
            box.terminate()
def run_on_box(self, options, leader):
    """
    Create the leader of a cluster and, if options.num_workers is set, its workers.

    The leader is created as a single instance and always waited for. If
    options.leader_on_demand is set, all preparation arguments whose name starts with
    'spot_' are dropped for the leader. If self.run_on_creation() fails for the leader, the
    leader is terminated inside a panic() block (unless options.terminate is False) and the
    exception is re-raised.

    :type leader: cgcloud.core.box.Box
    """
    log.info('=== Creating leader ===')
    leader_prep = self.preparation_kwargs(options, leader)
    if options.leader_on_demand:
        leader_prep = dict((name, value)
                           for name, value in leader_prep.iteritems()
                           if not name.startswith('spot_'))
    leader_spec = leader.prepare(**leader_prep)
    leader_creation = dict(self.creation_kwargs(options, leader))
    # We must always wait for the leader since workers depend on it.
    leader_creation.update(num_instances=1, wait_ready=True)
    leader.create(leader_spec, **leader_creation)
    try:
        self.run_on_creation(leader, options)
    except:
        if options.terminate is not False:
            with panic(log):
                leader.terminate(wait=False)
        raise

    # Leader is fully setup, even if the code below fails to add workers,
    # the GrowClusterCommand can be used to recover from that failure.
    workers = []
    if options.num_workers:
        log.info('=== Creating workers ===')
        first_worker = self.cluster.worker_role(leader.ctx)
        worker_prep = dict(self.preparation_kwargs(options, first_worker))
        worker_prep.update(leader_instance_id=leader.instance_id,
                           instance_type=options.worker_instance_type)
        worker_spec = first_worker.prepare(**worker_prep)
        num_threads = min(options.num_threads, options.num_workers)
        with thread_pool(num_threads) as pool:
            workers = first_worker.create(worker_spec,
                                          cluster_ordinal=leader.cluster_ordinal + 1,
                                          executor=pool.apply_async,
                                          **self.creation_kwargs(options, first_worker))

    if options.list:
        self.list([leader])
        self.list(workers, print_headers=False)
    self.log_ssh_hint(options)
def _hello_world(self):
    """
    Send a small Toil script to the leader and run it there against a freshly named AWS job
    store, using the Mesos batch system.

    The job store is cleaned before the run and, if the run fails, cleaned again inside a
    panic() block.
    """
    # NOTE(review): `leader` is not defined in this function; presumably it is a
    # module-level name - confirm.
    script = 'hello_world.py'

    # The source of this nested function is what gets sent to the leader (see `body`
    # below), so its text is part of the test's behaviour. It is never called locally.
    def hello_world():
        # noinspection PyUnresolvedReferences
        from toil.job import Job
        from subprocess import check_output

        def hello(name):
            return check_output([ 'docker', 'run', '-e', 'FOO=' + name, 'ubuntu', 'bash', '-c',
                                  'echo -n Hello, $FOO!' ])

        if __name__ == '__main__':
            options = Job.Runner.getDefaultArgumentParser().parse_args()
            job = Job.wrapFn(hello, "world", cores=1, memory=1e6, disk=1e6, cache=1e6)
            result = Job.Runner.startToil(job, options)
            assert result == 'Hello, world!'

    # Drop the first source line (the `def hello_world` header) and dedent the rest, so
    # that the function's body becomes the top-level code of the script.
    body = dedent('\n'.join(getsource(hello_world).split('\n')[1:]))
    self._send_file(leader, body, script)

    def hex64(x):
        # Hexadecimal digits of int(x) without the '0x' prefix, zero-padded to at least
        # eight characters.
        return hex(int(x))[2:].zfill(8)

    # Could use UUID but prefer historical ordering. Time in s plus PID is sufficiently unique.
    job_store = 'test-%s%s-toil-job-store' % (hex64( time.time()), hex64(os.getpid()))
    job_store = ':'.join(('aws', self.ctx.region, job_store))
    self._ssh(leader, 'toil', 'clean', job_store)
    try:
        self._ssh(leader, 'python2.7', script, '--batchSystem=mesos',
                  '--mesosMaster=mesos-master:5050', job_store)
    except:
        # Only a failed run triggers the second clean.
        with panic():
            self._ssh(leader, 'toil', 'clean', job_store)
def _test(self, box_cls):
    """
    Exercise the create / stop / image / terminate / recreate cycle for the role of the given
    box class.

    After recreating the box, a file is created on it, copied to the current directory,
    checked for existence and removed locally. The image is deleted in any case once it was
    created; on any failure the box is terminated with '-q' inside a panic() block.
    """
    role = box_cls.role()
    self._cgcloud("create", role)
    try:
        for command in ("stop", "image"):
            self._cgcloud(command, role)
        try:
            for command in ("terminate", "recreate"):
                self._cgcloud(command, role)
            marker = "foo-" + role
            self._ssh(role, "touch", marker)
            self._rsync(role, ":" + marker, ".")
            self.assertTrue(os.path.exists(marker))
            os.unlink(marker)
            self._cgcloud("terminate", role)
        finally:
            self._cgcloud("delete-image", role)
    except:
        with panic(log):
            self._cgcloud("terminate", "-q", role)
def _test(self, box_cls):
    """
    Exercise the create / stop / image / terminate / recreate cycle for the role of the given
    box class.

    After recreating the box, a file is created on it, copied to the current directory,
    checked for existence and removed locally. The image is deleted in any case once it was
    created; on any failure the box is terminated with '-q' inside a panic() block.
    """
    role = box_cls.role()

    def cgcloud(*args):
        # Shorthand for the commands that take nothing but the role after their own args.
        self._cgcloud(*(args + (role,)))

    cgcloud('create')
    try:
        cgcloud('stop')
        cgcloud('image')
        try:
            cgcloud('terminate')
            cgcloud('recreate')
            probe = 'foo-' + role
            self._ssh(role, 'touch', probe)
            self._rsync(role, ':' + probe, '.')
            self.assertTrue(os.path.exists(probe))
            os.unlink(probe)
            cgcloud('terminate')
        finally:
            cgcloud('delete-image')
    except:
        with panic(log):
            cgcloud('terminate', '-q')
def run_on_box(self, options, box):
    """
    Prepare and create the given box, then invoke self.run_on_creation() on it.

    If run_on_creation() fails, the box is terminated (without waiting) inside a panic()
    block unless options.terminate is False, and the exception is re-raised. On success the
    box is listed if options.list is set, and then either terminated (options.terminate is
    True) or an SSH hint is logged.

    :type box: Box
    """
    prepared = box.prepare(**self.preparation_kwargs(options, box))
    box.create(prepared, **self.creation_kwargs(options, box))
    try:
        self.run_on_creation(box, options)
    except:
        if options.terminate is not False:
            with panic(log):
                box.terminate(wait=False)
        raise

    # Only reached on success since the handler above always re-raises.
    if options.list:
        self.list([box])
    if options.terminate is not True:
        self.log_ssh_hint(options)
    else:
        box.terminate()
def _test(self, box_cls):
    """
    Exercise the create / stop / image / terminate / recreate cycle for the role of the given
    box class.

    After recreating the box, a file is created on it, copied to the current directory,
    checked for existence and removed locally. The image is deleted in any case once it was
    created; on any failure the box is terminated with '--quick' inside a panic() block.
    """
    role = box_cls.role()
    self._cgcloud('create', role)
    try:
        for command in ('stop', 'image'):
            self._cgcloud(command, role)
        try:
            for command in ('terminate', 'recreate'):
                self._cgcloud(command, role)
            marker = 'foo-' + role
            self._ssh(role, 'touch', marker)
            self._rsync(role, ':' + marker, '.')
            self.assertTrue(os.path.exists(marker))
            os.unlink(marker)
            self._cgcloud('terminate', role)
        finally:
            self._cgcloud('delete-image', role)
    except:
        with panic(log):
            self._cgcloud('terminate', '--quick', role)
def run_on_box(self, options, box):
    """
    Prepare and create the given box, then invoke self.run_on_creation() on it.

    If run_on_creation() fails, the box is terminated (without waiting) inside a panic()
    block unless options.terminate is False, and the exception is re-raised. On success the
    box is listed if options.list is set, and then either terminated (options.terminate is
    True) or an SSH hint is logged.

    :type box: Box
    """
    box_spec = box.prepare(**self.preparation_kwargs(options, box))
    creation_kwargs = self.creation_kwargs(options, box)
    box.create(box_spec, **creation_kwargs)
    try:
        self.run_on_creation(box, options)
    except:
        if options.terminate is not False:
            with panic(log):
                box.terminate(wait=False)
        raise

    # The handler above always re-raises, so this is the success path.
    if options.list:
        self.list([box])
    if options.terminate is True:
        box.terminate()
        return
    self.log_ssh_hint(options)
def _hello_world( self ):
    """
    Send a small Toil script to the leader and run it there against a freshly named AWS job
    store, using the Mesos batch system.

    The job store is cleaned before the run and, if the run fails, cleaned again inside a
    panic() block.
    """
    # NOTE(review): `leader` is not defined in this function; presumably it is a
    # module-level name - confirm.
    script = 'hello_world.py'

    # The source of this nested function is what gets sent to the leader (see `body`
    # below), so its text is part of the test's behaviour. It is never called locally.
    def hello_world( ):
        # noinspection PyUnresolvedReferences
        from toil.job import Job
        from subprocess import check_output
        import os

        def hello( name ):
            assert os.environ[ 'TOIL_WORKDIR' ] == '/var/lib/toil'
            return check_output( [ 'docker', 'run', '-e', 'FOO=' + name, 'ubuntu', 'bash', '-c',
                                   'echo -n Hello, $FOO!' ] )

        if __name__ == '__main__':
            options = Job.Runner.getDefaultArgumentParser( ).parse_args( )
            job = Job.wrapFn( hello, "world", cores=1, memory=1e6, disk=1e6, cache=1e6 )
            result = Job.Runner.startToil( job, options )
            assert result == 'Hello, world!'

    # Drop the first source line (the `def hello_world` header) and dedent the rest, so
    # that the function's body becomes the top-level code of the script.
    body = dedent( '\n'.join( getsource( hello_world ).split( '\n' )[ 1: ] ) )
    self._send_file( leader, body, script )

    def hex64( x ):
        # Hexadecimal digits of int(x) without the '0x' prefix, zero-padded to at least
        # eight characters.
        return hex( int( x ) )[ 2: ].zfill( 8 )

    # Could use UUID but prefer historical ordering. Time in s plus PID is sufficiently unique.
    job_store = 'test-%s%s-toil-job-store' % (hex64( time.time( ) ), hex64( os.getpid( ) ))
    job_store = ':'.join( ('aws', self.ctx.region, job_store) )
    self._ssh( leader, 'toil', 'clean', job_store )
    try:
        self._ssh( leader, 'python2.7', script, '--batchSystem=mesos',
                   '--mesosMaster=mesos-master:5050', job_store )
    except:
        # Only a failed run triggers the second clean.
        with panic( log ):
            self._ssh( leader, 'toil', 'clean', job_store )
def _getKeyForUrl(url, existing=None):
    """
    Extracts a key from a given s3:// URL. On return, but not on exceptions, this method
    leaks an S3Connection object. The caller is responsible to close that by calling
    key.bucket.connection.close().

    :param url: the parsed URL; its netloc names the bucket and its path, minus the leading
           character, names the key
    :param bool existing: If True, key is expected to exist. If False, key is expected not
           to exist and it will be created. If None, the key will be created if it doesn't
           exist.

    :raise RuntimeError: if the existence of the key contradicts `existing`

    :rtype: Key
    """
    # Get the bucket's region to avoid a redirect per request
    with closing(boto.connect_s3()) as globalConnection:
        location = globalConnection.get_bucket(url.netloc).get_location()
        region = bucket_location_to_region(location)

    # Note that caller is responsible for closing the connection
    s3 = boto.s3.connect_to_region(region)
    try:
        bucket = s3.get_bucket(url.netloc)
        key = bucket.get_key(url.path[1:])
        if existing is True:
            if key is None:
                raise RuntimeError('Key does not exist.')
        elif existing is False:
            if key is not None:
                raise RuntimeError('Key exists.')
        else:
            # Anything but True, False and None is a programming error.
            assert existing is None
        if key is None:
            key = bucket.new_key(url.path[1:])
    except:
        with panic():
            s3.close()
    else:
        return key
def docker_call(tool, parameters=None, work_dir='.', rm=True, env=None, outfile=None,
                inputs=None, outputs=None, docker_parameters=None, check_output=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination
           convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with
           either None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output. Ignored if
           `outfile` is given.
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will
           be determined by the environment variable.
    :return: docker's output if check_output is True and no outfile is given, else None. In
             mock mode, always None.
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert (os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        # In mock mode docker is not run at all; the declared outputs are either faked or
        # downloaded from the URL given for them.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if not os.path.exists(file_path):
                if url is None:
                    # create mock file
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
                else:
                    download_url(url, work_dir=work_dir, name=filename)
            assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.iteritems():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    call = base_docker_call + [tool] + parameters
    _log.debug("Calling docker with %s.", " ".join(call))

    output = None
    try:
        if outfile:
            subprocess.check_call(call, stdout=outfile)
        elif check_output:
            # Do not return from inside the try block: a return would skip the else clause
            # below and with it the fixing of the output files' permissions.
            output = subprocess.check_output(call)
        else:
            subprocess.check_call(call)
    # Fix root ownership of output files
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, work_dir)
    else:
        _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert (os.path.isfile(filename))

    return output
def wait_spot_requests_active(ec2, requests, timeout=None, tentative=False):
    """
    Wait until no spot request in the given iterator is in the 'open' state or, optionally,
    a timeout occurs. Yield spot requests as soon as they leave the 'open' state.

    :param ec2: connection object used to re-query and cancel spot requests

    :param Iterator[SpotInstanceRequest] requests: the requests to wait for

    :param float timeout: Maximum time in seconds to spend waiting or None to wait forever.
           If a timeout occurs, the remaining open requests will be cancelled.

    :param bool tentative: if True, give up on a spot request at the earliest indication of
           it not being fulfilled immediately

    :rtype: Iterator[list[SpotInstanceRequest]]
    """
    deadline = None if timeout is None else time.time() + timeout
    active_ids = set()
    other_ids = set()
    open_ids = None

    def cancel():
        log.warn('Cancelling remaining %i spot requests.', len(open_ids))
        ec2.cancel_spot_instance_requests(list(open_ids))

    def spot_request_not_found(e):
        return (isinstance(e, EC2ResponseError)
                and e.error_code == 'InvalidSpotInstanceRequestID.NotFound')

    try:
        while True:
            open_ids, eval_ids, fulfill_ids = set(), set(), set()
            batch = []
            for r in requests:
                if r.state == 'open':
                    open_ids.add(r.id)
                    if r.status.code == 'pending-evaluation':
                        eval_ids.add(r.id)
                    elif r.status.code == 'pending-fulfillment':
                        fulfill_ids.add(r.id)
                    else:
                        log.info(
                            'Request %s entered status %s indicating that it will not be '
                            'fulfilled anytime soon.', r.id, r.status.code)
                else:
                    # Requests that left the 'open' state are handed out exactly once.
                    seen_ids = active_ids if r.state == 'active' else other_ids
                    assert r.id not in seen_ids
                    seen_ids.add(r.id)
                    batch.append(r)
            if batch:
                yield batch
            log.info(
                '%i spot requests(s) are open (%i of which are pending evaluation and %i '
                'are pending fulfillment), %i are active and %i are in another state.',
                len(open_ids), len(eval_ids), len(fulfill_ids), len(active_ids),
                len(other_ids))
            if not open_ids:
                break
            if tentative and not eval_ids and not fulfill_ids:
                break
            sleep_time = 2 * a_short_time
            if deadline is not None and time.time() + sleep_time >= deadline:
                log.warn('Timed out waiting for spot requests.')
                break
            log.info('Sleeping for %is', sleep_time)
            time.sleep(sleep_time)
            for attempt in retry_ec2(retry_while=spot_request_not_found):
                with attempt:
                    requests = ec2.get_all_spot_instance_requests(list(open_ids))
    except:
        if open_ids:
            with panic(log):
                cancel()
        raise

    # Only reached when the loop ended normally since the handler above always re-raises.
    if open_ids:
        cancel()
def docker_call(tool, parameters=None, work_dir='.', rm=True, env=None, outfile=None,
                inputs=None, outputs=None, docker_parameters=None, check_output=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    :param list[str] parameters: Command line arguments to be passed to the tool
    :param str work_dir: Directory to mount into the container via `-v`. Destination
           convention is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe output of Docker call to file handle
    :param list[str] inputs: A list of the input files.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with
           either None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Parameters to pass to docker
    :param bool check_output: When True, this function returns docker's output. Ignored if
           `outfile` is given.
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value will
           be determined by the environment variable.
    :return: docker's output if check_output is True and no outfile is given, else None. In
             mock mode, always None.
    """
    from toil_scripts.lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert(os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        # In mock mode docker is not run at all; the declared outputs are either faked or
        # downloaded from the URL given for them.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if not os.path.exists(file_path):
                if url is None:
                    # create mock file
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
                else:
                    download_url(url, work_dir=work_dir, name=filename)
            assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        for e, v in env.iteritems():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    call = base_docker_call + [tool] + parameters
    _log.debug("Calling docker with %s.", " ".join(call))

    output = None
    try:
        if outfile:
            subprocess.check_call(call, stdout=outfile)
        elif check_output:
            # Do not return from inside the try block: a return would skip the else clause
            # below and with it the fixing of the output files' permissions.
            output = subprocess.check_output(call)
        else:
            subprocess.check_call(call)
    # Fix root ownership of output files
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, work_dir)
    else:
        _fix_permissions(base_docker_call, tool, work_dir)

    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert(os.path.isfile(filename))

    return output
def wait_spot_requests_active(ec2, requests, timeout=None, tentative=False):
    """
    Wait until no spot request in the given iterator is in the 'open' state or, optionally,
    a timeout occurs. Yield spot requests as soon as they leave the 'open' state.

    :param ec2: connection object used to re-query and cancel spot requests

    :param Iterator[SpotInstanceRequest] requests: the requests to wait for

    :param float timeout: Maximum time in seconds to spend waiting or None to wait forever.
           If a timeout occurs, the remaining open requests will be cancelled.

    :param bool tentative: if True, give up on a spot request at the earliest indication of
           it not being fulfilled immediately

    :rtype: Iterator[list[SpotInstanceRequest]]
    """
    if timeout is None:
        give_up_at = None
    else:
        give_up_at = time.time() + timeout
    active_ids = set()
    other_ids = set()
    open_ids = None

    def cancel():
        log.warn('Cancelling remaining %i spot requests.', len(open_ids))
        ec2.cancel_spot_instance_requests(list(open_ids))

    def spot_request_not_found(e):
        if not isinstance(e, EC2ResponseError):
            return False
        return e.error_code == 'InvalidSpotInstanceRequestID.NotFound'

    try:
        while True:
            open_ids, eval_ids, fulfill_ids = set(), set(), set()
            batch = []
            for r in requests:
                if r.state != 'open':
                    # Requests that left the 'open' state are handed out exactly once.
                    if r.state == 'active':
                        assert r.id not in active_ids
                        active_ids.add(r.id)
                    else:
                        assert r.id not in other_ids
                        other_ids.add(r.id)
                    batch.append(r)
                    continue
                open_ids.add(r.id)
                if r.status.code == 'pending-evaluation':
                    eval_ids.add(r.id)
                elif r.status.code == 'pending-fulfillment':
                    fulfill_ids.add(r.id)
                else:
                    log.info(
                        'Request %s entered status %s indicating that it will not be '
                        'fulfilled anytime soon.', r.id, r.status.code)
            if batch:
                yield batch
            counts = [len(ids) for ids in (open_ids, eval_ids, fulfill_ids, active_ids,
                                           other_ids)]
            log.info(
                '%i spot requests(s) are open (%i of which are pending evaluation and %i '
                'are pending fulfillment), %i are active and %i are in another state.',
                *counts)
            nothing_pending = not eval_ids and not fulfill_ids
            if not open_ids or tentative and nothing_pending:
                break
            sleep_time = 2 * a_short_time
            if give_up_at is not None and time.time() + sleep_time >= give_up_at:
                log.warn('Timed out waiting for spot requests.')
                break
            log.info('Sleeping for %is', sleep_time)
            time.sleep(sleep_time)
            for attempt in retry_ec2(retry_while=spot_request_not_found):
                with attempt:
                    requests = ec2.get_all_spot_instance_requests(list(open_ids))
    except:
        if open_ids:
            with panic(log):
                cancel()
        raise

    # Only reached when the loop ended normally since the handler above always re-raises.
    if open_ids:
        cancel()
def docker_call(tool=None,
                tools=None,
                parameters=None,
                work_dir='.',
                rm=True,
                env=None,
                outfile=None,
                errfile=None,
                inputs=None,
                outputs=None,
                docker_parameters=None,
                check_output=False,
                return_stderr=False,
                mock=None):
    """
    Calls Docker, passing along parameters and tool.

    :param (str tool | str tools): str tool name of the Docker image to be used
           (e.g. tool='quay.io/ucsc_cgl/samtools') OR str tools, the Docker image in which the
           elements of `parameters` are run as a single shell pipe
           (e.g. tools='quay.io/ucsc_cgl/samtools'). tool and tools are mutually exclusive;
           exactly one of them must be given.
    :param list[str] parameters: Command line arguments to be passed to the tool. When `tools`
           is used, each element is one stage of the pipe.
    :param str work_dir: Directory to mount into the container via `-v`. Destination convention
           is /data
    :param bool rm: Set to True to pass `--rm` flag.
    :param dict[str,str] env: Environment variables to be added (e.g. dict(JAVA_OPTS='-Xmx15G'))
    :param file outfile: Pipe stdout of Docker call to file handle
    :param file errfile: Pipe stderr of Docker call to file handle. Only honoured when `outfile`
           is given as well.
    :param list[str] inputs: A list of the input files, relative to work_dir. Each one must
           exist before the call.
    :param dict[str,str] outputs: A dictionary containing the outputs files as keys with either
           None or a url. The value is only used if mock=True
    :param list[str] docker_parameters: Additional parameters to pass to `docker run`
    :param bool check_output: When True (and no outfile is given), this function returns
           docker's output
    :param bool return_stderr: When True, this function includes stderr in docker's output
    :param bool mock: Whether to run in mock mode. If this variable is unset, its value is
           taken from mock_mode().
    :return: docker's output if check_output is True and outfile is not set, otherwise None.
             In mock mode nothing is run and None is returned.

    Pipes in docker commands:
    Running a pipe in docker in 'pipe-in-single-container' mode produces command structure
        docker '... | ... | ...'
    where each '...' command corresponds to each element in the 'parameters' argument that
    uses a docker container. This is the most efficient method if you want to run a pipe of
    commands where each command uses the same docker container.

    Example for running command 'head -c 1M /dev/urandom | gzip | gunzip | md5sum 1>&2':
        Running 'pipe-in-single-container' mode:
            command= ['head -c 1M /dev/urandom', 'gzip', 'gunzip', 'md5sum 1>&2']
            docker_work_dir=curr_work_dir
            docker_tools='ubuntu'
            stdout = docker_call(work_dir=docker_work_dir, parameters=command,
                                 tools=docker_tools, check_output=True)
    """
    from toil_lib.urls import download_url

    if mock is None:
        mock = mock_mode()
    if parameters is None:
        parameters = []
    if inputs is None:
        inputs = []
    if outputs is None:
        outputs = {}

    for filename in inputs:
        assert(os.path.isfile(os.path.join(work_dir, filename)))

    if mock:
        # Mock mode: fabricate or download every declared output instead of running docker.
        for filename, url in outputs.items():
            file_path = os.path.join(work_dir, filename)
            if url is None:
                # create mock file
                if not os.path.exists(file_path):
                    with open(file_path, 'w') as f:
                        f.write("contents")  # FIXME
            else:
                if not os.path.exists(file_path):
                    download_url(url, work_dir=work_dir, name=filename)
                assert os.path.exists(file_path)
        return

    base_docker_call = ['docker', 'run',
                        '--log-driver=none',
                        '-v', '{}:/data'.format(os.path.abspath(work_dir))]
    if rm:
        base_docker_call.append('--rm')
    if env:
        # items() works on both Python 2 and 3, unlike iteritems()
        for e, v in env.items():
            base_docker_call.extend(['-e', '{}={}'.format(e, v)])
    if docker_parameters:
        base_docker_call += docker_parameters

    require(bool(tools) != bool(tool),
            'Either "tool" or "tools" must contain a value, but not both')

    # NOTE: the command is assembled into a single string and run with shell=True without any
    # quoting of `parameters`; callers must not pass untrusted input.
    if tools:
        # Pipe functionality: each element in the parameters list must represent a sub-pipe
        # command. Format the docker call in the 'pipe-in-single-container' mode.
        command = " ".join(base_docker_call + ['--entrypoint /bin/bash', tools,
                                               '-c \'{}\''.format(" | ".join(parameters))])
    else:
        command = " ".join(base_docker_call + [tool] + parameters)
    _log.debug("Calling docker with %s.", command)

    # The output is captured in a variable rather than returned from inside the try block:
    # a `return` there would skip the `else` clause and with it the permission fix-up and the
    # check of the output files.
    output = None
    try:
        if outfile:
            if errfile:
                subprocess.check_call(command, stdout=outfile, stderr=errfile, shell=True)
            else:
                subprocess.check_call(command, stdout=outfile, shell=True)
        elif check_output:
            if return_stderr:
                output = subprocess.check_output(command, shell=True,
                                                 stderr=subprocess.STDOUT)
            else:
                output = subprocess.check_output(command, shell=True)
        else:
            subprocess.check_call(command, shell=True)
    # Fix root ownership of output files
    except:
        # Panic avoids hiding the exception raised in the try block
        with panic():
            _fix_permissions(base_docker_call, tool, tools, work_dir)
    else:
        _fix_permissions(base_docker_call, tool, tools, work_dir)

    # Every declared output must have been produced by the container.
    for filename in outputs.keys():
        if not os.path.isabs(filename):
            filename = os.path.join(work_dir, filename)
        assert(os.path.isfile(filename))

    return output
def copyKeyMultipart(srcKey, dstBucketName, dstKeyName, partSize, headers=None):
    """
    Copies a key from a source key to a destination key in multiple parts. Note that if the
    destination key exists it will be overwritten implicitly, and if it does not exist a new
    key will be created. If the destination bucket does not exist an error will be raised.

    The parts are copied concurrently by a pool of threads. If any part fails, the multipart
    upload is cancelled and the error is propagated.

    :param boto.s3.key.Key srcKey: The source key to be copied from.
    :param str dstBucketName: The name of the destination bucket for the copy.
    :param str dstKeyName: The name of the destination key that will be created or overwritten.
    :param int partSize: The size of each individual part, must be >= 5 MiB but large enough to
           not exceed 10k parts for the whole file
    :param dict headers: Any headers that should be passed.

    :rtype: boto.s3.multipart.CompletedMultiPartUpload
    :return: An object representing the completed upload.
    :raise RuntimeError: if one or more parts could not be copied
    """

    def copyPart(partIndex):
        # Once any part has failed the whole copy is doomed, so skip the remaining work.
        if exceptions:
            return None
        try:
            for attempt in retry_s3():
                with attempt:
                    start = partIndex * partSize
                    end = min(start + partSize, totalSize)
                    part = upload.copy_part_from_key(src_bucket_name=srcKey.bucket.name,
                                                     src_key_name=srcKey.name,
                                                     src_version_id=srcKey.version_id,
                                                     # S3 part numbers are 1-based
                                                     part_num=partIndex + 1,
                                                     # S3 range intervals are closed at the end
                                                     start=start, end=end - 1,
                                                     headers=headers)
        except Exception as e:
            # Log the first few failures with a full traceback, the rest only briefly.
            if len(exceptions) < 5:
                exceptions.append(e)
                log.error('Failed to copy part number %d:', partIndex, exc_info=True)
            else:
                log.warning('Also failed to copy part number %d due to %s.', partIndex, e)
            return None
        else:
            log.debug('Successfully copied part %d of %d.', partIndex, totalParts)
            # noinspection PyUnboundLocalVariable
            return part

    totalSize = srcKey.size
    # Ceiling division. Floor division is spelled out so the result stays an int even when
    # true division is in effect.
    # NOTE(review): a zero-byte source yields totalParts == 0 and therefore max_workers == 0
    # below, which ThreadPoolExecutor is expected to reject - confirm callers never pass
    # empty keys.
    totalParts = (totalSize + partSize - 1) // partSize
    exceptions = []
    # We need a location-agnostic connection to S3 so we can't use the one that we
    # normally use for interacting with the job store bucket.
    with closing(boto.connect_s3()) as s3:
        for attempt in retry_s3():
            with attempt:
                dstBucket = s3.get_bucket(dstBucketName)
                upload = dstBucket.initiate_multipart_upload(dstKeyName, headers=headers)
        log.info("Initiated multipart copy from 's3://%s/%s' to 's3://%s/%s'.",
                 srcKey.bucket.name, srcKey.name, dstBucketName, dstKeyName)
        try:
            # We can oversubscribe cores by at least a factor of 16 since each copy task just
            # blocks, waiting on the server. Limit # of threads to 128, since threads aren't
            # exactly free either. Lastly, we don't need more threads than we have parts.
            with ThreadPoolExecutor(max_workers=min(cpu_count() * 16,
                                                    totalParts,
                                                    128)) as executor:
                parts = list(executor.map(copyPart, range(totalParts)))
                if exceptions:
                    raise RuntimeError('Failed to copy at least %d part(s)' % len(exceptions))
                assert len([part for part in parts if part]) == totalParts
        except:
            # Cancel the upload without masking the exception being handled. The cancellation
            # is retried like the other multipart uploads in this file.
            with panic(log=log):
                for attempt in retry_s3():
                    with attempt:
                        upload.cancel_upload()
        else:
            for attempt in retry_s3():
                with attempt:
                    completed = upload.complete_upload()
                    log.info("Completed copy from 's3://%s/%s' to 's3://%s/%s'.",
                             srcKey.bucket.name, srcKey.name, dstBucketName, dstKeyName)
                    return completed
def wait_spot_requests_active(ec2, requests, timeout=None):
    """
    Poll a set of spot requests until none of them is in the 'open' state any more, yielding
    the requests that have left the 'open' state in batches, one batch per polling round.

    :param ec2: connection object used to re-query and to cancel spot requests

    :param Iterator[SpotInstanceRequest] requests: the spot requests to watch

    :param float timeout: Maximum number of seconds to keep polling, or None to poll without a
           time limit. Requests that are still open once the time limit has passed are
           cancelled.

    :rtype: Iterator[list[SpotInstanceRequest]]
    """
    # Convert the relative timeout into an absolute point in time.
    deadline = None if timeout is None else time.time() + timeout
    active_ids = set()
    other_ids = set()
    open_ids = None

    def cancel_open_requests():
        log.warn('Cancelling remaining %i spot requests.', len(open_ids))
        ec2.cancel_spot_instance_requests(list(open_ids))

    def is_request_not_found(e):
        return (isinstance(e, EC2ResponseError)
                and e.error_code == 'InvalidSpotInstanceRequestID.NotFound')

    try:
        while deadline is None or time.time() < deadline:
            open_ids = set()
            departed = []
            for request in requests:
                if request.state == 'open':
                    open_ids.add(request.id)
                    continue
                # The request left the 'open' state; it must be reported exactly once.
                seen_ids = active_ids if request.state == 'active' else other_ids
                assert request.id not in seen_ids
                seen_ids.add(request.id)
                departed.append(request)
            if departed:
                yield departed
            log.info('%i spot requests(s) open, %i active, %i other.',
                     len(open_ids), len(active_ids), len(other_ids))
            if not open_ids:
                return
            sleep_time = 2 * a_short_time
            log.info('Sleeping for %is', sleep_time)
            time.sleep(sleep_time)
            # Only the requests that are still open need to be looked at again.
            for attempt in retry_ec2(retry_while=is_request_not_found):
                with attempt:
                    requests = ec2.get_all_spot_instance_requests(list(open_ids))
        # The loop condition failed, i.e. the deadline has passed.
        log.warn('Timed out waiting for spot requests.')
        if open_ids:
            cancel_open_requests()
    except:
        # Cancel whatever is still open without masking the exception being handled.
        if open_ids:
            with panic(log):
                cancel_open_requests()
        raise