def write(self, digest, content):
  assert content is not None
  with self._lock:
    self._protected = self._protected or digest
  path = self._path(digest)
  # A stale broken file may remain. It is possible for the file to have the
  # write access bit removed, which would cause the file_write() call to fail
  # to open in write mode. Take no chance here.
  file_path.try_remove(path)
  try:
    size = file_write(path, content)
  except:
    # There are two possible places where an exception can occur:
    # 1) Inside the |content| generator in case of network or unzipping
    #    errors.
    # 2) Inside file_write itself in case of disk IO errors.
    # In any case delete the incomplete file and propagate the exception to
    # the caller; it will be logged there.
    file_path.try_remove(path)
    raise
  # Make the file read-only in the cache. This has a few side-effects since
  # the file node is modified, so every directory entry to this file becomes
  # read-only. It's fine here because it is a new file.
  file_path.set_read_only(path, True)
  with self._lock:
    self._add(digest, size)
  return digest
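# --- Illustrative sketch (not part of the original module) ---
# write() above follows a "never leave a partial or writable file behind"
# pattern. Below is a minimal standalone version of the same pattern using
# only the stdlib; the name write_cached_file and the direct os/stat calls
# are assumptions for illustration, the real code goes through the fs and
# file_path wrappers.
import os
import stat

def write_cached_file(path, chunks):
  # Remove any stale leftover first; it may be read-only from a previous run.
  try:
    os.remove(path)
  except OSError:
    pass
  size = 0
  try:
    with open(path, 'wb') as f:
      for chunk in chunks:
        f.write(chunk)
        size += len(chunk)
  except Exception:
    # Never keep an incomplete file; re-raise so the caller can log it.
    try:
      os.remove(path)
    except OSError:
      pass
    raise
  # Equivalent in spirit to file_path.set_read_only(path, True).
  os.chmod(path, stat.S_IRUSR)
  return size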
def _load(self, trim, time_fn):
  """Loads the state of the cache from the json file.

  If cache_dir does not exist on disk, it is created.
  """
  self._lock.assert_locked()
  if not fs.isfile(self.state_file):
    if not fs.isdir(self.cache_dir):
      fs.makedirs(self.cache_dir)
  else:
    # Load state of the cache.
    try:
      self._lru = lru.LRUDict.load(self.state_file)
    except ValueError as err:
      logging.error('Failed to load cache state: %s' % (err,))
      # Don't want to keep a broken state file.
      file_path.try_remove(self.state_file)
  if time_fn:
    self._lru.time_fn = time_fn
  if trim:
    self._trim()
  # We want the initial cache size after trimming, i.e. what is readily
  # available.
  self._initial_number_items = len(self._lru)
  self._initial_size = sum(self._lru.itervalues())
  if self._evicted:
    logging.info(
        'Trimming evicted items with the following sizes: %s',
        sorted(self._evicted))
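# --- Illustrative sketch (not part of the original module) ---
# The defensive pattern from _load() in isolation: create the cache directory
# if it is missing, and treat a corrupt state file as absent rather than
# fatal. The flat json layout here is hypothetical; the real format is
# whatever lru.LRUDict.load() expects.
import json
import logging
import os

def load_state(state_file, cache_dir):
  items = {}
  if not os.path.isfile(state_file):
    if not os.path.isdir(cache_dir):
      os.makedirs(cache_dir)
  else:
    try:
      with open(state_file, 'rb') as f:
        items = json.load(f)
    except ValueError as err:
      logging.error('Failed to load cache state: %s', err)
      # A broken state file is worse than no state file: drop it.
      os.remove(state_file)
  return items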
def _delete_file(self, digest, size=UNKNOWN_FILE_SIZE):
  """Deletes cache file from the file system."""
  self._lock.assert_locked()
  try:
    if size == UNKNOWN_FILE_SIZE:
      try:
        size = fs.stat(self._path(digest)).st_size
      except OSError:
        size = 0
    file_path.try_remove(self._path(digest))
    self._evicted.append(size)
    self._free_disk += size
  except OSError as e:
    if e.errno != errno.ENOENT:
      logging.error(
          'Error attempting to delete a file %s:\n%s' % (digest, e))
def cleanup(self):
  """Cleans up the cache directory.

  Ensures there are no unknown files in cache_dir.
  Ensures the read-only bits are set correctly.

  At that point, the cache was already loaded, trimmed to respect cache
  policies.
  """
  with self._lock:
    fs.chmod(self.cache_dir, 0700)
    # Ensure that all files listed in the state still exist and add new ones.
    previous = set(self._lru)
    # It'd be faster if there were a readdir() function.
    for filename in fs.listdir(self.cache_dir):
      if filename == self.STATE_FILE:
        fs.chmod(os.path.join(self.cache_dir, filename), 0600)
        continue
      if filename in previous:
        fs.chmod(os.path.join(self.cache_dir, filename), 0400)
        previous.remove(filename)
        continue
      # An untracked file. Delete it.
      logging.warning('Removing unknown file %s from cache', filename)
      p = self._path(filename)
      if fs.isdir(p):
        try:
          file_path.rmtree(p)
        except OSError:
          pass
      else:
        file_path.try_remove(p)
      continue
    if previous:
      # Filter out entries that were not found.
      logging.warning('Removed %d lost files', len(previous))
      for filename in previous:
        self._lru.pop(filename)
      self._save()
def _load(self, trim, time_fn):
  """Loads the state of the cache from the json file.

  If cache_dir does not exist on disk, it is created.
  """
  self._lock.assert_locked()
  if not fs.isfile(self.state_file):
    if not fs.isdir(self.cache_dir):
      fs.makedirs(self.cache_dir)
  else:
    # Load state of the cache.
    try:
      self._lru = lru.LRUDict.load(self.state_file)
    except ValueError as err:
      logging.error('Failed to load cache state: %s' % (err,))
      # Don't want to keep a broken state file.
      file_path.try_remove(self.state_file)
  if time_fn:
    self._lru.time_fn = time_fn
  if trim:
    self._trim()
def run_command(remote, task_details, work_dir, cost_usd_hour, task_start,
                run_isolated_flags, bot_file, ctx_file):
  """Runs a command and sends packets to the server to stream results back.

  Implements both I/O and hard timeouts. Sends the packets numbered, so the
  server can ensure they are processed in order.

  Returns:
    Metadata dict with the execution result.

  Raises:
    ExitSignal if caught some signal when starting or stopping.
    InternalError on unexpected internal errors.
  """
  # Signal the command is about to be started. It is important to post a task
  # update *BEFORE* starting any user code to signify to the server that the
  # bot correctly started processing the task. In the case of a non-idempotent
  # task, this signal is used to know if it is safe to retry the task or not.
  # See _reap_task() in task_scheduler.py for more information.
  start = monotonic_time()
  params = {
      'cost_usd': cost_usd_hour * (start - task_start) / 60. / 60.,
  }
  if not remote.post_task_update(
      task_details.task_id, task_details.bot_id, params):
    # Don't even bother, the task was already canceled.
    return {
        u'exit_code': -1,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure': None,
        u'version': OUT_VERSION,
    }

  isolated_result = os.path.join(work_dir, 'isolated_result.json')
  args = get_isolated_args(work_dir, task_details, isolated_result, bot_file,
                           run_isolated_flags)
  # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to
  # give one 'grace_period' slot to the child process and one slot to upload
  # the results back.
  task_details.hard_timeout = 0
  if task_details.grace_period:
    task_details.grace_period *= 2

  try:
    proc = _start_task_runner(args, work_dir, ctx_file)
  except _FailureOnStart as e:
    return fail_without_command(
        remote, task_details.bot_id, task_details.task_id, params,
        cost_usd_hour, task_start, e.exit_code, e.stdout)

  buf = _OutputBuffer(task_details, start)
  try:
    # Monitor the task.
    exit_code = None
    had_io_timeout = False
    must_signal_internal_failure = None
    kill_sent = False
    timed_out = None
    try:
      for channel, new_data in proc.yield_any(
          maxsize=buf.maxsize,
          timeout=lambda: buf.calc_yield_wait(timed_out)):
        buf.add(channel, new_data)

        # Post update if necessary.
        if buf.should_post_update():
          params['cost_usd'] = (
              cost_usd_hour * (monotonic_time() - task_start) / 60. / 60.)
          if not remote.post_task_update(
              task_details.task_id, task_details.bot_id, params, buf.pop()):
            # Server is telling us to stop. Normally task cancellation.
            if not kill_sent:
              logging.warning('Server induced stop; sending SIGTERM')
              proc.terminate()
              timed_out = monotonic_time()

        # Send a signal on timeout if necessary. Both are failures, not
        # internal_failures.
        # Eventually kill but return 0 so bot_main.py doesn't cancel the task.
        if not timed_out:
          if (task_details.io_timeout and
              buf.since_last_io > task_details.io_timeout):
            had_io_timeout = True
            logging.warning(
                'I/O timeout is %.3fs; no update for %.3fs sending SIGTERM',
                task_details.io_timeout, buf.since_last_io)
            proc.terminate()
            timed_out = monotonic_time()
        else:
          # During grace period.
          if (not kill_sent and
              buf.last_loop - timed_out >= task_details.grace_period):
            # Now kill for real. The user can distinguish between the
            # following states:
            # - signal but process exited within grace period,
            #   (hard_|io_)_timed_out will be set but the process exit code
            #   will be script provided.
            # - process exited late, exit code will be -9 on posix.
            logging.warning(
                'Grace of %.3fs exhausted at %.3fs; sending SIGKILL',
                task_details.grace_period, buf.last_loop - timed_out)
            proc.kill()
            kill_sent = True

      logging.info('Waiting for process exit')
      exit_code = proc.wait()
    except (ExitSignal, InternalError, IOError, OSError,
            remote_client.InternalError) as e:
      # Something wrong happened, try to kill the child process.
      must_signal_internal_failure = str(e.message or 'unknown error')
      exit_code = kill_and_wait(proc, task_details.grace_period, e.message)

    # This is the very last packet for this command. If it was an isolated
    # task, include the output reference to the archived .isolated file.
    now = monotonic_time()
    params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60.
    params['duration'] = now - start
    params['io_timeout'] = had_io_timeout
    had_hard_timeout = False
    try:
      if not os.path.isfile(isolated_result):
        # It's possible if
        # - run_isolated.py did not start
        # - run_isolated.py started, but arguments were invalid
        # - host in a situation unable to fork
        # - grand child process outliving the child process deleting
        #   everything it can
        # Do not create an internal error, just send back the (partial)
        # view as task_runner saw it, for example the real exit_code is
        # unknown.
        logging.warning('there\'s no result file')
        if exit_code is None:
          exit_code = -1
      else:
        # See run_isolated.py for the format.
        with open(isolated_result, 'rb') as f:
          run_isolated_result = json.load(f)
        logging.debug('run_isolated:\n%s', run_isolated_result)
        # TODO(maruel): Grab statistics (cache hit rate, data downloaded,
        # mapping time, etc) from run_isolated and push them to the server.
        if run_isolated_result['outputs_ref']:
          params['outputs_ref'] = run_isolated_result['outputs_ref']
        had_hard_timeout = run_isolated_result['had_hard_timeout']
        if not had_io_timeout and not had_hard_timeout:
          if run_isolated_result['internal_failure']:
            must_signal_internal_failure = (
                run_isolated_result['internal_failure'])
            logging.error('%s', must_signal_internal_failure)
          elif exit_code:
            # TODO(maruel): Grab stdout from run_isolated.
            must_signal_internal_failure = (
                'run_isolated internal failure %d' % exit_code)
            logging.error('%s', must_signal_internal_failure)
        exit_code = run_isolated_result['exit_code']
        params['bot_overhead'] = 0.
        if run_isolated_result.get('duration') is not None:
          # Calculate the real task duration as measured by run_isolated and
          # deduce the remaining overhead.
          params['bot_overhead'] = params['duration']
          params['duration'] = run_isolated_result['duration']
          params['bot_overhead'] -= params['duration']
          params['bot_overhead'] -= run_isolated_result.get(
              'download', {}).get('duration', 0)
          params['bot_overhead'] -= run_isolated_result.get(
              'upload', {}).get('duration', 0)
          params['bot_overhead'] -= run_isolated_result.get(
              'cipd', {}).get('duration', 0)
          if params['bot_overhead'] < 0:
            params['bot_overhead'] = 0
        isolated_stats = run_isolated_result.get('stats', {}).get('isolated')
        if isolated_stats:
          params['isolated_stats'] = isolated_stats
        cipd_stats = run_isolated_result.get('stats', {}).get('cipd')
        if cipd_stats:
          params['cipd_stats'] = cipd_stats
        cipd_pins = run_isolated_result.get('cipd_pins')
        if cipd_pins:
          params['cipd_pins'] = cipd_pins
    except (IOError, OSError, ValueError) as e:
      logging.error('Swallowing error: %s', e)
      if not must_signal_internal_failure:
        must_signal_internal_failure = '%s\n%s' % (
            e, traceback.format_exc()[-2048:])

    # If no exit code has been set, something went wrong with run_isolated.py.
    # Set the exit code to -1 to indicate a generic error occurred.
    if exit_code is None:
      exit_code = -1
    params['hard_timeout'] = had_hard_timeout

    # Ignore server reply to stop. Also ignore internal errors here if we are
    # already handling some.
    try:
      if must_signal_internal_failure:
        # We need to update the task and then send the task error. However,
        # we should *not* send the exit_code since doing so would cause the
        # task to be marked as COMPLETED until the subsequent post_task_error
        # call finished, which would cause any query made between these two
        # calls to get the wrong task status. We also clear out the duration
        # and various stats as the server prints errors if either are set in
        # this case.
        # TODO(sethkoehler): Come up with some way to still send the
        # exit_code (and thus also duration/stats) without marking the task
        # COMPLETED.
        exit_code = None
        params.pop('duration', None)
        params.pop('bot_overhead', None)
        params.pop('isolated_stats', None)
        params.pop('cipd_stats', None)
        params.pop('cipd_pins', None)
      remote.post_task_update(
          task_details.task_id, task_details.bot_id, params, buf.pop(),
          exit_code)
      if must_signal_internal_failure:
        remote.post_task_error(task_details.task_id, task_details.bot_id,
                               must_signal_internal_failure)
        # Clear out this error as we've posted it now (we already cleared out
        # exit_code above). Note: another error could arise after this point,
        # which is fine, since bot_main.py will post it.
        must_signal_internal_failure = ''
    except remote_client.InternalError as e:
      logging.error('Internal error while finishing the task: %s', e)
      if not must_signal_internal_failure:
        must_signal_internal_failure = str(e.message or 'unknown error')

    return {
        u'exit_code': exit_code,
        u'hard_timeout': had_hard_timeout,
        u'io_timeout': had_io_timeout,
        u'must_signal_internal_failure': must_signal_internal_failure,
        u'version': OUT_VERSION,
    }
  finally:
    file_path.try_remove(unicode(isolated_result))
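# --- Illustrative sketch (not part of the original module) ---
# How a caller might consume the metadata dict returned by run_command().
# This is a hypothetical consumer, not the actual bot_main.py logic: an
# internal failure is reported separately from a plain task failure, and
# timeouts are visible through dedicated flags rather than the exit code.
def handle_run_command_result(result):
  if result[u'must_signal_internal_failure']:
    # Infrastructure problem: the bot, not the task, is at fault.
    raise InternalError(result[u'must_signal_internal_failure'])
  if result[u'hard_timeout'] or result[u'io_timeout']:
    logging.warning('Task timed out; exit_code=%s', result[u'exit_code'])
  return result[u'exit_code']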
def run_command(remote, task_details, work_dir, cost_usd_hour, task_start,
                min_free_space, bot_file):
  """Runs a command and sends packets to the server to stream results back.

  Implements both I/O and hard timeouts. Sends the packets numbered, so the
  server can ensure they are processed in order.

  Returns:
    Metadata dict with the execution result.

  Raises:
    ExitSignal if caught some signal when starting or stopping.
    InternalError on unexpected internal errors.
  """
  # TODO(maruel): This function is incomprehensible, split and refactor.

  # Signal the command is about to be started.
  last_packet = start = now = monotonic_time()
  task_id = task_details.task_id
  bot_id = task_details.bot_id
  params = {
      'cost_usd': cost_usd_hour * (now - task_start) / 60. / 60.,
  }
  if not remote.post_task_update(task_id, bot_id, params):
    # Don't even bother, the task was already canceled.
    return {
        u'exit_code': -1,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure': None,
        u'version': OUT_VERSION,
    }

  isolated_result = os.path.join(work_dir, 'isolated_result.json')
  args_path = os.path.join(work_dir, 'run_isolated_args.json')
  cmd = get_run_isolated()
  cmd.extend(['-a', args_path])
  args = get_isolated_args(work_dir, task_details, isolated_result, bot_file,
                           min_free_space)
  # Hard timeout enforcement is deferred to run_isolated. Grace is doubled to
  # give one 'grace_period' slot to the child process and one slot to upload
  # the results back.
  task_details.hard_timeout = 0
  if task_details.grace_period:
    task_details.grace_period *= 2
  try:
    # TODO(maruel): Support both channels independently and display stderr in
    # red.
    env = os.environ.copy()
    for key, value in (task_details.env or {}).iteritems():
      if not value:
        env.pop(key, None)
      else:
        env[key] = value
    logging.info('cmd=%s', cmd)
    logging.info('cwd=%s', work_dir)
    logging.info('env=%s', env)

    fail_on_start = lambda exit_code, stdout: fail_without_command(
        remote, bot_id, task_id, params, cost_usd_hour, task_start,
        exit_code, stdout)

    # We write args to a file since there may be more of them than the OS
    # can handle.
    try:
      with open(args_path, 'w') as f:
        json.dump(args, f)
    except (IOError, OSError) as e:
      return fail_on_start(
          -1, 'Could not write args to %s: %s' % (args_path, e))

    # Start the command.
    try:
      assert cmd and all(isinstance(a, basestring) for a in cmd)
      proc = subprocess42.Popen(
          cmd,
          env=env,
          cwd=work_dir,
          detached=True,
          stdout=subprocess42.PIPE,
          stderr=subprocess42.STDOUT,
          stdin=subprocess42.PIPE)
    except OSError as e:
      return fail_on_start(
          1, 'Command "%s" failed to start.\nError: %s' % (' '.join(cmd), e))

    # Monitor the task.
    output_chunk_start = 0
    stdout = ''
    exit_code = None
    had_io_timeout = False
    must_signal_internal_failure = None
    kill_sent = False
    timed_out = None
    try:
      calc = lambda: calc_yield_wait(
          task_details, start, last_io, timed_out, stdout)
      maxsize = lambda: MAX_CHUNK_SIZE - len(stdout)
      last_io = monotonic_time()
      for _, new_data in proc.yield_any(maxsize=maxsize, timeout=calc):
        now = monotonic_time()
        if new_data:
          stdout += new_data
          last_io = now

        # Post update if necessary.
        if should_post_update(stdout, now, last_packet):
          last_packet = monotonic_time()
          params['cost_usd'] = (
              cost_usd_hour * (last_packet - task_start) / 60. / 60.)
          if not remote.post_task_update(
              task_id, bot_id, params, (stdout, output_chunk_start)):
            # Server is telling us to stop. Normally task cancellation.
            if not kill_sent:
              logging.warning('Server induced stop; sending SIGKILL')
              proc.kill()
              kill_sent = True
          output_chunk_start += len(stdout)
          stdout = ''

        # Send a signal on timeout if necessary. Both are failures, not
        # internal_failures.
        # Eventually kill but return 0 so bot_main.py doesn't cancel the task.
        if not timed_out:
          if (task_details.io_timeout and
              now - last_io > task_details.io_timeout):
            had_io_timeout = True
            logging.warning(
                'I/O timeout is %.3fs; no update for %.3fs sending SIGTERM',
                task_details.io_timeout, now - last_io)
            proc.terminate()
            timed_out = monotonic_time()
        else:
          # During grace period.
          if not kill_sent and now - timed_out >= task_details.grace_period:
            # Now kill for real. The user can distinguish between the
            # following states:
            # - signal but process exited within grace period,
            #   (hard_|io_)_timed_out will be set but the process exit code
            #   will be script provided.
            # - process exited late, exit code will be -9 on posix.
            logging.warning(
                'Grace of %.3fs exhausted at %.3fs; sending SIGKILL',
                task_details.grace_period, now - timed_out)
            proc.kill()
            kill_sent = True

      logging.info('Waiting for process exit')
      exit_code = proc.wait()
    except (ExitSignal, InternalError, IOError, OSError) as e:
      # Something wrong happened, try to kill the child process.
      must_signal_internal_failure = str(e.message or 'unknown error')
      exit_code = kill_and_wait(proc, task_details.grace_period, e.message)

    # This is the very last packet for this command. If it was an isolated
    # task, include the output reference to the archived .isolated file.
    now = monotonic_time()
    params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60.
    params['duration'] = now - start
    params['io_timeout'] = had_io_timeout
    had_hard_timeout = False
    try:
      if not os.path.isfile(isolated_result):
        # It's possible if
        # - run_isolated.py did not start
        # - run_isolated.py started, but arguments were invalid
        # - host in a situation unable to fork
        # - grand child process outliving the child process deleting
        #   everything it can
        # Do not create an internal error, just send back the (partial)
        # view as task_runner saw it, for example the real exit_code is
        # unknown.
        logging.warning('there\'s no result file')
        if exit_code is None:
          exit_code = -1
      else:
        # See run_isolated.py for the format.
        with open(isolated_result, 'rb') as f:
          run_isolated_result = json.load(f)
        logging.debug('run_isolated:\n%s', run_isolated_result)
        # TODO(maruel): Grab statistics (cache hit rate, data downloaded,
        # mapping time, etc) from run_isolated and push them to the server.
        if run_isolated_result['outputs_ref']:
          params['outputs_ref'] = run_isolated_result['outputs_ref']
        had_hard_timeout = run_isolated_result['had_hard_timeout']
        if not had_io_timeout and not had_hard_timeout:
          if run_isolated_result['internal_failure']:
            must_signal_internal_failure = (
                run_isolated_result['internal_failure'])
            logging.error('%s', must_signal_internal_failure)
          elif exit_code:
            # TODO(maruel): Grab stdout from run_isolated.
            must_signal_internal_failure = (
                'run_isolated internal failure %d' % exit_code)
            logging.error('%s', must_signal_internal_failure)
        exit_code = run_isolated_result['exit_code']
        params['bot_overhead'] = 0.
        if run_isolated_result.get('duration') is not None:
          # Calculate the real task duration as measured by run_isolated and
          # deduce the remaining overhead.
          params['bot_overhead'] = params['duration']
          params['duration'] = run_isolated_result['duration']
          params['bot_overhead'] -= params['duration']
          params['bot_overhead'] -= run_isolated_result.get(
              'download', {}).get('duration', 0)
          params['bot_overhead'] -= run_isolated_result.get(
              'upload', {}).get('duration', 0)
          params['bot_overhead'] -= run_isolated_result.get(
              'cipd', {}).get('duration', 0)
          if params['bot_overhead'] < 0:
            params['bot_overhead'] = 0
        isolated_stats = run_isolated_result.get('stats', {}).get('isolated')
        if isolated_stats:
          params['isolated_stats'] = isolated_stats
        cipd_stats = run_isolated_result.get('stats', {}).get('cipd')
        if cipd_stats:
          params['cipd_stats'] = cipd_stats
        cipd_pins = run_isolated_result.get('cipd_pins')
        if cipd_pins:
          params['cipd_pins'] = cipd_pins
    except (IOError, OSError, ValueError) as e:
      logging.error('Swallowing error: %s', e)
      if not must_signal_internal_failure:
        must_signal_internal_failure = '%s\n%s' % (
            e, traceback.format_exc()[-2048:])

    # TODO(maruel): Send the internal failure here instead of sending it
    # through bot_main, this causes a race condition.
    if exit_code is None:
      exit_code = -1
    params['hard_timeout'] = had_hard_timeout
    # Ignore server reply to stop. Also ignore internal errors here if we are
    # already handling some.
    try:
      remote.post_task_update(
          task_id, bot_id, params, (stdout, output_chunk_start), exit_code)
    except InternalError as e:
      logging.error('Internal error while finishing the task: %s', e)
      if not must_signal_internal_failure:
        must_signal_internal_failure = str(e.message or 'unknown error')

    return {
        u'exit_code': exit_code,
        u'hard_timeout': had_hard_timeout,
        u'io_timeout': had_io_timeout,
        u'must_signal_internal_failure': must_signal_internal_failure,
        u'version': OUT_VERSION,
    }
  finally:
    file_path.try_remove(unicode(isolated_result))
def cleanup(self):
  """Cleans up the cache directory.

  Ensures there are no unknown files in cache_dir.
  Ensures the read-only bits are set correctly.

  At that point, the cache was already loaded, trimmed to respect cache
  policies.
  """
  with self._lock:
    fs.chmod(self.cache_dir, 0o700)
    # Ensure that all files listed in the state still exist and add new ones.
    previous = set(self._lru)
    # It'd be faster if there were a readdir() function.
    for filename in fs.listdir(self.cache_dir):
      if filename == self.STATE_FILE:
        fs.chmod(os.path.join(self.cache_dir, filename), 0o600)
        continue
      if filename in previous:
        fs.chmod(os.path.join(self.cache_dir, filename), 0o400)
        previous.remove(filename)
        continue
      # An untracked file. Delete it.
      logging.warning('Removing unknown file %s from cache', filename)
      p = self._path(filename)
      if fs.isdir(p):
        try:
          file_path.rmtree(p)
        except OSError:
          pass
      else:
        file_path.try_remove(p)
      continue
    if previous:
      # Filter out entries that were not found.
      logging.warning('Removed %d lost files', len(previous))
      for filename in previous:
        self._lru.pop(filename)
      self._save()

  # Verify the hash of every single item to detect corruption. Corrupted
  # files are evicted.
  with self._lock:
    for digest, (_, timestamp) in list(self._lru._items.items()):
      # Verify only if the mtime is greater than the timestamp in state.json,
      # to avoid taking too long.
      if self._get_mtime(digest) <= timestamp:
        continue
      logging.warning('Item has been modified. item: %s', digest)
      if self._is_valid_hash(digest):
        # Update the timestamp in state.json.
        self._lru.touch(digest)
        continue
      # Remove the corrupted file from the LRU and the file system.
      self._lru.pop(digest)
      self._delete_file(digest, UNKNOWN_FILE_SIZE)
      logging.error('Deleted corrupted item: %s', digest)
    self._save()
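# --- Illustrative sketch (not part of the original module) ---
# _is_valid_hash() is not shown in this snippet. Below is a minimal version
# of what such a check could look like, assuming the digest is the hex SHA-1
# of the file content (the hash algorithm and the standalone signature are
# assumptions for illustration):
import hashlib

def is_valid_hash(path, digest):
  h = hashlib.sha1()
  with open(path, 'rb') as f:
    # Stream in 64 KiB chunks so large cache entries are not fully buffered.
    for chunk in iter(lambda: f.read(64 * 1024), b''):
      h.update(chunk)
  return h.hexdigest() == digest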