def _run(self):
    """Worker main loop: run one job per tick until the worker is killed.

    Calls self.do() repeatedly, sleeping self.tickTime seconds between
    iterations.  A failing job is logged and must never kill the worker
    thread, hence the broad catch.
    """
    while not self.IsKilled():
        try:
            self.do()
        except Exception as e:  # modernized from legacy `except Exception, e:`
            # Best-effort: log and keep the worker alive.
            logging.exception("worker\tjob execution error %s", e)
        time.sleep(self.tickTime)
def _the_loop(self):
    """Timer-dispatch loop: wait for the earliest deadline in the queue and
    fire its callback.

    Invariants (from the visible code):
      - `self._queue` is ordered; `.front()` returns (id, (deadline, callback)).
      - `self._modified` is a condition on `self._lock`, notified when the
        queue or the stop flag changes.
      - Callbacks are executed OUTSIDE the lock.
    """
    while not self._should_stop:
        with self._lock:
            # Sleep until there is work or we are asked to stop.
            while not (self._should_stop or self._queue):
                self._modified.wait()
            if self._should_stop:
                return
            id, (deadline, callback) = self._queue.front()
            now = time.time()
            if now < deadline:
                # Front item not due yet: wait until its deadline or until the
                # queue is modified.
                self._modified.wait(deadline - now)
                if self._should_stop:
                    return
                # The front item may have changed while we waited; if so,
                # re-evaluate from the top instead of popping a wrong item.
                if not self._queue or self._queue.front()[0] != id:
                    continue
            self._queue.pop_front()
        try:
            # Callbacks taking one positional argument receive the item id.
            if callback.__code__.co_argcount:
                callback(id)
            else:
                callback()
        except:
            logging.exception("Failed to execute %s" % callback)
        # Drop the reference promptly so the callback (and its closure) can be
        # collected before the next wait.
        del callback
def inner(*args, **kwargs):
    # Logging wrapper around `func` (closure: `func`, `log_args`,
    # `skip_arg_count`, `log_func` come from the enclosing decorator scope —
    # not visible here).  Logs start/finish, and logs+re-raises on failure.
    if log_args:
        # Optionally skip leading args (e.g. `self`) from the logged repr.
        args_ = args[skip_arg_count:] \
            if skip_arg_count else args
        if args_ and kwargs:
            args_to_log = (args_, kwargs)
        elif kwargs:
            args_to_log = kwargs
        else:
            args_to_log = args_
        # Truncate to 4096 chars to keep log lines bounded.
        prefix = 'function "%s(%s)" ' % (func.__name__, str(args_to_log)[:4096])
    else:
        prefix = 'function "%s" ' % func.__name__

    def log(str):
        log_func(prefix + str)

    log('started')
    try:
        ret = func(*args, **kwargs)
        log('finished')
        return ret
    except:
        logging.exception('%s call failed', prefix)
        raise
def Restore(self, timestamp, tagRef, cloud_requester_state):
    """Replay tag-journal files newer than `timestamp`.

    Journal rotation appends "-<unixtime>" to self.db_filename; every rotated
    file whose suffix time is > timestamp is replayed in sorted order, then
    the live journal itself (if present).  Each record is unpickled and either
    forwarded to `cloud_requester_state` (cloud request start/finish markers)
    or redone against `tagRef`.  Individual broken records are logged and
    skipped so one bad entry cannot abort the whole restore.

    NOTE: journal records are loaded with cPickle — the journal directory is
    trusted local state; do not point this at untrusted files.
    """
    logging.debug("TagLogger.Restore(%d)", timestamp)
    dirname, db_filename = os.path.split(self.db_filename)

    def get_filenames():
        # Rotated journals relevant to this restore, oldest first.
        result = []
        for filename in os.listdir(dirname):
            if filename.startswith(db_filename) and filename != db_filename:
                file_time = int(filename.split("-")[-1])
                if file_time > timestamp:
                    result.append(filename)
        result = sorted(result)
        # The live (unrotated) journal is always replayed last.
        if os.path.isfile(self.db_filename):
            result += [db_filename]
        return result

    with self._db_lock:
        self._restoring_mode = True
        for filename in get_filenames():
            f = bsddb3.rnopen(os.path.join(dirname, filename), "r")
            for k, v in f.items():
                try:
                    obj = cPickle.loads(v)
                    if isinstance(obj, CloudRequestStart):
                        cloud_requester_state.start_request(obj.id, obj.update)
                    elif isinstance(obj, CloudRequestFinish):
                        cloud_requester_state.finish_request(obj.id)
                    else:
                        obj.Redo(tagRef)
                except Exception as e:  # modernized from `except Exception, e:`
                    logging.exception("occurred in TagLogger while restoring from a journal : %s", e)
            f.close()
        self._restoring_mode = False
def _do_run(self):
    """Execute the thread body, preferring self._run over Thread.run.

    Any failure is logged together with the thread name and kernel tid,
    then re-raised.
    """
    try:
        body = getattr(self, '_run', threading.Thread.run.__get__(self))
        body()
    except:
        logging.exception('NamedThread %s [%d] failed' % (self.name, gettid()))
        raise
def try_recover_after_backup_loading(self, ctx):
    """Attempt post-backup recovery; on any failure, log and mark broken.

    The human-readable description is built up front (original order kept)
    so the error log identifies the packet and its directory.
    """
    packet_descr = '[%s, directory = %s]' % (self, self.directory)
    try:
        self._try_recover_after_backup_loading(ctx)
    except Exception:
        logging.exception("Failed to recover packet %s" % packet_descr)
        self._mark_as_failed_on_recovery()
def _do_resolve(self, rel):
    """Try one resolution attempt; on failure, log and schedule a retry."""
    try:
        self._do_do_resolve(rel)
        return
    # TODO Fail on permanent errors
    except Exception:
        logging.exception('Failed to list_latest_releases for %s' % (rel.request,))
    delayed_executor.schedule(lambda: self._resolve(rel),
                              timeout=self._RETRY_INTERVAL)
def rpc_list_files(self):
    """RPC handler: list the packet directory's files.

    Returns [] when no directory is set or the listing fails (failure is
    logged, never propagated to the RPC caller).
    """
    with self.lock:
        if not self.directory:
            return []
        try:
            return os.listdir(self.directory)
        except Exception:
            logging.exception("directory %s listing error", self.directory)
            return []
def _try_create_place_if_need(self):
    """Create the packet's working place; return True on success.

    On failure: log, flag the packet broken, cancel the graph executor and
    refresh state, then return False.
    """
    try:
        self._create_place_if_need()
    except Exception:
        logging.exception("Failed to create place")
        self.is_broken = True
        self._graph_executor.cancel()
        self._update_state()
        return False
    else:
        return True
def f(*args):
    # RPC-method wrapper.  RpcUserError carries a user-level exception in
    # `.exc`; unwrap it and re-raise that exception with the ORIGINAL
    # traceback (Python 2 three-expression raise preserves `tb`).  Any other
    # failure is logged with the method name and re-raised as-is.
    try:
        return func(*args)
    except RpcUserError:
        _, e, tb = sys.exc_info()
        e = e.exc  # the wrapped user-level exception
        raise type(e), e, tb
    except:
        logging.exception("RPC method %s failed" % func.__name__)
        raise
def _process_server_message(self, msg):
    """Dispatch one protobuf server message.

    'Event' and 'Bye' are handled inline; every other type ('Ack', 'Lookup',
    'Subscriptions', 'Match') is a reply to a pending request and resolves
    the matching promise in self._running.
    """
    # WhichOneof doesn't work
    # https://github.com/google/protobuf/commit/0971bb0d57aa6f2db1abee4008b365d52b402891
    # type = msg.WhichOneof('Data')

    def first(pred, iterable):
        # Return the first item satisfying pred, else None.
        for item in iterable:
            if pred(item):
                return item

    type = first(msg.HasField, ['Event', 'Bye', 'Ack', 'Lookup', 'Subscriptions', 'Match'])

    if type is None:
        # XXX Client will looped in this error
        raise NotImplementedError("Unknown server message type for [%s]" % msg)

    if type == 'Event':
        # Journal events: deliver each one; a failing handler must not stop
        # delivery of the remaining events.
        for ev in self._ServerMessage.Event(msg):
            try:
                self._on_event(ev)
            except:
                logging.exception("Failed to process journal event")
        return

    elif type == 'Bye':
        # Server-initiated shutdown handshake.
        with self._lock:
            self._io._bye_received = True
            self._outgoing_not_empty.notify()
        return

    data = getattr(msg, type)

    if not data.HasField('RequestId'):
        raise RuntimeError('No .RequestId')

    request_id = data.RequestId

    with self._lock:
        promise = self._running.pop(request_id).promise  # FIXME , None and raise
        #if not self._running:
            #self._running_empty.notify_all()
        if self._should_and_can_stop():
            self._should_and_can_stop_cond.notify_all()

    # Resolve the caller's promise outside the lock.
    if type == 'Ack':
        promise.set(None)
    elif type == 'Lookup':
        promise.set(self._ServerMessage.Lookup(msg.Lookup))
    elif type == 'Match':
        promise.set(*self._ServerMessage.Match(msg.Match))
    elif type == 'Subscriptions':
        promise.set(self._ServerMessage.Subscriptions(msg.Subscriptions))
    else:
        assert False  # unreachable: `type` is one of the names tested above
def _release_place(self):
    """Release binary links and remove the packet directory (best-effort)."""
    with self.lock:
        self._release_links()
        place = self.directory
        if place and os.path.isdir(place):
            try:
                shutil.rmtree(place, onerror=None)
            except Exception:
                # Leave the directory behind; just record the failure.
                logging.exception("Packet %s release place error", self.id)
        self.directory = None
def __worker(self):
    """Consume and run queued callables until the stop indicator arrives.

    A failing task is logged and does not stop the worker.
    """
    while True:
        item = self._queue.get()
        if item is self.__STOP_INDICATOR:
            return
        try:
            item()
        except Exception:
            logging.exception("")
def __setstate__(self, sdict):
    """Unpickle hook: normalize attributes via `scheme` builders, then
    delegate to the parent __setstate__ (or plain __dict__.update) and
    register the restored object.

    For each attribute in `scheme`: an existing value is passed through its
    builder, a missing one is default-constructed.  Builder failures are
    logged and re-raised.
    """
    for attr, builder in scheme.iteritems():
        try:
            sdict[attr] = builder(sdict[attr]) if attr in sdict else builder()
        except:
            logging.exception("unpickable\tcan't deserialize attribute %s with builder %r", attr, builder)
            raise
    setter = getattr(super(ObjUnpickler, self), "__setstate__", self.__dict__.update)
    setter(sdict)
    ObjectRegistrator_.register(self, sdict)
def _send_email(self, make):
    """Build a message with make(ctx) and send it asynchronously.

    No-op when there are no notification addresses; formatting failures are
    logged and swallowed (notification is best-effort).
    """
    if not self.notify_emails:
        return
    ctx = self._get_scheduler_ctx()
    try:
        msg = make(ctx)
    except:
        logging.exception("Failed to format email for %s" % self.id)
        return
    if msg:
        ctx.send_email_async(self.notify_emails, msg)
def run(self):
    """Run the job once; account a failed try and always notify completion.

    On exception: log it and, unless the failure was an environment error or
    a cancellation, count it against the job's tries, then append results.
    self._ops.on_job_done(self) is guaranteed to run in all cases.
    """
    job = self._job
    try:
        try:
            self._run()
        except Exception:
            # FIXME Do something more than logging?
            logging.exception("Run job %s exception", job)
            if not (self._environment_error or self._cancelled):
                job.tries += 1
            # NOTE(review): placement of _append_results() reconstructed from
            # flattened source — appears to run on the failure path; confirm
            # against the original layout.
            self._append_results()
    finally:
        self._ops.on_job_done(self)
def FireEvent(self, event, reference=None, safe=False):
    """Deliver `event` to every registered listener.

    Listeners implementing ICallbackAcceptor get AcceptCallback(reference or
    self, event); with safe=False an acceptor failure propagates immediately,
    with safe=True it is only logged.  Non-acceptor listeners are logged and
    dropped afterwards.
    """
    bad_listeners = set()
    for obj in self.__get_listeners():
        if isinstance(obj, ICallbackAcceptor):
            try:
                obj.AcceptCallback(reference or self, event)
            except:
                if not safe:
                    raise
                logging.exception("Failed to accept '%s' event for %s" % (event, obj))
        else:
            logging.warning("callback %r\tincorrect acceptor for %s found: %s", self, event, obj)
            bad_listeners.add(obj)
    # Deferred removal: don't mutate the listener set while iterating it.
    for obj in bad_listeners:
        self.DropCallbackListener(obj)
def _fetch_resource_list(self, pck):
    """Fetch the Sandbox task's resource list and advance the packet state.

    On fetch failure: reschedule (unless the packet is being CANCELled).
    On success: record the execution-snapshot resource id; if no final state
    is known yet, switch to FETCHING_FINAL_UPDATE, otherwise mark finished.
    """
    try:
        ans = pck._sandbox.list_task_resources(pck._sandbox_task_id)
    except:
        logging.exception('Failed to fetch task resources for %s' % pck)
        with pck._lock:
            if pck._target_stop_mode != StopMode.CANCEL:
                self._schedule(
                    pck,
                    self._start_fetch_resource_list,
                    self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)
        return

    # TODO We don't have to _fetch_resource_list() in any TERMINATED task
    # state (e.g. ERROR, EXCEPTION)

    # Index the answer's resources by their type string.
    res_by_type = {
        resource['type']: resource
            for resource in ans['items']
    }

    with pck._lock:
        resource = res_by_type.get('REM_JOBPACKET_EXECUTION_SNAPSHOT')
        if not resource:
            logging.error("No REM_JOBPACKET_EXECUTION_SNAPSHOT resource in %s" % pck)
            err = "No REM_JOBPACKET_EXECUTION_SNAPSHOT resource"
            pck.set_error(err, False)
            self._mark_as_finished(pck, err)
            return

        pck._result_snapshot_resource_id = resource['id']

        if pck._final_state is None:
            # Final state unknown: fetch the graph update via its proxy URL.
            pck._final_update_url = res_by_type['REM_JOBPACKET_GRAPH_UPDATE']['http']['proxy']
            pck._set_state(RemotePacketState.FETCHING_FINAL_UPDATE)
        else:
            self._mark_as_finished(pck, '_fetch_resource_list')
            return

    self._fetch_final_update(pck)  # not under lock
def _run_process_inner(self, argv, stdin, stdout, stderr):
    """Start the job process and wait for it; return True if it was started.

    pgrpguard errors propagate with their original traceback; any other
    start failure is recorded in self._process_start_error and yields False.
    If waiting fails, the process is terminated (best-effort) before
    re-raising.
    """
    job = self._job
    ops = self._ops

    self._start_time = time.time()

    try:
        process = ops.start_process(
            argv,
            stdout=stdout,
            stdin=stdin,
            stderr=stderr,
            cwd=ops.get_working_directory(),
            env_update=self._env_update.items(),
        )
    except pgrpguard.Error as e:
        # Re-raise with the original traceback (Python 2 three-expression raise).
        t, e, tb = sys.exc_info()
        raise t, e, tb
    except Exception as e:
        self._process_start_error = e
        logging.exception("Failed to start %s: %s" % (job, e))
        return False

    logging.debug("%s started with pid=%s" % (job, process.pid))

    self._process = process
    # Cancellation may have been requested between start and here.
    if self._cancelled:
        process.terminate()

    try:
        self._wait_process(process)
    except Exception as e:
        t, e, tb = sys.exc_info()
        try:
            process.terminate()
        except:
            pass
        raise t, e, tb

    self._finish_time = time.time()
    return True
def _process_update(self, update):
    """Apply a (tag, event[, msg[, version]]) update locally, then forward it
    to the connection manager; each step is best-effort and logged on failure.
    """
    tag, event = update[0], update[1]
    msg = update[2] if len(update) >= 3 else None
    version = update[3] if len(update) >= 4 else None

    try:
        tag._ModifyLocalState(event, msg, version)
    except Exception:
        logging.exception("Failed to process tag update %s" % (update,))

    manager = self._connection_manager
    if manager:
        try:
            manager.RegisterTagEvent(tag, event, msg)
        except Exception:
            logging.exception("Failed to pass update to ConnectionManager %s" % (update,))
def _release_links(self):
    """Unlink BinaryFile links and rebuild bin_links as name -> checksum.

    BinaryFile entries are unlinked (a missing file is logged, other OS
    errors propagate) and replaced by their checksum; plain-string entries
    are kept as the checksum itself; anything else is dropped.
    """
    old_links, self.bin_links = self.bin_links, {}
    while old_links:
        name, entry = old_links.popitem()
        if isinstance(entry, BinaryFile):
            # 'try' was a workaround for race-conditions in self. Now j.i.c
            try:
                entry.Unlink(self, name)
            except OSError as e:
                if e.errno == errno.ENOENT:
                    logging.exception("Packet %s release place error", self.id)
                else:
                    raise
            digest = entry.checksum
        elif isinstance(entry, str):
            digest = entry
        else:
            digest = None
        if digest is not None:
            self.bin_links[name] = digest
def _update(self):
    """Poll Sandbox for status changes of all running tasks and react.

    Transient Sandbox errors are silently skipped (next poll retries);
    unexpected errors are logged.  Tasks missing from the answer are treated
    as DELETED.  Terminated tasks leave `running`; others have their
    [status_group, status] entry updated in place.  State-group transitions
    trigger _notify (failures logged, not propagated).
    """
    running = self._running
    running_task_ids = running.keys()

    # FIXME Determine max &limit for /task in list_task_statuses and split request
    try:
        statuses = self._sandbox.list_task_statuses(running_task_ids)
    except (sandbox.NetworkError, sandbox.ServerInternalError):
        pass  # transient: just wait for the next poll
    except Exception:
        logging.exception("Can't fetch task statuses from Sandbox")
    else:
        # FIXME Move to rem.sandbox
        # Tasks that disappeared from Sandbox are considered deleted.
        deleted_task_ids = set(running_task_ids) - set(statuses.keys())
        for task_id in deleted_task_ids:
            statuses[task_id] = TaskStatus.DELETED

        for task_id, status in statuses.iteritems():
            task = running[task_id]  # mutable [status_group, status] pair
            prev_status_group, prev_status = task

            if status != prev_status:
                logging.debug("task #%d change status %s -> %s" % (task_id, prev_status, status))

            status_group, can_has_res = self._interpret_status(status)

            if status_group == TaskStateGroups.TERMINATED:
                running.pop(task_id)
            else:
                task[0] = status_group
                task[1] = status

            if prev_status_group != status_group:
                try:
                    self._notify(task_id, status, status_group, can_has_res)
                except:
                    logging.exception("Sandbox task on_state_change handler failed for %s -> %s" \
                        % (task_id, status))
def convert_in_memory_tags_to_cloud_if_need(self):
    """Convert in-memory tags that should be cloud tags; return True if any
    Set events were pushed to the cloud.

    For each in-memory tag whose name matches the cloud naming rule (and
    which is not remote), the tag is converted; locally-set tags also get a
    Set event replayed into the cloud in batches via a temporary client.
    """
    if not self._has_cloud_setup():
        return False

    updates = []

    for tag_name, tag in self.inmem_items.iteritems():
        must_be_cloud = self._is_cloud_tag_name(tag_name) \
            and not tag.IsRemote()  # Hack for disable_remote_tags

        if must_be_cloud == tag.IsCloud():
            continue
        elif must_be_cloud:
            if tag.IsLocallySet():
                updates.append((tag_name, ETagEvent.Set))
            self._make_tag_cloud(tag)
        else:
            logging.error("Tag %s is cloud, but must not be" % tag_name)

    # NOTE(review): returns False even if some tags were converted above but
    # none were locally set (no updates to push) — confirm this is intended.
    if not updates:
        return False

    logging.info("before conversion %d tags to CloudTag's" % len(updates))

    cloud = self._create_cloud_client(lambda ev: None)
    try:
        for bucket in split_in_groups(updates, 100000): # TODO Fix cloud_client.update
            cloud.update(bucket).get()
    finally:
        try:
            cloud.stop()
        except:
            logging.exception("Failed to stop temporary cloud client")

    logging.info("after conversion %d tags to CloudTag's" % len(updates))
    return True
def _do_start_sandbox_task(self, pck):
    """Ask Sandbox to start the task and move the packet to STARTED.

    On failure we cannot know whether the task actually started, so the
    packet goes to CHECKING_START_ERROR; network/server errors are treated
    as non-permanent.  All state changes are guarded by pck._lock and only
    applied while the packet is still in STARTING.
    """
    try:
        pck._sandbox.start_task(pck._sandbox_task_id)

        # Possible events before lock will be acquired in this func:
        # - not final GRAPH_UPDATE
        # - final GRAPH_UPDATE
        # - STOP_GRACEFULLY/STOP/CANCEL
        # XXX No longer true:
        # No task status changes may appear here, because awaiting is not racy
        # - final GRAPH_UPDATE + task terminated
        # - task terminated wo GRAPH_UPDATE's

    except Exception as e:
        logging.exception('Failed to start task %s' % pck)

        with pck._lock:
            if pck._state != RemotePacketState.STARTING:
                return

            # TODO Don't forget to take into account ._target_stop_mode
            # in _on_task_status_change

            # Here we don't know if task is really started
            is_error_permanent = \
                not isinstance(e, (sandbox.NetworkError, sandbox.ServerInternalError))

            pck.set_error(str(e), is_error_permanent)
            pck._set_state(RemotePacketState.CHECKING_START_ERROR)

        return

    with pck._lock:
        if pck._state != RemotePacketState.STARTING:
            return
        pck._set_state(RemotePacketState.STARTED, '._sandbox.start() -> ok')
        assert not pck._peer_addr
def _io_loop(self, type_str, loop, how, after_shutdown=None):
    """Run one direction of socket I/O until it stops, then shut down.

    `loop` is the read or write loop callable, `how` the socket.shutdown()
    direction.  Any loop failure is logged (the shutdown still runs); an
    already-disconnected socket (ENOTCONN) is ignored.  The last thread to
    exit marks the connection as closed.

    Cleanup: removed the write-only local `failed` flag and the unused
    exception binding — neither was read anywhere in the block.
    """
    try:
        loop()
    except Exception:
        logging.exception("%s error" % type_str)

    #logging.debug("%s io thread stopped" % type_str)

    try:
        self._io._connection.shutdown(how)
    except socket.error as e:
        if e.errno != errno.ENOTCONN:
            logging.warning("Error on socket shutdown(%d): %s" % (how, e))

    if after_shutdown:
        after_shutdown()

    with self._lock:
        self._io._thread_count -= 1
        if not self._io._thread_count:
            self._io._connected = False
def _do_create_sandbox_task(self, pck):
    """Create and configure the Sandbox task for a packet, then start it.

    Transient errors (network / server-internal) reschedule the whole
    creation; unknown errors mark the packet finished with an error.  Every
    step first re-checks pck._target_stop_mode under pck._lock so a stop
    request aborts the pipeline between steps.
    """
    def reschedule_if_need():
        # Retry creation later unless a stop was requested meanwhile.
        with pck._lock:
            if pck._target_stop_mode:
                return
            self._schedule(
                pck,
                self._start_create_sandbox_task,
                timeout=self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

    with pck._lock:
        if pck._target_stop_mode:
            return

    def handle_unknown_error(e):
        # Non-retryable failure: record the error (if not stopping) and finish.
        with pck._lock:
            if not pck._target_stop_mode:
                pck.set_error(str(e), False)
            self._mark_as_finished(pck, 'Unknown error while creating: %s' % e)

    # FIXME Invert logic: retry everything except permanent
    try:
        task = self._sbx_create_task(pck)
    except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
        reschedule_if_need()
        return
    except Exception as e:
        logging.exception('Failed to create sandbox task for %s' % pck.id)
        handle_unknown_error(e)
        return

    logging.debug('sbx:%d for %s created' % (task.id, pck.id))

    with pck._lock:
        if pck._target_stop_mode:
            return

    try:
        self._sbx_update_task(pck, task)
    except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
        reschedule_if_need()
        #self._start_delete_task(task.id)
        return
    except Exception as e:
        logging.exception('Failed to update (after start) task %s for %s' % (task.id, pck.id))
        handle_unknown_error(e)
        #self._start_delete_task(task.id)
        return

    with pck._lock:
        if pck._target_stop_mode:
            return

        # FIXME fork locking (mallformed pck state)
        pck._set_task_id(task.id)
        pck._set_state(RemotePacketState.STARTING)
        self._by_task_id[task.id] = pck
        self._await_task_status(pck)

    # Must run outside pck._lock: _do_start_sandbox_task acquires it itself.
    self._do_start_sandbox_task(pck)