def RegisterTagEventForClient(self, clientname, tagname, event, message=None):
    logging.debug("%s remote tag %s on host %s", TagEventName[event], tagname, clientname)

    client = self.topologyInfo.GetClient(clientname, checkname=False)
    if client is None:
        logging.error("unknown client %s appeared", clientname)
        return False

    client.RegisterTagEvent("%s:%s" % (self.network_name, tagname), event, message)
def _DoSendEventsIfNeed(self):
    tosend = self.events[:self.MAX_TAGS_BULK]
    if not tosend:
        return

    logging.debug("SendData to %s: %d events", self.name, len(tosend))

    def send_as_events():
        try:
            self.connection.register_tags_events(tosend)
            return True
        except XMLRPCMethodNotSupported:
            self.SetVersion(1)
            return False

    def send_as_set_tags():
        tags = _get_tags_to_set(tosend)
        if not tags:
            return
        self.connection.set_tags(tags)

    if self.version < 2 or not send_as_events():
        send_as_set_tags()

    del self.events[:len(tosend)]  # drop only the events that were actually sent
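# A minimal sketch of what a helper like _get_tags_to_set could look like for the
# legacy set_tags fallback above. It assumes each queued event is a
# (tagname, event, message) tuple; the real queue format is not shown here, so this
# is an illustration, not the original implementation.
def _get_tags_to_set(events):
    last_event = {}
    for tagname, event, _msg in events:
        last_event[tagname] = event  # later events win
    # Older peers only understand "set", so keep tags whose last queued event is Set
    return [tagname for tagname, event in last_event.items() if event == ETagEvent.Set]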
def _on_cloud_journal_event(self, ev):
    #logging.debug('before journal event %s' % ev)

    with self.lock:
        tag = self.inmem_items.get(ev.tag_name)

    if not tag:
        logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
        return

    if not tag.IsCloud(): # it's like assert
        logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
        return

    if tag.version >= ev.version:
        # TODO warn even on equal versions, but not for initial _subscribe_all
        if tag.version > ev.version:
            logging.warning('local version (%d) > journal version (%d) for tag %s' \
                % (tag.version, ev.version, ev.tag_name))
        return

    def add_event(event, version, msg=None):
        self._repr_modifier.add((tag, event, msg, version))

    # FIXME here with warning, on state sync without it
    if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
        logging.debug('overtaking reset %s.%d.%d for %d' \
            % (ev.tag_name, ev.version, ev.last_reset_version, tag.version))
        add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

    add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

    logging.debug('after journal event for %s' % ev.tag_name)
def Start(self):
    self._journal.Start()
    logging.debug("after_journal_start")

    self._repr_modifier.Start()
    logging.debug("after_repr_modifier_start")

    if self._cloud_tags_server:
        if self._cloud_tags_masks:
            self._masks_reload_thread = ProfiledThread(
                target=self._masks_reload_loop, name_prefix='TagsMasksReload')
            self._masks_reload_thread.start()
            logging.debug("after_masks_reload_thread_start")

        self._cloud = self._create_cloud_client(self._on_cloud_journal_event)
    else:
        # For CloudTag's in tags.db
        self._cloud = SerialUpdateOnlyDummyCloudClient(
            RuntimeError("Wrong setup for cloud tags in rem-server"))

    self._create_safe_cloud()
    logging.debug("after_safe_cloud_start")

    self._subscribe_all()
    logging.debug("after_subscribe_all")
def _on_rpc_update_graph(self, task_id, peer_addr, state, is_final):
    pck = self._by_task_id.get(task_id)

    logging.debug('_on_rpc_update_graph: task_id=%s, pck_id=%s; status=%s; is_final=%s' % (
        task_id, pck and pck.id, GraphState.str(state['state']), is_final))

    if not pck:
        raise WrongTaskIdError('No packet for sbx:%s' % task_id)

    with pck._lock:
        # FIXME
        # pck: connect
        # pck: write
        # rem: enter _on_rpc_update_graph
        # pck: CRASHED
        # pck: Sandbox' task FAILURE
        # rem: _on_task_status_change (enter + exit)
        # rem: _on_rpc_update_graph with self.lock <-- OOPS

        assert pck._state not in [
            RemotePacketState.CREATING,
            RemotePacketState.TASK_FIN_WAIT,
            RemotePacketState.FINISHED,
            RemotePacketState.FETCHING_RESOURCE_LIST,
            RemotePacketState.FETCHING_FINAL_UPDATE,
        ], "_on_rpc_update_graph in %s state" % pck._state

        if pck._state in [RemotePacketState.STARTING,
                          RemotePacketState.CHECKING_START_ERROR,
                          RemotePacketState.STARTED]:
            if pck._state != RemotePacketState.STARTED:
                pck._set_state(RemotePacketState.STARTED, '_on_rpc_update_graph')
                pck._drop_sched_if_need()
                assert not pck._peer_addr
            #else:
                #assert not pck._sched # stop may be scheduled for example

            if not pck._peer_addr:
                pck._peer_addr = peer_addr
                logging.debug('SET pck._peer_addr = %s for %s' % (peer_addr, pck))

        if pck._target_stop_mode:
            if is_final:
                pck._sent_stop_mode = pck._target_stop_mode # FIXME
            else:
                self._start_packet_stop(pck)

        if pck._target_stop_mode != StopMode.CANCEL:
            pck._update_graph(state, is_final)

        if is_final:
            if pck._target_stop_mode == StopMode.CANCEL \
                    or state['state'] == GraphState.SUCCESSFULL:
                self._mark_task_fin_wait(pck, '_on_rpc_update_graph(SUCCESSFULL)')
            else:
                pass # XXX WAITING for TaskStateGroups.TERMINATED
def _set_state(self, state, reason=None):
    if self._state == RemotePacketState.CHECKING_START_ERROR \
            and state != RemotePacketState.FINISHED:
        self.reset_error()

    self._state = state
    logging.debug('%s new state %s, reason: %s' % (
        self.id, RemotePacketState._NAMES[state], reason))
def _try_recover_directory(self, ctx):
    prev_packets_directory, _ = os.path.split(self.directory)
    actual_directory = os.path.join(ctx.packets_directory, self.id)

    if os.path.isdir(self.directory):
        if prev_packets_directory != ctx.packets_directory:
            try:
                logging.debug("relocates directory %s to %s", self.directory, actual_directory)
                shutil.copytree(self.directory, actual_directory)
                self.directory = actual_directory
            except:
                reraise("Failed to relocate directory")

    else:
        if os.path.isdir(actual_directory):
            self.directory = actual_directory
            return

        # FIXME Recreate?
        if not self._are_links_alive(ctx):
            raise NotAllFileLinksAlive("Not all links alive")

        try:
            self.directory = None
            self._create_place(ctx)
        except:
            reraise("Failed to resurrect directory")
def Restore(self, timestamp, tagRef, cloud_requester_state):
    logging.debug("TagLogger.Restore(%d)", timestamp)

    dirname, db_filename = os.path.split(self.db_filename)

    def get_filenames():
        result = []
        for filename in os.listdir(dirname):
            if filename.startswith(db_filename) and filename != db_filename:
                file_time = int(filename.split("-")[-1])
                if file_time > timestamp:
                    result.append(filename)

        result = sorted(result)

        if os.path.isfile(self.db_filename):
            result += [db_filename]

        return result

    with self._db_lock:
        self._restoring_mode = True

        for filename in get_filenames():
            f = bsddb3.rnopen(os.path.join(dirname, filename), "r")
            for k, v in f.items():
                try:
                    obj = cPickle.loads(v)
                    if isinstance(obj, CloudRequestStart):
                        cloud_requester_state.start_request(obj.id, obj.update)
                    elif isinstance(obj, CloudRequestFinish):
                        cloud_requester_state.finish_request(obj.id)
                    else:
                        obj.Redo(tagRef)
                except Exception as e:
                    logging.exception("error in TagLogger while restoring from a journal: %s", e)
            f.close()

        self._restoring_mode = False
def _update_state(self):
    new = self._calc_state()
    if new == self.state:
        return

    self.state = new
    logging.debug('SandboxJobGraphExecutorProxy.state => %s' % GraphState.str(self.state))

    self._ops.on_state_change()
def _do_on_packet_terminated(self):
    def on_stop():
        self._prev_task_id = self._remote_packet._sandbox_task_id
        self._remote_packet = None
        self._remote_state = None
        self._remote_time_wait_deadline = None

    assert not self.time_wait_deadline and not self.time_wait_sched

    r = self._remote_packet

    self.stopping = False

    if self.cancelled:
        self.cancelled = False
        on_stop()
        return

    # TODO Handle Exception packet state
    # FIXME Rollback history/Status to prev state

    logging.debug('state for SandboxJobGraphExecutorProxy == %s' \
        % (None if r._final_state is None else GraphState.str(r._final_state)))

    if r._error:
        self._error = r._error

        if r._is_error_permanent:
            self._tries = 0
        elif self._tries:
            retry_idx = self.MAX_TRY_COUNT - self._tries - 1
            delay = self.RETRY_INTERVALS[retry_idx] \
                if retry_idx < len(self.RETRY_INTERVALS) \
                else self.RETRY_INTERVALS[-1]

            logging.debug("%s failed. Will retry after %s seconds" % (r, delay))

            self.time_wait_deadline = time.time() + delay
            self._schedule_time_wait_stop()

    elif r._final_state == GraphState.TIME_WAIT:
        self.time_wait_deadline = r._last_update['nearest_retry_deadline']
        self._schedule_time_wait_stop()

    elif r._final_state == GraphState.SUCCESSFULL:
        self.result = True

    elif r._final_state == GraphState.ERROR:
        self.result = False

    self._prev_snapshot_resource_id = r._result_snapshot_resource_id \
        if r._final_state != GraphState.SUCCESSFULL else None

    on_stop()
def _SendSubscriptionsIfNeed(self, local_server_network_name):
    tosend = list(self.subscriptions)[:self.MAX_TAGS_BULK]
    if not tosend:
        return

    logging.debug("SendData to %s: %d subscriptions", self.name, len(tosend))

    self.connection.register_share(tosend, local_server_network_name)
    self.subscriptions.difference_update(tosend)
def _vivify_packets(self, alloc_guard):
    logging.debug('RemotePacketsDispatcher packet count: %d' % len(self._by_task_id))

    for pck in self._by_task_id.itervalues():
        logging.debug('VIVIFY %s' % pck.id)

        if pck._sandbox is None:
            pck._sandbox = self._default_sandbox_client

        pck._run_guard = alloc_guard()

        self._await_task_status(pck)
        self._reschedule_packet(pck)
def _get_vaults_for(self, job_id):
    vaults = self._vaults_setup
    if not vaults:
        return None

    env = copy.copy(vaults['global']) or {}
    env.update(vaults['jobs'].get(job_id, {}))

    logging.debug('Vaults for %d: %s' % (job_id, env))

    return env
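# From the lookups in _get_vaults_for(), self._vaults_setup is expected to hold a
# 'global' dict (or None) plus per-job overrides keyed by job id. Illustrative shape
# only; the vault owners and names below are made-up placeholders.
example_vaults_setup = {
    'global': {'OAUTH_TOKEN': 'vault:some-owner:some-vault-name'},   # may be None
    'jobs': {
        17: {'EXTRA_TOKEN': 'vault:some-owner:other-vault-name'},    # job_id -> overrides
    },
}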
def on_state_change(self):
    pck = self.pck

    pck._update_state()

    logging.debug('pck._has_updates = True')
    pck._has_updates = True
    pck._mark_as_finished_if_need()

    logging.debug('pck._something_changed.notify()')
    pck._something_changed.notify()
def _do_stop_packet(self, pck):
    task_id = pck._sandbox_task_id
    stop_mode = pck._target_stop_mode

    def reschedule_if_need():
        if pck._state != RemotePacketState.STARTED:
            return
        if pck._target_stop_mode > stop_mode:
            return
        if pck._sched:
            # TODO Assert that we have another _do_stop_packet in pck._sched
            return

        self._schedule(
            pck,
            self._start_packet_stop,
            timeout=self._RPC_RESEND_INTERVAL)

    with pck._lock:
        if not pck._peer_addr:
            # FIXME _do_stop_packet must not be called on this condition
            reschedule_if_need()
            return

        proxy = self._create_packet_rpc_proxy(pck)

    logging.debug('_do_stop_packet(%s, stop_mode=%s)' % (pck, stop_mode))

    try:
        if stop_mode == StopMode.CANCEL:
            proxy.cancel(task_id)
        else:
            kill_jobs = stop_mode == StopMode.STOP
            proxy.stop(task_id, kill_jobs)

    except Exception as e:
        logging.warning("Failed to send stop to packet %s: %s" % (pck, e))

        if is_xmlrpc_exception(e, WrongTaskIdError) \
                or isinstance(e, socket.error) and e.errno == errno.ECONNREFUSED:
            return # FIXME Is enough?

        # STOP/SUSPEND->EXECUTING
        with pck._lock:
            reschedule_if_need()

    else:
        with pck._lock:
            #assert pck._state == RemotePacketState.STARTED # FIXME XXX
            if pck._state != RemotePacketState.STARTED:
                return

            if pck._sent_stop_mode < stop_mode:
                pck._sent_stop_mode = stop_mode
def RegisterTagEvent(self, tag, event, message=None):
    if not isinstance(tag, TagBase):
        raise RuntimeError("%s is not Tag class instance" % tag.GetName())
    if tag.IsRemote():
        return

    tagname = tag.GetName()

    with self.lock: # see register_share
        acceptors = self.acceptors.get(tagname)

    if acceptors:
        logging.debug("on %s connmanager %s with acceptors list %s",
                      TagEventName[event], tagname, acceptors)
        for clientname in acceptors:
            self.RegisterTagEventForClient(clientname, tagname, event, message)
def _complete_waitJobs(pck_id, g):
    child_to_parents = g.child_to_parents

    fixed = False
    for job in g.jobs.values():
        if job.parents:
            if job.id not in child_to_parents:
                fixed = True
                child_to_parents[job.id] = set([])

    if fixed:
        logging.debug('!!!Incomplete .waitJobs in %s!!!' % pck_id)
def register_share(self, tags, clientname):
    tagRef = self.scheduler.tagRef

    logging.debug("register_share %d tags for %s: %s", len(tags), clientname, tags)

    for tagname in tags:
        # XXX
        # 1. this lock only guarantees eventual consistency of the tag's history
        # 2. clients of self may see duplicates of events (even Reset)
        # 3. it also guards self.acceptors
        with self.lock:
            self.acceptors.add(tagname, clientname)

            if tagRef._RawTag(tagname).IsLocallySet():
                self.RegisterTagEventForClient(clientname, tagname, ETagEvent.Set)

    logging.debug("register_share %d tags for %s: done", len(tags), clientname)
def _Communicate(self, f):
    self.Connect()
    try:
        f()
        self.errorsCnt = 0
        logging.debug("SendData to %s: ok", self.name)
    except (IOError, socket.timeout) as e:
        logging.warning("SendData to %s: failed: %s", self.name, e)
        self.lastError = e
        self.errorsCnt += 1
    except Exception as e:
        logging.error("SendData to %s: failed: %s", self.name, e)
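# The send helpers above raise on transport errors before trimming their queues, so a
# caller can wrap them in _Communicate and simply retry on the next pass: unsent events
# and subscriptions stay queued. A hedged sketch of such a caller; the method name and
# the way the network name is passed in are assumptions, not taken from the original code.
def _send_data_if_need(self, local_server_network_name):
    self._Communicate(self._DoSendEventsIfNeed)
    self._Communicate(
        lambda: self._SendSubscriptionsIfNeed(local_server_network_name))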
def _update_repr_state(self):
    new = self._calc_repr_state()

    if new == self._repr_state:
        return

    self._repr_state = new
    self.history.append((new, time.time()))
    logging.debug("packet %s\tnew state %r", self.name, new)

    if new == ReprState.WAITING:
        deadline = self._graph_executor.get_nearest_retry_deadline()
        delay = max(deadline - time.time(), 0) if deadline else None
        logging.debug("packet %s\twaiting for %s sec", self.name, delay)
def _update_state(self):
    new = self._calc_state()

    if self.state != new:
        self.state = new
        logging.debug("packet %s\tnew impl state %r", self.name, new)

        self._update_repr_state()
        self._on_state_change(new)
    #else:
        #self._update_repr_state()

    # ImplState.RUNNING: 1. PENDING, 2. WORKABLE
    if self.queue:
        self.queue.update_pending_jobs_state(self) # TODO only for LocalPacket actually
def _create_job_graph_executor(self):
    assert not self.files_modified

    resources = self._produce_job_graph_executor_custom_resources()
    logging.debug('_create_job_graph_executor(%s): %s' % (self.id, resources))

    return sandbox_remote_packet.SandboxJobGraphExecutorProxy(
        sandbox_remote_packet.SandboxPacketOpsForJobGraphExecutorProxy(self),
        self.id,
        self.make_job_graph(),
        resources,
        host=self.req_sandbox_host,
        oauth_token=self.oauth_token,
        vaults_setup=self._make_vaults_setup(),
    )
def UpdateContext(self, context):
    self.db_file = context.tags_db_file
    self.remote_tags_enabled = not context.disable_remote_tags
    self.DBConnect()

    self._journal.UpdateContext(context)
    self._repr_modifier.UpdateContext(context)

    self._cloud_tags_server = context.cloud_tags_server
    self._cloud_tags_masks = context.cloud_tags_masks
    self._cloud_tags_masks_reload_interval = context.cloud_tags_masks_reload_interval
    self._cloud_tags_release_delay = context.cloud_tags_release_delay
    self._tags_random_cloudiness = self._has_cloud_setup() and context.tags_random_cloudiness
    self._all_tags_in_cloud = self._has_cloud_setup() and context.all_tags_in_cloud

    logging.debug("TagStorage.UpdateContext, masks = %s, share = %s, server = %s" % (
        self._cloud_tags_masks, self._all_tags_in_cloud, self._cloud_tags_server))
def get_job_to_run(self):
    while True:
        with self.lock:
            if not self.has_startable_jobs():
                return None

            pck = self.local_ops.get_packet_to_run()

        # .get_job_to_run not under lock to prevent deadlock
        try:
            job = pck.get_job_to_run()
        except NotWorkingStateError:
            logging.debug('NotWorkingStateError idling: %s' % pck)
            continue

        return job
def _update_graph(self, update, is_final):
    assert self._target_stop_mode != StopMode.CANCEL

    self._last_update = update # TODO

    succeed_jobs = set(map(int, update['succeed_jobs']))
    new_succeed_jobs = succeed_jobs - self._succeeded_jobs
    self._succeeded_jobs = succeed_jobs

    if is_final:
        self._final_state = update['state']
        logging.debug('%s._final_state = %s' % (self, GraphState.str(self._final_state)))

    self._ops._on_sandbox_packet_update(update, new_succeed_jobs, is_final)
def _masks_reload_loop(self):
    while True:
        if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
            return

        try:
            match = self._load_masks()
        except Exception as e:
            logging.error("Failed to reload tags' masks: %s" % e)
            continue

        if self._match_cloud_tag.count and not match.count:
            logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                self._match_cloud_tag.count, match.count))
            continue

        logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)
        self._match_cloud_tag = match
def _do_do_resolve(self, rel):
    request = rel.request

    resource = self._client.get_latest_resource(
        type=request.type,
        owner=request.owner,
        released=request.released
    )

    rel.resolve_time = time.time()

    if not resource:
        logging.warning("Failed to resolve %s" % (request,))
        rel.promise.set(None, RuntimeError("Can't find resource %s" % (request,)))
        return

    logging.debug("%s resolved to %s" % (request, resource))

    rel.promise.set(resource['id'])
def _run_process_inner(self, argv, stdin, stdout, stderr):
    job = self._job
    ops = self._ops

    self._start_time = time.time()

    try:
        process = ops.start_process(
            argv,
            stdout=stdout,
            stdin=stdin,
            stderr=stderr,
            cwd=ops.get_working_directory(),
            env_update=self._env_update.items(),
        )
    except pgrpguard.Error as e:
        t, e, tb = sys.exc_info()
        raise t, e, tb
    except Exception as e:
        self._process_start_error = e
        logging.exception("Failed to start %s: %s" % (job, e))
        return False

    logging.debug("%s started with pid=%s" % (job, process.pid))

    self._process = process

    if self._cancelled:
        process.terminate()

    try:
        self._wait_process(process)
    except Exception as e:
        t, e, tb = sys.exc_info()
        try:
            process.terminate()
        except:
            pass
        raise t, e, tb

    self._finish_time = time.time()

    return True
def _reschedule_packet(self, pck):
    # TODO Check
    by_state = {
        RemotePacketState.CREATING: self._start_create_sandbox_task,
        RemotePacketState.STARTING: self._start_start_sandbox_task,
        RemotePacketState.FETCHING_RESOURCE_LIST: self._start_fetch_resource_list,
        RemotePacketState.FETCHING_FINAL_UPDATE: self._start_fetch_final_update,
    }

    action = by_state.get(pck._state)

    if not action \
            and pck._target_stop_mode != pck._sent_stop_mode \
            and pck._state == RemotePacketState.STARTED:
        action = self._start_packet_stop

    if action:
        logging.debug('action %s' % action)
        action(pck)
def _update(self, update, id=None):
    with self._lock:
        logging.debug("SafeCloud.update(%s)" % str(update)) # keep until the code is trusted

        idx = self._next_idx
        self._next_idx += 1

        id = id or self._alloc_id()

        self._journal.log_cloud_request_start(id, update)

        done = self._cloud.serial_update(update)

        # FIXME
        assert id not in self._running
        self._running[id] = (idx, update)

    # This call may lead to immediate _on_done in current thread
    done.subscribe(lambda f: self._on_done(id, f))