def AcceptCallback(self, reference, event):
    methName = "On" + event.title().replace("_", "")
    fn = getattr(self, methName, None)
    if callable(fn):
        fn(reference)
    else:
        logging.warning("can't invoke %s method for object %s", methName, self)
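# Illustrative sketch (not part of the original code): how the name-based
# dispatch in AcceptCallback above resolves handlers. The listener class and
# event names below are hypothetical; only the "On" + title-cased-event rule
# comes from the code itself.
class ExampleListener(object):
    def OnDone(self, reference):
        print("done fired for", reference)

    def OnTaskFinished(self, reference):
        print("task_finished fired for", reference)

# "done".title().replace("_", "")          -> "Done"         -> OnDone
# "task_finished".title().replace("_", "") -> "TaskFinished" -> OnTaskFinished
# An event without a matching OnXxx method only produces the warning above.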
def _on_cloud_journal_event(self, ev):
    #logging.debug('before journal event %s' % ev)

    with self.lock:
        tag = self.inmem_items.get(ev.tag_name)

        if not tag:
            logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
            return

        if not tag.IsCloud(): # it's like assert
            logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
            return

        if tag.version >= ev.version:
            # TODO warn even on equal versions, but not for initial _subscribe_all
            if tag.version > ev.version:
                logging.warning('local version (%d) > journal version (%d) for tag %s' \
                    % (tag.version, ev.version, ev.tag_name))
            return

        def add_event(event, version, msg=None):
            self._repr_modifier.add((tag, event, msg, version))

        # FIXME here with warning, on state sync without it
        if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
            logging.debug('overtaking reset %s.%d.%d for %d' % (
                ev.tag_name, ev.version, ev.last_reset_version, tag.version))

            add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

        add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

        logging.debug('after journal event for %s' % ev.tag_name)
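# Worked example (illustrative numbers only, not from the original code) for the
# "overtaking reset" branch above: a local tag at version 3 receives a journal
# event at version 7 whose last reset happened at version 5. The reset was never
# seen locally (3 < 5 < 7), so a Reset event is replayed before the current one.
tag_version, ev_version, last_reset_version = 3, 7, 5
assert ev_version > last_reset_version and tag_version < last_reset_version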
def FixLinks(self):
    bad_links = set()
    for (pck_id, name), dest in self.links.iteritems():
        if not os.path.islink(dest):
            logging.warning("%s link item not found for packet %s", dest, pck_id)
            bad_links.add((pck_id, name))
    for i in bad_links:
        self.links.pop(i)
def _find_packet_user_state_queue(self, pck):
    ret = None
    for qname, queue in self.by_user_state.items():
        if pck in queue:
            if ret is not None:
                logging.warning("packet %r is in several queues", pck)
            ret = queue
    return ret
def Relink(self, estimated_path):
    with self.lock:
        self.path = estimated_path
        for dstname in self.links.itervalues():
            dstdir = os.path.split(dstname)[0]
            if not os.path.isdir(dstdir):
                logging.warning("binfile\tcan't relink nonexistent packet data %s", dstdir)
            elif os.path._resolve_link(dstname) != self.path:
                osspec.create_symlink(self.path, dstname, reallocate=True)
def add(self, key, value):
    if key in self.revIndex:
        logging.warning("%r is already in heap", key)
        self._change_value(key, value)
        return
    self.objects.append(key)
    self.values.append(value)
    pos = len(self.objects) - 1
    self.revIndex[key] = pos
    self._rollup(pos)
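# A hedged sketch (an assumption, not the project's actual implementation) of the
# sift-up step that add() relies on: a min-heap kept in the parallel lists
# self.objects / self.values, with positions mirrored in self.revIndex.
def _rollup_sketch(self, pos):
    while pos > 0:
        parent = (pos - 1) // 2
        if self.values[parent] <= self.values[pos]:
            break
        # swap in both parallel lists and keep the reverse index consistent
        self.objects[pos], self.objects[parent] = self.objects[parent], self.objects[pos]
        self.values[pos], self.values[parent] = self.values[parent], self.values[pos]
        self.revIndex[self.objects[pos]] = pos
        self.revIndex[self.objects[parent]] = parent
        pos = parent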
def _do_stop_packet(self, pck):
    task_id = pck._sandbox_task_id
    stop_mode = pck._target_stop_mode

    def reschedule_if_need():
        if pck._state != RemotePacketState.STARTED:
            return
        if pck._target_stop_mode > stop_mode:
            return
        if pck._sched:
            # TODO Assert that we have another _do_stop_packet in pck._sched
            return

        self._schedule(
            pck,
            self._start_packet_stop,
            timeout=self._RPC_RESEND_INTERVAL)

    with pck._lock:
        if not pck._peer_addr:
            # FIXME _do_stop_packet must not be called on this condition
            reschedule_if_need()
            return

    proxy = self._create_packet_rpc_proxy(pck)

    logging.debug('_do_stop_packet(%s, stop_mode=%s)' % (pck, stop_mode))

    try:
        if stop_mode == StopMode.CANCEL:
            proxy.cancel(task_id)
        else:
            kill_jobs = stop_mode == StopMode.STOP
            proxy.stop(task_id, kill_jobs)
    except Exception as e:
        logging.warning("Failed to send stop to packet %s: %s" % (pck, e))

        if is_xmlrpc_exception(e, WrongTaskIdError) \
                or isinstance(e, socket.error) and e.errno == errno.ECONNREFUSED:
            return # FIXME Is enough? # STOP/SUSPEND->EXECUTING

        with pck._lock:
            reschedule_if_need()
    else:
        with pck._lock:
            #assert pck._state == RemotePacketState.STARTED # FIXME XXX
            if pck._state != RemotePacketState.STARTED:
                return
            if pck._sent_stop_mode < stop_mode:
                pck._sent_stop_mode = stop_mode
def _Communicate(self, f):
    self.Connect()
    try:
        f()
        self.errorsCnt = 0
        logging.debug("SendData to %s: ok", self.name)
    except (IOError, socket.timeout) as e:
        logging.warning("SendData to %s: failed: %s", self.name, e)
        self.lastError = e
        self.errorsCnt += 1
    except Exception as e:
        logging.error("SendData to %s: failed: %s", self.name, e)
def _stop(self, wait=True, timeout=None):
    if self._stopped:
        return

    new_value = self._ST_WAIT if wait else self._ST_NOWAIT

    with self._lock:
        if self._should_stop < new_value:
            logging.info("Stopping YtTags.Client (%s)"
                         % ('_ST_WAIT' if wait else '_ST_NOWAIT'))
            self._do_stop(new_value)
        elif self._should_stop > new_value:
            logging.warning("stop() called with lower stop-level")

    self._connect_thread.join(timeout) # TODO sleeps
def _do_initial_cloud_tags_masks_load(self):
    try_count = 3

    for idx in range(try_count):
        try:
            self._match_cloud_tag = self._load_masks()
        except Exception as e:
            logging.warning("Iter #%d of cloud tags masks (%s) loading failed: %s" % (
                idx + 1, self._cloud_tags_masks, e))
        else:
            break

        if idx != try_count - 1:
            time.sleep(5)
        else:
            raise RuntimeError("Failed to load cloud tags masks for %d attempts" % try_count)
def FireEvent(self, event, reference=None, safe=False):
    bad_listeners = set()

    for obj in self.__get_listeners():
        if isinstance(obj, ICallbackAcceptor):
            try:
                obj.AcceptCallback(reference or self, event)
            except:
                if not safe:
                    raise
                logging.exception("Failed to accept '%s' event for %s" % (event, obj))
        else:
            logging.warning("callback %r\tincorrect acceptor for %s found: %s", self, event, obj)
            bad_listeners.add(obj)

    for obj in bad_listeners:
        self.DropCallbackListener(obj)
def _do_do_resolve(self, rel):
    request = rel.request

    resource = self._client.get_latest_resource(
        type=request.type,
        owner=request.owner,
        released=request.released
    )

    rel.resolve_time = time.time()

    if not resource:
        logging.warning("Failed to resolve %s" % (request,))
        rel.promise.set(None, RuntimeError("Can't find resource %s" % (request,)))
        return

    logging.debug("%s resolved to %s" % (request, resource))

    rel.promise.set(resource['id'])
def _masks_reload_loop(self):
    while True:
        if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
            return

        try:
            match = self._load_masks()
        except Exception as e:
            logging.error("Failed to reload tags' masks: %s" % e)
            continue

        if self._match_cloud_tag.count and not match.count:
            logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                self._match_cloud_tag.count, match.count))
            continue

        logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)

        self._match_cloud_tag = match
def _move_files(self, new_binary_directory):
    assert self.binDirectory != new_binary_directory

    if True:
        badFiles = set()

        for checksum, file in self.files.iteritems():
            estimated_path = os.path.join(new_binary_directory, checksum)

            if not os.path.isfile(estimated_path):
                #if os.path.isfile(file.path):
                if False:
                    shutil.move(file.path, estimated_path)
                elif file.LinksCount() > 0:
                    print file.links
                    raise RuntimeError("binstorage\tcan't recover file %r" % file.path)
                else:
                    badFiles.add(checksum)

            file.Relink(estimated_path)

        if badFiles:
            for checksum in badFiles:
                del self.files[checksum]
                logging.warning("binstorage\tnonexistent file %s cleaning attempt", checksum)

            logging.warning("can't recover %d files; %d files left in storage", len(badFiles), len(self.files))
def Start(self):
    if not self.network_name or not self.tags_file or not self.port:
        logging.warning("ConnectionManager couldn't start: wrong configuration. "
                        + "network_name: %s, remote_tags_db_file: %s, system_port: %r",
                        self.network_name, self.tags_file, self.port)
        return

    self.ReloadConfig()
    logging.debug("after_reload_config")

    for client in self.topologyInfo.servers.values():
        if client.active and client.name != self.network_name:
            client.TryInitializePeersVersions(self.network_name)
    logging.debug("after_clients_versions_init")

    self.alive = True
    self.InitXMLRPCServer()

    self._accept_loop_thread = ProfiledThread(target=self.ServerLoop, name_prefix='ConnManager')
    self._accept_loop_thread.start()
    logging.debug("after_connection_manager_loop_start")

    for client in self.topologyInfo.servers.values():
        self.scheduler.ScheduleTaskT(0, self.SendData, client, skip_logging=True)
def _create_connection(self):
    now = time.time()
    timeout = 0.0

    while True:
        with self._lock:
            if self._should_and_can_stop():
                return

            self._should_and_can_stop_cond.wait(timeout)

            if self._should_and_can_stop():
                return

        try:
            conn = self._connection_constructor()
        except Exception as e:
            logging.warning("Failed to connect: %s" % e)
            conn = None  # keep conn bound so the check below doesn't raise NameError

        if conn:
            return conn

        timeout = 1.0
def _io_loop(self, type_str, loop, how, after_shutdown=None):
    failed = True
    try:
        loop()
        failed = False
    except Exception as e:
        logging.exception("%s error" % type_str)

    #logging.debug("%s io thread stopped" % type_str)

    try:
        self._io._connection.shutdown(how)
    except socket.error as e:
        if e.errno != errno.ENOTCONN:
            logging.warning("Error on socket shutdown(%d): %s" % (how, e))

    if after_shutdown:
        after_shutdown()

    with self._lock:
        self._io._thread_count -= 1
        if not self._io._thread_count:
            self._io._connected = False
def convert_to_v2(self):
    for job in self.jobs.values():
        d = job.__dict__
        d.pop('packetRef', None)
        d.pop('callbacks', None)
        d.pop('nonpersistent_callbacks', None)
        job.max_try_count = d.pop('maxTryCount')
        job.pck_id = self.id

    pckd = self.__dict__

    state = pckd.pop('state')

    if state == ReprState.NONINITIALIZED:
        #self._recover_noninitialized(ctx)
        logging.error("Packet %s in NONINITIALIZED state" % self)

    self.do_not_run = bool(self.flags & self.PacketFlag.USER_SUSPEND)
    self.is_broken = bool(self.flags & self.PacketFlag.RCVR_ERROR)
    pckd.pop('flags')

    if state == ReprState.SUCCESSFULL and self.do_not_run:
        #logging.warning("SUCCESSFULL and USER_SUSPEND in %s" % self.id)
        self.do_not_run = False

    pckd.pop('streams') # FIXME Overhead: will re-concat multi-deps
    pckd.pop('_active_jobs', None)
    pckd.pop('edges') # constant graph
    succeed_jobs = pckd.pop('done')
    jobs_to_run = pckd.pop('leafs')
    #active_jobs_cache = set()
    pckd.pop('as_in_queue_working')
    child_to_parents = pckd.pop('waitJobs')

    def pop_failed_job():
        if not jobs_to_run:
            raise ValueError("jobs_to_run is empty to pop")

        for job_id in jobs_to_run:
            result = self.jobs[job_id].last_result()
            if not result:
                continue
            if not result.IsSuccessfull():
                jobs_to_run.remove(job_id)
                return job_id

    jobs_to_retry = {}
    if state == ReprState.WAITING:
        if jobs_to_run:
            if self.waitingDeadline:
                job_id = pop_failed_job() or jobs_to_run.pop()
                jobs_to_retry[1] = (job_id, None, self.waitingDeadline)
            else:
                logging.error("No waitingDeadline: %s" % self)
        else:
            logging.error("WAITING && !jobs_to_run: %s" % self)
    pckd.pop('waitingDeadline', None)

    failed_jobs = set()
    if state == ReprState.ERROR:
        job_id = pop_failed_job() if jobs_to_run else None
        if job_id:
            failed_jobs.add(job_id)
        elif not self.is_broken:
            logging.error("ERROR && !broken && !failed_jobs: %s" % self)

    working_jobs = {jid for jid, deps in child_to_parents.items() if not deps} \
        - (succeed_jobs | jobs_to_run
           | set(descr[0] for descr in jobs_to_retry.values())
           | failed_jobs)

    jobs_to_run |= working_jobs

    if working_jobs:
        logging.debug('working_jobs for %s in %s: %s' % (self.id, state, working_jobs))

    self.done_tag = pckd.pop('done_indicator')
    self.job_done_tag = pckd.pop('job_done_indicator')
    self.all_dep_tags = pckd.pop('allTags')
    self.bin_links = pckd.pop('binLinks')
    self.is_resetable = pckd.pop('isResetable')
    self.wait_dep_tags = pckd.pop('waitTags')

    # if we are in SUSPENDED (RCVR_ERROR or not) and len(self.wait_dep_tags)
    # -- we will wait tags (in previous packet.py impl)
    self.tags_awaited = not self.wait_dep_tags or state in _TAGS_AWAITED_STATES

    clean_state = pckd.pop('_clean_state') # TODO apply to _graph_executor

    queues = self._get_listeners_by_type((LocalQueue, LegacyQueue)) # FIXME Select one type
    queue = queues[0] if queues else None
    self.queue = queue

    if queue:
        self.DropCallbackListener(queue)

    self.__class__ = LocalPacket

    self.files_modified = False
    self.resources_modified = False
    self.files_sharing = None
    self.shared_files_resource_id = None
    self.resolved_releases = {}
    self.unresolved_release_count = 0
    self.destroying = state == ReprState.HISTORIED
    self.sbx_files = {}
    self._repr_state = None
    self.state = None

    self.finish_status = True if state == ReprState.SUCCESSFULL else \
        (False if state == ReprState.ERROR and not self.is_broken else None)

    self._saved_jobs_status = None
    self.last_sandbox_task_id = None
    self._graph_executor = DUMMY_GRAPH_EXECUTOR
    self._repr_state = state # to avoid duplicates in pck.history
    self.req_sandbox_host = None

    if state == ReprState.SUCCESSFULL:
        #pass
        g = self._create_job_graph_executor()
        self._saved_jobs_status = g.produce_detailed_status()
        #self._saved_jobs_status = self._produce_compressed_job_status(g)
        del g

    elif state == ReprState.HISTORIED:
        pass

    elif self.queue and (failed_jobs or succeed_jobs or jobs_to_retry):
        g = self._graph_executor = self._create_job_graph_executor()

        g.failed_jobs = failed_jobs
        g.succeed_jobs = succeed_jobs
        g.jobs_to_run = jobs_to_run
        g.jobs_to_retry = jobs_to_retry
        g.child_to_parents = child_to_parents
        g._clean_state = clean_state

        g.state = g._calc_state()

        # FIXME bug? waitJobs may not contain all jobs-with-parents
        _complete_waitJobs(self.id, g)

        try:
            _check_graph_consistence(g)
        except Exception as e:
            raise AssertionError("Inconsistent job graph in %s: %s" % (self.id, e))

    self.state = self._calc_state()
    self._update_repr_state()

    if self.queue:
        if self.has_pending_jobs():
            self.queue.packets_with_pending_jobs.add(self)
        self.queue.relocate_packet(self)

    if self._repr_state != state and not (state == ReprState.WORKABLE and self._repr_state == ReprState.PENDING):
        logging.warning("ReprState mismatch for %s: %s -> %s" % (self, state, self._repr_state))
def _send_signal_safe_inspect_result(f):
    if not f.is_success():
        logging.warning("send_signal_safe failed: %s" % f.get_exception())
def _on_task_status_change(self, task_id, _, status_group, can_has_res):
    #with self.lock:
    if True:
        pck = self._by_task_id.get(task_id)

        if not pck:
            return

    with pck._lock:
        #if pck._status_await_job_id != job_id:
            #return
        #pck._status_await_job_id = None

        state = pck._state

        assert state not in [
            RemotePacketState.FINISHED,
            RemotePacketState.FETCHING_RESOURCE_LIST,
            RemotePacketState.FETCHING_FINAL_UPDATE
        ]

        if state in [
            RemotePacketState.CREATING, # we subscribe just after creation and before start
        ]:
            return

        # TODO Check the code

        if status_group == TaskStateGroups.DRAFT:

            if state == RemotePacketState.STARTING:
                pass

            elif state == RemotePacketState.CHECKING_START_ERROR:
                if pck._is_error_permanent:
                    self._mark_as_finished(pck, 'is_error_permanent=True, DRAFT')
                else:
                    pck._set_state(RemotePacketState.STARTING)
                    self._start_start_sandbox_task(pck)

            elif state in [RemotePacketState.STARTED, RemotePacketState.TASK_FIN_WAIT]:
                # FIXME Race here between graph updates and _on_task_status_change
                logging.warning('%s._on_task_status_change(%s, %s)' % (pck, status_group, state))
                #raise AssertionError()

        elif status_group == TaskStateGroups.ACTIVE:

            if state in [RemotePacketState.STARTING, RemotePacketState.CHECKING_START_ERROR]:
                pck._set_state(RemotePacketState.STARTED, 'TaskStateGroups.ACTIVE')

        elif status_group == TaskStateGroups.TERMINATED:

            if state in [RemotePacketState.STARTING,
                         RemotePacketState.CHECKING_START_ERROR,
                         RemotePacketState.STARTED]:

                if can_has_res:
                    pck._set_state(RemotePacketState.FETCHING_RESOURCE_LIST)
                    self._start_fetch_resource_list(pck)
                else:
                    pck.set_error("Unknown task error", False) # TODO
                    self._mark_as_finished(pck, 'Task TERMINATED and EXCEPTION/FAILURE')

                    # FIXME Does Sandbox delete task's meta info or it's always DELETED
                    # TODO Fetch and interpret ctx['__last_rem_error']

                    #TaskStatus.DELETING:
                    #TaskStatus.DELETED: # has context
                    #TaskStatus.FAILURE:
                    #TaskStatus.EXCEPTION:
                    #TaskStatus.NO_RES:
                    #TaskStatus.TIMEOUT:

            elif state == RemotePacketState.TASK_FIN_WAIT:
                self._mark_as_finished(pck, 'Task TERMINATED on TASK_FIN_WAIT')
def cancel(self):
    logging.warning("Can't Terminate FuncJob %s" % self.runner)
def notify_long_execution(self, job):
    logging.warning("%s execution is taking too long", job)
    self.pck.send_job_long_execution_notification(job)