Example #1
 def AcceptCallback(self, reference, event):
     methName = "On" + event.title().replace("_", "")
     fn = getattr(self, methName, None)
     if callable(fn):
         fn(reference)
     else:
         logging.warning("can't invoke %s method for object %s", methName, self)
Example #2
    def _on_cloud_journal_event(self, ev):
        #logging.debug('before journal event %s' % ev)

        with self.lock:
            tag = self.inmem_items.get(ev.tag_name)

        if not tag:
            logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
            return

        if not tag.IsCloud(): # it's like assert
            logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
            return

        if tag.version >= ev.version:
            # TODO warn even on equal versions, but not for initial _subscribe_all
            if tag.version > ev.version:
                logging.warning('local version (%d) > journal version (%d) for tag %s' \
                    % (tag.version, ev.version, ev.tag_name))
            return

        def add_event(event, version, msg=None):
            self._repr_modifier.add((tag, event, msg, version))

        # FIXME here with warning, on state sync without it
        if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
            logging.debug('overtaking reset %s.%d.%d for %d' % (ev.tag_name, ev.version, ev.last_reset_version, tag.version))
            add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

        add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

        logging.debug('after journal event for %s' % ev.tag_name)
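
The load-bearing detail above is the version gate plus the "overtaking reset" replay: events at or below the locally known version are dropped, which makes replayed journal entries idempotent. A stripped-down, runnable sketch of just that logic (the Tag class and version numbers are hypothetical):

    class Tag(object):
        version = 0

    def apply_journal_event(tag, ev_version, last_reset_version):
        # Drop stale and duplicate events: a journal may replay history.
        if tag.version >= ev_version:
            return False
        # "Overtaking reset": the stream jumped past a reset the local copy
        # never saw, so the reset is replayed before the event itself.
        if ev_version > last_reset_version > tag.version:
            print("replay Reset at version %d" % last_reset_version)
        tag.version = ev_version
        return True

    t = Tag()
    assert apply_journal_event(t, 3, 2)      # replays the reset, then applies v3
    assert not apply_journal_event(t, 3, 2)  # duplicate delivery: dropped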
Example #3
 def FixLinks(self):
     bad_links = set()
     for (pck_id, name), dest in self.links.iteritems():
         if not os.path.islink(dest):
             logging.warning("%s link item not found for packet %s", dest, pck_id)
             bad_links.add((pck_id, name))
     for i in bad_links:
         self.links.pop(i)
Example #4
File: queue.py Project: cgorbit/rem
 def _find_packet_user_state_queue(self, pck):
     ret = None
     for qname, queue in self.by_user_state.items():
         if pck in queue:
             if ret is not None:
                 logging.warning("packet %r is in several queues", pck)
             ret = queue
     return ret
Example #5
 def Relink(self, estimated_path):
     with self.lock:
         self.path = estimated_path
         for dstname in self.links.itervalues():
             dstdir = os.path.split(dstname)[0]
             if not os.path.isdir(dstdir):
                 logging.warning("binfile\tcan't relink nonexisted packet data %s", dstdir)
             elif os.path._resolve_link(dstname) != self.path:
                 osspec.create_symlink(self.path, dstname, reallocate=True)
Example #6
File: heap.py Project: cgorbit/rem
 def add(self, key, value):
     if self.revIndex.has_key(key):
         logging.warning("%r already is in heap", key)
         self._change_value(key, value)
         return
     self.objects.append(key)
     self.values.append(value)
     pos = len(self.objects) - 1
     self.revIndex[key] = pos
     self._rollup(pos)
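
_rollup itself is not shown in this example; for this objects/values/revIndex layout a typical sift-up would look roughly as follows (a sketch assuming a min-heap ordered by values, not the project's actual code):

    def rollup(objects, values, revIndex, pos):
        # Bubble the entry at 'pos' up while it is smaller than its parent,
        # keeping revIndex in sync with every swap.
        while pos > 0:
            parent = (pos - 1) // 2
            if values[parent] <= values[pos]:
                break
            objects[pos], objects[parent] = objects[parent], objects[pos]
            values[pos], values[parent] = values[parent], values[pos]
            revIndex[objects[pos]] = pos
            revIndex[objects[parent]] = parent
            pos = parent

    objects, values, revIndex = ['a', 'b'], [5, 1], {'a': 0, 'b': 1}
    rollup(objects, values, revIndex, 1)   # 'b' (value 1) bubbles to the root
    assert objects[0] == 'b' and revIndex == {'b': 0, 'a': 1}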
Example #7
    def _do_stop_packet(self, pck):
        task_id = pck._sandbox_task_id
        stop_mode = pck._target_stop_mode

        def reschedule_if_need():
            if pck._state != RemotePacketState.STARTED:
                return
            if pck._target_stop_mode > stop_mode:
                return

            if pck._sched:
                # TODO Assert that we have another _do_stop_packet in pck._sched
                return

            self._schedule(
                pck,
                self._start_packet_stop,
                timeout=self._RPC_RESEND_INTERVAL)

        with pck._lock:
            if not pck._peer_addr: # FIXME _do_stop_packet must not be called on this condition
                reschedule_if_need()
                return

        proxy = self._create_packet_rpc_proxy(pck)

        logging.debug('_do_stop_packet(%s, stop_mode=%s)' % (pck, stop_mode))

        try:
            if stop_mode == StopMode.CANCEL:
                proxy.cancel(task_id)
            else:
                kill_jobs = stop_mode == StopMode.STOP
                proxy.stop(task_id, kill_jobs)

        except Exception as e:
            logging.warning("Failed to send stop to packet %s: %s" % (pck, e))

            if is_xmlrpc_exception(e, WrongTaskIdError) \
                    or isinstance(e, socket.error) and e.errno == errno.ECONNREFUSED:
                return # FIXME Is enough? # STOP/SUSPEND->EXECUTING

            with pck._lock:
                reschedule_if_need()

        else:
            with pck._lock:
                #assert pck._state == RemotePacketState.STARTED # FIXME XXX
                if pck._state != RemotePacketState.STARTED:
                    return

                if pck._sent_stop_mode < stop_mode:
                    pck._sent_stop_mode = stop_mode
Example #8
    def _Communicate(self, f):
        self.Connect()

        try:
            f()

            self.errorsCnt = 0
            logging.debug("SendData to %s: ok", self.name)
        except (IOError, socket.timeout) as e:
            logging.warning("SendData to %s: failed: %s", self.name, e)
            self.lastError = e
            self.errorsCnt += 1
        except Exception as e:
            logging.error("SendData to %s: failed: %s", self.name, e)
Example #9
    def _stop(self, wait=True, timeout=None):
        if self._stopped:
            return

        new_value = self._ST_WAIT if wait else self._ST_NOWAIT

        with self._lock:
            if self._should_stop < new_value:
                logging.info("Stopping YtTags.Client (%s)" % '_ST_WAIT' if wait == self._ST_WAIT else '_ST_NOWAIT')
                self._do_stop(new_value)

            elif self._should_stop > new_value:
                logging.warning("stop() called with lower stop-level")

        self._connect_thread.join(timeout) # TODO sleeps
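
Note the parentheses around the conditional expression in the logging.info call: % binds tighter than `x if cond else y`, so without them the formatted string becomes the true-arm of the conditional and, on the other branch, the whole message silently collapses to the bare level name. Two self-checking lines demonstrate the difference:

    assert ("level (%s)" % "WAIT" if False else "NOWAIT") == "NOWAIT"
    assert ("level (%s)" % ("WAIT" if False else "NOWAIT")) == "level (NOWAIT)"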
Example #10
    def _do_initial_cloud_tags_masks_load(self):
        try_count = 3
        for idx in range(try_count):
            try:
                self._match_cloud_tag = self._load_masks()
            except Exception as e:
                logging.warning("Iter #%d of cloud tags masks (%s) loading failed: %s" % (
                    idx + 1, self._cloud_tags_masks, e))
            else:
                break

            if idx != try_count - 1:
                time.sleep(5)
        else:
            raise RuntimeError("Failed to load cloud tags masks for %d attempts" % try_count)
Example #11
    def FireEvent(self, event, reference=None, safe=False):
        bad_listeners = set()

        for obj in self.__get_listeners():
            if isinstance(obj, ICallbackAcceptor):
                try:
                    obj.AcceptCallback(reference or self, event)
                except:
                    if not safe:
                        raise
                    logging.exception("Failed to accept '%s' event for %s" % (event, obj))
            else:
                logging.warning("callback %r\tincorrect acceptor for %s found: %s", self, event, obj)
                bad_listeners.add(obj)

        for obj in bad_listeners:
            self.DropCallbackListener(obj)
Example #12
    def _do_do_resolve(self, rel):
        request = rel.request

        resource = self._client.get_latest_resource(
            type=request.type,
            owner=request.owner,
            released=request.released
        )

        rel.resolve_time = time.time()

        if not resource:
            logging.warning("Failed to resolve %s" % (request,))
            rel.promise.set(None, RuntimeError("Can't find resource %s" % (request,)))
            return

        logging.debug("%s resolved to %s" % (request, resource))
        rel.promise.set(resource['id'])
Example #13
    def _masks_reload_loop(self):
        while True:
            if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
                return

            try:
                match = self._load_masks()
            except Exception as e:
                logging.error("Failed to reload tags' masks from: %s" % e)
                continue

            if self._match_cloud_tag.count and not match.count:
                logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                    self._match_cloud_tag.count, match.count))
                continue

            logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)
            self._match_cloud_tag = match
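
The `self._masks_should_stop.wait(interval)` call doubles as the sleep and the shutdown check: threading.Event.wait returns True as soon as the event is set and False when the timeout elapses, so the loop reacts to a stop request immediately instead of sleeping out the full interval. The bare idiom:

    import threading

    stop = threading.Event()

    def reload_loop(interval):
        while not stop.wait(interval):  # False -> timeout elapsed: do one iteration
            pass  # ... reload and atomically swap the compiled masks here ...

    worker = threading.Thread(target=reload_loop, args=(0.1,))
    worker.start()
    stop.set()   # wakes the loop at once; no need to wait out the interval
    worker.join()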
Example #14
 def _move_files(self, new_binary_directory):
     assert self.binDirectory != new_binary_directory
     if True:
         badFiles = set()
         for checksum, file in self.files.iteritems():
             estimated_path = os.path.join(new_binary_directory, checksum)
             if not os.path.isfile(estimated_path):
                 #if os.path.isfile(file.path):
                 if False:
                     shutil.move(file.path, estimated_path)
                 elif file.LinksCount() > 0:
                     print file.links
                     raise RuntimeError("binstorage\tcan't recover file %r" % file.path)
                 else:
                     badFiles.add(checksum)
             file.Relink(estimated_path)
         if badFiles:
             for checksum in badFiles:
                 del self.files[checksum]
                 logging.warning("binstorage\tnonexisted file %s cleaning attempt", checksum)
             logging.warning("can't recover %d files; %d files left in storage", len(badFiles), len(self.files))
Example #15
    def Start(self):
        if not self.network_name or not self.tags_file or not self.port:
            logging.warning("ConnectionManager could'n start: wrong configuration. " +
                            "network_name: %s, remote_tags_db_file: %s, system_port: %r",
                            self.network_name, self.tags_file, self.port)
            return

        self.ReloadConfig()
        logging.debug("after_reload_config")

        for client in self.topologyInfo.servers.values():
            if client.active and client.name != self.network_name:
                client.TryInitializePeersVersions(self.network_name)
        logging.debug("after_clients_versions_init")

        self.alive = True
        self.InitXMLRPCServer()
        self._accept_loop_thread = ProfiledThread(target=self.ServerLoop, name_prefix='ConnManager')
        self._accept_loop_thread.start()
        logging.debug("after_connection_manager_loop_start")

        for client in self.topologyInfo.servers.values():
            self.scheduler.ScheduleTaskT(0, self.SendData, client, skip_logging=True)
Example #16
    def _create_connection(self):
        now = time.time()
        timeout = 0.0

        while True:
            with self._lock:
                if self._should_and_can_stop():
                    return

                self._should_and_can_stop_cond.wait(timeout)

                if self._should_and_can_stop():
                    return

            conn = None  # otherwise 'conn' is unbound below when the constructor raises
            try:
                conn = self._connection_constructor()
            except Exception as e:
                logging.warning("Failed to connect: %s" % e)

            if conn:
                return conn

            timeout = 1.0
Example #17
    def _io_loop(self, type_str, loop, how, after_shutdown=None):
        try:
            loop()
        except Exception:
            logging.exception("%s error" % type_str)

        #logging.debug("%s io thread stopped" % type_str)

        try:
            self._io._connection.shutdown(how)
        except socket.error as e:
            if e.errno != errno.ENOTCONN:
                logging.warning("Error on socket shutdown(%d): %s" % (how, e))

        if after_shutdown:
            after_shutdown()

        with self._lock:
            self._io._thread_count -= 1

            if not self._io._thread_count:
                self._io._connected = False
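
Tolerating ENOTCONN on shutdown is the key detail here: the peer may already have closed the connection, which deserves at most a log line. The same guard extracted into a reusable helper (a sketch, not project code):

    import errno
    import socket

    def shutdown_quietly(sock, how=socket.SHUT_RDWR):
        # ENOTCONN just means the peer closed first; anything else is real.
        try:
            sock.shutdown(how)
        except socket.error as e:
            if e.errno != errno.ENOTCONN:
                raise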
Example #18
    def convert_to_v2(self):
        for job in self.jobs.values():
            d = job.__dict__
            d.pop('packetRef', None)
            d.pop('callbacks', None)
            d.pop('nonpersistent_callbacks', None)
            job.max_try_count = d.pop('maxTryCount')
            job.pck_id = self.id

        pckd = self.__dict__

        state = pckd.pop('state')

        if state == ReprState.NONINITIALIZED:
            #self._recover_noninitialized(ctx)
            logging.error("Packet %s in NONINITIALIZED state" % self)

        self.do_not_run = bool(self.flags & self.PacketFlag.USER_SUSPEND)
        self.is_broken = bool(self.flags & self.PacketFlag.RCVR_ERROR)
        pckd.pop('flags')

        if state == ReprState.SUCCESSFULL and self.do_not_run:
            #logging.warning("SUCCESSFULL and USER_SUSPEND in %s" % self.id)
            self.do_not_run = False

        pckd.pop('streams') # FIXME Overhead: will re-concat multi-deps
        pckd.pop('_active_jobs', None)

        pckd.pop('edges') # constant graph
        succeed_jobs = pckd.pop('done')
        jobs_to_run = pckd.pop('leafs')

        #active_jobs_cache = set()
        pckd.pop('as_in_queue_working')

        child_to_parents = pckd.pop('waitJobs')

        def pop_failed_job():
            if not jobs_to_run:
                raise ValueError("jobs_to_run is empty to pop")

            for job_id in jobs_to_run:
                result = self.jobs[job_id].last_result()
                if not result:
                    continue
                if not result.IsSuccessfull():
                    jobs_to_run.remove(job_id)
                    return job_id

        jobs_to_retry = {}
        if state == ReprState.WAITING:
            if jobs_to_run:
                if self.waitingDeadline:
                    job_id = pop_failed_job() or jobs_to_run.pop()
                    jobs_to_retry[1] = (job_id, None, self.waitingDeadline)
                else:
                    logging.error("No waitingDeadline: %s" % self)
            else:
                logging.error("WAITING && !jobs_to_run: %s" % self)
        pckd.pop('waitingDeadline', None)

        failed_jobs = set()
        if state == ReprState.ERROR:
            job_id = pop_failed_job() if jobs_to_run else None
            if job_id:
                failed_jobs.add(job_id)
            elif not self.is_broken:
                logging.error("ERROR && !broken && !failed_jobs: %s" % self)

        working_jobs = {jid for jid, deps in child_to_parents.items() if not deps} \
            - (succeed_jobs | jobs_to_run \
                | set(descr[0] for descr in jobs_to_retry.values()) \
                | failed_jobs)

        jobs_to_run |= working_jobs

        if working_jobs:
            logging.debug('working_jobs for %s in %s: %s' % (self.id, state, working_jobs))

        self.done_tag = pckd.pop('done_indicator')
        self.job_done_tag = pckd.pop('job_done_indicator')
        self.all_dep_tags = pckd.pop('allTags')
        self.bin_links = pckd.pop('binLinks')
        self.is_resetable = pckd.pop('isResetable')

        self.wait_dep_tags = pckd.pop('waitTags')
        # if we are in SUSPENDED (RCVR_ERROR or not) and len(self.wait_dep_tags)
        #   -- we will wait tags (in previous packet.py impl)
        self.tags_awaited = not self.wait_dep_tags or state in _TAGS_AWAITED_STATES

        clean_state = pckd.pop('_clean_state') # TODO apply to _graph_executor

        queues = self._get_listeners_by_type((LocalQueue, LegacyQueue)) # FIXME Select one type
        queue = queues[0] if queues else None
        self.queue = queue
        if queue:
            self.DropCallbackListener(queue)

        self.__class__ = LocalPacket

        self.files_modified = False
        self.resources_modified = False
        self.files_sharing = None
        self.shared_files_resource_id = None
        self.resolved_releases = {}
        self.unresolved_release_count = 0

        self.destroying = state == ReprState.HISTORIED
        self.sbx_files = {}

        self._repr_state = None
        self.state = None

        self.finish_status = True if state == ReprState.SUCCESSFULL else \
            (False if state == ReprState.ERROR and not self.is_broken else None)

        self._saved_jobs_status = None
        self.last_sandbox_task_id = None
        self._graph_executor = DUMMY_GRAPH_EXECUTOR

        self._repr_state = state # to avoid duplicates in pck.history

        self.req_sandbox_host = None

        if state == ReprState.SUCCESSFULL:
            #pass
            g = self._create_job_graph_executor()
            self._saved_jobs_status = g.produce_detailed_status()
            #self._saved_jobs_status = self._produce_compressed_job_status(g)
            del g

        elif state == ReprState.HISTORIED:
            pass

        elif self.queue and (failed_jobs or succeed_jobs or jobs_to_retry):
            g = self._graph_executor = self._create_job_graph_executor()

            g.failed_jobs = failed_jobs
            g.succeed_jobs = succeed_jobs
            g.jobs_to_run = jobs_to_run
            g.jobs_to_retry = jobs_to_retry
            g.child_to_parents = child_to_parents

            g._clean_state = clean_state

            g.state = g._calc_state()

            # FIXME bug? waitJobs may not contain all jobs-with-parents
            _complete_waitJobs(self.id, g)

            try:
                _check_graph_consistence(g)
            except Exception as e:
                raise AssertionError("Inconsistent job graph in %s: %s" % (self.id, e))

        self.state = self._calc_state()
        self._update_repr_state()

        if self.queue:
            if self.has_pending_jobs():
                self.queue.packets_with_pending_jobs.add(self)
            self.queue.relocate_packet(self)

        if self._repr_state != state and not(state == ReprState.WORKABLE and self._repr_state == ReprState.PENDING):
            logging.warning("ReprState mismatch for %s: %s -> %s" % (self, state, self._repr_state))
Example #19
 def _send_signal_safe_inspect_result(f):
     if not f.is_success():
         logging.warning("send_signal_safe failed: %s" % f.get_exception())
Example #20
    def _on_task_status_change(self, task_id, _, status_group, can_has_res):
        #with self.lock:
        if True:
            pck = self._by_task_id.get(task_id)

        if not pck:
            return

        with pck._lock:
            #if pck._status_await_job_id != job_id:
                #return
            #pck._status_await_job_id = None

            state = pck._state

            assert state not in [
                RemotePacketState.FINISHED,
                RemotePacketState.FETCHING_RESOURCE_LIST,
                RemotePacketState.FETCHING_FINAL_UPDATE
            ]

            if state in [
                RemotePacketState.CREATING, # we subscribe just after creation and before start
            ]:
                return

            # TODO Check the code

            if status_group == TaskStateGroups.DRAFT:

                if state == RemotePacketState.STARTING:
                    pass

                elif state == RemotePacketState.CHECKING_START_ERROR:
                    if pck._is_error_permanent:
                        self._mark_as_finished(pck, 'is_error_permanent=True, DRAFT')
                    else:
                        pck._set_state(RemotePacketState.STARTING)
                        self._start_start_sandbox_task(pck)

                elif state in [RemotePacketState.STARTED, RemotePacketState.TASK_FIN_WAIT]:
                    # FIXME Race here between graph updates and _on_task_status_change
                    logging.warning('%s._on_task_status_change(%s, %s)' % (pck, status_group, state))
                    #raise AssertionError()

            elif status_group == TaskStateGroups.ACTIVE:

                if state in [RemotePacketState.STARTING, RemotePacketState.CHECKING_START_ERROR]:
                    pck._set_state(RemotePacketState.STARTED, 'TaskStateGroups.ACTIVE')

            elif status_group == TaskStateGroups.TERMINATED:

                if state in [RemotePacketState.STARTING,
                             RemotePacketState.CHECKING_START_ERROR,
                             RemotePacketState.STARTED]:

                    if can_has_res:
                        pck._set_state(RemotePacketState.FETCHING_RESOURCE_LIST)
                        self._start_fetch_resource_list(pck)
                    else:
                        pck.set_error("Unknown task error", False) # TODO
                        self._mark_as_finished(pck, 'Task TERMINATED and EXCEPTION/FAILURE')

                    # FIXME Does Sandbox delete task's meta info or it's always DELETED

                    # TODO Fetch and interpret ctx['__last_rem_error']

                    #TaskStatus.DELETING:
                    #TaskStatus.DELETED: # has context
                    #TaskStatus.FAILURE:
                    #TaskStatus.EXCEPTION:
                    #TaskStatus.NO_RES:
                    #TaskStatus.TIMEOUT:

                elif state == RemotePacketState.TASK_FIN_WAIT:
                    self._mark_as_finished(pck, 'Task TERMINATED on TASK_FIN_WAIT')
Example #21
File: job.py Project: cgorbit/rem
 def cancel(self):
     logging.warning("Can't Terminate FuncJob %s" % self.runner)
Example #22
 def notify_long_execution(self, job):
     logging.warning("%s execution takes too long time", job)
     self.pck.send_job_long_execution_notification(job)