Example #1
    def RegisterTagEventForClient(self, clientname, tagname, event, message=None):
        logging.debug("%s remote tag %s on host %s", TagEventName[event], tagname, clientname)
        client = self.topologyInfo.GetClient(clientname, checkname=False)
        if client is None:
            logging.error("unknown client %s appeared", clientname)
            return False
        client.RegisterTagEvent("%s:%s" % (self.network_name, tagname), event, message)
Example #2
    def _DoSendEventsIfNeed(self):
        tosend = self.events[:self.MAX_TAGS_BULK]
        if not tosend:
            return

        logging.debug("SendData to %s: %d events", self.name, len(tosend))

        def send_as_events():
            try:
                self.connection.register_tags_events(tosend)
                return True
            except XMLRPCMethodNotSupported:
                self.SetVersion(1)
                return False

        def send_as_set_tags():
            tags = _get_tags_to_set(tosend)
            if not tags:
                return
            self.connection.set_tags(tags)

        if self.version < 2 or not send_as_events():
            send_as_set_tags()

        del self.events[:len(tosend)]  # drop only the events that were actually sent
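The _get_tags_to_set helper is not shown. Judging by the fallback above, version-1 peers can only *set* tags, so the event batch has to be reduced to the tag names whose latest event is a Set. A minimal sketch under that assumption (the event tuple layout is also assumed):

def _get_tags_to_set(events):
    # Hypothetical sketch: remember each tag's most recent event, then return
    # the names whose final state is Set (v1 peers cannot replay other events).
    last = {}
    for tagname, event, message in events:  # tuple layout assumed
        last[tagname] = event
    return [name for name, event in last.items() if event == ETagEvent.Set]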
Example #3
    def _on_cloud_journal_event(self, ev):
        #logging.debug('before journal event %s' % ev)

        with self.lock:
            tag = self.inmem_items.get(ev.tag_name)

        if not tag:
            logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
            return

        if not tag.IsCloud(): # effectively an assertion
            logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
            return

        if tag.version >= ev.version:
            # TODO warn even on equal versions, but not for initial _subscribe_all
            if tag.version > ev.version:
                logging.warning('local version (%d) > journal version (%d) for tag %s' \
                    % (tag.version, ev.version, ev.tag_name))
            return

        def add_event(event, version, msg=None):
            self._repr_modifier.add((tag, event, msg, version))

        # FIXME here with warning, on state sync without it
        if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
            logging.debug('overtaking reset %s.%d.%d for %d' % (ev.tag_name, ev.version, ev.last_reset_version, tag.version))
            add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

        add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

        logging.debug('after journal event for %s' % ev.tag_name)
Example #4
    def Start(self):
        self._journal.Start()
        logging.debug("after_journal_start")

        self._repr_modifier.Start()
        logging.debug("after_repr_modifier_start")

        if self._cloud_tags_server:
            if self._cloud_tags_masks:
                self._masks_reload_thread = ProfiledThread(
                    target=self._masks_reload_loop, name_prefix='TagsMasksReload')
                self._masks_reload_thread.start()
                logging.debug("after_masks_reload_thread_start")

            self._cloud = self._create_cloud_client(self._on_cloud_journal_event)
        else:
            # For CloudTags in tags.db
            self._cloud = SerialUpdateOnlyDummyCloudClient(
                RuntimeError("Wrong setup for cloud tags in rem-server"))

        self._create_safe_cloud()
        logging.debug("after_safe_cloud_start")

        self._subscribe_all()
        logging.debug("after_subscribe_all")
Example #5
    def _on_rpc_update_graph(self, task_id, peer_addr, state, is_final):
        pck = self._by_task_id.get(task_id)

        logging.debug('_on_rpc_update_graph: task_id=%s, pck_id=%s; status=%s; is_final=%s' % (
            task_id,
            pck and pck.id,
            GraphState.str(state['state']),
            is_final
        ))

        if not pck:
            raise WrongTaskIdError('No packet for sbx:%s' % task_id)

        with pck._lock:
            # FIXME
            # pck: connect
            # pck: write
            # rem: enter _on_rpc_update_graph
            # pck: CRASHED
            # pck: Sandbox' task FAILURE
            # rem: _on_task_status_change (enter + exit)
            # rem: _on_rpc_update_graph with self.lock <-- OOPS

            assert pck._state not in [
                RemotePacketState.CREATING,
                RemotePacketState.TASK_FIN_WAIT,
                RemotePacketState.FINISHED,
                RemotePacketState.FETCHING_RESOURCE_LIST,
                RemotePacketState.FETCHING_FINAL_UPDATE
            ], "_on_rpc_update_graph in %s state" % pck._state

            if pck._state in [RemotePacketState.STARTING,
                              RemotePacketState.CHECKING_START_ERROR,
                              RemotePacketState.STARTED]:
                if pck._state != RemotePacketState.STARTED:
                    pck._set_state(RemotePacketState.STARTED, '_on_rpc_update_graph')
                    pck._drop_sched_if_need()
                    assert not pck._peer_addr
                #else:
                    #assert not pck._sched # stop may be scheduled for example

                if not pck._peer_addr:
                    pck._peer_addr = peer_addr
                    logging.debug('SET pck._peer_addr = %s for %s' % (peer_addr, pck))

                    if pck._target_stop_mode:
                        if is_final:
                            pck._sent_stop_mode = pck._target_stop_mode # FIXME
                        else:
                            self._start_packet_stop(pck)

            if pck._target_stop_mode != StopMode.CANCEL:
                pck._update_graph(state, is_final)

            if is_final:
                if pck._target_stop_mode == StopMode.CANCEL \
                    or state['state'] == GraphState.SUCCESSFULL:
                    self._mark_task_fin_wait(pck, '_on_rpc_update_graph(SUCCESSFULL)')
                else:
                    pass # XXX WAITING for TaskStateGroups.TERMINATED
Example #6
    def _set_state(self, state, reason=None):
        if self._state == RemotePacketState.CHECKING_START_ERROR \
                and state != RemotePacketState.FINISHED:
            self.reset_error()

        self._state = state
        logging.debug('%s new state %s, reason: %s' % (self.id, RemotePacketState._NAMES[state], reason))
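RemotePacketState._NAMES is evidently a reverse map from state constants to their names, used for readable logs. A minimal sketch of such a constants class, built from the state names that appear in these examples (the values and the completeness of the set are assumptions):

class RemotePacketState(object):
    (CREATING, STARTING, CHECKING_START_ERROR, STARTED,
     FETCHING_RESOURCE_LIST, FETCHING_FINAL_UPDATE,
     TASK_FIN_WAIT, FINISHED) = range(8)

    # reverse lookup for human-readable logging
    _NAMES = {
        CREATING: 'CREATING',
        STARTING: 'STARTING',
        CHECKING_START_ERROR: 'CHECKING_START_ERROR',
        STARTED: 'STARTED',
        FETCHING_RESOURCE_LIST: 'FETCHING_RESOURCE_LIST',
        FETCHING_FINAL_UPDATE: 'FETCHING_FINAL_UPDATE',
        TASK_FIN_WAIT: 'TASK_FIN_WAIT',
        FINISHED: 'FINISHED',
    }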
Example #7
    def _try_recover_directory(self, ctx):
        prev_packets_directory, _ = os.path.split(self.directory)
        actual_directory = os.path.join(ctx.packets_directory, self.id)

        if os.path.isdir(self.directory):
            if prev_packets_directory != ctx.packets_directory:
                try:
                    logging.debug("relocates directory %s to %s", self.directory, actual_directory)
                    shutil.copytree(self.directory, actual_directory)
                    self.directory = actual_directory
                except:
                    reraise("Failed to relocate directory")

        else:
            if os.path.isdir(actual_directory):
                self.directory = actual_directory
                return # FIXME Recreate?

            if not self._are_links_alive(ctx):
                raise NotAllFileLinksAlive("Not all links alive")

            try:
                self.directory = None
                self._create_place(ctx)
            except:
                reraise("Failed to resurrect directory")
Example #8
    def Restore(self, timestamp, tagRef, cloud_requester_state):
        logging.debug("TagLogger.Restore(%d)", timestamp)
        dirname, db_filename = os.path.split(self.db_filename)

        def get_filenames():
            result = []
            for filename in os.listdir(dirname):
                if filename.startswith(db_filename) and filename != db_filename:
                    file_time = int(filename.split("-")[-1])
                    if file_time > timestamp:
                        result.append(filename)
            result = sorted(result)
            if os.path.isfile(self.db_filename):
                result += [db_filename]
            return result

        with self._db_lock:
            self._restoring_mode = True
            for filename in get_filenames():
                f = bsddb3.rnopen(os.path.join(dirname, filename), "r")
                for k, v in f.items():
                    try:
                        obj = cPickle.loads(v)

                        if isinstance(obj, CloudRequestStart):
                            cloud_requester_state.start_request(obj.id, obj.update)
                        elif isinstance(obj, CloudRequestFinish):
                            cloud_requester_state.finish_request(obj.id)
                        else:
                            obj.Redo(tagRef)

                    except Exception as e:
                        logging.exception("error in TagLogger while restoring from journal: %s", e)
                f.close()
            self._restoring_mode = False
Example #9
    def _update_state(self):
        new = self._calc_state()
        if new == self.state:
            return

        self.state = new
        logging.debug('SandboxJobGraphExecutorProxy.state => %s' % GraphState.str(self.state))
        self._ops.on_state_change()
Example #10
    def _do_on_packet_terminated(self):
        def on_stop():
            self._prev_task_id = self._remote_packet._sandbox_task_id
            self._remote_packet = None
            self._remote_state = None
            self._remote_time_wait_deadline = None

        assert not self.time_wait_deadline and not self.time_wait_sched

        r = self._remote_packet

        self.stopping = False

        if self.cancelled:
            self.cancelled = False
            on_stop()
            return

        # TODO Handle Exception packet state
        # FIXME Rollback history/Status to prev state

        logging.debug('state for SandboxJobGraphExecutorProxy == %s' \
            % (None if r._final_state is None else GraphState.str(r._final_state)))

        if r._error:
            self._error = r._error

            if r._is_error_permanent:
                self._tries = 0

            elif self._tries:
                retry_idx = self.MAX_TRY_COUNT - self._tries - 1

                delay = self.RETRY_INTERVALS[retry_idx] \
                    if retry_idx < len(self.RETRY_INTERVALS) \
                    else self.RETRY_INTERVALS[-1]

                logging.debug("%s failed. Will retry after %s seconds" % (r, delay))

                self.time_wait_deadline = time.time() + delay

                self._schedule_time_wait_stop()

        elif r._final_state == GraphState.TIME_WAIT:
            self.time_wait_deadline = r._last_update['nearest_retry_deadline']
            self._schedule_time_wait_stop()

        elif r._final_state == GraphState.SUCCESSFULL:
            self.result = True

        elif r._final_state == GraphState.ERROR:
            self.result = False

        self._prev_snapshot_resource_id = r._result_snapshot_resource_id \
            if r._final_state != GraphState.SUCCESSFULL else None

        on_stop()
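To make the backoff arithmetic concrete: _tries appears to count remaining attempts down from MAX_TRY_COUNT, so retry_idx counts failures up from zero and clamps to the last configured interval. A worked example with hypothetical values (not taken from the source):

MAX_TRY_COUNT = 5                 # hypothetical
RETRY_INTERVALS = [10, 60, 300]   # seconds, hypothetical

for tries in range(MAX_TRY_COUNT - 1, 0, -1):
    retry_idx = MAX_TRY_COUNT - tries - 1
    delay = RETRY_INTERVALS[retry_idx] \
        if retry_idx < len(RETRY_INTERVALS) \
        else RETRY_INTERVALS[-1]
    print tries, delay            # 4 10, 3 60, 2 300, 1 300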
Example #11
    def _SendSubscriptionsIfNeed(self, local_server_network_name):
        tosend = list(self.subscriptions)[:self.MAX_TAGS_BULK]
        if not tosend:
            return

        logging.debug("SendData to %s: %d subscriptions", self.name, len(tosend))

        self.connection.register_share(tosend, local_server_network_name)
        self.subscriptions.difference_update(tosend)
Example #12
    def _vivify_packets(self, alloc_guard):
        logging.debug('RemotePacketsDispatcher packet count: %d' % len(self._by_task_id))

        for pck in self._by_task_id.itervalues():
            logging.debug('VIVIFY %s' % pck.id)
            if pck._sandbox is None:
                pck._sandbox = self._default_sandbox_client
            pck._run_guard = alloc_guard()
            self._await_task_status(pck)
            self._reschedule_packet(pck)
Example #13
    def _get_vaults_for(self, job_id):
        vaults = self._vaults_setup
        if not vaults:
            return None

        env = copy.copy(vaults['global']) or {}
        env.update(vaults['jobs'].get(job_id, {}))

        logging.debug('Vaults for %d: %s' % (job_id, env))

        return env
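The accessor above implies the shape of self._vaults_setup: a 'global' mapping (possibly None, hence the `or {}`) plus per-job overrides under 'jobs'. A hypothetical illustration with made-up keys:

vaults_setup = {
    'global': {'MY_SECRET': 'vault:owner:record'},      # may be None
    'jobs': {
        3: {'JOB_SECRET': 'vault:owner:other-record'},  # keyed by job id
    },
}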
Example #14
    def on_state_change(self):
        pck = self.pck

        pck._update_state()
        logging.debug('pck._has_updates = True')
        pck._has_updates = True

        pck._mark_as_finished_if_need()

        logging.debug('pck._something_changed.notify()')
        pck._something_changed.notify()
Example #15
    def _do_stop_packet(self, pck):
        task_id = pck._sandbox_task_id
        stop_mode = pck._target_stop_mode

        def reschedule_if_need():
            if pck._state != RemotePacketState.STARTED:
                return
            if pck._target_stop_mode > stop_mode:
                return

            if pck._sched:
                # TODO Assert that we have another _do_stop_packet in pck._sched
                return

            self._schedule(
                pck,
                self._start_packet_stop,
                timeout=self._RPC_RESEND_INTERVAL)

        with pck._lock:
            if not pck._peer_addr: # FIXME _do_stop_packet must not be called on this condition
                reschedule_if_need()
                return

        proxy = self._create_packet_rpc_proxy(pck)

        logging.debug('_do_stop_packet(%s, stop_mode=%s)' % (pck, stop_mode))

        try:
            if stop_mode == StopMode.CANCEL:
                proxy.cancel(task_id)
            else:
                kill_jobs = stop_mode == StopMode.STOP
                proxy.stop(task_id, kill_jobs)

        except Exception as e:
            logging.warning("Failed to send stop to packet %s: %s" % (pck, e))

            if is_xmlrpc_exception(e, WrongTaskIdError) \
                    or isinstance(e, socket.error) and e.errno == errno.ECONNREFUSED:
                return # FIXME Is enough? # STOP/SUSPEND->EXECUTING

            with pck._lock:
                reschedule_if_need()

        else:
            with pck._lock:
                #assert pck._state == RemotePacketState.STARTED # FIXME XXX
                if pck._state != RemotePacketState.STARTED:
                    return

                if pck._sent_stop_mode < stop_mode:
                    pck._sent_stop_mode = stop_mode
Example #16
    def RegisterTagEvent(self, tag, event, message=None):
        if not isinstance(tag, TagBase):
            raise RuntimeError("%s is not a Tag instance" % (tag,))
        if tag.IsRemote():
            return

        tagname = tag.GetName()
        with self.lock: # see register_share
            acceptors = self.acceptors.get(tagname)
            if acceptors:
                logging.debug("on %s connmanager %s with acceptors list %s", TagEventName[event], tagname, acceptors)
                for clientname in acceptors:
                    self.RegisterTagEventForClient(clientname, tagname, event, message)
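self.acceptors behaves like a multimap from tag name to the set of subscribed client names: .get(tagname) above, .add(tagname, clientname) in register_share (Example #18). A minimal sketch of such a container (hypothetical; the project's actual class is not shown):

class SetMultiDict(object):
    def __init__(self):
        self._data = {}

    def add(self, key, value):
        # create the per-key set on first use
        self._data.setdefault(key, set()).add(value)

    def get(self, key):
        return self._data.get(key)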
Example #17
def _complete_waitJobs(pck_id, g):
    child_to_parents = g.child_to_parents

    fixed = False

    for job in g.jobs.values():
        if job.parents:
            if job.id not in child_to_parents:
                fixed = True
                child_to_parents[job.id] = set([])

    if fixed:
        logging.debug('!!!Incomplete .waitJobs in %s!!!' % pck_id)
Example #18
    def register_share(self, tags, clientname):
        tagRef = self.scheduler.tagRef
        logging.debug("register_share %d tags for %s: %s", len(tags), clientname, tags)
        for tagname in tags:
            # XXX
            # 1. this lock only guarantees eventual consistency of a tag's history
            # 2. clients of self may see duplicate events (even Reset)
            # 3. it also guards self.acceptors
            with self.lock:
                self.acceptors.add(tagname, clientname)
                if tagRef._RawTag(tagname).IsLocallySet():
                    self.RegisterTagEventForClient(clientname, tagname, ETagEvent.Set)
        logging.debug("register_share %d tags for %s: done", len(tags), clientname)
Example #19
    def _Communicate(self, f):
        self.Connect()

        try:
            f()

            self.errorsCnt = 0
            logging.debug("SendData to %s: ok", self.name)
        except (IOError, socket.timeout) as e:
            logging.warning("SendData to %s: failed: %s", self.name, e)
            self.lastError = e
            self.errorsCnt += 1
        except Exception as e:
            logging.error("SendData to %s: failed: %s", self.name, e)
Example #20
    def _update_repr_state(self):
        new = self._calc_repr_state()

        if new == self._repr_state:
            return

        self._repr_state = new
        self.history.append((new, time.time()))
        logging.debug("packet %s\tnew state %r", self.name, new)

        if new == ReprState.WAITING:
            deadline = self._graph_executor.get_nearest_retry_deadline()
            delay = max(deadline - time.time(), 0) if deadline else None
            logging.debug("packet %s\twaiting for %s sec", self.name, delay)
Example #21
    def _update_state(self):
        new = self._calc_state()

        if self.state != new:
            self.state = new
            logging.debug("packet %s\tnew impl state %r", self.name, new)

            self._update_repr_state()

            self._on_state_change(new)
        #else:
            #self._update_repr_state() # ImplState.RUNNING: 1. PENDING, 2. WORKABLE

        if self.queue:
            self.queue.update_pending_jobs_state(self) # TODO only for LocalPacket actually
Example #22
    def _create_job_graph_executor(self):
        assert not self.files_modified
        resources = self._produce_job_graph_executor_custom_resources()

        logging.debug('_create_job_graph_executor(%s): %s' % (self.id, resources))

        return sandbox_remote_packet.SandboxJobGraphExecutorProxy(
            sandbox_remote_packet.SandboxPacketOpsForJobGraphExecutorProxy(self),
            self.id,
            self.make_job_graph(),
            resources,
            host=self.req_sandbox_host,
            oauth_token=self.oauth_token,
            vaults_setup=self._make_vaults_setup(),
        )
Example #23
    def UpdateContext(self, context):
        self.db_file = context.tags_db_file
        self.remote_tags_enabled = not context.disable_remote_tags
        self.DBConnect()
        self._journal.UpdateContext(context)
        self._repr_modifier.UpdateContext(context)
        self._cloud_tags_server = context.cloud_tags_server
        self._cloud_tags_masks = context.cloud_tags_masks
        self._cloud_tags_masks_reload_interval = context.cloud_tags_masks_reload_interval
        self._cloud_tags_release_delay = context.cloud_tags_release_delay
        self._tags_random_cloudiness = self._has_cloud_setup() and context.tags_random_cloudiness
        self._all_tags_in_cloud = self._has_cloud_setup() and context.all_tags_in_cloud

        logging.debug("TagStorage.UpdateContext, masks = %s, share = %s, server = %s" % (
            self._cloud_tags_masks, self._all_tags_in_cloud, self._cloud_tags_server))
Example #24
File: queue.py Project: cgorbit/rem
    def get_job_to_run(self):
        while True:
            with self.lock:
                if not self.has_startable_jobs():
                    return None

                pck = self.local_ops.get_packet_to_run()

            # .get_job_to_run not under lock to prevent deadlock
            try:
                job = pck.get_job_to_run()
            except NotWorkingStateError:
                logging.debug('NotWorkingStateError idling: %s' % pck)
                continue

            return job
Example #25
    def _update_graph(self, update, is_final):
        assert self._target_stop_mode != StopMode.CANCEL

        self._last_update = update # TODO

        succeed_jobs = set(map(int, update['succeed_jobs']))

        new_succeed_jobs = succeed_jobs - self._succeeded_jobs
        self._succeeded_jobs = succeed_jobs

        if is_final:
            self._final_state = update['state']

            logging.debug('%s._final_state = %s' % (self, GraphState.str(self._final_state)))

        self._ops._on_sandbox_packet_update(update, new_succeed_jobs, is_final)
Example #26
    def _masks_reload_loop(self):
        while True:
            if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
                return

            try:
                match = self._load_masks()
            except Exception as e:
                logging.error("Failed to reload tags' masks from: %s" % e)
                continue

            if self._match_cloud_tag.count and not match.count:
                logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                    self._match_cloud_tag.count, match.count))
                continue

            logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)
            self._match_cloud_tag = match
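self._masks_should_stop is presumably a threading.Event that gets set at shutdown, so wait(interval) doubles as an interruptible sleep: it returns True as soon as stop is requested and False on an ordinary timeout. A standalone sketch of that pattern (names assumed):

import threading

class PeriodicReloader(object):
    def __init__(self, interval, reload_fn):
        self._interval = interval
        self._reload = reload_fn
        self._should_stop = threading.Event()
        self._thread = threading.Thread(target=self._loop)

    def start(self):
        self._thread.start()

    def stop(self):
        self._should_stop.set()  # wakes the sleeping loop immediately
        self._thread.join()

    def _loop(self):
        # Event.wait(timeout) returns True only if stop() was called
        while not self._should_stop.wait(self._interval):
            self._reload()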
Example #27
    def _do_do_resolve(self, rel):
        request = rel.request

        resource = self._client.get_latest_resource(
            type=request.type,
            owner=request.owner,
            released=request.released
        )

        rel.resolve_time = time.time()

        if not resource:
            logging.warning("Failed to resolve %s" % (request,))
            rel.promise.set(None, RuntimeError("Can't find resource %s" % (request,)))
            return

        logging.debug("%s resolved to %s" % (request, resource))
        rel.promise.set(resource['id'])
Example #28
File: job.py Project: cgorbit/rem
    def _run_process_inner(self, argv, stdin, stdout, stderr):
        job = self._job
        ops = self._ops
        self._start_time = time.time()

        try:
            process = ops.start_process(
                argv,
                stdout=stdout,
                stdin=stdin,
                stderr=stderr,
                cwd=ops.get_working_directory(),
                env_update=self._env_update.items(),
            )

        except pgrpguard.Error:
            raise  # propagate process-guard errors to the caller unchanged

        except Exception as e:
            self._process_start_error = e
            logging.exception("Failed to start %s: %s" % (job, e))
            return False

        logging.debug("%s started with pid=%s" % (job, process.pid))
        self._process = process

        if self._cancelled:
            process.terminate()

        try:
            self._wait_process(process)
        except Exception as e:
            t, e, tb = sys.exc_info()
            try:
                process.terminate()
            except:
                pass
            raise t, e, tb

        self._finish_time = time.time()

        return True
Example #29
    def _reschedule_packet(self, pck):
        # TODO Check
        by_state = {
            RemotePacketState.CREATING: self._start_create_sandbox_task,
            RemotePacketState.STARTING: self._start_start_sandbox_task,
            RemotePacketState.FETCHING_RESOURCE_LIST: self._start_fetch_resource_list,
            RemotePacketState.FETCHING_FINAL_UPDATE: self._start_fetch_final_update,
        }

        action = by_state.get(pck._state)

        if not action \
                and pck._target_stop_mode != pck._sent_stop_mode \
                and pck._state == RemotePacketState.STARTED:
            action = self._start_packet_stop

        if action:
            logging.debug('action %s' % action)
            action(pck)
Example #30
    def _update(self, update, id=None):
        with self._lock:
            logging.debug("SafeCloud.update(%s)" % str(update)) # until i will trust the code

            idx = self._next_idx
            self._next_idx += 1

            id = id or self._alloc_id()

            self._journal.log_cloud_request_start(id, update)

            done = self._cloud.serial_update(update)

            # FIXME
            assert id not in self._running

            self._running[id] = (idx, update)

        # This call may lead to immediate _on_done in current thread
        done.subscribe(lambda f: self._on_done(id, f))
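The closing comment is the important contract here: if the update has already completed, subscribe may invoke the callback synchronously in the current thread, which is why it is called outside the lock. A minimal promise with that behaviour (a hypothetical sketch, not the project's implementation):

import threading

class Promise(object):
    def __init__(self):
        self._lock = threading.Lock()
        self._done = False
        self._callbacks = []

    def set(self):
        with self._lock:
            self._done = True
            callbacks, self._callbacks = self._callbacks, []
        for cb in callbacks:
            cb(self)           # runs in the completing thread

    def subscribe(self, cb):
        with self._lock:
            if not self._done:
                self._callbacks.append(cb)
                return
        cb(self)               # already completed: runs immediately, current thread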