Example #1
 def _run(self):
     while not self.IsKilled():
         try:
             self.do()
         except Exception as e:
             logging.exception("worker\tjob execution error %s", e)
         time.sleep(self.tickTime)
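
A minimal, self-contained sketch of the same pattern; the Worker name and the kill()/do() methods are assumptions, not from the source:

import logging
import threading
import time

class Worker(threading.Thread):
    """Hypothetical stand-alone version of the loop above."""

    def __init__(self, tick_time=1.0):
        super(Worker, self).__init__()
        self._killed = threading.Event()
        self.tick_time = tick_time

    def kill(self):
        self._killed.set()

    def do(self):
        pass  # real work goes here

    def run(self):
        while not self._killed.is_set():
            try:
                self.do()
            except Exception as e:
                # logging.exception() logs at ERROR level and appends
                # the active traceback automatically
                logging.exception("worker\tjob execution error %s", e)
            time.sleep(self.tick_time)
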
Example #2
    def _the_loop(self):
        while not self._should_stop:
            with self._lock:
                while not (self._should_stop or self._queue):
                    self._modified.wait()

                if self._should_stop:
                    return

                id, (deadline, callback) = self._queue.front()

                now = time.time()

                if now < deadline:
                    self._modified.wait(deadline - now)

                    if self._should_stop:
                        return

                    if not self._queue or self._queue.front()[0] != id:
                        continue

                self._queue.pop_front()

            try:
                if callback.__code__.co_argcount:
                    callback(id)
                else:
                    callback()
            except:
                logging.exception("Failed to execute %s" % callback)

            del callback
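
The callback.__code__.co_argcount check above dispatches on the callback's arity. A small standalone demonstration of that trick (note it only works for plain Python functions: builtins have no __code__, and bound methods count self):

def dispatch(callback, id):
    # __code__.co_argcount is the number of positional parameters of a
    # plain Python function; zero-argument callbacks are invoked bare.
    if callback.__code__.co_argcount:
        return callback(id)
    return callback()

def with_id(id):
    return id

def without_id():
    return "no id"

assert dispatch(with_id, 42) == 42
assert dispatch(without_id, 42) == "no id"
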
Example #3
        def inner(*args, **kwargs):
            if log_args:
                args_ = args[skip_arg_count:] \
                    if skip_arg_count else args

                if args_ and kwargs:
                    args_to_log = (args_, kwargs)
                elif kwargs:
                    args_to_log = kwargs
                else:
                    args_to_log = args_

                prefix = 'function "%s(%s)" ' % (func.__name__, str(args_to_log)[:4096])
            else:
                prefix = 'function "%s" ' % func.__name__

            def log(msg):
                log_func(prefix + msg)

            log('started')
            try:
                ret = func(*args, **kwargs)
                log('finished')
                return ret
            except:
                logging.exception('%s call failed', prefix)
                raise
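
The snippet is only the inner wrapper; the enclosing decorator factory is not shown. It might look roughly like this (a sketch: log_call is a hypothetical name, and the argument-logging branch is elided for brevity):

import functools
import logging

def log_call(log_func=logging.debug, log_args=True, skip_arg_count=0):
    def decorator(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            prefix = 'function "%s" ' % func.__name__
            log_func(prefix + 'started')
            try:
                ret = func(*args, **kwargs)
                log_func(prefix + 'finished')
                return ret
            except:
                logging.exception('%s call failed', prefix)
                raise
        return inner
    return decorator

@log_call(log_func=logging.info)
def do_work(x):
    return x * 2
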
Example #4
    def Restore(self, timestamp, tagRef, cloud_requester_state):
        logging.debug("TagLogger.Restore(%d)", timestamp)
        dirname, db_filename = os.path.split(self.db_filename)

        def get_filenames():
            result = []
            for filename in os.listdir(dirname):
                if filename.startswith(db_filename) and filename != db_filename:
                    file_time = int(filename.split("-")[-1])
                    if file_time > timestamp:
                        result.append(filename)
            result = sorted(result)
            if os.path.isfile(self.db_filename):
                result += [db_filename]
            return result

        with self._db_lock:
            self._restoring_mode = True
            for filename in get_filenames():
                f = bsddb3.rnopen(os.path.join(dirname, filename), "r")
                for k, v in f.items():
                    try:
                        obj = cPickle.loads(v)

                        if isinstance(obj, CloudRequestStart):
                            cloud_requester_state.start_request(obj.id, obj.update)
                        elif isinstance(obj, CloudRequestFinish):
                            cloud_requester_state.finish_request(obj.id)
                        else:
                            obj.Redo(tagRef)

                    except Exception as e:
                        logging.exception("error in TagLogger while restoring from journal: %s", e)
                f.close()
            self._restoring_mode = False
Example #5
 def _do_run(self):
     try:
         f = getattr(self, '_run', threading.Thread.run.__get__(self))
         f()
     except:
         logging.exception('NamedThread %s [%d] failed' % (self.name, gettid()))
         raise
Example #6
    def try_recover_after_backup_loading(self, ctx):
        descr = '[%s, directory = %s]' % (self, self.directory)

        try:
            self._try_recover_after_backup_loading(ctx)
        except Exception:
            logging.exception("Failed to recover packet %s" % descr)
            self._mark_as_failed_on_recovery()
Example #7
 def _do_resolve(self, rel):
     try:
         self._do_do_resolve(rel)
     # TODO Fail on permanent errors
     except Exception:
         #logging.warning('Failed to list_latest_releases for %s: %s' % (rel.request, e))
         logging.exception('Failed to list_latest_releases for %s' % (rel.request,))
         delayed_executor.schedule(lambda : self._resolve(rel), timeout=self._RETRY_INTERVAL)
Example #8
 def rpc_list_files(self):
     files = []
     with self.lock:
         if self.directory:
             try:
                 files = os.listdir(self.directory)
             except Exception:
                 logging.exception("directory %s listing error", self.directory)
     return files
Example #9
 def _try_create_place_if_need(self):
     try:
         self._create_place_if_need()
         return True
     except Exception:
         logging.exception("Failed to create place")
         self.is_broken = True
         self._graph_executor.cancel()
         self._update_state()
         return False
Example #10
 def f(*args):
     try:
         return func(*args)
     except RpcUserError:
         _, e, tb = sys.exc_info()
         e = e.exc
         raise type(e), e, tb
     except:
         logging.exception("RPC method %s failed" % func.__name__)
         raise
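
The raise type(e), e, tb form is Python 2 syntax for re-raising the wrapped exception with the original traceback. Under Python 3 the equivalent would be (a sketch; RpcUserError's .exc attribute is inferred from the fragment, the rest is assumed):

import sys

class RpcUserError(Exception):
    """Hypothetical wrapper whose .exc holds the user-level exception."""
    def __init__(self, exc):
        super(RpcUserError, self).__init__(exc)
        self.exc = exc

def unwrap_rpc_errors(func):
    def f(*args):
        try:
            return func(*args)
        except RpcUserError:
            _, e, tb = sys.exc_info()
            # Python 3 spelling of "raise type(e), e, tb"
            raise e.exc.with_traceback(tb)
    return f
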
Example #11
    def _process_server_message(self, msg):
        # WhichOneof doesn't work
        # https://github.com/google/protobuf/commit/0971bb0d57aa6f2db1abee4008b365d52b402891
        # type = msg.WhichOneof('Data')

        def first(pred, iterable):
            for item in iterable:
                if pred(item):
                    return item

        type = first(msg.HasField, ['Event', 'Bye', 'Ack', 'Lookup', 'Subscriptions', 'Match'])

        if type is None:
            # XXX Client will loop forever on this error
            raise NotImplementedError("Unknown server message type for [%s]" % msg)

        if type == 'Event':
            for ev in self._ServerMessage.Event(msg):
                try:
                    self._on_event(ev)
                except:
                    logging.exception("Failed to process journal event")
            return

        elif type == 'Bye':
            #logging.debug("...")
            with self._lock:
                self._io._bye_received = True
                self._outgoing_not_empty.notify()
            return

        data = getattr(msg, type)

        if not data.HasField('RequestId'):
            raise RuntimeError('No .RequestId')
        request_id = data.RequestId

        with self._lock:
            promise = self._running.pop(request_id).promise # FIXME use pop(request_id, None) and raise if missing
            #if not self._running:
                #self._running_empty.notify_all()
            if self._should_and_can_stop():
                self._should_and_can_stop_cond.notify_all()

        if type == 'Ack':
            promise.set(None)
        elif type == 'Lookup':
            promise.set(self._ServerMessage.Lookup(msg.Lookup))
        elif type == 'Match':
            promise.set(*self._ServerMessage.Match(msg.Match))
        elif type == 'Subscriptions':
            promise.set(self._ServerMessage.Subscriptions(msg.Subscriptions))
        else:
            assert False
Example #12
    def _release_place(self):
        with self.lock:
            self._release_links()

            if self.directory and os.path.isdir(self.directory):
                try:
                    shutil.rmtree(self.directory, onerror=None)
                except Exception:
                    logging.exception("Packet %s release place error", self.id)

            self.directory = None
Example #13
    def __worker(self):
        while True:
            task = self._queue.get()

            if task is self.__STOP_INDICATOR:
                break

            try:
                task()
            except Exception:
                logging.exception("")
Example #14
 def __setstate__(self, sdict):
     for attr, builder in scheme.iteritems():
         try:
             if attr in sdict:
                 sdict[attr] = builder(sdict[attr])
             else:
                 sdict[attr] = builder()
         except:
             logging.exception("unpickable\tcan't deserialize attribute %s with builder %r", attr, builder)
             raise
     setter = getattr(super(ObjUnpickler, self), "__setstate__", self.__dict__.update)
     setter(sdict)
     ObjectRegistrator_.register(self, sdict)
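
The scheme mapping is not shown in the fragment; it pairs attribute names with builder callables that either upgrade a stored value or create a default. Hypothetically, its shape would be:

scheme = {
    "tags": set,        # attribute missing -> builder() -> empty set
    "bin_links": dict,  # attribute present -> builder(stored_value)
    "priority": int,
}
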
Example #15
    def _send_email(self, make):
        if not self.notify_emails:
            return

        ctx = self._get_scheduler_ctx()

        try:
            msg = make(ctx)
        except:
            logging.exception("Failed to format email for %s" % self.id)
        else:
            if msg:
                ctx.send_email_async(self.notify_emails, msg)
Example #16
    def run(self):
        job = self._job

        try:
            try:
                self._run()
            except Exception:
                # FIXME Do something more than logging?
                logging.exception("Run job %s exception", job)

            if not (self._environment_error or self._cancelled):
                job.tries += 1

            self._append_results()

        finally:
            self._ops.on_job_done(self)
Example #17
    def FireEvent(self, event, reference=None, safe=False):
        bad_listeners = set()

        for obj in self.__get_listeners():
            if isinstance(obj, ICallbackAcceptor):
                try:
                    obj.AcceptCallback(reference or self, event)
                except:
                    if not safe:
                        raise
                    logging.exception("Failed to accept '%s' event for %s" % (event, obj))
            else:
                logging.warning("callback %r\tincorrect acceptor for %s found: %s", self, event, obj)
                bad_listeners.add(obj)

        for obj in bad_listeners:
            self.DropCallbackListener(obj)
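
A minimal listener that FireEvent would accept might look like this sketch (AcceptCallback's signature is inferred from the call site above; EchoListener is a hypothetical name):

import logging

class EchoListener(ICallbackAcceptor):
    def AcceptCallback(self, reference, event):
        logging.debug("callback\tgot event %s from %s", event, reference)
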
Example #18
    def _fetch_resource_list(self, pck):
        try:
            ans = pck._sandbox.list_task_resources(pck._sandbox_task_id)
        except:
            logging.exception('Failed to fetch task resources for %s' % pck)

            with pck._lock:
                if pck._target_stop_mode != StopMode.CANCEL:
                    self._schedule(pck, self._start_fetch_resource_list, self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

            return

        # TODO We don't have to _fetch_resource_list() in any TERMINATED task
        # state (e.g. ERROR, EXCEPTION)

        #import json
        #logging.debug('task #%s resources list answer: %s' % (pck._sandbox_task_id, json.dumps(ans, indent=3)))

        res_by_type = {
            resource['type']: resource
            for resource in ans['items']
        }

        #logging.debug('task #%s res_by_type: %s' % (pck._sandbox_task_id, json.dumps(res_by_type, indent=3)))

        with pck._lock:
            resource = res_by_type.get('REM_JOBPACKET_EXECUTION_SNAPSHOT')
            if not resource:
                logging.error("No REM_JOBPACKET_EXECUTION_SNAPSHOT resource in %s" % pck)
                err = "No REM_JOBPACKET_EXECUTION_SNAPSHOT resource"
                pck.set_error(err, False)
                self._mark_as_finished(pck, err)
                return

            pck._result_snapshot_resource_id = resource['id']

            if pck._final_state is None:
                pck._final_update_url = res_by_type['REM_JOBPACKET_GRAPH_UPDATE']['http']['proxy']
                pck._set_state(RemotePacketState.FETCHING_FINAL_UPDATE)
            else:
                self._mark_as_finished(pck, '_fetch_resource_list')
                return

        self._fetch_final_update(pck) # not under lock
Example #19
    def _run_process_inner(self, argv, stdin, stdout, stderr):
        job = self._job
        ops = self._ops
        self._start_time = time.time()

        try:
            process = ops.start_process(
                argv,
                stdout=stdout,
                stdin=stdin,
                stderr=stderr,
                cwd=ops.get_working_directory(),
                env_update=self._env_update.items(),
            )

        except pgrpguard.Error:
            # Re-raise pgrpguard errors untouched, preserving the traceback
            # (Python 2 three-argument raise; a bare 'raise' is equivalent here).
            t, e, tb = sys.exc_info()
            raise t, e, tb

        except Exception as e:
            self._process_start_error = e
            logging.exception("Failed to start %s: %s" % (job, e))
            return False

        logging.debug("%s started with pid=%s" % (job, process.pid))
        self._process = process

        if self._cancelled:
            process.terminate()

        try:
            self._wait_process(process)
        except Exception as e:
            t, e, tb = sys.exc_info()
            try:
                process.terminate()
            except:
                pass
            raise t, e, tb

        self._finish_time = time.time()

        return True
Example #20
    def _process_update(self, update):
        tag, event = update[0:2]
        msg = None
        version = None
        if len(update) >= 3:
            msg = update[2]
        if len(update) >= 4:
            version = update[3]

        try:
            tag._ModifyLocalState(event, msg, version)
        except Exception:
            logging.exception("Failed to process tag update %s" % (update,))

        if self._connection_manager:
            try:
                self._connection_manager.RegisterTagEvent(tag, event, msg)
            except Exception:
                logging.exception("Failed to pass update to ConnectionManager %s" % (update,))
Example #21
 def _release_links(self):
     tmpLinks, self.bin_links = self.bin_links, {}
     while tmpLinks:
         binname, file = tmpLinks.popitem()
         if isinstance(file, BinaryFile):
             # 'try' was a workaround for race conditions in self; kept now just in case
             try:
                 file.Unlink(self, binname)
             except OSError as e:
                 if e.errno == errno.ENOENT:
                     logging.exception("Packet %s release place error", self.id)
                 else:
                     raise
             filehash = file.checksum
         elif isinstance(file, str):
             filehash = file
         else:
             filehash = None
         if filehash is not None:
             self.bin_links[binname] = filehash
Example #22
    def _update(self):
        running = self._running
        running_task_ids = running.keys()

        # FIXME Determine max &limit for /task in list_task_statuses and split request
        try:
            statuses = self._sandbox.list_task_statuses(running_task_ids)

        except (sandbox.NetworkError, sandbox.ServerInternalError):
            pass

        except Exception:
            logging.exception("Can't fetch task statuses from Sandbox")

        else:
            # FIXME Move to rem.sandbox
            deleted_task_ids = set(running_task_ids) - set(statuses.keys())
            for task_id in deleted_task_ids:
                statuses[task_id] = TaskStatus.DELETED

            for task_id, status in statuses.iteritems():
                task = running[task_id]
                prev_status_group, prev_status = task

                if status != prev_status:
                    logging.debug("task #%d change status %s -> %s" % (task_id, prev_status, status))

                status_group, can_has_res = self._interpret_status(status)

                if status_group == TaskStateGroups.TERMINATED:
                    running.pop(task_id)
                else:
                    task[0] = status_group
                    task[1] = status

                if prev_status_group != status_group:
                    try:
                        self._notify(task_id, status, status_group, can_has_res)
                    except:
                        logging.exception("Sandbox task on_state_change handler failed for %s -> %s" \
                            % (task_id, status))
Example #23
    def convert_in_memory_tags_to_cloud_if_need(self):
        if not self._has_cloud_setup():
            return False

        updates = []

        for tag_name, tag in self.inmem_items.iteritems():
            must_be_cloud = self._is_cloud_tag_name(tag_name) \
                and not tag.IsRemote() # Hack for disable_remote_tags

            if must_be_cloud == tag.IsCloud():
                continue

            elif must_be_cloud:
                if tag.IsLocallySet():
                    updates.append((tag_name, ETagEvent.Set))

                self._make_tag_cloud(tag)
            else:
                logging.error("Tag %s is cloud, but must not be" % tag_name)

        if not updates:
            return False

        logging.info("before conversion %d tags to CloudTag's" % len(updates))

        cloud = self._create_cloud_client(lambda ev: None)

        try:
            for bucket in split_in_groups(updates, 100000): # TODO Fix cloud_client.update
                cloud.update(bucket).get()
        finally:
            try:
                cloud.stop()
            except:
                logging.exception("Failed to stop temporary cloud client")

        logging.info("after conversion %d tags to CloudTag's" % len(updates))

        return True
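
split_in_groups is not shown; a typical chunking helper with that call signature (an assumption, not the source's implementation) would be:

def split_in_groups(items, size):
    """Yield consecutive chunks of at most `size` items."""
    items = list(items)
    for start in range(0, len(items), size):
        yield items[start:start + size]
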
Example #24
    def _do_start_sandbox_task(self, pck):
        try:
            pck._sandbox.start_task(pck._sandbox_task_id)

        # Possible events before lock will be acquired in this func:
        # - not final GRAPH_UPDATE
        # - final GRAPH_UPDATE
        # - STOP_GRACEFULLY/STOP/CANCEL

        # XXX No longer true.
        # No task status changes may appear here, because awaiting is not racy
        # - final GRAPH_UPDATE + task terminated
        # - task terminated wo GRAPH_UPDATE's

        except Exception as e:
            logging.exception('Failed to start task %s' % pck)

            with pck._lock:
                if pck._state != RemotePacketState.STARTING:
                    return

                # TODO Don't forget to take into account ._target_stop_mode in _on_task_status_change

                # Here we don't know if task is really started
                is_error_permanent = \
                    not isinstance(e, (sandbox.NetworkError, sandbox.ServerInternalError))

                pck.set_error(str(e), is_error_permanent)
                pck._set_state(RemotePacketState.CHECKING_START_ERROR)

            return

        with pck._lock:
            if pck._state != RemotePacketState.STARTING:
                return

            pck._set_state(RemotePacketState.STARTED, '._sandbox.start() -> ok')

            assert not pck._peer_addr
Example #25
    def _io_loop(self, type_str, loop, how, after_shutdown=None):
        failed = True
        try:
            loop()
            failed = False
        except Exception:
            logging.exception("%s error" % type_str)

        #logging.debug("%s io thread stopped" % type_str)

        try:
            self._io._connection.shutdown(how)
        except socket.error as e:
            if e.errno != errno.ENOTCONN:
                logging.warning("Error on socket shutdown(%d): %s" % (how, e))

        if after_shutdown:
            after_shutdown()

        with self._lock:
            self._io._thread_count -= 1

            if not self._io._thread_count:
                self._io._connected = False
Example #26
    def _do_create_sandbox_task(self, pck):
        def reschedule_if_need():
            with pck._lock:
                if pck._target_stop_mode:
                    return

                self._schedule(
                    pck,
                    self._start_create_sandbox_task,
                    timeout=self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

        with pck._lock:
            if pck._target_stop_mode:
                return

        def handle_unknown_error(e):
            with pck._lock:
                if not pck._target_stop_mode:
                    pck.set_error(str(e), False)
                self._mark_as_finished(pck, 'Unknown error while creating: %s' % e)

        # FIXME Invert logic: retry everything except permanent

        try:
            task = self._sbx_create_task(pck)
        except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
            reschedule_if_need()
            return
        except Exception as e:
            logging.exception('Failed to create sandbox task for %s' % pck.id)
            handle_unknown_error(e)
            return

        logging.debug('sbx:%d for %s created' % (task.id, pck.id))

        with pck._lock:
            if pck._target_stop_mode:
                return

        try:
            self._sbx_update_task(pck, task)

        except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
            reschedule_if_need()
            #self._start_delete_task(task.id)
            return
        except Exception as e:
            logging.exception('Failed to update (after start) task %s for %s' % (task.id, pck.id))
            handle_unknown_error(e)
            #self._start_delete_task(task.id)
            return

        with pck._lock:
            if pck._target_stop_mode:
                return

            # FIXME fork locking (mallformed pck state)
            pck._set_task_id(task.id)
            pck._set_state(RemotePacketState.STARTING)

            self._by_task_id[task.id] = pck

        self._await_task_status(pck)
        self._do_start_sandbox_task(pck)