Example #1
    def _on_cloud_journal_event(self, ev):
        #logging.debug('before journal event %s' % ev)

        with self.lock:
            tag = self.inmem_items.get(ev.tag_name)

        if not tag:
            logging.warning('no object in inmem_items for cloud tag %s' % ev.tag_name)
            return

        if not tag.IsCloud(): # should never happen; effectively an assert
            logging.error('tag %s is not cloud tag in inmem_items but receives event from cloud' % ev.tag_name)
            return

        if tag.version >= ev.version:
            # TODO warn even on equal versions, but not for initial _subscribe_all
            if tag.version > ev.version:
                logging.warning('local version (%d) > journal version (%d) for tag %s' \
                    % (tag.version, ev.version, ev.tag_name))
            return

        def add_event(event, version, msg=None):
            self._repr_modifier.add((tag, event, msg, version))

        # FIXME here with warning, on state sync without it
        if ev.version > ev.last_reset_version and tag.version < ev.last_reset_version:
            logging.debug('overtaking reset %s.%d.%d for %d' % (ev.tag_name, ev.version, ev.last_reset_version, tag.version))
            add_event(ETagEvent.Reset, ev.last_reset_version, ev.last_reset_comment) # TODO last_reset_comment is wrong

        add_event(ev.event, ev.version, ev.last_reset_comment if ev.event == ETagEvent.Reset else None)

        logging.debug('after journal event for %s' % ev.tag_name)
Example #2
    def _RawTag(self, tagname, dont_create=False):
        if not tagname:
            raise ValueError("Empty tag name")

        tag = self.inmem_items.get(tagname, None)
        if tag:
            return tag

        if not self.db_file_opened:
            self.DBConnect()

        tagDescr = self.infile_items.get(tagname, None)

        if tagDescr:
            tag = cPickle.loads(tagDescr)

            if tag.IsCloud():
                if not self._has_cloud_setup():
                    logging.error("Tag %s is cloud on disk storage, but no setup for" \
                        " cloud in config. Restart server with proper setup!" % tagname)
            elif not tag.IsRemote(): # Hack for disable_remote_tags
                if self._is_cloud_tag_name(tag.GetFullname()):
                    logging.error("Tag %s is not cloud on disk storage, but must be." \
                        " Convert tags in disk storage!" % tagname)

            self._set_modify_func(tag)

        elif dont_create:
            return None

        else:
            tag = self._create_tag(tagname)

        return tag
Example #3
def cleanup_directory(directory, to_keep, max_removed_items_to_output=100):
    removed = []

    files = os.listdir(directory)

    for basename in files:
        if basename in to_keep:
            continue

        filename = os.path.join(directory, basename)

        remove = shutil.rmtree \
            if os.path.isdir(filename) and not os.path.islink(filename) \
            else os.unlink

        try:
            remove(filename)
        except Exception as e:
            logging.error("Can't remove %s: %s" % (filename, e))
        else:
            removed.append(basename)

    if removed:
        logging.info('%d entries removed from %s: %s' \
            % (len(removed), directory, ', '.join(removed[:max_removed_items_to_output])))
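A hypothetical invocation of the function above (the path and keep-list are illustrative, not from the source; `cleanup_directory` itself also needs `import os`, `import shutil` and `import logging`):

    cleanup_directory('/tmp/rem-scratch', to_keep={'current.lock', 'journal'},
                      max_removed_items_to_output=10)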
Example #4
    def RegisterTagEventForClient(self, clientname, tagname, event, message=None):
        logging.debug("%s remote tag %s on host %s", TagEventName[event], tagname, clientname)
        client = self.topologyInfo.GetClient(clientname, checkname=False)
        if client is None:
            logging.error("unknown client %s appeared", clientname)
            return False
        client.RegisterTagEvent("%s:%s" % (self.network_name, tagname), event, message)
Example #5
    def __reset_requests(self):
        # atomically take all pending requests and install a fresh queue
        rest, self._incoming.queue = self._incoming.queue, deque()

        for sock, _ in rest:
            try:
                _socket_send_reset(sock)
            except Exception as e:
                logging.error("Failed to send RST to RPC client: %s" % e)
Example #6
    def __put_request(self):
        try:
            request = self.get_request()
        except socket.error:
            logging.error("XMLRPCServer: socket error")
            return

        self._incoming.put(request)
Example #7
    def handle_request(self, timings):
        try:
            request = self.get_request()
        except socket.error:
            logging.error("XMLRPCServer: socket error")
            return

        timings.append(time.time())
        self._timings[id(request[0])] = timings

        self.requests.put(request)
Example #8
    def func(*args, **kwargs):
        penalty = 0.01
        _tries = tries
        while _tries:
            try:
                return f(*args, **kwargs)
            except tuple(exception_list) as e:
                time.sleep(penalty)
                penalty = min(penalty * penalty_factor, 5)
                _tries -= 1
                logging.error('Exception in %s, exception message: %s, attempts left: %s',
                    f.func_name, e.message, _tries)
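`func` is the inner wrapper of a retry decorator: `f`, `tries`, `penalty_factor` and `exception_list` come from the enclosing scope. A minimal sketch of that assumed enclosing factory (name, signature and defaults are illustrative, not from the source):

    import functools
    import logging
    import time

    def retries(tries=5, penalty_factor=2, exception_list=(Exception,)):
        def decorator(f):
            @functools.wraps(f)
            def func(*args, **kwargs):
                penalty = 0.01
                _tries = tries
                while _tries:
                    try:
                        return f(*args, **kwargs)
                    except tuple(exception_list) as e:
                        time.sleep(penalty)
                        penalty = min(penalty * penalty_factor, 5)
                        _tries -= 1
                        logging.error('Exception in %s: %s, attempts left: %s',
                                      f.__name__, e, _tries)
                # like the snippet above, falls through with None once attempts run out
            return func
        return decorator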
Example #9
    def _Communicate(self, f):
        self.Connect()

        try:
            f()

            self.errorsCnt = 0
            logging.debug("SendData to %s: ok", self.name)
        except (IOError, socket.timeout) as e:
            logging.warning("SendData to %s: failed: %s", self.name, e)
            self.lastError = e
            self.errorsCnt += 1
        except Exception as e:
            logging.error("SendData to %s: failed: %s", self.name, e)
Example #10
    def mark_as_too_old(self):
        with self.lock:
            if self.state not in ImplState.LimitedLifetimeSuspendedStates:
                raise NonTooOldMarkableStateError("mark_as_too_old called for %s" % self)

            if self.is_too_old:
                logging.error("mark_as_too_old called for already old %s" % self)
                return

            self._send_becomes_too_old_notification() # before _update_state

            self.is_too_old = True
            self.do_not_run = False # FIXME Clear or not? (change _check_add_files otherwise)

            self._update_state()
Example #11
    def _masks_reload_loop(self):
        while True:
            if self._masks_should_stop.wait(self._cloud_tags_masks_reload_interval):
                return

            try:
                match = self._load_masks()
            except Exception as e:
                logging.error("Failed to reload tags' masks from: %s" % e)
                continue

            if self._match_cloud_tag.count and not match.count:
                logging.warning("New cloud tags masks discarded: old count %d, new count %d" % (
                    self._match_cloud_tag.count, match.count))
                continue

            logging.debug("Cloud tag's masks reloaded. Regexp count: %d" % match.count)
            self._match_cloud_tag = match
Example #12
    def _fetch_resource_list(self, pck):
        try:
            ans = pck._sandbox.list_task_resources(pck._sandbox_task_id)
        except Exception:
            logging.exception('Failed to fetch task resources for %s' % pck)

            with pck._lock:
                if pck._target_stop_mode != StopMode.CANCEL:
                    self._schedule(pck, self._start_fetch_resource_list, self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

            return

        # TODO We don't have to _fetch_resource_list() in any TERMINATED task
        # state (e.g. ERROR, EXCEPTION)

        #import json
        #logging.debug('task #%s resources list answer: %s' % (pck._sandbox_task_id, json.dumps(ans, indent=3)))

        res_by_type = {
            resource['type']: resource
                for resource in ans['items']
        }

        #logging.debug('task #%s res_by_type: %s' % (pck._sandbox_task_id, json.dumps(res_by_type, indent=3)))

        with pck._lock:
            resource = res_by_type.get('REM_JOBPACKET_EXECUTION_SNAPSHOT')
            if not resource:
                logging.error("No REM_JOBPACKET_EXECUTION_SNAPSHOT resource in %s" % pck)
                err = "No REM_JOBPACKET_EXECUTION_SNAPSHOT resource"
                pck.set_error(err, False)
                self._mark_as_finished(pck, err)
                return

            pck._result_snapshot_resource_id = resource['id']

            if pck._final_state is None:
                pck._final_update_url = res_by_type['REM_JOBPACKET_GRAPH_UPDATE']['http']['proxy']
                pck._set_state(RemotePacketState.FETCHING_FINAL_UPDATE)
            else:
                self._mark_as_finished(pck, '_fetch_resource_list')
                return

        self._fetch_final_update(pck) # not under lock
Example #13
    def _is_cloud_tag_name(self, name):
        if self.IsRemoteTagName(name):
            return False

        try:
            if self._tags_random_cloudiness:
                return hash(name) % 3 == 0

            if self._all_tags_in_cloud:
                return True

            return self._match_cloud_tag(name)

        except Exception as e:
            now = time.time()
            # rate-limit: report mask-matching failures at most once per 5 seconds
            if now - self._last_tag_mask_error_report_time > 5:
                logging.error("Failed to match tag masks: %s" % e)
                self._last_tag_mask_error_report_time = now
            return False
Example #14
    def _write(self, data):
        timeout = 1.0
        max_timeout = 15.0

        while True:
            with self._db_lock:
                try:
                    if not self._db:
                        self._reopen()
                    self._db.write(data)
                    self._db.sync()

                except Exception as err:
                    self._db = None
                    logging.error("Can't write to journal (%d items left): %s" \
                        % (len(self._queue), err))
                else:
                    break

            # exponential back-off between reopen attempts, capped at max_timeout
            timeout = min(max_timeout, timeout * 2)
            time.sleep(timeout)
Example #15
    def convert_in_memory_tags_to_cloud_if_need(self):
        if not self._has_cloud_setup():
            return False

        updates = []

        for tag_name, tag in self.inmem_items.iteritems():
            must_be_cloud = self._is_cloud_tag_name(tag_name) \
                and not tag.IsRemote() # Hack for disable_remote_tags

            if must_be_cloud == tag.IsCloud():
                continue

            elif must_be_cloud:
                if tag.IsLocallySet():
                    updates.append((tag_name, ETagEvent.Set))

                self._make_tag_cloud(tag)
            else:
                logging.error("Tag %s is cloud, but must not be" % tag_name)

        if not updates:
            return False

        logging.info("before conversion %d tags to CloudTag's" % len(updates))

        cloud = self._create_cloud_client(lambda ev: None)

        try:
            for bucket in split_in_groups(updates, 100000): # TODO Fix cloud_client.update
                cloud.update(bucket).get()
        finally:
            try:
                cloud.stop()
            except:
                logging.exception("Failed to stop temporary cloud client")

        logging.info("after conversion %d tags to CloudTag's" % len(updates))

        return True
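`split_in_groups` is an assumed helper; its call site implies "yield buckets of at most N items". A minimal Python 2 sketch (matching the `iteritems`/`cPickle` vintage of the surrounding code):

    def split_in_groups(items, group_size):
        # yield consecutive chunks of at most group_size items
        items = list(items)
        for start in xrange(0, len(items), group_size):
            yield items[start:start + group_size]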
Example #16
    def convert_to_v2(self):
        for job in self.jobs.values():
            d = job.__dict__
            d.pop('packetRef', None)
            d.pop('callbacks', None)
            d.pop('nonpersistent_callbacks', None)
            job.max_try_count = d.pop('maxTryCount')
            job.pck_id = self.id

        pckd = self.__dict__

        state = pckd.pop('state')

        if state == ReprState.NONINITIALIZED:
            #self._recover_noninitialized(ctx)
            logging.error("Packet %s in NONINITIALIZED state" % self)

        self.do_not_run = bool(self.flags & self.PacketFlag.USER_SUSPEND)
        self.is_broken = bool(self.flags & self.PacketFlag.RCVR_ERROR)
        pckd.pop('flags')

        if state == ReprState.SUCCESSFULL and self.do_not_run:
            #logging.warning("SUCCESSFULL and USER_SUSPEND in %s" % self.id)
            self.do_not_run = False

        pckd.pop('streams') # FIXME Overhead: will re-concat multi-deps
        pckd.pop('_active_jobs', None)

        pckd.pop('edges') # constant graph
        succeed_jobs = pckd.pop('done')
        jobs_to_run = pckd.pop('leafs')

        #active_jobs_cache = set()
        pckd.pop('as_in_queue_working')

        child_to_parents = pckd.pop('waitJobs')

        def pop_failed_job():
            if not jobs_to_run:
                raise ValueError("jobs_to_run is empty to pop")

            for job_id in jobs_to_run:
                result = self.jobs[job_id].last_result()
                if not result:
                    continue
                if not result.IsSuccessfull():
                    jobs_to_run.remove(job_id)
                    return job_id

        jobs_to_retry = {}
        if state == ReprState.WAITING:
            if jobs_to_run:
                if self.waitingDeadline:
                    job_id = pop_failed_job() or jobs_to_run.pop()
                    jobs_to_retry[1] = (job_id, None, self.waitingDeadline)
                else:
                    logging.error("No waitingDeadline: %s" % self)
            else:
                logging.error("WAITING && !jobs_to_run: %s" % self)
        pckd.pop('waitingDeadline', None)

        failed_jobs = set()
        if state == ReprState.ERROR:
            job_id = pop_failed_job() if jobs_to_run else None
            if job_id:
                failed_jobs.add(job_id)
            elif not self.is_broken:
                logging.error("ERROR && !broken && !failed_jobs: %s" % self)

        working_jobs = {jid for jid, deps in child_to_parents.items() if not deps} \
            - (succeed_jobs | jobs_to_run \
                | set(descr[0] for descr in jobs_to_retry.values()) \
                | failed_jobs)

        jobs_to_run |= working_jobs

        if working_jobs:
            logging.debug('working_jobs for %s in %s: %s' % (self.id, state, working_jobs))

        self.done_tag = pckd.pop('done_indicator')
        self.job_done_tag = pckd.pop('job_done_indicator')
        self.all_dep_tags = pckd.pop('allTags')
        self.bin_links = pckd.pop('binLinks')
        self.is_resetable = pckd.pop('isResetable')

        self.wait_dep_tags = pckd.pop('waitTags')
        # if we are in SUSPENDED (RCVR_ERROR or not) and len(self.wait_dep_tags)
        #   -- we will wait tags (in previous packet.py impl)
        self.tags_awaited = not self.wait_dep_tags or state in _TAGS_AWAITED_STATES

        clean_state = pckd.pop('_clean_state') # TODO apply to _graph_executor

        queues = self._get_listeners_by_type((LocalQueue, LegacyQueue)) # FIXME Select one type
        queue = queues[0] if queues else None
        self.queue = queue
        if queue:
            self.DropCallbackListener(queue)

        self.__class__ = LocalPacket

        self.files_modified = False
        self.resources_modified = False
        self.files_sharing = None
        self.shared_files_resource_id = None
        self.resolved_releases = {}
        self.unresolved_release_count = 0

        self.destroying = state == ReprState.HISTORIED
        self.sbx_files = {}

        self._repr_state = None
        self.state = None

        self.finish_status = True if state == ReprState.SUCCESSFULL else \
            (False if state == ReprState.ERROR and not self.is_broken else None)

        self._saved_jobs_status = None
        self.last_sandbox_task_id = None
        self._graph_executor = DUMMY_GRAPH_EXECUTOR

        self._repr_state = state # to avoid duplicates in pck.history

        self.req_sandbox_host = None

        if state == ReprState.SUCCESSFULL:
            #pass
            g = self._create_job_graph_executor()
            self._saved_jobs_status = g.produce_detailed_status()
            #self._saved_jobs_status = self._produce_compressed_job_status(g)
            del g

        elif state == ReprState.HISTORIED:
            pass

        elif self.queue and (failed_jobs or succeed_jobs or jobs_to_retry):
            g = self._graph_executor = self._create_job_graph_executor()

            g.failed_jobs = failed_jobs
            g.succeed_jobs = succeed_jobs
            g.jobs_to_run = jobs_to_run
            g.jobs_to_retry = jobs_to_retry
            g.child_to_parents = child_to_parents

            g._clean_state = clean_state

            g.state = g._calc_state()

            # FIXME bug? waitJobs may not contain all jobs-with-parents
            _complete_waitJobs(self.id, g)

            try:
                _check_graph_consistence(g)
            except Exception as e:
                raise AssertionError("Inconsistent job graph in %s: %s" % (self.id, e))

        self.state = self._calc_state()
        self._update_repr_state()

        if self.queue:
            if self.has_pending_jobs():
                self.queue.packets_with_pending_jobs.add(self)
            self.queue.relocate_packet(self)

        if self._repr_state != state and not (state == ReprState.WORKABLE and self._repr_state == ReprState.PENDING):
            logging.warning("ReprState mismatch for %s: %s -> %s" % (self, state, self._repr_state))
Example #17
    def log_fail(error):
        logging.error('Failed to fetch %s for %s: %s' % (pck._final_update_url, pck, error))
Example #18
    def _status(self):
        history = self.History()
        total_time = history[-1][1] - history[0][1]
        wait_time = 0

        for ((state, start_time), (_, end_time)) in zip(history, history[1:] + [("", time.time())]):
            if state in (ReprState.SUSPENDED, ReprState.WAITING):
                wait_time += end_time - start_time

        result_tag = self.done_tag.name if self.done_tag else None

        waiting_time = None
        if self.state == ImplState.TIME_WAIT:
            deadline = self._graph_executor.get_nearest_retry_deadline()
            if deadline:
                waiting_time = max(int(deadline - time.time()), 0)
            else:
                logging.error("Packet %s in WAITING but has no get_nearest_retry_deadline" % self.id)

        all_tags = list(self.all_dep_tags)

        status = dict(name=self.name,
                      is_sandbox=isinstance(self, SandboxPacket),
                      last_sandbox_task_id=self.last_sandbox_task_id, # TODO History of tasks
                      last_global_error=self._graph_executor.get_global_error(),
                      resolved_releases=self.resolved_releases,
                      state=self._repr_state,
                      extended_state=self._get_extended_state(),
                      wait=list(self.wait_dep_tags),
                      all_tags=all_tags,
                      result_tag=result_tag,
                      priority=self.priority,
                      notify_emails=self.notify_emails,
                      history=history,
                      total_time=total_time,
                      wait_time=wait_time,
                      last_modified=history[-1][1],
                      waiting_time=waiting_time,
                      queue=self.queue.name if self.queue else None,
                      labels=self.user_labels,
                      oauth_login=self.oauth_login,
                     )

        extra_flags = set()

        if self.is_broken:
            extra_flags.add("can't-be-recovered")

        if self.do_not_run:
            extra_flags.add("manually-suspended")

        if self.is_too_old:
            extra_flags.add("too-old")

        if extra_flags:
            status["extra_flags"] = ";".join(extra_flags)

        if not self._is_dummy_graph_executor():
            jobs = self._graph_executor.produce_detailed_status() \
                or self._produce_clean_jobs_status()
        elif self._saved_jobs_status:
            jobs = self._saved_jobs_status
        else:
            jobs = self._produce_clean_jobs_status()

        for job in jobs:
            tag = self.job_done_tag.get(int(job['id'])) # id is str already
            if tag:
                job['result_tag'] = tag.name

        status["jobs"] = jobs

        return status