def load_dir(self, directory, ext=".xml", url=None):
    """
    :param directory: A directory to walk.
    :param ext: Include files with this extension (default .xml)

    Traverse a directory tree looking for metadata. Files ending in the specified extension are included.
    Directories starting with '.' are excluded.
    """
    if url is None:
        url = directory
    log.debug("walking %s" % directory)
    if not directory in self.md:
        entities = []
        for top, dirs, files in os.walk(directory):
            for dn in dirs:
                if dn.startswith("."):
                    dirs.remove(dn)
            for nm in files:
                log.debug("found file %s" % nm)
                if nm.endswith(ext):
                    fn = os.path.join(top, nm)
                    try:
                        t = self.parse_metadata(fn, fail_on_error=True)  # local metadata is assumed to be ok
                        entities.extend(self.entities(t))
                    except Exception, ex:
                        log.error(ex)
        self.import_metadata(self.entity_set(entities, url))

def run(self, server):
    locked = False
    try:
        if self.lock.acquire(blocking=0):
            locked = True
            md = self.server.new_repository()
            for o in self.server.observers:
                md.subscribe(o)
            for p in server.plumbings:
                state = {'update': True, 'stats': {}}
                p.process(md, state)
                stats.update(state.get('stats', {}))
            if not md.sane():
                log.error("update produced insane active repository - will try again later...")
            with server.lock.writelock:
                log.debug("update produced new repository with %d entities" % md.index.size())
                server.md = md
                md.fire(type=EVENT_REPOSITORY_LIVE, size=md.index.size())
                stats['Repository Update Time'] = datetime.now()
                stats['Repository Size'] = md.index.size()
        else:
            log.error("another instance is running - will try again later...")
    except Exception, ex:
        traceback.print_exc(ex)

def lookup(self, key):
    log.debug("redis store lookup: %s" % key)
    if '+' in key:
        hk = hex_digest(key)
        if not self.rc.exists("%s#members" % hk):
            self.rc.zinterstore("%s#members" % hk, ["%s#members" % k for k in key.split('+')], 'min')
            self.rc.expire("%s#members" % hk, 30)  # XXX bad juju - only to keep clients from hammering
        return self.lookup(hk)
    m = re.match("^(.+)=(.+)$", key)
    if m:
        return self.lookup("{%s}%s" % (m.group(1), m.group(2)))
    m = re.match("^{(.+)}(.+)$", key)
    if m and ';' in m.group(2):
        hk = hex_digest(key)
        if not self.rc.exists("%s#members" % hk):
            self.rc.zunionstore("%s#members" % hk,
                                ["{%s}%s#members" % (m.group(1), v) for v in m.group(2).split(';')], 'min')
            self.rc.expire("%s#members" % hk, 30)  # XXX bad juju - only to keep clients from hammering
        return self.lookup(hk)
    elif self.rc.exists("%s#alias" % key):
        return self.lookup(self.rc.get("%s#alias" % key))
    elif self.rc.exists("%s#metadata" % key):
        return [self._get_metadata(key)]
    else:
        return self._members(key)

def periodic(self, stats):
    now = _now()
    stats['Last Periodic Maintenance'] = now
    log.debug("periodic maintenance...")
    self.rc.zremrangebyscore("members", "-inf", now)
    self._drop_empty_av("collections", "members", now)
    self._drop_empty_av("attributes", "values", now)

def parse_metadata(self, fn, key=None, base_url=None, fail_on_error=False, filter_invalid=True):
    """Parse a piece of XML and split it up into EntityDescriptor elements. Each such element
    is stored in the MDRepository instance.

    :param fn: a file-like object containing SAML metadata
    :param key: a certificate (file) or a SHA1 fingerprint to use for signature verification
    :param base_url: use this base url to resolve relative URLs for XInclude processing
    """
    try:
        t = etree.parse(fn, base_url=base_url, parser=etree.XMLParser(resolve_entities=False))
        t.xinclude()
        if filter_invalid:
            for e in t.findall('{%s}EntityDescriptor' % NS['md']):
                if not schema().validate(e):
                    error = _e(schema().error_log, m=base_url)
                    log.debug("removing '%s': schema validation failed (%s)" % (e.get('entityID'), error))
                    e.getparent().remove(e)
                    self.fire(type=EVENT_DROP_ENTITY, url=base_url, entityID=e.get('entityID'), error=error)
        else:
            # Having removed the invalid entities this should now never happen...
            schema().assertValid(t)
    except DocumentInvalid, ex:
        traceback.print_exc()
        log.debug("schema validation failed on '%s': %s" % (base_url, _e(ex.error_log, m=base_url)))
        raise MetadataException("schema validation failed")

def test_log_plain(self):
    try:
        logfile = StringIO()
        logger = logging.getLogger()
        old_handlers = []
        for hdl in logger.handlers:
            logger.removeHandler(hdl)
            old_handlers.append(hdl)
        test_handler = logging.StreamHandler(logfile)
        logger.addHandler(test_handler)
        logger.setLevel(logging.WARNING)
        log.info("info")
        log.warn("warn")
        log.warning("warning")
        log.error("error")
        log.critical("critical")
        log.debug("debug")
        lines = logfile.getvalue().split("\n")
        assert ("info" not in lines)
        assert ("warn" in lines)
        assert ("warning" in lines)
        assert ("critical" in lines)
        assert ("error" in lines)
        assert ("debug" not in lines)
    finally:
        logger.removeHandler(test_handler)
        for hdl in old_handlers:
            logger.addHandler(hdl)

def run(self, server):
    locked = False
    try:
        self.lock.acquire()
        locked = True
        md = self.server.md.clone()
        for p in server.plumbings:
            state = {'update': True, 'stats': {}}
            p.process(md, state)
            stats.update(state.get('stats', {}))
        with server.lock.writelock:
            log.debug("update produced new repository with %d entities" % server.md.store.size())
            server.md = md
            server.md.fire(type=EVENT_REPOSITORY_LIVE, size=server.md.store.size())
            stats['Repository Update Time'] = datetime.now()
            stats['Repository Size'] = server.md.store.size()
            self.nruns += 1
            stats['Updates Since Server Start'] = self.nruns
            if hasattr(self.server.md.store, 'periodic'):
                self.server.md.store.periodic(stats)
    except Exception, ex:
        log.error(ex.message)

def when(req, condition, *values):
    """
    Conditionally execute part of the pipeline.

    :param req: The request
    :param condition: The condition key
    :param values: The condition values
    :param opts: More Options (unused)
    :return: None

    The inner pipeline is executed if at least one of the condition values is present for the
    specified key in the request state.

    **Examples**

    .. code-block:: yaml

        - when foo
            - something
        - when bar bill
            - other

    The condition operates on the state: if 'foo' is present in the state (with any value),
    then the something branch is followed. If 'bar' is present in the state with the value
    'bill' then the other branch is followed.
    """
    log.debug("condition key: %s" % repr(condition))
    c = req.state.get(condition, None)
    log.debug("condition %s" % repr(c))
    if c is not None:
        if not values or _any(values, c):
            return Plumbing(pipeline=req.args, id="%s.when" % req.plumbing.id)._process(req)
    return req.t

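# Illustrative sketch only (not part of pyff): how the 'when' pipe reads the request state
# for the YAML examples in the docstring above. The state contents are hypothetical.
#
#   req.state = {'foo': 'anything'}  ->  "- when foo" runs its inner pipeline (key present, any value)
#   req.state = {'bar': 'bill'}      ->  "- when bar bill" runs its inner pipeline (key present with a listed value)
#   req.state = {'bar': 'bob'}       ->  "- when bar bill" is skipped and req.t is returned unchanged
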
def _process(self, req):
    """The inner request pipeline processor.

    :param req: The request to run through the pipeline
    """
    log.debug('Processing \n%s' % self)
    for p in self.pipeline:
        try:
            pipe, opts, name, args = loader.load_pipe(p)
            # log.debug("traversing pipe %s,%s,%s using %s" % (pipe,name,args,opts))
            if type(args) is str or type(args) is unicode:
                args = [args]
            if args is not None and type(args) is not dict and type(args) is not list and type(args) is not tuple:
                raise PipeException("Unknown argument type %s" % repr(args))
            req.args = args
            req.name = name
            ot = pipe(req, *opts)
            if ot is not None:
                req.t = ot
            # log.debug("new state after %s: %s (done=%s)" % (pipe,req.state,req.done))
            if req.done:
                break
        except PipeException, ex:
            log.error(ex)
            break

def test_log_syslog(self):
    with patch('syslog.syslog', new=self.dummy_syslog):
        try:
            logger = logging.getLogger()
            old_handlers = []
            for hdl in logger.handlers:
                logger.removeHandler(hdl)
                old_handlers.append(hdl)
            test_handler = SysLogLibHandler("USER")
            logger.addHandler(test_handler)
            logger.setLevel(logging.WARNING)
            log.info("info")
            log.warn("warn")
            log.warning("warning")
            log.error("error")
            log.critical("critical")
            log.debug("debug")
            lines = self._syslog.getvalue().split("\n")
            assert ("info" not in lines)
            assert ("12:warn" in lines)
            assert ("12:warning" in lines)
            assert ("10:critical" in lines)
            assert ("11:error" in lines)
            assert ("debug" not in lines)
        finally:
            logger.removeHandler(test_handler)
            for hdl in old_handlers:
                logger.addHandler(hdl)

def producer(q, resources, cache=self.metadata_cache_enabled): print resources for url, verify, id, tries in resources: log.debug("starting fetcher for '%s'" % url) thread = URLFetch(url, verify, id, enable_cache=cache, tries=tries) thread.start() q.put(thread, True)
def producer(q, resources, cache=self.metadata_cache_enabled): print resources for url, verify, id, tries in resources: log.debug("Starting fetcher for %s" % url) thread = URLFetch(url, verify, id, enable_cache=cache, tries=tries) thread.start() q.put(thread, True)
def run(self, server):
    locked = False
    try:
        self.lock.acquire()
        locked = True
        md = self.server.md.clone()
        for p in server.plumbings:
            state = {'update': True, 'stats': {}}
            p.process(md, state)
            stats.update(state.get('stats', {}))
        with server.lock.writelock:
            log.debug("update produced new repository with %d entities" % server.md.store.size())
            server.md = md
            server.md.fire(type=EVENT_REPOSITORY_LIVE, size=server.md.store.size())
            stats['Repository Update Time'] = datetime.now()
            stats['Repository Size'] = server.md.store.size()
            self.nruns += 1
            stats['Updates Since Server Start'] = self.nruns
            if hasattr(self.server.md.store, 'periodic'):
                self.server.md.store.periodic(stats)
    except Exception as ex:
        log.error(ex.message)
    finally:
        if locked:
            self.lock.release()

def _drop_empty_av(self, attr, tag, ts):
    an = "#%s" % attr
    for c in self.rc.smembers(an):
        tn = "%s#members" % c
        self.rc.zremrangebyscore(tn, "-inf", ts)
        if not self.rc.zcard(tn) > 0:
            log.debug("dropping empty %s %s" % (attr, c))
            self.rc.srem(an, c)

def default(self, *args, **kwargs): log.debug("default args: %s, kwargs: %s" % (repr(args), repr(kwargs))) if len(args) > 0 and args[0] in self.server.aliases: kwargs['pfx'] = args[0] if len(args) > 1: kwargs['path'] = args[1] return self.server.request(**kwargs) else: kwargs['pfx'] = None kwargs['path'] = "/" + "/".join(args) return self.server.request(**kwargs)
def default(self, *args, **kwargs): log.debug("request default: %s" % ",".join(args)) if len(args) > 0 and args[0] in self.server.aliases: kwargs['pfx'] = args[0] if len(args) > 1: kwargs['path'] = args[1] return self.server.request(**kwargs) else: log.debug("not an alias: %s" % "/".join(args)) kwargs['pfx'] = None kwargs['path'] = "/" + "/".join(args) return self.server.request(**kwargs)
def publish(req, *opts):
    """
    Publish the working document in XML form.

    :param req: The request
    :param opts: Options (unused)
    :return: None

    Publish takes one argument: path to a file where the document tree will be written.

    **Examples**

    .. code-block:: yaml

        - publish: /tmp/idp.xml
    """
    if req.t is None:
        raise PipeException("Empty document submitted for publication")

    if req.args is None:
        raise PipeException("publish must specify output")

    try:
        validate_document(req.t)
    except DocumentInvalid as ex:
        log.error(ex.error_log)
        raise PipeException("XML schema validation failed")

    output_file = None
    if type(req.args) is dict:
        output_file = req.args.get("output", None)
    else:
        output_file = req.args[0]
    if output_file is not None:
        output_file = output_file.strip()
        log.debug("publish {}".format(output_file))
        resource_name = output_file
        m = re.match(FILESPEC_REGEX, output_file)
        if m:
            output_file = m.group(1)
            resource_name = m.group(2)
        log.debug("output_file={}, resource_name={}".format(output_file, resource_name))
        out = output_file
        if os.path.isdir(output_file):
            out = "{}.xml".format(os.path.join(output_file, req.id))
        safe_write(out, dumptree(req.t))
        req.md.store.update(req.t, tid=resource_name)  # TODO maybe this is not the right thing to do anymore
    return req.t

def update(self, t, tid=None, ts=None, merge_strategy=None): # TODO: merge ? log.debug("redis store update: %s: %s" % (t, tid)) relt = root(t) ne = 0 if ts is None: ts = int( _now() + 3600 * 24 * 4) # 4 days is the arbitrary default expiration if relt.tag == "{%s}EntityDescriptor" % NS['md']: if tid is None: tid = relt.get('entityID') with self.rc.pipeline() as p: self.update_entity(relt, t, tid, ts, p) entity_id = relt.get("entityID") if entity_id is not None: self.membership("entities", entity_id, ts, p) for ea, eav in entity_attribute_dict(relt).iteritems(): for v in eav: # log.debug("%s=%s" % (ea, v)) self.membership("{%s}%s" % (ea, v), tid, ts, p) p.zadd("%s#values" % ea, v, ts) p.sadd("#attributes", ea) for hn in ('sha1', 'sha256', 'md5'): tid_hash = hex_digest(tid, hn) p.set("{%s}%s#alias" % (hn, tid_hash), tid) if ts is not None: p.expireat(tid_hash, ts) p.execute() ne += 1 elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']: if tid is None: tid = relt.get('Name') ts = self._expiration(relt) with self.rc.pipeline() as p: self.update_entity(relt, t, tid, ts, p) for e in iter_entities(t): ne += self.update(e, ts=ts) entity_id = e.get("entityID") if entity_id is not None: self.membership(tid, entity_id, ts, p) self.membership("entities", entity_id, ts, p) p.execute() else: raise ValueError("Bad metadata top-level element: '%s'" % root(t).tag) return ne
def resolve(self, system_url, public_id, context):
    """
    Resolves URIs using the resource API
    """
    log.debug("resolve SYSTEM URL '%s' for '%s'" % (system_url, public_id))
    path = system_url.split("/")
    fn = path[len(path) - 1]
    if pkg_resources.resource_exists(__name__, fn):
        return self.resolve_file(pkg_resources.resource_stream(__name__, fn), context)
    elif pkg_resources.resource_exists(__name__, "schema/%s" % fn):
        return self.resolve_file(pkg_resources.resource_stream(__name__, "schema/%s" % fn), context)
    else:
        raise ValueError("Unable to locate %s" % fn)

def _select_args(req):
    args = req.args
    log.debug("selecting using args: %s" % args)
    if args is None and 'select' in req.state:
        args = [req.state.get('select')]
    if args is None:
        args = req.md.store.collections()
    if args is None or not args:
        args = req.md.store.lookup('entities')
    if args is None or not args:
        args = []
    return args

def parse_metadata(self, fn, key=None, base_url=None, fail_on_error=False, filter_invalid=True,
                   validate=True, post=None):
    """Parse a piece of XML and split it up into EntityDescriptor elements. Each such element
    is stored in the MDRepository instance.

    :param fn: a file-like object containing SAML metadata
    :param key: a certificate (file) or a SHA1 fingerprint to use for signature verification
    :param base_url: use this base url to resolve relative URLs for XInclude processing
    :param fail_on_error: (default: False)
    :param filter_invalid: (default True) remove invalid EntityDescriptor elements rather than raise an error
    :param validate: (default: True) set to False to turn off all XML schema validation
    :param post: A callable that will be called to modify the parse-tree before any validation
        (but after xinclude processing)
    """
    try:
        t = etree.parse(fn, base_url=base_url, parser=etree.XMLParser(resolve_entities=False))
        t.xinclude()

        if key is not None:
            try:
                log.debug("verifying signature using %s" % key)
                refs = xmlsec.verified(t, key)
                if len(refs) != 1:
                    raise MetadataException("XML metadata contains %d signatures - exactly 1 is required" % len(refs))
                t = refs[0]  # prevent wrapping attacks
            except Exception, ex:
                tb = traceback.format_exc()
                print tb
                log.error(ex)
                return None

        if post is not None:
            t = post(t)

        if validate:
            if filter_invalid:
                for e in t.findall('{%s}EntityDescriptor' % NS['md']):
                    if not schema().validate(e):
                        error = _e(schema().error_log, m=base_url)
                        log.debug("removing '%s': schema validation failed (%s)" % (e.get('entityID'), error))
                        e.getparent().remove(e)
                        self.fire(type=EVENT_DROP_ENTITY, url=base_url, entityID=e.get('entityID'), error=error)
            else:
                # Having removed the invalid entities this should now never happen...
                schema().assertValid(t)

def _lookup(self, key):
    if key == 'entities' or key is None:
        return self.entities.values()

    if '+' in key:
        key = key.strip('+')
        # log.debug("lookup intersection of '%s'" % ' and '.join(key.split('+')))
        hits = None
        for f in key.split("+"):
            f = f.strip()
            if hits is None:
                hits = set(self._lookup(f))
            else:
                other = self._lookup(f)
                hits.intersection_update(other)
                if not hits:
                    log.debug("empty intersection")
                    return []
        if hits is not None and hits:
            return list(hits)
        else:
            return []

    m = re.match("^(.+)=(.+)$", key)
    if m:
        return self._lookup("{%s}%s" % (m.group(1), m.group(2).rstrip("/")))

    m = re.match("^{(.+)}(.+)$", key)
    if m:
        res = set()
        for v in m.group(2).rstrip("/").split(';'):
            # log.debug("... adding %s=%s" % (m.group(1),v))
            res.update(self._get_index(m.group(1), v))
        return list(res)

    # log.debug("trying null index lookup %s" % key)
    l = self._get_index("null", key)
    if l:
        return list(l)

    # log.debug("trying main index lookup %s: " % key)
    if key in self.md:
        # log.debug("entities list %s: %s" % (key, self.md[key]))
        lst = []
        for entityID in self.md[key]:
            lst.extend(self.lookup(entityID))
        return lst

    return []

def default(self, *args, **kwargs): """The default request processor unpacks base64-encoded reuqests and passes them onto the MDServer.request handler. """ log.debug("ROOT default args: %s, kwargs: %s" % (repr(args), repr(kwargs))) if len(args) > 0 and args[0] in self.server.aliases: kwargs['pfx'] = args[0] if len(args) > 1: kwargs['path'] = args[1] return self.server.request(**kwargs) else: kwargs['pfx'] = None kwargs['path'] = "/" + "/".join(args) return self.server.request(**kwargs)
def dispatch(self, path_info):
    # log.debug("EncodingDispatcher (%s) called with %s" % (",".join(self.prefixes), path_info))
    # vpath = path_info.replace("%2F", "/")
    vpath = path_info
    for prefix in self.prefixes:
        if vpath.startswith(prefix):
            log.debug("EncodingDispatcher (%s) called with %s" % (",".join(self.prefixes), path_info))
            vpath = path_info.replace("%2F", "/")
            plen = len(prefix)
            vpath = vpath[plen + 1:]
            npath = "%s/%s" % (prefix, self.enc(vpath))
            log.debug("EncodingDispatcher %s" % npath)
            return self.next_dispatcher(npath)
    return self.next_dispatcher(vpath)

def parse_metadata(self, fn, key=None, base_url=None, fail_on_error=False):
    """Parse a piece of XML and split it up into EntityDescriptor elements. Each such element
    is stored in the MDRepository instance.

    :param fn: a file-like object containing SAML metadata
    :param key: a certificate (file) or a SHA1 fingerprint to use for signature verification
    :param base_url: use this base url to resolve relative URLs for XInclude processing
    """
    try:
        t = etree.parse(fn, base_url=base_url, parser=etree.XMLParser(resolve_entities=False))
        t.xinclude()
        schema().assertValid(t)
    except DocumentInvalid, ex:
        log.debug(_e(ex.error_log))
        raise ValueError("XML schema validation failed")

def _d(x, do_split=True):
    if x is not None:
        x = x.strip()
    log.debug("_d(%s,%s)" % (x, do_split))
    if x is None or len(x) == 0:
        return None, None
    if x.startswith("{base64}"):
        x = x[8:].decode('base64')
    if do_split and '.' in x:
        (pth, dot, extn) = x.rpartition('.')
        assert (dot == '.')
        if extn in _ctypes:
            return pth, extn
    return x, None

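# Illustrative sketch only: expected behaviour of the _d() helper above, with hypothetical
# inputs (assuming 'xml' is a key in the surrounding _ctypes map):
#
#   _d("entities.xml")         -> ("entities", "xml")      # known extension is split off
#   _d("entities.xml", False)  -> ("entities.xml", None)   # splitting disabled
#   _d("")                     -> (None, None)             # empty input
#   _d("{base64}...")          -> the base64-decoded value is processed the same way
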
def load_pipe(self, d):
    """Return a triple callable,name,args of the pipe specified by the object d.

    :param d: The following alternatives for d are allowed:

    - d is a string (or unicode) in which case the pipe is named d called with None as args.
    - d is a dict of the form {name: args} (i.e. one key) in which case the pipe named *name* is called with args
    - d is an iterable (eg tuple or list) in which case d[0] is treated as the pipe name and d[1:] becomes the args
    """
    name = None
    args = None
    opts = []
    if type(d) is str or type(d) is unicode:
        name, opts = self._n(d)
    elif hasattr(d, '__iter__') and not type(d) is dict:
        if not len(d):
            raise PipeException("This does not look like a length of pipe... \n%s" % repr(d))
        name, opts = self._n(d[0])
    elif type(d) is dict:
        k = d.keys()[0]
        name, opts = self._n(k)
        args = d[k]
    else:
        raise PipeException("This does not look like a length of pipe... \n%s" % repr(d))

    if name is None:
        raise PipeException("Anonymous length of pipe... \n%s" % repr(d))

    mname = "pyff.pipes.builtins"
    fn = name
    if ':' in name:
        (mname, sep, fn) = name.rpartition(":")
    pm = mname
    if '.' in mname:
        (pm, sep, mn) = mname.rpartition('.')
        log.debug("importing %s from %s to find %s" % (mn, pm, fn))
    else:
        log.debug("importing %s from %s to find %s" % (mname, pm, fn))

    module = __import__(mname, fromlist=[pm])
    if hasattr(module, fn) and hasattr(getattr(module, fn), '__call__'):
        return getattr(module, fn), opts, fn, args
    elif hasattr(module, "_%s" % fn) and hasattr(getattr(module, "_%s" % fn), '__call__'):
        return getattr(module, "_%s" % fn), opts, fn, args
    else:
        raise PipeException("No such method %s in %s" % (fn, mname))

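# Illustrative sketch only: the three forms of pipe specification load_pipe() accepts,
# using hypothetical pipe names and arguments.
#
#   "emit application/xml"                  # a plain string: pipe name (plus opts), args is None
#   {"select": "http://md.example.com/a"}   # a one-key dict: the key names the pipe, the value becomes args
#   ["emit application/xml"]                # an iterable: the first element names the pipe
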
def update(self, t, tid=None, ts=None, merge_strategy=None): # TODO: merge ? log.debug("redis store update: %s: %s" % (t, tid)) relt = root(t) ne = 0 if ts is None: ts = int(_now() + 3600 * 24 * 4) # 4 days is the arbitrary default expiration if relt.tag == "{%s}EntityDescriptor" % NS['md']: if tid is None: tid = relt.get('entityID') with self.rc.pipeline() as p: self.update_entity(relt, t, tid, ts, p) entity_id = relt.get("entityID") if entity_id is not None: self.membership("entities", entity_id, ts, p) for ea, eav in entity_attribute_dict(relt).iteritems(): for v in eav: # log.debug("%s=%s" % (ea, v)) self.membership("{%s}%s" % (ea, v), tid, ts, p) p.zadd("%s#values" % ea, v, ts) p.sadd("#attributes", ea) for hn in ('sha1', 'sha256', 'md5'): tid_hash = hex_digest(tid, hn) p.set("{%s}%s#alias" % (hn, tid_hash), tid) if ts is not None: p.expireat(tid_hash, ts) p.execute() ne += 1 elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']: if tid is None: tid = relt.get('Name') ts = self._expiration(relt) with self.rc.pipeline() as p: self.update_entity(relt, t, tid, ts, p) for e in iter_entities(t): ne += self.update(e, ts=ts) entity_id = e.get("entityID") if entity_id is not None: self.membership(tid, entity_id, ts, p) self.membership("entities", entity_id, ts, p) p.execute() else: raise ValueError("Bad metadata top-level element: '%s'" % root(t).tag) return ne
def run(self): def _parse_date(str): if str is None: return datetime.new() return datetime(*parsedate(str)[:6]) self.start_time = clock() try: cache = httplib2.FileCache(".cache") headers = dict() if not self.enable_cache: headers['cache-control'] = 'no-cache' log.debug("fetching '%s'" % self.url) if self.url.startswith('file://'): path = self.url[7:] if not os.path.exists(path): raise IOError("file not found: %s" % path) with open(path, 'r') as fd: self.result = fd.read() self.cached = False self.date = datetime.now() self.last_modified = datetime.fromtimestamp(os.stat(path).st_mtime) else: h = httplib2.Http(cache=cache, timeout=60, disable_ssl_certificate_validation=True) # trust is done using signatures over here resp, content = h.request(self.url, headers=headers) self.resp = resp self.last_modified = _parse_date(resp.get('last-modified', resp.get('date', None))) self.date = _parse_date(resp['date']) if resp.status != 200: raise IOError(resp.reason) self.result = content self.cached = resp.fromcache log.debug("got %d bytes from '%s'" % (len(self.result), self.url)) except Exception, ex: #traceback.print_exc() #log.warn("unable to fetch '%s': %s" % (self.url, ex)) self.ex = ex self.result = None
def entity_set(self, entities, name, cacheDuration=None, validUntil=None, validate=True):
    """
    :param entities: a set of entity specifiers (lookup is used to find entities from this set)
    :param name: the @Name attribute
    :param cacheDuration: an XML timedelta expression, eg PT1H for 1hr
    :param validUntil: a relative time eg 2w 4d 1h for 2 weeks, 4 days and 1 hour from now.

    Produce an EntityDescriptors set from a list of entities. Optional Name, cacheDuration and validUntil are affixed.
    """
    attrs = dict(Name=name, nsmap=NS)
    if cacheDuration is not None:
        attrs['cacheDuration'] = cacheDuration
    if validUntil is not None:
        attrs['validUntil'] = validUntil
    t = etree.Element("{%s}EntitiesDescriptor" % NS['md'], **attrs)
    nent = 0
    seen = {}  # TODO make better de-duplication
    for member in entities:
        for ent in self.lookup(member):
            entityID = ent.get('entityID', None)
            if (ent is not None) and (entityID is not None) and (not seen.get(entityID, False)):
                t.append(deepcopy(ent))
                seen[entityID] = True
                nent += 1
    log.debug("selecting %d entities from %d entity set(s) before validation" % (nent, len(entities)))
    if not nent:
        return None
    if validate:
        try:
            schema().assertValid(t)
        except DocumentInvalid, ex:
            log.debug(_e(ex.error_log))

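# Illustrative call only (hypothetical entity IDs and feed name): wrap two known entities in a
# named EntitiesDescriptor that downstream consumers may cache for one hour.
#
#   md.entity_set(['https://idp.example.org/metadata', 'https://sp.example.org/metadata'],
#                 name='https://md.example.net/feed',
#                 cacheDuration='PT1H')
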
def load(req, *opts): """ General-purpose resource fetcher. :param req: The request :param opts: Options: [qsize <5>] [timeout <30>] [xrd <output xrd file>] :return: None Supports both remote and local resources. Fetching remote resources is done in parallell using threads. """ remote = [] for x in req.args: x = x.strip() log.debug("load %s" % x) m = re.match(FILESPEC_REGEX, x) rid = None if m: x = m.group(1) rid = m.group(2) r = x.split() assert len(r) in [1, 2], PipeException("Usage: load: resource [as url] [verification]") verify = None url = r[0] if len(r) == 2: verify = r[1] if "://" in url: log.debug("remote %s %s %s" % (url, verify, rid)) remote.append((url, verify, rid)) elif os.path.exists(url): if os.path.isdir(url): log.debug("local directory %s %s %s" % (url, verify, rid)) req.md.load_dir(url, url=rid) elif os.path.isfile(url): log.debug("local file %s %s %s" % (url, verify, rid)) remote.append(("file://%s" % url, verify, rid)) else: log.error("Unknown file type for load: %s" % r[0]) else: log.error("Don't know how to load '%s' as %s verified by %s" % (url, rid, verify)) opts = dict(zip(opts[::2], opts[1::2])) opts.setdefault('timeout', 30) opts.setdefault('qsize', 5) opts.setdefault('xrd', None) stats = dict() opts.setdefault('stats', stats) req.md.fetch_metadata(remote, **opts) req.state['stats']['Metadata URLs'] = stats
def emit(req, ctype="application/xml", *opts): """ Returns a UTF-8 encoded representation of the working tree. :param req: The request :param ctype: The mimetype of the response. :param opts: Options (not used) :return: unicode data Renders the working tree as text and sets the digest of the tree as the ETag. If the tree has already been rendered as text by an earlier step the text is returned as utf-8 encoded unicode. The mimetype (ctype) will be set in the Content-Type HTTP response header. **Examples** .. code-block:: yaml - emit application/xml: - break """ d = req.t log.debug("before getroot (%s) %s" % (type(d), repr(d))) if hasattr(d, 'getroot') and hasattr(d.getroot, '__call__'): nd = d.getroot() if nd is None: d = str(d) else: d = nd log.debug("after getroot (%s) %s" % (type(d), repr(d))) if hasattr(d, 'tag'): log.debug("has tag") d = dumptree(d) log.debug("after dumptree (%s) %s" % (type(d), repr(d))) if d is not None: m = hashlib.sha1() m.update(d) req.state['headers']['ETag'] = m.hexdigest() else: raise PipeException("Empty") req.state['headers']['Content-Type'] = ctype return unicode(d.decode('utf-8')).encode("utf-8")
def periodic(self, stats): now = _now() stats['Last Periodic Maintenance'] = now log.debug("periodic maintentance...") self.rc.zremrangebyscore("members", "-inf", now) for c in self.rc.smembers("#collections"): self.rc.zremrangebyscore("%s#members", "-inf", now) if not self.rc.zcard("%s#members" % c) > 0: log.debug("dropping empty collection %s" % c) self.rc.srem("#collections", c) for an in self.rc.smembers("#attributes"): self.rc.zremrangebyscore("%s#values", "-inf", now) if not self.rc.zcard("%s#members" % an) > 0: log.debug("dropping empty attribute %s" % an) self.rc.srem("#attributes", an)
def run(self): def _parse_date(str): if str is None: return datetime.new() return datetime(*parsedate(str)[:6]) self.start_time = clock() try: cache = httplib2.FileCache(".cache") if not self.enable_cache: log.debug("removing '%s' from cache" % self.url) cache.delete(self.url) log.debug("fetching '%s'" % self.url) if self.url.startswith('file://'): path = self.url[7:] if not os.path.exists(path): raise IOError("file not found: %s" % path) with open(path, 'r') as fd: self.result = fd.read() self.cached = False self.date = datetime.now() self.last_modified = datetime.fromtimestamp(os.stat(path).st_mtime) else: try: h = httplib2.Http(cache=cache, timeout=60, disable_ssl_certificate_validation=True) # trust is done using signatures over here resp, content = h.request(self.url) self.status = resp.status self.last_modified = _parse_date(resp.get('last-modified', resp.get('date', None))) if resp.status != 200: raise IOError(resp.reason) self.result = content self.cached = resp.fromcache except Exception, ex: resp = requests.get(self.url) self.status = resp.status_code self.last_modified = _parse_date(resp.headers['last-modified'] or resp.headers['date']) if resp.status_code != 200: raise IOError(httplib.responses[resp.status_code]) self.result = resp.content self.cached = False log.debug("got %d bytes from '%s'" % (len(self.result), self.url))
def run(self):
    def _parse_date(str):
        if str is None:
            return datetime.now()
        return datetime(*parsedate(str)[:6])

    self.start_time = clock()
    try:
        requests_cache.install_cache('.cache')
        if not self.enable_cache:
            log.debug("removing '%s' from cache" % self.url)
            requests_cache.get_cache().delete_url(self.url)

        log.debug("fetching '%s'" % self.url)
        if self.url.startswith('file://'):
            path = self.url[7:]
            if not os.path.exists(path):
                raise IOError("file not found: %s" % path)
            with open(path, 'r') as fd:
                self.result = fd.read()
                self.cached = False
                self.date = datetime.now()
                self.last_modified = datetime.fromtimestamp(os.stat(path).st_mtime)
        else:
            self.resp = requests.get(self.url, timeout=60, verify=False)
            self.last_modified = _parse_date(self.resp.headers.get('last-modified', self.resp.headers.get('date', None)))
            self.date = _parse_date(self.resp.headers['date'])
            self.cached = getattr(self.resp, 'from_cache', False)
            self.status = self.resp.status_code
            if self.resp.status_code != 200:
                raise IOError(self.resp.reason)
            self.result = self.resp.content
        log.debug("got %d bytes from '%s'" % (len(self.result), self.url))
    except Exception, ex:
        traceback.print_exc()
        log.warn("unable to fetch '%s': %s" % (self.url, ex))
        self.ex = ex
        self.result = None

def search(self, query, path=None, page=None, page_limit=10, entity_filter=None):
    """
    :param query: A string to search for.
    :param path: The repository collection (@Name) to search in - None for search in all collections
    :param page: When using paged search, the page index
    :param page_limit: When using paged search, the maximum number of entries per page
    :param entity_filter: A lookup expression used to filter the entries before search is done.

    Returns a list of dicts for each EntityDescriptor present in the metadata store such that
    any of the DisplayName, ServiceName, OrganizationName or OrganizationDisplayName elements
    match the query (as in contains the query as a substring).

    The dict in the list contains three items:

    :param label: A displayable string, useful as a UI label
    :param value: The entityID of the EntityDescriptor
    :param id: A sha1-ID of the entityID - on the form {sha1}<sha1-hash-of-entityID>
    """

    def _strings(e):
        lst = [e.get('entityID')]
        for attr in ['.//{%s}DisplayName' % NS['mdui'],
                     './/{%s}ServiceName' % NS['md'],
                     './/{%s}OrganizationDisplayName' % NS['md'],
                     './/{%s}OrganizationName' % NS['md']]:
            lst.extend([x.text.lower() for x in e.findall(attr)])
        return filter(lambda s: s is not None, lst)

    def _match(query, e):
        # log.debug("looking for %s in %s" % (query,",".join(_strings(e))))
        for qstr in _strings(e):
            if query in qstr:
                return True
        return False

    f = []
    if path is not None:
        f.append(path)
    if entity_filter is not None:
        f.append(entity_filter)
    mexpr = None
    if f:
        mexpr = "+".join(f)
    log.debug("mexpr: %s" % mexpr)

    res = [{'label': self.display(e),
            'value': e.get('entityID'),
            'id': pyff.index.hash_id(e, 'sha1')}
           for e in pyff.index.EntitySet(filter(lambda ent: _match(query, ent), self.lookup(mexpr)))]

    res.sort(key=lambda i: i['label'])
    log.debug(res)

    if page is not None:
        total = len(res)
        begin = (page - 1) * page_limit
        end = begin + page_limit
        more = (end < total)
        return res[begin:end], more, total
    else:
        return res

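# Illustrative sketch only: the shape of one entry in the list returned by search(), for a
# hypothetical entity.
#
#   {'label': 'Example University',
#    'value': 'https://idp.example.edu/idp/shibboleth',
#    'id': '{sha1}...'}
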
def consumer(q, njobs, stats, next_jobs=None, resolved=None):
    if next_jobs is None:
        next_jobs = []
    if resolved is None:
        resolved = set()
    nfinished = 0

    while nfinished < njobs:
        info = None
        try:
            log.debug("waiting for next thread to finish...")
            thread = q.get(True)
            thread.join(timeout)

            if thread.isAlive():
                raise MetadataException("thread timeout fetching '%s'" % thread.url)

            info = {
                'Time Spent': thread.time()
            }

            if thread.ex is not None:
                raise thread.ex
            else:
                if thread.result is not None:
                    info['Bytes'] = len(thread.result)
                else:
                    raise MetadataException("empty response fetching '%s'" % thread.url)
                info['Cached'] = thread.cached
                info['Date'] = str(thread.date)
                info['Last-Modified'] = str(thread.last_modified)
                info['Tries'] = thread.tries

            xml = thread.result.strip()

            if thread.status is not None:
                info['Status'] = thread.status

            t = self.parse_metadata(StringIO(xml), key=thread.verify, base_url=thread.url)
            if t is None:
                self.fire(type=EVENT_IMPORT_FAIL, url=thread.url)
                raise MetadataException("no valid metadata found at '%s'" % thread.url)

            relt = root(t)
            if relt.tag in ('{%s}XRD' % NS['xrd'], '{%s}XRDS' % NS['xrd']):
                log.debug("%s looks like an xrd document" % thread.url)
                for xrd in t.xpath("//xrd:XRD", namespaces=NS):
                    log.debug("xrd: %s" % xrd)
                    for link in xrd.findall(".//{%s}Link[@rel='%s']" % (NS['xrd'], NS['md'])):
                        url = link.get("href")
                        certs = xmlsec.CertDict(link)
                        fingerprints = certs.keys()
                        fp = None
                        if len(fingerprints) > 0:
                            fp = fingerprints[0]
                        log.debug("fingerprint: %s" % fp)
                        next_jobs.append((url, fp, url, 0))
            elif relt.tag in ('{%s}EntityDescriptor' % NS['md'], '{%s}EntitiesDescriptor' % NS['md']):
                cacheDuration = self.default_cache_duration
                if self.respect_cache_duration:
                    cacheDuration = root(t).get('cacheDuration', self.default_cache_duration)
                offset = duration2timedelta(cacheDuration)

                if thread.cached:
                    if thread.last_modified + offset < datetime.now() - duration2timedelta(self.min_cache_ttl):
                        raise MetadataException("cached metadata expired")
                    else:
                        log.debug("found cached metadata for '%s' (last-modified: %s)" % (thread.url, thread.last_modified))
                        ne = self.import_metadata(t, url=thread.id)
                        info['Number of Entities'] = ne
                else:
                    log.debug("got fresh metadata for '%s' (date: %s)" % (thread.url, thread.date))
                    ne = self.import_metadata(t, url=thread.id)
                    info['Number of Entities'] = ne
                info['Cache Expiration Time'] = str(thread.last_modified + offset)
                certs = xmlsec.CertDict(relt)
                cert = None
                if certs.values():
                    cert = certs.values()[0].strip()
                resolved.add((thread.url, cert))
            else:
                raise MetadataException("unknown metadata type for '%s' (%s)" % (thread.url, relt.tag))
        except Exception, ex:
            # traceback.print_exc(ex)
            log.warn("problem fetching '%s' (will retry): %s" % (thread.url, ex))
            if info is not None:
                info['Exception'] = ex
            if thread.tries < self.retry_limit:
                next_jobs.append((thread.url, thread.verify, thread.id, thread.tries + 1))
            else:
                # traceback.print_exc(ex)
                log.error("retry limit exceeded for %s (last error was: %s)" % (thread.url, ex))
        finally:

def _lookup(self, member, xp=None):
    """
    :param member: Either an entity, URL or a filter expression.

    Find a (set of) EntityDescriptor element(s) based on the specified 'member' expression.
    """

    def _hash(hn, strv):
        if hn == 'null':
            return strv
        if not hasattr(hashlib, hn):
            raise MetadataException("Unknown digest mechanism: '%s'" % hn)
        hash_m = getattr(hashlib, hn)
        h = hash_m()
        h.update(strv)
        return h.hexdigest()

    if xp is None:
        xp = "//md:EntityDescriptor"
    if member is None:
        lst = []
        for m in self.keys():
            log.debug("resolving %s filtered by %s" % (m, xp))
            lst.extend(self._lookup(m, xp))
        return lst
    elif hasattr(member, 'xpath'):
        log.debug("xpath filter %s <- %s" % (xp, member))
        return member.xpath(xp, namespaces=NS)
    elif type(member) is str or type(member) is unicode:
        log.debug("string lookup %s" % member)

        if '+' in member:
            member = member.strip('+')
            log.debug("lookup intersection of '%s'" % ' and '.join(member.split('+')))
            hits = None
            for f in member.split("+"):
                f = f.strip()
                if hits is None:
                    hits = set(self._lookup(f, xp))
                else:
                    other = self._lookup(f, xp)
                    hits.intersection_update(other)
                    if not hits:
                        log.debug("empty intersection")
                        return []
            if hits is not None and hits:
                return list(hits)
            else:
                return []

        if "!" in member:
            (src, xp) = member.split("!")
            if len(src) == 0:
                src = None
                log.debug("filtering using %s" % xp)
            else:
                log.debug("selecting %s filtered by %s" % (src, xp))
            return self._lookup(src, xp)

        m = re.match("^\{(.+)\}(.+)$", member)
        if m is not None:
            log.debug("attribute-value match: %s='%s'" % (m.group(1), m.group(2)))
            return self.index.get(m.group(1), m.group(2).rstrip("/"))

        m = re.match("^(.+)=(.+)$", member)
        if m is not None:
            log.debug("attribute-value match: %s='%s'" % (m.group(1), m.group(2)))
            return self.index.get(m.group(1), m.group(2).rstrip("/"))

        log.debug("basic lookup %s" % member)
        for idx in DIGESTS:
            e = self.index.get(idx, member)
            if e:
                log.debug("found %s in %s index" % (e, idx))
                return e

        e = self.get(member, None)
        if e is not None:
            return self._lookup(e, xp)

        e = self.get("%s.xml" % member, None)  # hackish but helps save people from their mistakes
        if e is not None:
            if not "://" in member:  # not an absolute URL
                log.warn("Found %s.xml as an alias - AVOID extensions in 'select as' statements" % member)
            return self._lookup(e, xp)

        if "://" in member:  # looks like a URL and wasn't an entity or collection - recurse away!
            log.debug("recursively fetching members from '%s'" % member)
            # note that this supports remote lists which may be more rope than is healthy
            return [self._lookup(line, xp) for line in urllib.urlopen(member).iterlines()]

        return []
    elif hasattr(member, '__iter__') and type(member) is not dict:
        if not len(member):
            member = self.keys()
        return [self._lookup(m, xp) for m in member]
    else:
        raise MetadataException("What about %s ??" % member)

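# Illustrative sketch only: member expressions handled by _lookup() above, with hypothetical values.
#
#   "https://md.example.org/feed"                    # a collection or entity key, or a URL to recurse into
#   "{urn:example:attribute}value"                   # attribute-value match against the index
#   "role=idp"                                       # shorthand attribute=value match
#   "collectionA+collectionB"                        # intersection of two lookups
#   "!//md:EntityDescriptor[md:IDPSSODescriptor]"    # empty source: xpath filter applied to everything
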
def request(self, **kwargs):
    """The main request processor. This code implements all rendering of metadata.
    """
    stats['MD Requests'] += 1
    if not self.ready:
        raise HTTPError(503, _("Service Unavailable (repository loading)"))

    pfx = kwargs.get('pfx', None)
    path = kwargs.get('path', None)
    content_type = kwargs.get('content_type', None)

    log.debug("MDServer pfx=%s, path=%s, content_type=%s" % (pfx, path, content_type))

    def _d(x, do_split=True):
        if x is not None:
            x = x.strip()
        log.debug("_d(%s,%s)" % (x, do_split))
        if x is None or len(x) == 0:
            return None, None
        if x.startswith("{base64}"):
            x = x[8:].decode('base64')
        if do_split and '.' in x:
            (pth, dot, extn) = x.rpartition('.')
            assert (dot == '.')
            if extn in _ctypes:
                return pth, extn
        return x, None

    _ctypes = {'xml': 'application/xml',
               'json': 'application/json',
               'htm': 'text/html',
               'html': 'text/html',
               'ds': 'text/html',
               's': 'application/json'}

    alias = None
    if pfx:
        alias = pfx
        pfx = self.aliases.get(alias, None)
        if pfx is None:
            raise NotFound()

    path, ext = _d(path, content_type is None)
    if pfx and path:
        q = "{%s}%s" % (pfx, path)
        path = "/%s/%s" % (alias, path)
    else:
        q = path

    if ext is not None:
        log.debug("request path: %s.%s, headers: %s" % (path, ext, cherrypy.request.headers))
    else:
        log.debug("request path: %s, headers: %s" % (path, cherrypy.request.headers))

    accept = {}
    if content_type is None:
        if ext is not None and ext in _ctypes:
            accept = {_ctypes[ext]: True}
        else:
            accept = MDServer.MediaAccept()
            if ext is not None:
                path = "%s.%s" % (path, ext)
    else:
        accept = {content_type: True}

    with self.lock.readlock:
        if ext == 'ds':
            pdict = dict()
            entity_id = kwargs.get('entityID', None)
            if entity_id is None:
                raise HTTPError(400, _("400 Bad Request - missing entityID"))

            pdict['sp'] = self.md.sha1_id(entity_id)
            e = self.md.store.lookup(entity_id)
            if e is None or len(e) == 0:
                raise HTTPError(404)

            if len(e) > 1:
                raise HTTPError(400, _("400 Bad Request - multiple matches for") + " %s" % entity_id)

            pdict['entity'] = self.md.simple_summary(e[0])
            if not path:
                pdict['search'] = "/search/"
                pdict['list'] = "/role/idp.json"
            else:
                pdict['search'] = "%s.s" % path
                pdict['list'] = "%s.json" % path

            cherrypy.response.headers['Content-Type'] = 'text/html'
            return render_template("ds.html", **pdict)
        elif ext == 's':
            paged = bool(kwargs.get('paged', False))
            query = kwargs.get('query', None)
            page = kwargs.get('page', 0)
            page_limit = kwargs.get('page_limit', 10)
            entity_filter = kwargs.get('entity_filter', None)
            related = kwargs.get('related', None)

            cherrypy.response.headers['Content-Type'] = 'application/json'

            if query is None:
                log.debug("empty query - creating one")
                query = [cherrypy.request.remote.ip]
                referrer = cherrypy.request.headers.get('referrer', None)
                if referrer is not None:
                    log.debug("including referrer: %s" % referrer)
                    url = urlparse.urlparse(referrer)
                    host = url.netloc
                    if ':' in url.netloc:
                        (host, port) = url.netloc.split(':')
                    for host_part in host.rstrip(self.psl.get_public_suffix(host)).split('.'):
                        if host_part is not None and len(host_part) > 0:
                            query.append(host_part)
                log.debug("created query: %s" % ",".join(query))

            if paged:
                res, more, total = self.md.search(query,
                                                  path=q,
                                                  page=int(page),
                                                  page_limit=int(page_limit),
                                                  entity_filter=entity_filter,
                                                  related=related)
                # log.debug(dumps({'entities': res, 'more': more, 'total': total}))
                return dumps({'entities': res, 'more': more, 'total': total})
            else:
                return dumps(self.md.search(query, path=q, entity_filter=entity_filter, related=related))
        elif accept.get('text/html'):
            if not q:
                if pfx:
                    title = pfx
                else:
                    title = _("Metadata By Attributes")
                return render_template("index.html",
                                       md=self.md,
                                       alias=alias,
                                       aliases=self.aliases,
                                       title=title)
            else:
                entities = self.md.lookup(q)
                if not entities:
                    raise NotFound()
                if len(entities) > 1:
                    return render_template("metadata.html", md=self.md, subheading=q, entities=entities)
                else:
                    entity = entities[0]
                    t = html.fragment_fromstring(unicode(xslt_transform(entity, "entity2html.xsl")))
                    for c_elt in t.findall(".//code[@role='entity']"):
                        c_txt = dumptree(entity)
                        parser = etree.XMLParser(remove_blank_text=True)
                        src = StringIO(c_txt)
                        tree = etree.parse(src, parser)
                        c_txt = dumptree(tree, pretty_print=True, xml_declaration=False).decode("utf-8")
                        p = c_elt.getparent()
                        p.remove(c_elt)
                        if p.text is not None:
                            p.text += c_txt
                        else:
                            p.text = c_txt
                    xml = dumptree(t, xml_declaration=False).decode('utf-8')
                    return render_template("entity.html",
                                           headline=self.md.display(entity).strip(),
                                           subheading=entity.get('entityID'),
                                           entity_id=entity.get('entityID'),
                                           content=xml)
        else:
            for p in self.plumbings:
                state = {'request': True,
                         'headers': {'Content-Type': 'text/xml'},
                         'accept': accept,
                         'url': cherrypy.url(relative=False),
                         'select': q,
                         'path': path,
                         'stats': {}}
                r = p.process(self.md, state=state)
                if r is not None:
                    cache_ttl = state.get('cache', 0)
                    log.debug("caching for %d seconds" % cache_ttl)
                    for k, v in state.get('headers', {}).iteritems():
                        cherrypy.response.headers[k] = v
                    caching.expires(secs=cache_ttl)
                    return r
    raise NotFound()

def load(req, *opts): """ General-purpose resource fetcher. :param req: The request :param opts: Options: See "Options" below :return: None Supports both remote and local resources. Fetching remote resources is done in parallel using threads. Note: When downloading remote files over HTTPS the TLS server certificate is not validated. Note: Default behaviour is to ignore metadata files or entities in MD files that cannot be loaded Options are put directly after "load". E.g: .. code-block:: yaml - load fail_on_error True filter_invalid False: - http://example.com/some_remote_metadata.xml - local_file.xml - /opt/directory_containing_md_files/ **Options** Defaults are marked with (*) - max_workers <5> : Number of parallel threads to use for loading MD files - timeout <120> : Socket timeout when downloading files - validate <True*|False> : When true downloaded metadata files are validated (schema validation) - fail_on_error <True|False*> : Control whether an error during download, parsing or (optional)validatation of a MD file does not abort processing of the pipeline. When true a failure aborts and causes pyff to exit with a non zero exit code. Otherwise errors are logged but ignored. - filter_invalid <True*|False> : Controls validation behaviour. When true Entities that fail validation are filtered I.e. are not loaded. When false the entire metadata file is either loaded, or not. fail_on_error controls whether failure to validating the entire MD file will abort processing of the pipeline. """ opts = dict(zip(opts[::2], opts[1::2])) opts.setdefault('timeout', 120) opts.setdefault('max_workers', 5) opts.setdefault('validate', "True") opts.setdefault('fail_on_error', "False") opts.setdefault('filter_invalid', "True") opts['validate'] = bool(strtobool(opts['validate'])) opts['fail_on_error'] = bool(strtobool(opts['fail_on_error'])) opts['filter_invalid'] = bool(strtobool(opts['filter_invalid'])) remotes = [] for x in req.args: x = x.strip() log.debug("load parsing '%s'" % x) r = x.split() assert len(r) in range(1, 7), PipeException( "Usage: load resource [as url] [[verify] verification] [via pipeline]" ) url = r.pop(0) params = dict() while len(r) > 0: elt = r.pop(0) if elt in ("as", "verify", "via"): if len(r) > 0: params[elt] = r.pop(0) else: raise PipeException( "Usage: load resource [as url] [[verify] verification] [via pipeline]" ) else: params['verify'] = elt for elt in ("verify", "via"): params.setdefault(elt, None) params.setdefault('as', url) post = None if params['via'] is not None: post = PipelineCallback(params['via'], req) if "://" in url: log.debug("load {} verify {} as {} via {}".format( url, params['verify'], params['as'], params['via'])) remotes.append((url, params['verify'], params['as'], post)) elif os.path.exists(url): if os.path.isdir(url): log.debug("directory {} verify {} as {} via {}".format( url, params['verify'], params['as'], params['via'])) req.md.load_dir(url, url=params['as'], validate=opts['validate'], post=post, fail_on_error=opts['fail_on_error'], filter_invalid=opts['filter_invalid']) elif os.path.isfile(url): log.debug("file {} verify {} as {} via {}".format( url, params['verify'], params['as'], params['via'])) remotes.append( ("file://%s" % url, params['verify'], params['as'], post)) else: error = "Unknown file type for load: '{}'".format(url) if opts['fail_on_error']: raise PipeException(error) log.error(error) else: error = "Don't know how to load '{}' as {} verify {} via {} (file does not exist?)".format( url, params['as'], params['verify'], params['via']) if 
opts['fail_on_error']: raise PipeException(error) log.error(error) req.md.fetch_metadata(remotes, **opts)
def __iter__(self):
    for e in self.lookup("entities"):
        log.debug("**** yield entityID=%s" % e.get('entityID'))
        yield e

            info['Exception'] = ex
            if thread.tries < self.retry_limit:
                next_jobs.append((thread.url, thread.verify, thread.id, thread.tries + 1))
            else:
                # traceback.print_exc(ex)
                log.error("retry limit exceeded for %s (last error was: %s)" % (thread.url, ex))
        finally:
            nfinished += 1
            if info is not None:
                stats[thread.url] = info

resources = [(url, verify, rid, 0) for url, verify, rid in resources]
resolved = set()
cache = True
while len(resources) > 0:
    log.debug("fetching %d resources (%s)" % (len(resources), repr(resources)))
    next_jobs = []
    q = Queue(qsize)
    prod_thread = threading.Thread(target=producer, args=(q, resources, cache))
    cons_thread = threading.Thread(target=consumer, args=(q, len(resources), stats, next_jobs, resolved))
    prod_thread.start()
    cons_thread.start()
    prod_thread.join()
    cons_thread.join()
    log.debug("after fetch: %d jobs to retry" % len(next_jobs))
    if len(next_jobs) > 0:
        resources = next_jobs
        cache = False
    else:
        resources = []