def validate_norm(fn, nn, version, it):
    """Benchmark the self-validation of a single normalizer definition.

    The definition is parsed, checked against ``normalizer.dtd`` and turned
    into a ``Normalizer``; ``norm.validate()`` is then timed over ``it``
    runs and the per-sample duration is recorded in the module-level
    ``result`` object.  Nothing is recorded when the name or the version
    found in the definition does not match the expected one, or when the
    definition carries no samples.

    @param fn: file name of the normalizer definition, relative to the
               module-level ``path``.
    @param nn: expected normalizer name (compared case-insensitively).
    @param version: expected normalizer version.
    @param it: number of executions passed to ``timeit.Timer.timeit``.
    """
    # ``norm`` has to be a module global: the timeit setup statement below
    # fetches it with "from __main__ import norm".
    global norm
    global result
    # open XML parser; the handle is released as soon as the tree is built
    with open(os.path.join(path, fn)) as definition_file:
        n = parse(definition_file)
    # validate DTD
    with open(os.path.join(path, 'normalizer.dtd')) as dtd_file:
        dtd = DTD(dtd_file)
    assert dtd.validate(n) == True
    # Create normalizer from xml definition
    norm = Normalizer(n,
                      os.path.join(path, 'common_tagTypes.xml'),
                      os.path.join(path, 'common_callBacks.xml'))
    # Explicit comparisons instead of assert/except AssertionError, so the
    # checks are still performed when Python runs with -O.
    if norm.name.lower() != nn.lower():
        print("\n[%s] and [%s] don't match" % (norm.name, nn))
        return
    if norm.name != nn:
        print("Warning, %s has name attribute set to %s" % (fn, norm.name))
    if norm.version != version:
        print("\n[%s] and [%s] don't match" % (norm.version, version))
        return
    # Count the samples themselves.  The previous expression took the length
    # of the list of per-pattern example lists, i.e. the number of patterns.
    # NOTE(review): assumes each pattern's ``examples`` supports len().
    samples_amount = sum(len(v.examples) for v in norm.patterns.values())
    if samples_amount <= 0:
        print("No samples to validate in %s" % fn)
        return
    # Time normalizer validation
    t = timeit.Timer("assert norm.validate() == True",
                     "from __main__ import norm")
    s = t.timeit(it)
    # Normalize result against number of validated samples
    s = s / float(samples_amount)
    # Add result
    result.add_res(norm.name, norm.version, norm.authors, s)
def normalize_samples(self, norm, name, version):
    """Check one normalizer definition: DTD validity, identity and samples.

    @param norm: file name of the normalizer definition, relative to
                 ``self.normalizer_path``.
    @param name: name the normalizer is expected to declare.
    @param version: version the normalizer is expected to declare.
    """
    # open parser; close the file once the tree has been built
    with open(os.path.join(self.normalizer_path, norm)) as definition_file:
        n = parse(definition_file)
    # validate DTD
    with open(os.path.join(self.normalizer_path,
                           'normalizer.dtd')) as dtd_file:
        dtd = DTD(dtd_file)
    dtd.assertValid(n)
    # Create normalizer from xml definition
    normalizer = Normalizer(
        n,
        os.path.join(self.normalizer_path, 'common_tagTypes.xml'),
        os.path.join(self.normalizer_path, 'common_callBacks.xml'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(normalizer.name, name)
    self.assertEqual(normalizer.version, version)
    self.assertTrue(normalizer.validate())
def normalize_samples(self, norm, name, version):
    """Check one normalizer definition: DTD validity, identity and samples.

    @param norm: file name of the normalizer definition, relative to
                 ``self.normalizer_path``.
    @param name: name the normalizer is expected to declare.
    @param version: version the normalizer is expected to declare.
    """
    # open parser; close the file once the tree has been built
    with open(os.path.join(self.normalizer_path, norm)) as definition_file:
        n = parse(definition_file)
    # validate DTD
    with open(os.path.join(self.normalizer_path,
                           'normalizer.dtd')) as dtd_file:
        dtd = DTD(dtd_file)
    self.assertTrue(dtd.validate(n))
    # Create normalizer from xml definition
    normalizer = Normalizer(
        n, os.path.join(self.normalizer_path, 'common_tagTypes.xml'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(normalizer.name, name)
    self.assertEqual(normalizer.version, version)
    self.assertTrue(normalizer.validate())
def xhtmlValidate(modelXbrl, elt):
    """Validate ``elt`` against the strict inline-XHTML DTD.

    The DTD file ``xhtml1-strict-ix.dtd`` is read from the controller's
    configuration directory on every call.  ``elt`` is converted with
    ``XmlUtil.ixToXhtml`` before validation; DTD errors and XML syntax
    errors are both reported through ``modelXbrl.error`` under the code
    ``xmlDTD:error``.
    """
    from lxml.etree import DTD, XMLSyntaxError
    configDir = modelXbrl.modelManager.cntlr.configDir
    with open(os.path.join(configDir, "xhtml1-strict-ix.dtd")) as dtdFile:
        dtd = DTD(dtdFile)
    try:
        # copy xhtml elements to fresh tree
        isValid = dtd.validate(XmlUtil.ixToXhtml(elt))
        if isValid:
            return
        modelXbrl.error(
            "xmlDTD:error",
            _("%(element)s error %(error)s"),
            modelObject=elt,
            element=elt.localName.title(),
            error=', '.join(e.message
                            for e in dtd.error_log.filter_from_errors()))
    except XMLSyntaxError as err:
        # Here the filtered error log itself is passed, not joined messages.
        modelXbrl.error(
            "xmlDTD:error",
            _("%(element)s error %(error)s"),
            modelObject=elt,
            element=elt.localName.title(),
            error=dtd.error_log.filter_from_errors())
def _validate_xml_tree(
        self, tree: etree._ElementTree, dtd: etree.DTD
) -> None:
    """Check ``tree`` against ``dtd`` and raise if it does not conform.

    No check is performed when ``dtd`` is falsy.

    Arguments:
        tree {etree._ElementTree} -- tree to verify
        dtd {etree.DTD} -- DTD used for validation

    Raises:
        ValidationError: If ``dtd.validate(tree)`` returns ``False``; the
            message is the string form of the DTD's filtered error log.
    """
    if not dtd:
        return
    outcome = dtd.validate(tree)
    if outcome is False:
        # TODO: convert the list to text and raise for each entry
        raise ValidationError(str(dtd.error_log.filter_from_errors()))

# TODO: work out simple metrics that can be counted during parsing
# TODO: define metric modules
# TODO: top-level class provides add_stats -> sub module references stats
# TODO: open/close functions for reset/finish checking
# TODO: raw metrics: number of fbs, fbtypes, ecc states, events, inputs,
#       outputs, ecc vertices
# TODO: further metrics: undefined datatypes, cycles in ecc, unreachable
#       ecc states, locks in ecc, ...
# TODO: each checker has a check function -> sorted list by prio ->
#       do_check iterates over all checkers
# TODO: define output format of msg, write list to file, maybe GUI output??
#       (super extra) would be useful for graphs
# TODO: output would be filename, element name, type (err/warn), message
# TODO: how to handle incomplete types
def test(xhtml_file: Path, dtd: DTD, schematron: Schematron) -> bool:
    """
    Check an XHTML file against a DTD and a Schematron, then check its
    links and images. Diagnostics are printed when a step fails.

    :param xhtml_file: the XHTML file to test
    :param dtd: the DTD
    :param schematron: the Schematron
    :return: a false value as soon as one step fails, otherwise the result
        of the link and image checks
    """
    if settings.verbose:
        print(xhtml_file)
    clear_error_log()
    xhtml_parser = XHTMLParser(dtd_validation=True, ns_clean=True)

    # Step 1: the file must be readable and well formed.
    try:
        html = parse(source=str(xhtml_file), parser=xhtml_parser).getroot()
    except IOError as io_error:
        print(f"{xhtml_file}: {io_error.strerror}", file=stderr)
        return False
    except XMLSyntaxError:
        print_error_log(xhtml_parser.error_log)
        return False

    # Step 2: DTD validation.
    dtd_ok = dtd.validate(html)
    if not dtd_ok:
        print_error_log(dtd.error_log)
        return False

    # Step 3: Schematron validation.
    schematron_ok = schematron.validate(html)
    if not schematron_ok:
        print_schematron_error_log(html, schematron)
        return False

    # Step 4: links first; images are only checked when the links pass.
    links_ok = test_links(xhtml_file, html)
    return links_ok and test_images(xhtml_file, html)
def loadDTD(modelXbrl):
    """Load ``edbody.dtd`` into the module-level ``edbodyDTD`` once.

    Does nothing when ``edbodyDTD`` is already set; otherwise the DTD is
    read from the controller's configuration directory.
    """
    global edbodyDTD
    if edbodyDTD is not None:
        return
    dtdPath = os.path.join(modelXbrl.modelManager.cntlr.configDir,
                           "edbody.dtd")
    with open(dtdPath) as dtdFile:
        edbodyDTD = DTD(dtdFile)
def loadDTD(modelXbrl):
    """(Re)load the module-level ``edbodyDTD`` for the current document kind.

    ``xhtml1-strict-ix.dtd`` is used when the model document type equals
    ``ModelDocumentTypeINLINEXBRL``, ``edbody.dtd`` otherwise.  The DTD is
    only re-read when that choice differs from the one remembered in
    ``isInlineDTD`` (or when nothing has been loaded yet).
    """
    global edbodyDTD, isInlineDTD
    initModelDocumentTypeReferences()
    _isInline = modelXbrl.modelDocument.type == ModelDocumentTypeINLINEXBRL
    needsLoad = isInlineDTD is None or isInlineDTD != _isInline
    if not needsLoad:
        return
    # Remember the choice before touching the file system.
    isInlineDTD = _isInline
    configDir = modelXbrl.modelManager.cntlr.configDir
    if _isInline:
        dtdName = "xhtml1-strict-ix.dtd"
    else:
        dtdName = "edbody.dtd"
    with open(os.path.join(configDir, dtdName)) as dtdFile:
        edbodyDTD = DTD(dtdFile)
def run(xhtml_files: List[Path], dtd_file: Path, images: bool, links: bool) -> bool:
    """
    Test every XHTML file against a DTD.

    :param xhtml_files: the XHTML files to test
    :param dtd_file: path to the DTD file
    :param images: passed through to ``test`` for every file
    :param links: passed through to ``test`` for every file
    :return: False if the DTD cannot be parsed or ``test`` fails for any
        file, otherwise True (also True when there are no files)
    """
    # Parse the DTD once up front so a broken DTD is reported a single time.
    try:
        DTD(str(dtd_file))
    except DTDParseError as e:
        print(e.error_log, file=stderr)
        clear_error_log()
        return False

    success = True
    for xhtml_file in xhtml_files:
        # if you reuse the parser on too many documents it gets confused
        parser = XHTMLParser(dtd_validation=True, ns_clean=True)
        dtd = DTD(str(dtd_file))
        if settings.verbose:
            # The original printed an undefined name here (NameError).
            print(xhtml_file)
        # Keep going after a failure so every file gets reported.
        if not test(xhtml_file, parser, dtd, images, links):
            success = False
    return success
def loadDTD(modelXbrl):
    """(Re)load the module-level ``edbodyDTD`` for the current document kind.

    ``ModelDocumentTypeINLINEXBRL`` is resolved lazily from
    ``arelle.ModelDocument.Type`` on first use.  ``xhtml1-strict-ix.dtd`` is
    used when the model document type equals it, ``edbody.dtd`` otherwise;
    the DTD is only re-read when that choice differs from the one remembered
    in ``isInlineDTD`` (or when nothing has been loaded yet).
    """
    global edbodyDTD, isInlineDTD, ModelDocumentTypeINLINEXBRL
    if ModelDocumentTypeINLINEXBRL is None:
        # Imported inside the function, on first use only.
        from arelle.ModelDocument import Type
        ModelDocumentTypeINLINEXBRL = Type.INLINEXBRL
    _isInline = modelXbrl.modelDocument.type == ModelDocumentTypeINLINEXBRL
    needsLoad = isInlineDTD is None or isInlineDTD != _isInline
    if not needsLoad:
        return
    # Remember the choice before touching the file system.
    isInlineDTD = _isInline
    configDir = modelXbrl.modelManager.cntlr.configDir
    if _isInline:
        dtdName = "xhtml1-strict-ix.dtd"
    else:
        dtdName = "edbody.dtd"
    with open(os.path.join(configDir, dtdName)) as dtdFile:
        edbodyDTD = DTD(dtdFile)
def open_dtd(dtd_file: Path) -> DTD:
    """
    Open and validate an XML DTD. Exit program on failure.

    :param dtd_file: path to a DTD file
    :return: A DTD object
    :raises SystemExit: with status 1 when the DTD cannot be parsed
    """
    try:
        return DTD(str(dtd_file))
    except DTDParseError as e:
        print(f"{dtd_file}:1: {e}", file=stderr)
        # Raise SystemExit directly: the ``exit`` helper is installed by the
        # ``site`` module for interactive use and may be missing.
        raise SystemExit(1)
def xhtmlValidate(modelXbrl, elt):
    """Validate ``elt`` against the strict inline-XHTML DTD.

    The DTD file ``xhtml1-strict-ix.dtd`` is read from the controller's
    configuration directory on every call.  ``elt`` is converted with
    ``XmlUtil.ixToXhtml`` before validation; DTD errors and XML syntax
    errors are both reported through ``modelXbrl.error`` under the code
    ``xmlDTD:error``.
    """
    from lxml.etree import DTD, XMLSyntaxError
    configDir = modelXbrl.modelManager.cntlr.configDir
    with open(os.path.join(configDir, "xhtml1-strict-ix.dtd")) as dtdFile:
        dtd = DTD(dtdFile)
    try:
        # copy xhtml elements to fresh tree
        isValid = dtd.validate(XmlUtil.ixToXhtml(elt))
        if isValid:
            return
        modelXbrl.error(
            "xmlDTD:error",
            _("%(element)s error %(error)s"),
            modelObject=elt,
            element=elt.localName.title(),
            error=', '.join(e.message
                            for e in dtd.error_log.filter_from_errors()))
    except XMLSyntaxError as err:
        # Here the filtered error log itself is passed, not joined messages.
        modelXbrl.error(
            "xmlDTD:error",
            _("%(element)s error %(error)s"),
            modelObject=elt,
            element=elt.localName.title(),
            error=dtd.error_log.filter_from_errors())
def __init__(self, normalizers_paths, active_normalizers=None):
    """
    Instantiates a flow manager. The default behavior is to activate every
    available normalizer.

    @param normalizers_paths: a list (or tuple) of absolute paths to the
    normalizer XML definitions to use or a just a single path as str.
    @param active_normalizers: a dictionary of active normalizers
    in the form {name-version : [True|False]}. Defaults to a new empty
    dictionary for each instance.
    @raise ValueError: if one of the paths is not a directory.
    @raise StandardError: if the DTD or one of the common library files
    was not found in any of the directories.
    """
    # ``list or tuple`` evaluates to ``list``, so a tuple of paths used to be
    # wrapped as if it were a single path; test against both types.
    if not isinstance(normalizers_paths, (list, tuple)):
        normalizers_paths = [normalizers_paths, ]
    self.normalizers_paths = normalizers_paths
    # A ``{}`` default would be one dict shared by every instance.
    if active_normalizers is None:
        active_normalizers = {}
    self.active_normalizers = active_normalizers
    self.dtd, self.ctt, self.ccb = None, None, None
    # Walk through paths for normalizer.dtd and common_tagTypes.xml
    # /!\ dtd file and common elements will be overridden if present in
    # many directories.
    for norm_path in self.normalizers_paths:
        if not os.path.isdir(norm_path):
            raise ValueError(
                "Invalid normalizer directory : %s" % norm_path)
        dtd = os.path.join(norm_path, 'normalizer.dtd')
        ctt = os.path.join(norm_path, 'common_tagTypes.xml')
        ccb = os.path.join(norm_path, 'common_callBacks.xml')
        if os.path.isfile(dtd):
            # Close the handle once the DTD object has been built.
            with open(dtd) as dtd_file:
                self.dtd = DTD(dtd_file)
        if os.path.isfile(ctt):
            self.ctt = ctt
        if os.path.isfile(ccb):
            self.ccb = ccb
    # Technically the common elements files should NOT be mandatory.
    # But many normalizers use them, so better safe than sorry.
    if not self.dtd or not self.ctt or not self.ccb:
        raise StandardError("Missing DTD or common library files")
    self._cache = []
    self.reload()
def test(xhtml_file: Path, parser: XHTMLParser, dtd: DTD, images: bool, links: bool) -> bool:
    """
    Parse an XHTML file, validate it against a DTD and optionally check
    its images and links. Problems are printed to stderr.

    :param xhtml_file: the XHTML file to test
    :param parser: the parser used to read the file
    :param dtd: the DTD the document must satisfy
    :param images: whether to run ``test_images`` on the document
    :param links: whether to run ``test_links`` on the document
    :return: a false value if parsing, validation or a requested check
        failed, otherwise a true value
    """
    outcome = False
    try:
        root = parse(source=str(xhtml_file), parser=parser).getroot()
        dtd.assertValid(root)
    except IOError as io_error:
        print(f"{xhtml_file}: {io_error.strerror}", file=stderr)
    except (XMLSyntaxError, DocumentInvalid) as lxml_error:
        print(str(lxml_error.error_log), file=stderr)
    else:
        outcome = True
        if images:
            outcome = test_images(xhtml_file, root)
        if links:
            # Links are skipped once the image check has failed.
            outcome = outcome and test_links(xhtml_file, root)
    finally:
        clear_error_log()
    return outcome
def validate_norm(fn, nn, version, it):
    """Benchmark the self-validation of a single normalizer definition.

    The definition is parsed, checked against ``normalizer.dtd`` and turned
    into a ``Normalizer``; ``norm.validate()`` is then timed over ``it``
    runs and the per-sample duration is recorded in the module-level
    ``result`` object.  Nothing is recorded when the name or the version
    found in the definition does not match the expected one, or when the
    definition carries no samples.

    @param fn: file name of the normalizer definition, relative to the
               module-level ``path``.
    @param nn: expected normalizer name (compared case-insensitively).
    @param version: expected normalizer version.
    @param it: number of executions passed to ``timeit.Timer.timeit``.
    """
    # ``norm`` has to be a module global: the timeit setup statement below
    # fetches it with "from __main__ import norm".
    global norm
    global result
    # open XML parser; the handle is released as soon as the tree is built
    with open(os.path.join(path, fn)) as definition_file:
        n = parse(definition_file)
    # validate DTD
    with open(os.path.join(path, 'normalizer.dtd')) as dtd_file:
        dtd = DTD(dtd_file)
    assert dtd.validate(n) == True
    # Create normalizer from xml definition
    norm = Normalizer(n,
                      os.path.join(path, 'common_tagTypes.xml'),
                      os.path.join(path, 'common_callBacks.xml'))
    # Explicit comparisons instead of assert/except AssertionError, so the
    # checks are still performed when Python runs with -O.
    if norm.name.lower() != nn.lower():
        print("\n[%s] and [%s] don't match" % (norm.name, nn))
        return
    if norm.name != nn:
        print("Warning, %s has name attribute set to %s" % (fn, norm.name))
    if norm.version != version:
        print("\n[%s] and [%s] don't match" % (norm.version, version))
        return
    # Count the samples themselves.  The previous expression took the length
    # of the list of per-pattern example lists, i.e. the number of patterns.
    # NOTE(review): assumes each pattern's ``examples`` supports len().
    samples_amount = sum(len(v.examples) for v in norm.patterns.values())
    if samples_amount <= 0:
        print("No samples to validate in %s" % fn)
        return
    # Time normalizer validation
    t = timeit.Timer("assert norm.validate() == True",
                     "from __main__ import norm")
    s = t.timeit(it)
    # Normalize result against number of validated samples
    s = s / float(samples_amount)
    # Add result
    result.add_res(norm.name, norm.version, norm.authors, s)
def __init__(self, normalizers_path, active_normalizers=None):
    """
    Instantiates a flow manager. The default behavior is to activate every
    available normalizer.

    @param normalizers_path: absolute path to the normalizer XML definitions
    to use; ``normalizer.dtd`` is read from this directory.
    @param active_normalizers: a dictionary of active normalizers
    in the form {name: [True|False]}. Defaults to a new empty dictionary
    for each instance.
    """
    self.normalizers_path = normalizers_path
    # A ``{}`` default would be one dict shared by every instance.
    if active_normalizers is None:
        active_normalizers = {}
    self.active_normalizers = active_normalizers
    # Close the handle once the DTD object has been built.
    with open(os.path.join(self.normalizers_path,
                           "normalizer.dtd")) as dtd_file:
        self.dtd = DTD(dtd_file)
    self._cache = []
    self.reload()
def __init__(self, normalizers_paths, active_normalizers=None):
    """
    Instantiates a flow manager. The default behavior is to activate every
    available normalizer.

    @param normalizers_paths: a list (or tuple) of absolute paths to the
    normalizer XML definitions to use or a just a single path as str.
    @param active_normalizers: a dictionary of active normalizers
    in the form {name-version : [True|False]}. Defaults to a new empty
    dictionary for each instance.
    @raise ValueError: if one of the paths is not a directory.
    @raise StandardError: if the DTD or one of the common library files
    was not found in any of the directories.
    """
    # ``list or tuple`` evaluates to ``list``, so a tuple of paths used to be
    # wrapped as if it were a single path; test against both types.
    if not isinstance(normalizers_paths, (list, tuple)):
        normalizers_paths = [normalizers_paths, ]
    self.normalizers_paths = normalizers_paths
    # A ``{}`` default would be one dict shared by every instance.
    if active_normalizers is None:
        active_normalizers = {}
    self.active_normalizers = active_normalizers
    self.dtd, self.ctt, self.ccb = None, None, None
    # Walk through paths for normalizer.dtd and common_tagTypes.xml
    # /!\ dtd file and common elements will be overridden if present in
    # many directories.
    for norm_path in self.normalizers_paths:
        if not os.path.isdir(norm_path):
            raise ValueError(
                "Invalid normalizer directory : %s" % norm_path)
        dtd = os.path.join(norm_path, 'normalizer.dtd')
        ctt = os.path.join(norm_path, 'common_tagTypes.xml')
        ccb = os.path.join(norm_path, 'common_callBacks.xml')
        if os.path.isfile(dtd):
            # Close the handle once the DTD object has been built.
            with open(dtd) as dtd_file:
                self.dtd = DTD(dtd_file)
        if os.path.isfile(ctt):
            self.ctt = ctt
        if os.path.isfile(ccb):
            self.ccb = ccb
    # Technically the common elements files should NOT be mandatory.
    # But many normalizers use them, so better safe than sorry.
    if not self.dtd or not self.ctt or not self.ccb:
        raise StandardError("Missing DTD or common library files")
    self._cache = []
    self.reload()
def __init__(self, normalizers_paths, active_normalizers=None):
    """
    Instantiates a flow manager. The default behavior is to activate every
    available normalizer.

    @param normalizers_paths: a list (or tuple) of absolute paths to the
    normalizer XML definitions to use or a just a single path as str.
    @param active_normalizers: a dictionary of active normalizers
    in the form {name: [True|False]}. Defaults to a new empty dictionary
    for each instance.
    """
    # ``list or tuple`` evaluates to ``list``, so a tuple of paths used to be
    # wrapped as if it were a single path; test against both types.
    if not isinstance(normalizers_paths, (list, tuple)):
        normalizers_paths = [normalizers_paths, ]
    self.normalizers_paths = normalizers_paths
    # A ``{}`` default would be one dict shared by every instance.
    if active_normalizers is None:
        active_normalizers = {}
    self.active_normalizers = active_normalizers
    # Make sure both attributes exist even when no directory provides the
    # corresponding file.
    self.dtd, self.ctt = None, None
    # Walk through paths for normalizer.dtd and common_tagTypes.xml; a later
    # directory overrides what an earlier one provided.
    for norm_path in self.normalizers_paths:
        dtd = os.path.join(norm_path, 'normalizer.dtd')
        ctt = os.path.join(norm_path, 'common_tagTypes.xml')
        if os.path.isfile(dtd):
            # Close the handle once the DTD object has been built.
            with open(dtd) as dtd_file:
                self.dtd = DTD(dtd_file)
        if os.path.isfile(ctt):
            self.ctt = ctt
    self._cache = []
    self.reload()
def xhtmlValidate(modelXbrl, elt):
    """Validate an inline XBRL document element as XHTML.

    The tree under ``elt`` is copied into a fresh lxml tree without the
    inline XBRL elements; while copying, attributes and ix hierarchy
    constraints are checked and problems are reported through
    ``modelXbrl.error``.  The copy is then validated against the DTD selected
    by the document's ix namespace (``XHTML_DTD[elt.modelDocument.ixNS]``),
    and for the "EFM" validation type
    ``ValidateFilingText.validateHtmlContent`` is run as well.
    """
    from lxml.etree import DTD, XMLSyntaxError
    from arelle import FunctionIxt
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    # find ix version for messages
    _ixNS = elt.modelDocument.ixNS
    _xhtmlDTD = XHTML_DTD[_ixNS]
    _customTransforms = modelXbrl.modelManager.customTransforms or {}

    def checkAttribute(elt, isIxElt, attrTag, attrValue):
        """Check one attribute of ``elt``; ``isIxElt`` marks ix elements."""
        ixEltAttrDefs = ixAttrDefined.get(elt.namespaceURI, EMPTYDICT).get(elt.localName, ())
        # Split a "{namespace}local" tag; unqualified tags have ns None.
        if attrTag.startswith("{"):
            ns, sep, localName = attrTag[1:].partition("}")
        else:
            ns = None
            localName = attrTag
        if ns is not None and ns not in XbrlConst.ixbrlAll and attrTag not in ixEltAttrDefs:
            if ns == XbrlConst.xsi:
                pass # xsi attributes are always allowed
            elif isIxElt:
                allowedNs = allowedNonIxAttrNS.get(elt.localName, None)
                if allowedNs != "##other" and ns != allowedNs:
                    modelXbrl.error(ixMsgCode("qualifiedAttributeNotExpected", elt),
                        _("Inline XBRL element %(element)s has qualified attribute %(name)s"),
                        modelObject=elt, element=str(elt.elementQname), name=attrTag)
                if ns == XbrlConst.xbrli and elt.localName in {
                    "fraction", "nonFraction", "nonNumeric", "references", "relationship", "tuple"}:
                    modelXbrl.error(ixMsgCode("qualifiedAttributeDisallowed", elt),
                        _("Inline XBRL element %(element)s has disallowed attribute %(name)s"),
                        modelObject=elt, element=str(elt.elementQname), name=attrTag)
            else:
                # NOTE(review): the enclosing condition already excludes ix
                # namespaces, so this first test cannot be true here.
                if ns in XbrlConst.ixbrlAll:
                    modelXbrl.error(ixMsgCode("inlineAttributeMisplaced", elt, name="other"),
                        _("Inline XBRL attributes are not allowed on html elements: ix:%(name)s"),
                        modelObject=elt, name=localName)
                elif ns not in {XbrlConst.xml, XbrlConst.xsi, XbrlConst.xhtml}:
                    modelXbrl.error(ixMsgCode("extensionAttributeMisplaced", ns=_ixNS),
                        _("Extension attributes are not allowed on html elements: %(tag)s"),
                        modelObject=elt, tag=attrTag)
        elif isIxElt:
            # A KeyError (unknown attribute type, or the explicit raise
            # below) is reported as "attributeNotExpected".
            try:
                _xsdType = ixAttrType[elt.namespaceURI][localName]
                if isinstance(_xsdType, dict):
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl, elt, attrTag, baseXsdType, attrValue, facets=facets)

                if not (attrTag in ixEltAttrDefs or
                        (localName in ixEltAttrDefs and (not ns or ns in XbrlConst.ixbrlAll))):
                    raise KeyError
                # Full list of xbrli attribute names minus those allowed on
                # this particular element.
                disallowedXbrliAttrs = ({"scheme", "periodType", "balance", "contextRef", "unitRef", "precision", "decimals"} -
                                        {"fraction": {"contextRef", "unitRef"},
                                         "nonFraction": {"contextRef", "unitRef", "decimals", "precision"},
                                         "nonNumeric": {"contextRef"}}.get(elt.localName, set()))
                disallowedAttrs = set(a for a in disallowedXbrliAttrs if elt.get(a) is not None)
                if disallowedAttrs:
                    modelXbrl.error(ixMsgCode("inlineElementAttributes",elt),
                        _("Inline XBRL element %(element)s has disallowed attributes %(attributes)s"),
                        modelObject=elt, element=elt.elementQname, attributes=", ".join(disallowedAttrs))
            except KeyError:
                modelXbrl.error(ixMsgCode("attributeNotExpected",elt),
                    _("Attribute %(attribute)s is not expected on element ix:%(element)s"),
                    modelObject=elt, attribute=attrTag, element=elt.localName)
        elif ns is None:
            # Unqualified attribute on a non-ix element: validate its value
            # when a type is registered for it.
            _xsdType = htmlAttrType.get(localName)
            if _xsdType is not None:
                if isinstance(_xsdType, dict):
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl, elt, attrTag, baseXsdType, attrValue, facets=facets)

    def checkHierarchyConstraints(elt):
        """Check the structural constraints of one ix element."""
        constraints = ixHierarchyConstraints.get(elt.localName)
        if constraints:
            for _rel, names in constraints:
                # First character: requirement code; remainder: relation.
                reqt = _rel[0]
                rel = _rel[1:]
                if reqt in ('&', '^', '1'):
                    nameFilter = ('*',)
                else:
                    nameFilter = names
                if nameFilter == ('*',):
                    namespaceFilter = namespacePrefix = '*'
                elif len(nameFilter) == 1 and "}" in nameFilter[0] and nameFilter[0][0] == "{":
                    # Single "{namespace}local" name.
                    namespaceFilter, _sep, nameFilter = nameFilter[0][1:].partition("}")
                    namespacePrefix = XmlUtil.xmlnsprefix(elt,namespaceFilter)
                else:
                    namespaceFilter = elt.namespaceURI
                    namespacePrefix = elt.prefix
                relations = {"ancestor": XmlUtil.ancestor,
                             "parent": XmlUtil.parent,
                             "child-choice": XmlUtil.children,
                             "child-sequence": XmlUtil.children,
                             "child-or-text": XmlUtil.children,
                             "descendant": XmlUtil.descendants}[rel](
                            elt,
                            namespaceFilter,
                            nameFilter)
                if rel in ("ancestor", "parent"):
                    # These lookups are treated as returning one element or
                    # None; normalize to a list.
                    if relations is None:
                        relations = []
                    else:
                        relations = [relations]
                if rel == "child-or-text":
                    relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False, ixResolveUris=False)
                issue = ''
                if reqt in ('^',):
                    if not any(r.localName in names and r.namespaceURI == elt.namespaceURI
                               for r in relations):
                        issue = " and is missing one of " + ', '.join(names)
                if reqt in ('1',) and not elt.isNil:
                    if sum(r.localName in names and r.namespaceURI == elt.namespaceURI
                           for r in relations) != 1:
                        issue = " and must have exactly one of " + ', '.join(names)
                if reqt in ('&', '^'):
                    disallowed = [str(r.elementQname)
                                  for r in relations
                                  if not (r.tag in names or
                                          (r.localName in names and r.namespaceURI == elt.namespaceURI))]
                    if disallowed:
                        issue += " and may not have " + ", ".join(disallowed)
                    elif rel == "child-sequence":
                        # Children must appear in the order given by names.
                        sequencePosition = 0
                        for r in relations:
                            rPos = names.index(str(r.localName))
                            if rPos < sequencePosition:
                                issue += " and is out of sequence: " + str(r.elementQname)
                            else:
                                sequencePosition = rPos
                if reqt == '?' and len(relations) > 1:
                    issue = " may only have 0 or 1 but {0} present ".format(len(relations))
                if reqt == '+' and len(relations) == 0:
                    issue = " must have at least 1 but none present "
                disallowedChildText = bool(reqt == '&' and
                                           rel in ("child-sequence", "child-choice")
                                           and elt.textValue.strip())
                if ((reqt == '+' and not relations) or
                    (reqt == '-' and relations) or
                    (issue) or disallowedChildText):
                    # NOTE(review): the "other" default is a str, so a
                    # localName missing from ixSect would fail on
                    # ["constraint"] - confirm that cannot happen here.
                    code = "{}:{}".format(ixSect[elt.namespaceURI].get(elt.localName,"other")["constraint"], {
                           'ancestor': "ancestorNode",
                           'parent': "parentNode",
                           'child-choice': "childNodes",
                           'child-sequence': "childNodes",
                           'child-or-text': "childNodesOrText",
                           'descendant': "descendantNodes"}[rel] + {
                            '+': "Required",
                            '-': "Disallowed",
                            '&': "Allowed",
                            '^': "Specified",
                            '1': "Specified"}.get(reqt, "Specified"))
                    msg = _("Inline XBRL ix:{0} {1} {2} {3} {4} element{5}").format(
                        elt.localName,
                        {'+': "must", '-': "may not", '&': "may only", '?': "may", '^': "must", '1': "must"}[reqt],
                        {'ancestor': "be nested in",
                         'parent': "have parent",
                         'child-choice': "have child",
                         'child-sequence': "have child",
                         'child-or-text': "have child or text,",
                         'descendant': "have as descendant"}[rel],
                        '' if rel == 'child-or-text' else
                        ', '.join(str(r.elementQname) for r in relations)
                        if names == ('*',) and relations else
                        ", ".join("{}:{}".format(namespacePrefix, n) for n in names),
                        issue,
                        " and no child text (\"{}\")".format(elt.textValue.strip()[:32]) if disallowedChildText else "")
                    modelXbrl.error(code, msg,
                                    modelObject=[elt] + relations, requirement=reqt,
                                    messageCodes=("ix{ver.sect}:ancestorNode{Required|Disallowed}",
                                                  "ix{ver.sect}:childNodesOrTextRequired",
                                                  "ix{ver.sect}:childNodes{Required|Disallowed|Allowed}",
                                                  "ix{ver.sect}:descendantNodesDisallowed",
                                                  "ix{ver.sect}:parentNodeRequired"))
        # other static element checks (that don't require a complete object model, context, units, etc
        if elt.localName == "nonFraction":
            childElts = XmlUtil.children(elt, '*', '*')
            hasText = (elt.text or "") or any((childElt.tail or "") for childElt in childElts)
            if elt.isNil:
                ancestorNonFractions = XmlUtil.ancestors(elt, _ixNS, elt.localName)
                if ancestorNonFractions:
                    modelXbrl.error(ixMsgCode("nonFractionAncestors", elt),
                        _("Fact %(fact)s is a nil nonFraction and MUST not have an ancestor ix:nonFraction"),
                        modelObject=[elt] + ancestorNonFractions, fact=elt.qname)
                if childElts or hasText:
                    modelXbrl.error(ixMsgCode("nonFractionTextAndElementChildren", elt),
                        _("Fact %(fact)s is a nil nonFraction and MUST not have an child elements or text"),
                        modelObject=[elt] + childElts, fact=elt.qname)
                    elt.setInvalid() # prevent further validation or cascading errors
            else:
                if ((childElts and (len(childElts) != 1 or childElts[0].namespaceURI != _ixNS or childElts[0].localName != "nonFraction")) or
                    (childElts and hasText)):
                    modelXbrl.error(ixMsgCode("nonFractionTextAndElementChildren", elt),
                        _("Fact %(fact)s is a non-nil nonFraction and MUST have exactly one ix:nonFraction child element or text."),
                        modelObject=[elt] + childElts, fact=elt.qname)
                    elt.setInvalid()
        if elt.localName == "fraction":
            if elt.isNil:
                ancestorFractions = XmlUtil.ancestors(elt, _ixNS, elt.localName)
                if ancestorFractions:
                    modelXbrl.error(ixMsgCode("fractionAncestors", elt),
                        _("Fact %(fact)s is a nil fraction and MUST not have an ancestor ix:fraction"),
                        modelObject=[elt] + ancestorFractions, fact=elt.qname)
            else:
                nonFrChildren = [e for e in XmlUtil.children(elt, _ixNS, '*')
                                 if e.localName not in ("fraction", "numerator", "denominator")]
                if nonFrChildren:
                    modelXbrl.error(ixMsgCode("fractionElementChildren", elt),
                        _("Fact %(fact)s is a non-nil fraction and not have any child elements except ix:fraction, ix:numerator and ix:denominator: %(children)s"),
                        modelObject=[elt] + nonFrChildren, fact=elt.qname, children=", ".join(e.localName for e in nonFrChildren))
                for ancestorFraction in XmlUtil.ancestors(elt, XbrlConst.ixbrl11, "fraction"): # only ix 1.1
                    if normalizeSpace(elt.get("unitRef")) != normalizeSpace(ancestorFraction.get("unitRef")):
                        # NOTE(review): modelObject lists nonFrChildren, not
                        # the mismatching ancestor - confirm this is intended.
                        modelXbrl.error(ixMsgCode("fractionNestedUnitRef", elt),
                            _("Fact %(fact)s fraction and ancestor fractions must have matching unitRefs: %(unitRef)s, %(unitRef2)s"),
                            modelObject=[elt] + nonFrChildren, fact=elt.qname, unitRef=elt.get("unitRef"), unitRef2=ancestorFraction.get("unitRef"))
        if elt.localName in ("nonFraction", "numerator", "denominator", "nonNumeric"):
            fmt = elt.format
            if fmt:
                if fmt in _customTransforms:
                    pass
                elif fmt.namespaceURI not in FunctionIxt.ixtNamespaceFunctions:
                    modelXbrl.error(ixMsgCode("invalidTransformation", elt, sect="validation"),
                        _("Fact %(fact)s has unrecognized transformation namespace %(namespace)s"),
                        modelObject=elt, fact=elt.qname, transform=fmt, namespace=fmt.namespaceURI)
                    elt.setInvalid()
                elif fmt.localName not in FunctionIxt.ixtNamespaceFunctions[fmt.namespaceURI]:
                    modelXbrl.error(ixMsgCode("invalidTransformation", elt, sect="validation"),
                        _("Fact %(fact)s has unrecognized transformation name %(name)s"),
                        modelObject=elt, fact=elt.qname, transform=fmt, name=fmt.localName)
                    elt.setInvalid()

    def ixToXhtml(fromRoot):
        """Return a copy of ``fromRoot`` without inline XBRL elements."""
        toRoot = etree.Element(fromRoot.localName)
        copyNonIxChildren(fromRoot, toRoot)
        for attrTag, attrValue in fromRoot.items():
            checkAttribute(fromRoot, False, attrTag, attrValue)
            if attrTag not in ('version', # used in inline test cases but not valid xhtml
                               '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'):
                toRoot.set(attrTag, attrValue)
        return toRoot

    def copyNonIxChildren(fromElt, toElt, excludeSubtree=False):
        """Copy the children of ``fromElt`` into ``toElt``, checking each.

        With ``excludeSubtree`` set, descendants are still visited (and so
        checked) but nothing is added to the copy.
        """
        for fromChild in fromElt.iterchildren():
            if isinstance(fromChild, ModelObject):
                isIxNs = fromChild.namespaceURI in XbrlConst.ixbrlAll
                if isIxNs:
                    if fromChild.localName not in ixElements[fromChild.namespaceURI]:
                        modelXbrl.error(ixMsgCode("elementNameInvalid",ns=_ixNS),
                            _("Inline XBRL element name %(element)s is not valid"),
                            modelObject=fromChild, element=str(fromChild.elementQname))
                    else:
                        checkHierarchyConstraints(fromChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, True, attrTag, attrValue)
                        for attrTag in ixAttrRequired[fromChild.namespaceURI].get(fromChild.localName,[]):
                            if fromChild.get(attrTag) is None:
                                modelXbrl.error(ixMsgCode("attributeRequired", fromChild),
                                    _("Attribute %(attribute)s required on element ix:%(element)s"),
                                    modelObject=fromChild, attribute=attrTag, element=fromChild.localName)
                if excludeSubtree or (fromChild.localName in {"references", "resources"} and isIxNs):
                    copyNonIxChildren(fromChild, toElt, excludeSubtree=True)
                else:
                    if fromChild.localName in {"footnote", "nonNumeric", "continuation"} and isIxNs:
                        # These ix elements are represented in the copy by an
                        # "ixNestedContent" placeholder element.
                        toChild = etree.Element("ixNestedContent")
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail
                    elif isIxNs:
                        # Other ix elements are dropped; their children are
                        # attached to the current parent of the copy.
                        copyNonIxChildren(fromChild, toElt)
                    else:
                        toChild = etree.Element(fromChild.localName)
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, False, attrTag, attrValue)
                            toChild.set(attrTag, attrValue)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail

    # copy xhtml elements to fresh tree
    with open(os.path.join(modelXbrl.modelManager.cntlr.configDir, _xhtmlDTD)) as fh:
        dtd = DTD(fh)
    try:
        #with open("/users/hermf/temp/testDtd.htm", "w") as fh:
        #    fh.write(etree.tostring(ixToXhtml(elt), encoding=_STR_UNICODE, pretty_print=True))
        if not dtd.validate( ixToXhtml(elt) ):
            modelXbrl.error("html:syntaxError",
                _("%(element)s error %(error)s"),
                modelObject=elt, element=elt.localName.title(),
                error=', '.join(e.message for e in dtd.error_log.filter_from_errors()))
        if isEFM:
            ValidateFilingText.validateHtmlContent(modelXbrl, elt, elt, "InlineXBRL", "EFM.5.02.05.", isInline=True)
    except XMLSyntaxError:
        modelXbrl.error("html:syntaxError",
            _("%(element)s error %(error)s"),
            modelObject=elt, element=elt.localName.title(), error=dtd.error_log.filter_from_errors())
def test_00_validate_fake_syslog(self):
    """Validate the fake normalizer"""
    # Close the DTD file as soon as the DTD object has been built.
    with open(os.path.join(self.normalizer_path,
                           'normalizer.dtd')) as dtd_file:
        dtd = DTD(dtd_file)
    self.assertTrue(dtd.validate(self.n))
def xhtmlValidate(modelXbrl, elt):
    """Validate an inline XBRL document element as XHTML.

    The tree under ``elt`` is copied into a fresh lxml tree without the
    inline XBRL elements; while copying, attributes and ix hierarchy
    constraints are checked and problems are reported through
    ``modelXbrl.error``.  The copy is then validated against
    ``xhtml1-strict-ix.dtd`` from the controller's configuration directory.
    """
    from lxml.etree import DTD, XMLSyntaxError

    def checkAttribute(elt, isIxElt, attrTag, attrValue):
        """Check one attribute of ``elt``; ``isIxElt`` marks ix elements."""
        if attrTag.startswith("{"):
            # Qualified attribute: "{namespace}local".
            ns, sep, localName = attrTag[1:].partition("}")
            if isIxElt:
                if ns not in (XbrlConst.xml, XbrlConst.xsi):
                    modelXbrl.error("ix:qualifiedAttributeNotExpected",
                        _("Inline XBRL element %(element)s: has qualified attribute %(name)s"),
                        modelObject=elt, element=str(elt.elementQname), name=attrTag)
            else:
                if ns in XbrlConst.ixbrlAll:
                    modelXbrl.error("ix:inlineAttributeMisplaced",
                        _("Inline XBRL attributes are not allowed on html elements: ix:%(name)s"),
                        modelObject=elt, name=localName)
                elif ns not in {XbrlConst.xml, XbrlConst.xsi, XbrlConst.xhtml}:
                    modelXbrl.error("ix:extensionAttributeMisplaced",
                        _("Extension attributes are not allowed on html elements: %(tag)s"),
                        modelObject=elt, tag=attrTag)
        elif isIxElt:
            # A KeyError from the type lookup means the attribute is not
            # known for this ix namespace.
            try:
                _xsdType = ixAttrType[elt.namespaceURI][attrTag]
                if isinstance(_xsdType, dict):
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl, elt, attrTag, baseXsdType, attrValue, facets=facets)
                # Full list of xbrli attribute names minus those allowed on
                # this particular element.
                disallowedXbrliAttrs = ({"scheme", "periodType", "balance", "contextRef", "unitRef", "precision", "decimals"} -
                                        {"fraction": {"contextRef", "unitRef"},
                                         "nonFraction": {"contextRef", "unitRef", "decimals", "precision"},
                                         "nonNumeric": {"contextRef"}}.get(elt.localName, set()))
                disallowedAttrs = [a for a in disallowedXbrliAttrs if elt.get(a) is not None]
                if disallowedAttrs:
                    modelXbrl.error("ix:inlineElementAttributes",
                        _("Inline XBRL element %(element)s has disallowed attributes %(attributes)s"),
                        modelObject=elt, element=elt.elementQname, attributes=", ".join(disallowedAttrs))
            except KeyError:
                # Message fixed: it used to repeat the word "element".
                modelXbrl.error("ix:attributeNotExpected",
                    _("Attribute %(attribute)s is not expected on element ix:%(element)s"),
                    modelObject=elt, attribute=attrTag, element=elt.localName)

    def checkHierarchyConstraints(elt):
        """Check the structural constraints of one ix element."""
        constraints = ixHierarchyConstraints.get(elt.localName)
        if constraints:
            for _rel, names in constraints:
                # First character: requirement code; remainder: relation.
                reqt = _rel[0]
                rel = _rel[1:]
                if reqt in ('&', '^'):
                    nameFilter = ('*',)
                else:
                    nameFilter = names
                relations = {"ancestor": XmlUtil.ancestor,
                             "parent": XmlUtil.parent,
                             "child": XmlUtil.children,
                             "descendant": XmlUtil.descendants}[rel](
                            elt,
                            '*' if nameFilter == ('*',) else elt.namespaceURI,
                            nameFilter)
                if rel in ("ancestor", "parent"):
                    # These lookups are treated as returning one element or
                    # None; normalize to a list.
                    if relations is None:
                        relations = []
                    else:
                        relations = [relations]
                issue = ''
                if reqt == '^':
                    if not any(r.localName in names and r.namespaceURI == elt.namespaceURI
                               for r in relations):
                        issue = " and is missing one of " + ', '.join(names)
                if reqt in ('&', '^'):
                    disallowed = [str(r.elementQname)
                                  for r in relations
                                  if r.localName not in names or r.namespaceURI != elt.namespaceURI]
                    if disallowed:
                        issue += " and may not have " + ", ".join(disallowed)
                if reqt == '?' and len(relations) > 1:
                    issue = " may only have 0 or 1 but {0} present ".format(len(relations))
                if reqt == '+' and len(relations) == 0:
                    # '+' is satisfied by a single relation, so the text
                    # says "at least 1" (it used to say "more than 1").
                    issue = " must have at least 1 but none present "
                if ((reqt == '+' and not relations) or
                    (reqt == '-' and relations) or
                    (issue)):
                    code = "ix:" + {
                           'ancestor': "ancestorNode",
                           'parent': "parentNode",
                           'child': "childNodes",
                           'descendant': "descendantNodes"}[rel] + {
                            '+': "Required",
                            '-': "Disallowed",
                            '&': "Allowed",
                            '^': "Specified"}.get(reqt, "Specified")
                    # '^' was missing from the verb table, so reporting a
                    # violated '^' constraint raised KeyError instead.
                    msg = _("Inline XBRL 1.0 ix:{0} {1} {2} {3} {4} element").format(
                        elt.localName,
                        {'+': "must", '-': "may not", '&': "may only", '?': "may", '^': "must"}[reqt],
                        {'ancestor': "be nested in",
                         'parent': "have parent",
                         'child': "have child",
                         'descendant': "have as descendant"}[rel],
                        ', '.join(str(r.elementQname) for r in relations)
                        if names == ('*',) and relations else
                        ", ".join("ix:" + n for n in names),
                        issue)
                    modelXbrl.error(code, msg,
                                    modelObject=[elt] + relations, requirement=reqt)

    def ixToXhtml(fromRoot):
        """Return a copy of ``fromRoot`` without inline XBRL elements."""
        toRoot = etree.Element(fromRoot.localName)
        copyNonIxChildren(fromRoot, toRoot)
        for attrTag, attrValue in fromRoot.items():
            checkAttribute(fromRoot, False, attrTag, attrValue)
            if attrTag not in ('version', # used in inline test cases but not valid xhtml
                               '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'):
                toRoot.set(attrTag, attrValue)
        return toRoot

    def copyNonIxChildren(fromElt, toElt):
        """Copy the children of ``fromElt`` into ``toElt``, checking each."""
        for fromChild in fromElt.iterchildren():
            if isinstance(fromChild, ModelObject):
                isIxNs = fromChild.namespaceURI in XbrlConst.ixbrlAll
                if isIxNs:
                    checkHierarchyConstraints(fromChild)
                    for attrTag, attrValue in fromChild.items():
                        checkAttribute(fromChild, True, attrTag, attrValue)
                    for attrTag in ixAttrRequired[fromChild.namespaceURI].get(fromChild.localName,[]):
                        if fromChild.get(attrTag) is None:
                            # Attach the error to the element that lacks the
                            # attribute, not to the document root.
                            modelXbrl.error("ix:attributeRequired",
                                _("Attribute %(attribute)s required on element ix:%(element)s"),
                                modelObject=fromChild, attribute=attrTag, element=fromChild.localName)
                # ix references/resources subtrees are neither copied nor
                # descended into.
                if not (fromChild.localName in {"references", "resources"} and isIxNs):
                    if fromChild.localName in {"footnote", "nonNumeric", "continuation"} and isIxNs:
                        # These ix elements are represented in the copy by an
                        # "ixNestedContent" placeholder element.
                        toChild = etree.Element("ixNestedContent")
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail
                    elif isIxNs:
                        # Other ix elements are dropped; their children are
                        # attached to the current parent of the copy.
                        copyNonIxChildren(fromChild, toElt)
                    else:
                        toChild = etree.Element(fromChild.localName)
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, False, attrTag, attrValue)
                            toChild.set(attrTag, attrValue)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail

    # copy xhtml elements to fresh tree
    with open(os.path.join(modelXbrl.modelManager.cntlr.configDir, "xhtml1-strict-ix.dtd")) as fh:
        dtd = DTD(fh)
    try:
        if not dtd.validate( ixToXhtml(elt) ):
            modelXbrl.error("ix:DTDelementUnexpected",
                _("%(element)s error %(error)s"),
                modelObject=elt, element=elt.localName.title(),
                error=', '.join(e.message for e in dtd.error_log.filter_from_errors()))
    except XMLSyntaxError:
        modelXbrl.error("ix:DTDerror",
            _("%(element)s error %(error)s"),
            modelObject=elt, element=elt.localName.title(), error=dtd.error_log.filter_from_errors())
methods[element] = { 'declaration': template('PCDATA_OPERATOR_DECLARATION').render( {'class': element, 'type': 'int'}), 'definition': template('PCDATA_OPERATOR_DEFINITION').render( {'class': element, 'type': 'int'}) } if __name__ == '__main__': import argparse cmdline = argparse.ArgumentParser() cmdline.add_argument("dtd") cmdline.add_argument("hxx") cmdline.add_argument("cxx") args = cmdline.parse_args() dtd = DTD(args.dtd) metadata = { 'dtd': dtd, 'enumerations': enumerations, 'extra_methods': methods, 'enum_classes': sorted([(v['name'], k) for k, v in enumerations.items() if not v in [e.name for e in dtd.iterelements()]]), 'forwards_for': {'ornament': ['ornament_type'], 'score': ['score_data', 'score_header']} } with open(args.hxx, 'w') as hxx: print(template('LIBRARY_HEADER').render(metadata), file=hxx) with open(args.cxx, 'w') as cxx: print(template('LIBRARY_IMPLEMENTATION').render(metadata), file=cxx)
class LogNormalizer():
    """Basic normalization flow manager.

    Normalizer definitions are loaded from one or several directories and
    checked against the DTD. Definitions that are syntactically correct are
    instantiated and populate the manager's cache.

    Normalization priority is established as follows:

    * Maximum priority assigned to normalizers where the "appliedTo" tag is
      set to "raw". They MUST be mutually exclusive.
    * Medium priority assigned to normalizers where the "appliedTo" tag is
      set to "body".
    * Lowest priority assigned to any remaining normalizers.

    Some extra treatment is also done prior and after the log normalization:

    * Assignment of a unique ID, under the tag "uuid"
    * Conversion of date tags to UTC, if the "_timezone" was set prior to
      the normalization process."""

    def __init__(self, normalizers_paths, active_normalizers=None):
        """Instantiates a flow manager. The default behavior is to activate
        every available normalizer.

        @param normalizers_paths: a list or tuple of absolute paths to the
        normalizer XML definitions to use, or just a single path as str.
        @param active_normalizers: a dictionary of active normalizers
        in the form {name-version : [True|False]}. None or an empty
        dictionary activates every normalizer.
        @raise ValueError: if one of the paths is not a directory.
        @raise StandardError: if the DTD or one of the common library files
        could not be found in any of the paths.
        """
        # The former test `isinstance(x, list or tuple)` only checked for
        # `list`, so a tuple of paths ended up wrapped as one bogus path.
        if isinstance(normalizers_paths, tuple):
            normalizers_paths = list(normalizers_paths)
        elif not isinstance(normalizers_paths, list):
            normalizers_paths = [normalizers_paths]
        self.normalizers_paths = normalizers_paths
        if active_normalizers is None:
            active_normalizers = {}
        self.active_normalizers = active_normalizers
        self.dtd, self.ctt, self.ccb = None, None, None
        # Walk through paths for normalizer.dtd and the common definitions.
        # /!\ the DTD file and common elements are overridden if present in
        # several directories: the last directory wins.
        for norm_path in self.normalizers_paths:
            if not os.path.isdir(norm_path):
                raise ValueError(
                    "Invalid normalizer directory : %s" % norm_path)
            dtd = os.path.join(norm_path, 'normalizer.dtd')
            ctt = os.path.join(norm_path, 'common_tagTypes.xml')
            ccb = os.path.join(norm_path, 'common_callBacks.xml')
            if os.path.isfile(dtd):
                # close the handle once the DTD object has been built
                with open(dtd) as dtd_file:
                    self.dtd = DTD(dtd_file)
            if os.path.isfile(ctt):
                self.ctt = ctt
            if os.path.isfile(ccb):
                self.ccb = ccb
        # Technically the common elements files should NOT be mandatory.
        # But many normalizers use them, so better safe than sorry.
        if not self.dtd or not self.ctt or not self.ccb:
            raise StandardError("Missing DTD or common library files")
        self._cache = []
        self.reload()

    def _all_normalizers(self):
        """Returns every loaded normalizer as one flat list."""
        return sum(self.normalizers.values(), [])

    def reload(self):
        """Refreshes this instance's normalizers pool."""
        self.normalizers = {'raw': [], 'body': []}
        for path in self.iter_normalizer():
            with open(path) as definition:
                norm = parse(definition)
            if not self.dtd.validate(norm):
                warnings.warn('Skipping %s : invalid DTD' % path)
                # same output as the former `print 'invalid normalizer ', path`
                print('invalid normalizer  %s' % path)
                continue
            normalizer = Normalizer(norm, self.ctt, self.ccb)
            normalizer.uuid = self._compute_norm_uuid(normalizer)
            self.normalizers.setdefault(normalizer.appliedTo, [])
            self.normalizers[normalizer.appliedTo].append(normalizer)
        self.activate_normalizers()

    def _compute_norm_uuid(self, normalizer):
        """Returns the "name-version" identifier of a normalizer."""
        return "%s-%s" % (normalizer.name, normalizer.version)

    def iter_normalizer(self):
        """ Iterates through normalizers and returns the normalizers' paths.

        Every *.xml file found under the configured directories is yielded,
        except the common_tagTypes and common_callBacks libraries.

        @return: a generator of absolute paths.
        """
        common_libraries = ('common_tagTypes', 'common_callBacks')
        for path in self.normalizers_paths:
            for root, dirs, files in os.walk(path):
                for name in files:
                    if name.endswith('.xml') and \
                       not name.startswith(common_libraries):
                        yield os.path.join(root, name)

    def __len__(self):
        """ Returns the amount of available normalizers.
        """
        return sum(1 for _ in self.iter_normalizer())

    def update_normalizer(self, raw_xml_contents, name=None, dir_path=None):
        """used to add or update a normalizer.

        @param raw_xml_contents: XML description of normalizer as flat XML. It
        must comply to the DTD.
        @param name: if set, the XML description will be saved as name.xml.
        If left blank, name will be fetched from the XML description.
        @param dir_path: the path to the directory where to copy the given
        normalizer. It is only honored when it is one of the configured
        paths; otherwise the first configured path is used.
        @raise ValueError: if the definition does not follow the DTD.
        """
        path = self.normalizers_paths[0]
        if dir_path and dir_path in self.normalizers_paths:
            path = dir_path
        xmlconf = XMLfromstring(raw_xml_contents).getroottree()
        if not self.dtd.validate(xmlconf):
            raise ValueError(
                "This definition file does not follow the normalizers DTD :\n\n%s" %
                self.dtd.error_log.filter_from_errors())
        if not name:
            name = xmlconf.getroot().get('name')
        if not name.endswith('.xml'):
            name += '.xml'
        # make sure the definition is flushed and closed before reloading
        with open(os.path.join(path, name), 'w') as target:
            xmlconf.write(target,
                          encoding='utf8',
                          method='xml',
                          pretty_print=True)
        self.reload()

    def get_normalizer_by_uuid(self, uuid):
        """Returns normalizer by uuid.

        @raise ValueError: if no loaded normalizer has this uuid."""
        for norm in self._all_normalizers():
            if norm.uuid == uuid:
                return norm
        raise ValueError("Normalizer uuid : %s not found" % uuid)

    def get_normalizer_source(self, uuid):
        """Returns the raw XML source of normalizer uuid."""
        return self.get_normalizer_by_uuid(uuid).get_source()

    def get_normalizer_path(self, uuid):
        """Returns the filesystem path of a normalizer."""
        return self.get_normalizer_by_uuid(uuid).sys_path

    def activate_normalizers(self):
        """Activates normalizers according to what was set by calling
        set_active_normalizers. If no call to the latter function has been
        made so far, this method activates every normalizer."""
        if not self.active_normalizers:
            self.active_normalizers = dict(
                (n.uuid, True) for n in self._all_normalizers())
        # fool-proof the list
        self.set_active_normalizers(self.active_normalizers)
        # build an ordered cache to speed things up:
        # "raw" normalizers first, then "body", then everything else
        ordered = list(self.normalizers['raw'])
        ordered.extend(self.normalizers['body'])
        for applied_to in self.normalizers:
            if applied_to not in ('raw', 'body'):
                ordered.extend(self.normalizers[applied_to])
        # consider a normalizer to be inactive if not explicitly in our list
        self._cache = [norm for norm in ordered
                       if self.active_normalizers.get(norm.uuid, False)]

    def get_active_normalizers(self):
        """Returns a dictionary of normalizers; keys are normalizers' uuid and
        values are True|False according to the normalizer's activation state."""
        return self.active_normalizers

    def set_active_normalizers(self, norms=None):
        """Sets the active/inactive normalizers. Default behavior is to
        deactivate every normalizer.

        @param norms: a dictionary, similar to the one returned by
        get_active_normalizers."""
        default = dict((n.uuid, False) for n in self._all_normalizers())
        default.update(norms or {})
        self.active_normalizers = default

    def lognormalize(self, data):
        """ This method is the entry point to normalize data (a log).

        data is passed through every activated normalizer
        and extra tagging occurs accordingly.

        data receives also an extra uuid tag. The dictionary is updated in
        place; nothing is returned.

        @param data: must be a dictionary with at least a key 'raw' or 'body'
                     with BaseString values (preferably Unicode).

        Here an example :
        >>> from logsparser import lognormalizer
        >>> from pprint import pprint
        >>> ln = lognormalizer.LogNormalizer('/usr/local/share/normalizers/')
        >>> mylog = {'raw' : 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)'}
        >>> ln.lognormalize(mylog)
        >>> pprint(mylog)
        {'body': '(root) CMD (/srv/git/redmine-changesets.sh)',
        'date': datetime.datetime(2011, 7, 18, 15, 35, 1),
        'pid': '14338',
        'program': '/USR/SBIN/CRON',
        'raw': 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)',
        'source': 'zoo',
        'uuid': 70851882840934161193887647073096992594L}
        """
        data = self.uuidify(data)
        data = self.normalize(data)

    # some more functions for clarity
    def uuidify(self, log):
        """Adds a unique UID to the normalized log."""
        log["uuid"] = _UUID_.uuid4().int
        return log

    def normalize(self, log):
        """plain normalization."""
        for norm in self._cache:
            log = norm.normalize(log)
        return log

    def _normalize(self, log):
        """Used for testing only, the normalizers' tags prerequisite are
        deactivated."""
        for norm in self._cache:
            log = norm.normalize(log, do_not_check_prereq=True)
        return log
def run(ebook: Path, bigbook: Path, ubercoordinator: Path,
        files: List[Path]) -> None:
    """
    Add Big Book of Key articles to an ebook and bring its assets up to date.

    Sections already listed in ``book.xml`` are copied from the Big Book when
    missing from the ebook, every image they reference is registered under
    ``illustrations``, the requested articles are appended to ``contents``,
    and missing image files are copied over.

    :param ebook: the ebook source directory
    :param bigbook: the Big Book of Key
    :param ubercoordinator: the ubercoordinator source directory, for the DTD
    :param files: the XHTML file from the Big Book of Key that need adding
    :return:
    """
    index = Index(bigbook)
    # Close the DTD file as soon as the DTD object has been built instead of
    # leaking the handle for the rest of the run.
    with (ubercoordinator / 'src' / 'book.dtd').open() as dtd_file:
        book_dtd = DTD(dtd_file)
    book = xml.read(ebook / 'book.xml', dtd=book_dtd)
    illustrations = xml.get_one(book, 'illustrations')
    contents = xml.get_one(book, 'contents')

    sections = set(
        xml.get_all_str(contents, '//section[not(@template="yes")]/@file'))
    images = set(xml.get_all_str(illustrations, '//image/@file'))
    # snapshots used at the end to decide whether anything changed
    initial_sections = sections.copy()
    initial_images = images.copy()

    def register_images(source: Path) -> None:
        """Add every image referenced by *source* that is not yet listed."""
        for img_filename in find_images(source):
            if img_filename not in images:
                illustrations.append(file_element('image', img_filename))
                images.add(img_filename)

    for filename in sections:
        ebook_file = ebook / 'Text' / filename
        bigbook_file = bigbook / 'Text' / filename
        if not ebook_file.exists() and bigbook_file.exists():
            copyfile(bigbook_file, ebook_file)
        if ebook_file.exists():
            register_images(ebook_file)
        else:
            print(
                f"{ebook / 'book.xml'}:0:0:WARNING: is this missing?: {filename}"
            )

    for file in files:
        article_id = file.stem
        article = index.articles_by_id[article_id]
        if article.file.name not in sections:
            copyfile(article.file, ebook / 'Text' / article.file.name)
            title = xml.rewrap('title', XML(article.link))
            section = file_element('section', article.file.name)
            section.append(title)
            contents.append(section)
            sections.add(file.name)
            # NOTE(review): nesting reconstructed from flattened source -
            # images are registered only for newly added articles here;
            # confirm against the original layout.
            register_images(article.file)

    for img_filename in images:
        image_path = ebook / 'Images' / img_filename
        if not image_path.exists():
            copyfile(bigbook / 'Images' / img_filename, image_path)

    book.attrib['date'] = strftime("%Y-%m-%d")

    if sections != initial_sections or images != initial_images:
        copyfile(ebook / 'book.xml', ebook / 'book.xml.bak')
        # NOTE(review): nesting reconstructed from flattened source - the
        # save is assumed to happen only when something changed; confirm.
        xml.save(ebook / 'book.xml', book, doctype='book')
def read(filepath: Path, dtd: DTD = None) -> Element:
    """Parse the XML file at *filepath* and return its root element.

    :param filepath: path of the XML document; it is opened as UTF-8 text
    :param dtd: optional DTD the parsed document is validated against
    :return: the root element of the parsed document

    A document that fails DTD validation is still returned, but a warning
    carrying the DTD error log is emitted; previously the validation result
    was discarded, so invalid documents passed silently.
    """
    with filepath.open(encoding='utf-8') as f:
        root = parse(f, XMLParser()).getroot()
    if dtd and not dtd.validate(root):
        # local import: the module's import block is not visible from here
        import warnings
        warnings.warn(
            "%s does not validate against its DTD:\n%s"
            % (filepath, dtd.error_log.filter_from_errors()),
            stacklevel=2)
    return root
def xhtmlValidate(modelXbrl, elt):
    """Check the inline XBRL document rooted at `elt` and report problems.

    The tree under `elt` is copied into a fresh element tree without the
    inline XBRL (ix) elements; while copying, ix element names, attributes
    and hierarchy constraints are checked. The copy is then validated
    against the ``xhtml1-strict-ix.dtd`` file found in the controller's
    config directory. Every finding is reported through ``modelXbrl.error``;
    nothing is returned.
    """
    from lxml.etree import DTD, XMLSyntaxError
    # NOTE(review): not referenced again inside this function.
    ixNsStartTags = ["{" + ns + "}" for ns in XbrlConst.ixbrlAll]

    def checkAttribute(elt, isIxElt, attrTag, attrValue):
        """Validate one attribute of `elt` (this `elt` shadows the outer one).

        `isIxElt` tells whether the attribute is being checked as belonging
        to an inline XBRL element or to a plain html element.
        """
        if attrTag.startswith("{"):
            # qualified attribute in Clark notation: {namespace}localName
            ns, sep, localName = attrTag[1:].partition("}")
            if isIxElt:
                allowedNs = nonIxAttrNS.get(elt.localName, None)
                if allowedNs != "##other" and ns != allowedNs:
                    modelXbrl.error(
                        "ix:qualifiedAttributeNotExpected",
                        _("Inline XBRL element %(element)s: has qualified attribute %(name)s"
                          ),
                        modelObject=elt,
                        element=str(elt.elementQname),
                        name=attrTag)
            else:
                if ns in XbrlConst.ixbrlAll:
                    modelXbrl.error(
                        "ix:inlineAttributeMisplaced",
                        _("Inline XBRL attributes are not allowed on html elements: ix:%(name)s"
                          ),
                        modelObject=elt,
                        name=localName)
                elif ns not in {XbrlConst.xml, XbrlConst.xsi, XbrlConst.xhtml}:
                    modelXbrl.error(
                        "ix:extensionAttributeMisplaced",
                        _("Extension attributes are not allowed on html elements: %(tag)s"
                          ),
                        modelObject=elt,
                        tag=attrTag)
        elif isIxElt:
            # unqualified attribute on an ix element: look up its declared
            # type; a KeyError in this block is reported as an unexpected
            # attribute
            try:
                _xsdType = ixAttrType[elt.namespaceURI][attrTag]
                if isinstance(_xsdType, dict):
                    # the dict carries the base type under "type" and is
                    # itself passed on as the facets
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl, elt, attrTag,
                                          baseXsdType, attrValue,
                                          facets=facets)

                # the full attribute set minus the ones permitted for this
                # particular element name
                disallowedXbrliAttrs = ({
                    "scheme", "periodType", "balance", "contextRef", "unitRef",
                    "precision", "decimals"
                } - {
                    "fraction": {"contextRef", "unitRef"},
                    "nonFraction":
                    {"contextRef", "unitRef", "decimals", "precision"},
                    "nonNumeric": {"contextRef"}
                }.get(elt.localName, set()))
                disallowedAttrs = [
                    a for a in disallowedXbrliAttrs if elt.get(a) is not None
                ]
                if disallowedAttrs:
                    modelXbrl.error(
                        "ix:inlineElementAttributes",
                        _("Inline XBRL element %(element)s has disallowed attributes %(attributes)s"
                          ),
                        modelObject=elt,
                        element=elt.elementQname,
                        attributes=", ".join(disallowedAttrs))
            except KeyError:
                modelXbrl.error(
                    "ix:attributeNotExpected",
                    _("Attribute %(attribute)s is not expected on element element ix:%(element)s"
                      ),
                    modelObject=elt,
                    attribute=attrTag,
                    element=elt.localName)

    def checkHierarchyConstraints(elt):
        """Check `elt` against the constraints registered for its local name.

        Each constraint is a pair (_rel, names): the first character of
        _rel is the requirement code and the remainder names the relation
        ("ancestor", "parent", "child-choice", "child-sequence" or
        "descendant").
        """
        constraints = ixHierarchyConstraints.get(elt.localName)
        if constraints:
            for _rel, names in constraints:
                reqt = _rel[0]
                rel = _rel[1:]
                # '&' and '^' inspect every related element, whatever its
                # name; the other requirements only look at `names`
                if reqt in ('&', '^'):
                    nameFilter = ('*', )
                else:
                    nameFilter = names
                relations = {
                    "ancestor": XmlUtil.ancestor,
                    "parent": XmlUtil.parent,
                    "child-choice": XmlUtil.children,
                    "child-sequence": XmlUtil.children,
                    "descendant": XmlUtil.descendants
                }[rel](elt,
                       '*' if nameFilter == ('*', ) else elt.namespaceURI,
                       nameFilter)
                if rel in ("ancestor", "parent"):
                    # these lookups give a single element or None; wrap the
                    # result so the code below can treat it as a list
                    if relations is None:
                        relations = []
                    else:
                        relations = [relations]
                issue = ''
                if reqt == '^':
                    if not any(r.localName in names
                               and r.namespaceURI == elt.namespaceURI
                               for r in relations):
                        issue = " and is missing one of " + ', '.join(names)
                if reqt in ('&', '^'):
                    disallowed = [
                        str(r.elementQname) for r in relations
                        if not (r.tag in names or
                                (r.localName in names
                                 and r.namespaceURI == elt.namespaceURI))
                    ]
                    if disallowed:
                        issue += " and may not have " + ", ".join(disallowed)
                elif rel == "child-sequence":
                    # children must appear in the order given by `names`
                    sequencePosition = 0
                    for i, r in enumerate(relations):
                        rPos = names.index(str(r.localName))
                        if rPos < sequencePosition:
                            issue += " and is out of sequence: " + str(
                                r.elementQname)
                        else:
                            sequencePosition = rPos
                if reqt == '?' and len(relations) > 1:
                    issue = " may only have 0 or 1 but {0} present ".format(
                        len(relations))
                if reqt == '+' and len(relations) == 0:
                    issue = " must have more than 1 but none present "
                if ((reqt == '+' and not relations)
                        or (reqt == '-' and relations) or (issue)):
                    # error code is "ix:" + relation part + requirement part
                    code = "ix:" + {
                        'ancestor': "ancestorNode",
                        'parent': "parentNode",
                        'child-choice': "childNodes",
                        'child-sequence': "childNodes",
                        'descendant': "descendantNodes"
                    }[rel] + {
                        '+': "Required",
                        '-': "Disallowed",
                        '&': "Allowed",
                        '^': "Specified"
                    }.get(reqt, "Specified")
                    # NOTE(review): the verb table below lists '+' twice and
                    # has no '^' entry, so a '^' requirement that reaches
                    # this point would raise KeyError - confirm and add the
                    # missing key.
                    msg = _("Inline XBRL 1.0 ix:{0} {1} {2} {3} {4} element"
                            ).format(
                                elt.localName, {
                                    '+': "must",
                                    '-': "may not",
                                    '&': "may only",
                                    '?': "may",
                                    '+': "must"
                                }[reqt], {
                                    'ancestor': "be nested in",
                                    'parent': "have parent",
                                    'child-choice': "have child",
                                    'child-sequence': "have child",
                                    'descendant': "have as descendant"
                                }[rel], ', '.join(
                                    str(r.elementQname) for r in relations)
                                if names == ('*', ) and relations else
                                ", ".join("ix:" + n for n in names), issue)
                    modelXbrl.error(code,
                                    msg,
                                    modelObject=[elt] + relations,
                                    requirement=reqt)

    def ixToXhtml(fromRoot):
        """Return a fresh tree copied from `fromRoot` without ix elements."""
        toRoot = etree.Element(fromRoot.localName)
        copyNonIxChildren(fromRoot, toRoot)
        for attrTag, attrValue in fromRoot.items():
            checkAttribute(fromRoot, False, attrTag, attrValue)
            if attrTag not in (
                    'version',  # used in inline test cases but not valid xhtml
                    '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'
            ):
                toRoot.set(attrTag, attrValue)
        return toRoot

    def copyNonIxChildren(fromElt, toElt):
        """Recursively copy the children of `fromElt` under `toElt`.

        ix children are checked (name, hierarchy constraints, attributes,
        required attributes) and are not copied as themselves; non-ix
        children are copied with their attributes, text and tail.
        """
        for fromChild in fromElt.iterchildren():
            if isinstance(fromChild, ModelObject):
                isIxNs = fromChild.namespaceURI in XbrlConst.ixbrlAll
                if isIxNs:
                    if fromChild.localName not in ixElements[
                            fromChild.namespaceURI]:
                        modelXbrl.error(
                            "ix:elementNameInvalid",
                            _("Inline XBRL element name %(element)s is not valid"
                              ),
                            modelObject=fromChild,
                            element=str(fromChild.elementQname))
                    else:
                        checkHierarchyConstraints(fromChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, True, attrTag,
                                           attrValue)
                        for attrTag in ixAttrRequired[
                                fromChild.namespaceURI].get(
                                    fromChild.localName, []):
                            if fromChild.get(attrTag) is None:
                                # NOTE(review): modelObject is the outer
                                # `elt`, not `fromChild`, although the
                                # attribute is missing on `fromChild` -
                                # confirm this is intended.
                                modelXbrl.error(
                                    "ix:attributeRequired",
                                    _("Attribute %(attribute)s required on element ix:%(element)s"
                                      ),
                                    modelObject=elt,
                                    attribute=attrTag,
                                    element=fromChild.localName)
                # ix references/resources are left out of the copy entirely
                if not (fromChild.localName in {"references", "resources"}
                        and isIxNs):
                    if fromChild.localName in {
                            "footnote", "nonNumeric", "continuation"
                    } and isIxNs:
                        # these ix elements are replaced by a placeholder
                        # element that receives their content
                        toChild = etree.Element("ixNestedContent")
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail
                    elif isIxNs:
                        # any other ix element: its children are copied
                        # directly under the current target
                        copyNonIxChildren(fromChild, toElt)
                    else:
                        toChild = etree.Element(fromChild.localName)
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, False, attrTag,
                                           attrValue)
                            toChild.set(attrTag, attrValue)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail

    # copy xhtml elements to fresh tree
    with open(
            os.path.join(modelXbrl.modelManager.cntlr.configDir,
                         "xhtml1-strict-ix.dtd")) as fh:
        dtd = DTD(fh)
    try:
        # debugging aid kept from the original, disabled:
        #with open("/users/hermf/temp/testDtd.htm", "w") as fh:
        #    fh.write(etree.tostring(ixToXhtml(elt), encoding=_STR_UNICODE, pretty_print=True))
        if not dtd.validate(ixToXhtml(elt)):
            modelXbrl.error("ix:DTDelementUnexpected",
                            _("%(element)s error %(error)s"),
                            modelObject=elt,
                            element=elt.localName.title(),
                            error=', '.join(
                                e.message
                                for e in dtd.error_log.filter_from_errors()))
    except XMLSyntaxError as err:
        # `err` itself is not used; the DTD error log is reported instead
        modelXbrl.error("ix:DTDerror",
                        _("%(element)s error %(error)s"),
                        modelObject=elt,
                        element=elt.localName.title(),
                        error=dtd.error_log.filter_from_errors())
'definition': template('PCDATA_OPERATOR_DEFINITION').render({ 'class': element, 'type': 'int' }) } if __name__ == '__main__': import argparse cmdline = argparse.ArgumentParser() cmdline.add_argument("dtd") cmdline.add_argument("hxx") cmdline.add_argument("cxx") args = cmdline.parse_args() dtd = DTD(args.dtd) metadata = { 'dtd': dtd, 'enumerations': enumerations, 'extra_methods': methods, 'enum_classes': sorted([(v['name'], k) for k, v in enumerations.items() if not v in [e.name for e in dtd.iterelements()]]), 'forwards_for': { 'ornament': ['ornament_type'], 'score': ['score_data', 'score_header'] } }
def read(self, file):
    """Load the entities declared in the DTD `file` into ``self.units``.

    One ``Unit`` is appended per entity, built from the entity's name and
    content, in the order the DTD reports them.
    """
    declared = DTD(file).entities()
    for ent in declared:
        self.units.append(Unit(ent.name, ent.content))
class LogNormalizer():
    """Basic normalization flow manager.

    Normalizer definitions are loaded from one or several directories and
    checked against the DTD. Definitions that are syntactically correct are
    instantiated and populate the manager's cache.

    Normalization priority is established as follows:

    * Maximum priority assigned to normalizers where the "appliedTo" tag is
      set to "raw". They MUST be mutually exclusive.
    * Medium priority assigned to normalizers where the "appliedTo" tag is
      set to "body".
    * Lowest priority assigned to any remaining normalizers.

    Some extra treatment is also done prior and after the log normalization:

    * Assignment of a unique ID, under the tag "uuid"
    * Conversion of date tags to UTC, if the "_timezone" was set prior to
      the normalization process."""

    def __init__(self, normalizers_paths, active_normalizers=None):
        """Instantiates a flow manager. The default behavior is to activate
        every available normalizer.

        @param normalizers_paths: a list or tuple of absolute paths to the
        normalizer XML definitions to use, or just a single path as str.
        @param active_normalizers: a dictionary of active normalizers
        in the form {name-version : [True|False]}. None or an empty
        dictionary activates every normalizer.
        """
        # The former test `isinstance(x, list or tuple)` only checked for
        # `list`, so a tuple of paths ended up wrapped as one bogus path.
        if isinstance(normalizers_paths, tuple):
            normalizers_paths = list(normalizers_paths)
        elif not isinstance(normalizers_paths, list):
            normalizers_paths = [normalizers_paths]
        self.normalizers_paths = normalizers_paths
        if active_normalizers is None:
            active_normalizers = {}
        self.active_normalizers = active_normalizers
        # always defined, even when no directory provides the files
        self.dtd, self.ctt = None, None
        # Walk through paths for normalizer.dtd and common_tagTypes.xml;
        # when several directories provide them, the last one wins.
        for norm_path in self.normalizers_paths:
            dtd = os.path.join(norm_path, 'normalizer.dtd')
            ctt = os.path.join(norm_path, 'common_tagTypes.xml')
            if os.path.isfile(dtd):
                # close the handle once the DTD object has been built
                with open(dtd) as dtd_file:
                    self.dtd = DTD(dtd_file)
            if os.path.isfile(ctt):
                self.ctt = ctt
        self._cache = []
        self.reload()

    def _all_normalizers(self):
        """Returns every loaded normalizer as one flat list."""
        return sum(self.normalizers.values(), [])

    def reload(self):
        """Refreshes this instance's normalizers pool."""
        self.normalizers = {'raw': [], 'body': []}
        for path in self.iter_normalizer():
            with open(path) as definition:
                norm = parse(definition)
            if not self.dtd.validate(norm):
                warnings.warn('Skipping %s : invalid DTD' % path)
                # same output as the former `print 'invalid normalizer ', path`
                print('invalid normalizer  %s' % path)
                continue
            normalizer = Normalizer(norm, self.ctt)
            normalizer.uuid = self._compute_norm_uuid(normalizer)
            self.normalizers.setdefault(normalizer.appliedTo, [])
            self.normalizers[normalizer.appliedTo].append(normalizer)
        self.activate_normalizers()

    def _compute_norm_uuid(self, normalizer):
        """Returns the "name-version" identifier of a normalizer."""
        return "%s-%s" % (normalizer.name, normalizer.version)

    def iter_normalizer(self):
        """ Iterates through normalizers and returns the normalizers' paths.

        Every *.xml file found under the configured directories is yielded,
        except the common_tagTypes library.

        @return: a generator of absolute paths.
        """
        for path in self.normalizers_paths:
            for root, dirs, files in os.walk(path):
                for name in files:
                    if name.endswith('.xml') and \
                       not name.startswith('common_tagTypes'):
                        yield os.path.join(root, name)

    def __len__(self):
        """ Returns the amount of available normalizers.
        """
        return sum(1 for _ in self.iter_normalizer())

    def update_normalizer(self, raw_xml_contents, name=None, dir_path=None):
        """used to add or update a normalizer.

        @param raw_xml_contents: XML description of normalizer as flat XML. It
        must comply to the DTD.
        @param name: if set, the XML description will be saved as name.xml.
        If left blank, name will be fetched from the XML description.
        @param dir_path: the path to the directory where to copy the given
        normalizer. It is only honored when it is one of the configured
        paths; otherwise the first configured path is used.
        @raise ValueError: if the definition does not follow the DTD.
        """
        path = self.normalizers_paths[0]
        if dir_path and dir_path in self.normalizers_paths:
            path = dir_path
        xmlconf = XMLfromstring(raw_xml_contents).getroottree()
        if not self.dtd.validate(xmlconf):
            raise ValueError(
                "This definition file does not follow the normalizers DTD :\n\n%s" %
                self.dtd.error_log.filter_from_errors())
        if not name:
            name = xmlconf.getroot().get('name')
        if not name.endswith('.xml'):
            name += '.xml'
        # make sure the definition is flushed and closed before reloading
        with open(os.path.join(path, name), 'w') as target:
            xmlconf.write(target,
                          encoding='utf8',
                          method='xml',
                          pretty_print=True)
        self.reload()

    def get_normalizer_by_uuid(self, uuid):
        """Returns normalizer by uuid.

        @raise ValueError: if no loaded normalizer has this uuid."""
        for norm in self._all_normalizers():
            if norm.uuid == uuid:
                return norm
        raise ValueError("Normalizer uuid : %s not found" % uuid)

    def get_normalizer_source(self, uuid):
        """Returns the raw XML source of normalizer uuid."""
        return self.get_normalizer_by_uuid(uuid).get_source()

    def get_normalizer_path(self, uuid):
        """Returns the filesystem path of a normalizer."""
        return self.get_normalizer_by_uuid(uuid).sys_path

    def activate_normalizers(self):
        """Activates normalizers according to what was set by calling
        set_active_normalizers. If no call to the latter function has been
        made so far, this method activates every normalizer."""
        if not self.active_normalizers:
            self.active_normalizers = dict(
                (n.uuid, True) for n in self._all_normalizers())
        # fool-proof the list
        self.set_active_normalizers(self.active_normalizers)
        # build an ordered cache to speed things up:
        # "raw" normalizers first, then "body", then everything else
        ordered = list(self.normalizers['raw'])
        ordered.extend(self.normalizers['body'])
        for applied_to in self.normalizers:
            if applied_to not in ('raw', 'body'):
                ordered.extend(self.normalizers[applied_to])
        # Consider a normalizer to be inactive if not explicitly in our
        # list. This now also holds for the "everything else" group, which
        # used to be cached regardless of its activation state.
        self._cache = [norm for norm in ordered
                       if self.active_normalizers.get(norm.uuid, False)]

    def get_active_normalizers(self):
        """Returns a dictionary of normalizers; keys are normalizers' uuid and
        values are True|False according to the normalizer's activation state."""
        return self.active_normalizers

    def set_active_normalizers(self, norms=None):
        """Sets the active/inactive normalizers. Default behavior is to
        deactivate every normalizer.

        @param norms: a dictionary, similar to the one returned by
        get_active_normalizers."""
        default = dict((n.uuid, False) for n in self._all_normalizers())
        default.update(norms or {})
        self.active_normalizers = default

    def lognormalize(self, data):
        """ This method is the entry point to normalize data (a log).

        data is passed through every activated normalizer
        and extra tagging occurs accordingly.

        data receives also an extra uuid tag. The dictionary is updated in
        place; nothing is returned.

        @param data: must be a dictionary with at least a key 'raw' or 'body'
                     with BaseString values (preferably Unicode).

        Here an example :
        >>> from logsparser import lognormalizer
        >>> from pprint import pprint
        >>> ln = lognormalizer.LogNormalizer('/usr/local/share/normalizers/')
        >>> mylog = {'raw' : 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)'}
        >>> ln.lognormalize(mylog)
        >>> pprint(mylog)
        {'body': '(root) CMD (/srv/git/redmine-changesets.sh)',
        'date': datetime.datetime(2011, 7, 18, 15, 35, 1),
        'pid': '14338',
        'program': '/USR/SBIN/CRON',
        'raw': 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)',
        'source': 'zoo',
        'uuid': 70851882840934161193887647073096992594L}
        """
        data = self.uuidify(data)
        data = self.normalize(data)

    # some more functions for clarity
    def uuidify(self, log):
        """Adds a unique UID to the normalized log."""
        log["uuid"] = _UUID_.uuid4().int
        return log

    def normalize(self, log):
        """plain normalization."""
        for norm in self._cache:
            log = norm.normalize(log)
        return log

    def _normalize(self, log):
        """Used for testing only, the normalizers' tags prerequisite are
        deactivated."""
        for norm in self._cache:
            log = norm.normalize(log, do_not_check_prereq=True)
        return log
def validateXbrlFinally(val, *args, **kwargs):
    """Run the EFM HTML checks on the document held by `val`.

    Returns immediately unless ``val.validateEFMHTMplugin`` is truthy.
    Otherwise the document tree is validated against the ``efm-htm.dtd``
    file in this module's ``resources`` directory, then every element is
    scanned for javascript and external references in ``<a href>`` and
    ``<img src>``, for unaccepted graphics references, and for nested
    tables. Findings are reported through ``modelXbrl.error``; nothing is
    returned.
    """
    if not val.validateEFMHTMplugin:
        return

    modelXbrl = val.modelXbrl
    hrefPattern = modelXbrl.modelManager.disclosureSystem.allowedExternalHrefPattern

    dtdPath = os.path.join(os.path.dirname(__file__), "resources",
                           "efm-htm.dtd")
    with open(dtdPath) as dtdFile:
        htmDTD = DTD(dtdFile)

    if htmDTD and not htmDTD.validate(
            modelXbrl.modelDocument.xmlRootElement.getroottree()):
        for dtdError in htmDTD.error_log.filter_from_errors():
            # this particular DTD complaint is deliberately not reported
            if "declared in the external subset contains white spaces nodes" in dtdError.message:
                continue
            modelXbrl.error("html.syntax",
                            _("HTML error %(error)s"),
                            error=dtdError.message)

    for elt in modelXbrl.modelDocument.xmlRootElement.iter():
        eltTag = elt.tag
        if isinstance(elt, (_ElementTree, _Comment, _ProcessingInstruction)):
            continue  # comment or other non-parsed element

        for attrTag, attrValue in elt.items():
            isAnchorHref = attrTag == "href" and eltTag == "a"
            isImageSrc = attrTag == "src" and eltTag == "img"
            if not (isAnchorHref or isImageSrc):
                continue

            if "javascript:" in attrValue:
                modelXbrl.error(
                    "EFM.5.02.02.10.activeContent",
                    _("Element has javascript in '%(attribute)s' for <%(element)s>"),
                    modelObject=elt,
                    attribute=attrTag,
                    element=eltTag)
            elif not (eltTag == "a" and
                      (not hrefPattern or hrefPattern.match(attrValue))):
                # anchors with no pattern configured, or matching it, are
                # accepted as they are; everything else is scheme-checked
                if scheme(attrValue) in ("http", "https", "ftp"):
                    modelXbrl.error(
                        "EFM.6.05.16.externalReference",
                        _("Element has an invalid external reference in '%(attribute)s' for <%(element)s>"),
                        modelObject=elt,
                        attribute=attrTag,
                        element=eltTag)

            if attrTag == "src" and attrValue not in checkedGraphicsFiles:
                if scheme(attrValue) == "data":
                    modelXbrl.error(
                        "EFM.5.02.02.10.graphicDataUrl",
                        _("Element references a graphics data URL which isn't accepted '%(attribute)s' for <%(element)s>"),
                        modelObject=elt,
                        attribute=attrValue[:32],
                        element=eltTag)
                elif attrValue.lower()[-4:] not in ('.jpg', '.gif'):
                    modelXbrl.error(
                        "EFM.5.02.02.10.graphicFileType",
                        _("Element references a graphics file which isn't .gif or .jpg '%(attribute)s' for <%(element)s>"),
                        modelObject=elt,
                        attribute=attrValue,
                        element=eltTag)

        if eltTag == "table" and any(
                a is not None for a in elt.iterancestors("table")):
            modelXbrl.error("EFM.5.02.02.10.nestedTable",
                            _("Element is a disallowed nested <table>."),
                            modelObject=elt)
def xhtmlValidate(modelXbrl, elt): from lxml.etree import DTD, XMLSyntaxError ixNsStartTags = ["{" + ns + "}" for ns in XbrlConst.ixbrlAll] isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM" # find ix version for messages _ixNS = elt.modelDocument.ixNS def checkAttribute(elt, isIxElt, attrTag, attrValue): ixEltAttrDefs = ixAttrDefined.get(elt.namespaceURI, EMPTYDICT).get(elt.localName, ()) if attrTag.startswith("{"): ns, sep, localName = attrTag[1:].partition("}") else: ns = None localName = attrTag if ns is not None and ns not in XbrlConst.ixbrlAll and attrTag not in ixEltAttrDefs: if isIxElt: allowedNs = allowedNonIxAttrNS.get(elt.localName, None) if allowedNs != "##other" and ns != allowedNs: modelXbrl.error(ixMsgCode("qualifiedAttributeNotExpected", elt), _("Inline XBRL element %(element)s has qualified attribute %(name)s"), modelObject=elt, element=str(elt.elementQname), name=attrTag) if ns == XbrlConst.xbrli and elt.localName in { "fraction", "nonFraction", "nonNumeric", "references", "relationship", "tuple"}: modelXbrl.error(ixMsgCode("qualifiedAttributeDisallowed", elt), _("Inline XBRL element %(element)s has disallowed attribute %(name)s"), modelObject=elt, element=str(elt.elementQname), name=attrTag) else: if ns in XbrlConst.ixbrlAll: modelXbrl.error(ixMsgCode("inlineAttributeMisplaced", elt, name="other"), _("Inline XBRL attributes are not allowed on html elements: ix:%(name)s"), modelObject=elt, name=localName) elif ns not in {XbrlConst.xml, XbrlConst.xsi, XbrlConst.xhtml}: modelXbrl.error(ixMsgCode("extensionAttributeMisplaced", ns=_ixNS), _("Extension attributes are not allowed on html elements: %(tag)s"), modelObject=elt, tag=attrTag) elif isIxElt: try: _xsdType = ixAttrType[elt.namespaceURI][localName] if isinstance(_xsdType, dict): baseXsdType = _xsdType["type"] facets = _xsdType else: baseXsdType = _xsdType facets = None XmlValidate.validateValue(modelXbrl, elt, attrTag, baseXsdType, attrValue, facets=facets) if not (attrTag 
in ixEltAttrDefs or (localName in ixEltAttrDefs and (not ns or ns in XbrlConst.ixbrlAll))): raise KeyError disallowedXbrliAttrs = ({"scheme", "periodType", "balance", "contextRef", "unitRef", "precision", "decimals"} - {"fraction": {"contextRef", "unitRef"}, "nonFraction": {"contextRef", "unitRef", "decimals", "precision"}, "nonNumeric": {"contextRef"}}.get(elt.localName, set())) disallowedAttrs = set(a for a in disallowedXbrliAttrs if elt.get(a) is not None) if disallowedAttrs: modelXbrl.error(ixMsgCode("inlineElementAttributes",elt), _("Inline XBRL element %(element)s has disallowed attributes %(attributes)s"), modelObject=elt, element=elt.elementQname, attributes=", ".join(disallowedAttrs)) except KeyError: modelXbrl.error(ixMsgCode("attributeNotExpected",elt), _("Attribute %(attribute)s is not expected on element ix:%(element)s"), modelObject=elt, attribute=attrTag, element=elt.localName) def checkHierarchyConstraints(elt): constraints = ixHierarchyConstraints.get(elt.localName) if constraints: for _rel, names in constraints: reqt = _rel[0] rel = _rel[1:] if reqt in ('&', '^'): nameFilter = ('*',) else: nameFilter = names if nameFilter == ('*',): namespaceFilter = namespacePrefix = '*' else: namespaceFilter = elt.namespaceURI namespacePrefix = elt.prefix relations = {"ancestor": XmlUtil.ancestor, "parent": XmlUtil.parent, "child-choice": XmlUtil.children, "child-sequence": XmlUtil.children, "child-or-text": XmlUtil.children, "descendant": XmlUtil.descendants}[rel]( elt, namespaceFilter, nameFilter) if rel in ("ancestor", "parent"): if relations is None: relations = [] else: relations = [relations] if rel == "child-or-text": relations += XmlUtil.innerTextNodes(elt, ixExclude=True, ixEscape=False, ixContinuation=False) issue = '' if reqt == '^': if not any(r.localName in names and r.namespaceURI == elt.namespaceURI for r in relations): issue = " and is missing one of " + ', '.join(names) if reqt in ('&', '^'): disallowed = [str(r.elementQname) for r in relations 
if not (r.tag in names or (r.localName in names and r.namespaceURI == elt.namespaceURI))] if disallowed: issue += " and may not have " + ", ".join(disallowed) elif rel == "child-sequence": sequencePosition = 0 for i, r in enumerate(relations): rPos = names.index(str(r.localName)) if rPos < sequencePosition: issue += " and is out of sequence: " + str(r.elementQname) else: sequencePosition = rPos if reqt == '?' and len(relations) > 1: issue = " may only have 0 or 1 but {0} present ".format(len(relations)) if reqt == '+' and len(relations) == 0: issue = " must have at least 1 but none present " if ((reqt == '+' and not relations) or (reqt == '-' and relations) or (issue)): code = "{}:{}".format(ixSect[elt.namespaceURI].get(elt.localName,"other")["constraint"], { 'ancestor': "ancestorNode", 'parent': "parentNode", 'child-choice': "childNodes", 'child-sequence': "childNodes", 'child-or-text': "childNodesOrText", 'descendant': "descendantNodes"}[rel] + { '+': "Required", '-': "Disallowed", '&': "Allowed", '^': "Specified"}.get(reqt, "Specified")) msg = _("Inline XBRL ix:{0} {1} {2} {3} {4} element").format( elt.localName, {'+': "must", '-': "may not", '&': "may only", '?': "may", '+': "must"}[reqt], {'ancestor': "be nested in", 'parent': "have parent", 'child-choice': "have child", 'child-sequence': "have child", 'child-or-text': "have child or text,", 'descendant': "have as descendant"}[rel], '' if rel == 'child-or-text' else ', '.join(str(r.elementQname) for r in relations) if names == ('*',) and relations else ", ".join("{}:{}".format(namespacePrefix, n) for n in names), issue) modelXbrl.error(code, msg, modelObject=[elt] + relations, requirement=reqt, messageCodes=("ix{ver.sect}:ancestorNode{Required|Disallowed}", "ix{ver.sect}:childNodesOrTextRequired", "ix{ver.sect}:childNodes{Required|Disallowed|Allowed}", "ix{ver.sect}:descendantNodesDisallowed", "ix{ver.sect}:parentNodeRequired")) def ixToXhtml(fromRoot): toRoot = etree.Element(fromRoot.localName) 
copyNonIxChildren(fromRoot, toRoot) for attrTag, attrValue in fromRoot.items(): checkAttribute(fromRoot, False, attrTag, attrValue) if attrTag not in ('version', # used in inline test cases but not valid xhtml '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'): toRoot.set(attrTag, attrValue) return toRoot def copyNonIxChildren(fromElt, toElt, excludeSubtree=False): for fromChild in fromElt.iterchildren(): if isinstance(fromChild, ModelObject): isIxNs = fromChild.namespaceURI in XbrlConst.ixbrlAll if isIxNs: if fromChild.localName not in ixElements[fromChild.namespaceURI]: modelXbrl.error(ixMsgCode("elementNameInvalid",ns=_ixNS), _("Inline XBRL element name %(element)s is not valid"), modelObject=fromChild, element=str(fromChild.elementQname)) else: checkHierarchyConstraints(fromChild) for attrTag, attrValue in fromChild.items(): checkAttribute(fromChild, True, attrTag, attrValue) for attrTag in ixAttrRequired[fromChild.namespaceURI].get(fromChild.localName,[]): if fromChild.get(attrTag) is None: modelXbrl.error(ixMsgCode("attributeRequired", fromChild), _("Attribute %(attribute)s required on element ix:%(element)s"), modelObject=elt, attribute=attrTag, element=fromChild.localName) if excludeSubtree or (fromChild.localName in {"references", "resources"} and isIxNs): copyNonIxChildren(fromChild, toElt, excludeSubtree=True) else: if fromChild.localName in {"footnote", "nonNumeric", "continuation"} and isIxNs: toChild = etree.Element("ixNestedContent") toElt.append(toChild) copyNonIxChildren(fromChild, toChild) if fromChild.text is not None: toChild.text = fromChild.text if fromChild.tail is not None: toChild.tail = fromChild.tail elif isIxNs: copyNonIxChildren(fromChild, toElt) else: toChild = etree.Element(fromChild.localName) toElt.append(toChild) copyNonIxChildren(fromChild, toChild) for attrTag, attrValue in fromChild.items(): checkAttribute(fromChild, False, attrTag, attrValue) toChild.set(attrTag, attrValue) if fromChild.text is not None: toChild.text 
= fromChild.text if fromChild.tail is not None: toChild.tail = fromChild.tail # copy xhtml elements to fresh tree with open(os.path.join(modelXbrl.modelManager.cntlr.configDir, "xhtml1-strict-ix.dtd")) as fh: dtd = DTD(fh) try: #with open("/users/hermf/temp/testDtd.htm", "w") as fh: # fh.write(etree.tostring(ixToXhtml(elt), encoding=_STR_UNICODE, pretty_print=True)) if not dtd.validate( ixToXhtml(elt) ): modelXbrl.error("html:syntaxError", _("%(element)s error %(error)s"), modelObject=elt, element=elt.localName.title(), error=', '.join(e.message for e in dtd.error_log.filter_from_errors())) if isEFM: ValidateFilingText.validateHtmlContent(modelXbrl, elt, elt, "InlineXBRL", "EFM.5.02.05.", isInline=True) except XMLSyntaxError as err: modelXbrl.error("html:syntaxError", _("%(element)s error %(error)s"), modelObject=elt, element=elt.localName.title(), error=dtd.error_log.filter_from_errors())
def _validate_dtd_name(self, identifier:str):
    """Check ``identifier`` against the DTD ``ID`` attribute type.

    A throwaway DTD declaring an empty element ``S`` with a required
    ``id`` attribute of type ``ID`` is compiled, a single ``<S>`` element
    carrying ``identifier`` as its ``id`` is created, and that element is
    validated against the DTD.

    :param identifier: candidate value for the ``id`` attribute.
    :return: the result of ``DTD.validate`` for the sample element.
    """
    # Compile the grammar first, then build the probe element, so any
    # failure surfaces in the same order as before.
    grammar = DTD(StringIO("<!ELEMENT S EMPTY><!ATTLIST S id ID #REQUIRED>"))
    probe = Element("S", id=identifier)
    return grammar.validate(probe)
def xhtmlValidate(modelXbrl, elt):
    """Validate an Inline XBRL document element against the xhtml DTD and ix rules.

    The tree under ``elt`` is copied into a fresh element tree in which
    Inline XBRL (ix) elements are dropped or replaced (see ``ixToXhtml`` /
    ``copyNonIxChildren``).  While copying, every ix element is checked for
    a valid name, hierarchy constraints, allowed attributes and required
    attributes.  The copy is then validated against the DTD file selected
    by ``XHTML_DTD[elt.modelDocument.ixNS]`` in the controller's
    ``configDir``.

    All findings are reported through ``modelXbrl.error``; the function
    itself returns nothing.

    :param modelXbrl: model object providing ``error`` and ``modelManager``.
    :param elt: root element to validate; must expose ``modelDocument.ixNS``.
    """
    from lxml.etree import DTD, XMLSyntaxError
    from arelle import FunctionIxt
    # NOTE(review): ixNsStartTags is not referenced again in this function.
    ixNsStartTags = ["{" + ns + "}" for ns in XbrlConst.ixbrlAll]
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    # find ix version for messages
    _ixNS = elt.modelDocument.ixNS
    # DTD file name to load for this ix namespace
    _xhtmlDTD = XHTML_DTD[_ixNS]
    # transforms registered on the model manager are accepted without a namespace check
    _customTransforms = modelXbrl.modelManager.customTransforms or {}

    def checkAttribute(elt, isIxElt, attrTag, attrValue):
        """Report attributes that are not permitted on ``elt`` or have invalid values.

        :param elt: element carrying the attribute.
        :param isIxElt: True when the caller treats ``elt`` as an ix element,
            False for html elements (as passed by the callers below).
        :param attrTag: attribute name, either ``{namespace}local`` or a
            plain local name.
        :param attrValue: attribute value, passed to
            ``XmlValidate.validateValue`` when a type is known.
        """
        ixEltAttrDefs = ixAttrDefined.get(elt.namespaceURI, EMPTYDICT).get(elt.localName, ())
        # split a "{ns}local" tag; unqualified attributes get ns = None
        if attrTag.startswith("{"):
            ns, sep, localName = attrTag[1:].partition("}")
        else:
            ns = None
            localName = attrTag
        # qualified attribute from a non-ix namespace that is not explicitly defined for this element
        if ns is not None and ns not in XbrlConst.ixbrlAll and attrTag not in ixEltAttrDefs:
            if ns == XbrlConst.xsi:
                pass  # xsi attributes are always allowed
            elif isIxElt:
                allowedNs = allowedNonIxAttrNS.get(elt.localName, None)
                if allowedNs != "##other" and ns != allowedNs:
                    modelXbrl.error(
                        ixMsgCode("qualifiedAttributeNotExpected", elt),
                        _("Inline XBRL element %(element)s has qualified attribute %(name)s"),
                        modelObject=elt,
                        element=str(elt.elementQname),
                        name=attrTag)
                if ns == XbrlConst.xbrli and elt.localName in {
                        "fraction", "nonFraction", "nonNumeric", "references",
                        "relationship", "tuple"
                }:
                    modelXbrl.error(
                        ixMsgCode("qualifiedAttributeDisallowed", elt),
                        _("Inline XBRL element %(element)s has disallowed attribute %(name)s"),
                        modelObject=elt,
                        element=str(elt.elementQname),
                        name=attrTag)
            else:
                # NOTE(review): the enclosing condition already requires
                # ns not in XbrlConst.ixbrlAll, so this first test looks
                # unreachable here - confirm against the intended nesting.
                if ns in XbrlConst.ixbrlAll:
                    modelXbrl.error(
                        ixMsgCode("inlineAttributeMisplaced", elt, name="other"),
                        _("Inline XBRL attributes are not allowed on html elements: ix:%(name)s"),
                        modelObject=elt,
                        name=localName)
                elif ns not in {XbrlConst.xml, XbrlConst.xsi, XbrlConst.xhtml}:
                    modelXbrl.error(
                        ixMsgCode("extensionAttributeMisplaced", ns=_ixNS),
                        _("Extension attributes are not allowed on html elements: %(tag)s"),
                        modelObject=elt,
                        tag=attrTag)
        elif isIxElt:
            # Any KeyError raised inside this try (unknown attribute type,
            # the explicit raise below, or one escaping validateValue) is
            # reported as "attributeNotExpected".
            try:
                _xsdType = ixAttrType[elt.namespaceURI][localName]
                # a dict entry carries the base type under "type" and doubles as the facets
                if isinstance(_xsdType, dict):
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl,
                                          elt,
                                          attrTag,
                                          baseXsdType,
                                          attrValue,
                                          facets=facets)
                # the attribute has a known type but is not defined for this particular element
                if not (attrTag in ixEltAttrDefs or
                        (localName in ixEltAttrDefs and
                         (not ns or ns in XbrlConst.ixbrlAll))):
                    raise KeyError
                # xbrli-style attribute names, minus the ones this element is allowed to carry
                disallowedXbrliAttrs = ({
                    "scheme", "periodType", "balance", "contextRef", "unitRef",
                    "precision", "decimals"
                } - {
                    "fraction": {"contextRef", "unitRef"},
                    "nonFraction": {"contextRef", "unitRef", "decimals", "precision"},
                    "nonNumeric": {"contextRef"}
                }.get(elt.localName, set()))
                disallowedAttrs = set(a for a in disallowedXbrliAttrs
                                      if elt.get(a) is not None)
                if disallowedAttrs:
                    modelXbrl.error(
                        ixMsgCode("inlineElementAttributes", elt),
                        _("Inline XBRL element %(element)s has disallowed attributes %(attributes)s"),
                        modelObject=elt,
                        element=elt.elementQname,
                        attributes=", ".join(disallowedAttrs))
            except KeyError:
                modelXbrl.error(
                    ixMsgCode("attributeNotExpected", elt),
                    _("Attribute %(attribute)s is not expected on element ix:%(element)s"),
                    modelObject=elt,
                    attribute=attrTag,
                    element=elt.localName)
        elif ns is None:
            # unqualified attribute on an html element: validate only when a type is registered
            _xsdType = htmlAttrType.get(localName)
            if _xsdType is not None:
                if isinstance(_xsdType, dict):
                    baseXsdType = _xsdType["type"]
                    facets = _xsdType
                else:
                    baseXsdType = _xsdType
                    facets = None
                XmlValidate.validateValue(modelXbrl,
                                          elt,
                                          attrTag,
                                          baseXsdType,
                                          attrValue,
                                          facets=facets)

    def checkHierarchyConstraints(elt):
        """Check ``elt`` against ``ixHierarchyConstraints`` and static fact rules.

        Each constraint is a pair ``(_rel, names)``: the first character of
        ``_rel`` is a requirement code ('+', '-', '&', '^', '1' or '?') and
        the remainder names the relation (ancestor, parent, child-choice,
        child-sequence, child-or-text, descendant).  Violations are
        reported through ``modelXbrl.error``.  Afterwards, element-specific
        checks for nonFraction, fraction and transformation formats run.
        """
        constraints = ixHierarchyConstraints.get(elt.localName)
        if constraints:
            for _rel, names in constraints:
                reqt = _rel[0]
                rel = _rel[1:]
                # for '&', '^' and '1' fetch every related node, then filter by name below
                if reqt in ('&', '^', '1'):
                    nameFilter = ('*', )
                else:
                    nameFilter = names
                if nameFilter == ('*', ):
                    namespaceFilter = namespacePrefix = '*'
                elif len(nameFilter) == 1 and "}" in nameFilter[0] and nameFilter[0][0] == "{":
                    # single "{ns}local" name: split it into namespace and local-name filters
                    namespaceFilter, _sep, nameFilter = nameFilter[0][1:].partition("}")
                    namespacePrefix = XmlUtil.xmlnsprefix(elt, namespaceFilter)
                else:
                    namespaceFilter = elt.namespaceURI
                    namespacePrefix = elt.prefix
                # dispatch on the relation name to the matching XmlUtil lookup
                relations = {
                    "ancestor": XmlUtil.ancestor,
                    "parent": XmlUtil.parent,
                    "child-choice": XmlUtil.children,
                    "child-sequence": XmlUtil.children,
                    "child-or-text": XmlUtil.children,
                    "descendant": XmlUtil.descendants
                }[rel](elt, namespaceFilter, nameFilter)
                # ancestor/parent results are wrapped so relations is always a list from here on
                if rel in ("ancestor", "parent"):
                    if relations is None:
                        relations = []
                    else:
                        relations = [relations]
                if rel == "child-or-text":
                    relations += XmlUtil.innerTextNodes(elt,
                                                        ixExclude=True,
                                                        ixEscape=False,
                                                        ixContinuation=False)
                issue = ''
                if reqt in ('^', ):
                    if not any(r.localName in names and
                               r.namespaceURI == elt.namespaceURI
                               for r in relations):
                        issue = " and is missing one of " + ', '.join(names)
                if reqt in ('1', ) and not elt.isNil:
                    # summing booleans counts the matching relations
                    if sum(r.localName in names and
                           r.namespaceURI == elt.namespaceURI
                           for r in relations) != 1:
                        issue = " and must have exactly one of " + ', '.join(names)
                if reqt in ('&', '^'):
                    disallowed = [
                        str(r.elementQname) for r in relations
                        if not (r.tag in names or
                                (r.localName in names and
                                 r.namespaceURI == elt.namespaceURI))
                    ]
                    if disallowed:
                        issue += " and may not have " + ", ".join(disallowed)
                    elif rel == "child-sequence":
                        # children must appear in the order given by names
                        sequencePosition = 0
                        for i, r in enumerate(relations):
                            rPos = names.index(str(r.localName))
                            if rPos < sequencePosition:
                                issue += " and is out of sequence: " + str(r.elementQname)
                            else:
                                sequencePosition = rPos
                if reqt == '?' and len(relations) > 1:
                    issue = " may only have 0 or 1 but {0} present ".format(len(relations))
                if reqt == '+' and len(relations) == 0:
                    issue = " must have at least 1 but none present "
                # element-only content ('&' child lists) may not carry text of its own
                disallowedChildText = bool(
                    reqt == '&' and
                    rel in ("child-sequence", "child-choice") and
                    elt.textValue.strip())
                if ((reqt == '+' and not relations) or
                        (reqt == '-' and relations) or (issue) or
                        disallowedChildText):
                    # NOTE(review): the .get default is the string "other";
                    # indexing it with "constraint" would raise TypeError
                    # if elt.localName were ever missing - confirm that
                    # cannot happen for the elements reaching this point.
                    code = "{}:{}".format(
                        ixSect[elt.namespaceURI].get(elt.localName, "other")["constraint"],
                        {
                            'ancestor': "ancestorNode",
                            'parent': "parentNode",
                            'child-choice': "childNodes",
                            'child-sequence': "childNodes",
                            'child-or-text': "childNodesOrText",
                            'descendant': "descendantNodes"
                        }[rel] + {
                            '+': "Required",
                            '-': "Disallowed",
                            '&': "Allowed",
                            '^': "Specified",
                            '1': "Specified"
                        }.get(reqt, "Specified"))
                    # NOTE(review): '+' appears twice in the verb dict below
                    # (same value both times, so the result is unaffected).
                    msg = _("Inline XBRL ix:{0} {1} {2} {3} {4} element{5}").format(
                        elt.localName,
                        {
                            '+': "must",
                            '-': "may not",
                            '&': "may only",
                            '?': "may",
                            '+': "must",
                            '^': "must",
                            '1': "must"
                        }[reqt],
                        {
                            'ancestor': "be nested in",
                            'parent': "have parent",
                            'child-choice': "have child",
                            'child-sequence': "have child",
                            'child-or-text': "have child or text,",
                            'descendant': "have as descendant"
                        }[rel],
                        # names shown: nothing for child-or-text, the found
                        # relations for a wildcard, else the expected names
                        '' if rel == 'child-or-text' else ', '.join(
                            str(r.elementQname) for r in relations)
                        if names == ('*', ) and relations else ", ".join(
                            "{}:{}".format(namespacePrefix, n) for n in names),
                        issue,
                        " and no child text (\"{}\")".format(
                            elt.textValue.strip()[:32])
                        if disallowedChildText else "")
                    modelXbrl.error(
                        code,
                        msg,
                        modelObject=[elt] + relations,
                        requirement=reqt,
                        messageCodes=
                        ("ix{ver.sect}:ancestorNode{Required|Disallowed}",
                         "ix{ver.sect}:childNodesOrTextRequired",
                         "ix{ver.sect}:childNodes{Required|Disallowed|Allowed}",
                         "ix{ver.sect}:descendantNodesDisallowed",
                         "ix{ver.sect}:parentNodeRequired"))
        # other static element checks (that don't require a complete object model, context, units, etc
        if elt.localName == "nonFraction":
            childElts = XmlUtil.children(elt, '*', '*')
            # text directly in the element or after any child counts as content
            hasText = (elt.text or "") or any(
                (childElt.tail or "") for childElt in childElts)
            if elt.isNil:
                ancestorNonFractions = XmlUtil.ancestors(elt, _ixNS, elt.localName)
                if ancestorNonFractions:
                    modelXbrl.error(
                        ixMsgCode("nonFractionAncestors", elt),
                        _("Fact %(fact)s is a nil nonFraction and MUST not have an ancestor ix:nonFraction"),
                        modelObject=[elt] + ancestorNonFractions,
                        fact=elt.qname)
                if childElts or hasText:
                    modelXbrl.error(
                        ixMsgCode("nonFractionTextAndElementChildren", elt),
                        _("Fact %(fact)s is a nil nonFraction and MUST not have an child elements or text"),
                        modelObject=[elt] + childElts,
                        fact=elt.qname)
                    elt.setInvalid()  # prevent further validation or cascading errors
            else:
                # non-nil: either text only, or exactly one nested ix:nonFraction and no text
                if ((childElts and
                     (len(childElts) != 1 or
                      childElts[0].namespaceURI != _ixNS or
                      childElts[0].localName != "nonFraction")) or
                        (childElts and hasText)):
                    modelXbrl.error(
                        ixMsgCode("nonFractionTextAndElementChildren", elt),
                        _("Fact %(fact)s is a non-nil nonFraction and MUST have exactly one ix:nonFraction child element or text."),
                        modelObject=[elt] + childElts,
                        fact=elt.qname)
                    elt.setInvalid()
        if elt.localName == "fraction":
            if elt.isNil:
                ancestorFractions = XmlUtil.ancestors(elt, _ixNS, elt.localName)
                if ancestorFractions:
                    modelXbrl.error(
                        ixMsgCode("fractionAncestors", elt),
                        _("Fact %(fact)s is a nil fraction and MUST not have an ancestor ix:fraction"),
                        modelObject=[elt] + ancestorFractions,
                        fact=elt.qname)
            else:
                # ix children other than fraction/numerator/denominator are not allowed
                nonFrChildren = [
                    e for e in XmlUtil.children(elt, _ixNS, '*')
                    if e.localName not in ("fraction", "numerator", "denominator")
                ]
                if nonFrChildren:
                    modelXbrl.error(
                        ixMsgCode("fractionElementChildren", elt),
                        _("Fact %(fact)s is a non-nil fraction and not have any child elements except ix:fraction, ix:numerator and ix:denominator: %(children)s"),
                        modelObject=[elt] + nonFrChildren,
                        fact=elt.qname,
                        children=", ".join(e.localName for e in nonFrChildren))
                for ancestorFraction in XmlUtil.ancestors(
                        elt, XbrlConst.ixbrl11, "fraction"):  # only ix 1.1
                    if normalizeSpace(elt.get("unitRef")) != normalizeSpace(
                            ancestorFraction.get("unitRef")):
                        # NOTE(review): modelObject lists nonFrChildren, not
                        # the mismatching ancestorFraction - possibly a
                        # copy/paste leftover; confirm intent.
                        modelXbrl.error(
                            ixMsgCode("fractionNestedUnitRef", elt),
                            _("Fact %(fact)s fraction and ancestor fractions must have matching unitRefs: %(unitRef)s, %(unitRef2)s"),
                            modelObject=[elt] + nonFrChildren,
                            fact=elt.qname,
                            unitRef=elt.get("unitRef"),
                            unitRef2=ancestorFraction.get("unitRef"))
        if elt.localName in ("nonFraction", "numerator", "denominator", "nonNumeric"):
            fmt = elt.format
            if fmt:
                # custom transforms are accepted as-is; otherwise both the
                # namespace and the local name must be known to FunctionIxt
                if fmt in _customTransforms:
                    pass
                elif fmt.namespaceURI not in FunctionIxt.ixtNamespaceFunctions:
                    modelXbrl.error(
                        ixMsgCode("invalidTransformation", elt, sect="validation"),
                        _("Fact %(fact)s has unrecognized transformation namespace %(namespace)s"),
                        modelObject=elt,
                        fact=elt.qname,
                        transform=fmt,
                        namespace=fmt.namespaceURI)
                    elt.setInvalid()
                elif fmt.localName not in FunctionIxt.ixtNamespaceFunctions[
                        fmt.namespaceURI]:
                    modelXbrl.error(
                        ixMsgCode("invalidTransformation", elt, sect="validation"),
                        _("Fact %(fact)s has unrecognized transformation name %(name)s"),
                        modelObject=elt,
                        fact=elt.qname,
                        transform=fmt,
                        name=fmt.localName)
                    elt.setInvalid()

    def ixToXhtml(fromRoot):
        """Return a fresh element tree copied from ``fromRoot`` for DTD validation.

        Children are copied by ``copyNonIxChildren``; root attributes are
        checked with ``checkAttribute`` and copied, except ``version`` and
        ``xsi:schemaLocation``.
        """
        toRoot = etree.Element(fromRoot.localName)
        copyNonIxChildren(fromRoot, toRoot)
        for attrTag, attrValue in fromRoot.items():
            checkAttribute(fromRoot, False, attrTag, attrValue)
            if attrTag not in (
                    'version',  # used in inline test cases but not valid xhtml
                    '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'
            ):
                toRoot.set(attrTag, attrValue)
        return toRoot

    def copyNonIxChildren(fromElt, toElt, excludeSubtree=False):
        """Recursively copy the children of ``fromElt`` under ``toElt``, validating ix elements.

        :param fromElt: source element whose children are walked.
        :param toElt: destination element in the fresh tree.
        :param excludeSubtree: when True, descendants are still walked (and
            therefore checked) but no elements are created for them.
        """
        for fromChild in fromElt.iterchildren():
            # children that are not ModelObject instances are ignored
            if isinstance(fromChild, ModelObject):
                isIxNs = fromChild.namespaceURI in XbrlConst.ixbrlAll
                if isIxNs:
                    if fromChild.localName not in ixElements[fromChild.namespaceURI]:
                        modelXbrl.error(
                            ixMsgCode("elementNameInvalid", ns=_ixNS),
                            _("Inline XBRL element name %(element)s is not valid"),
                            modelObject=fromChild,
                            element=str(fromChild.elementQname))
                    else:
                        checkHierarchyConstraints(fromChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, True, attrTag, attrValue)
                        for attrTag in ixAttrRequired[
                                fromChild.namespaceURI].get(
                                    fromChild.localName, []):
                            if fromChild.get(attrTag) is None:
                                modelXbrl.error(
                                    ixMsgCode("attributeRequired", fromChild),
                                    _("Attribute %(attribute)s required on element ix:%(element)s"),
                                    modelObject=fromChild,
                                    attribute=attrTag,
                                    element=fromChild.localName)
                # ix:references / ix:resources content is checked but kept out of the copy
                if excludeSubtree or (fromChild.localName in {"references", "resources"} and isIxNs):
                    copyNonIxChildren(fromChild, toElt, excludeSubtree=True)
                else:
                    if fromChild.localName in {
                            "footnote", "nonNumeric", "continuation"
                    } and isIxNs:
                        # these ix elements are replaced by an "ixNestedContent" placeholder element
                        toChild = etree.Element("ixNestedContent")
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail
                    elif isIxNs:
                        # other ix elements vanish; their children attach to the current parent
                        copyNonIxChildren(fromChild, toElt)
                    else:
                        # non-ix element: copy by local name, checking each attribute as it is copied
                        toChild = etree.Element(fromChild.localName)
                        toElt.append(toChild)
                        copyNonIxChildren(fromChild, toChild)
                        for attrTag, attrValue in fromChild.items():
                            checkAttribute(fromChild, False, attrTag, attrValue)
                            toChild.set(attrTag, attrValue)
                        if fromChild.text is not None:
                            toChild.text = fromChild.text
                        if fromChild.tail is not None:
                            toChild.tail = fromChild.tail

    # copy xhtml elements to fresh tree
    with open(os.path.join(modelXbrl.modelManager.cntlr.configDir, _xhtmlDTD)) as fh:
        dtd = DTD(fh)
    try:
        #with open("/users/hermf/temp/testDtd.htm", "w") as fh:
        #    fh.write(etree.tostring(ixToXhtml(elt), encoding=_STR_UNICODE, pretty_print=True))
        # building the copy (ixToXhtml) is what triggers all the ix checks above
        if not dtd.validate(ixToXhtml(elt)):
            modelXbrl.error("html:syntaxError",
                            _("%(element)s error %(error)s"),
                            modelObject=elt,
                            element=elt.localName.title(),
                            error=', '.join(
                                e.message
                                for e in dtd.error_log.filter_from_errors()))
        if isEFM:
            ValidateFilingText.validateHtmlContent(modelXbrl,
                                                   elt,
                                                   elt,
                                                   "InlineXBRL",
                                                   "EFM.5.02.05.",
                                                   isInline=True)
    except XMLSyntaxError as err:
        # NOTE(review): unlike the branch above, the error log object itself
        # (not a joined message string) is passed as error=, and err is
        # unused - confirm whether that is intended.
        modelXbrl.error("html:syntaxError",
                        _("%(element)s error %(error)s"),
                        modelObject=elt,
                        element=elt.localName.title(),
                        error=dtd.error_log.filter_from_errors())
class LogNormalizer: """Basic normalization flow manager. Normalizers definitions are loaded from a path and checked against the DTD. If the definitions are syntactically correct, the normalizers are instantiated and populate the manager's cache. Normalization priormority is established as follows: * Maximum priority assigned to normalizers where the "appliedTo" tag is set to "raw". They MUST be mutually exclusive. * Medium priority assigned to normalizers where the "appliedTo" tag is set to "body". * Lowest priority assigned to any remaining normalizers. Some extra treatment is also done prior and after the log normalization: * Assignment of a unique ID, under the tag "uuid" * Conversion of date tags to UTC, if the "_timezone" was set prior to the normalization process.""" def __init__(self, normalizers_path, active_normalizers={}): """ Instantiates a flow manager. The default behavior is to activate every available normalizer. @param normalizer_path: absolute path to the normalizer XML definitions to use. @param active_normalizers: a dictionary of active normalizers in the form {name: [True|False]}. """ self.normalizers_path = normalizers_path self.active_normalizers = active_normalizers self.dtd = DTD(open(os.path.join(self.normalizers_path, "normalizer.dtd"))) self._cache = [] self.reload() def reload(self): """Refreshes this instance's normalizers pool.""" self.normalizers = {"raw": [], "body": []} for path in self.iter_normalizer(): norm = parse(open(path)) if not self.dtd.validate(norm): warnings.warn("Skipping %s : invalid DTD" % path) else: normalizer = Normalizer(norm, os.path.join(self.normalizers_path, "common_tagTypes.xml")) self.normalizers.setdefault(normalizer.appliedTo, []) self.normalizers[normalizer.appliedTo].append(normalizer) self.activate_normalizers() def iter_normalizer(self): """ Iterates through normalizers and returns the normalizers' paths. @return: a generator of absolute paths. 
""" path = self.normalizers_path for root, dirs, files in os.walk(path): for name in files: if not name.startswith("common_tagTypes") and name.endswith(".xml"): yield os.path.join(root, name) def __len__(self): """ Returns the amount of available normalizers. """ return len([n for n in self.iter_normalizer()]) def update_normalizer(self, raw_xml_contents, name=None): """used to add or update a normalizer. @param raw_xml_contents: XML description of normalizer as flat XML. It must comply to the DTD. @param name: if set, the XML description will be saved as name.xml. If left blank, name will be fetched from the XML description. """ xmlconf = XMLfromstring(raw_xml_contents).getroottree() if not self.dtd.validate(xmlconf): raise ValueError, "This definition file does not follow the normalizers DTD :\n\n%s" % self.dtd.error_log.filter_from_errors() if not name: name = xmlconf.getroot().get("name") if not name.endswith(".xml"): name += ".xml" path = self.normalizers_path xmlconf.write(open(os.path.join(path, name), "w"), encoding="utf8", method="xml", pretty_print=True) self.reload() def get_normalizer_source(self, name): """Returns the raw XML source of normalizer name.""" try: norm = [u for u in sum(self.normalizers.values(), []) if u.name == name][0] return norm.get_source() except: raise ValueError, "Normalizer %s not found" % name def activate_normalizers(self): """Activates normalizers according to what was set by calling set_active_normalizers. If no call to the latter function has been made so far, this method activates every normalizer.""" if not self.active_normalizers: self.active_normalizers = dict([(n.name, True) for n in sum([v for v in self.normalizers.values()], [])]) # fool-proof the list self.set_active_normalizers(self.active_normalizers) # build an ordered cache to speed things up self._cache = [] # First normalizers to apply are the "raw" ones. 
for norm in self.normalizers["raw"]: # consider the normalizer to be inactive if not # explicitly in our list if self.active_normalizers.get(norm.name, False): self._cache.append(norm) # Then, apply the applicative normalization on "body": for norm in self.normalizers["body"]: if self.active_normalizers.get(norm.name, False): self._cache.append(norm) # Then, apply everything else for norm in sum([self.normalizers[u] for u in self.normalizers if u not in ["raw", "body"]], []): self._cache.append(norm) def get_active_normalizers(self): """Returns a dictionary of normalizers; keys are normalizers' names and values are True|False according to the normalizer's activation state.""" return self.active_normalizers def set_active_normalizers(self, norms={}): """Sets the active/inactive normalizers. Default behavior is to deactivate every normalizer. @param norms: a dictionary, similar to the one returned by get_active_normalizers.""" default = dict([(n.name, False) for n in sum([v for v in self.normalizers.values()], [])]) default.update(norms) self.active_normalizers = default def lognormalize(self, data): """ This method is the entry point to normalize data (a log). data is passed through every activated normalizer and extra tagging occurs accordingly. data receives also an extra uuid tag. If data contains a key called _timezone, its value is used to convert any date into UTC. This value must be a valid timezone name; see the pytz module for more information. @param data: must be a dictionary with at least a key 'raw' or 'body' with BaseString values (preferably Unicode). 
Here an example : >>> from logsparser import lognormalizer >>> from pprint import pprint >>> ln = lognormalizer.LogNormalizer('/usr/local/share/normalizers/') >>> mylog = {'raw' : 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)'} >>> ln.lognormalize(mylog) >>> pprint mylog {'body': '(root) CMD (/srv/git/redmine-changesets.sh)', 'date': datetime.datetime(2011, 7, 18, 15, 35, 1), 'pid': '14338', 'program': '/USR/SBIN/CRON', 'raw': 'Jul 18 15:35:01 zoo /USR/SBIN/CRON[14338]: (root) CMD (/srv/git/redmine-changesets.sh)', 'source': 'zoo', 'uuid': 70851882840934161193887647073096992594L} """ data = self.uuidify(data) data = self.normalize(data) # convert date to UTC if "_timezone" in data.keys(): try: timezone = pytz.timezone(data["_timezone"]) loc_date = timezone.localize(data["date"]) data["date"] = loc_date.astimezone(pytz.utc) # turn the date into a "naive" object data["date"] = data["date"].replace(tzinfo=None) del data["_timezone"] except: warnings.warn("Invalid timezone %s, skipping UTC conversion" % data["_timezone"]) # some more functions for clarity def uuidify(self, log): """Adds a unique UID to the normalized log.""" log["uuid"] = _UUID_.uuid4().int return log def normalize(self, log): """plain normalization.""" for norm in self._cache: log = norm.normalize(log) return log def _normalize(self, log): """Used for testing only, the normalizers' tags prerequisite are deactivated.""" for norm in self._cache: log = norm.normalize(log, do_not_check_prereq=True) return