def file(self, filepath):
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            b = archiveFileSource.fs.read(archiveFileName.replace("\\", "/"))
            encoding = XmlUtil.encoding(b)
            return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=XmlUtil.encoding(b)), "latin-1")
        return (None, None)
    # check encoding of local file
    with open(filepath, 'rb') as fb:
        hdrBytes = fb.peek(512)
        encoding = XmlUtil.encoding(hdrBytes)
        if encoding.lower() in ('utf-8', 'utf8'):
            text = None
        else:
            text = fb.read().decode(encoding)
    # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return (open(filepath, 'rt', encoding='utf-8'), encoding)
    else:
        # strip XML declaration
        xmlDeclarationMatch = XMLdeclaration.search(text)
        if xmlDeclarationMatch: # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            text = text[0:start] + text[end:]
        return (io.StringIO(initial_value=text), encoding)
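# Usage sketch (an assumption, not from the source): file() above returns a
# (textStream, encoding) tuple, or (None, None) when an archive member is not
# found; the caller is responsible for closing the returned stream.
fileStream, encoding = fileSource.file(filepath)  # fileSource and filepath assumed in scope
if fileStream is not None:
    try:
        xmlText = fileStream.read()
    finally:
        fileStream.close()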
def openFileStream(cntlr, filepath, mode='r', encoding=None):
    if isHttpUrl(filepath) and cntlr:
        filepath = cntlr.webCache.getfilename(filepath)
    # file path may be server (or memcache) or local file system
    if filepath.startswith(SERVER_WEB_CACHE) and cntlr:
        filestream = None
        cacheKey = filepath[len(SERVER_WEB_CACHE) + 1:].replace("\\", "/")
        if cntlr.isGAE: # check if in memcache
            cachedBytes = gaeGet(cacheKey)
            if cachedBytes:
                filestream = io.BytesIO(cachedBytes)
        if filestream is None:
            filestream = io.BytesIO()
            cntlr.webCache.retrieve(cntlr.webCache.cacheFilepathToUrl(filepath),
                                    filestream=filestream)
            if cntlr.isGAE:
                gaeSet(cacheKey, filestream.getvalue())
        if mode.endswith('t') or encoding:
            contents = filestream.getvalue()
            filestream.close()
            filestream = FileNamedStringIO(filepath, contents.decode(encoding or 'utf-8'))
        return filestream
    # local file system
    elif encoding is None and 'b' not in mode:
        openedFileStream = io.open(filepath, mode='rb')
        hdrBytes = openedFileStream.read(512)
        encoding = XmlUtil.encoding(hdrBytes, default=None)
        openedFileStream.close()
        return io.open(filepath, mode=mode, encoding=encoding)
    else: # local file system
        return io.open(filepath, mode=mode, encoding=encoding)
def openXmlFileStream(cntlr, filepath, stripDeclaration=False):
    # returns tuple: (fileStream, encoding)
    openedFileStream = openFileStream(cntlr, filepath, 'rb')
    # check encoding
    hdrBytes = openedFileStream.read(512)
    encoding = XmlUtil.encoding(hdrBytes)
    if encoding.lower() in ('utf-8', 'utf8') and (
            cntlr is None or not cntlr.isGAE) and not stripDeclaration:
        text = None
        openedFileStream.close()
    else:
        openedFileStream.seek(0)
        text = openedFileStream.read().decode(encoding)
        openedFileStream.close()
    # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return io.open(filepath, 'rt', encoding='utf-8'), encoding
    else:
        # strip XML declaration
        xmlDeclarationMatch = XMLdeclaration.search(text)
        if xmlDeclarationMatch: # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            text = text[0:start] + text[end:]
        return (io.StringIO(initial_value=text), encoding)
def securityFileSourceFile(cntlr, ownerObject, filepath, binary, stripDeclaration):
    # handle FileSource file requests which can return encrypted contents
    if ownerObject.hasEncryption:
        for entrypointfile in ownerObject.entrypointfiles:
            if (filepath == entrypointfile.get("file") or
                any(filepath == ixfile.get("file")
                    for ixfile in entrypointfile.get("ixds", ()))
                ) and "key" in entrypointfile:
                ownerObject.cipherKey = base64.decodebytes(entrypointfile["key"].encode())
                break # set new key based on entrypointfiles
        # may be a non-entry file (xsd, linkbase, jpg) using entry's key
        if os.path.exists(filepath + ENCRYPTED_FILE_SUFFIX) and ownerObject.cipherKey is not None:
            with io.open(filepath + ENCRYPTED_FILE_SUFFIX, "rb") as fh:
                nonce = fh.read(16)
                tag = fh.read(16)
                encrdata = fh.read()
            cipher = AES.new(ownerObject.cipherKey, AES.MODE_EAX, nonce)
            bytesdata = cipher.decrypt_and_verify(encrdata, tag)
            encrdata = None # dereference before decode operation
            if binary: # return bytes
                return (FileSource.FileNamedBytesIO(filepath, bytesdata[0:-bytesdata[-1]]), ) # trim AES padding
            # detect encoding if there is an XML header
            encoding = XmlUtil.encoding(bytesdata[0:512],
                                        default=cntlr.modelManager.disclosureSystem.defaultXmlEncoding
                                                if cntlr else 'utf-8')
            # return decoded string, trimming AES padding
            text = bytesdata[0:-bytesdata[-1]].decode(encoding or 'utf-8')
            bytesdata = None # dereference before text operation
            if stripDeclaration: # file source may strip XML declaration for libxml
                xmlDeclarationMatch = FileSource.XMLdeclaration.search(text)
                if xmlDeclarationMatch: # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    text = text[0:start] + text[end:]
            return (FileSource.FileNamedStringIO(filepath, initial_value=text), encoding)
    return None
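# Minimal sketch (an assumption, not from the source) of a writer compatible with
# the AES-EAX reader above, using PyCryptodome; writeEncryptedFile is a
# hypothetical name. The reader trims plaintext[-plaintext[-1]:], so the writer
# applies PKCS#7-style padding even though EAX itself needs none.
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

def writeEncryptedFile(filepath, key, plaintext):
    cipher = AES.new(key, AES.MODE_EAX)  # PyCryptodome generates a 16-byte nonce by default
    ciphertext, tag = cipher.encrypt_and_digest(pad(plaintext, AES.block_size))
    with open(filepath + ENCRYPTED_FILE_SUFFIX, "wb") as fh:  # ENCRYPTED_FILE_SUFFIX from this module
        fh.write(cipher.nonce)  # 16 bytes, read back first
        fh.write(tag)           # 16 bytes, read back second
        fh.write(ciphertext)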
def openXmlFileStream(cntlr, filepath, stripDeclaration=False):
    # returns tuple: (fileStream, encoding)
    openedFileStream = openFileStream(cntlr, filepath, 'rb')
    # check encoding
    hdrBytes = openedFileStream.read(512)
    encoding = XmlUtil.encoding(hdrBytes,
                                default=cntlr.modelManager.disclosureSystem.defaultXmlEncoding
                                        if cntlr else 'utf-8')
    # encoding default from disclosure system could be None
    if encoding.lower() in ('utf-8', 'utf8', 'utf-8-sig') and (
            cntlr is None or not cntlr.isGAE) and not stripDeclaration:
        text = None
        openedFileStream.close()
    else:
        openedFileStream.seek(0)
        text = openedFileStream.read().decode(encoding or 'utf-8')
        openedFileStream.close()
    # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return io.open(filepath, 'rt', encoding='utf-8'), encoding
    else:
        if stripDeclaration:
            # strip XML declaration
            xmlDeclarationMatch = XMLdeclaration.search(text)
            if xmlDeclarationMatch: # remove it for lxml
                start, end = xmlDeclarationMatch.span()
                text = text[0:start] + text[end:]
        return (FileNamedStringIO(filepath, initial_value=text), encoding)
def securityFileSourceFile(cntlr, ownerObject, filepath, binary, stripDeclaration):
    # handle FileSource file requests which can return encrypted contents
    if ownerObject.hasEncryption:
        for entrypointfile in ownerObject.entrypointfiles:
            if (filepath == entrypointfile.get("file") or
                any(filepath == ixfile.get("file")
                    for ixfile in entrypointfile.get("ixds", ()))
                ) and "key" in entrypointfile and "iv" in entrypointfile:
                ownerObject.cipherIv = base64.decodebytes(entrypointfile["iv"].encode())
                ownerObject.cipherKey = base64.decodebytes(entrypointfile["key"].encode())
                break # set new iv, key based on entrypointfiles
        # may be a non-entry file (xsd, linkbase, jpg) using entry's iv, key
        if (os.path.exists(filepath + ENCRYPTED_FILE_SUFFIX)
                and ownerObject.cipherKey is not None
                and ownerObject.cipherIv is not None):
            with io.open(filepath + ENCRYPTED_FILE_SUFFIX, "rb") as fh:
                encrdata = fh.read()
            cipher = AES.new(ownerObject.cipherKey, AES.MODE_CBC, iv=ownerObject.cipherIv)
            bytesdata = cipher.decrypt(encrdata)
            encrdata = None # dereference before decode operation
            if binary: # return bytes
                return (FileSource.FileNamedBytesIO(filepath, bytesdata[0:-bytesdata[-1]]), ) # trim AES CBC padding
            # detect encoding if there is an XML header
            encoding = XmlUtil.encoding(bytesdata[0:512],
                                        default=cntlr.modelManager.disclosureSystem.defaultXmlEncoding
                                                if cntlr else 'utf-8')
            # return decoded string, trimming AES CBC padding
            text = bytesdata[0:-bytesdata[-1]].decode(encoding or 'utf-8')
            bytesdata = None # dereference before text operation
            if stripDeclaration: # file source may strip XML declaration for libxml
                xmlDeclarationMatch = FileSource.XMLdeclaration.search(text)
                if xmlDeclarationMatch: # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    text = text[0:start] + text[end:]
            return (FileSource.FileNamedStringIO(filepath, initial_value=text), encoding)
    return None
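# Matching CBC encrypt step, as a sketch (an assumption, not from the source):
# PKCS#7 padding is exactly what the reader's bytesdata[-1] trim undoes, and
# PyCryptodome's pad() produces it. encryptFileCBC is a hypothetical name.
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

def encryptFileCBC(filepath, key, iv, plaintext):
    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    with open(filepath + ENCRYPTED_FILE_SUFFIX, "wb") as fh:  # ENCRYPTED_FILE_SUFFIX from this module
        fh.write(cipher.encrypt(pad(plaintext, AES.block_size)))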
def openXmlFileStream(cntlr, filepath, stripDeclaration=False):
    # returns tuple: (fileStream, encoding)
    openedFileStream = openFileStream(cntlr, filepath, 'rb')
    # check encoding
    hdrBytes = openedFileStream.read(512)
    encoding = XmlUtil.encoding(hdrBytes)
    if (encoding.lower() in ('utf-8', 'utf8', 'utf-8-sig')
            and (cntlr is None or not cntlr.isGAE)
            and not stripDeclaration):
        text = None
        openedFileStream.close()
    else:
        openedFileStream.seek(0)
        text = openedFileStream.read().decode(encoding)
        openedFileStream.close()
    # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return io.open(filepath, 'rt', encoding='utf-8'), encoding
    else:
        if stripDeclaration:
            # strip XML declaration
            xmlDeclarationMatch = XMLdeclaration.search(text)
            if xmlDeclarationMatch: # remove it for lxml
                start, end = xmlDeclarationMatch.span()
                text = text[0:start] + text[end:]
        return (FileNamedStringIO(filepath, initial_value=text), encoding)
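# Usage sketch (an assumption): feed the returned stream to lxml. Stripping the
# declaration avoids lxml's refusal to parse a Unicode string that still
# carries an encoding declaration. cntlr and filepath assumed in scope.
from lxml import etree

stream, encoding = openXmlFileStream(cntlr, filepath, stripDeclaration=True)
try:
    tree = etree.parse(stream)
finally:
    stream.close()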
def file(self, filepath):
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            b = archiveFileSource.fs.read(archiveFileName)
            return io.TextIOWrapper(io.BytesIO(b), encoding=XmlUtil.encoding(b))
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.getElementsByTagName("data"):
                outfn = XmlUtil.text(data.getElementsByTagName("filename")[0])
                b64data = XmlUtil.text(data.getElementsByTagName("mimedata")[0])
                if len(outfn) > 1 and len(b64data) > 1 and outfn == archiveFileName:
                    b = base64.b64decode(b64data.encode("latin-1"))
                    # remove UTF-8 BOM (bytes EF BB BF) if present
                    if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                        b = b[3:]
                    return io.TextIOWrapper(io.BytesIO(b), encoding=XmlUtil.encoding(b))
        return None
    return open(filepath, 'rt', encoding='utf-8')
def openFileStream(cntlr, filepath, mode='r', encoding=None):
    if PackageManager.isMappedUrl(filepath):
        filepath = PackageManager.mappedUrl(filepath)
    elif isHttpUrl(filepath) and cntlr and hasattr(cntlr, "modelManager"):
        # may be called early in initialization for PluginManager
        filepath = cntlr.modelManager.disclosureSystem.mappedUrl(filepath)
    if archiveFilenameParts(filepath): # file is in an archive
        return openFileSource(filepath, cntlr).file(filepath, binary='b' in mode, encoding=encoding)[0]
    if isHttpUrl(filepath) and cntlr:
        _cacheFilepath = cntlr.webCache.getfilename(filepath, normalize=True)
        # normalize is separate step in ModelDocument retrieval, combined here
        if _cacheFilepath is None:
            raise IOError(_("Unable to open file: {0}.").format(filepath))
        filepath = _cacheFilepath
    # file path may be server (or memcache) or local file system
    if filepath.startswith(SERVER_WEB_CACHE) and cntlr:
        filestream = None
        cacheKey = filepath[len(SERVER_WEB_CACHE) + 1:].replace("\\", "/")
        if cntlr.isGAE: # check if in memcache
            cachedBytes = gaeGet(cacheKey)
            if cachedBytes:
                filestream = io.BytesIO(cachedBytes)
        if filestream is None:
            filestream = io.BytesIO()
            cntlr.webCache.retrieve(cntlr.webCache.cacheFilepathToUrl(filepath),
                                    filestream=filestream)
            if cntlr.isGAE:
                gaeSet(cacheKey, filestream.getvalue())
        if mode.endswith('t') or encoding:
            contents = filestream.getvalue()
            filestream.close()
            filestream = FileNamedStringIO(filepath, contents.decode(encoding or 'utf-8'))
        return filestream
    # local file system
    elif encoding is None and 'b' not in mode:
        openedFileStream = io.open(filepath, mode='rb')
        hdrBytes = openedFileStream.read(512)
        encoding = XmlUtil.encoding(hdrBytes, default=None)
        openedFileStream.close()
        return io.open(filepath, mode=mode, encoding=encoding)
    else: # local file system
        return io.open(filepath, mode=mode, encoding=encoding)
def openFileStream(cntlr, filepath, mode='r', encoding=None):
    if PackageManager.isMappedUrl(filepath):
        filepath = PackageManager.mappedUrl(filepath)
    else:
        # note: unlike the variant above, this branch assumes cntlr with a modelManager is provided
        filepath = cntlr.modelManager.disclosureSystem.mappedUrl(filepath)
    if archiveFilenameParts(filepath): # file is in an archive
        return openFileSource(filepath, cntlr).file(filepath, binary='b' in mode, encoding=encoding)[0]
    if isHttpUrl(filepath) and cntlr:
        _cacheFilepath = cntlr.webCache.getfilename(filepath)
        if _cacheFilepath is None:
            raise IOError(_("Unable to open file: {0}.").format(filepath))
        filepath = _cacheFilepath
    # file path may be server (or memcache) or local file system
    if filepath.startswith(SERVER_WEB_CACHE) and cntlr:
        filestream = None
        cacheKey = filepath[len(SERVER_WEB_CACHE) + 1:].replace("\\", "/")
        if cntlr.isGAE: # check if in memcache
            cachedBytes = gaeGet(cacheKey)
            if cachedBytes:
                filestream = io.BytesIO(cachedBytes)
        if filestream is None:
            filestream = io.BytesIO()
            cntlr.webCache.retrieve(cntlr.webCache.cacheFilepathToUrl(filepath),
                                    filestream=filestream)
            if cntlr.isGAE:
                gaeSet(cacheKey, filestream.getvalue())
        if mode.endswith('t') or encoding:
            contents = filestream.getvalue()
            filestream.close()
            filestream = FileNamedStringIO(filepath, contents.decode(encoding or 'utf-8'))
        return filestream
    # local file system
    elif encoding is None and 'b' not in mode:
        openedFileStream = io.open(filepath, mode='rb')
        hdrBytes = openedFileStream.read(512)
        encoding = XmlUtil.encoding(hdrBytes, default=None)
        openedFileStream.close()
        return io.open(filepath, mode=mode, encoding=encoding)
    else: # local file system
        return io.open(filepath, mode=mode, encoding=encoding)
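# The XMLdeclaration pattern referenced throughout these functions is defined
# elsewhere in the module; a plausible definition (an assumption, not copied
# from the source) matching "<?xml ... ?>" across line breaks:
import re
XMLdeclaration = re.compile(r"<\?xml.*?\?>", re.DOTALL)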
def streamingExtensionsLoader(modelXbrl, mappedUri, filepath, **kwargs):
    # check if big instance and has header with an initial incomplete tree walk (just 2 elements)
    if not _streamingExtensionsCheck:
        return None

    # track whether modelXbrl has been validated by this streaming extension
    modelXbrl._streamingExtensionValidated = False

    def logSyntaxErrors(parsercontext):
        for error in parsercontext.error_log:
            modelXbrl.error("xmlSchema:syntax",
                            _("%(error)s, %(fileName)s, line %(line)s, column %(column)s, %(sourceAction)s source element"),
                            modelObject=modelXbrl, fileName=os.path.basename(filepath),
                            error=error.message, line=error.line, column=error.column,
                            sourceAction="streaming")

    #### note: written for iterparse of lxml prior to version 3.3, otherwise rewrite to use XmlPullParser ###
    #### note: iterparse wants a binary file, but file is text mode
    _file, = modelXbrl.fileSource.file(filepath, binary=True)
    startedAt = time.time()
    modelXbrl.profileActivity()
    ''' this seems twice as slow as iterparse
    class instInfoTarget():
        def __init__(self, element_factory=None, parser=None):
            self.newTree = True
            self.streamingAspects = None
            self.foundInstance = False
            self.creationSoftwareComment = ''
            self.currentEltTag = "(before xbrli:xbrl)"
            self.numRootFacts = 0
        def start(self, tag, attrib, nsmap=None):
            if self.newTree:
                if tag == "{http://www.xbrl.org/2003/instance}xbrl":
                    self.foundInstance = True
                    self.newTree = False
                else: # break
                    raise NotInstanceDocumentException()
            elif not tag.startswith("{http://www.xbrl.org/"):
                self.numRootFacts += 1
                if self.numRootFacts % 1000 == 0:
                    modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0)
            self.currentEltTag = tag
        def end(self, tag):
            pass
        def data(self, data):
            pass
        def comment(self, text):
            if not self.foundInstance: # accumulate comments before xbrli:xbrl
                self.creationSoftwareComment += ('\n' if self.creationSoftwareComment else '') + text
            elif not self.creationSoftwareComment:
                self.creationSoftwareComment = text # or first comment after xbrli:xbrl
        def pi(self, target, data):
            if target == "xbrl-streamable-instance":
                if self.currentEltTag == "{http://www.xbrl.org/2003/instance}xbrl":
                    self.streamingAspects = dict(etree.PI(target, data).attrib.copy()) # dereference target results
                else:
                    modelXbrl.error("streamingExtensions:headerMisplaced",
                                    _("Header is misplaced: %(target)s, must follow xbrli:xbrl element but was found at %(element)s"),
                                    modelObject=modelXbrl, target=target, element=self.currentEltTag)
        def close(self):
            if not self.creationSoftwareComment:
                self.creationSoftwareComment = None
            return True
    instInfo = instInfoTarget()
    infoParser = etree.XMLParser(recover=True, huge_tree=True, target=instInfo)
    try:
        etree.parse(_file, parser=infoParser, base_url=filepath)
    except NotInstanceDocumentException:
        pass
    '''
    foundErrors = False
    foundInstance = False
    streamingAspects = None
    creationSoftwareComment = None
    instInfoNumRootFacts = 0
    numElts = 0
    elt = None
    instInfoContext = etree.iterparse(_file, events=("start", "end"), huge_tree=True)
    for event, elt in instInfoContext:
        if event == "start":
            if elt.getparent() is not None:
                if elt.getparent().tag == "{http://www.xbrl.org/2003/instance}xbrl":
                    if not foundInstance:
                        foundInstance = True
                        pi = precedingProcessingInstruction(elt, "xbrl-streamable-instance")
                        if pi is None:
                            break
                        else:
                            streamingAspects = dict(pi.attrib.copy())
                    if creationSoftwareComment is None:
                        creationSoftwareComment = precedingComment(elt)
                    if not elt.tag.startswith("{http://www.xbrl.org/"):
                        instInfoNumRootFacts += 1
                        if instInfoNumRootFacts % 1000 == 0:
                            modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0)
                elif not foundInstance:
                    break
            elif elt.tag == "{http://www.xbrl.org/2003/instance}xbrl":
                creationSoftwareComment = precedingComment(elt)
                if precedingProcessingInstruction(elt, "xbrl-streamable-instance") is not None:
                    modelXbrl.error("streamingExtensions:headerMisplaced",
                                    _("Header is misplaced: %(error)s, must follow xbrli:xbrl element"),
                                    modelObject=elt)
        elif event == "end":
            elt.clear()
            numElts += 1
            if numElts % 1000 == 0 and elt.getparent() is not None:
                while elt.getprevious() is not None and elt.getparent() is not None:
                    del elt.getparent()[0]
    if elt is not None:
        elt.clear()
    _file.seek(0, io.SEEK_SET) # allow reparsing
    if not foundInstance or streamingAspects is None:
        del elt
        _file.close()
        return None
    modelXbrl.profileStat(_("streaming tree check"), time.time() - startedAt)
    startedAt = time.time()
    try:
        version = Decimal(streamingAspects.get("version"))
        if int(version) != 1:
            modelXbrl.error("streamingExtensions:unsupportedVersion",
                            _("Streaming version %(version)s, major version number must be 1"),
                            modelObject=elt, version=version)
            foundErrors = True
    except (InvalidOperation, OverflowError):
        modelXbrl.error("streamingExtensions:versionError",
                        _("Version %(version)s, number must be 1.n"),
                        modelObject=elt, version=streamingAspects.get("version", "(none)"))
        foundErrors = True
    for bufAspect in ("contextBuffer", "unitBuffer", "footnoteBuffer"):
        try:
            bufLimit = Decimal(streamingAspects.get(bufAspect, "INF"))
            if bufLimit < 1 or (bufLimit.is_finite() and bufLimit % 1 != 0):
                raise InvalidOperation
            elif bufAspect == "contextBuffer":
                contextBufferLimit = bufLimit
            elif bufAspect == "unitBuffer":
                unitBufferLimit = bufLimit
            elif bufAspect == "footnoteBuffer":
                footnoteBufferLimit = bufLimit
        except InvalidOperation:
            modelXbrl.error("streamingExtensions:valueError",
                            _("Streaming %(attrib)s %(value)s, number must be a positive integer or INF"),
                            modelObject=elt, attrib=bufAspect, value=streamingAspects.get(bufAspect))
            foundErrors = True
    if _streamingExtensionsValidate:
        incompatibleValidations = []
        _validateDisclosureSystem = modelXbrl.modelManager.validateDisclosureSystem
        _disclosureSystem = modelXbrl.modelManager.disclosureSystem
        if _validateDisclosureSystem and _disclosureSystem.EFM:
            incompatibleValidations.append("EFM")
        if _validateDisclosureSystem and _disclosureSystem.GFM:
            incompatibleValidations.append("GFM")
        if _validateDisclosureSystem and _disclosureSystem.EBA:
            incompatibleValidations.append("EBA")
        if _validateDisclosureSystem and _disclosureSystem.HMRC:
            incompatibleValidations.append("HMRC")
        if modelXbrl.modelManager.validateCalcLB:
            incompatibleValidations.append("calculation LB")
        if incompatibleValidations:
            modelXbrl.error("streamingExtensions:incompatibleValidation",
                            _("Streaming instance validation does not support %(incompatibleValidations)s validation"),
                            modelObject=modelXbrl, incompatibleValidations=', '.join(incompatibleValidations))
            foundErrors = True
    if instInfoContext.error_log:
        foundErrors = True
    logSyntaxErrors(instInfoContext)
    del instInfoContext # dereference
    if foundErrors:
        _file.close()
        return None
    _encoding = XmlUtil.encoding(_file.read(512))
    _file.seek(0, io.SEEK_SET) # allow reparsing
    if _streamingExtensionsValidate:
        validator = Validate(modelXbrl)
        instValidator = validator.instValidator
    eltMdlObjs = {}
    contextBuffer = []
    unitBuffer = []
    footnoteBuffer = []
    factBuffer = []
    numFacts = 1

    class modelLoaderTarget():
        def __init__(self, element_factory=None, parser=None):
            self.newTree = True
            self.currentMdlObj = None
            self.beforeInstanceStream = True
            self.numRootFacts = 1
        def start(self, tag, attrib, nsmap=None):
            mdlObj = _parser.makeelement(tag, attrib=attrib, nsmap=nsmap)
            mdlObj.sourceline = 1
            if self.newTree:
                self.newTree = False
                self.currentMdlObj = mdlObj
                modelDocument = ModelDocument(modelXbrl, Type.INSTANCE, mappedUri, filepath, mdlObj.getroottree())
                modelXbrl.modelDocument = modelDocument # needed for incremental validation
                mdlObj.init(modelDocument)
                modelDocument.parser = _parser # needed for XmlUtil addChild's makeelement
                modelDocument.parserLookupName = _parserLookupName
                modelDocument.parserLookupClass = _parserLookupClass
                modelDocument.xmlRootElement = mdlObj
                modelDocument.schemaLocationElements.add(mdlObj)
                modelDocument.documentEncoding = _encoding
                modelDocument._creationSoftwareComment = creationSoftwareComment
                modelXbrl.info("streamingExtensions:streaming",
                               _("Stream processing this instance."),
                               modelObject=modelDocument)
            else:
                self.currentMdlObj.append(mdlObj)
                self.currentMdlObj = mdlObj
                mdlObj._init()
                ns = mdlObj.namespaceURI
                ln = mdlObj.localName
                if (self.beforeInstanceStream and (
                        (ns == XbrlConst.link and ln not in ("schemaRef", "linkbaseRef")) or
                        (ns == XbrlConst.xbrli and ln in ("context", "unit")) or
                        (ns not in (XbrlConst.link, XbrlConst.xbrli)))):
                    self.beforeInstanceStream = False
                    if _streamingExtensionsValidate:
                        instValidator.validate(modelXbrl, modelXbrl.modelManager.formulaOptions.typedParameters())
                    else: # need default dimensions
                        ValidateXbrlDimensions.loadDimensionDefaults(modelXbrl)
            return mdlObj
        def end(self, tag):
            modelDocument = modelXbrl.modelDocument
            mdlObj = self.currentMdlObj
            parentMdlObj = mdlObj.getparent()
            self.currentMdlObj = parentMdlObj
            ns = mdlObj.namespaceURI
            ln = mdlObj.localName
            if ns == XbrlConst.xbrli:
                if ln == "context":
                    if mdlObj.get("sticky"):
                        del mdlObj.attrib["sticky"]
                        XmlValidate.validate(modelXbrl, mdlObj)
                        modelDocument.contextDiscover(mdlObj)
                    else:
                        if _streamingExtensionsValidate and len(contextBuffer) >= contextBufferLimit:
                            # drop before adding as dropped may have same id as added
                            cntx = contextBuffer.pop(0)
                            dropContext(modelXbrl, cntx)
                            del parentMdlObj[parentMdlObj.index(cntx)]
                            cntx = None
                        XmlValidate.validate(modelXbrl, mdlObj)
                        modelDocument.contextDiscover(mdlObj)
                        if contextBufferLimit.is_finite():
                            contextBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        contextsToCheck = (mdlObj,)
                        instValidator.checkContexts(contextsToCheck)
                        if modelXbrl.hasXDT:
                            instValidator.checkContextsDimensions(contextsToCheck)
                        del contextsToCheck # dereference
                elif ln == "unit":
                    if _streamingExtensionsValidate and len(unitBuffer) >= unitBufferLimit:
                        # drop before adding as dropped may have same id as added
                        unit = unitBuffer.pop(0)
                        dropUnit(modelXbrl, unit)
                        del parentMdlObj[parentMdlObj.index(unit)]
                        unit = None
                    XmlValidate.validate(modelXbrl, mdlObj)
                    modelDocument.unitDiscover(mdlObj)
                    if unitBufferLimit.is_finite():
                        unitBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        instValidator.checkUnits((mdlObj,))
                elif ln == "xbrl": # end of document
                    # check remaining footnote refs
                    for footnoteLink in footnoteBuffer:
                        checkFootnoteHrefs(modelXbrl, footnoteLink)
            elif ns == XbrlConst.link:
                if ln == "footnoteLink":
                    XmlValidate.validate(modelXbrl, mdlObj)
                    footnoteLinks = (mdlObj,)
                    modelDocument.linkbaseDiscover(footnoteLinks, inInstance=True)
                    if footnoteBufferLimit.is_finite():
                        footnoteBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        instValidator.checkLinks(footnoteLinks)
                        if len(footnoteBuffer) > footnoteBufferLimit:
                            # check that hrefObjects for locators were all satisfied
                            # drop before addition as dropped may have same id as added
                            footnoteLink = footnoteBuffer.pop(0)
                            checkFootnoteHrefs(modelXbrl, footnoteLink)
                            dropFootnoteLink(modelXbrl, footnoteLink)
                            del parentMdlObj[parentMdlObj.index(footnoteLink)]
                            footnoteLink = None
                    footnoteLinks = None
                elif ln in ("schemaRef", "linkbaseRef"):
                    modelDocument.discoverHref(mdlObj)
                elif not modelXbrl.skipDTS:
                    if ln in ("roleRef", "arcroleRef"):
                        modelDocument.linkbaseDiscover((mdlObj,), inInstance=True)
            elif parentMdlObj.qname == XbrlConst.qnXbrliXbrl:
                self.numRootFacts += 1
                XmlValidate.validate(modelXbrl, mdlObj)
                modelDocument.factDiscover(mdlObj, modelXbrl.facts)
                if _streamingExtensionsValidate:
                    factsToCheck = (mdlObj,)
                    instValidator.checkFacts(factsToCheck)
                    if modelXbrl.hasXDT:
                        instValidator.checkFactsDimensions(factsToCheck)
                    del factsToCheck
                dropFact(modelXbrl, mdlObj, modelXbrl.facts)
                del parentMdlObj[parentMdlObj.index(mdlObj)]
                if self.numRootFacts % 1000 == 0:
                    modelXbrl.profileActivity("... streaming fact {0} of {1} {2:.2f}%".format(
                        self.numRootFacts, instInfoNumRootFacts,
                        100.0 * self.numRootFacts / instInfoNumRootFacts),
                        minTimeToShow=20.0)
            return mdlObj
        def data(self, data):
            self.currentMdlObj.text = data
        def comment(self, text):
            pass
        def pi(self, target, data):
            pass
        def close(self):
            return None

    _parser, _parserLookupName, _parserLookupClass = parser(modelXbrl, filepath, target=modelLoaderTarget())
    etree.parse(_file, parser=_parser, base_url=filepath)
    logSyntaxErrors(_parser)
    _file.close()
    if _streamingExtensionsValidate and validator is not None:
        del instValidator
        validator.close()
    # track that modelXbrl has been validated by this streaming extension
    modelXbrl._streamingExtensionValidated = True
    modelXbrl.profileStat(_("streaming complete"), time.time() - startedAt)
    return modelXbrl.modelDocument
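# The precedingComment / precedingProcessingInstruction helpers used by the
# loader above are defined elsewhere; minimal sketches of plausible lxml-based
# implementations follow (assumptions, not copied from the source).
def precedingProcessingInstruction(elt, target):
    # nearest preceding sibling that is a PI with the requested target, else None
    node = elt.getprevious()
    while node is not None:
        if isinstance(node, etree._ProcessingInstruction) and node.target == target:
            return node
        node = node.getprevious()
    return None

def precedingComment(elt):
    # text of the comments immediately preceding elt, joined oldest-first, else None
    comments = []
    node = elt.getprevious()
    while isinstance(node, etree._Comment):
        comments.insert(0, node.text)
        node = node.getprevious()
    return "\n".join(comments) if comments else None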
def file(self, filepath, binary=False, stripDeclaration=False, encoding=None):
    '''
    for text, return a tuple of (open file handle, encoding)
    for binary, return a tuple of (open file handle, )
    '''
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            try:
                b = archiveFileSource.fs.read(archiveFileName.replace("\\", "/"))
                if binary:
                    return (io.BytesIO(b), )
                if encoding is None:
                    encoding = XmlUtil.encoding(b)
                if stripDeclaration:
                    b = stripDeclarationBytes(b)
                return (FileNamedTextIOWrapper(filepath, io.BytesIO(b), encoding=encoding), encoding)
            except KeyError:
                raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isTarGz:
            try:
                fh = archiveFileSource.fs.extractfile(archiveFileName)
                b = fh.read()
                fh.close() # doesn't seem to close properly using a with construct
                if binary:
                    return (io.BytesIO(b), )
                if encoding is None:
                    encoding = XmlUtil.encoding(b)
                if stripDeclaration:
                    b = stripDeclarationBytes(b)
                return (FileNamedTextIOWrapper(filepath, io.BytesIO(b), encoding=encoding), encoding)
            except KeyError:
                raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isEis:
            for docElt in self.eisDocument.iter(tag="{http://www.sec.gov/edgar/common}document"):
                outfn = docElt.findtext("{http://www.sec.gov/edgar/common}conformedName")
                if outfn == archiveFileName:
                    b64data = docElt.findtext("{http://www.sec.gov/edgar/common}contents")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        if encoding is None:
                            encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        if encoding is None:
                            encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isInstalledTaxonomyPackage:
            # remove TAXONOMY_PACKAGE_FILE_NAME from file path
            if filepath.startswith(archiveFileSource.basefile):
                l = len(archiveFileSource.basefile)
                for f in TAXONOMY_PACKAGE_FILE_NAMES:
                    if filepath[l - len(f):l] == f:
                        filepath = filepath[0:l - len(f) - 1] + filepath[l:]
                        break
    if binary:
        return (openFileStream(self.cntlr, filepath, 'rb'), )
    else:
        return openXmlFileStream(self.cntlr, filepath, stripDeclaration)
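# stripDeclarationBytes, used above, is defined elsewhere in the module; a
# plausible sketch (an assumption): drop an XML declaration found in the first
# bytes of the document so lxml will accept the already-decoded text.
def stripDeclarationBytes(xml):
    xmlStart = xml[0:120]
    indexOfDeclaration = xmlStart.find(b"<?xml")
    if indexOfDeclaration >= 0:
        indexOfDeclarationEnd = xmlStart.find(b"?>", indexOfDeclaration)
        if indexOfDeclarationEnd >= 0:
            return xml[indexOfDeclarationEnd + 2:]
    return xml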
def open(self):
    if not self.isOpen:
        if (self.isZip or self.isTarGz or self.isEis or self.isXfd or
                self.isRss or self.isInstalledTaxonomyPackage) and self.cntlr:
            self.basefile = self.cntlr.webCache.getfilename(self.url)
        else:
            self.basefile = self.url
        self.baseurl = self.url # url gets changed by selection
        if not self.basefile:
            return # an error should have been logged
        if self.isZip:
            try:
                self.fs = zipfile.ZipFile(openFileStream(self.cntlr, self.basefile, 'rb'), mode="r")
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
        elif self.isTarGz:
            try:
                self.fs = tarfile.open(self.basefile, "r:gz")
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
        elif self.isEis:
            # check first line of file
            # EIS files are a sequence of zlib-compressed blocks, each preceded by an
            # 8-byte header whose first 4 bytes are the big-endian block length
            buf = b''
            try:
                file = open(self.basefile, 'rb')
                more = True
                while more:
                    l = file.read(8)
                    if len(l) < 8:
                        break
                    if len(buf) == 0 and l.startswith(b"<?xml "): # not compressed
                        buf = l + file.read() # not compressed
                        break
                    compressedBytes = file.read(struct.unpack(">L", l[0:4])[0])
                    if len(compressedBytes) <= 0:
                        break
                    buf += zlib.decompress(compressedBytes)
                file.close()
            except EnvironmentError as err:
                self.logError(err)
            #uncomment to save for debugging
            #with open("c:/temp/test.xml", "wb") as f:
            #    f.write(buf)
            if buf.startswith(b"<?xml "):
                try:
                    # must strip encoding
                    str = buf.decode(XmlUtil.encoding(buf))
                    endEncoding = str.index("?>", 0, 128)
                    if endEncoding > 0:
                        str = str[endEncoding + 2:]
                    file = io.StringIO(initial_value=str)
                    parser = etree.XMLParser(recover=True, huge_tree=True)
                    self.eisDocument = etree.parse(file, parser=parser)
                    file.close()
                    self.isOpen = True
                except EnvironmentError as err:
                    self.logError(err)
                    return # provide error message later
                except etree.LxmlError as err:
                    self.logError(err)
                    return # provide error message later
        elif self.isXfd:
            # check first line of file
            file = open(self.basefile, 'rb')
            firstline = file.readline()
            if firstline.startswith(b"application/x-xfdl;content-encoding=\"asc-gzip\""):
                # file has been gzipped
                base64input = file.read(-1)
                file.close()
                file = None
                fb = base64.b64decode(base64input)
                ungzippedBytes = b""
                totalLenUncompr = 0
                i = 0
                while i < len(fb):
                    lenCompr = fb[i + 0] * 256 + fb[i + 1]
                    lenUncomp = fb[i + 2] * 256 + fb[i + 3]
                    lenRead = 0
                    totalLenUncompr += lenUncomp
                    gzchunk = (bytes((31, 139, 8, 0)) + fb[i:i + lenCompr])
                    try:
                        with gzip.GzipFile(fileobj=io.BytesIO(gzchunk)) as gf:
                            while True:
                                readSize = min(16384, lenUncomp - lenRead)
                                readBytes = gf.read(size=readSize)
                                lenRead += len(readBytes)
                                ungzippedBytes += readBytes
                                if len(readBytes) == 0 or (lenUncomp - lenRead) <= 0:
                                    break
                    except IOError as err:
                        pass # provide error message later
                    i += lenCompr + 4
                #for learning the content of xfd file, uncomment this:
                #with open("c:\\temp\\test.xml", "wb") as fh:
                #    fh.write(ungzippedBytes)
                file = io.StringIO(initial_value=ungzippedBytes.decode("utf-8"))
            else:
                # position to start of file
                file.seek(0, io.SEEK_SET)
            try:
                self.xfdDocument = etree.parse(file)
                file.close()
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isRss:
            try:
                self.rssDocument = etree.parse(self.basefile)
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isInstalledTaxonomyPackage:
            self.isOpen = True
            # load mappings
            try:
                metadataFiles = self.taxonomyPackageMetadataFiles
                if len(metadataFiles) != 1:
                    raise IOError(_("Taxonomy package must contain one and only one metadata file: {0}.")
                                  .format(', '.join(metadataFiles)))
                # HF: this won't work, see DialogOpenArchive for correct code
                # not sure if it is used
                taxonomyPackage = PackageManager.parsePackage(self.cntlr, self.url)
                fileSourceDir = os.path.dirname(self.baseurl) + os.sep
                self.mappedPaths = dict(
                    (prefix,
                     remapping if isHttpUrl(remapping)
                     else (fileSourceDir + remapping.replace("/", os.sep)))
                    for prefix, remapping in taxonomyPackage["remappings"].items())
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
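# Illustration of the EIS block framing the reader above assumes (an assumption
# about the format, not code from the source): the first 4 header bytes are a
# big-endian uint32 length of the zlib block following the 8-byte header; the
# meaning of the remaining 4 header bytes is not shown in the source.
import struct, zlib
payload = zlib.compress(b"<doc/>")
stream = struct.pack(">L", len(payload)) + b"\x00\x00\x00\x00" + payload
blockLen = struct.unpack(">L", stream[0:4])[0]
assert zlib.decompress(stream[8:8 + blockLen]) == b"<doc/>"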
def file(self, filepath):
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            b = archiveFileSource.fs.read(archiveFileName.replace("\\", "/"))
            encoding = XmlUtil.encoding(b)
            return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=XmlUtil.encoding(b)), "latin-1")
        return (None, None)
    # check encoding of local file
    with open(filepath, 'rb') as fb:
        hdrBytes = fb.read(512)
        encoding = XmlUtil.encoding(hdrBytes)
        if encoding.lower() in ('utf-8', 'utf8'):
            text = None
        else:
            fb.seek(0)
            text = fb.read().decode(encoding)
    # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return io.open(filepath, 'rt', encoding='utf-8'), encoding
    else:
        # strip XML declaration
        xmlDeclarationMatch = XMLdeclaration.search(text)
        if xmlDeclarationMatch: # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            text = text[0:start] + text[end:]
        return (io.StringIO(initial_value=text), encoding)
def file(self, filepath, binary=False, stripDeclaration=False, encoding=None):
    '''
    for text, return a tuple of (open file handle, encoding)
    for binary, return a tuple of (open file handle, )
    '''
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            try:
                b = archiveFileSource.fs.read(archiveFileName.replace("\\", "/"))
                if binary:
                    return (io.BytesIO(b), )
                if encoding is None:
                    encoding = XmlUtil.encoding(b)
                if stripDeclaration:
                    b = stripDeclarationBytes(b)
                return (FileNamedTextIOWrapper(filepath, io.BytesIO(b), encoding=encoding), encoding)
            except KeyError:
                raise ArchiveFileIOError(self, errno.ENOENT, archiveFileName)
        elif archiveFileSource.isTarGz:
            try:
                fh = archiveFileSource.fs.extractfile(archiveFileName)
                b = fh.read()
                fh.close() # doesn't seem to close properly using a with construct
                if binary:
                    return (io.BytesIO(b), )
                if encoding is None:
                    encoding = XmlUtil.encoding(b)
                if stripDeclaration:
                    b = stripDeclarationBytes(b)
                return (FileNamedTextIOWrapper(filepath, io.BytesIO(b), encoding=encoding), encoding)
            except KeyError:
                raise ArchiveFileIOError(self, errno.ENOENT, archiveFileName)
        elif archiveFileSource.isEis:
            for docElt in self.eisDocument.iter(tag="{http://www.sec.gov/edgar/common}document"):
                outfn = docElt.findtext("{http://www.sec.gov/edgar/common}conformedName")
                if outfn == archiveFileName:
                    b64data = docElt.findtext("{http://www.sec.gov/edgar/common}contents")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        if encoding is None:
                            encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, errno.ENOENT, archiveFileName)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove UTF-8 BOM (bytes EF BB BF) if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        if encoding is None:
                            encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, errno.ENOENT, archiveFileName)
        elif archiveFileSource.isInstalledTaxonomyPackage:
            # remove TAXONOMY_PACKAGE_FILE_NAME from file path
            if filepath.startswith(archiveFileSource.basefile):
                l = len(archiveFileSource.basefile)
                for f in TAXONOMY_PACKAGE_FILE_NAMES:
                    if filepath[l - len(f):l] == f:
                        filepath = filepath[0:l - len(f) - 1] + filepath[l:]
                        break
    if binary:
        return (openFileStream(self.cntlr, filepath, 'rb'), )
    else:
        return openXmlFileStream(self.cntlr, filepath, stripDeclaration)
def open(self, reloadCache=False):
    if not self.isOpen:
        if (self.isZip or self.isTarGz or self.isEis or self.isXfd or
                self.isRss or self.isInstalledTaxonomyPackage) and self.cntlr:
            self.basefile = self.cntlr.webCache.getfilename(self.url, reload=reloadCache)
        else:
            self.basefile = self.url
        self.baseurl = self.url # url gets changed by selection
        if not self.basefile:
            return # an error should have been logged
        if self.isZip:
            try:
                self.fs = zipfile.ZipFile(openFileStream(self.cntlr, self.basefile, 'rb'), mode="r")
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
        elif self.isTarGz:
            try:
                self.fs = tarfile.open(self.basefile, "r:gz")
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
        elif self.isEis:
            # check first line of file
            buf = b''
            try:
                file = open(self.basefile, 'rb')
                more = True
                while more:
                    l = file.read(8)
                    if len(l) < 8:
                        break
                    if len(buf) == 0 and l.startswith(b"<?xml "): # not compressed
                        buf = l + file.read() # not compressed
                        break
                    compressedBytes = file.read(struct.unpack(">L", l[0:4])[0])
                    if len(compressedBytes) <= 0:
                        break
                    buf += zlib.decompress(compressedBytes)
                file.close()
            except EnvironmentError as err:
                self.logError(err)
            #uncomment to save for debugging
            #with open("c:/temp/test.xml", "wb") as f:
            #    f.write(buf)
            if buf.startswith(b"<?xml "):
                try:
                    # must strip encoding
                    str = buf.decode(XmlUtil.encoding(buf))
                    endEncoding = str.index("?>", 0, 128)
                    if endEncoding > 0:
                        str = str[endEncoding + 2:]
                    file = io.StringIO(initial_value=str)
                    parser = etree.XMLParser(recover=True, huge_tree=True)
                    self.eisDocument = etree.parse(file, parser=parser)
                    file.close()
                    self.isOpen = True
                except EnvironmentError as err:
                    self.logError(err)
                    return # provide error message later
                except etree.LxmlError as err:
                    self.logError(err)
                    return # provide error message later
        elif self.isXfd:
            # check first line of file
            file = open(self.basefile, 'rb')
            firstline = file.readline()
            if firstline.startswith(b"application/x-xfdl;content-encoding=\"asc-gzip\""):
                # file has been gzipped
                base64input = file.read(-1)
                file.close()
                file = None
                fb = base64.b64decode(base64input)
                ungzippedBytes = b""
                totalLenUncompr = 0
                i = 0
                while i < len(fb):
                    lenCompr = fb[i + 0] * 256 + fb[i + 1]
                    lenUncomp = fb[i + 2] * 256 + fb[i + 3]
                    lenRead = 0
                    totalLenUncompr += lenUncomp
                    gzchunk = (bytes((31, 139, 8, 0)) + fb[i:i + lenCompr])
                    try:
                        with gzip.GzipFile(fileobj=io.BytesIO(gzchunk)) as gf:
                            while True:
                                readSize = min(16384, lenUncomp - lenRead)
                                readBytes = gf.read(size=readSize)
                                lenRead += len(readBytes)
                                ungzippedBytes += readBytes
                                if len(readBytes) == 0 or (lenUncomp - lenRead) <= 0:
                                    break
                    except IOError as err:
                        pass # provide error message later
                    i += lenCompr + 4
                #for learning the content of xfd file, uncomment this:
                #with open("c:\\temp\\test.xml", "wb") as fh:
                #    fh.write(ungzippedBytes)
                file = io.StringIO(initial_value=ungzippedBytes.decode("utf-8"))
            else:
                # position to start of file
                file.seek(0, io.SEEK_SET)
            try:
                self.xfdDocument = etree.parse(file)
                file.close()
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isRss:
            try:
                self.rssDocument = etree.parse(self.basefile)
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isInstalledTaxonomyPackage:
            self.isOpen = True
            # load mappings
            try:
                metadataFiles = self.taxonomyPackageMetadataFiles
                if len(metadataFiles) != 1:
                    raise IOError(_("Taxonomy package must contain one and only one metadata file: {0}.")
                                  .format(', '.join(metadataFiles)))
                # HF: this won't work, see DialogOpenArchive for correct code
                # not sure if it is used
                taxonomyPackage = PackageManager.parsePackage(self.cntlr, self.url)
                fileSourceDir = os.path.dirname(self.baseurl) + os.sep
                self.mappedPaths = dict(
                    (prefix,
                     remapping if isHttpUrl(remapping)
                     else (fileSourceDir + remapping.replace("/", os.sep)))
                    for prefix, remapping in taxonomyPackage["remappings"].items())
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
def open(self):
    if not self.isOpen:
        if (self.isZip or self.isEis or self.isXfd or self.isRss) and self.cntlr:
            self.basefile = self.cntlr.webCache.getfilename(self.url)
        else:
            self.basefile = self.url
        self.baseurl = self.url # url gets changed by selection
        if not self.basefile:
            return # an error should have been logged
        if self.isZip:
            try:
                self.fs = zipfile.ZipFile(openFileStream(self.cntlr, self.basefile, 'rb'), mode="r")
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
        elif self.isEis:
            # check first line of file
            buf = b''
            try:
                file = open(self.basefile, 'rb')
                more = True
                while more:
                    l = file.read(8)
                    if len(l) < 8:
                        break
                    if len(buf) == 0 and l.startswith(b"<?xml "): # not compressed
                        buf = l + file.read() # not compressed
                        break
                    compressedBytes = file.read(struct.unpack(">L", l[0:4])[0])
                    if len(compressedBytes) <= 0:
                        break
                    buf += zlib.decompress(compressedBytes)
                file.close()
            except EnvironmentError as err:
                self.logError(err)
            #uncomment to save for debugging
            #with open("c:/temp/test.xml", "wb") as f:
            #    f.write(buf)
            if buf.startswith(b"<?xml "):
                try:
                    # must strip encoding
                    str = buf.decode(XmlUtil.encoding(buf))
                    endEncoding = str.index("?>", 0, 128)
                    if endEncoding > 0:
                        str = str[endEncoding + 2:]
                    file = io.StringIO(initial_value=str)
                    parser = etree.XMLParser(recover=True, huge_tree=True)
                    self.eisDocument = etree.parse(file, parser=parser)
                    file.close()
                    self.isOpen = True
                except EnvironmentError as err:
                    self.logError(err)
                    return # provide error message later
                except etree.LxmlError as err:
                    self.logError(err)
                    return # provide error message later
        elif self.isXfd:
            # check first line of file
            file = open(self.basefile, 'rb')
            firstline = file.readline()
            if firstline.startswith(b"application/x-xfdl;content-encoding=\"asc-gzip\""):
                # file has been gzipped
                base64input = file.read(-1)
                file.close()
                file = None
                fb = base64.b64decode(base64input)
                ungzippedBytes = b""
                totalLenUncompr = 0
                i = 0
                while i < len(fb):
                    lenCompr = fb[i + 0] * 256 + fb[i + 1]
                    lenUncomp = fb[i + 2] * 256 + fb[i + 3]
                    lenRead = 0
                    totalLenUncompr += lenUncomp
                    gzchunk = (bytes((31, 139, 8, 0)) + fb[i:i + lenCompr])
                    try:
                        with gzip.GzipFile(fileobj=io.BytesIO(gzchunk)) as gf:
                            while True:
                                readSize = min(16384, lenUncomp - lenRead)
                                readBytes = gf.read(size=readSize)
                                lenRead += len(readBytes)
                                ungzippedBytes += readBytes
                                if len(readBytes) == 0 or (lenUncomp - lenRead) <= 0:
                                    break
                    except IOError as err:
                        pass # provide error message later
                    i += lenCompr + 4
                #for learning the content of xfd file, uncomment this:
                #with open("c:\\temp\\test.xml", "wb") as fh:
                #    fh.write(ungzippedBytes)
                file = io.StringIO(initial_value=ungzippedBytes.decode("utf-8"))
            else:
                # position to start of file
                file.seek(0, io.SEEK_SET)
            try:
                self.xfdDocument = etree.parse(file)
                file.close()
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isRss:
            try:
                self.rssDocument = etree.parse(self.basefile)
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
def streamingExtensionsLoader(modelXbrl, mappedUri, filepath, *args, **kwargs):
    # check if big instance and has header with an initial incomplete tree walk (just 2 elements)
    if not _streamingExtensionsCheck:
        return None

    # track whether modelXbrl has been validated by this streaming extension
    modelXbrl._streamingExtensionValidated = False

    def logSyntaxErrors(parsercontext):
        for error in parsercontext.error_log:
            modelXbrl.error("xmlSchema:syntax",
                            _("%(error)s, %(fileName)s, line %(line)s, column %(column)s, %(sourceAction)s source element"),
                            modelObject=modelXbrl, fileName=os.path.basename(filepath),
                            error=error.message, line=error.line, column=error.column,
                            sourceAction="streaming")

    #### note: written for iterparse of lxml prior to version 3.3, otherwise rewrite to use XmlPullParser ###
    #### note: iterparse wants a binary file, but file is text mode
    _file, = modelXbrl.fileSource.file(filepath, binary=True)
    startedAt = time.time()
    modelXbrl.profileActivity()
    ''' this seems twice as slow as iterparse
    class instInfoTarget():
        def __init__(self, element_factory=None, parser=None):
            self.newTree = True
            self.streamingAspects = None
            self.foundInstance = False
            self.creationSoftwareComment = ''
            self.currentEltTag = "(before xbrli:xbrl)"
            self.numRootFacts = 0
        def start(self, tag, attrib, nsmap=None):
            if self.newTree:
                if tag == "{http://www.xbrl.org/2003/instance}xbrl":
                    self.foundInstance = True
                    self.newTree = False
                else: # break
                    raise NotInstanceDocumentException()
            elif not tag.startswith("{http://www.xbrl.org/"):
                self.numRootFacts += 1
                if self.numRootFacts % 1000 == 0:
                    modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0)
            self.currentEltTag = tag
        def end(self, tag):
            pass
        def data(self, data):
            pass
        def comment(self, text):
            if not self.foundInstance: # accumulate comments before xbrli:xbrl
                self.creationSoftwareComment += ('\n' if self.creationSoftwareComment else '') + text
            elif not self.creationSoftwareComment:
                self.creationSoftwareComment = text # or first comment after xbrli:xbrl
        def pi(self, target, data):
            if target == "xbrl-streamable-instance":
                if self.currentEltTag == "{http://www.xbrl.org/2003/instance}xbrl":
                    self.streamingAspects = dict(etree.PI(target, data).attrib.copy()) # dereference target results
                else:
                    modelXbrl.error("streamingExtensions:headerMisplaced",
                                    _("Header is misplaced: %(target)s, must follow xbrli:xbrl element but was found at %(element)s"),
                                    modelObject=modelXbrl, target=target, element=self.currentEltTag)
        def close(self):
            if not self.creationSoftwareComment:
                self.creationSoftwareComment = None
            return True
    instInfo = instInfoTarget()
    infoParser = etree.XMLParser(recover=True, huge_tree=True, target=instInfo)
    try:
        etree.parse(_file, parser=infoParser, base_url=filepath)
    except NotInstanceDocumentException:
        pass
    '''
    foundErrors = False
    foundInstance = False
    streamingAspects = None
    creationSoftwareComment = None
    instInfoNumRootFacts = 0
    numElts = 0
    elt = None
    instInfoContext = etree.iterparse(_file, events=("start", "end"), huge_tree=True)
    try:
        for event, elt in instInfoContext:
            if event == "start":
                if elt.getparent() is not None:
                    if elt.getparent().tag == "{http://www.xbrl.org/2003/instance}xbrl":
                        if not foundInstance:
                            foundInstance = True
                            pi = precedingProcessingInstruction(elt, "xbrl-streamable-instance")
                            if pi is None:
                                break
                            else:
                                streamingAspects = dict(pi.attrib.copy())
                        if creationSoftwareComment is None:
                            creationSoftwareComment = precedingComment(elt)
                        if not elt.tag.startswith("{http://www.xbrl.org/"):
                            instInfoNumRootFacts += 1
                            if instInfoNumRootFacts % 1000 == 0:
                                modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0)
                    elif not foundInstance:
                        break
                elif elt.tag == "{http://www.xbrl.org/2003/instance}xbrl":
                    creationSoftwareComment = precedingComment(elt)
                    if precedingProcessingInstruction(elt, "xbrl-streamable-instance") is not None:
                        modelXbrl.error("streamingExtensions:headerMisplaced",
                                        _("Header is misplaced: %(error)s, must follow xbrli:xbrl element"),
                                        modelObject=elt)
            elif event == "end":
                elt.clear()
                numElts += 1
                if numElts % 1000 == 0 and elt.getparent() is not None:
                    while elt.getprevious() is not None and elt.getparent() is not None:
                        del elt.getparent()[0]
    except etree.XMLSyntaxError as err:
        modelXbrl.error("xmlSchema:syntax", _("Unrecoverable error: %(error)s"), error=err)
        _file.close()
        return err
    _file.seek(0, io.SEEK_SET) # allow reparsing
    if not foundInstance or streamingAspects is None:
        del elt
        _file.close()
        return None
    modelXbrl.profileStat(_("streaming tree check"), time.time() - startedAt)
    startedAt = time.time()
    try:
        version = Decimal(streamingAspects.get("version"))
        if int(version) != 1:
            modelXbrl.error("streamingExtensions:unsupportedVersion",
                            _("Streaming version %(version)s, major version number must be 1"),
                            modelObject=elt, version=version)
            foundErrors = True
    except (InvalidOperation, OverflowError):
        modelXbrl.error("streamingExtensions:versionError",
                        _("Version %(version)s, number must be 1.n"),
                        modelObject=elt, version=streamingAspects.get("version", "(none)"))
        foundErrors = True
    for bufAspect in ("contextBuffer", "unitBuffer", "footnoteBuffer"):
        try:
            bufLimit = Decimal(streamingAspects.get(bufAspect, "INF"))
            if bufLimit < 1 or (bufLimit.is_finite() and bufLimit % 1 != 0):
                raise InvalidOperation
            elif bufAspect == "contextBuffer":
                contextBufferLimit = bufLimit
            elif bufAspect == "unitBuffer":
                unitBufferLimit = bufLimit
            elif bufAspect == "footnoteBuffer":
                footnoteBufferLimit = bufLimit
        except InvalidOperation:
            modelXbrl.error("streamingExtensions:valueError",
                            _("Streaming %(attrib)s %(value)s, number must be a positive integer or INF"),
                            modelObject=elt, attrib=bufAspect, value=streamingAspects.get(bufAspect))
            foundErrors = True
    if _streamingExtensionsValidate:
        incompatibleValidations = []
        _validateDisclosureSystem = modelXbrl.modelManager.validateDisclosureSystem
        _disclosureSystem = modelXbrl.modelManager.disclosureSystem
        if _validateDisclosureSystem and _disclosureSystem.validationType == "EFM":
            incompatibleValidations.append("EFM")
        if _validateDisclosureSystem and _disclosureSystem.validationType == "GFM":
            incompatibleValidations.append("GFM")
        if _validateDisclosureSystem and _disclosureSystem.validationType == "HMRC":
            incompatibleValidations.append("HMRC")
        if modelXbrl.modelManager.validateCalcLB:
            incompatibleValidations.append("calculation LB")
        if incompatibleValidations:
            modelXbrl.error("streamingExtensions:incompatibleValidation",
                            _("Streaming instance validation does not support %(incompatibleValidations)s validation"),
                            modelObject=modelXbrl, incompatibleValidations=', '.join(incompatibleValidations))
            foundErrors = True
    if instInfoContext.error_log:
        foundErrors = True
    logSyntaxErrors(instInfoContext)
    del instInfoContext # dereference
    for pluginMethod in pluginClassMethods("Streaming.BlockStreaming"):
        _blockingPluginName = pluginMethod(modelXbrl)
        if _blockingPluginName: # name of blocking plugin is returned
            modelXbrl.error("streamingExtensions:incompatiblePlugIn",
                            _("Streaming instance not supported by plugin %(blockingPlugin)s"),
                            modelObject=modelXbrl, blockingPlugin=_blockingPluginName)
            foundErrors = True
    if foundErrors:
        _file.close()
        return None
    _encoding = XmlUtil.encoding(_file.read(512))
    _file.seek(0, io.SEEK_SET) # allow reparsing
    if _streamingExtensionsValidate:
        validator = Validate(modelXbrl)
        instValidator = validator.instValidator
    contextBuffer = []
    contextsToDrop = []
    unitBuffer = []
    unitsToDrop = []
    footnoteBuffer = []
    footnoteLinksToDrop = []
    _streamingFactsPlugin = any(True for pluginMethod in pluginClassMethods("Streaming.Facts"))
    _streamingValidateFactsPlugin = (_streamingExtensionsValidate and
                                     any(True for pluginMethod in pluginClassMethods("Streaming.ValidateFacts")))
    ''' this is very much slower than iterparse
    class modelLoaderTarget():
        def __init__(self, element_factory=None, parser=None):
            self.newTree = True
            self.currentMdlObj = None
            self.beforeInstanceStream = True
            self.beforeStartStreamingPlugin = True
            self.numRootFacts = 1
            modelXbrl.makeelementParentModelObject = None
            modelXbrl.isStreamingMode = True
            self.factsCheckVersion = None
            self.factsCheckMd5s = Md5Sum()
        def start(self, tag, attrib, nsmap=None):
            modelXbrl.makeelementParentModelObject = self.currentMdlObj # pass parent to makeelement for ModelObjectFactory
            mdlObj = _parser.makeelement(tag, attrib=attrib, nsmap=nsmap)
            mdlObj.sourceline = 1
            if self.newTree:
                self.newTree = False
                self.currentMdlObj = mdlObj
                modelDocument = ModelDocument(modelXbrl, Type.INSTANCE, mappedUri, filepath, mdlObj.getroottree())
                modelXbrl.modelDocument = modelDocument # needed for incremental validation
                mdlObj.init(modelDocument)
                modelDocument.parser = _parser # needed for XmlUtil addChild's makeelement
                modelDocument.parserLookupName = _parserLookupName
                modelDocument.parserLookupClass = _parserLookupClass
                modelDocument.xmlRootElement = mdlObj
                modelDocument.schemaLocationElements.add(mdlObj)
                modelDocument.documentEncoding = _encoding
                modelDocument._creationSoftwareComment = creationSoftwareComment
                modelXbrl.info("streamingExtensions:streaming",
                               _("Stream processing this instance."),
                               modelObject = modelDocument)
            else:
                self.currentMdlObj.append(mdlObj)
                self.currentMdlObj = mdlObj
                mdlObj._init()
                ns = mdlObj.namespaceURI
                ln = mdlObj.localName
                if (self.beforeInstanceStream and (
                        (ns == XbrlConst.link and ln not in ("schemaRef", "linkbaseRef")) or
                        (ns == XbrlConst.xbrli and ln in ("context", "unit")) or
                        (ns not in (XbrlConst.link, XbrlConst.xbrli)))):
                    self.beforeInstanceStream = False
                    if _streamingExtensionsValidate:
                        instValidator.validate(modelXbrl, modelXbrl.modelManager.formulaOptions.typedParameters(modelXbrl.prefixedNamespaces))
                    else: # need default dimensions
                        ValidateXbrlDimensions.loadDimensionDefaults(modelXbrl)
                elif not self.beforeInstanceStream and self.beforeStartStreamingPlugin:
                    for pluginMethod in pluginClassMethods("Streaming.Start"):
                        pluginMethod(modelXbrl)
                    self.beforeStartStreamingPlugin = False
            return mdlObj
        def end(self, tag):
            modelDocument = modelXbrl.modelDocument
            mdlObj = self.currentMdlObj
            parentMdlObj = mdlObj.getparent()
            self.currentMdlObj = parentMdlObj
            ns = mdlObj.namespaceURI
            ln = mdlObj.localName
            if ns == XbrlConst.xbrli:
                if ln == "context":
                    if mdlObj.get("sticky"):
                        del mdlObj.attrib["sticky"]
                        XmlValidate.validate(modelXbrl, mdlObj)
                        modelDocument.contextDiscover(mdlObj)
                    else:
                        if _streamingExtensionsValidate and len(contextBuffer) >= contextBufferLimit:
                            # drop before adding as dropped may have same id as added
                            cntx = contextBuffer.pop(0)
                            if _streamingValidateFactsPlugin:
                                contextsToDrop.append(cntx)
                            else:
                                dropContext(modelXbrl, cntx)
                                del parentMdlObj[parentMdlObj.index(cntx)]
                            cntx = None
                        #>>XmlValidate.validate(modelXbrl, mdlObj)
                        #>>modelDocument.contextDiscover(mdlObj)
                        if contextBufferLimit.is_finite():
                            contextBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        contextsToCheck = (mdlObj,)
                        instValidator.checkContexts(contextsToCheck)
                        if modelXbrl.hasXDT:
                            instValidator.checkContextsDimensions(contextsToCheck)
                        del contextsToCheck # dereference
                elif ln == "unit":
                    if _streamingExtensionsValidate and len(unitBuffer) >= unitBufferLimit:
                        # drop before adding as dropped may have same id as added
                        unit = unitBuffer.pop(0)
                        if _streamingValidateFactsPlugin:
                            unitsToDrop.append(unit)
                        else:
                            dropUnit(modelXbrl, unit)
                            del parentMdlObj[parentMdlObj.index(unit)]
                        unit = None
                    #>>XmlValidate.validate(modelXbrl, mdlObj)
                    #>>modelDocument.unitDiscover(mdlObj)
                    if unitBufferLimit.is_finite():
                        unitBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        instValidator.checkUnits( (mdlObj,) )
                elif ln == "xbrl": # end of document
                    # check remaining batched facts if any
                    if _streamingValidateFactsPlugin:
                        # plugin attempts to process batch of all root facts not yet processed (not just current one)
                        # finish any final batch of facts
                        if len(modelXbrl.facts) > 0:
                            factsToCheck = modelXbrl.facts.copy()
                            factsHaveBeenProcessed = True
                            # can block facts deletion if required data not yet available, such as numeric unit for DpmDB
                            for pluginMethod in pluginClassMethods("Streaming.ValidateFacts"):
                                if not pluginMethod(modelXbrl, factsToCheck):
                                    factsHaveBeenProcessed = False
                            if factsHaveBeenProcessed:
                                for fact in factsToCheck:
                                    dropFact(modelXbrl, fact, modelXbrl.facts)
                                    del parentMdlObj[parentMdlObj.index(fact)]
                                for cntx in contextsToDrop:
                                    dropContext(modelXbrl, cntx)
                                    del parentMdlObj[parentMdlObj.index(cntx)]
                                for unit in unitsToDrop:
                                    dropUnit(modelXbrl, unit)
                                    del parentMdlObj[parentMdlObj.index(unit)]
                                for footnoteLink in footnoteLinksToDrop:
                                    dropFootnoteLink(modelXbrl, footnoteLink)
                                    del parentMdlObj[parentMdlObj.index(footnoteLink)]
                                fact = cntx = unit = footnoteLink = None
                                del contextsToDrop[:]
                                del unitsToDrop[:]
                                del footnoteLinksToDrop[:]
                            del factsToCheck
                    # check remaining footnote refs
                    for footnoteLink in footnoteBuffer:
                        checkFootnoteHrefs(modelXbrl, footnoteLink)
                    for pluginMethod in pluginClassMethods("Streaming.Finish"):
                        pluginMethod(modelXbrl)
            elif ns == XbrlConst.link:
                if ln == "footnoteLink":
                    XmlValidate.validate(modelXbrl, mdlObj)
                    footnoteLinks = (mdlObj,)
                    modelDocument.linkbaseDiscover(footnoteLinks, inInstance=True)
                    if footnoteBufferLimit.is_finite():
                        footnoteBuffer.append(mdlObj)
                    if _streamingExtensionsValidate:
                        instValidator.checkLinks(footnoteLinks)
                        if len(footnoteBuffer) > footnoteBufferLimit:
                            # check that hrefObjects for locators were all satisfied
                            # drop before addition as dropped may have same id as added
                            footnoteLink = footnoteBuffer.pop(0)
                            checkFootnoteHrefs(modelXbrl, footnoteLink)
                            if _streamingValidateFactsPlugin:
                                footnoteLinksToDrop.append(footnoteLink)
                            else:
                                dropFootnoteLink(modelXbrl, footnoteLink)
                                del parentMdlObj[parentMdlObj.index(footnoteLink)]
                            footnoteLink = None
                    footnoteLinks = None
                elif ln in ("schemaRef", "linkbaseRef"):
                    modelDocument.discoverHref(mdlObj)
                elif not modelXbrl.skipDTS:
                    if ln in ("roleRef", "arcroleRef"):
                        modelDocument.linkbaseDiscover((mdlObj,), inInstance=True)
            elif parentMdlObj.qname == XbrlConst.qnXbrliXbrl:
                self.numRootFacts += 1
                #>>XmlValidate.validate(modelXbrl, mdlObj)
                #>>modelDocument.factDiscover(mdlObj, modelXbrl.facts)
                if self.factsCheckVersion:
                    self.factCheckFact(mdlObj)
                if _streamingExtensionsValidate or
_streamingValidateFactsPlugin: factsToCheck = (mdlObj,) # validate current fact by itself if _streamingExtensionsValidate: instValidator.checkFacts(factsToCheck) if modelXbrl.hasXDT: instValidator.checkFactsDimensions(factsToCheck) if _streamingValidateFactsPlugin: # plugin attempts to process batch of all root facts not yet processed (not just current one) # use batches of 1000 facts if len(modelXbrl.facts) > 1000: factsToCheck = modelXbrl.facts.copy() factsHaveBeenProcessed = True # can block facts deletion if required data not yet available, such as numeric unit for DpmDB for pluginMethod in pluginClassMethods("Streaming.ValidateFacts"): if not pluginMethod(modelXbrl, factsToCheck): factsHaveBeenProcessed = False if factsHaveBeenProcessed: for fact in factsToCheck: dropFact(modelXbrl, fact, modelXbrl.facts) del parentMdlObj[parentMdlObj.index(fact)] for cntx in contextsToDrop: dropContext(modelXbrl, cntx) del parentMdlObj[parentMdlObj.index(cntx)] for unit in unitsToDrop: dropUnit(modelXbrl, unit) del parentMdlObj[parentMdlObj.index(unit)] for footnoteLink in footnoteLinksToDrop: dropFootnoteLink(modelXbrl, footnoteLink) del parentMdlObj[parentMdlObj.index(footnoteLink)] fact = cntx = unit = footnoteLink = None del contextsToDrop[:] del unitsToDrop[:] del footnoteLinksToDrop[:] del factsToCheck # dereference fact or batch of facts else: dropFact(modelXbrl, mdlObj, modelXbrl.facts) # single fact has been processed del parentMdlObj[parentMdlObj.index(mdlObj)] if self.numRootFacts % 1000 == 0: pass #modelXbrl.profileActivity("... streaming fact {0} of {1} {2:.2f}%".format(self.numRootFacts, instInfoNumRootFacts, # 100.0 * self.numRootFacts / instInfoNumRootFacts), # minTimeToShow=20.0) gc.collect() sys.stdout.write ("\rAt fact {} of {} mem {}".format(self.numRootFacts, instInfoNumRootFacts, modelXbrl.modelManager.cntlr.memoryUsed)) return mdlObj def data(self, data): self.currentMdlObj.text = data def comment(self, text): pass def pi(self, target, data): if target == "xbrl-facts-check": _match = re.search("([\\w-]+)=[\"']([^\"']+)[\"']", data) if _match: _matchGroups = _match.groups() if len(_matchGroups) == 2: if _matchGroups[0] == "version": self.factsCheckVersion = _matchGroups[1] elif _matchGroups[0] == "sum-of-fact-md5s": try: expectedMd5 = Md5Sum(_matchGroups[1]) if self.factsCheckMd5s != expectedMd5: modelXbrl.warning("streamingExtensions:xbrlFactsCheckWarning", _("XBRL facts sum of md5s expected %(expectedMd5)s not matched to actual sum %(actualMd5Sum)s"), modelObject=modelXbrl, expectedMd5=expectedMd5, actualMd5Sum=self.factsCheckMd5s) else: modelXbrl.info("info", _("Successful XBRL facts sum of md5s."), modelObject=modelXbrl) except ValueError: modelXbrl.error("streamingExtensions:xbrlFactsCheckError", _("Invalid sum-of-md5s %(sumOfMd5)s"), modelObject=modelXbrl, sumOfMd5=_matchGroups[1]) def close(self): del modelXbrl.makeelementParentModelObject return None def factCheckFact(self, fact): self.factsCheckMd5s += fact.md5sum for _tupleFact in fact.modelTupleFacts: self.factCheckFact(_tupleFact) _parser, _parserLookupName, _parserLookupClass = parser(modelXbrl, filepath, target=modelLoaderTarget()) etree.parse(_file, parser=_parser, base_url=filepath) logSyntaxErrors(_parser) ''' # replace modelLoaderTarget with iterparse (as it now supports CustomElementClassLookup) streamingParserContext = etree.iterparse(_file, events=("start", "end"), huge_tree=True) from arelle.ModelObjectFactory import setParserElementClassLookup modelXbrl.isStreamingMode = True # must be set before setting 
element class lookup (_parser, _parserLookupName, _parserLookupClass) = setParserElementClassLookup(streamingParserContext, modelXbrl) foundInstance = False beforeInstanceStream = beforeStartStreamingPlugin = True numRootFacts = 0 factsCheckVersion = None def factCheckFact(fact): modelDocument._factsCheckMd5s += fact.md5sum for _tupleFact in fact.modelTupleFacts: factCheckFact(_tupleFact) for event, mdlObj in streamingParserContext: if event == "start": if mdlObj.tag == "{http://www.xbrl.org/2003/instance}xbrl": modelDocument = ModelDocument(modelXbrl, Type.INSTANCE, mappedUri, filepath, mdlObj.getroottree()) modelXbrl.modelDocument = modelDocument # needed for incremental validation mdlObj.init(modelDocument) modelDocument.parser = _parser # needed for XmlUtil addChild's makeelement modelDocument.parserLookupName = _parserLookupName modelDocument.parserLookupClass = _parserLookupClass modelDocument.xmlRootElement = mdlObj modelDocument.schemaLocationElements.add(mdlObj) modelDocument.documentEncoding = _encoding modelDocument._creationSoftwareComment = precedingComment( mdlObj) modelDocument._factsCheckMd5s = Md5Sum() modelXbrl.info("streamingExtensions:streaming", _("Stream processing this instance."), modelObject=modelDocument) elif mdlObj.getparent() is not None: mdlObj._init() # requires discovery as part of start elements if mdlObj.getparent( ).tag == "{http://www.xbrl.org/2003/instance}xbrl": if not foundInstance: foundInstance = True pi = precedingProcessingInstruction( mdlObj, "xbrl-facts-check") if pi is not None: factsCheckVersion = pi.attrib.get("version", None) elif not foundInstance: break ns = mdlObj.qname.namespaceURI ln = mdlObj.qname.localName if beforeInstanceStream: if ((ns == XbrlConst.link and ln not in ("schemaRef", "linkbaseRef")) or (ns == XbrlConst.xbrli and ln in ("context", "unit")) or (ns not in (XbrlConst.link, XbrlConst.xbrli))): beforeInstanceStream = False if _streamingExtensionsValidate: instValidator.validate( modelXbrl, modelXbrl.modelManager.formulaOptions. 
typedParameters(modelXbrl.prefixedNamespaces)) else: # need default dimensions ValidateXbrlDimensions.loadDimensionDefaults( modelXbrl) elif not beforeInstanceStream and beforeStartStreamingPlugin: for pluginMethod in pluginClassMethods("Streaming.Start"): pluginMethod(modelXbrl) beforeStartStreamingPlugin = False elif event == "end": parentMdlObj = mdlObj.getparent() ns = mdlObj.namespaceURI ln = mdlObj.localName if ns == XbrlConst.xbrli: if ln == "context": if mdlObj.get("sticky"): del mdlObj.attrib["sticky"] XmlValidate.validate(modelXbrl, mdlObj) modelDocument.contextDiscover(mdlObj) else: if len(contextBuffer) >= contextBufferLimit: # drop before adding as dropped may have same id as added cntx = contextBuffer.pop(0) if _streamingFactsPlugin or _streamingValidateFactsPlugin: contextsToDrop.append(cntx) else: dropContext(modelXbrl, cntx) #>>del parentMdlObj[parentMdlObj.index(cntx)] cntx = None XmlValidate.validate(modelXbrl, mdlObj) modelDocument.contextDiscover(mdlObj) if contextBufferLimit.is_finite(): contextBuffer.append(mdlObj) if _streamingExtensionsValidate: contextsToCheck = (mdlObj, ) instValidator.checkContexts(contextsToCheck) if modelXbrl.hasXDT: instValidator.checkContextsDimensions( contextsToCheck) del contextsToCheck # dereference elif ln == "unit": if len(unitBuffer) >= unitBufferLimit: # drop before additing as dropped may have same id as added unit = unitBuffer.pop(0) if _streamingFactsPlugin or _streamingValidateFactsPlugin: unitsToDrop.append(unit) else: dropUnit(modelXbrl, unit) #>>del parentMdlObj[parentMdlObj.index(unit)] unit = None XmlValidate.validate(modelXbrl, mdlObj) modelDocument.unitDiscover(mdlObj) if unitBufferLimit.is_finite(): unitBuffer.append(mdlObj) if _streamingExtensionsValidate: instValidator.checkUnits((mdlObj, )) elif ln == "xbrl": # end of document # check remaining batched facts if any if _streamingFactsPlugin or _streamingValidateFactsPlugin: # plugin attempts to process batch of all root facts not yet processed (not just current one) # finish any final batch of facts if len(modelXbrl.facts) > 0: factsToCheck = modelXbrl.facts.copy() # can block facts deletion if required data not yet available, such as numeric unit for DpmDB if _streamingValidateFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.ValidateFacts"): pluginMethod(instValidator, factsToCheck) if _streamingFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.Facts"): pluginMethod(modelXbrl, factsToCheck) for fact in factsToCheck: dropFact(modelXbrl, fact, modelXbrl.facts) #>>del parentMdlObj[parentMdlObj.index(fact)] for cntx in contextsToDrop: dropContext(modelXbrl, cntx) #>>del parentMdlObj[parentMdlObj.index(cntx)] for unit in unitsToDrop: dropUnit(modelXbrl, unit) #>>del parentMdlObj[parentMdlObj.index(unit)] for footnoteLink in footnoteLinksToDrop: dropFootnoteLink(modelXbrl, footnoteLink) #>>del parentMdlObj[parentMdlObj.index(footnoteLink)] fact = cntx = unit = footnoteLink = None del contextsToDrop[:] del unitsToDrop[:] del footnoteLinksToDrop[:] del factsToCheck # check remaining footnote refs for footnoteLink in footnoteBuffer: checkFootnoteHrefs(modelXbrl, footnoteLink) pi = childProcessingInstruction(mdlObj, "xbrl-facts-check", reversed=True) if pi is not None: # attrib is in .text, not attrib, no idea why!!! 
_match = re.search("([\\w-]+)=[\"']([^\"']+)[\"']", pi.text) if _match: _matchGroups = _match.groups() if len(_matchGroups) == 2: if _matchGroups[0] == "sum-of-fact-md5s": try: expectedMd5 = Md5Sum(_matchGroups[1]) if modelDocument._factsCheckMd5s != expectedMd5: modelXbrl.warning( "streamingExtensions:xbrlFactsCheckWarning", _("XBRL facts sum of md5s expected %(expectedMd5)s not matched to actual sum %(actualMd5Sum)s" ), modelObject=modelXbrl, expectedMd5=expectedMd5, actualMd5Sum=modelDocument. _factsCheckMd5s) else: modelXbrl.info( "info", _("Successful XBRL facts sum of md5s." ), modelObject=modelXbrl) except ValueError: modelXbrl.error( "streamingExtensions:xbrlFactsCheckError", _("Invalid sum-of-md5s %(sumOfMd5)s" ), modelObject=modelXbrl, sumOfMd5=_matchGroups[1]) if _streamingValidateFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.ValidateFinish"): pluginMethod(instValidator) if _streamingFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.Finish"): pluginMethod(modelXbrl) elif ns == XbrlConst.link: if ln in ("schemaRef", "linkbaseRef"): modelDocument.discoverHref( mdlObj, urlRewritePluginClass= "ModelDocument.InstanceSchemaRefRewriter") elif ln in ("roleRef", "arcroleRef"): modelDocument.linkbaseDiscover((mdlObj, ), inInstance=True) elif ln == "footnoteLink": XmlValidate.validate(modelXbrl, mdlObj) footnoteLinks = (mdlObj, ) modelDocument.linkbaseDiscover(footnoteLinks, inInstance=True) if footnoteBufferLimit.is_finite(): footnoteBuffer.append(mdlObj) if _streamingExtensionsValidate: instValidator.checkLinks(footnoteLinks) if len(footnoteBuffer) > footnoteBufferLimit: # check that hrefObjects for locators were all satisfied # drop before addition as dropped may have same id as added footnoteLink = footnoteBuffer.pop(0) checkFootnoteHrefs(modelXbrl, footnoteLink) if _streamingValidateFactsPlugin: footnoteLinksToDrop.append(footnoteLink) else: dropFootnoteLink(modelXbrl, footnoteLink) #>>del parentMdlObj[parentMdlObj.index(footnoteLink)] footnoteLink = None footnoteLinks = None elif parentMdlObj.qname == XbrlConst.qnXbrliXbrl and isinstance( mdlObj, ModelFact): numRootFacts += 1 XmlValidate.validate(modelXbrl, mdlObj) modelDocument.factDiscover(mdlObj, modelXbrl.facts) if factsCheckVersion: factCheckFact(mdlObj) if _streamingExtensionsValidate or _streamingFactsPlugin or _streamingValidateFactsPlugin: factsToCheck = (mdlObj, ) # validate current fact by itself if _streamingExtensionsValidate: instValidator.checkFacts(factsToCheck) if modelXbrl.hasXDT: instValidator.checkFactsDimensions(factsToCheck) if _streamingFactsPlugin or _streamingValidateFactsPlugin: # plugin attempts to process batch of all root facts not yet processed (not just current one) # use batches of 1000 facts if len(modelXbrl.facts) > 1000: factsToCheck = modelXbrl.facts.copy() # can block facts deletion if required data not yet available, such as numeric unit for DpmDB if _streamingValidateFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.ValidateFacts"): pluginMethod(instValidator, factsToCheck) if _streamingFactsPlugin: for pluginMethod in pluginClassMethods( "Streaming.Facts"): pluginMethod(modelXbrl, factsToCheck) for fact in factsToCheck: dropFact(modelXbrl, fact, modelXbrl.facts) #>>del parentMdlObj[parentMdlObj.index(fact)] for cntx in contextsToDrop: dropContext(modelXbrl, cntx) #>>del parentMdlObj[parentMdlObj.index(cntx)] for unit in unitsToDrop: dropUnit(modelXbrl, unit) #>>del parentMdlObj[parentMdlObj.index(unit)] for footnoteLink in footnoteLinksToDrop: 
dropFootnoteLink(modelXbrl, footnoteLink) #>>del parentMdlObj[parentMdlObj.index(footnoteLink)] fact = cntx = unit = footnoteLink = None del contextsToDrop[:] del unitsToDrop[:] del footnoteLinksToDrop[:] del factsToCheck # dereference fact or batch of facts else: dropFact( modelXbrl, mdlObj, modelXbrl.facts) # single fact has been processed #>>del parentMdlObj[parentMdlObj.index(mdlObj)] if numRootFacts % 1000 == 0: pass #modelXbrl.profileActivity("... streaming fact {0} of {1} {2:.2f}%".format(self.numRootFacts, instInfoNumRootFacts, # 100.0 * self.numRootFacts / instInfoNumRootFacts), # minTimeToShow=20.0) #gc.collect() #sys.stdout.write ("\rAt fact {} of {} mem {}".format(numRootFacts, instInfoNumRootFacts, modelXbrl.modelManager.cntlr.memoryUsed)) if mdlObj is not None: mdlObj.clear() del _parser, _parserLookupName, _parserLookupClass if _streamingExtensionsValidate and validator is not None: _file.close() del instValidator validator.close() # track that modelXbrl has been validated by this streaming extension modelXbrl._streamingExtensionValidated = True modelXbrl.profileStat(_("streaming complete"), time.time() - startedAt) return modelXbrl.modelDocument
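# --- Illustrative sketch (not part of the plugin): how the xbrl-facts-check
# processing instruction's pseudo-attributes are matched by the loader above.
# re.search returns only the first key="value" pair, so each PI line carries a
# single pair (version, or sum-of-fact-md5s). PI texts below are hypothetical;
# the md5 value is abbreviated.
import re

_piPattern = re.compile(r"([\w-]+)=[\"']([^\"']+)[\"']")

for piText in ('version="1.0"',
               "sum-of-fact-md5s='90b2aa1e'"):
    _match = _piPattern.search(piText)
    if _match and len(_match.groups()) == 2:
        key, value = _match.groups()
        print(key, "->", value)  # version -> 1.0, sum-of-fact-md5s -> 90b2aa1e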
def open(self):
    if not self.isOpen:
        if (self.isZip or self.isEis or self.isXfd or self.isRss) and self.cntlr:
            self.basefile = self.cntlr.webCache.getfilename(self.url)
        else:
            self.basefile = self.url
        self.baseurl = self.url # url gets changed by selection
        if not self.basefile:
            return # an error should have been logged
        if self.isZip:
            self.fs = zipfile.ZipFile(self.basefile, mode="r")
            self.isOpen = True
        elif self.isEis:
            # check first line of file
            buf = b''
            try:
                file = open(self.basefile, 'rb')
                more = True
                while more:
                    l = file.read(8) # record header; first 4 bytes hold big-endian compressed length
                    if len(l) < 8:
                        break
                    if len(buf) == 0 and l.startswith(b"<?xml "):
                        buf = l + file.read() # not compressed
                        break
                    compressedBytes = file.read(struct.unpack(">L", l[0:4])[0])
                    if len(compressedBytes) <= 0:
                        break
                    buf += zlib.decompress(compressedBytes)
                file.close()
            except EnvironmentError as err:
                self.logError(err)
            #uncomment to save for debugging
            #with open("c:/temp/test.xml", "wb") as f:
            #    f.write(buf)
            if buf.startswith(b"<?xml "):
                try: # must strip encoding
                    text = buf.decode(XmlUtil.encoding(buf))
                    endEncoding = text.index("?>", 0, 128)
                    if endEncoding > 0:
                        text = text[endEncoding+2:]
                    file = io.StringIO(initial_value=text)
                    self.eisDocument = etree.parse(file)
                    file.close()
                    self.isOpen = True
                except EnvironmentError as err:
                    self.logError(err)
                    return # provide error message later
                except etree.LxmlError as err:
                    self.logError(err)
                    return # provide error message later
        elif self.isXfd:
            # check first line of file
            file = open(self.basefile, 'rb')
            firstline = file.readline()
            if firstline.startswith(b"application/x-xfdl;content-encoding=\"asc-gzip\""):
                # file has been gzipped
                base64input = file.read(-1)
                file.close()
                file = None
                fb = base64.b64decode(base64input)
                ungzippedBytes = b""
                totalLenUncompr = 0
                i = 0
                while i < len(fb):
                    # each chunk carries two 16-bit big-endian lengths (compressed, uncompressed)
                    lenCompr = fb[i + 0] * 256 + fb[i + 1]
                    lenUncomp = fb[i + 2] * 256 + fb[i + 3]
                    lenRead = 0
                    totalLenUncompr += lenUncomp
                    # prefix a minimal gzip header so GzipFile can decode the chunk
                    gzchunk = (bytes((31,139,8,0)) + fb[i:i+lenCompr])
                    try:
                        with gzip.GzipFile(fileobj=io.BytesIO(gzchunk)) as gf:
                            while True:
                                readSize = min(16384, lenUncomp - lenRead)
                                readBytes = gf.read(size=readSize)
                                lenRead += len(readBytes)
                                ungzippedBytes += readBytes
                                if len(readBytes) == 0 or (lenUncomp - lenRead) <= 0:
                                    break
                    except IOError as err:
                        pass # provide error message later
                    i += lenCompr + 4
                #for learning the content of xfd file, uncomment this:
                #with open("c:\\temp\\test.xml", "wb") as fh:
                #    fh.write(ungzippedBytes)
                file = io.StringIO(initial_value=ungzippedBytes.decode("utf-8"))
            else:
                # position to start of file
                file.seek(0, io.SEEK_SET)
            try:
                self.xfdDocument = etree.parse(file)
                file.close()
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
        elif self.isRss:
            try:
                self.rssDocument = etree.parse(self.basefile)
                self.isOpen = True
            except EnvironmentError as err:
                self.logError(err)
                return # provide error message later
            except etree.LxmlError as err:
                self.logError(err)
                return # provide error message later
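# --- Illustrative sketch (not Arelle code): the EIS record framing that the
# open() loop above decodes. The 8-byte-header / length-prefixed-zlib layout is
# inferred from the reader loop; _packEisChunks is a hypothetical inverse used
# only to make the demo self-contained (bytes 4-7 of each header are padding here).
import io, struct, zlib

def _packEisChunks(payload, chunkSize=16):
    out = b""
    for i in range(0, len(payload), chunkSize):
        compressed = zlib.compress(payload[i:i + chunkSize])
        out += struct.pack(">L", len(compressed)) + b"\0\0\0\0" + compressed
    return out

payload = b"<?xml version='1.0'?><doc>hello eis</doc>"
stream = io.BytesIO(_packEisChunks(payload))
buf = b""
while True:
    hdr = stream.read(8)  # same 8-byte header read as open()
    if len(hdr) < 8:
        break
    compressedBytes = stream.read(struct.unpack(">L", hdr[0:4])[0])
    if len(compressedBytes) <= 0:
        break
    buf += zlib.decompress(compressedBytes)
assert buf == payload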
def file(self, filepath, binary=False):
    '''
    for text, return a tuple of (open file handle, encoding)
    for binary, return a tuple of (open file handle, )
    '''
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            b = archiveFileSource.fs.read(archiveFileName.replace("\\","/"))
            if binary:
                return (io.BytesIO(b), )
            encoding = XmlUtil.encoding(b)
            return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
        elif archiveFileSource.isEis:
            for docElt in self.eisDocument.iter(tag="{http://www.sec.gov/edgar/common}document"):
                outfn = docElt.findtext("{http://www.sec.gov/edgar/common}conformedName")
                if outfn == archiveFileName:
                    b64data = docElt.findtext("{http://www.sec.gov/edgar/common}contents")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
    if binary:
        return (io.open(filepath, 'rb'), )
    # check encoding
    with open(filepath, 'rb') as fb:
        hdrBytes = fb.read(512)
        encoding = XmlUtil.encoding(hdrBytes)
        if encoding.lower() in ('utf-8','utf8'):
            text = None
        else:
            fb.seek(0)
            text = fb.read().decode(encoding)
        # allow filepath to close
    # this may not be needed for Mac or Linux, needs confirmation!!!
    if text is None: # ok to read as utf-8
        return io.open(filepath, 'rt', encoding='utf-8'), encoding
    else:
        # strip XML declaration
        xmlDeclarationMatch = XMLdeclaration.search(text)
        if xmlDeclarationMatch: # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            text = text[0:start] + text[end:]
        return (io.StringIO(initial_value=text), encoding)
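# --- Illustrative sketch (not Arelle code): the BOM check used in file() above.
# 239, 187, 191 are the three bytes of the UTF-8 byte-order mark; stripping them
# up front keeps a stray BOM from reaching consumers of the decoded text.
import codecs

b = codecs.BOM_UTF8 + b"<root/>"  # simulated archive member content
assert (b[0], b[1], b[2]) == (239, 187, 191)
if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
    b = b[3:]  # drop the three BOM bytes
assert b == b"<root/>"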
def file(self, filepath, binary=False):
    '''
    for text, return a tuple of (open file handle, encoding)
    for binary, return a tuple of (open file handle, )
    '''
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            try:
                b = archiveFileSource.fs.read(archiveFileName.replace("\\","/"))
                if binary:
                    return (io.BytesIO(b), )
                encoding = XmlUtil.encoding(b)
                return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            except KeyError:
                raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isEis:
            for docElt in self.eisDocument.iter(tag="{http://www.sec.gov/edgar/common}document"):
                outfn = docElt.findtext("{http://www.sec.gov/edgar/common}conformedName")
                if outfn == archiveFileName:
                    b64data = docElt.findtext("{http://www.sec.gov/edgar/common}contents")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
    if binary:
        return (openFileStream(self.cntlr, filepath, 'rb'), )
    else:
        return openXmlFileStream(self.cntlr, filepath)
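# --- Illustrative sketch (not Arelle code): why this revision wraps the zip
# branch in try/except KeyError. zipfile raises KeyError, not an IOError, for a
# missing member name, and file() converts that into ArchiveFileIOError.
import io, zipfile

archive = io.BytesIO()
with zipfile.ZipFile(archive, mode="w") as zf:
    zf.writestr("instance.xml", "<root/>")
with zipfile.ZipFile(archive, mode="r") as zf:
    try:
        zf.read("missing.xml")
    except KeyError as err:
        print("missing member:", err)  # file() raises ArchiveFileIOError here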
def file(self, filepath, binary=False):
    '''
    for text, return a tuple of (open file handle, encoding)
    for binary, return a tuple of (open file handle, )
    '''
    archiveFileSource = self.fileSourceContainingFilepath(filepath)
    if archiveFileSource is not None:
        if filepath.startswith(archiveFileSource.basefile):
            archiveFileName = filepath[len(archiveFileSource.basefile) + 1:]
        else: # filepath.startswith(self.baseurl)
            archiveFileName = filepath[len(archiveFileSource.baseurl) + 1:]
        if archiveFileSource.isZip:
            b = archiveFileSource.fs.read(archiveFileName.replace("\\", "/"))
            if binary:
                return (io.BytesIO(b), )
            encoding = XmlUtil.encoding(b)
            return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
        elif archiveFileSource.isEis:
            for docElt in self.eisDocument.iter(tag="{http://www.sec.gov/edgar/common}document"):
                outfn = docElt.findtext("{http://www.sec.gov/edgar/common}conformedName")
                if outfn == archiveFileName:
                    b64data = docElt.findtext("{http://www.sec.gov/edgar/common}contents")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
        elif archiveFileSource.isXfd:
            for data in archiveFileSource.xfdDocument.iter(tag="data"):
                outfn = data.findtext("filename")
                if outfn == archiveFileName:
                    b64data = data.findtext("mimedata")
                    if b64data:
                        b = base64.b64decode(b64data.encode("latin-1"))
                        # remove utf-8 BOM bytes if present
                        if len(b) > 3 and b[0] == 239 and b[1] == 187 and b[2] == 191:
                            b = b[3:]
                        if binary:
                            return (io.BytesIO(b), )
                        encoding = XmlUtil.encoding(b, default="latin-1")
                        return (io.TextIOWrapper(io.BytesIO(b), encoding=encoding), encoding)
            raise ArchiveFileIOError(self, archiveFileName)
    if binary:
        return (openFileStream(self.cntlr, filepath, 'rb'), )
    else:
        return openXmlFileStream(self.cntlr, filepath)
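# --- Illustrative sketch (not Arelle code): the XML-declaration stripping that
# openXmlFileStream performs for non-UTF-8 text. The real XMLdeclaration
# pattern is defined elsewhere in FileSource; the regex below is an assumed
# stand-in. lxml refuses str input whose declaration names an encoding, so the
# declaration is removed after decoding.
import io, re
from lxml import etree

XMLdeclaration = re.compile(r"<\?xml[^><\?]*\?>")  # assumed equivalent pattern

text = "<?xml version='1.0' encoding='ISO-8859-1'?><root/>"
match = XMLdeclaration.search(text)
if match:  # remove it for lxml
    start, end = match.span()
    text = text[0:start] + text[end:]
etree.parse(io.StringIO(text))  # now parses without the encoding complaint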
            compressedBytes = file.read(struct.unpack(">L", l[0:4])[0])
            if len(compressedBytes) <= 0:
                break
            buf += zlib.decompress(compressedBytes)
        file.close()
    except EnvironmentError as err:
        self.logError(err)
    #uncomment to save for debugging
    #with open("c:/temp/test.xml", "wb") as f:
    #    f.write(buf)
    if buf.startswith(b"<?xml "):
        try: # must strip encoding
            text = buf.decode(XmlUtil.encoding(buf))
            endEncoding = text.index("?>", 0, 128)
            if endEncoding > 0:
                text = text[endEncoding+2:]
            file = io.StringIO(initial_value=text)
            parser = etree.XMLParser(recover=True, huge_tree=True)
            self.eisDocument = etree.parse(file, parser=parser)
            file.close()
            self.isOpen = True
        except EnvironmentError as err:
            self.logError(err)
            return # provide error message later
        except etree.LxmlError as err:
            self.logError(err)
            return # provide error message later
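# --- Illustrative sketch (not Arelle code): the bounded search used above.
# index("?>", 0, 128) only inspects the first 128 characters, so a literal
# "?>" later in the document body cannot be mistaken for the end of the XML
# declaration; this path is only entered when the buffer starts with b"<?xml ".
buf = b"<?xml version='1.0' encoding='utf-8'?><doc>a ?> in text</doc>"
text = buf.decode("utf-8")
endEncoding = text.index("?>", 0, 128)
if endEncoding > 0:
    text = text[endEncoding + 2:]
assert text == "<doc>a ?> in text</doc>"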
def streamingExtensionsLoader(modelXbrl, mappedUri, filepath, **kwargs): # check if big instance and has header with an initial incomplete tree walk (just 2 elements if not _streamingExtensionsCheck: return None # track whether modelXbrl has been validated by this streaming extension modelXbrl._streamingExtensionValidated = False def logSyntaxErrors(parsercontext): for error in parsercontext.error_log: modelXbrl.error("xmlSchema:syntax", _("%(error)s, %(fileName)s, line %(line)s, column %(column)s, %(sourceAction)s source element"), modelObject=modelXbrl, fileName=os.path.basename(filepath), error=error.message, line=error.line, column=error.column, sourceAction="streaming") #### note: written for iterparse of lxml prior to version 3.3, otherwise rewrite to use XmlPullParser ### #### note: iterparse wants a binary file, but file is text mode _file, = modelXbrl.fileSource.file(filepath, binary=True) startedAt = time.time() modelXbrl.profileActivity() ''' this seems twice as slow as iterparse class instInfoTarget(): def __init__(self, element_factory=None, parser=None): self.newTree = True self.streamingAspects = None self.foundInstance = False self.creationSoftwareComment = '' self.currentEltTag = "(before xbrli:xbrl)" self.numRootFacts = 0 def start(self, tag, attrib, nsmap=None): if self.newTree: if tag == "{http://www.xbrl.org/2003/instance}xbrl": self.foundInstance = True self.newTree = False else: # break raise NotInstanceDocumentException() elif not tag.startswith("{http://www.xbrl.org/"): self.numRootFacts += 1 if self.numRootFacts % 1000 == 0: modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0) self.currentEltTag = tag def end(self, tag): pass def data(self, data): pass def comment(self, text): if not self.foundInstance: # accumulate comments before xbrli:xbrl self.creationSoftwareComment += ('\n' if self.creationSoftwareComment else '') + text elif not self.creationSoftwareComment: self.creationSoftwareComment = text # or first comment after xbrli:xbrl def pi(self, target, data): if target == "xbrl-streamable-instance": if self.currentEltTag == "{http://www.xbrl.org/2003/instance}xbrl": self.streamingAspects = dict(etree.PI(target,data).attrib.copy()) # dereference target results else: modelXbrl.error("streamingExtensions:headerMisplaced", _("Header is misplaced: %(target)s, must follow xbrli:xbrl element but was found at %(element)s"), modelObject=modelXbrl, target=target, element=self.currentEltTag) def close(self): if not self.creationSoftwareComment: self.creationSoftwareComment = None return True instInfo = instInfoTarget() infoParser = etree.XMLParser(recover=True, huge_tree=True, target=instInfo) try: etree.parse(_file, parser=infoParser, base_url=filepath) except NotInstanceDocumentException: pass ''' foundErrors = False foundInstance = False streamingAspects = None creationSoftwareComment = None instInfoNumRootFacts = 0 numElts = 0 elt = None instInfoContext = etree.iterparse(_file, events=("start","end"), huge_tree=True) for event, elt in instInfoContext: if event == "start": if elt.getparent() is not None: if elt.getparent().tag == "{http://www.xbrl.org/2003/instance}xbrl": if not foundInstance: foundInstance = True pi = precedingProcessingInstruction(elt, "xbrl-streamable-instance") if pi is None: break else: streamingAspects = dict(pi.attrib.copy()) if creationSoftwareComment is None: creationSoftwareComment = precedingComment(elt) if not elt.tag.startswith("{http://www.xbrl.org/"): instInfoNumRootFacts += 1 if instInfoNumRootFacts % 1000 == 0: 
modelXbrl.profileActivity("... streaming tree check", minTimeToShow=20.0) elif not foundInstance: break elif elt.tag == "{http://www.xbrl.org/2003/instance}xbrl": creationSoftwareComment = precedingComment(elt) if precedingProcessingInstruction(elt, "xbrl-streamable-instance") is not None: modelXbrl.error("streamingExtensions:headerMisplaced", _("Header is misplaced: %(error)s, must follow xbrli:xbrl element"), modelObject=elt) elif event == "end": elt.clear() numElts += 1 if numElts % 1000 == 0 and elt.getparent() is not None: while elt.getprevious() is not None and elt.getparent() is not None: del elt.getparent()[0] if elt is not None: elt.clear() _file.seek(0,io.SEEK_SET) # allow reparsing if not foundInstance or streamingAspects is None: del elt _file.close() return None modelXbrl.profileStat(_("streaming tree check"), time.time() - startedAt) startedAt = time.time() try: version = Decimal(streamingAspects.get("version")) if int(version) != 1: modelXbrl.error("streamingExtensions:unsupportedVersion", _("Streaming version %(version)s, major version number must be 1"), modelObject=elt, version=version) foundErrors = True except (InvalidOperation, OverflowError): modelXbrl.error("streamingExtensions:versionError", _("Version %(version)s, number must be 1.n"), modelObject=elt, version=streamingAspects.get("version", "(none)")) foundErrors = True for bufAspect in ("contextBuffer", "unitBuffer", "footnoteBuffer"): try: bufLimit = Decimal(streamingAspects.get(bufAspect, "INF")) if bufLimit < 1 or (bufLimit.is_finite() and bufLimit % 1 != 0): raise InvalidOperation elif bufAspect == "contextBuffer": contextBufferLimit = bufLimit elif bufAspect == "unitBuffer": unitBufferLimit = bufLimit elif bufAspect == "footnoteBuffer": footnoteBufferLimit = bufLimit except InvalidOperation: modelXbrl.error("streamingExtensions:valueError", _("Streaming %(attrib)s %(value)s, number must be a positive integer or INF"), modelObject=elt, attrib=bufAspect, value=streamingAspects.get(bufAspect)) foundErrors = True if _streamingExtensionsValidate: incompatibleValidations = [] _validateDisclosureSystem = modelXbrl.modelManager.validateDisclosureSystem _disclosureSystem = modelXbrl.modelManager.disclosureSystem if _validateDisclosureSystem and _disclosureSystem.EFM: incompatibleValidations.append("EFM") if _validateDisclosureSystem and _disclosureSystem.GFM: incompatibleValidations.append("GFM") if _validateDisclosureSystem and _disclosureSystem.EBA: incompatibleValidations.append("EBA") if _validateDisclosureSystem and _disclosureSystem.HMRC: incompatibleValidations.append("EBA") if modelXbrl.modelManager.validateCalcLB: incompatibleValidations.append("calculation LB") if incompatibleValidations: modelXbrl.error("streamingExtensions:incompatibleValidation", _("Streaming instance validation does not support %(incompatibleValidations)s validation"), modelObject=modelXbrl, incompatibleValidations=', '.join(incompatibleValidations)) foundErrors = True if instInfoContext.error_log: foundErrors = True logSyntaxErrors(instInfoContext) del instInfoContext # dereference for pluginMethod in pluginClassMethods("Streaming.BlockStreaming"): _blockingPluginName = pluginMethod(modelXbrl) if _blockingPluginName: # name of blocking plugin is returned modelXbrl.error("streamingExtensions:incompatiblePlugIn", _("Streaming instance not supported by plugin %(blockingPlugin)s"), modelObject=modelXbrl, blockingPlugin=_blockingPluginName) foundErrors = True if foundErrors: _file.close() return None _encoding = 
XmlUtil.encoding(_file.read(512)) _file.seek(0,io.SEEK_SET) # allow reparsing if _streamingExtensionsValidate: validator = Validate(modelXbrl) instValidator = validator.instValidator eltMdlObjs = {} contextBuffer = [] unitBuffer = [] footnoteBuffer = [] factBuffer = [] numFacts = 1 _streamingValidateFactsPlugin = any(True for pluginMethod in pluginClassMethods("Streaming.ValidateFacts")) class modelLoaderTarget(): def __init__(self, element_factory=None, parser=None): self.newTree = True self.currentMdlObj = None self.beforeInstanceStream = True self.beforeStartStreamingPlugin = True self.numRootFacts = 1 modelXbrl.streamingParentModelObject = None modelXbrl.isStreamingMode = True def start(self, tag, attrib, nsmap=None): modelXbrl.streamingParentModelObject = self.currentMdlObj # pass parent to makeelement for ModelObjectFactory mdlObj = _parser.makeelement(tag, attrib=attrib, nsmap=nsmap) mdlObj.sourceline = 1 if self.newTree: self.newTree = False self.currentMdlObj = mdlObj modelDocument = ModelDocument(modelXbrl, Type.INSTANCE, mappedUri, filepath, mdlObj.getroottree()) modelXbrl.modelDocument = modelDocument # needed for incremental validation mdlObj.init(modelDocument) modelDocument.parser = _parser # needed for XmlUtil addChild's makeelement modelDocument.parserLookupName = _parserLookupName modelDocument.parserLookupClass = _parserLookupClass modelDocument.xmlRootElement = mdlObj modelDocument.schemaLocationElements.add(mdlObj) modelDocument.documentEncoding = _encoding modelDocument._creationSoftwareComment = creationSoftwareComment modelXbrl.info("streamingExtensions:streaming", _("Stream processing this instance."), modelObject = modelDocument) else: self.currentMdlObj.append(mdlObj) self.currentMdlObj = mdlObj mdlObj._init() ns = mdlObj.namespaceURI ln = mdlObj.localName if (self.beforeInstanceStream and ( (ns == XbrlConst.link and ln not in ("schemaRef", "linkbaseRef")) or (ns == XbrlConst.xbrli and ln in ("context", "unit")) or (ns not in (XbrlConst.link, XbrlConst.xbrli)))): self.beforeInstanceStream = False if _streamingExtensionsValidate: instValidator.validate(modelXbrl, modelXbrl.modelManager.formulaOptions.typedParameters()) else: # need default dimensions ValidateXbrlDimensions.loadDimensionDefaults(modelXbrl) elif not self.beforeInstanceStream and self.beforeStartStreamingPlugin: for pluginMethod in pluginClassMethods("Streaming.Start"): pluginMethod(modelXbrl) self.beforeStartStreamingPlugin = False return mdlObj def end(self, tag): modelDocument = modelXbrl.modelDocument mdlObj = self.currentMdlObj parentMdlObj = mdlObj.getparent() self.currentMdlObj = parentMdlObj ns = mdlObj.namespaceURI ln = mdlObj.localName if ns == XbrlConst.xbrli: if ln == "context": if mdlObj.get("sticky"): del mdlObj.attrib["sticky"] XmlValidate.validate(modelXbrl, mdlObj) modelDocument.contextDiscover(mdlObj) else: if _streamingExtensionsValidate and len(contextBuffer) >= contextBufferLimit: # drop before adding as dropped may have same id as added cntx = contextBuffer.pop(0) dropContext(modelXbrl, cntx) del parentMdlObj[parentMdlObj.index(cntx)] cntx = None XmlValidate.validate(modelXbrl, mdlObj) modelDocument.contextDiscover(mdlObj) if contextBufferLimit.is_finite(): contextBuffer.append(mdlObj) if _streamingExtensionsValidate: contextsToCheck = (mdlObj,) instValidator.checkContexts(contextsToCheck) if modelXbrl.hasXDT: instValidator.checkContextsDimensions(contextsToCheck) del contextsToCheck # dereference elif ln == "unit": if _streamingExtensionsValidate and len(unitBuffer) >= 
unitBufferLimit: # drop before additing as dropped may have same id as added unit = unitBuffer.pop(0) dropUnit(modelXbrl, unit) del parentMdlObj[parentMdlObj.index(unit)] unit = None XmlValidate.validate(modelXbrl, mdlObj) modelDocument.unitDiscover(mdlObj) if unitBufferLimit.is_finite(): unitBuffer.append(mdlObj) if _streamingExtensionsValidate: instValidator.checkUnits( (mdlObj,) ) elif ln == "xbrl": # end of document # check remaining footnote refs for footnoteLink in footnoteBuffer: checkFootnoteHrefs(modelXbrl, footnoteLink) for pluginMethod in pluginClassMethods("Streaming.Finish"): pluginMethod(modelXbrl) elif ns == XbrlConst.link: if ln == "footnoteLink": XmlValidate.validate(modelXbrl, mdlObj) footnoteLinks = (mdlObj,) modelDocument.linkbaseDiscover(footnoteLinks, inInstance=True) if footnoteBufferLimit.is_finite(): footnoteBuffer.append(mdlObj) if _streamingExtensionsValidate: instValidator.checkLinks(footnoteLinks) if len(footnoteBuffer) > footnoteBufferLimit: # check that hrefObjects for locators were all satisfied # drop before addition as dropped may have same id as added footnoteLink = footnoteBuffer.pop(0) checkFootnoteHrefs(modelXbrl, footnoteLink) dropFootnoteLink(modelXbrl, footnoteLink) del parentMdlObj[parentMdlObj.index(footnoteLink)] footnoteLink = None footnoteLinks = None elif ln in ("schemaRef", "linkbaseRef"): modelDocument.discoverHref(mdlObj) elif not modelXbrl.skipDTS: if ln in ("roleRef", "arcroleRef"): modelDocument.linkbaseDiscover((mdlObj,), inInstance=True) elif parentMdlObj.qname == XbrlConst.qnXbrliXbrl: self.numRootFacts += 1 XmlValidate.validate(modelXbrl, mdlObj) modelDocument.factDiscover(mdlObj, modelXbrl.facts) if _streamingExtensionsValidate or _streamingValidateFactsPlugin: factsToCheck = (mdlObj,) # validate current fact by itself if _streamingExtensionsValidate: instValidator.checkFacts(factsToCheck) if modelXbrl.hasXDT: instValidator.checkFactsDimensions(factsToCheck) if _streamingValidateFactsPlugin: # plugin attempts to process batch of all root facts not yet processed (not just current one) factsToCheck = modelXbrl.facts.copy() factsHaveBeenProcessed = True # can block facts deletion if required data not yet available, such as numeric unit for DpmDB for pluginMethod in pluginClassMethods("Streaming.ValidateFacts"): if not pluginMethod(modelXbrl, factsToCheck): factsHaveBeenProcessed = False if factsHaveBeenProcessed: for fact in factsToCheck: dropFact(modelXbrl, fact, modelXbrl.facts) del parentMdlObj[parentMdlObj.index(fact)] else: dropFact(modelXbrl, mdlObj, modelXbrl.facts) # single fact has been processed del parentMdlObj[parentMdlObj.index(mdlObj)] del factsToCheck # dereference fact or batch of facts if self.numRootFacts % 1000 == 0: modelXbrl.profileActivity("... 
streaming fact {0} of {1} {2:.2f}%".format(self.numRootFacts, instInfoNumRootFacts, 100.0 * self.numRootFacts / instInfoNumRootFacts), minTimeToShow=20.0) return mdlObj def data(self, data): self.currentMdlObj.text = data def comment(self, text): pass def pi(self, target, data): pass def close(self): del modelXbrl.streamingParentModelObject return None _parser, _parserLookupName, _parserLookupClass = parser(modelXbrl, filepath, target=modelLoaderTarget()) etree.parse(_file, parser=_parser, base_url=filepath) logSyntaxErrors(_parser) if _streamingExtensionsValidate and validator is not None: _file.close() del instValidator validator.close() # track that modelXbrl has been validated by this streaming extension modelXbrl._streamingExtensionValidated = True modelXbrl.profileStat(_("streaming complete"), time.time() - startedAt) return modelXbrl.modelDocument
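# --- Illustrative sketch (not Arelle code): the buffer-limit parsing shared by
# both loader versions above. Decimal("INF") compares greater than any finite
# buffer count, so an absent attribute means "never drop from the buffer", and
# the short-circuit on is_finite() avoids the InvalidOperation that INF % 1
# would raise.
from decimal import Decimal, InvalidOperation

for raw in ("3", "INF", "0", "2.5", "oops"):
    try:
        bufLimit = Decimal(raw)
        if bufLimit < 1 or (bufLimit.is_finite() and bufLimit % 1 != 0):
            raise InvalidOperation
        print(raw, "-> ok, buffer limit", bufLimit)
    except InvalidOperation:
        print(raw, "-> rejected: must be a positive integer or INF")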