def construct_ziotype_data(session, log_scope, document: etree.ElementBase) -> dict:
    """Map a document XML element onto a zaakinformatieobjecttype resource dict.

    :param session: import session used for logging/statistics
    :param log_scope: prefix describing where in the import we are
    :param document: the ``<document>`` element; its ``<velden>`` child holds the fields
    :return: dict ready to be posted as a zaakinformatieobjecttype
    """
    fields = document.find("velden")
    omschrijving = trim_string(
        session,
        log_scope,
        find(fields, "naam"),
        80,
        "omschrijving",
        # Was ObjectTypenKeys.resultaattypen — a copy/paste slip: this function
        # builds zaakinformatieobjecttype data (see the "richting" field below),
        # so the truncation must be reported under the same statistics key.
        ObjectTypenKeys.zaakinformatieobjecttypen,
    )
    return {
        "informatieobjecttype_omschrijving": omschrijving,
        "volgnummer": int(document.get("volgnummer")),
        "richting": get_choice_field(
            session,
            f"{log_scope} richting",
            find(fields, "type", False),
            RichtingChoices.values,
            ObjectTypenKeys.zaakinformatieobjecttypen,
            default=DEFAULT_RICHTING,
        ),
        # TODO no mapping for non-required fields
        # "statustype": "http://example.com"
    }
def _get_info(self, link: etree.ElementBase) -> str:
    """Follow an atom-style ``link`` element and return the HTTP response body.

    Non-contract ("ncshipment") detail links require a POST with a
    Content-Type header; every other link is fetched with a plain GET.

    :param link: element carrying ``href`` and ``media-type`` attributes
    :return: whatever ``http(...)`` returns for the request
    """
    href = link.get("href")
    # A "details" link for an ncshipment needs the POST + Content-Type variant.
    is_ncdetails = all(key in href for key in ("details", "ncshipment"))
    # dict literals instead of the original dict([(k, v), ...]) construction
    # (flake8-comprehensions C406); insertion order is preserved either way.
    headers = {
        "Accept": link.get("media-type"),
        "Authorization": "Basic %s" % self.authorization,
        "Accept-language": "en-CA",
    }
    if is_ncdetails:
        headers["Content-Type"] = link.get("media-type")
    return http(
        url=href,
        headers=headers,
        method="POST" if is_ncdetails else "GET",
    )
def _cover_from_tuple(self, item: ElementBase, attributes): for attr in attributes: value = item.get(attr, None) if value is None: continue value = self.http.normalize_uri(value) test = self.http.check_url(value) if test: return value return None
def construct_statustype_data(session, log_scope, statustype: etree.ElementBase) -> dict:
    """Map a ``<statustype>`` XML element onto a statustype resource dict.

    Required fields are looked up strictly; ``naam-model`` and ``bericht``
    are optional (``find(..., False)``).
    """
    velden = statustype.find("velden")
    data = {
        "volgnummer": int(statustype.get("volgnummer")),
        "omschrijving": find(velden, "naam"),
        "omschrijvingGeneriek": find(velden, "naam-model", False),
        "statustekst": find(velden, "bericht", False),
        # "informeren": true
    }
    return data
def run_event(event_node: etree.ElementBase, project_path: str) -> None:
    """Run the shell commands attached to an event node, if any.

    Child elements accepted by ``is_command_node`` are whitespace-normalized
    and chained with ``&&`` into a single command line, executed in
    *project_path* with a copy of the current environment.
    """
    if event_node is None:
        return
    ProcessManager.log.info(event_node.get(XmlAttributeName.DESCRIPTION))
    whitespace = re.compile('[ \t\n\r]+')
    env = os.environ.copy()
    # Collapse runs of whitespace inside each command before joining.
    commands = [
        whitespace.sub(' ', child.text)
        for child in event_node
        if is_command_node(child)
    ]
    ProcessManager.run_command(' && '.join(commands), project_path, env)
def create_child(self, element: etree.ElementBase):
    """
    Finds the NodeBase child class for a child element and adds the child.

    There is no need to use this function directly. If this lookup behaviour
    has to be customized, override child_lookup(..) instead.

    :param element: the XML child element to instantiate a node for
    :raises IdNotFoundError: when a <ref> points at an unknown id
    :raises UnknownElementError: when no node class is registered for the tag
    """
    # Comments and processing instructions carry no node semantics — skip them.
    if element.tag is etree.Comment:
        return
    if element.tag is etree.PI:
        return
    # If this is a ref element: find the element it references and add it as a child.
    if element.tag == "ref":
        node_id = element.get("id")
        child = self.find_node_by_id(node_id)
        if child is None:
            raise IdNotFoundError(self.file_name, element.sourceline, node_id)
    else:
        # Check if this is a Data descriptor declared on this node class;
        # getattr_static avoids triggering the descriptor protocol.
        if isinstance(inspect.getattr_static(self, element.tag, None), Data):
            # Imported lazily — presumably to avoid a circular import; TODO confirm.
            from urban_journey.ujml.nodes.data import data
            klass = data
        else:
            # Check if the parent element knows what type the child is.
            klass = self.child_lookup(element)
        # Update the node_register if it's empty (plugins not scanned yet).
        if len(node_register) == 0:
            update_plugins()
        # Fall back to the global register when the parent had no answer.
        if klass is None:
            if element.tag in node_register:
                klass = node_register[element.tag]
            else:
                # Node type was not found anywhere.
                raise UnknownElementError(self.file_name, element.sourceline, element.tag)
        child = klass(element, self.root)
        # A freshly built node must have called super().__init__ (which sets
        # .element); flag the authoring error otherwise.
        if not hasattr(child, "element"):
            self.raise_exception(MissingSuperInitError, self.tag, element.tag)
    # Add the (found or newly constructed) child.
    self.add_child(child)
def parse_background(self, element: ElementBase) -> str:
    """Extract a background-image URI from an element's inline ``style``.

    :param element: HTML element possibly carrying a ``style`` attribute
    :return: the normalized URI, or ``''`` when CSS parsing fails with
        IndexError
    """
    style = element.get('style', None)
    value = None
    if style:
        css = make_parser(None)
        try:
            # do not touch this!
            # NOTE(review): the break levels are deliberately asymmetric —
            # a URI token inside 'background' only breaks the inner token
            # loop, so a later 'background-image' declaration can still
            # overwrite it; 'background-image' breaks the declaration loop.
            for declaration in css.parse_style_attr(style)[0]:
                if declaration.name == 'background':
                    for token in declaration.value:
                        if token.type == 'URI':
                            value = token.value
                            break
                if declaration.name == 'background-image':
                    value = declaration.value[0].value
                    break
        except IndexError:
            return ''
    # NOTE(review): when no URI was found, value is still None here —
    # assumes self.http.normalize_uri(None) is acceptable; TODO confirm.
    return self.http.normalize_uri(value)
def get_codes(cmd: ET.ElementBase, attr_name: str) -> List[str]:
    """Split the comma-separated attribute *attr_name* into a list of codes.

    A missing attribute yields an empty list; empty segments (e.g. from
    ``"a,,b"`` or a trailing comma) are dropped.
    """
    raw = cmd.get(attr_name, "")
    # filter(None, ...) discards the empty strings produced by split.
    return list(filter(None, raw.split(",")))
def construct_zaaktype_data(session, log_scope, process: etree.ElementBase, processtype_year: int) -> dict:
    """Map a ``<process>`` XML element onto a zaaktype resource dict.

    :param session: import session used for logging/statistics
    :param log_scope: prefix describing where in the import we are
    :param process: the process element; its ``<velden>`` child holds the fields
    :param processtype_year: selectielijst year used to resolve the procestype
    :return: dict ready to be posted as a zaaktype
    """
    fields = process.find("velden")
    omschrijving = trim_string(
        session,
        log_scope,
        find(fields, "kernomschrijving"),
        80,
        "omschrijving",
        ObjectTypenKeys.zaaktypen,
    )
    omschrijvingGeneriek = trim_string(
        session,
        log_scope,
        find(fields, "model-kernomschrijving", False),
        80,
        "omschrijvingGeneriek",
        ObjectTypenKeys.zaaktypen,
    )
    # BUGFIX: "zaaktype-categorie" is optional (find(..., False)), so the
    # original `.lower()` call crashed with AttributeError when the field was
    # absent. Treat a missing category as "intern".
    categorie = find(fields, "zaaktype-categorie", False) or ""
    indicatie_intern_of_extern = "extern" if "extern" in categorie.lower() else "intern"
    handeling_initiator = value_or_default(
        session,
        f"{log_scope} handelingInitiator",
        find(fields, "zaaktype-naam/structuur/handeling-initiator", False),
        DEFAULT_HANDELING_INITIATOR,
        ObjectTypenKeys.zaaktypen,
    )
    aanleiding = value_or_default(
        session,
        f"{log_scope} aanleiding",
        find(fields, "aanleiding", False),
        DEFAULT_AANLEIDING,
        ObjectTypenKeys.zaaktypen,
    )
    onderwerp = value_or_default(
        session,
        f"{log_scope} onderwerp",
        find(fields, "zaaktype-naam/structuur/onderwerp", False),
        DEFAULT_ONDERWERP,
        ObjectTypenKeys.zaaktypen,
    )
    handeling_behandelaar = value_or_default(
        session,
        f"{log_scope} handeling_behandelaar",
        find(fields, "zaaktype-naam/structuur/handeling-behandelaar", False),
        DEFAULT_HANDELING_BEHANDELAAR,
        ObjectTypenKeys.zaaktypen,
    )
    servicenorm = get_duration(
        find(fields, "afdoeningstermijn"),
        find(fields, "afdoeningstermijn-eenheid"),
    )
    doorlooptijd = get_duration(
        find(fields, "wettelijke-afdoeningstermijn", False),
        find(fields, "wettelijke-afdoeningstermijn-eenheid", False),
    )
    if not doorlooptijd:
        # Fall back to the (required) "afdoeningstermijn" when no legal term
        # was supplied, and record that substitution in the import log.
        doorlooptijd = get_duration(
            find(fields, "afdoeningstermijn"),
            find(fields, "afdoeningstermijn-eenheid"),
        )
        session.log_info(
            f'{log_scope} Used "afdoeningstermijn" ({doorlooptijd}) for "Zaaktype.doorlooptijd": Import has no value for "wettelijke-afdoeningstermijn".',
            ObjectTypenKeys.zaaktypen,
        )
    verlengings_termijn = get_duration(
        find(fields, "wettelijke-verdagingstermijn", False),
        find(fields, "wettelijke-verdagingstermijn-eenheid", False),
    )
    return {
        "identificatie": process.get("id"),
        "omschrijving": omschrijving,
        "omschrijvingGeneriek": omschrijvingGeneriek,
        "vertrouwelijkheidaanduiding": get_choice_field(
            session,
            f"{log_scope} vertrouwelijkheidaanduiding",
            find(fields, "vertrouwelijkheid", False),
            VertrouwelijkheidsAanduidingen.values,
            ObjectTypenKeys.zaaktypen,
            default=DEFAULT_VERTROUWELIJKHEID,
            required=True,
        ),
        "doel": find(fields, "naam"),
        "aanleiding": aanleiding,
        "toelichting": find(fields, "toelichting-proces", False),
        "indicatieInternOfExtern": indicatie_intern_of_extern,
        "handelingInitiator": handeling_initiator,
        "onderwerp": onderwerp,
        "handelingBehandelaar": handeling_behandelaar,
        "doorlooptijd": doorlooptijd,
        "opschortingEnAanhoudingMogelijk": get_boolean(find(fields, "aanhouden-mogelijk", False)),
        "verlengingMogelijk": bool(verlengings_termijn),
        "verlengingstermijn": verlengings_termijn,
        "trefwoorden": get_array(find(fields, "lokale-trefwoorden", False)),  # always empty?
        "publicatieIndicatie": get_boolean(find(fields, "publicatie-indicatie", False)),
        "publicatietekst": find(fields, "publicatietekst", False),
        "verantwoordingsrelatie": get_array(find(fields, "verantwoordingsrelatie", False)),  # always empty?
        "selectielijstProcestype": get_procestype(process, processtype_year),
        "referentieproces": {
            "naam": find(fields, "ztc-procestype")
        },
        # Set during `load_data`
        # "catalogus": "",
        "beginGeldigheid": session.job.start_date.isoformat(),
        "eindeGeldigheid": None,
        "versiedatum": get_date(find(fields, "actueel-van")),
        "servicenorm": servicenorm,
        "productenOfDiensten": [],
        "gerelateerdeZaaktypen": [],
        "besluittypen": [],
        # "deelzaaktypen": [],
    }
def process_xml_element(self, el: etree.ElementBase, event: str, a: Dict[str, Any]) -> None:
    """Fold one (element, event) pair from an XML parse stream into ``a``.

    Mutates the accumulator ``a`` in place and returns ``None``:
    known metadata tags copy their text into ``a['metadata']``; the tag
    matching ``a['metadata']['doc_type']`` toggles ``a['is_data']``; while
    ``is_data`` is set, element text is parsed (as JSON if possible,
    otherwise as HTML) into ``a['data']``.

    :param el: current XML element
    :param event: parse event, 'start' or 'end'
    :param a: accumulator dict with 'metadata', 'data' and 'is_data' keys
    """
    if el.tag == 'CONTENT' and event == 'end':
        a['metadata']['record_id'] = el.get('RECORDID')
    elif el.tag == 'MASTERIDENTIFER' and event == 'end':
        a['metadata']['title'] = clean_text(el.text)
    elif el.tag == 'TYPE' and event == 'end':
        a['metadata']['doc_type'] = clean_text(el.text)
    elif el.tag == 'DOCUMENTID' and event == 'end':
        a['metadata']['doc_id'] = clean_text(el.text)
    elif el.tag == 'VERSION' and event == 'end':
        a['metadata']['version'] = clean_text(el.text)
    elif el.tag == 'AUTHOR' and event == 'end':
        a['metadata']['author'] = clean_text(el.text)
    elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
        # Each *_MILLIS tag is stored twice: raw epoch millis + ISO datetime.
        millis = int(clean_text(el.text))
        a['metadata']['end_timestamp_millis'] = millis
        a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['start_timestamp_millis'] = millis
        a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['create_timestamp_millis'] = millis
        a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['last_modified_timestamp_millis'] = millis
        a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == 'RESOURCEPATH' and event == 'end':
        a['metadata']['doc_location_path'] = clean_text(el.text)
    elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['published_timestamp_millis'] = millis
        a['metadata']['published_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == a['metadata']['doc_type']:
        # Entering/leaving the document body: 'start' turns data mode on,
        # 'end' turns it off.
        a['is_data'] = (event == 'start')
    elif a['is_data'] and event == 'end' and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        try:
            # Prefer JSON payloads when the text parses as JSON.
            maybe_json = json.loads(el.text)
            structured_content.append({'type': 'json', 'json': maybe_json})
        except (JSONDecodeError, ValueError):
            extractors = [
                ListExtractor(excluded_tags=['table']),
                TableExtractor(),
                TextExtractor(excluded_tags=[
                    'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
                ]),
                HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            ]
            stream: IO[AnyStr] = BytesIO(
                fix_content(el.text).encode('utf-8'))
            for ev, elem in self.element_iterator(stream, html=True):
                process_html_element(elem, ev, extractors,
                                     structured_content, text_list)
            # re-extract content in single column tables used for layout purposes only
            html = None
            # memoize: k[i] counts how many text_list entries item i produced,
            # so slices below can map structured_content indices to text_list.
            k = []
            for i, c in enumerate(structured_content):
                typ = c['type']
                if typ in ['text', 'heading']:
                    k.append(1)
                elif typ == 'list':
                    k.append(len(c.get('items', [])))
                elif typ == 'table':
                    k.append(len(c.get('head', [])) + len(c.get('body', [])))
                    # Single-column table => layout table: re-extract its cells.
                    if len(c.get('fields', [])) == 1:
                        if not html:
                            # reset stream to reiterate
                            stream.seek(0)
                            # read stream into str and parse as html
                            html = lxml.html.fromstring(stream.read())
                        # find single column layout table contents
                        contents = html.xpath(
                            ('/descendant::table[{0}]/tbody/tr/td/*|' +
                             '/descendant::table[{0}]/tr/td/*').format(
                                 c['index']))
                        root = etree.Element('div')
                        root.extend(contents)
                        sc = []
                        tl = []
                        for evt, ele in etree.iterwalk(root, events=('start', 'end')):
                            process_html_element(ele, evt, extractors, sc, tl)
                        j = len(c.get('references', []))
                        # NOTE(review): rebinding structured_content here does
                        # not affect the ongoing enumerate(), which still walks
                        # the original list object — presumably intentional.
                        structured_content = flatten([
                            structured_content[:(i - j)], sc,
                            structured_content[(i + 1):]
                        ])
                        text_list = flatten([
                            text_list[:sum(k[:(i - j)])], tl,
                            text_list[sum(k[:(i + 1)]):]
                        ])
        data = {}
        # A single text fragment is stored as a scalar, not a 1-element list.
        if len(text_list) == 1:
            data['text'] = text_list[0]
        else:
            data['text'] = text_list
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][el.tag.lower()] = data
def process_xml_element(
    el: etree.ElementBase,
    event: str,
    accumulator: Dict[str, Any],
    excluded_html_tags: List[str],
) -> Dict[str, Any]:
    """
    Stateful, so cannot be parallelized.

    :param el: XML element
    :param event: event type [start, end]
    :param accumulator: accumulates state
    :param excluded_html_tags: XML tags to exclude
    :return: accumulated content as dict
    """
    a = deepcopy(accumulator)
    tag = el.tag
    ended = event == 'end'

    # Simple metadata tags: tag -> metadata key, copied via clean_text.
    text_fields = {
        'MASTERIDENTIFER': 'title',
        'TYPE': 'doc_type',
        'DOCUMENTID': 'doc_id',
        'VERSION': 'version',
        'AUTHOR': 'author',
        'RESOURCEPATH': 'doc_location_path',
    }
    # Timestamp tags: tag -> (raw millis key, ISO datetime key).
    millis_fields = {
        'ENDTIMESTAMP_MILLIS': ('end_timestamp_millis', 'end_time'),
        'STARTTIMESTAMP_MILLIS': ('start_timestamp_millis', 'start_time'),
        'CREATETIMESTAMP_MILLIS': ('create_timestamp_millis', 'create_time'),
        'LASTMODIFIEDTIMESTAMP_MILLIS': (
            'last_modified_timestamp_millis', 'last_modified_time'),
        'PUBLISHEDTIMESTAMP_MILLIS': (
            'published_timestamp_millis', 'published_time'),
    }

    if tag == 'CONTENT' and ended:
        a['metadata']['record_id'] = el.get('RECORDID')
    elif tag in text_fields and ended:
        a['metadata'][text_fields[tag]] = clean_text(el.text)
    elif tag in millis_fields and ended:
        # Store both the raw epoch millis and the derived ISO datetime.
        millis = int(clean_text(el.text))
        raw_key, iso_key = millis_fields[tag]
        a['metadata'][raw_key] = millis
        a['metadata'][iso_key] = get_iso_datetime_from_millis(millis)
    elif tag == a['metadata']['doc_type']:
        # Entering ('start') or leaving ('end') the document body.
        a['is_data'] = (event == 'start')
    elif a['is_data'] and ended and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        extractors = [
            ListExtractor(excluded_tags=['table']),
            TableExtractor(),
            TextExtractor(excluded_tags=[
                'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
            ]),
            HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
        ]
        stream = BytesIO(fix_content(el.text).encode('utf-8'))
        for ev, elem in element_iterator(stream, excluded_html_tags, html=True):
            structured, text = process_html_element(elem, ev, extractors)
            structured_content.extend(structured)
            text_list.extend(text)
        data = {}
        # A lone text fragment is stored as a scalar, not a 1-element list.
        data['text'] = text_list[0] if len(text_list) == 1 else text_list
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][tag.lower()] = data
    return a
def _get_attr_as_bool(node: etree.ElementBase, attribute_name: str, default_value: str = 'false') -> bool: attr: str = node.get(attribute_name, default=default_value).casefold() return any([attr == 'true', attr == '1'])