def to_xml(self, value, param_name):
    """ Builds an XML element named `param_name` holding one `item` child
    for each entry of `value`.

    :param value: Iterable of values to serialise
    :param param_name: Tag name of the wrapping element
    :return: The wrapping element
    """
    root = Element(param_name)
    for entry in value:
        # First add an empty <item>, then assign the entry to the last
        # `item` child of the wrapper, i.e. the one appended just now.
        root.append(Element('item'))
        root.item[-1] = entry
    return root
def add_document(self, url, title, categories, published_time, content,
                 author=None, topics=None, links=None, terms=None,
                 document_id=None):
    """Build a ``document`` element and append it to the corpus.

    A document whose URL is already in ``self.url_indices`` is logged and
    skipped. When ``document_id`` is missing or empty, the ID is the last
    six hex digits of the MD5 of the unicodified, UTF-8 encoded title.

    :param url: Mandatory, non-empty URL of the document
    :param title: Title; passed through ``Corpus.unicodify``
    :param categories: Assigned as ``categories/category`` when truthy
    :param published_time: Stored as-is
    :param content: Iterable of paragraphs; falsy entries are dropped and
        the rest are passed through ``Corpus.unicodify``
    :param author: Stored as-is
    :param topics: Assigned as ``topics/topic`` when truthy
    :param links: Assigned as ``links/link`` when truthy
    :param terms: Mapping of term -> iterable of ``(begin, end)`` pairs
    :param document_id: Explicit ID; derived from the title if None/empty
    :raises KeyError: If ``url`` is None or empty
    """
    if url is None or len(url) == 0:
        raise KeyError("'url' is mandatory")
    if url in self.url_indices:
        log.info(f"Ignoring duplicate URL={url}")
        return

    def build_location(span):
        # One <location> carrying the begin/end pair of a single occurrence.
        node = Element("location")
        node.begin, node.end = span
        return node

    def build_term(word):
        # One <term> with its word and all of its <location> children.
        node = Element("term")
        node.word = word
        node.locations = Element("locations")
        node.locations.location = [
            build_location(span) for span in terms[word]
        ]
        return node

    doc = Element("document")
    title = Corpus.unicodify(title)

    # NOTE: the attribute assignments below are kept in the original order;
    # presumably that order determines the order of the child elements.
    if document_id is None or len(document_id) == 0:
        doc.document_id = md5(title.encode("utf-8")).hexdigest()[-6:]
    else:
        doc.document_id = document_id
    doc.url = url
    doc.title = title
    doc.author = author
    doc.published_time = published_time

    # List-valued fields: the container element is always created, its
    # children only when there is something to put in it.
    doc.categories = Element("categories")
    if categories:
        doc.categories.category = categories

    doc.topics = Element("topics")
    if topics:
        doc.topics.topic = topics

    doc.links = Element("links")
    if links:
        doc.links.link = links

    doc.content = Element("content")
    if content:
        doc.content.p = [
            Corpus.unicodify(paragraph)
            for paragraph in content
            if paragraph
        ]

    # Terms together with the locations at which they occur
    doc.terms = Element("terms")
    if terms:
        doc.terms.term = [build_term(word) for word in terms]

    self.corpus.append(doc)
    self.url_indices.append(url)
def to_xml(self, value, param_name):
    """ Builds an XML element named `param_name` holding one `dict` child,
    produced by `self.get_xml_dict`, for each dictionary in `value`.

    :param value: Iterable of dictionaries to serialise
    :param param_name: Tag name of the wrapping element
    :return: The wrapping element
    """
    root = Element(param_name)
    for mapping in value:
        dict_elem = self.get_xml_dict(mapping, 'dict')
        root.append(dict_elem)
    return root
def get_xml_dict(self, _dict, name):
    """ Serialises a dictionary into an XML element called `name` with one
    `item` child per entry, each made of a `key` and a `value` element.

    :param _dict: Dictionary to serialise
    :param name: Tag name of the returned element
    :return: The element holding all the items
    """
    root = Element(name)
    for dict_key, dict_value in _dict.items():
        entry = Element('item')

        # Create the empty <key> and <value> children first ..
        entry.key = Element('key')
        entry.value = Element('value')

        # .. and then assign the actual data to the last child of each name.
        entry.key[-1] = dict_key
        entry.value[-1] = dict_value

        root.append(entry)
    return root
def copyNode(node, children=False, parent=False):
    """ Create a new element with the tag and attributes of an XML node

    The new element is always created with the TEI namespace as its default
    namespace.

    :param node: Etree Node to copy
    :param children: If True, also copy the node's text and the children \
    yielded by ``xmliter(node)``
    :param parent: If given, the new element is created as a SubElement of it
    :return: New Element
    """
    tei_nsmap = {None: "http://www.tei-c.org/ns/1.0"}

    # False, not None, is the "no parent" marker
    if parent is False:
        element = Element(node.tag, attrib=node.attrib, nsmap=tei_nsmap)
    else:
        element = SubElement(
            parent, node.tag, attrib=node.attrib, nsmap=tei_nsmap
        )

    if not children:
        return element

    if node.text:
        element._setText(node.text)
    for child in xmliter(node):
        element.append(copy(child))

    return element
def __init__(self, xml_input=None, annotations=None):
    """Set up an empty corpus and optionally populate it from disk.

    :param xml_input: Optional path. A file is loaded with
        ``read_from_xml``; any other existing path with
        ``read_from_folder``.
    :param annotations: Optional object whose ``documents`` attribute is
        kept as ``self.annotations``
    :raises FileNotFoundError: If ``xml_input`` is given but does not exist
    """
    super().__init__("corpus", "document")

    self.corpus = Element("corpus")
    self.url_indices = []
    self.has_terms_locations = False

    # English stanza pipeline: "gum" for every processor except NER.
    processors = {
        "tokenize": "gum",
        "ner": "default",
        "lemma": "gum",
        "pos": "gum",
        "depparse": "gum",
    }
    self.nlp = stanza.Pipeline(
        "en",
        processors=processors,
        verbose=False,
        tokenize_no_ssplit=True,
    )

    if annotations:
        self.annotations = annotations.documents
    else:
        self.annotations = None

    # Nothing to load - start with an empty corpus
    if not xml_input:
        return

    if not os.path.exists(xml_input):
        raise FileNotFoundError(
            f"{xml_input} not found. Check the path again.")

    if os.path.isfile(xml_input):
        self.read_from_xml(xml_input)
    else:
        self.read_from_folder(xml_input)
def getvalue(self, serialize=True):
    """ Builds the response out of the payload's attributes and returns it
    either serialized to a string (XML or JSON) or, if `serialize` is False,
    as the underlying object (an XML element or a dictionary).
    """
    # What to return should there be no output at all
    if self.zato_is_xml:
        value = Element('item_list' if self.zato_output_repeated else 'item')
    else:
        value = [] if self.zato_output_repeated else {}

    if self.zato_output_repeated:
        rows = self.zato_output
    else:
        # A single row built out of those declared attributes that
        # actually exist on self.
        attr_names = set(dir(self)) & self.zato_all_attrs
        rows = [{attr: getattr(self, attr) for attr in attr_names}]

    if rows:

        # All elements must be of the same type so looking at the first one only is enough
        is_sa_namedtuple = isinstance(rows[0], KeyedTuple)

        for row in rows:
            out_item = Element('item') if self.zato_is_xml else {}

            for is_required, name in chain(self.zato_required, self.zato_optional):
                leave_as_is = isinstance(name, AsIs)
                elem_value = self._getvalue(name, row, is_sa_namedtuple, is_required, leave_as_is)

                # From now on only the plain name of the element is needed
                if isinstance(name, ForceType):
                    name = name.name

                # Byte strings are decoded as UTF-8, unicode objects are left alone
                if isinstance(elem_value, basestring) and not isinstance(elem_value, unicode):
                    elem_value = elem_value.decode('utf-8')

                if self.zato_is_xml:
                    setattr(out_item, name, elem_value)
                else:
                    out_item[name] = elem_value

            if self.zato_output_repeated:
                value.append(out_item)
            else:
                value = out_item

    if self.zato_is_xml:
        em = ElementMaker(annotate=False, namespace=self.namespace, nsmap={None: self.namespace})
        envelope = em.zato_env(em.cid(self.zato_cid), em.result(ZATO_OK))
        top = getattr(em, self.response_elem)(envelope)
        top.append(value)
    else:
        top = {self.response_elem: value}

    # NOTE(review): this runs for XML responses too, where `top` is an element
    # rather than a dictionary - confirm that is intended.
    search = self.zato_meta.get('search')
    if search:
        top['_meta'] = search

    if not serialize:
        return top

    if self.zato_is_xml:
        deannotate(top, cleanup_namespaces=True)
        return etree.tostring(top)

    return dumps(top)
def convert(self, param, param_name, value, has_simple_io_config, is_xml, date_time_format=None):
    """ Converts `value` to the type indicated by `param` or, failing that,
    by the naming conventions `param_name` follows.

    :param param: Parameter definition - its type (Boolean, CSV, List, Integer,
        Unicode, UTC) selects the conversion; anything else falls back to
        the name-based integer rules
    :param param_name: Name of the parameter, matched against the configured
        boolean prefixes, integer names and integer suffixes
    :param value: The value to convert
    :param has_simple_io_config: Whether the name-based integer rules apply
    :param is_xml: Whether List parameters are read from or written to XML
    :param date_time_format: If given, datetime values are formatted with it
    :return: The converted value. For List parameters this is a list of the
        children's texts (XML input), an XML wrapper element (XML output)
        or the value itself (JSON)
    :raises ZatoException: On any error, with the traceback in the message
    """
    try:
        is_bool_by_name = any(param_name.startswith(prefix) for prefix in self.bool_parameter_prefixes)
        if is_bool_by_name or isinstance(param, Boolean):
            # value can be an empty string and asbool chokes on that
            value = asbool(value or None)

        # Falsy values (None, '', 0, False, empty containers) are not converted
        if value:
            if isinstance(param, Boolean):
                value = asbool(value)

            elif isinstance(param, CSV):
                value = value.split(',')

            elif isinstance(param, List):
                if is_xml:

                    # We are parsing XML to create a SIO request
                    if isinstance(value, EtreeElement):
                        # getchildren() is kept on purpose - if this is an lxml.objectify
                        # element, iterating it directly would not yield its children.
                        return [elem.text for elem in value.getchildren()]

                    # We are producing XML out of an SIO response
                    wrapper = Element(param_name)
                    for item_value in value:
                        wrapper.append(Element('item'))
                        wrapper.item[-1] = item_value
                    return wrapper

                # This is a JSON list
                return value

            elif isinstance(param, Integer):
                value = int(value)

            elif isinstance(param, Unicode):
                value = unicode(value)

            elif isinstance(param, UTC):
                value = value.replace('+00:00', '')

            else:
                # No explicit type - integers may still be recognised by name
                if value != ZATO_NONE and has_simple_io_config:
                    if any(param_name == elem for elem in self.int_parameters) or \
                       any(param_name.endswith(suffix) for suffix in self.int_parameter_suffixes):
                        value = int(value)

            if date_time_format and isinstance(value, datetime):
                value = value.strftime(date_time_format)

        # An empty CSV is always an empty list, never None or ''
        if isinstance(param, CSV) and not value:
            value = []

        return value

    except Exception:
        # format_exc takes an optional traceback depth limit, not the exception,
        # and always formats the exception currently being handled.
        msg = 'Conversion error, param:[{}], param_name:[{}], repr(value):[{}], e:[{}]'.format(
            param, param_name, repr(value), format_exc())
        logger.error(msg)

        raise ZatoException(msg=msg)
def new(tag, **extra):
    """Create an element whose tag is ``qn(tag)``.

    :param tag: Tag name, passed through ``qn`` before use
    :param extra: Keyword arguments forwarded unchanged to ``Element``
    :return: The new element
    """
    qualified_tag = qn(tag)
    return Element(qualified_tag, **extra)