def insert_place_old_label(db, cursor, dt_id):
    """ """
    print("** TABLE place_old_label – INSERT")
    tags = re.compile('<[^>]+>')
    # TODO: load the correct DT source (not _output7.xml, dev only)
    tree = etree.parse('../../../dico-topo/data/'+dt_id+'/output7.xml')
    # tree = etree.parse('../../../dico-topo/data/'+dt_id+'/'+dt_id+'.xml')

    # HTML5 conversion of the whole forme_ancienne entry
    old_label2html = io.StringIO('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text"/>
    <xsl:template match="/">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="sup">
        <xsl:text>&lt;sup&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/sup&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="sm">
        <xsl:text>&lt;span class="sc"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/span&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="i">
        <xsl:text>&lt;i&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/i&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="i[parent::forme_ancienne]">
        <xsl:text>&lt;dfn&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/dfn&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="i[parent::reference]">
        <xsl:text>&lt;cite&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/cite&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="date">
        <xsl:text>&lt;time&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/time&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="comment">
        <xsl:text>&lt;br/&gt;</xsl:text>
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="pg"/>
</xsl:stylesheet>''')
    xslt_old_label2html = etree.parse(old_label2html)
    transform_old_label2html = etree.XSLT(xslt_old_label2html)

    # reusable typographic cleanup
    clean_start = re.compile(r'^[«—\- ]+')
    clean_end = re.compile(r'[»— .,;]+$')
    clean_markup = re.compile(',(</[^>]+>)')  # move the comma out of the markup (span, cite, dfn, …)

    # utilities to extract and clean the old forms
    # annoying: lxml's XPath support is incomplete, so we cannot drop the text
    # that follows the last <i> element with:
    # <xsl:template match="i[position()=last()]/following-sibling::text()"/>
    # Notes are dropped, notably for DT60.
    # Fixed further down with plain string processing.
    get_old_label = io.StringIO('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text"/>
    <xsl:template match="/">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="i">
        <xsl:text>&lt;dfn&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/dfn&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="reference"/>
    <xsl:template match="date"/>
    <xsl:template match="comment"/>
    <xsl:template match="pg"/>
    <xsl:template match="note"/>
</xsl:stylesheet>''')
    xslt_get_old_label = etree.parse(get_old_label)
    transform_old_label2dfn = etree.XSLT(xslt_get_old_label)

    get_old_label_rich_date = io.StringIO('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text"/>
    <xsl:template match="/forme_ancienne">
        <xsl:apply-templates select="date[1]"/>
    </xsl:template>
    <xsl:template match="date">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="sup">
        <xsl:text>&lt;sup&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/sup&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="sm">
        <xsl:text>&lt;span class="sc"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/span&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="pg"/>
</xsl:stylesheet>''')
    xslt_get_old_label_rich_date = etree.parse(get_old_label_rich_date)
    transform_old_label2rich_date = etree.XSLT(xslt_get_old_label_rich_date)

    get_old_label_rich_ref = io.StringIO('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text"/>
    <xsl:template match="/forme_ancienne">
        <xsl:apply-templates select="reference[1]"/>
    </xsl:template>
    <xsl:template match="reference">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="pg"/>
    <xsl:template match="sup">
        <xsl:text>&lt;sup&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/sup&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="i">
        <xsl:text>&lt;cite&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/cite&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="sm">
        <xsl:text>&lt;span class="sc"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/span&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="date">
        <xsl:text>&lt;time&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/time&gt;</xsl:text>
    </xsl:template>
</xsl:stylesheet>''')
    xslt_get_old_label_rich_ref = etree.parse(get_old_label_rich_ref)
    transform_old_label2rich_ref = etree.XSLT(xslt_get_old_label_rich_ref)

    i = 0
    for entry in tree.xpath('/DICTIONNAIRE/article'):
        # a dict to store the data of each article
        place = {}
        # article id
        place['id'] = entry.get('id')
        # fetch the responsibility statement attached to the creation of the place
        cursor.execute("SELECT responsibility_id FROM place WHERE place_id = ?", (place['id'],))
        responsibility_id = cursor.fetchone()[0]

        # old forms and attestations
        # up to 47 old forms for one headword, in the Aisne: `distinct-values(//article/count(forme_ancienne))`
        # some old forms without any form!!! e.g. DT02-04777
        # Keep 1 date and 1 ref per old form; the rare exceptions go into the
        # text field that will be published.
        # XML source:
        #   <forme_ancienne>Inter <i>Estran</i> et <i>Abugniez</i> et <i>Gerigniez,</i>
        #     <date>1168</date>
        #     <reference>(cart. de l’abb. de Thenailles, f<sup>os</sup> 15, 20, 36)</reference>.
        #   </forme_ancienne>
        #
        # Variables
        # * old_label_id        computed identifier of the old form
        #                       DT02-00043-03
        # * old_label_xml_str   the <forme_ancienne> element in the XML source
        #                       <forme_ancienne>Inter <i>Estran</i> et <i>Abugniez</i> et <i>Gerigniez,</i> <date>1168</date> <reference>(cart. de l’abb. de Thenailles, f<sup>os</sup> 15, 20, 36)</reference>.</forme_ancienne>
        # * old_label_html_str  the <forme_ancienne> element converted to HTML
        #                       <p>Inter <dfn>Estran</dfn> et <dfn>Abugniez</dfn> et <dfn>Gerigniez</dfn>, <time>1168</time> (cart. de l’abb. de Thenailles, f<sup>os</sup> 15, 20, 36)</p>
        # * dfn                 the form(s) of an old-form entry, in HTML
        #                       Inter <dfn>Estran</dfn> et <dfn>Abugniez</dfn> et <dfn>Gerigniez</dfn>
        # * rich_ref            the reference of an old-form entry with typographic enrichment, in HTML
        #                       cart. de l’abb. de Thenailles, f<sup>os</sup> 15, 20, 36
        # * rich_date           the date with typographic enrichment, in HTML
        #                       <span class="sc">xiii</span><sup>e</sup> siècle (cf. DT02-00048-03)
        # * date                the date without typographic enrichment
        #                       xiiie siècle (same entry)
        if entry.xpath('forme_ancienne'):
            n = 1
            for old_label in entry.xpath('forme_ancienne'):
                # WARNING: old forms without a label (not(i)) are not kept,
                # unless the old form is the first of the list (or the only one);
                # in that case the article's headword is reused.
                if old_label.xpath('not(i)') and n > 1:
                    continue
                old_label_id = place['id']+'-0'+str(n) if n < 10 else place['id']+'-'+str(n)
                old_label_xml_str = tostring(old_label, encoding='unicode')
                old_label_xml_str = ' '.join(old_label_xml_str.split())
                tree = etree.fromstring(old_label_xml_str)

                # the whole content of the forme_ancienne element, formatted as HTML
                old_label_html_str = str(transform_old_label2html(tree))
                old_label_html_str = re.sub(clean_start, '', old_label_html_str)
                old_label_html_str = re.sub(clean_markup, '\\1,', old_label_html_str)  # move commas out of the tags
                old_label_html_str = re.sub(clean_end, '', old_label_html_str)
                old_label_html_str = "<p>%s</p>" % (old_label_html_str)  # wrap in a <p>? a <li>? TODO: revisit
                # strip the "*" prefixes from the old forms but keep them in the references (ugly)
                old_label_html_str = old_label_html_str.replace('<dfn>*', '<dfn>')

                # DFN
                dfn = str(transform_old_label2dfn(tree))
                dfn = re.sub(clean_start, '', dfn)
                dfn = dfn.replace('<dfn>*', '<dfn>')  # depressing handling of the leading "*" (see above as well)
                # move punctuation out of the <dfn> elements before normalizing the end of the whole string (dfn)
                # dfn = dfn.replace(',</dfn>', '</dfn>,')
                dfn = re.sub('([, .; :]+)</dfn>', '</dfn>\\1', dfn)
                dfn = re.sub(clean_end, '', dfn)
                dfn = dfn.rstrip()  # belt and braces
                # Drop the text following the last <dfn> element (lxml's XPath support is insufficient)
                pos = dfn.rfind('</dfn>')  # rfind returns -1 when the substring is not found…
                if pos != -1:
                    dfn = dfn[:pos+6]
                # illogical punctuation left over by the previous processing; standardize with an axe
                dfn = re.sub('[, ;][, ;.:]{3,}', '. ', dfn)
                # 7201 old forms are longer than 100 chars: truncate!
                # TODO: fix the XML or the loading code to reposition the tags inside the kept substring
                # use iterator: re.finditer('</dfn>', dfn)
                if len(dfn) > 100:
                    # strip the tags, to avoid any risk…
                    dfn = re.sub(tags, '', dfn)
                    dfn = dfn[:100].strip() + '…'
                # when the old toponym's label is empty, reuse the headword's
                # TODO: try to refine this logic with OC and SN
                if not dfn:
                    dfn = entry.xpath('vedette/sm[1]')[0].text.rstrip(',')
                    dfn = dfn.strip()
                    # print(placename['id']+' forme_ancienne sans label => '+dfn)

                # Dates of the old forms
                # Sometimes several dates per old form; only the first one goes into the DB (see XSLT).
                # Beware of badly formatted dates in the XML sources (stray line breaks…)
                rich_date = str(transform_old_label2rich_date(tree)).lstrip()
                date = re.sub(tags, '', rich_date)
                # remove multiple spaces
                date = " ".join(date.split())
                date = date2iso(date)
                # print(rich_date + ' = ' + date)

                # Ref (reference of the old form); one forme_ancienne has more than one ref!
                # (for now the extras are dropped – the vendor's choice anyway…)
                # rich content: i, sup, pg (dropped in the XSLT), date, sm
                rich_ref = str(transform_old_label2rich_ref(tree)).strip()
                rich_ref = rich_ref.lstrip('(').rstrip(')')  # strip the surrounding parentheses
                rich_ref = rich_ref.replace(',</cite>', '</cite>,')  # move punctuation out of the title
                # ref = re.sub(tags, '', rich_ref)

                n += 1
                i += 1
                try:
                    cursor.execute(
                        "INSERT INTO place_old_label ("
                        "old_label_id,"
                        "rich_label,"
                        "rich_date,"
                        "text_date,"
                        "rich_reference,"
                        "responsibility_id,"
                        "place_id)"
                        "VALUES(?, ?, ?, ?, ?, ?, ?)",
                        (old_label_id, dfn, rich_date, date, rich_ref, responsibility_id, place['id']))
                except sqlite3.IntegrityError as e:
                    print(e, "place %s" % (place['id']))
                # db.commit()
        else:
            continue
    db.commit()

    # Reset empty values to NULL… shameful but effective:
    cursor.execute("""
        UPDATE place_old_label
        SET rich_date = CASE rich_date WHEN '' THEN NULL ELSE rich_date END,
            text_date = CASE text_date WHEN '' THEN NULL ELSE text_date END,
            rich_reference = CASE rich_reference WHEN '' THEN NULL ELSE rich_reference END
    """)
    db.commit()
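# A minimal smoke test for the pipeline above, assuming the stylesheet and the
# cleanup regexes are hoisted to module level (they are function-local here);
# the sample XML and expected output are illustrative only:
# sample = ('<forme_ancienne>Inter <i>Estran</i> et <i>Abugniez,</i> '
#           '<date>1168</date> <reference>(cart. de Thenailles)</reference>.'
#           '</forme_ancienne>')
# node = etree.fromstring(sample)
# html = str(transform_old_label2html(node))
# html = re.sub(clean_start, '', html)
# html = re.sub(clean_markup, '\\1,', html)  # move commas out of the tags
# html = re.sub(clean_end, '', html)
# print("<p>%s</p>" % html)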
def GetXSLT(xslt_context, xslt_template):
    xslt_template = loader.get_template(xslt_template)
    xslt_str = xslt_template.render(xslt_context)
    xslt_doc = etree.parse(StringIO(xslt_str))
    xslt = etree.XSLT(xslt_doc)
    return xslt
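# Usage sketch for GetXSLT, with hypothetical names: it assumes a Django
# template 'transforms/to_text.xslt' exists, that `loader` is
# django.template.loader, StringIO is io.StringIO, and etree is lxml.etree.
# xslt = GetXSLT({'lang': 'en'}, 'transforms/to_text.xslt')
# print(str(xslt(etree.fromstring('<doc>hello</doc>'))))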
    def __init__(self):
        self.transform = etree.XSLT(StripNamespace.xslt)
def create_rml(self, cr, xml, uid, context=None):
    if self.tmpl == '' and not self.internal_header:
        self.internal_header = True
    if not context:
        context = {}
    pool = pooler.get_pool(cr.dbname)
    ir_translation_obj = pool.get('ir.translation')

    # In some case we might not use xsl ...
    if not self.xsl:
        return xml

    stylesheet_file = tools.file_open(self.xsl)
    try:
        stylesheet = etree.parse(stylesheet_file)
        xsl_path, _ = os.path.split(self.xsl)
        for import_child in stylesheet.findall('./import'):
            if 'href' in import_child.attrib:
                imp_file = import_child.get('href')
                _, imp_file = tools.file_open(imp_file, subdir=xsl_path, pathinfo=True)
                import_child.set('href', urllib.quote(str(imp_file)))
                imp_file.close()
    finally:
        stylesheet_file.close()

    # TODO: get all the translation in one query. That means we have to:
    # * build a list of items to translate,
    # * issue the query to translate them,
    # * (re)build/update the stylesheet with the translated items
    def translate(doc, lang):
        translate_aux(doc, lang, False)

    def translate_aux(doc, lang, t):
        for node in doc:
            t = t or node.get("t")
            if t:
                text = None
                tail = None
                if node.text:
                    text = node.text.strip().replace('\n', ' ')
                if node.tail:
                    tail = node.tail.strip().replace('\n', ' ')
                if text:
                    translation1 = ir_translation_obj._get_source(
                        cr, uid, self.name2, 'xsl', lang, text)
                    if translation1:
                        node.text = node.text.replace(text, translation1)
                if tail:
                    translation2 = ir_translation_obj._get_source(
                        cr, uid, self.name2, 'xsl', lang, tail)
                    if translation2:
                        node.tail = node.tail.replace(tail, translation2)
            translate_aux(node, lang, t)

    if context.get('lang', False):
        translate(stylesheet.iter(), context['lang'])

    transform = etree.XSLT(stylesheet)
    xml = etree.tostring(transform(etree.fromstring(xml)))
    return xml
class Question(ABC):
    """ Define an abstract class for all supported questions. """
    _xslt_html2tex = os.path.join(os.path.dirname(__file__), 'html2tex.xslt')
    _transform = etree.XSLT(etree.parse(_xslt_html2tex))
    figpath = FIGURES_PATH

    # possible numerics, open
    def __init__(self, q):
        """ Init class from an etree Element. """
        self.q = q
        self.name = q.find('name/text').text
        self.qtype = None
        self.fileCreated = []
        self.gStrategy = GRADING_STRATEGY
        # save number of svg file per question
        self.svg_id = 0

    def __repr__(self):
        """ Change string representation. """
        rep = ("Instance of {} containing the question '{}'."
               .format(self.__class__.__name__, self.name))
        return rep

    def __str__(self):
        """ Change string representation to print the tree. """
        rep = self.__repr__()
        s = unescape(etree.tostring(self.q, pretty_print=True,
                                    encoding='utf8').decode('utf8'))
        return '\n'.join((rep, s))

    @classmethod
    def fromstring(cls, qstring):
        """ Create an instance of QuestionX from a XML string. """
        q = etree.fromstring(qstring)
        return cls(q)

    def html2tex(self, cdata_content):
        """ Convert CDATA field into latex text.

        Parameter
        ---------
        cdata_content : string
            CDATA protected content that will be parsed.

        Return
        ------
        tree_content : etree.Element
            the text content in html stored as an etree.

        Remarks
        -------
        <br> are removed (not xml content).
        """
        # remove manually CDATA from the string
        cdata_content = (cdata_content.replace('<text><![CDATA[', '<text>')
                                      .replace(']]></text>', '</text>')
                                      .replace('\n', '')
                                      .replace('<br>', ''))
        parser = etree.HTMLParser(recover=True)
        tree_content = etree.fromstring(unescape(cdata_content), parser)
        self._img_check(tree_content)
        # transform with XSLT (tree output) for all other elements
        xslt_content = self._transform(tree_content)
        # convert to XML (more suitable for search)
        tree_text = etree.XML(etree.tostring(xslt_content,
                                             encoding='utf8').decode('utf-8'))
        return tree_text

    def _img_check(self, tree_content):
        """ Change path and check/convert to latex supported image type.

        There are 2 steps: i) extract svg, ii) convert files not supported
        by LaTeX.

        Parameters
        ----------
        tree_content : etree
            etree arising from cdata parsing.
        """
        # Step 1.
        # Remove embedded svg file (tikz) and convert to img
        for svg in tree_content.findall('.//picture/svg'):
            width = svg.attrib['width']
            filename_svg = os.path.join(self.figpath,
                                        self.name + str(self.svg_id) + '.svg')
            with open(filename_svg, 'w') as f:
                f.write(etree.tostring(svg, encoding='utf8', pretty_print=True)
                        .decode('utf-8'))
            img_svg = etree.Element('img', attrib={'src': filename_svg,
                                                   'width': width})
            svg.getparent().append(img_svg)
            svg.getparent().remove(svg)
            # increment svg_id
            self.svg_id += 1
            # <picture> is then parsed in html2tex

        # Step 2.
        # check that img files are supported by LaTeX
        for img in tree_content.findall('.//img'):
            src = img.attrib['src']
            # remove percent-encoding with urllib
            src = urllib.parse.unquote(src)
            path, filename = os.path.split(src)
            basename, ext = os.path.splitext(filename)
            # need to be converted
            if ext not in LATEX_EXT:
                im = Image(filename=os.path.join(self.figpath, filename))
                fileout = os.path.join(self.figpath, basename + LATEX_IMG_OUT)
                im.save(filename=fileout)
                im.close()
                src = fileout
            else:
                src = os.path.join(self.figpath, filename)
            # store new path/file
            img.attrib['src'] = src

    def fileExport(self):
        """ Extract embedded data to 'real' file for LaTeX processing. """
        # extract all files in the question recursively
        for file in self.q.findall('.//file'):
            filename = file.attrib['name']
            data = base64.decodebytes(file.text.encode())
            # create directory if needed
            if not os.path.exists(self.figpath):
                os.makedirs(self.figpath)
            # save file
            with open(os.path.join(self.figpath, filename), 'bw') as f:
                f.write(data)
            self.fileCreated.append(filename)

    def question(self):
        """ Get question text. """
        # perhaps not so obvious and will require to extract cdata
        cdata_content = etree.tostring(self.q.find('questiontext/text'),
                                       encoding='utf8').decode('utf-8')
        text = self.html2tex(cdata_content)
        return text

    @abstractmethod
    def gettype(self):
        """ Determine the amc question type. """
        pass

    @abstractmethod
    def answers(self):
        """ Create and parse answers. """
        pass

    def transform(self, catname):
        """ Main routine, applies the xml transformation. """
        # initialize
        amcq = etree.Element('question', attrib={'amctype': self.gettype(),
                                                 'category': catname,
                                                 'qname': self.name})
        self.fileExport()
        qtext = self.question()
        amcq.append(qtext)
        choices = self.answers()
        amcq.append(choices)
        return amcq
#!/usr/bin/python
from lxml import etree

xsltf = open('main.xsl')
xslt_tree = etree.parse(xsltf)

f = open('fruits.xml')
tree = etree.parse(f)

transform = etree.XSLT(xslt_tree)
transformed = transform(tree)
output = etree.tostring(transformed)
print output
def xml2txt(input):
    data = etree.parse(input)
    transform = etree.XSLT(etree.parse('txt.xslt'))
    res = transform(data)
    return bytes(res)
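# Usage sketch for xml2txt (hypothetical input file; assumes lxml's etree is
# imported and that 'txt.xslt' sits in the working directory):
# print(xml2txt('input.xml').decode('utf-8'))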
def from_data(self, uid, fields, rows, company_name):
    pageSize = [210.0, 297.0]
    new_doc = etree.Element("report")
    config = etree.SubElement(new_doc, 'config')

    def _append_node(name, text):
        n = etree.SubElement(config, name)
        n.text = text

    _append_node(
        'date',
        time.strftime(str(locale.nl_langinfo(locale.D_FMT).replace('%y', '%Y'))))
    _append_node('PageSize', '%.2fmm,%.2fmm' % tuple(pageSize))
    _append_node('PageWidth', '%.2f' % (pageSize[0] * 2.8346, ))
    _append_node('PageHeight', '%.2f' % (pageSize[1] * 2.8346, ))
    _append_node('PageFormat', 'a4')
    _append_node(
        'header-date',
        time.strftime(str(locale.nl_langinfo(locale.D_FMT).replace('%y', '%Y'))))
    _append_node('company', company_name)
    l = []
    t = 0
    temp = []
    tsum = []
    skip_index = []
    header = etree.SubElement(new_doc, 'header')
    i = 0
    for f in fields:
        if f.get('header_data_id', False):
            value = f.get('header_name', "")
            field = etree.SubElement(header, 'field')
            field.text = tools.ustr(value)
        else:
            skip_index.append(i)
        i += 1
    lines = etree.SubElement(new_doc, 'lines')
    for row_lines in rows:
        node_line = etree.SubElement(lines, 'row')
        j = 0
        for row in row_lines:
            if not j in skip_index:
                para = "yes"
                tree = "no"
                value = row.get('data', '')
                if row.get('bold', False):
                    para = "group"
                if row.get('number', False):
                    tree = "float"
                col = etree.SubElement(node_line, 'col', para=para, tree=tree)
                col.text = tools.ustr(value)
            j += 1
    transform = etree.XSLT(
        etree.parse(
            os.path.join(tools.config['root_path'],
                         'addons/base/report/custom_new.xsl')))
    rml = etree.tostring(transform(new_doc))
    self.obj = trml2pdf.parseNode(rml, title='Printscreen')
    return self.obj
def convert(self, stream, options, file_ext, log, accelerators):
    from lxml import etree
    from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
    from calibre.ebooks.chardet import xml_to_unicode
    self.log = log
    log.debug('Parsing XML...')
    raw = get_fb2_data(stream)[0]
    raw = raw.replace(b'\0', b'')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         assume_utf8=True, resolve_entities=True)[0]
    try:
        doc = etree.fromstring(raw)
    except etree.XMLSyntaxError:
        try:
            doc = etree.fromstring(raw, parser=RECOVER_PARSER)
            if doc is None:
                raise Exception('parse failed')
        except:
            doc = etree.fromstring(raw.replace('& ', '&amp;'), parser=RECOVER_PARSER)
            if doc is None:
                raise ValueError('The FB2 file is not valid XML')
    doc = ensure_namespace(doc)
    try:
        fb_ns = doc.nsmap[doc.prefix]
    except Exception:
        fb_ns = FB2NS
    NAMESPACES = {'f': fb_ns, 'l': XLINK_NS}

    stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
    css = ''
    for s in stylesheets:
        css += etree.tostring(s, encoding=unicode, method='text',
                              with_tail=False) + '\n\n'
    if css:
        import css_parser, logging
        parser = css_parser.CSSParser(fetcher=None,
                                      log=logging.getLogger('calibre.css'))
        XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
        text = XHTML_CSS_NAMESPACE + css
        log.debug('Parsing stylesheet...')
        stylesheet = parser.parseString(text)
        stylesheet.namespaces['h'] = XHTML_NS
        css = unicode(stylesheet.cssText).replace('h|style', 'h|span')
        css = re.sub(r'name\s*=\s*', 'class=', css)
    self.extract_embedded_content(doc)
    log.debug('Converting XML to HTML...')
    ss = open(P('templates/fb2.xsl'), 'rb').read()
    ss = ss.replace("__FB_NS__", fb_ns)
    if options.no_inline_fb2_toc:
        log('Disabling generation of inline FB2 TOC')
        ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                        re.DOTALL).sub('', ss)

    styledoc = etree.fromstring(ss)
    transform = etree.XSLT(styledoc)
    result = transform(doc)

    # Handle links of type note and cite
    notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]')
             if a.get('href').startswith('#')}
    cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]')
             if not a.get('href', '')}
    all_ids = {x for x in result.xpath('//*/@id')}
    for cite, a in cites.iteritems():
        note = notes.get(cite, None)
        if note:
            c = 1
            while 'cite%d' % c in all_ids:
                c += 1
            if not note.get('id', None):
                note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
            a.set('href', '#%s' % note.get('id'))
    for x in result.xpath('//*[@link_note or @link_cite]'):
        x.attrib.pop('link_note', None)
        x.attrib.pop('link_cite', None)

    for img in result.xpath('//img[@src]'):
        src = img.get('src')
        img.set('src', self.binary_map.get(src, src))

    index = transform.tostring(result)
    open(u'index.xhtml', 'wb').write(index)
    open(u'inline-styles.css', 'wb').write(css)
    stream.seek(0)
    mi = get_metadata(stream, 'fb2')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    cpath = None
    if mi.cover_data and mi.cover_data[1]:
        with open(u'fb2_cover_calibre_mi.jpg', 'wb') as f:
            f.write(mi.cover_data[1])
        cpath = os.path.abspath(u'fb2_cover_calibre_mi.jpg')
    else:
        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href' % XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                cpath = os.path.abspath(href)
                break

    opf = OPFCreator(os.getcwdu(), mi)
    entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
    opf.create_manifest(entries)
    opf.create_spine([u'index.xhtml'])
    if cpath:
        opf.guide.set_cover(cpath)
    with open(u'metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.join(os.getcwdu(), u'metadata.opf')
# use the opener to fetch a URL
opener.open(base_url)

# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)

# retrieve the publication from id and extract the creator relationships
pub_rels_url = base_url + "publications/" + args.pub_id + "/relationships?types=8,9&page-size=250"

# extract all the creator relationship ids from our various relations
extract_rels = etree.XSLT(etree.XML(open('./extract_relationships.xslt').read()))

rel_ids = ()
try:
    fp = urllib.request.urlopen(pub_rels_url)
    # get array of relationship ids:
    rel_ids = etree.XPath("//text()")(extract_rels(etree.fromstring(fp.read())))
    fp.close()
except HTTPError as err:
    if err.code == 404:
        print("Sorry publications (" + args.pub_id + ") not found")
        exit()
    else:
        raise
from django.template import RequestContext
from django.shortcuts import render_to_response  # needed by index() below
from django.http import HttpResponseRedirect
from django.core.urlresolvers import reverse
from django.conf import settings

from DjVALD.vald.models import Transition, State, Source, Species

# This imports all the generic tap views!
from DjNode.tapservice.views import parseSQL, vamdc2queryset

from base64 import b64encode as b64


def enc(s):
    return b64(s).replace('=', '')


from lxml import etree as E

vo2html = E.XSLT(E.parse(open(settings.BASEPATH + 'DjNode/static/xsl/VOTable2XHTML_mine.xsl')))

UMIST_DICT = {'1': 'species__atomic',
              '2': 'species__ion',
              '3': 'vacwave',
              '4': 'airwave',
              '5': 'loggf',
              '6': 'lostate__energy',
              '7': 'lostate__J',
              }


def index(request):
    c = RequestContext(request, {})
    return render_to_response('vald/index.html', c)
import itertools  # needed for the id counter below

from lxml import etree
from lxml.builder import ElementMaker
from lxml.etree import Element

from datapane.common import NPath, guess_type, log, timestamp
from datapane.common.report import local_report_def, validate_report_doc

from ..utils import DPException
from .common import DPTmpFile, Resource, do_download_file
from .dp_object import DPObjectRef, UploadableObjectMixin
from .runtime import _report

E = ElementMaker()  # XML Tag Factory

local_post_xslt = etree.parse(str(local_report_def / "local_post_process.xslt"))
local_post_transform = etree.XSLT(local_post_xslt)

id_count = itertools.count(start=1)

# only these types will be documented by default
__all__ = [
    "ReportBlock",
    "Blocks",
    "Markdown",
    "File",
    "Plot",
    "Table",
    "Report",
]

__pdoc__ = {
    "ReportBlock.attributes": False,
    "File.caption": False,
    "Plot.file": False,
    "Table.file": False,
    "Report.endpoint": False,
}
        for labelNode in issueNode:
            issue.addLabel(labelNode.text)
        aList.append(issue)


def getTestListsFromComment(aList, aComment):
    words = re.findall(r"\S+", aComment)
    for w in words:
        if re.match(REGEXP_TEST_LIST, w):
            aList.append(w)


if __name__ == "__main__":
    # Load XSL stylesheets
    HTMLparser = etree.HTMLParser()
    issueListXSLT = etree.XSLT(etree.parse("githubIssueList.xsl"))
    issuePageXSLT = etree.XSLT(etree.parse("githubIssuePage.xsl"))

    # Download the first page of issue list and determine the number of pages
    downloadPage(GITHUB_URI + GITHUB_ISSUE_LIST + "&page=1", TMP_FILE)
    root = issueListXSLT(etree.parse(TMP_FILE, HTMLparser)).getroot()
    numberOfPages = root[0].get("numberOfPages")
    if (numberOfPages):
        numberOfPages = int(numberOfPages)
    else:
        numberOfPages = 1

    # Fill-in the issue list
    issueList = []
    appendIssues(issueList, root.find('issueList'))
    for i in range(2, numberOfPages + 1):
<xsl:template match="*">
    <xsl:element name="{local-name()}">
        <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
</xsl:template>
<xsl:template match="@*">
    <xsl:attribute name="{local-name()}">
        <xsl:value-of select="."/>
    </xsl:attribute>
</xsl:template>
</xsl:stylesheet>
'''

xslt_doc = etree.parse(StringIO(xslt))
transform = etree.XSLT(xslt_doc)


class PGRSpecConverter:

    @staticmethod
    def to_sfa_rspec(rspec, content_type=None):
        if not isinstance(rspec, RSpec):
            pg_rspec = RSpec(rspec)
        else:
            pg_rspec = rspec

        version_manager = VersionManager()
        sfa_version = version_manager._get_version('sfa', '1')
        sfa_rspec = RSpec(version=sfa_version)
        #nodes = pg_rspec.version.get_nodes()
def verify_and_create_supporting_files(self, creating_forms=False):
    template_home = os.path.join(settings.BASE_DIR, settings.BASE_APP_DIR,
                                 'templates', self.DOC_DEFAULT_TEMPLATE_OS_HOME,
                                 self.id)
    if not os.path.isdir(template_home):
        os.makedirs(template_home)

    template_media_home = os.path.join(
        settings.MEDIA_ROOT, self.DOC_DEFAULT_TEMPLATE_MEDIA_ROOT, self.id, 'js')
    if not os.path.exists(template_media_home):
        os.makedirs(template_media_home)

    # FormDefinition.verify_xml_against_schema(form_def)

    param_creating_form = SystemParameter.find_one(
        "DOC_DEFAULT_SUPPORTING_FILES_CREATION", FieldDataType.BOOLEAN, False)
    if creating_forms == False and param_creating_form == True:
        creating_forms = True

    if os.path.isfile(os.path.join(
            template_home, self.DOC_EDIT_FILE_NAME)) and creating_forms == True:
        os.remove(os.path.join(template_home, self.DOC_EDIT_FILE_NAME))
    if not os.path.isfile(
            os.path.join(template_home, self.DOC_EDIT_FILE_NAME)):
        if self.edit_xslt:
            xslt = etree.fromstring(self.edit_xslt)
        elif os.path.isfile(self.DOC_XSLT_DEFAULT_EDIT_FILE):
            xslt = etree.parse(self.DOC_XSLT_DEFAULT_EDIT_FILE)
        else:
            raise FileNotFoundError
        dom = etree.fromstring(self.definition)
        transform = etree.XSLT(xslt)
        newdom = transform(dom)
        contents = str(newdom)
        with open(os.path.join(template_home, self.DOC_EDIT_FILE_NAME),
                  'w+', encoding='utf8') as text_file:
            print(contents, file=text_file)

    if (os.path.isfile(
            os.path.join(template_media_home, self.DOC_EDIT_JS_FILE_NAME))
            and creating_forms == True):
        os.remove(
            os.path.join(template_media_home, self.DOC_EDIT_JS_FILE_NAME))
    if not os.path.isfile(
            os.path.join(template_media_home, self.DOC_EDIT_JS_FILE_NAME)):
        if self.edit_js_xslt:
            xslt = etree.fromstring(self.edit_js_xslt)
        elif os.path.isfile(self.DOC_XSLT_DEFAULT_EDIT_JS_FILE):
            xslt = etree.parse(self.DOC_XSLT_DEFAULT_EDIT_JS_FILE)
        else:
            raise FileNotFoundError
        dom = etree.fromstring(self.definition)
        transform = etree.XSLT(xslt)
        newdom = transform(dom)
        contents = str(newdom)
        with open(os.path.join(template_media_home, self.DOC_EDIT_JS_FILE_NAME),
                  'w+', encoding='utf8') as text_file:
            print(contents, file=text_file)

    if os.path.isfile(os.path.join(
            template_home, self.DOC_VIEW_FILE_NAME)) and creating_forms == True:
        os.remove(os.path.join(template_home, self.DOC_VIEW_FILE_NAME))
    if not os.path.isfile(
            os.path.join(template_home, self.DOC_VIEW_FILE_NAME)):
        if self.view_xslt:
            xslt = etree.fromstring(self.view_xslt)
        elif os.path.isfile(self.DOC_XSLT_DEFAULT_VIEW_FILE):
            xslt = etree.parse(self.DOC_XSLT_DEFAULT_VIEW_FILE)
        else:
            raise FileNotFoundError
        dom = etree.fromstring(self.definition)
        transform = etree.XSLT(xslt)
        newdom = transform(dom)
        contents = str(newdom)
        with open(os.path.join(template_home, self.DOC_VIEW_FILE_NAME),
                  'w+', encoding='utf8') as text_file:
            print(contents, file=text_file)

    logger.info('we processed the files for ' + self.id)
def getHtmlPage():
    link = request.args.get('link')
    # get html code from wikipedia
    page = urllib.request.urlopen(link)
    soup = bs(page)
    # search for image with a monarch 'alt' attribute
    imgs = soup.body.findAll('img', alt="Monarch")[0]
    title = soup.body.find('h1', id="firstHeading")
    # getting the tree within the parent table > retrieving all table rows
    tables_data = imgs.parent.parent.parent.parent
    trs = tables_data.find_all("tr")
    # loop through every row (generation) >> get the name of every generation member
    st = ""
    for tr in trs:
        tds = tr.find_all("td")
        for td in tds:
            if (td.find("a") == None) & (len(td.text) > 0):
                st = st + td.text + ","
            if (td.find("a") != None) & (len(td.text) > 0):
                st = st + td.text + ","
        st = st + "*"
    # store every generation in an array
    tab = st.split("****")
    generation_id_count = 0
    # create the xml root element 'familytree' that contains all generations of a single family tree
    root = ET.Element('familytree')
    root.set("familyName", str(title.text))
    mydict = {}
    for x in tab:
        # split every generation from the first array to store each generation
        # in its own array within the family tree array
        gen = x.split(",")
        gen.pop()
        # create the xml element 'generation' that stores the data of every generation member
        generation = ET.SubElement(root, 'generation')
        # every generation has an id that distinguishes it from the other generations
        generation.set('id', str(generation_id_count))
        ref_count = 1
        check = False
        for y in gen:  # loop through every member of a generation
            name = y
            # Had to add this part of the code to remove some extra text
            # appended to member names (wikipedia's mistake)
            if "spouse" in name.lstrip():
                spouseword = y[y.find("spouse"):len(y)]
                newWord = y.replace(spouseword, "").lstrip()
            else:
                newWord = name.lstrip()
            if (len(newWord) < 5):
                continue
            if "°" in newWord:
                tempword = newWord[newWord.find("°") - 1:newWord.find("°") + 1]
                newWord = newWord.replace(tempword, "").lstrip()
            # create the xml 'personne' element that contains each member's name;
            # the id of every member is {generationID + counter from 1 to n,
            # where n = number of members in the generation}
            ref = str(generation_id_count) + str(ref_count)
            personne = ET.SubElement(generation, 'personne')
            personne.set('ref', str(ref))
            # check if a member of a generation is the king, to attach a special 'estroi' attribute to it
            if newWord in Kings:
                if check == False:
                    personne.set('estroi', str(True))
                    mydict[generation_id_count] = ref
                    check = True
                else:
                    personne.set('estroi', str(False))
            else:
                personne.set('estroi', str(False))
            # add the member name value to the 'personne' element
            nomcomplet = ET.SubElement(personne, "nomcomplet")
            nomcomplet.text = newWord.lstrip()
            ref_count = ref_count + 1
        # add a reference to the parent of every generation with the 'link' attribute
        # // the 'link' value refers to the previous generation's king (the parent of the current one)
        if (generation_id_count > 0):
            parent = ET.SubElement(generation, 'parent')
            parent.set('link', str(mydict[generation_id_count - 1]))
        else:
            parent = ET.SubElement(generation, 'parent')
            parent.set('link', "")
        # increment the generation count
        generation_id_count = generation_id_count + 1

    # after generating our xml file, we pass it through an xml parser
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="   ")
    with open("./dataFiles/tree_Database.xml", "w", encoding="utf-8") as f:
        f.write(xmlstr)

    xslDoc = etree.parse("./dataFiles/index.xsl")
    xsltTransformer = etree.XSLT(xslDoc)
    xmlDoc = etree.parse("./dataFiles/tree_Database.xml")
    outputDoc = xsltTransformer(xmlDoc)

    root = lh.tostring(outputDoc)  # convert the generated HTML to a string
    soup = bs(root)  # make BeautifulSoup
    prettyHTML = soup.prettify()
    return str(prettyHTML)
def __init__(self, reports: Reports, output_dir: str) -> None:
    import lxml.etree as etree

    super().__init__(reports, output_dir)
    self.xslt_txt = etree.XSLT(etree.parse(self.memory_xml.xslt_txt_path))
def parse_xml(response_doc, xsl_file):
    xml_doc = etree.parse(response_doc)
    styledoc = etree.parse(xsl_file)
    style = etree.XSLT(styledoc)
    xml_transformed = style(xml_doc)
    return xml_transformed
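# Usage sketch for parse_xml (hypothetical file names; assumes lxml's etree is
# imported). The result is an lxml result tree; str() serializes it.
# html = parse_xml('response.xml', 'render.xsl')
# print(str(html))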
def materialsReport(self, item_list, realm):
    self.setWindowTitle('Materials Report')
    self.ExportHTMLButton.hide()
    self.ExportPlainTextButton.hide()

    materials = {'Items': {}, 'Gems': {}, 'Dusts': {}, 'Liquids': {}}

    for item in [x for x in item_list.values() if x.isPlayerCrafted()]:
        for slot in [x for x in item.getSlotList()
                     if x.isCrafted() and x.isUtilized()]:
            try:  # THE KEY MAY NOT EXIST ...
                materials['Items'][item.Location] += [slot.getGemName(realm)]
            except KeyError:
                materials['Items'][item.Location] = [slot.getGemName(realm)]
            for material_type, material_list in slot.getGemMaterials(realm).items():
                for material, amount in material_list.items():
                    try:  # THE KEY MAY NOT EXIST ...
                        materials[material_type][material] += amount
                    except KeyError:
                        materials[material_type][material] = amount

    for material_type, material_list in materials.items():
        if material_type in GemMaterialsOrder.keys():
            keys = GemMaterialsOrder[material_type]
            material_list = [(x, material_list.get(x)) for x in keys
                             if x in material_list]
            materials[material_type] = material_list

    report = etree.Element('Materials')
    for material_type, material_list in materials.items():
        if material_type == 'Items':
            parent = etree.SubElement(report, 'Items')
            for location, jewels in material_list.items():
                element = etree.SubElement(parent, 'Item', Location=location)
                for jewel in jewels:
                    etree.SubElement(element, 'Jewel').text = jewel
        else:
            parent = etree.SubElement(report, material_type)
            for material, amount in material_list:
                etree.SubElement(parent, 'Material',
                                 Amount=str(amount), Material=material)

    xslt = etree.parse(r'reports/DefaultMaterialsReport.xsl')
    transform = etree.XSLT(xslt)
    report = str(transform(report))
    self.ReportTextBrowser.setHtml(report)
def xml_to_html(source: bytes, xml_tree) -> str:
    xslt_to_html_tree = etree.XML(source)
    xslt_to_html_transform = etree.XSLT(xslt_to_html_tree)
    # serialize the result tree so the annotated return type (str) holds
    return str(xslt_to_html_transform(xml_tree))
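# Usage sketch (hypothetical files): the stylesheet is read as bytes so that
# etree.XML honours any encoding declaration it contains.
# with open('to_html.xsl', 'rb') as f:
#     print(xml_to_html(f.read(), etree.parse('data.xml')))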
<xsl:template match="*">
    <xsl:element name="{local-name()}">
        <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
</xsl:template>
<xsl:template match="@*">
    <xsl:attribute name="{local-name()}">
        <xsl:value-of select="."/>
    </xsl:attribute>
</xsl:template>
</xsl:stylesheet>
'''

remove_namespaces_xslt = etree.parse(
    io.BytesIO(str.encode(remove_namespaces_xslt)))
REMOVE_NAMESPACES = etree.XSLT(remove_namespaces_xslt)


def main(
    appliances=[],
    credentials=[],
    domains=[],
    timeout=120,
    no_check_hostname=False,
    test_files=[],
):
    """svc-test a utility for testing DataPower configuration and status.

    Tests are determined
    """
    check_hostname = not no_check_hostname
    env = datapower.Environment(appliances,
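# Quick sketch of what REMOVE_NAMESPACES does (assumes lxml's etree): every
# element and attribute is copied under its local name, dropping namespaces.
# doc = etree.XML('<a xmlns="urn:x"><b attr="1"/></a>')
# print(etree.tostring(REMOVE_NAMESPACES(doc)))  # b'<a><b attr="1"/></a>'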
def _create_table(self, uid, ids, fields, fields_order, results, context, title=''):
    pageSize = [297.0, 210.0]
    new_doc = etree.Element("report")
    config = etree.SubElement(new_doc, 'config')

    def _append_node(name, text):
        n = etree.SubElement(config, name)
        n.text = text

    #_append_node('date', time.strftime('%d/%m/%Y'))
    _append_node(
        'date',
        time.strftime(str(locale.nl_langinfo(locale.D_FMT).replace('%y', '%Y'))))
    _append_node('PageSize', '%.2fmm,%.2fmm' % tuple(pageSize))
    _append_node('PageWidth', '%.2f' % (pageSize[0] * 2.8346, ))
    _append_node('PageHeight', '%.2f' % (pageSize[1] * 2.8346, ))
    _append_node('report-header', title)
    _append_node(
        'company',
        pooler.get_pool(self.cr.dbname).get('res.users').browse(
            self.cr, uid, uid).company_id.name)
    rpt_obj = pooler.get_pool(self.cr.dbname).get('res.users')
    rml_obj = report_sxw.rml_parse(self.cr, uid, rpt_obj._name, context)
    _append_node(
        'header-date',
        str(rml_obj.formatLang(time.strftime("%Y-%m-%d"), date=True)) + ' ' +
        str(time.strftime("%H:%M")))
    l = []
    t = 0
    strmax = (pageSize[0] - 40) * 2.8346
    temp = []
    tsum = []
    for i in range(0, len(fields_order)):
        temp.append(0)
        tsum.append(0)
    ince = -1
    for f in fields_order:
        s = 0
        ince += 1
        if fields[f]['type'] in ('date', 'time', 'datetime', 'float', 'integer'):
            s = 60
            strmax -= s
            if fields[f]['type'] in ('float', 'integer'):
                temp[ince] = 1
        else:
            t += fields[f].get('size', 80) / 28 + 1
        l.append(s)
    for pos in range(len(l)):
        if not l[pos]:
            s = fields[fields_order[pos]].get('size', 80) / 28 + 1
            l[pos] = strmax * s / t
    _append_node('tableSize', ','.join(map(str, l)))

    header = etree.SubElement(new_doc, 'header')
    for f in fields_order:
        field = etree.SubElement(header, 'field')
        field.text = tools.ustr(fields[f]['string'] or '')

    lines = etree.SubElement(new_doc, 'lines')
    for line in results:
        node_line = etree.SubElement(lines, 'row')
        count = -1
        for f in fields_order:
            float_flag = 0
            count += 1
            if fields[f]['type'] == 'many2one' and line[f]:
                if not line.get('__group'):
                    line[f] = line[f][1]
            if fields[f]['type'] == 'selection' and line[f]:
                for key, value in fields[f]['selection']:
                    if key == line[f]:
                        line[f] = value
                        break
            if fields[f]['type'] in ('one2many', 'many2many') and line[f]:
                line[f] = '( ' + tools.ustr(len(line[f])) + ' )'
            if fields[f]['type'] == 'float' and line[f]:
                precision = (('digits' in fields[f]) and fields[f]['digits'][1]) or 2
                prec = '%.' + str(precision) + 'f'
                line[f] = prec % (line[f])
                float_flag = 1
            if fields[f]['type'] == 'date' and line[f]:
                new_d1 = line[f]
                if not line.get('__group'):
                    format = str(locale.nl_langinfo(locale.D_FMT).replace('%y', '%Y'))
                    d1 = datetime.strptime(line[f], '%Y-%m-%d')
                    new_d1 = d1.strftime(format)
                line[f] = new_d1
            if fields[f]['type'] == 'time' and line[f]:
                new_d1 = line[f]
                if not line.get('__group'):
                    format = str(locale.nl_langinfo(locale.T_FMT))
                    d1 = datetime.strptime(line[f], '%H:%M:%S')
                    new_d1 = d1.strftime(format)
                line[f] = new_d1
            if fields[f]['type'] == 'datetime' and line[f]:
                new_d1 = line[f]
                if not line.get('__group'):
                    format = str(locale.nl_langinfo(locale.D_FMT).replace('%y', '%Y')) \
                        + ' ' + str(locale.nl_langinfo(locale.T_FMT))
                    d1 = datetime.strptime(line[f], '%Y-%m-%d %H:%M:%S')
                    new_d1 = d1.strftime(format)
                line[f] = new_d1
            if line.get('__group'):
                col = etree.SubElement(node_line, 'col', para='group', tree='no')
            else:
                col = etree.SubElement(node_line, 'col', para='yes', tree='no')
            # Prevent empty labels in groups
            if f == line.get('__grouped_by') and line.get('__group') \
                    and not line[f] and not float_flag and not temp[count]:
                col.text = line[f] = 'Undefined'
                col.set('tree', 'undefined')
            if line[f] != None:
                col.text = tools.ustr(line[f] or '')
                if float_flag:
                    col.set('tree', 'float')
                if line.get('__no_leaf') and temp[count] == 1 and f != 'id' \
                        and not line['__context']['group_by']:
                    tsum[count] = float(tsum[count]) + float(line[f])
                if not line.get('__group') and f != 'id' and temp[count] == 1:
                    tsum[count] = float(tsum[count]) + float(line[f])
            else:
                col.text = '/'

    node_line = etree.SubElement(lines, 'row')
    for f in range(0, len(fields_order)):
        col = etree.SubElement(node_line, 'col', para='group', tree='no')
        col.set('tree', 'float')
        if tsum[f] != None:
            if tsum[f] != 0.0:
                digits = fields[fields_order[f]].get('digits', (16, 2))
                prec = '%%.%sf' % (digits[1], )
                total = prec % (tsum[f], )
                txt = str(total or '')
            else:
                txt = str(tsum[f] or '')
        else:
            txt = '/'
        if f == 0:
            txt = 'Total'
            col.set('tree', 'no')
        col.text = tools.ustr(txt or '')

    transform = etree.XSLT(
        etree.parse(
            os.path.join(tools.config['root_path'],
                         'addons/base/report/custom_new.xsl')))
    rml = etree.tostring(transform(new_doc))
    self.obj = render.rml(rml, title=self.title)
    self.obj.render()
    return True
def __init__(self, styf):
    """Create an XSLT transformer based on the XSLT stylesheet
    at `styf` (a file path)."""
    self.xmlCache = self.OneCache()
    self.styf = styf
    self.xslt = etree.XSLT(etree.parse(self.styf))
for xml in xml_files[collection]:
    xml_filename = source_path + collection + '\\' + xml
    parser = ET.XMLParser(recover=True)
    dom = ET.parse(xml_filename, parser)

    # remove empty nodes
    for element in dom.xpath(".//*[not(node())]"):
        element.getparent().remove(element)
    # remove nodes with text "null"
    for element in dom.xpath(".//*[text()='null']"):
        element.getparent().remove(element)
    # remove nodes with attribute "null"
    for element in dom.xpath(".//*[@*='null']"):
        element.getparent().remove(element)

    xslt = ET.parse(xsl_filename)
    transform = ET.XSLT(xslt)
    newdom = transform(dom)

    out = out_path + xml
    # write out to new file
    with open(out, 'wb') as f:
        f.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        f.write(ET.tostring(newdom, pretty_print=True))
    print "Writing", out

print "All done!"
def __init__(self, xslt_filename, xslt_params=None):
    self.xslt = ET.XSLT(ET.parse(xslt_filename))
    self.xslt_params = xslt_params or {}
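# Hedged sketch of how the attributes above might be applied (the enclosing
# class is not shown, so this free-standing helper is an assumption; `ET` is
# lxml.etree as in the snippet). lxml requires string parameters to be quoted
# for XSLT, which ET.XSLT.strparam handles.
def apply_xslt(transformer, xml_filename):
    doc = ET.parse(xml_filename)
    params = {k: ET.XSLT.strparam(v)
              for k, v in transformer.xslt_params.items()}
    return transformer.xslt(doc, **params)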
xslt_transform = None
limit = 0 if not args.testnumber else int(args.testnumber)
fobj = open(args.outfile, 'w') if args.outfile else StringIO()
try:
    for doc in docs(args.database, args.collection, limit):
        html_doc = cmdl.main(doc)
        verbose(html_doc)
        if not (args.transform or args.verbose):
            print html_doc
        else:
            if not xslt_transform:
                xslt_transform = etree.XSLT(
                    etree.XML(open(args.transform).read()))
            etree.XML(html_doc)
            rdfa_doc = str(xslt_transform(etree.XML(html_doc)))
            verbose(rdfa_doc)
            _, tmpf = tempfile.mkstemp(suffix='.html')
            with open(tmpf, 'w') as _tf:
                _tf.write(rdfa_doc)
            ttl_doc = pyRdfa.pyRdfa().rdf_from_source(tmpf, rdfOutput=True)
            #os.remove(tmpf)
            verbose(ttl_doc)
            fobj.write(ttl_doc)
finally:
    fobj.close()
# Extract the mathvariants transformation.
xsltTransform = etree.XSLT(etree.XML('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:strip-space elements="*"/>
  <xsl:template match="charlist">
    <root><xsl:apply-templates select="character"/></root>
  </xsl:template>
  <xsl:template match="character">
    <xsl:if test="surrogate">
      <entry>
        <xsl:attribute name="mathvariant">
          <xsl:value-of select="surrogate/@mathvariant"/>
        </xsl:attribute>
        <xsl:attribute name="baseChar">
          <xsl:value-of select="surrogate/@ref"/>
        </xsl:attribute>
        <xsl:attribute name="transformedChar">
          <xsl:choose>
            <xsl:when test="bmp">
              <xsl:value-of select="bmp/@ref"/>
            </xsl:when>
            <xsl:otherwise>
              <xsl:value-of select="@id"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:attribute>
      </entry>
    </xsl:if>
  </xsl:template>
</xsl:stylesheet>'''))
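# Hedged example of applying the transform above to a tiny hand-made charlist
# fragment (the real input is assumed to be a full character list such as
# unicode.xml; the ids here are illustrative only):
# doc = etree.XML('<charlist><character id="U1D400">'
#                 '<surrogate mathvariant="bold" ref="U00041"/>'
#                 '<bmp ref="U1D400"/></character></charlist>')
# for entry in xsltTransform(doc).getroot():
#     print(entry.get('mathvariant'), entry.get('baseChar'),
#           entry.get('transformedChar'))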
def get_results_by_instance_keyword(request):
    print 'BEGIN def getResultsKeyword(request)'
    resultsByKeyword = []
    results = []
    resultString = ""

    # Instance
    json_instances = []
    if 'HTTPS' in request.META['SERVER_PROTOCOL']:
        protocol = "https"
    else:
        protocol = "http"
    instance = Instance(name="Local", protocol=protocol,
                        address=request.META['REMOTE_ADDR'],
                        port=request.META['SERVER_PORT'],
                        access_token="token", refresh_token="token")
    json_instances.append(instance.to_json())
    request.session['instancesExplore'] = json_instances
    sessionName = "resultsExploreOaiPMh" + instance['name']

    try:
        keyword = request.GET['keyword']
        schemas = request.GET.getlist('schemas[]')
        userSchemas = request.GET.getlist('userSchemas[]')
        refinements = refinements_to_mongo(request.GET.getlist('refinements[]'))
        if 'onlySuggestions' in request.GET:
            onlySuggestions = json.loads(request.GET['onlySuggestions'])
        else:
            onlySuggestions = False
        registries = request.GET.getlist('registries[]')
    except:
        keyword = ''
        schemas = []
        userSchemas = []
        refinements = {}
        onlySuggestions = True
        registries = []

    # We get all template versions for the given schemas
    # First, we take care of user defined schemas
    templatesIDUser = Template.objects(title__in=userSchemas).distinct(field="id")
    templatesIDUser = [str(x) for x in templatesIDUser]
    # Take care of the rest, with versions
    templatesVersions = Template.objects(title__in=schemas).distinct(field="templateVersion")
    # We get all template IDs, for all versions
    allTemplatesIDCommon = TemplateVersion.objects(
        pk__in=templatesVersions, isDeleted=False).distinct(field="versions")
    # We remove the deleted versions
    allTemplatesIDCommonRemoved = TemplateVersion.objects(
        pk__in=templatesVersions, isDeleted=False).distinct(field="deletedVersions")
    templatesIDCommon = list(set(allTemplatesIDCommon) - set(allTemplatesIDCommonRemoved))
    templatesID = templatesIDUser + templatesIDCommon

    if len(registries) == 0:
        # We retrieve deactivated registries so as not to get their metadata formats
        deactivatedRegistries = [
            str(x.id) for x in OaiRegistry.objects(isDeactivated=True).order_by('id')]
        metadataFormatsID = OaiMetadataFormat.objects(
            template__in=templatesID,
            registry__not__in=deactivatedRegistries).distinct(field="id")
    else:
        # We retrieve registries from the refinement
        metadataFormatsID = OaiMetadataFormat.objects(
            template__in=templatesID, registry__in=registries).distinct(field="id")

    instanceResults = OaiRecord.executeFullTextQuery(keyword, metadataFormatsID, refinements)
    if len(instanceResults) > 0:
        if not onlySuggestions:
            xsltPath = os.path.join(settings.SITE_ROOT, 'static/resources/xsl/xml2html.xsl')
            xslt = etree.parse(xsltPath)
            transform = etree.XSLT(xslt)
            template = loader.get_template('oai_pmh/explore/explore_result_keyword.html')

        # Retrieve schemas and registries; avoid retrieving the information for each result
        registriesName = {}
        objMetadataFormats = {}
        listRegistriesID = set([x['registry'] for x in instanceResults])
        for registryId in listRegistriesID:
            obj = OaiRegistry.objects(pk=registryId).get()
            registriesName[str(registryId)] = obj.name
        listSchemaId = set([x['metadataformat'] for x in instanceResults])
        for schemaId in listSchemaId:
            obj = OaiMetadataFormat.objects(pk=schemaId).get()
            objMetadataFormats[str(schemaId)] = obj

        listItems = []
        xmltodictunparse = XMLdata.unparse
        appendResult = results.append
        toXML = etree.XML
        parse = etree.parse
        XSLT = etree.XSLT
        if not onlySuggestions:
            for instanceResult in instanceResults:
                custom_xslt = False
                appendResult({'title': instanceResult['identifier'],
                              'content': xmltodictunparse(instanceResult['metadata']),
                              'id': str(instanceResult['_id'])})
                dom = toXML(str(xmltodictunparse(instanceResult['metadata']).encode('utf-8')))
                # Check if a custom list result XSLT has to be used
                try:
                    metadataFormat = objMetadataFormats[str(instanceResult['metadataformat'])]
                    if metadataFormat.template.ResultXsltList:
                        listXslt = parse(BytesIO(
                            metadataFormat.template.ResultXsltList.content.encode('utf-8')))
                        listTransform = XSLT(listXslt)
                        newdom = listTransform(dom)
                        custom_xslt = True
                    else:
                        newdom = transform(dom)
                except Exception, e:
                    # We use the default one
                    newdom = transform(dom)
                    custom_xslt = False

                registry_name = registriesName[instanceResult['registry']]
                if len(registry_name) > 30:
                    registry_name = "{0}...".format(registry_name[:30])

                context = RequestContext(
                    request, {
                        'id': str(instanceResult['_id']),
                        'xml': str(newdom),
                        'title': instanceResult['identifier'],
                        'custom_xslt': custom_xslt,
                        'template_name': metadataFormat.template.title,
                        'registry_name': registry_name,
                        'oai_pmh': True,
                    })
                resultString += template.render(context)
        else:
            for instanceResult in instanceResults[:20]:
                wordList = re.sub("[^\w]", " ", keyword).split()
                wordList = [x + "|" + x + "\w+" for x in wordList]
                wordList = '|'.join(wordList)
                listWholeKeywords = re.findall(
                    "\\b(" + wordList + ")\\b",
                    XMLdata.unparse(instanceResult['metadata']).encode('utf-8'),
                    flags=re.IGNORECASE)
                labels = list(set(listWholeKeywords))
                for label in labels:
                    label = label.lower()
                    result_json = {}
                    result_json['label'] = label
                    result_json['value'] = label
                    if not result_json in resultsByKeyword:
                        resultsByKeyword.append(result_json)
def call_target(self, *args, **kwargs):
    xsl = etree.parse(os.path.join(base_dir, 'xsl', 'html5-to-cnxml.xsl'))
    target = etree.XSLT(xsl)
    return target(*args, **kwargs)
def insert_place_values(db, cursor, dt_id, user_id):
    """ """
    print("** TABLE place, place_comment, place_description, place_feature_type – INSERT")
    # TODO: load the correct DT source (not _output7.xml, dev only)
    tree = etree.parse('../../../dico-topo/data/'+dt_id+'/output7.xml')
    # département code
    dpt = tree.xpath('/DICTIONNAIRE')[0].get('dep')

    # print("INSERT bibl for {0}".format(dt_id))
    insert_bibl(db, cursor, dt_id)
    bibl_id = cursor.lastrowid

    # the volume is selected further down for DT72 and DT80. Very tedious!
    if dt_id == 'DT72':
        cursor.execute(
            "SELECT id FROM bibl WHERE bnf_catalogue_ark = 'ark:/12148/cb37374247g' and bibl like '%tome 1%'")
        tome1_id = cursor.fetchone()[0]
        cursor.execute(
            "SELECT id FROM bibl WHERE bnf_catalogue_ark = 'ark:/12148/cb37374247g' and bibl like '%tome 2%'")
        tome2_id = cursor.fetchone()[0]
    if dt_id == 'DT80':
        cursor.execute(
            "SELECT id FROM bibl WHERE bnf_catalogue_ark = 'ark:/12148/cb30482383j' and bibl like '%tome 1%'")
        tome1_id = cursor.fetchone()[0]
        cursor.execute(
            "SELECT id FROM bibl WHERE bnf_catalogue_ark = 'ark:/12148/cb30482383j' and bibl like '%tome 2%'")
        tome2_id = cursor.fetchone()[0]

    for entry in tree.xpath('/DICTIONNAIRE/article'):
        # store the data of each Place (DT article)
        place = {}
        # id/old-id of the article (e.g. 'P49443358/DT86-11608')
        place['id'] = entry.get('id')
        place['old-id'] = entry.get('old-id')
        # start page
        place['num_start_page'] = entry.get('pg')

        # redefine bibl_id for DT72 and DT80  # SHAMEFUL
        # NB: pages are compared as strings, which is fragile ('1000' <= '400' is True)
        if dt_id == 'DT72' and place['num_start_page'] <= '400':
            bibl_id = tome1_id
        elif dt_id == 'DT72' and place['num_start_page'] > '400':
            bibl_id = tome2_id
        elif dt_id == 'DT80' and entry.get('tm') == '1':
            bibl_id = tome1_id
        elif dt_id == 'DT80' and entry.get('tm') == '2':
            bibl_id = tome2_id
        # print(place['id'], '=>', bibl_id)

        # INSEE code (if the place is a commune; optional)
        place['commune_insee_code'] = entry.xpath('insee')[0].text if entry.xpath('insee') else None
        # INSEE code of the commune the place belongs to (i.e. the commune code in the localisation field)
        place['localization_commune_insee_code'] = entry.xpath('definition/localisation/commune')[0].get('insee') \
            if entry.xpath('definition/localisation/commune') and place['commune_insee_code'] is None \
            else None
        # TODO: deprecated? remove? check with CF
        control_vals = ['too_many_insee_codes', 'article_not_found', 'commune_is_empty']
        if place['localization_commune_insee_code'] in control_vals:
            place['localization_commune_insee_code'] = None

        # @precision: relation between the place (place_id) and the localization commune (localization_commune_insee_code)
        #   'certain': place located within the commune, http://vocab.getty.edu/ontology#anchor-28390563
        #   'approximatif': place located near the commune, http://vocab.getty.edu/ontology#anchor1075244680
        #   no @precision: cases impossible to settle -> tgn3000_related_to ?
        # TODO: some cases where @precision is not set: how to define the relation type? check with CF
        localization_commune_relation_type = entry.xpath('definition/localisation/commune')[0].get('precision') \
            if entry.xpath('definition/localisation/commune') and place['localization_commune_insee_code'] is not None \
            else None
        # the place is in the vicinity of the commune
        if localization_commune_relation_type == 'approximatif':
            place['localization_commune_relation_type'] = 'tgn3000_related_to'
        # the place is located within the commune
        elif localization_commune_relation_type == 'certain':
            place['localization_commune_relation_type'] = 'broaderPartitive'
        else:
            place['localization_commune_relation_type'] = None

        # formatting of place_description.content (xml_dt:definition)
        # LINKS
        # - feature types (FT): html:a, without @href at this stage of the project
        # - parent commune: html:a, with the INSEE code as @href
        # - cross-references: html:a, with @rel='search' and @data-dpt='{dpt-id}' to build the search links.
        # TODO: standardize the FT links
        remove_tags = re.compile('</?(definition|localisation|date|renvoi|commune)[^>]*>')
        # only match INSEE codes conforming to the pattern [0-9]{5}
        rename_commune_optag = re.compile('<commune insee="([0-9]{5})"[^>]*>')
        rename_commune_cltag = re.compile('</commune>')
        rename_typo_tag = re.compile('<(/?)typologie[^>]*>')
        if entry.xpath('definition'):
            description = tostring(entry.xpath('definition')[0], encoding='unicode')
            description = ' '.join(description.split())
            # MIND the order of the replaces!!! (commune is rewritten before being removed…)
            description = re.sub(rename_commune_optag, '<a href="\\1">', description)
            description = re.sub(rename_commune_cltag, '</a>', description)
            description = re.sub(rename_typo_tag, '<\\1a>', description)
            # links on the cross-references (twisted, as this wasn't done in XSLT initially)
            description = re.sub(
                '(<renvoi>.*)<sm>([^<]+)</sm>(.*</renvoi>)',
                '\\1<a rel="search" data-dpt="'+dpt+'"><span class="sc">\\2</span></a>\\3',
                description)
            description = re.sub(remove_tags, '', description)
            # Sad to discover the schema is not respected… shameful hacks follow
            # small caps in the descriptions (mostly centuries)
            description = re.sub(re.compile('<sm>([^<]+)</sm>'), '<span class="sc">\\1</span>', description)
            # segmentation errors in the XML source
            description = description.replace('</span> <sup>', '</span><sup>')
            # drop the page breaks
            description = re.sub(re.compile('<pg>[0-9]+</pg>'), '', description)
            # references…
            description = re.sub(re.compile('<(/?)reference>'), '<\\1cite>', description)
            # very faulty punctuation around tags TODO: assess
            description = punctuation_clean(description)
            # belt and braces: trim()
            description = description.strip()
            # uppercase first letter of description (quite convoluted…)
            re_first_letter = re.compile('(<a>)?([^ ])')
            first_letter_pos = re.match(re_first_letter, description).start(2)
            description = ''.join([description[:first_letter_pos],
                                   description[first_letter_pos].upper(),
                                   description[first_letter_pos + 1:]])
        else:
            description = None

        # HTML5 validation; an error is reported otherwise
        description_authorized_tags_set = {'p', 'a', 'i', 'sup', 'span', 'cite'}
        if description is not None:
            html_snippet_validator(description, place['id'], description_authorized_tags_set)
        # TODO: do we load the description even if it is not valid?
        place['description'] = description

        # département id
        place['dpt'] = dpt

        # HEADWORD (place.label)
        """ 2020-07: decision to drop the distinction between the main headword and secondary headwords (alt_label)
        place['label'] = entry.xpath('vedette/sm[1]')[0].text.rstrip(',')
        place['label'] = place['label'].strip()
        place['label'] = place['label'].replace('*', '')
        # the secondary headwords (optional, but frequent)
        place['alt_labels'] = []
        for i in entry.xpath('vedette//sm[position()>1]'):
            place['alt_labels'].append(i.text.rstrip(','))
        """
        place['label'] = tostring(entry.xpath('vedette')[0], method='text', encoding='unicode')
        # TODO: check all trailing punctuation on vedette/label (and strip it all)
        # place['label'] = place['label'].strip().rstrip('.,;')
        # place['label'] = place['label'].strip().strip('.,; »«*,')
        place['label'] = place['label'].strip().strip('.,;» «*,')
        # SN: the "*" prefix marks reconstructed/hypothetical forms for vanished places. Remove it?
        # sometimes found at the end of the headword (cf. DT72)
        place['label'] = place['label'].replace('*', '')

        # feature types
        place['feature_types'] = []
        for i in entry.xpath('definition/typologie'):
            place['feature_types'].append(i.text.rstrip(','))

        # COMMENT**S**
        # possibly several comments and several paragraphs per comment (//commentaire[2]/p[2])
        # one html5:article per comment, with html5:p
        # NB: impossible to determine what a comment is about (the article? the form?)
        # HTML5 conversion of each comment
        # contains the elements: p, pg, date, forme_ancienne2, i, sm, sup, note, reference, renvoi
        # TODO: REWORK THE CROSS-REFERENCE TRANSFORMATION TO RUN THE SEARCH ON THE RIGHT DPT
        # TODO: some cross-references are in //renvoi/i (not //renvoi/sm): normalize the XML?
        # TODO: check with JP whether the comment goes inside <article>
        # TODO: what about <dfn> for the formes anciennes2?
        commentaire2html = io.StringIO('''\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text"/>
    <xsl:template match="/">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="pg"/>
    <xsl:template match="p">
        <xsl:text>&lt;p&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/p&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="forme_ancienne2">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="reference">
        <xsl:text>&lt;cite&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/cite&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="renvoi">
        <xsl:apply-templates/>
    </xsl:template>
    <xsl:template match="sm[parent::renvoi]">
        <xsl:text>&lt;a rel="search" data-dpt="</xsl:text>
        <xsl:value-of select="/DICTIONNAIRE/@dep"/>
        <xsl:text>"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/a&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="note">
        <xsl:text>&lt;span class="note"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/span&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="sup">
        <xsl:text>&lt;sup&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/sup&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="sm">
        <xsl:text>&lt;span class="sc"&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/span&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="i">
        <xsl:text>&lt;i&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/i&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="date">
        <xsl:text>&lt;time&gt;</xsl:text>
        <xsl:apply-templates/>
        <xsl:text>&lt;/time&gt;</xsl:text>
    </xsl:template>
    <xsl:template match="ads"/>
</xsl:stylesheet>''')
        xslt_commentaire2html = etree.parse(commentaire2html)
        transform_commentaire2html = etree.XSLT(xslt_commentaire2html)

        place['comment'] = ''
        if entry.xpath('commentaire'):
            for commentaire in entry.xpath('commentaire'):
                comment = str(transform_commentaire2html(commentaire)).strip()
                # remove multiple spaces
                comment = " ".join(comment.split())
                # hack around the bad XML format (one more won't hurt)
                comment = comment.replace('.</a>', '</a>.')
                comment = comment.replace('.">', '">')
                comment = comment.replace(' <sup>', '<sup>')
                # TODO: optimize the append
                place['comment'] += comment
            # HTML5 validation (we only test; the row is inserted into the DB anyway)
            comment_authorized_tags_set = {'p', 'a', 'i', 'sup', 'span', 'cite', 'time'}
            html_snippet_validator(place['comment'], place['id'], comment_authorized_tags_set)
        else:
            place['comment'] = None

        # INSERTS
        # bibl: see above, before looping over the XML source

        # responsibility
        creation_date = datetime.now().isoformat(timespec='seconds')
        try:
            cursor.execute(
                "INSERT INTO responsibility ("
                "user_id,"
                "bibl_id,"
                "num_start_page,"
                "creation_date)"
                "VALUES (?, ?, ?, ?)",
                (user_id, bibl_id, place['num_start_page'], creation_date))
        except sqlite3.IntegrityError as e:
            print(e, "insert responsibility, place %s" % (place['id']))
        responsability_id = cursor.lastrowid
        # db.commit()

        # register
        try:
            cursor.execute(
                "INSERT INTO id_register ("
                "primary_value, secondary_value)"
                "VALUES (?, ?)",
                (place['id'], place['old-id']))
        except sqlite3.IntegrityError as e:
            print(e, "insert id_register, place %s" % (place['id']))

        # place
        try:
            cursor.execute(
                "INSERT INTO place ("
                "place_id,"
                "label,"
                "country,"
                "dpt,"
                "commune_insee_code,"
                "localization_commune_insee_code,"
                "localization_commune_relation_type,"
                "responsibility_id)"
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (place['id'], place['label'], 'FR', place['dpt'],
                 place['commune_insee_code'],
                 place['localization_commune_insee_code'],
                 place['localization_commune_relation_type'],
                 responsability_id))
        except sqlite3.IntegrityError as e:
            print(e, "insert place, place %s" % (place['id']))
        # db.commit()

        """ 2020-07: alt_label insertion abandoned
        # place_alt_label
        if place['alt_labels']:
            for alt_label in place['alt_labels']:
                cursor.execute(
                    "INSERT INTO place_alt_label ("
                    "label,"
                    "responsibility_id,"
                    "place_id)"
                    "VALUES (?, ?, ?)",
                    (alt_label, responsability_id, place['id']))
            db.commit()
        """

        # place_comment
        if place['comment']:
            try:
                cursor.execute(
                    "INSERT INTO place_comment ("
                    "content,"
                    "responsibility_id,"
                    "place_id)"
                    "VALUES (?, ?, ?);",
                    (place['comment'], responsability_id, place['id']))
            except sqlite3.IntegrityError as e:
                print(e, "insert place_comment, place %s" % (place['id']))
            # db.commit()

        # place_description
        if place['description']:
            try:
                cursor.execute(
                    "INSERT INTO place_description ("
                    "content,"
                    "responsibility_id,"
                    "place_id)"
                    "VALUES (?, ?, ?);",
                    (place['description'], responsability_id, place['id']))
            except sqlite3.IntegrityError as e:
                print(e, "insert place_description, place %s" % (place['id']))
            # db.commit()

        # place_feature_type
        if place['feature_types']:
            for feature_type in place['feature_types']:
                try:
                    cursor.execute(
                        "INSERT INTO place_feature_type ("
                        "term,"
                        "responsibility_id,"
                        "place_id)"
                        "VALUES (?, ?, ?);",
                        (feature_type, responsability_id, place['id']))
                except sqlite3.IntegrityError as e:
                    print(e, ("insert place_feature_type: place %s – FT '%s'"
                              % (place['id'], feature_type)))
    db.commit()
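# html_snippet_validator is called above but not defined in this module. A
# minimal sketch of what it is assumed to do (parse the snippet with lxml and
# report any tag outside the authorized set); the real implementation may differ:
# def html_snippet_validator(snippet, place_id, authorized_tags):
#     from lxml import html as lxml_html
#     fragment = lxml_html.fragment_fromstring(snippet, create_parent='div')
#     for el in fragment.iter():
#         if el.tag != 'div' and el.tag not in authorized_tags:
#             print("HTML validation, place %s: unexpected tag <%s>" % (place_id, el.tag))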