def get_xhtml_from_xml(self, xml_string):
    '''
    Convert the XML tags and attributes of xml_string to HTML-TEI.

    Rules applied, in order:
      * XML comments are removed;
      * <p> and <span> tags (opening or closing) are left untouched;
      * any other closing tag becomes </span>;
      * any other tag becomes <span data-dpt="TAG" ...>, every attribute
        name being prefixed with "data-dpt-". The end of the tag
        (">" or "/>") is copied as found in the input.

    e.g. <pb n="1"/> => <span data-dpt="pb" data-dpt-n="1"/>
    (assuming re_sub_fct substitutes each match like re.sub does
    -- TODO confirm in digipal.utils)

    xml_string: the XML document as a string.
    Returns the converted string.

    Side effects: resets self.c (number of conversions that were not
    served from the cache) and self.conversion_cache (maps the source
    text of a tag to its converted form). Prints self.c each time a <pb>
    tag is converted.

    Raises SystemExit if more than 10e6 non-cached conversions occur.
    '''
    # Local imports: sys is only needed by the runaway guard below.
    import sys
    from digipal.utils import re_sub_fct

    # Safety valve against a runaway conversion.
    max_conversions = 10e6

    # remove comments
    content = re.sub(u'(?musi)<!--.*?-->', u'', xml_string)

    self.c = 0
    self.conversion_cache = {}

    def replace_tag(match):
        source_tag = match.group(0)

        # Identical tags always convert to the same output.
        if source_tag in self.conversion_cache:
            return self.conversion_cache[source_tag]

        self.c += 1
        if self.c > max_conversions:
            # sys.exit() rather than the bare exit() builtin: exit() is a
            # helper installed by the site module for interactive use and
            # is not guaranteed to exist. The message makes the abort
            # visible instead of ending the process silently.
            sys.exit(
                'ERROR: more than %d tag conversions, aborting' %
                max_conversions
            )

        tag = match.group(2)

        # don't convert <p> (nor <span>)
        if tag in ['p', 'span']:
            return source_tag

        # any closing tag is /span
        if '/' in match.group(1):
            return '</span>'

        if tag == 'pb':
            print(self.c)

        # tag
        ret = u'<span data-dpt="%s"' % tag
        # attributes - assumes " for attribute values
        attrs = re.sub(
            u'(?ui)(\\w+)(\\s*=\\s*")',
            u'data-dpt-\\1\\2',
            match.group(3)
        ).strip()
        if attrs:
            ret += ' ' + attrs
        # end of the tag, as found in the input: ">" or "/>"
        ret += match.group(4)

        self.conversion_cache[source_tag] = ret

        return ret

    content = re_sub_fct(
        content,
        u'(?musi)(<\\s*/?\\s*)(\\w+)([^>]*?)(/?\\s*>)',
        replace_tag
    )

    return content
def operation_foliate(self, options, content):
    '''
    Replace marginal folio notes with locus markers.

        <span data-dpt="margin">fol. 1. b</span>[...]
        =>
        </p><p><span data-dpt="location" data-dpt-loctype="locus">1v</span></p><p>

    A folio number followed by "b" (in either case) is a verso ("v");
    a folio number alone is a recto ("r").

    options: not used by this operation.
    content: the text to convert.
    Returns the converted text.

    Side effects: self._next_locus is reset to "1r" and then tracks the
    locus expected next. A warning is printed when a note cannot be
    parsed (the note is then left unchanged) or when a locus is not the
    expected one. Each converted locus is printed as well.
    '''
    self._next_locus = u'1r'

    def replace(match):
        locus = match.group(1)
        parts = re.match(u'(?musi)^\\s?(\\d+)\\.?\\s*(b?)\\.?$', locus)
        if not parts:
            print('WARNING: no match [%s]' % repr(locus))
            # leave the margin note as it is
            return match.group(0)

        folio = parts.group(1)
        # The pattern is case-insensitive, so the side marker can be "b"
        # or "B"; compare accordingly, otherwise "B" is taken for a recto.
        if parts.group(2).lower() == 'b':
            lo = folio + 'v'
            # a verso is followed by the recto of the next folio
            lon = u'%sr' % (int(folio) + 1,)
        else:
            lo = folio + 'r'
            # a recto is followed by the verso of the same folio
            lon = folio + 'v'

        print('%s ("%s")' % (lo, locus))

        if lo != self._next_locus:
            print(
                'WARNING: locus out of sequence, expected %s, got %s' %
                (self._next_locus, lo)
            )
        self._next_locus = lon

        return (
            u'</p><p><span data-dpt="location" data-dpt-loctype="locus">'
            u'%s</span></p><p>' % lo
        )

    # NOTE(review): the "." after "fol" is not escaped so it matches any
    # character, not only a full stop -- confirm whether this is wanted.
    content = re_sub_fct(
        content,
        u'(?musi)<span data-dpt="margin">\\s*fol.([^<]*)</span>',
        replace
    )

    return content
def operation_pb2locus(self, options, content):
    '''
    Replace each opening <span data-dpt="pb" ...> tag with a locus marker:

        <p><span data-dpt="location" data-dpt-loctype="locus">N</span></p>

    N is the last double-quoted, non-empty value found among the other
    attributes of the tag. When there is none, N is a running page number
    instead.

    options: optional sequence; options[0] is the first page number
        (defaults to 1).
    content: the text to convert.
    Returns the converted text.

    Side effect: self.rep_option holds the running page number.
    '''
    self.rep_option = int(options[0]) if options else 1

    locus_template = (
        u'<p><span data-dpt="location" data-dpt-loctype="locus">'
        u'%s</span></p>'
    )

    def replace(match):
        # !!! ASSUME pb is not in <p> or anything else
        attributes = match.group(1)
        number = re.sub(u'^.*"([^"]+)".*$', u'\\1', attributes)
        if number == attributes:
            # nothing was substituted: the tag carries no quoted value,
            # fall back to the running page number
            number = self.rep_option

        locus = locus_template % number

        # get_int is defined elsewhere; presumably it parses number as an
        # int and returns default when it cannot -- TODO confirm
        self.rep_option = get_int(number, default=self.rep_option) + 1

        return locus

    content = re_sub_fct(
        content,
        u'<span\\s+data-dpt\\s*=\\s*"pb"([^>]*)>',
        replace
    )

    return content