def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]
        pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Read the file at XML_PATH and save its content into the record
    returned by self.get_textcontentxml(IP_ID, CONTENT_TYPE).

    If the optional XPATH argument is given, the file is parsed and only
    the first node matching XPATH is serialised (with remove_root=True)
    and saved; otherwise the file content is saved as read.

    Reads its arguments from self.args, where self.args[0] is the command
    name. Prints a message and returns None when there are too few
    arguments or when no record is found.

    Raises Exception when XPATH is given but matches nothing.
    '''
    # NB: print is called with a single parenthesised argument so that the
    # output is the same with the Python 2 print statement.
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the target TextContentXML record
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' %
              (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if len(els) == 0:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Serialise the matched node. The lxml etree module itself used to
        # be passed here by mistake, leaving root unused.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # Abort before saving if this numeric character reference is still
    # present in the content. exit() raises SystemExit.
    if '&#361;' in content:
        print('Numeric entity')
        exit()

    # IV the conversion of the xml tags and attributes to HTML-TEI is
    # currently disabled:
    # content = self.get_xhtml_from_xml(content)

    # save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' %
          (filesizeformat(tcx.get_length()), tcx.id))
def command_upload(self):
    r'''upload XML_PATH IP_ID CONTENT_TYPE [XPATH]
        pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

    Read the file at XML_PATH and save its content into the record
    returned by self.get_textcontentxml(IP_ID, CONTENT_TYPE).

    If the optional XPATH argument is given, the file is parsed and only
    the first node matching XPATH is serialised (with remove_root=True)
    and saved; otherwise the file content is saved as read.

    Reads its arguments from self.args, where self.args[0] is the command
    name. Prints a message and returns None when there are too few
    arguments or when no record is found.

    Raises Exception when XPATH is given but matches nothing.
    '''
    # NB: print is called with a single parenthesised argument so that the
    # output is the same with the Python 2 print statement.
    if len(self.args) < 4:
        print('upload requires 3 arguments')
        return

    xml_path, ip_id, content_type_name = self.args[1:4]
    xpath = self.args[4] if len(self.args) > 4 else None

    # I find the target TextContentXML record
    tcx = self.get_textcontentxml(ip_id, content_type_name)
    if not tcx:
        print('ERROR: could not find record (%s, %s)' %
              (ip_id, content_type_name))
        return

    # II load the file
    from digipal.utils import read_file, get_xml_from_unicode
    xml_string = read_file(xml_path)

    # III get the XML into a string
    if xpath:
        xml = get_xml_from_unicode(xml_string, add_root=True)
        els = xml.xpath(xpath)
        if len(els) == 0:
            raise Exception(u'No match for XPATH "%s"' % xpath)
        root = els[0]
        # Serialise the matched node. The lxml etree module itself used to
        # be passed here by mistake, leaving root unused.
        content = dputils.get_unicode_from_xml(root, remove_root=True)
    else:
        content = xml_string

    # Abort before saving if this numeric character reference is still
    # present in the content. exit() raises SystemExit.
    if '&#361;' in content:
        print('Numeric entity')
        exit()

    # IV the conversion of the xml tags and attributes to HTML-TEI is
    # currently disabled:
    # content = self.get_xhtml_from_xml(content)

    # save the content into the TextContentXML record
    tcx.content = content
    tcx.save()

    from django.template.defaultfilters import filesizeformat
    print('Uploaded %s into record #%s' %
          (filesizeformat(tcx.get_length()), tcx.id))
def get_tei_from_text_response(response, item_partid, content_type):
    '''Return the text content of response converted to TEI.

    response: mapping; its 'content' entry (default '') is the markup to
        convert.
    item_partid: id of the ItemPart the text belongs to; used to fill the
        metadata passed to the XSLT template.
    content_type: string; its title-cased form is used in the TEI title.

    The XSLT stylesheet is produced by rendering the Django template
    'digipal_text/tei_from_xhtml.xslt' with that metadata, then applied to
    the content wrapped in a <root> element. Returns the output of
    dputils.get_unicode_from_xml() with any 'xmlns=""' removed.
    '''
    ret = response.get('content', '')

    # decode HTML character references
    # TODO: make sure we keep core XML entities otherwise it may cause
    # parsing errors down the line
    from HTMLParser import HTMLParser
    ret = HTMLParser().unescape(ret)
    # unescape() also turned &amp; into a bare &; escape every & again to
    # keep the XML well-formed. (This replace used to be a no-op.)
    ret = ret.replace(u'&', u'&amp;')

    from digipal.models import ItemPart
    # NOTE(review): .first() returns None when nothing matches, in which
    # case the attribute accesses below raise AttributeError.
    itempart = ItemPart.objects.filter(id=item_partid).first()
    # NOTE(review): the slug is always 'translation', whatever content_type
    # is - confirm this is intended.
    tcx = TextContentXML.objects.filter(
        text_content__type__slug='translation',
        text_content__item_part__id=item_partid
    ).first()

    from django.template.loader import render_to_string
    repository = itempart.current_item.repository
    context = {
        'meta': {
            'title': '%s of %s' % (content_type.title(), itempart),
            'ms': {
                'place': repository.place.name,
                'repository': repository.name,
                'shelfmark': itempart.current_item.shelfmark,
            },
            'edition': {
                'date': tcx.modified,
            },
            'project': settings.SITE_TITLE,
            'authority': settings.SITE_TITLE,
        },
    }

    # the stylesheet is itself a Django template: render it first, then
    # apply it to the wrapped content
    template = render_to_string('digipal_text/tei_from_xhtml.xslt', context)
    ret = dputils.get_xslt_transform('<root>%s</root>' % ret, template)

    # convert XML to string and drop empty namespace declarations
    ret = dputils.get_unicode_from_xml(xmltree=ret).replace('xmlns=""', '')

    return ret
def get_tei_from_text_response(response, item_partid, content_type):
    '''Return the text content of response converted to TEI.

    response: mapping; its 'content' entry (default '') is the markup to
        convert.
    item_partid: id of the ItemPart the text belongs to; used to fill the
        metadata passed to the XSLT template.
    content_type: string; its title-cased form is used in the TEI title.

    The XSLT stylesheet is produced by rendering the Django template
    'digipal_text/tei_from_xhtml.xslt' with that metadata, then applied to
    the content wrapped in a <root> element. Returns the output of
    dputils.get_unicode_from_xml() with any 'xmlns=""' removed.
    '''
    ret = response.get('content', '')

    # decode HTML character references
    # TODO: make sure we keep core XML entities otherwise it may cause
    # parsing errors down the line
    from HTMLParser import HTMLParser
    ret = HTMLParser().unescape(ret)
    # unescape() also turned &amp; into a bare &; escape every & again to
    # keep the XML well-formed. (This replace used to be a no-op.)
    ret = ret.replace(u'&', u'&amp;')

    from digipal.models import ItemPart
    # NOTE(review): .first() returns None when nothing matches, in which
    # case the attribute accesses below raise AttributeError.
    itempart = ItemPart.objects.filter(id=item_partid).first()
    # NOTE(review): the slug is always 'translation', whatever content_type
    # is - confirm this is intended.
    tcx = TextContentXML.objects.filter(
        text_content__type__slug='translation',
        text_content__item_part__id=item_partid
    ).first()

    from django.template.loader import render_to_string
    repository = itempart.current_item.repository
    context = {
        'meta': {
            'title': '%s of %s' % (content_type.title(), itempart),
            'ms': {
                'place': repository.place.name,
                'repository': repository.name,
                'shelfmark': itempart.current_item.shelfmark,
            },
            'edition': {
                'date': tcx.modified,
            },
            'project': settings.SITE_TITLE,
            'authority': settings.SITE_TITLE,
        },
    }

    # the stylesheet is itself a Django template: render it first, then
    # apply it to the wrapped content
    template = render_to_string('digipal_text/tei_from_xhtml.xslt', context)
    ret = dputils.get_xslt_transform('<root>%s</root>' % ret, template)

    # convert XML to string and drop empty namespace declarations
    ret = dputils.get_unicode_from_xml(xmltree=ret).replace('xmlns=""', '')

    return ret