Example #1
0
    def command_upload(self):
        r'''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

            Reads the file at XML_PATH and saves it as the content of the
            TextContentXML record found for (IP_ID, CONTENT_TYPE).
            If XPATH is given, only the first element it matches is saved.
        '''
        # NB: the docstring is a raw string so that the backslashes of the
        # sample Windows path are kept verbatim ('\r' would otherwise become
        # a carriage return).

        # self.args[0] is the command name, the next three are mandatory
        if len(self.args) < 4:
            print('upload requires 3 arguments')
            return

        xml_path, ip_id, content_type_name = self.args[1:4]
        xpath = self.args[4] if len(self.args) > 4 else None

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print('ERROR: could not find record (%s, %s)' % (
                ip_id, content_type_name))
            return

        # II load the file
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if not els:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            # Serialise the first matching element. The previous code passed
            # the lxml.etree module here instead of the matched element.
            content = dputils.get_unicode_from_xml(els[0], remove_root=True)
        else:
            content = xml_string

        # Safety net: abort before saving if this numeric entity is present.
        if '&#361;' in content:
            print('Numeric entity')
            # same effect as exit(), without relying on the helper that the
            # site module injects for interactive sessions
            raise SystemExit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print('Uploaded %s into record #%s' % (
            filesizeformat(tcx.get_length()), tcx.id))
Example #2
0
    def command_upload(self):
        r'''upload    XML_PATH IP_ID CONTENT_TYPE [XPATH]
            pm dptext upload exon\source\rekeyed\converted\EXON-1-493.xhtml 1 transcription

            Reads the file at XML_PATH and saves it as the content of the
            TextContentXML record found for (IP_ID, CONTENT_TYPE).
            If XPATH is given, only the first element it matches is saved.
        '''
        # NB: the docstring is a raw string so that the backslashes of the
        # sample Windows path are kept verbatim ('\r' would otherwise become
        # a carriage return).

        # self.args[0] is the command name, the next three are mandatory
        if len(self.args) < 4:
            print('upload requires 3 arguments')
            return

        xml_path, ip_id, content_type_name = self.args[1:4]
        xpath = self.args[4] if len(self.args) > 4 else None

        # I find the TextContentXML record (or create it)
        tcx = self.get_textcontentxml(ip_id, content_type_name)
        if not tcx:
            print('ERROR: could not find record (%s, %s)' % (
                ip_id, content_type_name))
            return

        # II load the file
        from digipal.utils import read_file, get_xml_from_unicode
        xml_string = read_file(xml_path)

        # III get the XML into a string
        if xpath:
            xml = get_xml_from_unicode(xml_string, add_root=True)
            els = xml.xpath(xpath)
            if not els:
                raise Exception(u'No match for XPATH "%s"' % xpath)
            # Serialise the first matching element. The previous code passed
            # the lxml.etree module here instead of the matched element.
            content = dputils.get_unicode_from_xml(els[0], remove_root=True)
        else:
            content = xml_string

        # Safety net: abort before saving if this numeric entity is present.
        if '&#361;' in content:
            print('Numeric entity')
            # same effect as exit(), without relying on the helper that the
            # site module injects for interactive sessions
            raise SystemExit()

        # IV convert the xml tags and attribute to HTML-TEI
        # content = self.get_xhtml_from_xml(content)

        # save the content into the TextContentXML record
        tcx.content = content
        tcx.save()

        from django.template.defaultfilters import filesizeformat
        print('Uploaded %s into record #%s' % (
            filesizeformat(tcx.get_length()), tcx.id))
Example #3
0
def get_tei_from_text_response(response, item_partid, content_type):
    '''Return a TEI document built from the XHTML held in a text response.

    response: dict-like; its 'content' entry holds the XHTML fragment to
        convert (a missing entry is treated as an empty string).
    item_partid: id of the ItemPart the text belongs to; used to fill in
        the manuscript metadata passed to the XSLT template.
    content_type: name of the content type; title-cased into the title.

    The fragment is wrapped in a <root> element and transformed with the
    'digipal_text/tei_from_xhtml.xslt' template rendered with that metadata.
    '''
    import re
    from HTMLParser import HTMLParser
    from django.template.loader import render_to_string
    from digipal.models import ItemPart

    ret = response.get('content', '')

    # Decode entities (e.g. &rsquo;) into plain characters, EXCEPT those
    # standing for XML markup characters: decoding &lt; or &quot; would turn
    # escaped text into markup and break the parsing down the line.
    parser = HTMLParser()
    xml_escapes = {
        u'&': u'&amp;',
        u'<': u'&lt;',
        u'>': u'&gt;',
        u'"': u'&quot;',
        u"'": u'&apos;',
    }

    def decode_entity(match):
        # unknown entities are returned unchanged by unescape()
        decoded = parser.unescape(match.group(0))
        return xml_escapes.get(decoded, decoded)

    ret = re.sub(r'&#?\w+;', decode_entity, ret)
    # escape any remaining bare & (incl. unknown entities) to keep the XML
    # well-formed, without double-escaping the core entities kept above
    ret = re.sub(r'&(?!(?:amp|lt|gt|quot|apos);)', u'&amp;', ret)

    # NOTE: first() returns None when no ItemPart has this id; the metadata
    # lookups below would then raise AttributeError.
    itempart = ItemPart.objects.filter(id=item_partid).first()

    # NOTE(review): the edition date always comes from the 'translation'
    # record, whatever content_type is - confirm this is intended.
    tcx = TextContentXML.objects.filter(
        text_content__type__slug='translation',
        text_content__item_part__id=item_partid
    ).first()

    current_item = itempart.current_item
    repository = current_item.repository
    context = {
        'meta': {
            'title': '%s of %s' % (content_type.title(), itempart),
            'ms': {
                'place': repository.place.name,
                'repository': repository.name,
                'shelfmark': current_item.shelfmark,
            },
            'edition': {
                # the item part may have no translation record at all
                'date': tcx.modified if tcx else None
            },
            'project': settings.SITE_TITLE,
            'authority': settings.SITE_TITLE,
        },
    }

    # the XSLT itself is a Django template, rendered with the metadata
    template = render_to_string('digipal_text/tei_from_xhtml.xslt', context)
    ret = dputils.get_xslt_transform('<root>%s</root>' % ret, template)

    # convert XML to string and drop the empty namespace declarations
    ret = dputils.get_unicode_from_xml(xmltree=ret).replace('xmlns=""', '')

    return ret
Example #4
0
def get_tei_from_text_response(response, item_partid, content_type):
    '''Return a TEI document built from the XHTML held in a text response.

    response: dict-like; its 'content' entry holds the XHTML fragment to
        convert (a missing entry is treated as an empty string).
    item_partid: id of the ItemPart the text belongs to; used to fill in
        the manuscript metadata passed to the XSLT template.
    content_type: name of the content type; title-cased into the title.

    The fragment is wrapped in a <root> element and transformed with the
    'digipal_text/tei_from_xhtml.xslt' template rendered with that metadata.
    '''
    import re
    from HTMLParser import HTMLParser
    from django.template.loader import render_to_string
    from digipal.models import ItemPart

    ret = response.get('content', '')

    # Decode entities (e.g. &rsquo;) into plain characters, EXCEPT those
    # standing for XML markup characters: decoding &lt; or &quot; would turn
    # escaped text into markup and break the parsing down the line.
    parser = HTMLParser()
    xml_escapes = {
        u'&': u'&amp;',
        u'<': u'&lt;',
        u'>': u'&gt;',
        u'"': u'&quot;',
        u"'": u'&apos;',
    }

    def decode_entity(match):
        # unknown entities are returned unchanged by unescape()
        decoded = parser.unescape(match.group(0))
        return xml_escapes.get(decoded, decoded)

    ret = re.sub(r'&#?\w+;', decode_entity, ret)
    # escape any remaining bare & (incl. unknown entities) to keep the XML
    # well-formed, without double-escaping the core entities kept above
    ret = re.sub(r'&(?!(?:amp|lt|gt|quot|apos);)', u'&amp;', ret)

    # NOTE: first() returns None when no ItemPart has this id; the metadata
    # lookups below would then raise AttributeError.
    itempart = ItemPart.objects.filter(id=item_partid).first()

    # NOTE(review): the edition date always comes from the 'translation'
    # record, whatever content_type is - confirm this is intended.
    tcx = TextContentXML.objects.filter(
        text_content__type__slug='translation',
        text_content__item_part__id=item_partid
    ).first()

    current_item = itempart.current_item
    repository = current_item.repository
    context = {
        'meta': {
            'title': '%s of %s' % (content_type.title(), itempart),
            'ms': {
                'place': repository.place.name,
                'repository': repository.name,
                'shelfmark': current_item.shelfmark,
            },
            'edition': {
                # the item part may have no translation record at all
                'date': tcx.modified if tcx else None
            },
            'project': settings.SITE_TITLE,
            'authority': settings.SITE_TITLE,
        },
    }

    # the XSLT itself is a Django template, rendered with the metadata
    template = render_to_string('digipal_text/tei_from_xhtml.xslt', context)
    ret = dputils.get_xslt_transform('<root>%s</root>' % ret, template)

    # convert XML to string and drop the empty namespace declarations
    ret = dputils.get_unicode_from_xml(xmltree=ret).replace('xmlns=""', '')

    return ret