def convert(self): if len(self.cargs) < 3: raise CommandError("Convert requires 2 arguments") xml_path = self.cargs[1] xslt_path = self.cargs[2] out_file = self.cargs[3] if len(self.cargs) > 3 else None xml_string = utils.readFile(xml_path) xml_string = re.sub(ur"\bxmlns=", ur"xmlns2=", xml_string) # TODO: remove this hack, only for odt conversion # position 33% is like 'super' style xml_string = re.sub(ur'"-33%', ur'"sub', xml_string) xml_string = re.sub(ur'"33%', ur'"super', xml_string) xslt_string = utils.readFile(xslt_path) # replacements in the XSLT comments, xslt_string = self.parse_xslt_directives(xslt_string, xml_string) ret = str(dputils.get_xslt_transform(xml_string, xslt_string)) if out_file: dputils.write_file(out_file, str(comments) + ret, encoding=None) else: print str(comments) + ret return ret
def command_download(self): ret = ur'' recordid = self.args[1] unitid = '' if len(self.args) > 2: unitid = self.args[2] from digipal_text.models import TextContentXML from digipal_text.views.viewer import get_fragment_extent, get_all_units text_content_xml = TextContentXML.objects.get(id=recordid) content = text_content_xml.content suffix = '' if unitid: suffix = '-unit' units = get_all_units(content, 'entry') for unit in units: if unit['unitid'] == unitid: ret = ur'<root>%s</root>' % unit['content'] else: ret = content import regex if ret is None: ret = u'' # print repr(ret) file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix) from digipal.utils import write_file write_file(file_name, ret) print 'Written file %s ' % file_name
def convert(self): if len(self.cargs) < 3: raise CommandError('Convert requires 2 arguments') xml_path = self.cargs[1] xslt_path = self.cargs[2] out_file = self.cargs[3] if len(self.cargs) > 3 else None xml_string = utils.readFile(xml_path) xml_string = re.sub(ur'\bxmlns=', ur'xmlns2=', xml_string) # TODO: remove this hack, only for odt conversion # position 33% is like 'super' style xml_string = re.sub(ur'"-33%', ur'"sub', xml_string) xml_string = re.sub(ur'"33%', ur'"super', xml_string) xslt_string = utils.readFile(xslt_path) # replacements in the XSLT comments, xslt_string = self.parse_xslt_directives(xslt_string, xml_string) ret = str(dputils.get_xslt_transform(xml_string, xslt_string)) if out_file: dputils.write_file(out_file, str(comments) + ret, encoding=None) else: print str(comments) + ret return ret
def command_autoconvert(self): dry = self.is_dry_run() from digipal_text.models import TextContentXML, TextAnnotation from digipal_text.views import viewer before = ur'' after = ur'' total = 0 converted = 0 for tcx in TextContentXML.objects.filter( text_content__type__slug='transcription').order_by('id'): total += 1 content = tcx.content if not content: continue tcx.convert() if content != tcx.content: converted += 1 text_name = u'#%s: %s [length diff = %s]' % ( tcx.id, tcx, abs(len(content) - len(tcx.content))) print text_name before += u'\n\n' before += text_name before += u'\n\n' before += content.replace('\r', '\n') after += u'\n\n' after += text_name after += u'\n\n' after += tcx.content.replace('\r', '\n') if 0: html = '' from difflib import HtmlDiff diff = HtmlDiff(tabsize=2) d = diff.make_table([content], [tcx.content]) html += u'<h2>%s</h2>' % text_name html += d if not dry: tcx.save() #break #tcx.save() dputils.write_file('before.txt', before) dputils.write_file('after.txt', after) print '%s converted out of %s texts' % (converted, total) if dry: print 'DRY RUN: no data was changed in the database.'
def command_download(self): ret = ur'' recordid = self.args[1] unitid = '' if len(self.args) > 2: unitid = self.args[2] from digipal_text.models import TextContentXML from digipal_text.views.viewer import get_fragment_extent, get_all_units text_content_xml = TextContentXML.objects.get(id=recordid) content = text_content_xml.content suffix = '' if unitid: suffix = '-unit' units = get_all_units(content, 'entry') for unit in units: if unit['unitid'] == unitid: ret = ur'<root>%s</root>' % unit['content'] else: ret = content import regex if ret is None: ret = u'' # ret = regex.sub(ur'(?musi)<span data-dpt="abbr">.*?</span>(<span data-dpt="exp">)', ur'\1', ret) # ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="su[pb]">(.*?)</span>', ur'\1', ret) # ret = regex.sub(ur'(?musi)<i>(.*?)</i>', ur'\1', ret) # print repr(ret) # for it in regex.findall('<span data-dpt="hi" data-dpt-rend="su[pb]">.*?</span>', ret): # print repr(it) # for it in regex.findall(ur'(?musi)qu[i1][i1]', ret): # print repr(it) if 0: ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sup">([^<]+)</span>', ur'<sup>\1</sup>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sub">([^<]+)</span>', ur'<sub>\1</sub>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="ms"></span>', ur'<br/>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="prj"></span>', ur'<lb/>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="abbr">(.*?)</span>', ur'<abbr>\1</abbr>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="exp">(.*?)</span>', ur'<exp>\1</exp>', ret) # print repr(ret) file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix) from digipal.utils import write_file write_file(file_name, ret) print 'Written file %s ' % file_name
def html2md(self): if len(self.args) < 2: print 'ERROR: missing path. Check help.' exit() path = self.args[1] from digipal.views import doc from django.utils.text import slugify for path in utils.get_all_files_under(path, file_types='f', filters=self.options['filter'], extensions=['html', 'htm'], can_return_root=True): info = doc.get_md_from_html(path) target = os.path.join(doc.get_doc_root_path('digipal'), slugify(info['title']))+'.md' if 'confluence-workbox' in target: continue utils.write_file(target, info['md']) print '%s\n => %s' % (path, target) for f in info['files']: print ' + %s' % f