def process(self, path, metadata):
    """Normalise a supported HTML file to XHTML with the ``tidy`` tool.

    Files whose extension is not listed in ``self.supported_extensions``
    are passed through untouched. Otherwise the file is copied via
    ``copy_to_secure_location``, the original is discarded via
    ``remove_file_dir``, ``<sdfield>`` tags are rewritten by
    ``rename_sdfield_tags`` and the copy is cleaned in place by ``tidy``.

    :param path: path of the file to process.
    :param metadata: passed through unchanged.
    :returns: tuple ``(path_of_resulting_file, metadata)``.
    :raises OSError: if the ``tidy`` executable cannot be started.
    """
    # Local import: the file-level import block is not visible from here.
    import subprocess

    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    # NOTE(review): copy_to_secure_location() presumably returns the
    # directory that now holds a copy of the file -- confirm.
    src_path = os.path.join(copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # Rewrite <SDFIELD> tags if any. The context manager closes the
    # handle instead of leaving it to the garbage collector.
    with open(src_path, 'rb') as fd:
        cleaned_html = rename_sdfield_tags(fd.read().decode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(cleaned_html.encode('utf-8'))

    error_file = os.path.join(src_dir, 'tidy-errors')
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and rules out command injection.
    cmd = ['tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
           '-f', error_file, src_path]
    # tidy exits non-zero on mere warnings, so its status is ignored.
    subprocess.call(cmd)

    # The error log is a throw-away artefact; do not fail if tidy did
    # not create it.
    if os.path.exists(error_file):
        os.unlink(error_file)
    return src_path, metadata
def process(self, path, metadata):
    """Normalise a supported HTML file to XHTML with the ``tidy`` tool.

    Files whose extension is not listed in ``self.supported_extensions``
    are passed through untouched. Otherwise the file is copied via
    ``copy_to_secure_location``, the original is discarded via
    ``remove_file_dir``, ``<sdfield>`` tags are rewritten by
    ``rename_sdfield_tags`` and the copy is cleaned in place by ``tidy``.

    :param path: path of the file to process.
    :param metadata: passed through unchanged.
    :returns: tuple ``(path_of_resulting_file, metadata)``.
    :raises OSError: if the ``tidy`` executable cannot be started.
    """
    # Local import: the file-level import block is not visible from here.
    import subprocess

    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata

    basename = os.path.basename(path)
    # NOTE(review): copy_to_secure_location() presumably returns the
    # directory that now holds a copy of the file -- confirm.
    src_path = os.path.join(copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)

    # Rewrite <SDFIELD> tags if any. The context manager closes the
    # handle instead of leaving it to the garbage collector.
    with open(src_path, 'rb') as fd:
        cleaned_html = rename_sdfield_tags(fd.read().decode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(cleaned_html.encode('utf-8'))

    error_file = os.path.join(src_dir, 'tidy-errors')
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and rules out command injection.
    cmd = ['tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
           '-f', error_file, src_path]
    # tidy exits non-zero on mere warnings, so its status is ignored.
    subprocess.call(cmd)

    # The error log is a throw-away artefact; do not fail if tidy did
    # not create it.
    if os.path.exists(error_file):
        os.unlink(error_file)
    return src_path, metadata
def test_rename_sdfield_tags_nested(self):
    """Markup nested inside an <sdfield> element is kept as-is."""
    source = '<p>Blah<sdfield>12<span>b</span></sdfield></p>'
    wanted = '<p>Blah<span class="sdfield">12<span>b</span></span></p>'
    assert rename_sdfield_tags(source) == wanted
def test_rename_sdfield_tags_empty(self):
    """Input without any <sdfield> element comes back unchanged."""
    source = '<p>Blah</p>'
    wanted = '<p>Blah</p>'
    assert rename_sdfield_tags(source) == wanted
def test_rename_sdfield_tags_uppercase(self):
    """Upper-case <SDFIELD> tags are renamed; attributes keep their case."""
    source = '<P>Blah<SDFIELD TYPE="PAGE">8</SDFIELD></P>'
    wanted = '<P>Blah<span class="sdfield" TYPE="PAGE">8</span></P>'
    assert rename_sdfield_tags(source) == wanted
def test_rename_sdfield_tags(self):
    """An <sdfield> becomes <span class="sdfield"> with attributes kept."""
    source = '<p>Blah<sdfield type="PAGE">8</sdfield></p>'
    wanted = '<p>Blah<span class="sdfield" type="PAGE">8</span></p>'
    assert rename_sdfield_tags(source) == wanted