def cleanData(data):
    """Run ``data`` through ``org.w3c.tidy.Tidy`` and return the result.

    ``data`` is wrapped in a Java ``String``, encoded to UTF-8 bytes and
    fed to ``Tidy.parseDOM``; whatever Tidy writes to the output buffer is
    decoded as UTF-8 and returned.

    Tidy is configured with XHTML on, body-only printing on, ampersand
    quoting on, and the make-clean, drop-empty-paragraph and
    trim-empty-element options switched off.
    """
    # Imported at call time, as in the original, so the module itself can
    # be loaded without resolving the Tidy class.
    from org.w3c.tidy import Tidy

    utf8 = "UTF-8"

    cleaner = Tidy()
    cleaner.setXHTML(True)
    cleaner.setInputEncoding(utf8)
    cleaner.setOutputEncoding(utf8)
    # The clean-up options below are explicitly disabled.
    # NOTE(review): presumably to leave the markup structure as untouched
    # as possible -- confirm against the JTidy documentation.
    cleaner.setMakeClean(False)
    cleaner.setDropEmptyParas(False)
    cleaner.setPrintBodyOnly(True)
    cleaner.setQuoteAmpersand(True)
    cleaner.setTrimEmptyElements(False)

    rawBytes = String(data).getBytes(utf8)
    source = ByteArrayInputStream(rawBytes)
    sink = ByteArrayOutputStream()

    # parseDOM's return value is ignored; only the serialized output
    # written to ``sink`` is used.
    cleaner.parseDOM(source, sink)
    return sink.toString(utf8)