def fixstring(str):
    """Normalize stray quote characters and run the text-cleanup helpers.

    Three character sequences (two curly quotes and one mis-decoded
    remnant) are swapped for plain ASCII quotes, in a fixed order. The
    text is then handed to ``cf.convert_entities``, ``cf.convert_unicode_u``
    and ``html_to_segments`` in turn (their behaviour is defined elsewhere),
    and ``strip()`` is applied to whatever the last helper returns.
    """
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # existing callers are unaffected.
    quote_fixes = (
        (u"“", u'"'),
        (u"’", u"'"),
        (u"â€", u'"'),
    )
    text = str
    # Order matters: apply the substitutions exactly in the listed sequence.
    for bad, good in quote_fixes:
        text = text.replace(bad, good)
    text = cf.convert_entities(text)
    text = cf.convert_unicode_u(text)
    text = html_to_segments(text)
    return text.strip()
def fixstring(str):
    """Replace odd quote characters with ASCII ones, then clean the text.

    After the quote substitutions, the value flows through
    ``cf.convert_entities``, then ``cf.convert_unicode_u``, then
    ``html_to_segments`` (all defined outside this function), and the
    final result is returned with ``strip()`` applied.
    """
    # NOTE: ``str`` here is the argument, not the builtin type; the name is
    # part of the existing signature and is left untouched.
    straight_double = u'"'
    straight_single = u"'"

    cleaned = str.replace(u"“", straight_double)
    cleaned = cleaned.replace(u"’", straight_single)
    cleaned = cleaned.replace(u"â€", straight_double)

    # Helpers are applied innermost-first: entities, unicode, segments.
    segments = html_to_segments(
        cf.convert_unicode_u(cf.convert_entities(cleaned))
    )
    return segments.strip()
def cleansentence(self):
    """Return ``self.sentence`` after passing it through ``html_to_segments``."""
    raw_sentence = self.sentence
    return html_to_segments(raw_sentence)