def get_text(html):
    """Decode the raw bytes of a fetched page into a unicode string.

    The character encoding is guessed from the bytes themselves with
    ``UniversalDetector``.

    Args:
        html: A two-item sequence ``(data, page)``. Only ``data`` (the raw
            byte string) is used; the second item is ignored.

    Returns:
        ``data`` decoded to unicode. If the detector cannot name an
        encoding, the bytes are decoded as UTF-8 with undecodable bytes
        replaced, so the caller still receives text.
    """
    data, _page = html

    detector = UniversalDetector()
    detector.feed(data)
    detector.close()

    encoding = detector.result['encoding']
    if encoding is None:
        # The detector reports None when it cannot decide (for example on
        # empty input); passing None to unicode() would raise TypeError.
        return unicode(data, 'utf-8', 'replace')
    return unicode(data, encoding)
def get_authors_title_test():
    """Check author/title extraction against a known lib.ru document.

    Downloads the first 2048 bytes of the page, decodes them using the
    encoding reported by ``UniversalDetector`` and asserts that
    ``Retriever.get_authors_and_title`` returns the expected single author
    and title. Requires network access.
    """
    import urllib

    url = 'http://lib.ru/TXT/ruscience.txt'
    page = urllib.urlopen(url + '_Ascii.txt')
    try:
        raw = page.read(2048)
    finally:
        # Close explicitly so the connection is released even if the read
        # fails.
        page.close()

    detector = UniversalDetector()
    detector.feed(raw)
    detector.close()
    encoding = detector.result['encoding']
    assert encoding is not None, 'could not detect the page encoding'

    text = unicode(raw, encoding)
    authors, title = Retriever.get_authors_and_title(text)

    assert len(authors) == 1
    assert authors[0] == u'Дмитрий Толмацкий'
    assert title == u'Российская наука на пути из реанимации в морг'
def detectFileEncode(self, filePath):
    """Guess the character encoding of the file at ``filePath``.

    The file is fed to a ``UniversalDetector`` line by line, stopping as
    soon as the detector reports that it is done. The full detection
    result is printed before returning.

    Args:
        filePath: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the detector's result.
    """
    detector = UniversalDetector()
    # Binary mode: the detector must see the raw bytes, without newline
    # translation or text decoding applied by the file object.
    with open(filePath, 'rb') as fp:
        # Iterate lazily so the early exit below actually avoids reading
        # the rest of the file.
        for line in fp:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    # Single-argument form prints the same text under Python 2 and 3.
    print(detector.result)
    return detector.result['encoding']