def test_html_to_unicode(self):
    """html_to_unicode returns ('utf8', decoded body) for a CJK document.

    The header passed in carries the charset value "zh_cn"; the assertion
    pins the reported encoding to 'utf8' and the body to its unicode form.
    """
    content_type_header = 'charset=("zh_cn")'
    raw_body = '<html><h1>漢字汉字</h1></html>'
    expected = ('utf8', u'<html><h1>\u6f22\u5b57\u6c49\u5b57</h1></html>')

    result = html_to_unicode(content_type_header, raw_body)
    result.should.eq(expected)
def handle(job, *args, **kwargs):
    """Fetch the URL named in a JSON job and try to decode the body to unicode.

    Args:
        job: JSON text; the decoded object must have a "url" key.
        *args: Ignored.
        **kwargs: Must contain "queue"; a missing key raises KeyError.

    Returns:
        None.
        NOTE(review): nothing visible here consumes ``queue`` or the decoded
        ``source`` -- confirm whether the function was meant to continue.
    """
    # Looked up eagerly so a job dispatched without a queue fails fast.
    queue = kwargs['queue']
    task = json.loads(job)
    url = task["url"]
    status, source = fetcher.fetch(url, use_proxy=False)
    logger.info('%s|%s', url, status)
    try:
        _, source = encoding.html_to_unicode('', source)
    except Exception:
        # Best effort: decoding problems must not kill the worker. Log with
        # the traceback instead of printing the bare exception to stdout;
        # ``source`` keeps the undecoded value returned by the fetcher.
        logger.exception('html_to_unicode failed for %s', url)
def handle(job, *args, **kwargs):
    """Fetch the URL named in a JSON job and return a result tuple for it.

    Args:
        job: JSON text; the decoded object must have a "url" key.
        *args: Only logged.
        **kwargs: Only logged.

    Returns:
        A tuple ``(url, urlhash, status, domain, content)``:
        * ``urlhash`` is ``cityhash.CityHash64`` of the UTF-8 encoded URL, or
          ``None`` when encoding/hashing failed;
        * ``content`` is passed through ``encoding.html_to_unicode`` only
          when ``magic`` reports the body as ``text/html``; otherwise it is
          returned exactly as the fetcher produced it.
    """
    logger.debug('handle %s %s', args, kwargs)
    task = json.loads(job)
    url = task["url"]
    domain = tldextracter.extract_domain(url)
    status, content = fetch(url, use_proxy=False)

    try:
        url = url.encode('utf8')
        urlhash = cityhash.CityHash64(url)
    except Exception:
        # Was a bare ``except:``, which also trapped KeyboardInterrupt and
        # SystemExit and discarded the cause. Still best effort: report the
        # fetch result without a hash.
        logger.exception('could not hash url %r', url)
        return (url, None, status, domain, content)

    logger.info('%s|%s', url, status)

    # Non-HTML payloads are handed back untouched.
    if magic.from_buffer(content, mime=True) != 'text/html':
        return (url, urlhash, status, domain, content)

    _, content = encoding.html_to_unicode('', content)
    if status != 200:
        # Non-200 HTML responses are additionally pushed to ``db``; the
        # returned tuple is the same as for a 200 response.
        db.push(url, detail=False)
    return (url, urlhash, status, domain, content)
def format(self, note):
    """Return the note's content as unicode with ``encoding_match`` hits removed.

    ``note.content`` is decoded through ``encoding.html_to_unicode`` (called
    with an empty first argument) and every match of the module-level
    ``encoding_match`` pattern is replaced by the empty string. A ``None``
    note yields ``''``.
    """
    if note is None:
        return ''
    _, decoded = encoding.html_to_unicode('', note.content)
    return encoding_match.sub('', decoded)
'nid': nid, 'pid': pid, 'cover': cover, 'playlistId': playlistId, 'o_playlistId': o_playlistId, 'cid': cid, 'subcid': subcid, 'osubcid': osubcid, 'category': category, 'cateCode': cateCode, 'pianhua': pianhua, 'tag': tag, 'tvid': tvid, 'title': title, 'last': last, 'brief': brief } return item if __name__ == '__main__': import fetcher url = 'http://tv.sohu.com' url = 'http://tv.sohu.com/20131223/n392267093.shtml' url = 'http://tv.sohu.com/20131223/n392267093.shtml' status, content = fetcher.fetch(url) _, ucontent = encoding.html_to_unicode('', content) #print extract_links(url, ucontent) #print extract_content(url, ucontent) #print extract_sohutv(url, ucontent) print extract_sohutv_data_by_regex(url, ucontent)