def unicode(s):
    """Return the first candidate encoding under which ``s`` parses via ``html2doc``.

    Each candidate is tried in a fixed order: ``s`` is decoded with it
    (undecodable bytes are ignored), re-encoded as UTF-8 and handed to
    ``html2doc``. The first candidate for which ``html2doc`` does not raise
    is returned.

    Note: this function's name shadows the Python 2 builtin ``unicode``.

    Args:
        s: object exposing ``decode(encoding, errors)``.

    Returns:
        The name of the first accepted encoding, or ``None`` when every
        candidate is rejected.
    """
    for candidate in ("utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii"):
        try:
            print(s)
            # NOTE(review): with 'ignore' a byte-string decode should not raise on
            # bad bytes, so the handler below is probably rarely reached -- confirm
            # that the lenient decode is intentional.
            as_utf8 = s.decode(candidate, 'ignore').encode('utf-8')
        except UnicodeDecodeError as err:
            print(str(err))
            continue

        try:
            html2doc(as_utf8)
        except Exception as err:
            # Any parser failure means "try the next encoding".
            print(err.__class__.__name__)
            continue

        return candidate

    return None
def unicode(s):
    """Return the first candidate encoding under which ``s`` parses via ``html2doc``.

    Each candidate is tried in a fixed order: ``s`` is decoded with it
    (undecodable bytes are ignored), re-encoded as UTF-8 and handed to
    ``html2doc``. The first candidate for which ``html2doc`` does not raise
    is returned.

    Note: this function's name shadows the Python 2 builtin ``unicode``.

    Args:
        s: object exposing ``decode(encoding, errors)``.

    Returns:
        The name of the first accepted encoding, or ``None`` when every
        candidate is rejected.
    """
    for candidate in ("utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii"):
        try:
            print(s)
            # NOTE(review): with 'ignore' a byte-string decode should not raise on
            # bad bytes, so the handler below is probably rarely reached -- confirm
            # that the lenient decode is intentional.
            as_utf8 = s.decode(candidate, 'ignore').encode('utf-8')
        except UnicodeDecodeError as err:
            print(str(err))
            continue

        try:
            html2doc(as_utf8)
        except Exception as err:
            # Any parser failure means "try the next encoding".
            print(err.__class__.__name__)
            continue

        return candidate

    return None
def web_content(row, article, words):
    """Return ``row['content']`` after ``replace2tag`` has run for selected words.

    Candidate words come from ``words['words']['all']`` (word -> count). They
    are ranked by count, highest first. Words whose score is not positive, or
    that already appear in ``article['tags']``, are dropped. Of the remainder,
    the leading entries are kept, capped at 25% of the total candidate count.

    Args:
        row: mapping whose ``'content'`` entry is the HTML to process.
        article: mapping whose ``'tags'`` entry holds the words to skip.
        words: mapping with a ``['words']['all']`` dict of word -> count.

    Returns:
        The result of ``child2html`` on the parsed content after
        ``replace2tag`` was applied, or ``row['content']`` unchanged when no
        word is selected.
    """
    def score(word, num):
        """Weight ``num`` by the value stored for ``word`` in ``redis_tag``."""
        word = word.lower()
        # The redis hash name is the first two hex digits of the word's MD5.
        bucket = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
        # presumably a document frequency -- confirm against the writer side
        df = redis_tag.hget(bucket, word)
        df = max(int(df), 0) if df is not None else 0
        if df < 10:
            # Missing or rarely seen words never qualify.
            return 0
        return num * max(math.log(df + 1, 50), 1.5)

    counts = words['words']['all']
    limit = int(len(counts) * 0.25)
    if limit <= 0:
        # With fewer than four candidates the 25% cap keeps nothing, so
        # there is no point in querying redis at all.
        return row['content']

    ranked = sorted(counts.items(), key=lambda item: -item[1])
    # Keep an ordered list: going through a dict here would discard the
    # ranking, and the cap below would then keep an arbitrary subset.
    index = [word for word, num in ranked
             if score(word, num) > 0 and word not in article['tags']]
    index = index[:limit]
    if not index:
        return row['content']

    doc = html2doc(row['content'])
    replace2tag(doc, index)
    return child2html(doc)
def parse(self):
    """Collect the distinct, non-blank ``src`` values found in the article content.

    ``self.article.content`` is parsed with ``html2doc`` and every element
    yielded by ``doc.iter('img')`` is inspected.

    Returns:
        dict: ``{'urls': [...]}`` holding the whitespace-stripped ``src``
        values with duplicates removed (order is that of the intermediate set).
    """
    raw_sources = (node.get('src')
                   for node in html2doc(self.article.content).iter('img'))
    # Skip missing/empty attributes first, then drop whitespace-only values.
    trimmed = (value.strip() for value in raw_sources if value)
    unique = set(text for text in trimmed if text)
    return {'urls': list(unique)}
def _parse_imgs(self):
    """Collect the distinct, non-blank ``src`` values found in ``self.content``.

    ``self.content`` is parsed with ``html2doc`` and every element yielded by
    ``doc.iter('img')`` is inspected.

    Returns:
        list: whitespace-stripped ``src`` values with duplicates removed
        (order is that of the intermediate set).
    """
    raw_sources = (node.get('src')
                   for node in html2doc(self.content).iter('img'))
    # Skip missing/empty attributes first, then drop whitespace-only values.
    trimmed = (value.strip() for value in raw_sources if value)
    return list(set(text for text in trimmed if text))
def _parse_imgs(self):
    """Collect the distinct, non-blank ``src`` values found in ``self.content``.

    ``self.content`` is parsed with ``html2doc`` and every element yielded by
    ``doc.iter('img')`` is inspected.

    Returns:
        list: whitespace-stripped ``src`` values with duplicates removed
        (order is that of the intermediate set).
    """
    raw_sources = (node.get('src')
                   for node in html2doc(self.content).iter('img'))
    # Skip missing/empty attributes first, then drop whitespace-only values.
    trimmed = (value.strip() for value in raw_sources if value)
    return list(set(text for text in trimmed if text))
def handle(self, index, task):
    """Fetch one task's page, clean it and hand it to the matching domain parser.

    The incoming ``index`` argument is not used: the kind and the task record
    are both taken from ``task`` (``task[0]`` and ``task[1]``).

    Args:
        index: ignored; overridden by ``task[0]``.
        task: two-item sequence ``(kind, record)`` where ``record`` has an
            ``'_id'`` entry (the URL) and, for parsed kinds, a ``'domain'``
            entry used as key into ``self.domains``.

    A ``KeyboardInterrupt`` raised anywhere in the process calls ``self.exit()``.
    """
    try:
        kind, record = task[0], task[1]
        page_url = record['_id']
        raw = get(u(page_url))
        cleaned = clean_doc(html2doc(raw, url=page_url), return_html=True)

        # 'album' (and any unknown kind) is fetched and cleaned but not parsed.
        if kind == 'cate' or kind == 'page':
            parser = self.domains[record['domain']]
            if kind == 'cate':
                parser.parse_cate(record, {'html': cleaned})
            else:
                parser.parse_album(record, {'html': cleaned})
    except KeyboardInterrupt:
        self.exit()
def handle(self, index, task):
    """Fetch one task's page, clean it and hand it to the matching domain parser.

    The incoming ``index`` argument is not used: the kind and the task record
    are both taken from ``task`` (``task[0]`` and ``task[1]``).

    Args:
        index: ignored; overridden by ``task[0]``.
        task: two-item sequence ``(kind, record)`` where ``record`` has an
            ``'_id'`` entry (the URL) and, for parsed kinds, a ``'domain'``
            entry used as key into ``self.domains``.

    A ``KeyboardInterrupt`` raised anywhere in the process calls ``self.exit()``.
    """
    try:
        kind, record = task[0], task[1]
        page_url = record['_id']
        raw = get(u(page_url))
        cleaned = clean_doc(html2doc(raw, url=page_url), return_html=True)

        # 'album' (and any unknown kind) is fetched and cleaned but not parsed.
        if kind == 'cate' or kind == 'page':
            parser = self.domains[record['domain']]
            if kind == 'cate':
                parser.parse_cate(record, {'html':cleaned})
            else:
                parser.parse_album(record, {'html':cleaned})
    except KeyboardInterrupt:
        self.exit()
def content4imgs(content, title, imgs):
    """Drop unwanted images from ``content`` and relink the ones that are kept.

    Only images whose ``src`` is a key of ``imgs`` are touched. An unwanted
    image is removed (together with its parent when that parent has class
    ``article-img``) and its ``md5`` is reported; any other known image gets
    its ``src`` replaced by ``img2link(path)``.

    Args:
        content: HTML passed to ``html2doc``.
        title: unused.
        imgs: mapping of ``src`` -> dict with ``'count'``, ``'width'``,
            ``'height'``, ``'md5'`` and ``'path'`` entries.

    Returns:
        tuple: ``(bads, html)`` where ``bads`` lists the ``md5`` of every
        removed image and ``html`` is ``child2html`` of the edited document.
    """
    def unwanted(meta):
        """Apply the count/size thresholds; the check order matters for short-circuiting."""
        if meta['count'] >= 10:
            return True
        if meta['count'] >= 5 and (meta['width'] < 240 or meta['height'] < 160):
            return True
        if meta['count'] >= 2 and (meta['width'] < 200 or meta['height'] < 120):
            return True
        if meta['width'] < 160 or meta['height'] < 40:
            return True
        # Narrower than 360 and width/height within 10 of each other.
        return meta['width'] < 360 and -10 < meta['width'] - meta['height'] < 10

    doc = html2doc(content)
    bads = []
    # NOTE(review): elements are dropped while doc.iter() is still running;
    # whether later images are still visited depends on the parser's
    # iterator -- confirm.
    for node in doc.iter('img'):
        src = node.get('src')
        if src not in imgs:
            continue

        meta = imgs[src]
        if not unwanted(meta):
            node.attrib['src'] = img2link(meta['path'])
            continue

        wrapper = node.getparent()
        if wrapper is not None and wrapper.get('class') == 'article-img':
            wrapper.drop_tree()
        else:
            node.drop_tree()
        bads.append(meta['md5'])

    return bads, child2html(doc)
def content4imgs(content, title, imgs):
    """Drop unwanted images from ``content`` and relink the ones that are kept.

    Only images whose ``src`` is a key of ``imgs`` are touched. An unwanted
    image is removed (together with its parent when that parent has class
    ``article-img``) and its ``md5`` is reported; any other known image gets
    its ``src`` replaced by ``img2link(path)``.

    Args:
        content: HTML passed to ``html2doc``.
        title: unused.
        imgs: mapping of ``src`` -> dict with ``'count'``, ``'width'``,
            ``'height'``, ``'md5'`` and ``'path'`` entries.

    Returns:
        tuple: ``(bads, html)`` where ``bads`` lists the ``md5`` of every
        removed image and ``html`` is ``child2html`` of the edited document.
    """
    def unwanted(meta):
        """Apply the count/size thresholds; the check order matters for short-circuiting."""
        if meta['count'] >= 10:
            return True
        if meta['count'] >= 5 and (meta['width'] < 240 or meta['height'] < 160):
            return True
        if meta['count'] >= 2 and (meta['width'] < 200 or meta['height'] < 120):
            return True
        if meta['width'] < 160 or meta['height'] < 40:
            return True
        # Narrower than 360 and width/height within 10 of each other.
        return meta['width'] < 360 and -10 < meta['width'] - meta['height'] < 10

    doc = html2doc(content)
    bads = []
    # NOTE(review): elements are dropped while doc.iter() is still running;
    # whether later images are still visited depends on the parser's
    # iterator -- confirm.
    for node in doc.iter('img'):
        src = node.get('src')
        if src not in imgs:
            continue

        meta = imgs[src]
        if not unwanted(meta):
            node.attrib['src'] = img2link(meta['path'])
            continue

        wrapper = node.getparent()
        if wrapper is not None and wrapper.get('class') == 'article-img':
            wrapper.drop_tree()
        else:
            node.drop_tree()
        bads.append(meta['md5'])

    return bads, child2html(doc)
def web_content(xcontent, article, words):
    """Return ``xcontent`` after ``replace2tag`` has run for selected words.

    Candidate words come from ``words['words']['all']`` (word -> count). They
    are ranked by count, highest first. Words whose score is not positive, or
    that already appear in ``article['tags']``, are dropped. Of the remainder,
    the leading entries are kept, capped at 25% of the total candidate count.

    Args:
        xcontent: the HTML to process.
        article: mapping whose ``'tags'`` entry holds the words to skip.
        words: mapping with a ``['words']['all']`` dict of word -> count.

    Returns:
        The result of ``child2html`` on the parsed content after
        ``replace2tag`` was applied, or ``xcontent`` unchanged when no word
        is selected.
    """
    def score(word, num):
        """Weight ``num`` by the value stored for ``word`` in ``redis_tag``."""
        word = word.lower()
        # The redis hash name is the first two hex digits of the word's MD5.
        bucket = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
        # presumably a document frequency -- confirm against the writer side
        df = redis_tag.hget(bucket, word)
        df = max(int(df), 0) if df is not None else 0
        if df < 10:
            # Missing or rarely seen words never qualify.
            return 0
        return num * max(math.log(df + 1, 50), 1.5)

    counts = words['words']['all']
    limit = int(len(counts) * 0.25)
    if limit <= 0:
        # With fewer than four candidates the 25% cap keeps nothing, so
        # there is no point in querying redis at all.
        return xcontent

    ranked = sorted(counts.items(), key=lambda item: -item[1])
    # Keep an ordered list: going through a dict here would discard the
    # ranking, and the cap below would then keep an arbitrary subset.
    index = [word for word, num in ranked
             if score(word, num) > 0 and word not in article['tags']]
    index = index[:limit]
    if not index:
        return xcontent

    doc = html2doc(xcontent)
    replace2tag(doc, index)
    return child2html(doc)