Beispiel #1
0
def unicode(s):
    """Guess which candidate encoding yields markup that ``html2doc`` accepts.

    Each candidate is tried in order: ``s`` is decoded with it (undecodable
    bytes are dropped because of ``'ignore'``), re-encoded as UTF-8 and
    handed to ``html2doc``.  The name of the first candidate for which
    ``html2doc`` does not raise is returned.  If every candidate fails, the
    function falls off the end and returns ``None``.

    Note: the function name shadows the Python 2 builtin ``unicode``.
    """
    candidates = ("utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii")
    for codec in candidates:
        try:
            # Debug output: the raw input is echoed once per attempt.
            print(s)
            reencoded = s.decode(codec, 'ignore').encode('utf-8')
        except UnicodeDecodeError as err:
            # NOTE(review): with errors='ignore' the decode step is not
            # expected to raise UnicodeDecodeError, so this branch looks
            # unreachable in practice -- confirm the intended error mode.
            print(str(err))
            continue
        try:
            html2doc(reencoded)
        except Exception as err:
            # Any parser failure just disqualifies this candidate.
            print(err.__class__.__name__)
            continue
        return codec
Beispiel #2
0
def unicode(s):
    """Return the first candidate encoding whose output ``html2doc`` parses.

    For every encoding name in a fixed list, ``s`` is decoded (dropping
    undecodable bytes via ``'ignore'``), re-encoded to UTF-8 and passed to
    ``html2doc``.  The first name for which ``html2doc`` raises nothing is
    returned; ``None`` is returned implicitly when all candidates fail.

    Note: the function name shadows the Python 2 builtin ``unicode``.
    """
    for name in ("utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii"):
        try:
            # Debug output: the raw input is printed on every attempt.
            print(s)
            payload = s.decode(name, 'ignore').encode('utf-8')
            try:
                html2doc(payload)
            except Exception as exc:
                # The parser rejected this candidate; move on to the next.
                print(exc.__class__.__name__)
            else:
                return name
        except UnicodeDecodeError as exc:
            # NOTE(review): decoding with 'ignore' is not expected to raise
            # UnicodeDecodeError, so this handler is probably dead code --
            # confirm whether strict decoding was intended.
            print(str(exc))
Beispiel #3
0
def web_content(row, article, words):
    """Return ``row['content']`` with its top keywords run through ``replace2tag``.

    Args:
        row: mapping; only ``row['content']`` (the HTML text) is read.
        article: mapping; ``article['tags']`` lists words that must not be
            selected.
        words: mapping; ``words['words']['all']`` maps each word to a
            numeric count.

    Returns:
        The output of ``child2html`` after ``replace2tag`` processed the
        parsed content, or the untouched ``row['content']`` when no word
        qualifies.

    Words are ranked by descending count, scored, filtered, and then cut
    to at most 25% of the vocabulary size.  The ranking order is kept all
    the way to the cut so the slice really takes the highest-count words.
    The previous ``dict(...).keys()`` round-trip discarded that order on
    Python 2 and is not sliceable on Python 3.
    """
    def score(word, num):
        # Document frequency is stored in Redis hashes bucketed by the
        # first two hex digits of the word's MD5.
        word = word.lower()
        bucket = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
        df = redis_tag.hget(bucket, word)
        df = max(int(df), 0) if df is not None else 0
        if df < 10:
            # Words with a document frequency below 10 never qualify.
            return 0
        return num * max(math.log(df + 1, 50), 1.5)

    counts = words['words']['all']
    ranked = sorted(counts.items(), key=lambda item: -item[1])

    # Keep only scored words that are not already tags, in ranked order.
    index = [word for word, num in ranked
             if score(word, num) > 0 and word not in article['tags']]

    index = index[:int(len(counts) * 0.25)]

    if index:
        doc = html2doc(row['content'])
        replace2tag(doc, index)
        return child2html(doc)
    return row['content']
Beispiel #4
0
	def parse(self):
		"""Collect the distinct image sources found in the article content.

		``self.article.content`` is parsed with ``html2doc`` and every
		``img`` element is inspected.  Missing or blank ``src`` values are
		skipped; the others are stripped of surrounding whitespace.

		Returns a dict whose single key ``'urls'`` holds the de-duplicated
		sources as a list.
		"""
		tree = html2doc(self.article.content)
		raw_sources = (node.get('src') for node in tree.iter('img'))
		unique = {raw.strip() for raw in raw_sources if raw and raw.strip()}
		return {'urls': list(unique)}
Beispiel #5
0
	def _parse_imgs(self):
		"""Return the distinct, stripped ``src`` values of all ``img`` elements.

		``self.content`` is parsed with ``html2doc``.  Images whose ``src``
		is missing or contains only whitespace are ignored.  The result is
		a list built from a set, so duplicates are removed.
		"""
		tree = html2doc(self.content)
		raw_sources = (node.get('src') for node in tree.iter('img'))
		unique = {raw.strip() for raw in raw_sources if raw and raw.strip()}
		return list(unique)
Beispiel #6
0
 def _parse_imgs(self):
     """List the unique image sources referenced by ``self.content``.

     The content is parsed with ``html2doc``; each ``img`` element's
     ``src`` is stripped of surrounding whitespace and kept only when
     something remains.  Duplicates collapse because a set is used.
     """
     unique = set()
     for node in html2doc(self.content).iter('img'):
         # A missing attribute (None) is treated like an empty string.
         cleaned = (node.get('src') or '').strip()
         if cleaned:
             unique.add(cleaned)
     return list(unique)
Beispiel #7
0
 def handle(self, index, task):
     """Fetch a task's page and pass the cleaned HTML to its domain parser.

     ``task`` is read as a pair: ``task[0]`` is the kind of job and
     ``task[1]`` is the job mapping (``'_id'`` holds the URL, ``'domain'``
     selects the entry in ``self.domains``).  The ``index`` argument is
     overwritten by ``task[0]`` and is therefore ignored.

     Kind ``'cate'`` calls ``parse_cate`` and kind ``'page'`` calls
     ``parse_album``.  ``'album'`` and every other kind do nothing after
     the download.  A ``KeyboardInterrupt`` anywhere in the process
     triggers ``self.exit()``.
     """
     try:
         kind, job = task[0], task[1]
         address = job['_id']
         markup = clean_doc(html2doc(get(u(address)), url=address),
                            return_html=True)
         if kind == 'cate':
             parser = self.domains[job['domain']].parse_cate
         elif kind == 'page':
             parser = self.domains[job['domain']].parse_album
         else:
             # 'album' (and unknown kinds) are fetched but not parsed.
             return
         parser(job, {'html': markup})
     except KeyboardInterrupt:
         self.exit()
Beispiel #8
0
	def handle(self, index, task):
		"""Download the page for a task and dispatch it to the domain parser.

		``task`` is unpacked as ``(kind, job)``; the ``index`` argument is
		replaced by ``task[0]`` and so has no effect.  ``job['_id']`` is the
		URL to fetch and ``job['domain']`` picks the handler object from
		``self.domains``.

		``'cate'`` jobs go to ``parse_cate`` and ``'page'`` jobs go to
		``parse_album``.  ``'album'`` jobs and unknown kinds are downloaded
		but not parsed.  ``KeyboardInterrupt`` leads to ``self.exit()``.
		"""
		try:
			kind, job = task[0], task[1]
			address = job['_id']
			fetched = get(u(address))
			markup = clean_doc(html2doc(fetched, url=address), return_html=True)
			if kind == 'cate':
				parser = self.domains[job['domain']].parse_cate
			elif kind == 'page':
				parser = self.domains[job['domain']].parse_album
			else:
				# Nothing to do for 'album' or any other kind.
				return
			parser(job, {'html': markup})
		except KeyboardInterrupt:
			self.exit()
Beispiel #9
0
def content4imgs(content, title, imgs):
	"""Remove unwanted images from ``content`` and relink the remaining ones.

	Args:
		content: HTML text, parsed with ``html2doc``.
		title: accepted but not used by this function.
		imgs: mapping from an image ``src`` to a record with the keys
			``'count'``, ``'width'``, ``'height'``, ``'md5'`` and ``'path'``.

	Returns:
		A tuple ``(bads, html)``: ``bads`` lists the ``'md5'`` of every
		removed image and ``html`` is ``child2html`` of the edited tree.

	Images whose ``src`` is not a key of ``imgs`` are left untouched.
	"""
	tree = html2doc(content)
	rejected = []
	for node in tree.iter('img'):
		src = node.get('src')
		if src not in imgs:
			continue
		meta = imgs[src]
		# Fields are read lazily inside the expression, in the original
		# order, so short-circuiting behaves exactly as before.
		unwanted = (
			meta['count'] >= 10
			or (meta['count'] >= 5
				and (meta['width'] < 240 or meta['height'] < 160))
			or (meta['count'] >= 2
				and (meta['width'] < 200 or meta['height'] < 120))
			or meta['width'] < 160
			or meta['height'] < 40
			or (meta['width'] < 360
				and -10 < meta['width'] - meta['height'] < 10)
		)
		if not unwanted:
			node.attrib['src'] = img2link(meta['path'])
			continue
		wrapper = node.getparent()
		# Drop the dedicated wrapper element together with the image.
		if wrapper is not None and wrapper.get('class') == 'article-img':
			wrapper.drop_tree()
		else:
			node.drop_tree()
		rejected.append(meta['md5'])
	return rejected, child2html(tree)
Beispiel #10
0
def content4imgs(content, title, imgs):
    """Filter the images of ``content`` and rewrite the ``src`` of the keepers.

    Args:
        content: HTML text, parsed with ``html2doc``.
        title: accepted but not used by this function.
        imgs: mapping from an image ``src`` to a record with the keys
            ``'count'``, ``'width'``, ``'height'``, ``'md5'`` and ``'path'``.

    Returns:
        ``(bads, html)`` where ``bads`` is the list of ``'md5'`` values of
        the images that were dropped and ``html`` is the result of
        ``child2html`` on the modified tree.

    Images whose ``src`` is not present in ``imgs`` are ignored.
    """
    tree = html2doc(content)
    dropped = []
    for node in tree.iter('img'):
        src = node.get('src')
        if src not in imgs:
            continue
        info = imgs[src]
        # Record fields are accessed lazily and in the original order so
        # the short-circuit evaluation is unchanged.
        reject = (
            info['count'] >= 10
            or (info['count'] >= 5
                and (info['width'] < 240 or info['height'] < 160))
            or (info['count'] >= 2
                and (info['width'] < 200 or info['height'] < 120))
            or info['width'] < 160
            or info['height'] < 40
            or (info['width'] < 360
                and -10 < info['width'] - info['height'] < 10)
        )
        if not reject:
            node.attrib['src'] = img2link(info['path'])
            continue
        holder = node.getparent()
        # When the image sits in its dedicated wrapper, remove the wrapper.
        if holder is not None and holder.get('class') == 'article-img':
            holder.drop_tree()
        else:
            node.drop_tree()
        dropped.append(info['md5'])
    return dropped, child2html(tree)
Beispiel #11
0
def web_content(xcontent, article, words):
	"""Return ``xcontent`` with its top keywords run through ``replace2tag``.

	Args:
		xcontent: HTML text to process.
		article: mapping; ``article['tags']`` lists words that must not be
			selected.
		words: mapping; ``words['words']['all']`` maps each word to a
			numeric count.

	Returns:
		The output of ``child2html`` after ``replace2tag`` processed the
		parsed content, or ``xcontent`` unchanged when no word qualifies.

	Words are ranked by descending count, scored, filtered, and then cut
	to at most 25% of the vocabulary size.  The ranking order is kept all
	the way to the cut so the slice really takes the highest-count words.
	The previous ``dict(...).keys()`` round-trip discarded that order on
	Python 2 and is not sliceable on Python 3.
	"""
	def score(word, num):
		# Document frequency is stored in Redis hashes bucketed by the
		# first two hex digits of the word's MD5.
		word = word.lower()
		bucket = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
		df = redis_tag.hget(bucket, word)
		df = max(int(df), 0) if df is not None else 0
		if df < 10:
			# Words with a document frequency below 10 never qualify.
			return 0
		return num * max(math.log(df + 1, 50), 1.5)

	counts = words['words']['all']
	ranked = sorted(counts.items(), key=lambda item: -item[1])

	# Keep only scored words that are not already tags, in ranked order.
	index = [word for word, num in ranked
			if score(word, num) > 0 and word not in article['tags']]

	index = index[:int(len(counts) * 0.25)]

	if index:
		doc = html2doc(xcontent)
		replace2tag(doc, index)
		return child2html(doc)
	return xcontent