Python html2doc Examples, utils.html2doc Python Examples

Example #1

0

Show file

File: test.py Project: dotajin/haoku-open

def unicode(s):
    """Probe a byte string against a fixed list of candidate encodings.

    For each candidate, ``s`` is decoded with that encoding (undecodable
    bytes are dropped), re-encoded as UTF-8 and handed to ``html2doc``.

    Returns the name of the first encoding for which ``html2doc`` does not
    raise, or ``None`` when every candidate fails.
    """
    # NOTE(review): this name shadows the Python 2 builtin ``unicode``; it is
    # kept as-is because callers refer to the function by this name.
    encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii"]
    for enc in encodings:
        # Debug output: the raw input is echoed once per candidate encoding.
        print(s)
        try:
            # NOTE(review): with errors='ignore' bad bytes are silently
            # dropped, so this step presumably rarely fails and html2doc is
            # the effective probe -- confirm that is the intent.
            utf8 = s.decode(enc, 'ignore').encode('utf-8')
        except UnicodeDecodeError as e:
            print(str(e))
            continue
        try:
            html2doc(utf8)
        except Exception as e:
            # Only the exception type is reported; the next encoding is tried.
            print(e.__class__.__name__)
            continue
        return enc
    return None

Example #2

0

Show file

def unicode(s):
    """Probe a byte string against a fixed list of candidate encodings.

    For each candidate, ``s`` is decoded with that encoding (undecodable
    bytes are dropped), re-encoded as UTF-8 and handed to ``html2doc``.

    Returns the name of the first encoding for which ``html2doc`` does not
    raise, or ``None`` when every candidate fails.
    """
    # NOTE(review): this name shadows the Python 2 builtin ``unicode``; it is
    # kept as-is because callers refer to the function by this name.
    encodings = ["utf-8", "gb18030", "gb2312", "gbk", "big5", "ascii"]
    for enc in encodings:
        # Debug output: the raw input is echoed once per candidate encoding.
        print(s)
        try:
            # NOTE(review): with errors='ignore' bad bytes are silently
            # dropped, so this step presumably rarely fails and html2doc is
            # the effective probe -- confirm that is the intent.
            utf8 = s.decode(enc, 'ignore').encode('utf-8')
        except UnicodeDecodeError as e:
            print(str(e))
            continue
        try:
            html2doc(utf8)
        except Exception as e:
            # Only the exception type is reported; the next encoding is tried.
            print(e.__class__.__name__)
            continue
        return enc
    return None

Example #3

0

Show file

File: spider_20141209.py Project: dotajin/haoku-open

def web_content(row, article, words):
    """Return ``row['content']`` with selected keywords replaced by tags.

    Words are taken from ``words['words']['all']`` (a mapping of word to
    count), ranked by descending count, and kept only when their weighted
    score is positive and they are not already in ``article['tags']``.  At
    most 25% of the total number of words are kept.  When any words remain,
    the content is parsed with ``html2doc``, passed through ``replace2tag``
    and serialised with ``child2html``; otherwise the content is returned
    unchanged.
    """
    def score(word, num):
        """Weight the count ``num`` of ``word`` by its value stored in redis.

        Returns 0 when the stored value is missing or below 10.
        """
        word = word.lower()
        # The redis hash is selected by the first two hex digits of the
        # word's MD5; the word itself is the field inside that hash.
        md = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
        df = redis_tag.hget(md, word)
        df = max(int(df), 0) if df is not None else 0
        if df < 10:
            return 0
        return num * max(math.log(df + 1, 50), 1.5)

    counts = words['words']['all']
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    scored = [(word, score(word, num)) for word, num in ranked]

    # Keep the ranking as a list: round-tripping through a dict would throw
    # the sort order away, so the truncation below would keep arbitrary
    # words instead of the highest-count ones.
    index = [word for word, value in scored
             if value > 0 and word not in article['tags']]
    index = index[:int(len(counts) * 0.25)]

    if not index:
        return row['content']

    doc = html2doc(row['content'])
    replace2tag(doc, index)
    return child2html(doc)

Example #4

0

Show file

	def parse(self):
		"""Collect the distinct image URLs found in the article content.

		Parses ``self.article.content`` with ``html2doc`` and returns a dict
		whose ``'urls'`` entry lists every non-blank, whitespace-stripped
		``src`` attribute of the ``img`` elements (duplicates removed).
		"""
		doc = html2doc(self.article.content)
		raw_sources = (node.get('src') for node in doc.iter('img'))
		stripped = (value.strip() for value in raw_sources if value)
		return {'urls': list(set(url for url in stripped if url))}

Example #5

0

Show file

File: article.py Project: dotajin/haoku-open

	def _parse_imgs(self):
		"""Return the distinct image URLs referenced by ``self.content``.

		Every ``img`` element of the parsed content contributes its
		whitespace-stripped ``src`` attribute; missing or blank values are
		skipped and duplicates are removed.
		"""
		found = set()
		for node in html2doc(self.content).iter('img'):
			url = (node.get('src') or '').strip()
			if url:
				found.add(url)
		return list(found)

Example #6

0

Show file

File: article (copy).py Project: dotajin/haoku-open

 def _parse_imgs(self):
     """List the unique, stripped ``src`` values of all images in the content.

     ``self.content`` is parsed with ``html2doc``; ``img`` elements whose
     ``src`` is missing or blank are ignored.
     """
     tree = html2doc(self.content)
     sources = (element.get('src') for element in tree.iter('img'))
     cleaned = (raw.strip() for raw in sources if raw)
     return list(set(link for link in cleaned if link))

Example #7

0

Show file

 def handle(self, index, task):
     """Fetch the page for one task and pass its cleaned HTML to a parser.

     The incoming ``index`` argument is not used: the task kind is read from
     ``task[0]`` and the task record from ``task[1]``.  The record's ``_id``
     is the URL to fetch, and its ``domain`` selects the handler in
     ``self.domains``.  A ``KeyboardInterrupt`` raised anywhere in the
     process triggers ``self.exit()``.
     """
     try:
         kind, job = task[0], task[1]
         address = job['_id']
         page = get(u(address))
         html = clean_doc(html2doc(page, url=address), return_html=True)
         if kind == 'cate':
             self.domains[job['domain']].parse_cate(job, {'html': html})
         elif kind == 'page':
             self.domains[job['domain']].parse_album(job, {'html': html})
         # Any other kind (including 'album') is fetched and cleaned but
         # deliberately not handed to a parser.
     except KeyboardInterrupt:
         self.exit()

Example #8

0

Show file

File: master.py Project: dotajin/haoku-open

	def handle(self, index, task):
		"""Download a task's URL, clean the HTML and dispatch it by task kind.

		``task`` is a pair: its first item is the kind (which replaces the
		``index`` argument) and its second item is the record holding the
		URL under ``_id`` and the handler key under ``domain``.  ``'cate'``
		records go to ``parse_cate`` and ``'page'`` records go to
		``parse_album``; every other kind, ``'album'`` included, is a no-op
		after the download.  ``KeyboardInterrupt`` leads to ``self.exit()``.
		"""
		try:
			kind = task[0]
			record = task[1]
			target = record['_id']
			html = clean_doc(html2doc(get(u(target)), url=target), return_html=True)
			if kind in ('cate', 'page'):
				handler = self.domains[record['domain']]
				payload = {'html': html}
				if kind == 'cate':
					handler.parse_cate(record, payload)
				else:
					handler.parse_album(record, payload)
		except KeyboardInterrupt:
			self.exit()

Example #9

0

Show file

File: image.py Project: dotajin/haoku-open

def content4imgs(content, title, imgs):
	"""Drop unwanted images from ``content`` and relink the remaining ones.

	``imgs`` maps an image ``src`` to a record with ``count``, ``width``,
	``height``, ``md5`` and ``path`` entries.  Images whose ``src`` is not in
	``imgs`` are left untouched.  ``title`` is accepted but not used.

	Returns a tuple ``(bads, html)``: the ``md5`` values of the removed
	images and the result of ``child2html`` on the modified document.
	"""
	def is_unwanted(info):
		"""Apply the count/size thresholds, in the same order they short-circuit."""
		count = info['count']
		if count >= 10:
			return True
		if count >= 5 and (info['width'] < 240 or info['height'] < 160):
			return True
		if count >= 2 and (info['width'] < 200 or info['height'] < 120):
			return True
		if info['width'] < 160 or info['height'] < 40:
			return True
		# Narrow images whose width and height differ by less than 10.
		return info['width'] < 360 and -10 < info['width'] - info['height'] < 10

	doc = html2doc(content)
	bads = []
	# NOTE(review): elements are dropped while doc.iter() is still running;
	# confirm the parser's iterator tolerates that.
	for img in doc.iter('img'):
		src = img.get('src')
		if src in imgs:
			info = imgs[src]
			if is_unwanted(info):
				parent = img.getparent()
				# A wrapper with class 'article-img' is removed together
				# with the image it contains.
				wrapped = parent is not None and parent.get('class') == 'article-img'
				if wrapped:
					parent.drop_tree()
				else:
					img.drop_tree()
				bads.append(info['md5'])
			else:
				img.attrib['src'] = img2link(info['path'])
	return bads, child2html(doc)

Example #10

0

Show file

def content4imgs(content, title, imgs):
    """Filter the images of an HTML fragment against known image metadata.

    Each ``img`` element whose ``src`` is a key of ``imgs`` is either removed
    (when its ``count``/``width``/``height`` record trips one of the
    thresholds below) or has its ``src`` rewritten via ``img2link`` to the
    record's ``path``.  Images unknown to ``imgs`` are not modified, and
    ``title`` is not used.

    Returns ``(bads, html)`` where ``bads`` holds the ``md5`` of every
    removed image and ``html`` is ``child2html`` of the edited document.
    """
    doc = html2doc(content)
    bads = []
    # NOTE(review): the tree is modified during iteration; verify that
    # doc.iter() copes with nodes being dropped underneath it.
    for img in doc.iter('img'):
        src = img.get('src')
        if src not in imgs:
            continue
        meta = imgs[src]
        unwanted = (
            meta['count'] >= 10
            or meta['count'] >= 5 and (meta['width'] < 240 or meta['height'] < 160)
            or meta['count'] >= 2 and (meta['width'] < 200 or meta['height'] < 120)
            or meta['width'] < 160 or meta['height'] < 40
            or meta['width'] < 360 and -10 < meta['width'] - meta['height'] < 10
        )
        if not unwanted:
            img.attrib['src'] = img2link(meta['path'])
            continue
        parent = img.getparent()
        if parent is None or parent.get('class') != 'article-img':
            img.drop_tree()
        else:
            # The 'article-img' wrapper goes away together with its image.
            parent.drop_tree()
        bads.append(meta['md5'])
    return bads, child2html(doc)

Example #11

0

Show file

File: exc_1215.py Project: dotajin/haoku-open

def web_content(xcontent, article, words):
	"""Return ``xcontent`` with selected keywords replaced by tags.

	Words are taken from ``words['words']['all']`` (a mapping of word to
	count), ranked by descending count, and kept only when their weighted
	score is positive and they are not already in ``article['tags']``.  At
	most 25% of the total number of words are kept.  When any words remain,
	the content is parsed with ``html2doc``, passed through ``replace2tag``
	and serialised with ``child2html``; otherwise ``xcontent`` is returned
	unchanged.
	"""
	def score(word, num):
		"""Weight the count ``num`` of ``word`` by its value stored in redis.

		Returns 0 when the stored value is missing or below 10.
		"""
		word = word.lower()
		# The redis hash is selected by the first two hex digits of the
		# word's MD5; the word itself is the field inside that hash.
		md = hashlib.md5(word.encode('utf-8')).hexdigest()[:2]
		df = redis_tag.hget(md, word)
		df = max(int(df), 0) if df is not None else 0
		if df < 10:
			return 0
		return num * max(math.log(df + 1, 50), 1.5)

	counts = words['words']['all']
	ranked = sorted(counts.items(), key=lambda item: -item[1])
	scored = [(word, score(word, num)) for word, num in ranked]

	# Keep the ranking as a list: round-tripping through a dict would throw
	# the sort order away, so the truncation below would keep arbitrary
	# words instead of the highest-count ones.
	index = [word for word, value in scored
			 if value > 0 and word not in article['tags']]
	index = index[:int(len(counts) * 0.25)]

	if not index:
		return xcontent

	doc = html2doc(xcontent)
	replace2tag(doc, index)
	return child2html(doc)