Esempio n. 1
0
	def clean_texts(self, html):
		"""Rebuild ``html`` from its top-level nodes, cleaning leaf text.

		The markup is parsed with ``clean_html(..., return_doc=True)``.
		Top-level children that have children of their own are serialized
		unchanged. Childless ones have their stripped text passed through
		``self.clean_text`` and are kept only when the result is non-empty.

		Returns the concatenated ``doc2html`` output of the kept nodes.
		"""
		root = clean_html(html, return_doc=True)
		pieces = []
		for element in root.getchildren():
			if not element.getchildren():
				raw = element.text_content() or ''
				cleaned = self.clean_text(raw.strip())
				if not cleaned:
					# Nothing left after cleaning: leave the node out.
					continue
				element.text = cleaned
			pieces.append(doc2html(element))
		return ''.join(pieces)
Esempio n. 2
0
 def clean_texts(self, html):
     """Return ``html`` re-serialized node by node with cleaned leaf text.

     Each top-level child of the document produced by ``clean_html`` is
     either emitted as-is (when it has child elements) or, when it is a
     leaf, emitted with its text replaced by ``self.clean_text`` of the
     stripped text content. Leaves whose cleaned text is empty are omitted.
     """
     output = ''
     for node in clean_html(html, return_doc=True).getchildren():
         is_leaf = not node.getchildren()
         if is_leaf:
             cleaned = self.clean_text((node.text_content() or '').strip())
             if cleaned:
                 node.text = cleaned
                 output += doc2html(node)
         else:
             # Nodes with nested elements are passed through untouched.
             output += doc2html(node)
     return output
Esempio n. 3
0
    def __init__(self, input, url, **options):
        """Store the raw input and build the cleaned document from it.

        ``input`` and ``url`` are handed to ``clean_html`` (with
        ``return_doc=True``); the result is kept as ``self.doc`` and its
        ``doc2html`` serialization as ``self.html``. Any extra keyword
        arguments are kept as the ``self.options`` dict.
        """
        self.input, self.url, self.options = input, url, options

        cleaned_doc = clean_html(input, url, return_doc=True)
        self.doc = cleaned_doc
        self.html = doc2html(self.doc)
Esempio n. 4
0
	def __init__(self, input, url, **options):
		"""Remember the arguments and parse ``input`` into a cleaned tree.

		Sets ``self.input``, ``self.url`` and ``self.options`` from the
		arguments, ``self.doc`` from ``clean_html(input, url,
		return_doc=True)`` and ``self.html`` from ``doc2html(self.doc)``.
		"""
		# Plain bookkeeping of what the caller passed in.
		for attr, value in (('input', input), ('url', url), ('options', options)):
			setattr(self, attr, value)

		# Parsed tree first, then its serialized form.
		self.doc = clean_html(input, url, return_doc=True)
		self.html = doc2html(self.doc)
Esempio n. 5
0
def replace_node(format, node):
    """Swap ``node`` for a new element built from ``format % doc2html(node)``.

    Nothing happens when ``node`` has no parent. Otherwise the node's tail
    is cleared before serializing and re-attached to the replacement
    element, which then takes ``node``'s place under its parent.

    Returns the original ``node`` object in every case.
    """
    parent = node.getparent()
    if parent is None:
        return node

    saved_tail = node.tail
    node.tail = ''
    replacement = fromstring(format % doc2html(node))
    replacement.tail = saved_tail
    parent.replace(node, replacement)
    return node
Esempio n. 6
0
def replace_node(format, node):
	"""Replace ``node`` in its parent with ``format`` applied to its markup.

	``format`` is a ``%``-style template receiving ``doc2html(node)``; the
	result is parsed with ``fromstring``. The node's tail is emptied before
	serializing and moved onto the new element. A node without a parent is
	left alone.

	Always returns the ``node`` that was passed in.
	"""
	container = node.getparent()
	if container is not None:
		trailing, node.tail = node.tail, ''
		wrapped = fromstring(format % doc2html(node))
		wrapped.tail = trailing
		container.replace(node, wrapped)
	return node
Esempio n. 7
0
	def summary(self):
		"""Run every cleaning pass and return the document as HTML.

		Returns an empty string when ``self.doc`` is ``None``. Otherwise
		calls ``clean_tags``, ``clean_lines``, ``clean_header``,
		``clean_footer`` and ``clean`` in that order and returns
		``doc2html(self.doc)``.
		"""
		if self.doc is not None:
			# The passes run strictly in this order.
			for step in ('clean_tags', 'clean_lines', 'clean_header',
					'clean_footer', 'clean'):
				getattr(self, step)()
			return doc2html(self.doc)
		return ''
Esempio n. 8
0
    def __init__(self, input, url, **options):
        """Store the arguments and parse ``input`` into a cleaned document.

        When ``options`` contains ``'title'`` or ``'pages'`` the value is
        stored on ``self._title`` / ``self._pages``; those attributes are
        not touched otherwise. ``self.doc`` is the result of
        ``clean_html(input, url, return_doc=True)`` and ``self.html`` its
        ``doc2html`` serialization.
        """
        self.input = input
        self.url = url
        self.options = options
        # Only override the private values that were explicitly supplied.
        for key in ('title', 'pages'):
            if key in options:
                setattr(self, '_' + key, options[key])

        self.doc = clean_html(input, url, return_doc=True)
        self.html = doc2html(self.doc)
Esempio n. 9
0
	def summary(self):
		"""Return the cleaned document serialized with ``doc2html``.

		With no document (``self.doc is None``) the result is ``''``.
		Otherwise the tag, line, header, footer and general cleaning passes
		run, in that order, before serializing.
		"""
		if self.doc is None:
			result = ''
		else:
			self.clean_tags()
			self.clean_lines()
			self.clean_header()
			self.clean_footer()
			self.clean()
			result = doc2html(self.doc)
		return result
Esempio n. 10
0
	def __init__(self, input, url, **options):
		"""Keep the raw arguments and build ``self.doc`` / ``self.html``.

		``'title'`` and ``'pages'`` entries of ``options``, when present, are
		copied to ``self._title`` and ``self._pages``. The document is
		produced by ``clean_html(input, url, return_doc=True)`` and then
		serialized with ``doc2html``.
		"""
		self.input, self.url, self.options = input, url, options

		# Absent keys leave the corresponding attribute unset here.
		if 'title' in options:
			self._title = options['title']
		if 'pages' in options:
			self._pages = options['pages']

		self.doc = clean_html(input, url, return_doc=True)
		self.html = doc2html(self.doc)
Esempio n. 11
0
    def event_selector(self, cellid):
        """Render the ``event_selector.html`` template for one cell.

        Reads the dataset for ``cellid`` via ``dashboard.read_dataset`` using
        ``self.io_filter``, computes a PSTH from its ``'spt'`` and ``'stim'``
        entries with ``basic.CalcPSTH`` and hands it to the template as a
        JSON list of ``{'x': ..., 'y': int(...)}`` points, together with
        ``str(cellid)`` and this object's docstring converted by
        ``doc2html``.
        """
        dataset = dashboard.read_dataset(self.io_filter, cellid)
        psth, time = basic.CalcPSTH(dataset['spt'], dataset['stim'])

        points = []
        for x, y in zip(time, psth):
            points.append({'x': x, 'y': int(y)})

        template = self.env.get_template('event_selector.html')
        help_html = doc2html(self.__doc__)

        context = {
            'cellid': str(cellid),
            'data': json.dumps(points),
            'doc': help_html,
        }
        return template.render(**context)
Esempio n. 12
0
    def plot(self, method, cellid, nocache=None, clean=None,
             comment=None, reviewer=None, **kwargs):
        """Render ``plot.html`` for the figure produced by ``method``.

        The image data comes from ``self.get_img_data(cellid, method,
        nocache, **kwargs)`` and the help text from the docstring of
        ``getattr(self.visualize, method)``, converted with ``doc2html``.
        A confirmation message is included when ``reviewer`` is truthy.

        ``clean`` and ``comment`` are accepted but not used by this method.
        Extra keyword arguments are forwarded to ``get_img_data`` and also
        passed to the template as ``opts``.
        """
        img_data = self.get_img_data(cellid, method, nocache, **kwargs)

        docstring = getattr(self.visualize, method).__doc__
        doc = doc2html(docstring)

        # Typo fix in the user-facing confirmation ("You" -> "Your").
        message = "Your evaluation has been saved to the DB" if reviewer else ''

        return self.env.get_template('plot.html').render(cellid=cellid,
                                                         doc=doc,
                                                         method=method,
                                                         img_data=img_data,
                                                         message=message,
                                                         opts=kwargs)
Esempio n. 13
0
    def node2content(self, node):
        """Describe ``node`` as a dict with text, selector, score and node.

        For ``node is None`` the dict holds empty strings, a score of 0 and
        ``None``. Otherwise ``'selector'`` comes from ``self.get_selector``
        and ``'text'`` from ``clean_content`` applied to the node's
        ``doc2html`` markup, with ``self.url``, ``self.title``,
        ``self.pages`` and — when present in ``self.options`` — the
        ``'texts'`` and ``'debug'`` options. ``'score'`` is always 0 here.
        """
        if node is None:
            return {'text': '', 'selector': '', 'score': 0, 'node': None}

        selector = self.get_selector(node)

        # Forward only the options that were actually configured.
        forwarded = dict(
            (key, self.options[key])
            for key in ('texts', 'debug')
            if key in self.options
        )
        text = clean_content(doc2html(node),
                             url=self.url,
                             title=self.title,
                             pages=self.pages,
                             **forwarded)
        return {'text': text, 'selector': selector, 'score': 0, 'node': node}
Esempio n. 14
0
	def drop_tag(self, parent, node):
		"""Remove ``node``'s tag but keep its text and children in ``parent``.

		Does nothing when ``parent`` is ``None``. Otherwise ``node.text`` is
		appended to the tail of the previous sibling (or to ``parent.text``
		when there is none), each child is re-created from its ``doc2html``
		markup and inserted before ``node``, and ``node`` is dropped.

		The tail is deliberately not copied by hand: ``drop_tree()`` of an
		lxml.html element (which ``node`` is assumed to be) already joins the
		tail text to the previous sibling or parent, so copying it here as
		well duplicated that text.
		"""
		if parent is None:
			return

		if node.text:
			previous = node.getprevious()
			if previous is not None:
				previous.tail = (previous.tail or '') + node.text
			else:
				parent.text = (parent.text or '') + node.text

		for child in node.getchildren():
			node.addprevious(fromstring(doc2html(child)))

		# Removes the node and re-attaches node.tail exactly once.
		node.drop_tree()
Esempio n. 15
0
    def drop_tag(self, parent, node):
        """Unwrap ``node``: keep its text and children, discard the tag.

        A ``None`` parent makes this a no-op. Otherwise the node's leading
        text is merged into the previous sibling's tail (or into
        ``parent.text`` when the node is the first child), every child is
        rebuilt via ``fromstring(doc2html(child))`` and placed before the
        node, and the node itself is removed with ``drop_tree()``.

        ``node.tail`` is left for ``drop_tree()`` to handle. Assuming
        ``node`` is an lxml.html element, ``drop_tree()`` joins the tail to
        the previous sibling or parent by itself; appending it manually as
        well made the tail text appear twice.
        """
        if parent is None:
            return

        if node.text:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.text
            else:
                parent.text = (parent.text or '') + node.text

        for child in node.getchildren():
            node.addprevious(fromstring(doc2html(child)))

        # Single place where the tail is preserved and the node removed.
        node.drop_tree()
Esempio n. 16
0
	def node2content(self, node):
		"""Build the result record (text, selector, score, node) for ``node``.

		A ``None`` node yields the empty record. For a real node the
		selector is taken from ``self.get_selector`` and the text from
		``clean_content`` run on the node's ``doc2html`` markup, passing
		``url``, ``title`` and ``pages`` from ``self`` plus the ``'texts'``
		and ``'debug'`` options when they are set. The score stays 0.
		"""
		record = {'text':'', 'selector':'', 'score':0, 'node':None}
		if node is not None:
			record['node'] = node
			record['selector'] = self.get_selector(node)

			# Pass through only the optional settings that exist.
			extra = {}
			for name in ('texts', 'debug'):
				if name in self.options:
					extra[name] = self.options[name]

			markup = doc2html(node)
			record['text'] = clean_content(
				markup,
				url=self.url,
				title=self.title,
				pages=self.pages,
				**extra
			)
		return record
Esempio n. 17
0
class Document(object):
    """Heuristic post-processor that strips boilerplate from article HTML.

    The input is parsed with ``clean_html``; ``summary()`` then removes
    form-like elements, unwanted images and links, and leading top-level
    nodes that look like navigation, credits or promotion, and returns the
    HTML produced by ``clean()``.

    Recognised ``options``: ``url``, ``debug``, ``title``, ``pages``,
    ``texts``. With ``debug`` set, nodes are tagged with the CSS class
    ``k-drop-tree`` instead of being removed.
    """

    TEXT_LENGTH_THREASHOLD = 25

    # Tags that is_bad_node() never classifies as bad.
    _KEEP_TAGS = ('img', 'br')

    # Patterns are compiled once here instead of on every call.
    _STATIC_IMG_RE = re.compile(r'/static/|\.gif')
    _CLICK_RE = re.compile(u'点击|>>')
    _DASHED_NUMBER_RE = re.compile(r'\d{3,}-\d+-\d+')
    _LINK_MARK_RE = re.compile(r'[\[>\]]')

    # Short text wrapped in brackets at its start or end. The alternatives
    # written with ASCII parentheses as `^(.*)` / `(.*)$` matched every
    # string, which made the whole pattern a catch-all; full-width
    # parentheses (U+FF08 / U+FF09) are matched instead, in line with the
    # neighbouring bracket alternatives.
    _BRACKETED_RE = re.compile(
        u'^【.*】|^\uff08.*\uff09|^\\(.*\\)|【.*】$|\uff08.*\uff09$|\\(.*\\)$')

    # Credit / promotion / navigation phrases. The same catch-all problem
    # applied to the final `^(.*)$` alternative, and the colon class listed
    # the ASCII colon twice; the full-width colon (U+FF1A) is now accepted.
    _FILTER_RE = re.compile(
        u"(上一篇|下一篇|AD|热点关注|原标题|来源|编辑|标签|转自|微信|群号|微信号)[:\uff1a]|"
        u"追究.*法律责任|关联阅读|请点击|#换成@|关注|(本文|原文|文章)(地址|标题|转自|链接|转载)|原创文章|"
        u"查看原文|延伸阅读|(推荐|相关)文章|转载请注明|继续浏览|正文.*结束|版 权 所 有|"
        u"(转载|登载|观点|允许).*(禁止|版权|本文)|(允许|禁止|版权|本文).*(转载|登载|观点)|"
        u"(关注|订阅|搜索|回复).*微信|微信.*(关注|订阅|搜索|回复)|【.*记者|版权声明|"
        u"(关注|下载).*(扫描|扫码|二维码)|(扫描|扫码|二维码).*(关注|下载)|专题:|"
        u"更多.*(内容|信息|文章).*请|责编|QQ群|^【.*】$|^\uff08.*\uff09$")

    def __init__(self, input, **options):
        """Parse ``input`` and cache its text and word count.

        ``self.len`` is ``word_count`` of the document's text content, or 0
        when there is no text.
        """
        self.input = input
        self.url = options.get('url', '')
        self.debug = options.get('debug', False)
        self.title = options.get('title', '^^')
        self.pages = options.get('pages', None)
        self.texts = options.get('texts', None)
        self.domain = get_domain(self.url)
        self.options = options
        self.doc = clean_html(input, return_doc=True)
        # summary() tolerates a missing document, so construction must too.
        self.text = self.doc.text_content() if self.doc is not None else ''
        self.len = word_count(self.text) if self.text else 0

    def summary(self):
        """Return the cleaned HTML, computing it on first use.

        The result is cached in ``self.output``. Returns ``''`` when there
        is no document.
        """
        if hasattr(self, 'output'):
            return self.output

        if self.doc is None:
            return ''

        for node in tags(self.doc, 'form', 'iframe', 'textarea', 'table',
                         'input'):
            if node != self.doc:
                node.drop_tree()

        for img in self.doc.xpath('.//img'):
            # Lazy-loading attributes carry the real image location.
            if img.get('data-original'):
                img.set('src', img.get('data-original'))
            if img.get('original'):
                img.set('src', img.get('original'))
            if self._STATIC_IMG_RE.search(img.get('src', '')):
                self.drop(img)

        # Snapshot the links first: drop() may remove the current node,
        # which is unsafe on a live iterator (clean() snapshots as well).
        for node in list(self.doc.iter('a')):
            if not node.getchildren():
                if self._CLICK_RE.search(node.text_content()):
                    self.drop(node)
            else:
                for child in node.getchildren():
                    if self._CLICK_RE.search(child.text or ''):
                        self.drop(child)

        # Drop leading junk nodes. Image-bearing nodes are held back and
        # only dropped when a droppable node follows them.
        imgs = []
        for child in self.doc.getchildren():
            res = self.is_need_drop(child, not imgs)
            if res == 'img':
                imgs.append(child)
                continue
            if not res:
                break

            self.drop(child)
            for img in imgs:
                self.drop(img)
            imgs = []

        # NOTE: a mirrored pass over trailing nodes and a de-duplication
        # pass against self.texts used to live here as commented-out code.

        self.output = self.clean()
        return self.output

    def is_bad_node(self, node):
        """Return True when ``node`` looks like link-heavy or empty filler."""
        text = node.text_content().strip()

        # Exact tag match: the former substring test against 'img|br' also
        # let tags such as 'b' and 'i' through.
        if node.tag.lower() in self._KEEP_TAGS:
            return False

        if not text and not node.getchildren():
            return True

        # An image captioned with the article title is real content.
        for img in node.xpath('.//img'):
            if self.title in img.get('alt', '') \
              or self.title in img.get('title', ''):
                return False

        text_len = word_count(text)
        link_len, link_cnt = 0, 0
        for link in node.findall('.//a'):
            link_cnt += 1
            if not link.text_content():
                continue
            link_len += word_count(link.text_content())

        if link_cnt > 1 and text_len > 1 and link_len / float(text_len) > 0.4:
            return True

        if link_cnt > 1 and text_len / float(link_cnt) < 10:
            return True

        if link_cnt > 1 and node.cssselect('li a'):
            return True

        block_cnt = len(node.xpath(BAD_XPATH))
        if link_cnt > 0 and block_cnt > 1 and len(node.cssselect('pre')) == 0:
            return True

        # Small nodes holding a dash-separated digit group.
        if text_len / float(self.len + 1) < 0.15 or text_len < 100:
            if self._DASHED_NUMBER_RE.search(text):
                return True

        return False

    def is_need_drop(self, node, short=True):
        """Classify a top-level node for the leading-junk pass.

        Returns ``True`` to drop the node, ``'img'`` for a node that
        contains images and was not otherwise judged droppable, and
        ``False`` otherwise. ``short`` enables the checks aimed at very
        short text.

        Side effect: same-domain links to the site root that wrap an image
        are dropped while the node's links are scanned.
        """
        if node.tag.lower() == 'img':
            return False

        if self.is_bad_node(node):
            return True

        text = node.text_content().strip()
        text_len = word_count(text)

        if text_len == 0 and not node.xpath('.//img'):
            return True

        if short and text_len < 8 and not node.xpath('.//img'):
            return True

        if short and text_len < 20 and not node.xpath('.//img') \
          and self._BRACKETED_RE.search(text):
            return True

        if text_len / float(self.len + 1) < 0.15 or text_len < 100:
            if short and self.title and self.title in text:
                return True
            if emailRe.search(text) or self._FILTER_RE.search(text):
                return True

        for link in node.xpath('.//a'):
            href = link.get('href', '')
            # Links back to this page or to one of its pagination pages.
            if href == self.url or self.pages and href in self.pages:
                return not link.xpath('.//img')
            path = get_path(href)
            domain = get_domain(href)
            if domain == self.domain and path in ['/', ''] \
              and link.xpath('.//img'):
                self.drop(link)

        if node.xpath('.//img'):
            return 'img'

        return False

    def drop(self, node):
        """Remove ``node``, or only mark it when ``self.debug`` is set."""
        if self.debug:
            node.attrib['class'] = 'k-drop-tree'
        else:
            node.drop_tree()

    def drop_tag(self, parent, node):
        """Remove ``node``'s tag but keep its text and children in ``parent``.

        Does nothing when ``parent`` is ``None``. The tail is left to
        ``drop_tree()``: assuming ``node`` is an lxml.html element, that
        call already joins the tail to the previous sibling or parent, so
        appending it by hand as well duplicated the tail text.
        """
        if parent is None:
            return

        if node.text:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.text
            else:
                parent.text = (parent.text or '') + node.text

        for child in node.getchildren():
            node.addprevious(fromstring(doc2html(child)))

        node.drop_tree()

    def clean(self):
        """Strip non-absolute links/images and inline wrappers; return HTML.

        ``a`` elements whose ``href`` and ``img`` elements whose ``src`` do
        not start with ``http`` are removed. ``span`` and ``font`` elements
        are unwrapped. ``a`` elements are kept when their text equals their
        href or contains ``下载``; otherwise they are unwrapped, except for
        childless links with no text or with text containing ``[``, ``>``
        or ``]``, which are removed entirely.
        """
        for node in list(self.doc.iter()):
            parent = node.getparent()

            if node.tag == 'a' and not node.get('href', '').startswith('http') \
              or node.tag == 'img' and not node.get('src', '').startswith('http'):
                node.drop_tree()
                continue

            if parent is None or node.tag not in ('a', 'span', 'font'):
                continue

            text = node.text
            if text and node.tag == 'a' \
              and (node.get('href', '^^').strip() == text.strip()
                   or u'下载' in text):
                # Bare-URL and download links stay as they are.
                continue

            if node.tag != 'a' \
              or node.getchildren() \
              or text and not self._LINK_MARK_RE.search(text.strip()):
                self.drop_tag(parent, node)
            else:
                node.drop_tree()

        return compose_html(doc2html(self.doc))