def clean_texts(self, html):
    """Re-serialise the top-level nodes of *html* with cleaned leaf text.

    The markup is parsed with ``clean_html(..., return_doc=True)``.  Each
    direct child of the resulting tree is handled as follows:

    * a child that has children of its own is serialised unchanged;
    * a leaf child has its stripped text content passed through
      ``self.clean_text``; when the result is non-empty it becomes the
      child's new ``text`` and the child is serialised.

    Leaf children whose cleaned text is empty are left out of the result.
    NOTE(review): confirm that omitting them (rather than emitting them
    unchanged) is the intended behaviour.

    Returns the concatenation of the ``doc2html`` output of the kept nodes.
    """
    root = clean_html(html, return_doc=True)
    fragments = []
    for element in root.getchildren():
        if not element.getchildren():
            # Leaf node: normalise its text before deciding to keep it.
            cleaned = self.clean_text((element.text_content() or '').strip())
            if not cleaned:
                continue
            element.text = cleaned
        fragments.append(doc2html(element))
    return ''.join(fragments)
def __init__(self, input, url, **options):
    """Keep the raw arguments and build the cleaned tree plus its HTML.

    ``input`` and ``url`` are passed to ``clean_html`` (with
    ``return_doc=True``); the returned tree is stored as ``self.doc`` and
    its ``doc2html`` serialisation as ``self.html``.  Extra keyword
    arguments are stored untouched in ``self.options``.
    """
    self.input, self.url, self.options = input, url, options
    tree = clean_html(input, url, return_doc=True)
    self.doc = tree
    self.html = doc2html(tree)
def replace_node(format, node):
    """Swap *node* in its parent for ``format % doc2html(node)``.

    ``format`` is a ``%``-style template receiving the serialised node.
    The node's tail text is detached before serialising and re-attached to
    the replacement, so it is not captured inside the new markup.

    A node without a parent is left alone.  The *original* node object is
    returned in both cases.
    """
    parent = node.getparent()
    if parent is None:
        return node

    saved_tail = node.tail
    node.tail = ''  # keep the tail out of the serialised markup
    replacement = fromstring(format % doc2html(node))
    replacement.tail = saved_tail
    parent.replace(node, replacement)
    return node
def summary(self):
    """Run every cleaning pass and return the document as HTML.

    Returns ``''`` when ``self.doc`` is ``None``.  Otherwise the passes
    ``clean_tags``, ``clean_lines``, ``clean_header``, ``clean_footer`` and
    ``clean`` are invoked in that order before ``self.doc`` is serialised
    with ``doc2html``.
    """
    if self.doc is None:
        return ''
    passes = ('clean_tags', 'clean_lines', 'clean_header',
              'clean_footer', 'clean')
    for name in passes:
        getattr(self, name)()
    return doc2html(self.doc)
def __init__(self, input, url, **options):
    """Keep the raw arguments, optional overrides, and the cleaned tree.

    When ``title`` and/or ``pages`` are present in *options* their values
    are stored as ``self._title`` / ``self._pages``; absent keys leave the
    corresponding attribute unset.  The tree returned by
    ``clean_html(input, url, return_doc=True)`` is stored as ``self.doc``
    and its ``doc2html`` serialisation as ``self.html``.
    """
    self.input = input
    self.url = url
    self.options = options
    for name in ('title', 'pages'):
        if name in options:
            setattr(self, '_' + name, options[name])
    self.doc = clean_html(input, url, return_doc=True)
    self.html = doc2html(self.doc)
def event_selector(self, cellid):
    """Render ``event_selector.html`` with the PSTH of *cellid*.

    The dataset is read through ``dashboard.read_dataset`` and its ``spt``
    and ``stim`` entries are handed to ``basic.CalcPSTH``.  The resulting
    series is turned into a JSON list of ``{'x': ..., 'y': int}`` points.
    ``self.__doc__`` is converted with ``doc2html`` and passed to the
    template as ``doc``.
    """
    dataset = dashboard.read_dataset(self.io_filter, cellid)
    psth, times = basic.CalcPSTH(dataset['spt'], dataset['stim'])

    points = []
    for x_value, y_value in zip(times, psth):
        points.append({'x': x_value, 'y': int(y_value)})

    template = self.env.get_template('event_selector.html')
    doc = doc2html(self.__doc__)
    return template.render(
        cellid=str(cellid),
        data=json.dumps(points),
        doc=doc,
    )
def plot(self, method, cellid, nocache=None, clean=None, comment=None,
         reviewer=None, **kwargs):
    """Render ``plot.html`` for one visualisation *method* of *cellid*.

    The image payload comes from ``self.get_img_data(cellid, method,
    nocache, **kwargs)``.  The docstring of ``self.visualize.<method>`` is
    converted with ``doc2html`` and passed to the template as ``doc``.
    Remaining keyword arguments are forwarded to the template as ``opts``.

    ``clean`` and ``comment`` are accepted but not used by this function.
    When ``reviewer`` is truthy a confirmation message is shown; this
    function itself does not write anything to a database, so the save is
    presumably performed elsewhere (TODO confirm).
    """
    img_data = self.get_img_data(cellid, method, nocache, **kwargs)
    doc = doc2html(getattr(self.visualize, method).__doc__)
    # User-facing text: corrected the typo "You evaluation" -> "Your evaluation".
    message = "Your evaluation has been saved to the DB" if reviewer else ''
    template = self.env.get_template('plot.html')
    return template.render(cellid=cellid, doc=doc, method=method,
                           img_data=img_data, message=message, opts=kwargs)
def node2content(self, node):
    """Describe *node* as a dict with its selector and cleaned text.

    The result always has the keys ``text``, ``selector``, ``score`` and
    ``node``.  For ``node is None`` the defaults (``''``, ``''``, ``0``,
    ``None``) are returned.  Otherwise ``selector`` comes from
    ``self.get_selector`` and ``text`` from ``clean_content`` applied to
    the serialised node; the ``texts`` and ``debug`` entries of
    ``self.options`` are forwarded when present.  ``score`` stays ``0``.
    """
    content = {'text': '', 'selector': '', 'score': 0, 'node': node}
    if node is not None:
        content['selector'] = self.get_selector(node)
        forwarded = {
            key: self.options[key]
            for key in ('texts', 'debug')
            if key in self.options
        }
        content['text'] = clean_content(
            doc2html(node),
            url=self.url,
            title=self.title,
            pages=self.pages,
            **forwarded
        )
    return content
def drop_tag(self, parent, node):
    """Remove *node* from *parent* while keeping its text and children.

    Does nothing when *parent* is ``None``.  Otherwise:

    1. ``node.text`` is appended to the tail of the previous sibling, or to
       ``parent.text`` when there is no previous sibling;
    2. every child is serialised with ``doc2html``, re-parsed with
       ``fromstring`` and inserted just before *node*;
    3. ``node.tail`` is appended after the last inserted element (or to
       ``parent.text`` when nothing precedes *node*);
    4. *node* is removed with ``drop_tree()``.

    Fix: ``lxml.html``'s ``drop_tree()`` itself joins the element's tail to
    the previous sibling / parent, so merging the tail by hand and then
    calling ``drop_tree()`` emitted the trailing text twice.  The tail is
    now cleared after the manual merge, which is correct whether or not
    ``drop_tree()`` merges tails.  The unused ``next`` local (which also
    shadowed the builtin) has been removed.
    """
    if parent is None:
        return

    def append_text(text):
        # Attach to whatever currently precedes the node in document order.
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + text
        else:
            parent.text = (parent.text or '') + text

    if node.text:
        append_text(node.text)

    for child in node.getchildren():
        node.addprevious(fromstring(doc2html(child)))

    if node.tail:
        append_text(node.tail)
        # Prevent drop_tree() from merging the same tail a second time.
        node.tail = None

    node.drop_tree()
def node2content(self, node):
    """Build the content record (text, selector, score, node) for *node*.

    ``None`` yields the empty record.  For a real node the selector is
    taken from ``self.get_selector`` and the text from ``clean_content``
    run on ``doc2html(node)`` with this object's ``url``, ``title`` and
    ``pages``; ``texts`` / ``debug`` from ``self.options`` are passed along
    only when they are set.  ``score`` is always ``0`` here.
    """
    record = {'text': '', 'selector': '', 'score': 0, 'node': None}
    if node is None:
        return record

    record['node'] = node
    record['selector'] = self.get_selector(node)

    extra = {}
    for option in ('texts', 'debug'):
        if option in self.options:
            extra[option] = self.options[option]

    markup = doc2html(node)
    record['text'] = clean_content(markup, url=self.url, title=self.title,
                                   pages=self.pages, **extra)
    return record
# Document -- heuristic clean-up of a parsed HTML tree.
#
# The code of this class is kept exactly as found: it is collapsed onto a few
# physical lines and the nesting of several statements cannot be recovered
# with certainty, so only this header comment is added.
#
# Methods (as far as the visible code shows):
#   __init__(input, **options)
#       Stores `input` and `options`; reads `url` (default ''), `debug`
#       (False), `title` ('^^'), `pages` (None) and `texts` (None) from the
#       options; sets `domain = get_domain(url)`, `doc = clean_html(input,
#       return_doc=True)`, `text = doc.text_content()` and
#       `len = word_count(text)` (0 when the text is empty).
#   summary()
#       Returns the cached `self.output` when it already exists and '' when
#       `self.doc` is None.  Otherwise drops form/iframe/textarea/table/input
#       nodes, copies `data-original` / `original` into `src` on <img>, drops
#       images whose src matches `\/static\/|\.gif`, drops <a> nodes (or
#       their children) whose text matches the "click" pattern, walks the
#       top-level children through is_need_drop(), and finally stores and
#       returns `self.clean()`.
#   is_bad_node(node)
#       Boolean heuristic based on emptiness, link count / link text ratio,
#       `li a` matches, BAD_XPATH matches and a digit pattern in short text.
#   is_need_drop(node, short=True)
#       Returns True, False or the string 'img'; summary() treats 'img' as
#       "postpone the decision for this node".
#   drop(node)
#       Removes the node with drop_tree(); when `self.debug` is truthy it
#       only sets the class attribute to 'k-drop-tree' instead.
#   drop_tag(parent, node)
#       Merges the node's text, re-inserts re-parsed copies of its children
#       before it, merges its tail and calls drop_tree().
#   clean()
#       Drops <a>/<img> whose href/src does not start with 'http', unwraps or
#       drops a/span/font nodes, and returns `compose_html(doc2html(doc))`.
#
# NOTE(review): `node.tag.lower() in 'img|br'` is a substring test against a
#   string, so it is also true for tags such as 'b' or 'i'; presumably
#   `in ('img', 'br')` was intended -- confirm.
# NOTE(review): the regex alternatives `^(.*)`, `(.*)$` and `^(.*)$` in
#   is_need_drop match practically any text; they look like bracket patterns
#   whose full-width parentheses were lost (likewise `[::]`) -- verify
#   against the authoritative copy of this file.
# NOTE(review): drop_tag() appends `node.tail` by hand and then calls
#   drop_tree(); if that is lxml.html's drop_tree(), which merges the tail
#   itself, the trailing text ends up duplicated -- confirm.
# NOTE(review): `MIN_LEN` in summary(), `next` in drop_tag() and
#   `previous` / `next` in clean() are assigned but never read in the
#   visible code; `next` also shadows the builtin.
# NOTE(review): `ur'...'` literals are Python 2-only syntax.
class Document(object): TEXT_LENGTH_THREASHOLD = 25 def __init__(self, input, **options): self.input = input self.url = options.get('url', '') self.debug = options.get('debug', False) self.title = options.get('title', '^^') self.pages = options.get('pages', None) self.texts = options.get('texts', None) self.domain = get_domain(self.url) self.options = options self.doc = clean_html(input, return_doc=True) self.text = self.doc.text_content() self.len = word_count(self.text) if self.text else 0 def summary(self): if hasattr(self, 'output'): return self.output if self.doc is None: return '' MIN_LEN = self.options.get( 'min_text_length', self.TEXT_LENGTH_THREASHOLD, ) for node in tags(self.doc, 'form', 'iframe', 'textarea', 'table', 'input'): if node != self.doc: node.drop_tree() for img in self.doc.xpath('.//img'): if img.get('data-original'): img.set('src', img.get('data-original')) if img.get('original'): img.set('src', img.get('original')) if re.search('\/static\/|\.gif', img.get('src', '')): self.drop(img) click = re.compile(u'点击|>>') for node in self.doc.iter('a'): if not node.getchildren(): if click.search(node.text_content()): self.drop(node) else: for child in node.getchildren(): if click.search(child.text or ''): self.drop(child) imgs = [] for child in self.doc.getchildren(): res = self.is_need_drop(child, False if imgs else True) if res == 'img': imgs.append(child) continue elif res == False: break self.drop(child) for img in imgs: self.drop(img) imgs = [] # imgs = [] # for child in reversed(self.doc.getchildren()): # res = self.is_need_drop(child, False if imgs else True) # if res == 'img': # imgs.append(child) # continue # elif res == False: # break # self.drop(child) # for img in imgs: # self.drop(img) # imgs = [] # for child in self.doc.getchildren(): # if self.is_bad_node(child): # self.drop(child) # elif self.texts is not None: # text = child.text_content().strip() # if text and text in self.texts: # self.drop(child) # else: # self.texts.add(text) 
self.output = self.clean() return self.output def is_bad_node(self, node): text = node.text_content().strip() if node.tag.lower() in 'img|br': return False if not text and not node.getchildren(): return True for img in node.xpath('.//img'): if self.title in img.get('alt', '') \ or self.title in img.get('title', ''): return False text_len = word_count(text) link_len, link_cnt = 0, 0 for link in node.findall('.//a'): link_cnt += 1 if not link.text_content(): continue link_len += word_count(link.text_content()) if link_cnt > 1 and text_len > 1 and link_len / float(text_len) > 0.4: return True if link_cnt > 1 and text_len / float(link_cnt) < 10: return True if link_cnt > 1 and node.cssselect('li a'): return True block_cnt = len(node.xpath(BAD_XPATH)) if link_cnt > 0 and block_cnt > 1 and len(node.cssselect('pre')) == 0: return True if text_len / float(self.len + 1) < 0.15 or text_len < 100: if re.search('\d{3,}-\d+-\d+', text): return True # filterRe = re.compile(u'点击(.*)(进入|观看)|^事实\+$') # if filterRe.match(text): # return True return False def is_need_drop(self, node, short=True): if node.tag.lower() == 'img': return False if self.is_bad_node(node): return True text = node.text_content().strip() text_len = word_count(text) if text_len == 0 and not node.xpath('.//img'): return True if short and text_len < 8 and not node.xpath('.//img'): return True if short and text_len < 20 and not node.xpath('.//img') \ and re.search(u'^【.*】|^(.*)|^\(.*\)|【.*】$|(.*)$|\(.*\)$', text): return True filterRe = re.compile( u"(上一篇|下一篇|AD|热点关注|原标题|来源|编辑|标签|转自|微信|群号|微信号)[::]|" u"追究.*法律责任|关联阅读|请点击|#换成@|关注|(本文|原文|文章)(地址|标题|转自|链接|转载)|原创文章|" u"查看原文|延伸阅读|(推荐|相关)文章|转载请注明|继续浏览|正文.*结束|版 权 所 有|" u"(转载|登载|观点|允许).*(禁止|版权|本文)|(允许|禁止|版权|本文).*(转载|登载|观点)|" u"(关注|订阅|搜索|回复).*微信|微信.*(关注|订阅|搜索|回复)|【.*记者|版权声明|" u"(关注|下载).*(扫描|扫码|二维码)|(扫描|扫码|二维码).*(关注|下载)|专题:|" u"更多.*(内容|信息|文章).*请|责编|QQ群|^【.*】$|^(.*)$") if text_len / float(self.len + 1) < 0.15 or text_len < 100: if short and self.title and self.title in text: 
return True if emailRe.search(text) or filterRe.search(text): return True for link in node.xpath('.//a'): href = link.get('href', '') if href == self.url or self.pages and href in self.pages: return False if link.xpath('.//img') else True path = get_path(href) domain = get_domain(href) if domain == self.domain and path in ['/', '' ] and link.xpath('.//img'): self.drop(link) # for img in node.xpath('.//img'): # alt = img.get('alt') # if alt and len(alt) < 50: # if re.search(u'微信二维码', alt): # return True # if len(SequenceMatcher(self.title, alt)\ # .get_matching_blocks()) / float(len(self.title)) < 0.3: # return False # title = img.get('title') # if title and len(title) < 50: # if re.search(u'微信二维码', title): # return True # if len(SequenceMatcher(self.title, title)\ # .get_matching_blocks()) / float(len(self.title)) < 0.3: # return False if node.xpath('.//img'): return 'img' return False def drop(self, node): if self.debug: node.attrib['class'] = 'k-drop-tree' else: node.drop_tree() def drop_tag(self, parent, node): if parent is not None: previous = node.getprevious() next = node.getnext() if node.text: if previous is not None: previous.tail = (previous.tail or '') + node.text else: parent.text = (parent.text or '') + node.text for child in node.getchildren(): node.addprevious(fromstring(doc2html(child))) previous = node.getprevious() if node.tail: if previous is not None: previous.tail = (previous.tail or '') + node.tail else: parent.text = (parent.text or '') + node.tail node.drop_tree() def clean(self): for node in list(self.doc.iter()): parent = node.getparent() previous = node.getprevious() next = node.getnext() if node.tag == 'a' and not node.get('href', '').startswith('http') \ or node.tag == 'img' and not node.get('src', '').startswith('http'): node.drop_tree() continue if parent is not None and node.tag in ['a', 'span', 'font']: if not (node.text and node.tag == 'a' \ and (node.get('href', '^^').strip() == node.text.strip() or u'下载' in node.text)): if 
node.tag != 'a' \ or node.getchildren() \ or node.text and not re.search(ur'[\[\>\]]', node.text.strip()): self.drop_tag(parent, node) continue node.drop_tree() continue return compose_html(doc2html(self.doc))