def main():
    html = open('./samples/21853124_0.shtml').read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
def strip_chapter(self, html):
    """
    Strips chapter and gets relevant HTML using Readability
    :param html: str
    :return:
    """
    doc = Document(html)
    if len(doc.summary()) <= 20:
        content = str(BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0])
        content = '<html><head><meta charset="utf-8"></head>' + content + '</html>'
        return doc.short_title(), content
    return (doc.short_title(),
            str(doc.summary()).replace('<html>', '<html><head><meta charset="utf-8"></head>'))
def read_command(api, args):
    import requests
    from readability.readability import Document
    import html2text

    h = html2text.HTML2Text()
    h.inline_links = False
    h.ignore_images = True
    h.ignore_emphasis = True
    res = requests.get(args.url)
    if res.ok:
        article = Document(res.content)
        print article.short_title()
        print h.handle(article.summary())
    else:
        print res.headers['status']
def strip_chapter(self, html):
    """
    Strips chapter and gets relevant HTML using Readability
    :param html: str
    :return:
    """
    doc = Document(html)
    if len(doc.summary()) <= 20:
        print 'This page has errors, returning entry-content div raw HTML.'
        content = str(BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0])
        content = '<html><head><meta charset="utf-8"></head>' + content + '</html>'
        return doc.short_title(), content
    return (doc.short_title(),
            str(doc.summary()).replace('<html>', '<html><head><meta charset="utf-8"></head>'))
def extract_article(url):
    r = requests.get(url)
    # if the url exists, continue
    if r.status_code == 200:
        # extract and parse response url
        url = parse_url(r.url)
        # extract html
        html = r.content.decode('utf-8', errors='ignore')
        # run boilerpipe
        # boilerpipe_extractor = Extractor(html=html)
        # run readability
        readability_extractor = Document(html)
        html = readability_extractor.summary()
        # return article data
        return {
            'title': readability_extractor.short_title(),
            'html': html,
            'content': strip_tags(html).encode('utf-8', errors='ignore'),
            'url': url
        }
    # otherwise return an empty dict
    else:
        return {}
def get_data(url):
    error_num = 0
    while True:
        if error_num >= 10:
            cprint("Finished Because error_num reached 10 times", "red")
            return 0, 0
        try:
            req = requests.get(url)
            if int(req.status_code) == 503:
                cprint("Google detected the abnormal network traffic", "red")
                time.sleep(60 * 60)
            elif int(req.status_code) != 200:
                cprint("Now Get StatusCode{}: Error_num{}".format(req.status_code, error_num), "red")
                return 0, 0
            else:
                html = req.text
                break
        except ConnectionError:
            cprint("Now Get ConnectionError: Error_num{}".format(error_num), "red")
            error_num += 1
            time.sleep(5)
    try:
        document = Document(html)
        content_html = document.summary()
        content_text = lxml.html.fromstring(content_html).text_content().strip()
        short_title = document.short_title()
        return short_title, content_text
    except:
        return 0, 0
class Gist:

    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        return self.document.short_title()

    @cached_property
    def text(self):
        text = self.document.summary()
        text = re.sub('<br[^>]+>', '\n', text)
        text = re.sub('</?p[^>]+>', '\n\n', text)
        text = re.sub('<[^>]+>', '', text)
        text = re.sub('^[ \t]+$', '', text)
        text = re.sub('\n{3,}', '\n\n', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        return (word not in cls.stop_words) and \
            cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
        keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        # no double keywords in list
        keywords = list(set(keywords))
        # no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
def extract_article(url):
    r = requests.get(url)
    # if the url exists, continue
    if r.status_code == 200:
        # extract and parse response url
        url = parse_url(r.url)
        # extract html
        html = r.content.decode('utf-8', errors='ignore')
        # run boilerpipe
        BP = Extractor(html=html)
        # run readability
        Rdb = Document(html)
        html = Rdb.summary()
        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }
    # otherwise return an empty dict
    else:
        return {}
def markdownify(url_list, **options):
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    for url in url_list:
        req = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        articles.append(document.summary())
    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined_article = (u"Title: %s \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
def process(doc, params):
    url = params['url']
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir='/tmp/')
            img_src = urljoin(url, img.get('src'))
            img_name = None
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                img_name = get_filename_from_url(img_src)
                write_file(r, fp)
            else:
                img_meta, content = img_src.split(',')
                image = base64.b64decode(content)
                img_name = get_filename_from_base64(img_meta)
                fp.write(image)
            images.append((img_name, fp))
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    html = '<p>{}</p>'.format(html)
    text = html2text.html2text(html)
    return text, images, 1, None
def get_summary(url):
    html = urllib.request.urlopen(url).read()
    doc = Document(html)
    doc.parse(["summary", "short_title"])
    readable_article = doc.summary()
    readable_title = doc.short_title()
    return readable_article, readable_title
def process(doc, url):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.TEMP_DIR)
            img_src = urljoin(url, img.get('src'))
            if re.search(r'http[s]?://', img_src):
                r = requests.get(img_src, stream=True)
                write_file(r, fp)
            else:
                image = base64.b64decode(img_src.split(',')[1])
                fp.write(image)
            images.append(fp)
        except Exception:
            logger.error(
                'extractor.formats.html Image Collector Error!!',
                exc_info=True,
                extra={'data': {'url': url}},
            )
    html = '<h1>' + title + '</h1>' + summary
    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = '<p>{}</p>'.format(regex.sub('', html))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images, 1
def reada(url, cache=True):
    if cache:
        cached = memcache.get(key=url)
        if cached is not None:
            return cached
    #file = urllib.urlopen(url)
    #import urllib2
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    file = opener.open(url)
    ##
    enc = 'utf-8'
    text = ''
    try:
        # 1. web html -> readability
        raw = Document(file.read(), url=url)
        html = raw.summary().encode(enc, 'replace')
        title = raw.short_title()
        # 2. readability -> markdown, copied from main
        data = html.decode(enc)
        h = html2text.HTML2Text(baseurl=url)
        h.ignore_images = False
        h.body_width = 100000
        text = h.handle(data)
    finally:
        file.close()
    d = {'url': url, 'title': title, 'content': text}
    if cache:
        memcache.add(key=url, value=d, time=600)
    return d
def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]
    il = FeedEntryItemLoader(parent=response.meta["il"])
    try:
        response.text
    except AttributeError:
        # Response is not text (e.g. PDF, ...).
        il.add_value("title", feed_entry.get("title"))
        il.add_value("content_html", feed_entry.get("summary"))
        return il.load_item()
    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is shorter
            # than the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)
    return il.load_item()
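The fallback in the spider above carries a useful heuristic: distrust the readability extraction when it comes out shorter than the feed's own summary. A minimal, standalone sketch of that idea (the function name and signature are illustrative, not from the original project):

from readability.readability import Document

def pick_content(html, feed_summary, url=None):
    # Prefer the readability extraction, but fall back to the feed summary
    # when the extraction is suspiciously short.
    content = Document(html, url=url).summary(html_partial=True)
    if feed_summary and len(feed_summary) > len(content):
        content = feed_summary
    return content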
def process(doc):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    text = text_maker.handle(summary)
    return title, text
def main():
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    f = open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb")
    f.write(readable_article.encode("utf-8"))
    f.close()
def get_article_from_item(self, item):
    url = item['link']
    logging.debug(url)
    author = 'n/a'
    if item.has_key('author'):
        author = item.author
    html = urllib.urlopen(url).read()
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
def process_html(html):
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
def extract_data(self, patchurl):
    try:
        f = requests.get(patchurl)
        html = f.content
        doc = Document(html)
        title = doc.short_title()
        summary = doc.summary()
        return smart_str(title), smart_str(summary)
    except:
        return None, None
def extract_by_readability(html):
    document = Document(html)

    def strip_html(html):
        return re.sub(r'<[^<]+?>', '', html)

    return {
        'title': ensure_unicode(document.short_title()),
        'body': strip_html(ensure_unicode(document.summary())),
    }
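An alternative sketch of the same tag stripping, using BeautifulSoup's get_text() as several other examples in this collection do (assumes bs4 is installed; regex-based stripping can choke on malformed markup):

from bs4 import BeautifulSoup
from readability.readability import Document

def extract_by_readability_bs(html):
    # Same shape of result as above, but the body is stripped with bs4.
    document = Document(html)
    return {
        'title': document.short_title(),
        'body': BeautifulSoup(document.summary(), 'html.parser').get_text(),
    }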
def decode_doc(doc, url):
    #print('doc')
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    #print(*doc)
    for l in doc:
        if l.startswith(b'<meta') or l.startswith(b'<META'):
            if codec is None and (b'charset' in l):
                m = cs.match(l)
                codec = m.group(3).decode()
            if keywords is None and b'keywords' in l:
                m = pkey.match(l)
                if m:
                    keywords = m.group(3)
    sdoc = []
    for l in doc:
        try:
            l = l.decode(codec)
        except:
            l = ''
        sdoc.append(l)
    try:
        if keywords:
            keywords = keywords.decode(codec)
        else:
            #print(*sdoc, sep='\n')
            keywords = ''
        keywords = re.split(r'[ ,;\|]', keywords)
        #print(keywords.encode('utf8'))
    except:
        pass
    #if sum(len(x) for x in sdoc) < 1000: return
    doc = '\n'.join(sdoc)
    #if len(doc) < 1000: return
    try:
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except:
        return
    #print(doc.summary().encode('utf8'))
    #print(doc.short_title().encode('utf8'))
    data = {"url": url, 'keywords': keywords, 'title': title, 'content': content}
    return data
def try_readability():
    html = urllib.request.urlopen(ARTICLE).read()
    doc = Document(html)
    con = BeautifulSoup(doc.summary()).get_text()
    tit = doc.short_title()
    print("===READABILITY===")
    print("=CONTENT=")
    print(con)
    print("=TITLE=")
    print(tit)
def parse_news_content(self, response):
    for link in self.full_article_link_extractor.extract_links(response):
        request = response.request.replace(url=link.url)
        yield request
    item = self._create_item(response)
    if item is not None:
        doc = Document(response.body)
        item['title'] = doc.short_title()
        item['content'] = html2text.html2text(doc.summary())
        yield item
def extract_url_content(self, url=None):
    if not url:
        url = self.url
    url_parse = urlparse(url)
    headers = {}
    if url_parse.netloc != "t.co":
        user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1 Iceweasel/9.0.1"
        headers['User-Agent'] = user_agent
    content = requests.get(url, headers=headers)
    self.content_type = content.headers.get('content-type')
    self.status_code = content.status_code
    self.content = content.text
    self.url = self.clean_url(self.url)
    self.url = self.url_morph(content.url)
    self.image = self.find_taller_image(self.content)
    if self.image:
        self.logger.info("found image : %s" % self.image)
    self.url_parse = urlparse(self.url)
    if url_parse.netloc in oembed.keys():
        print "found oembed"
        mod = oembed[url_parse.netloc]
        self.content = mod.get_widget(url)
        self.summary = self.content
        self.title = os.path.basename(url_parse.path)
        self.content_type = "collectr/parsed"
        self.tags = [mod.get_tag()]
        self.tagstring = mod.get_tag()
        return
    if self.status_code >= 400:
        raise UrlExtractException("Can't extract content for %s (http<%d>)" % (url, content.status_code))
    elif "image" in self.content_type:
        print "log: content type : image"
        self.summary = """<img src="%s" />""" % self.url
        self.title = self.url
    elif "html" in self.content_type:
        doc = Document(self.content)
        self.summary = doc.summary()
        try:
            self.title = doc.short_title()
        except AttributeError:
            self.title = u"No title"
    else:
        self.summary = None
        self.title = os.path.basename(url_parse.path)
def import_html(results, content):
    content = Document(content)
    converter = HTML2Text()
    converter.body_width = 0
    body = content.summary()
    text = BeautifulSoup(body).get_text(" ")
    results.investigation.update(name=content.short_title(),
                                 import_md=converter.handle(body),
                                 import_text=text)
def extract(html):
    try:
        doc = Document(html)
        article = doc.summary()
        title = doc.short_title()
        return {
            'title': title,
            'article': html_to_text(article),
            'full_text': html_to_text(html)
        }
    except:
        logging.exception('extract html')
        return {}
def read_command(args):
    try:
        s_idx, t_idx = (int(x) for x in args.idx.split(':'))
        url = args.session.tabs[s_idx].entries[t_idx].url
    except:
        print 'Invalid index'
        return
    import requests
    from readability.readability import Document
    import html2text

    h = html2text.HTML2Text()
    h.inline_links = False
    h.ignore_images = True
    h.ignore_emphasis = True
    res = requests.get(url)
    if res.ok:
        article = Document(res.content)
        print article.short_title()
        print h.handle(article.summary())
    else:
        print res.headers['status']
def parse(filename):
    html = open(filename, encoding="latin").read()
    doc = Document(html)
    summary = doc.summary()
    summary = re.sub('(<map.*?</map>)', '', summary, flags=re.M)
    summary = re.sub(r"<img.*?usemap=.*?>", '', summary, flags=re.M)
    summary = re.sub(r'<a href="index.html"><img.*?/></a>', '', summary, flags=re.M)
    if 'href="index.html"' in summary:
        raise Exception("FAIIILEEED")
    print("<small>" + doc.short_title() + "</small>")
    print("<p>" + summary + "<p>")
    print("<p class='breakhere'></p>")
def parse_html(url):
    response = request(url)
    if not response:
        return response
    document = Document(response.content)
    doc = {
        'titulo': document.short_title(),
        'texto': document.summary(),
        'site': urlparse(url).netloc,
        'url': get_object_or_404(Url, url=url),
        'imagem': get_image(response.content, urlparse(url).netloc)
    }
    return doc
def read_extractor(html, url):
    '''readability extractor'''
    try:
        clean_doc = Document(html, url=url,
                             positive_keywords=",".join(POSITIVE_K),
                             negative_keywords=",".join(NEGATIVE_K))
        #summary = clean_doc.summary()
        article = clean_doc.article
        text = re.sub(" |\t", " ", bs(article, "lxml").get_text())
        title = clean_doc.short_title()
        return (title, clean_doc, text)
    except Exception as e:
        return False
def extractMainArticle(html):
    p = Document(html)
    readable_article = p.summary()
    readable_title = p.short_title()
    soup = BeautifulSoup(readable_article)
    text_nodes = soup.findAll(text=True)
    text = ''.join(text_nodes)
    #text = readable_title + " " + text
    #return text
    wtext = {"title": readable_title, "text": text}
    return wtext
def parse_with_readability(html):
    """
    Return {
        'title': '',
        'summary': ''
    }
    """
    doc = Document(html)
    return {
        'title': doc.short_title(),
        'summary': doc.summary(html_partial=True)
    }
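A possible way to call the helper above, assuming requests is available (the URL is purely illustrative, not from the original project):

import requests

resp = requests.get('https://example.com/article')  # hypothetical URL
parsed = parse_with_readability(resp.text)
print(parsed['title'])
print(parsed['summary'][:200])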
class WebInfoExtractor:
    def __init__(self, url):
        self.url = url
        self.readable = None
        self.page = None
        try:
            head = requests.head(url, headers=HEADERS)
        except requests.exceptions.RequestException:
            return
        if 'text/html' in head.headers.get('content-type', ''):
            try:
                html = requests.get(url, headers=HEADERS).text
            except requests.exceptions.RequestException:
                return
            self.readable = Document(html)
            self.page = BeautifulSoup(html, 'lxml')

    def get_title(self):
        return self.readable and self.readable.short_title()

    def get_date(self):
        return extract_date(self.url, self.page)

    def get_country(self):
        if not self.page:
            return None
        country = self.page.select('.primary-country .country a')
        if country:
            return country[0].text.strip()
        country = self.page.select('.country')
        if country:
            return country[0].text.strip()
        return None

    def get_source(self):
        if self.page:
            source = self.page.select('.field-source')
            if source:
                return source[0].text.strip()
        return tldextract.extract(self.url).domain

    def get_website(self):
        return urlparse(self.url).netloc
def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]
    il = FeedEntryItemLoader(parent=response.meta["il"])
    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is shorter
            # than the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)
    return il.load_item()
def getDoc2(url):
    t = time.time()
    # import urllib
    # html = urllib.urlopen(url).read()
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
    r = get(url, headers=headers)
    html = r.content
    doc = Document(html, url=url)
    readable_article = doc.summary()
    readable_title = doc.short_title()
    readable_article = readable_article.replace("http", "/?url=http")
    timeElapsed = int((time.time() - t) * 1000)
    fileSize = 0.7 + float(sys.getsizeof(readable_article) / 1000.0)
    fileSize = round(fileSize, 1)
    return {'title': readable_title, 'description': "", 'url': url, 'timeElapsed': timeElapsed,
            'content': readable_article, 'size': fileSize}
def make_readable(url):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    document = Document(html)
    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }
    return document_dict
def scrape(result):
    start_url = result.url
    response = requests.get(start_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    document = Document(response.content)
    body_soup = BeautifulSoup(document.summary(), 'html.parser')
    links = [Link(url=el['href']) for el in soup.select('a') if el.get('href')]
    for link in links:
        link.save()
    content = Content(
        result=result,
        html=soup.prettify(),
        title=document.short_title(),
        body_html=body_soup.prettify(),
        body=body_soup.get_text().strip(),
    )
    content.save()
    content.links.add(*links)
def main():
    url = sys.argv[1]
    #url = 'http://jandan.net/2016/01/04/safe-sex-education.html'
    config = get_login_info()
    apikey = config['apikey']
    html_text = getHtml(url)
    doc = Document(html_text)
    readable_article = doc.summary()
    readable_title = doc.short_title()
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    print "原文:"  # "Original text:"
    print final_article
    eng_article = baidu_translate(apikey, final_article, 'zh', 'en')
    print "英文:"  # "English:"
    print eng_article
    zh_article_back = baidu_translate(apikey, eng_article, 'en', 'zh')
    print "中文:"  # "Chinese:"
    print zh_article_back
def fetch_privacy_policy(policy_url):
    print('fetch_privacy_policy', policy_url)
    # Extract domain
    ext = tldextract.extract(policy_url)
    domain = ext.domain
    suffix = ext.suffix
    registered_domain = ext.registered_domain
    print('domain', domain)
    # Fetch policy page
    print('Fetch policy', policy_url)
    content = fetch(policy_url)
    if not content:
        return
    lowered = content.lower()
    # if not any(keyword in KEYWORDS for keyword in lowered.split()):
    #     print('No keyword found')
    #     return
    if len(lowered) < 1600:
        print('Too short:', len(content))
        return
    # Extract content
    readability = Document(content)
    title = readability.short_title()
    clean_content = readability.summary()
    lang = langdetect.detect(clean_content)
    return {
        "raw_content": content,
        "clean_content": clean_content,
        "lang": lang,
        "title": title,
        "url": policy_url,
        "suffix": suffix,
        "registered_domain": registered_domain
    }
def extract(self, response, link):
    #extract
    # for link in link_list:
    #     response = ulib.urlopen(link).read()
    # get relevant content using readability
    readable = Document(response)
    body = readable.summary()
    title = readable.short_title()
    # strip extra html readability leaves in, like p tags
    title = html.fromstring(title).text_content()
    body = html.fromstring(body).text_content()
    title = condense_whitespace(title)
    body = condense_whitespace(body)
    links = self.extra['links']
    try:
        d = unicode(self.extra['dates'][links.index(link)])
    except:
        # pr web rss feeds don't have pubdate
        html_body = html.fromstring(response)
        d = re.sub('.*\(.*\)', '', html_body.find_class('releaseDateline')[0].text_content())
    #print d
    try:
        date = parse(d)
    except:
        date = datetime.now()
    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.extra['source'][links.index(link)]}
    return doc
def evernotify(html, url):
    doc = Document(html, url=url)
    html = doc.summary()
    allowed_tags = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'bdo', 'big',
                    'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col',
                    'colgroup', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'font',
                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
                    'kbd', 'li', 'map', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small',
                    'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td',
                    'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'ul', 'var', 'xmp']
    disallowed_attrs = ['id', 'class', 'onclick', 'ondblclick', 'accesskey',
                        'data', 'dynsrc', 'tabindex', 'content']
    soup = bs4(html)
    body = soup.body
    body.name = "en-note"
    pid = 0
    for tag in body.find_all(lambda b: True, recursive=True):
        if tag.name not in allowed_tags:
            tag.name = "span"
        for attr in filter(lambda d: tag.attrs.get(d, False), disallowed_attrs):
            del(tag[attr])
        for attr in filter(lambda a: a.startswith('item'), tag.attrs.keys()):
            del(tag[attr])
    body = body.prettify()
    body = '<?xml version="1.0" encoding="UTF-8"?>\n\
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">\n%s' % body
    body = body.encode('utf-8')
    print body
    return doc.short_title(), body
def simplify(self):
    if not self.doc:
        raise StripError("Not a html document")
    html_body = Document(self.doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(img.get('src'), stream=True)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            pass
    html = "<h1>" + title + "</h1>" + summary
    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = regex.sub('', html)
    return html, images
def process(doc):
    html_body = Document(doc)
    summary = html_body.summary()
    title = html_body.short_title()
    images = []
    for img in html_body.reverse_tags(html_body.html, 'img'):
        try:
            fp = tempfile.NamedTemporaryFile(dir=settings.BASE_DIR)
            r = requests.get(img.get('src'), stream=True)
            write_file(r, fp)
            images.append(fp)
        except Exception:
            pass
    html = '<h1>' + title + '</h1>' + summary
    regex = re.compile('\n*', flags=re.IGNORECASE)
    html = '<p>{}</p>'.format(regex.sub('', html))
    soup = BeautifulSoup(html, 'lxml')
    text = _get_plain_text(soup)
    return text, images
def extractTitle(html):
    if html == "":
        return None
    try:
        doc = Document(html)
        short_title = doc.short_title()
        title = doc.title()
        if short_title is not None and short_title.strip() != "":
            title = short_title
        for delimiter in ['|', '-', '::', '/', '_']:
            if delimiter in title:
                parts = title.split(delimiter)
                if len(parts[0]) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1]) >= 4:
                    title = parts[-1]
                    break
        return title
    except:
        pass
    return None
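To make the delimiter heuristic above visible in isolation, here is the same loop run on a hypothetical raw title string (no Document involved):

title = 'Breaking Story | Example News'  # illustrative input
for delimiter in ['|', '-', '::', '/', '_']:
    if delimiter in title:
        parts = title.split(delimiter)
        if len(parts[0]) >= 4:
            title = parts[0]
            break
        elif len(parts[-1]) >= 4:
            title = parts[-1]
            break
print(title)  # 'Breaking Story ' (site name trimmed, trailing space kept)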
def parse(self, response):
    if self.max_crawl < 1:
        return
    # response = scrapy.http.TextResponse(response)
    doc = Document(response.text)
    title = doc.short_title()
    content = pq(doc.summary()).text()
    if len(content) > 150:
        item = Scrapy1Item()
        item["content"] = content  # .encode("utf-8")
        item["title"] = title  # .encode("utf-8")
        item["url"] = response.url
        item["crawler_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.max_crawl -= 1
        yield item
    for url in response.xpath("//a/@href").extract():
        if "javascript" in url and "http" not in url:
            continue
        url = urllib.parse.urljoin(get_base_url(response), url)
        host = urlparse(url).netloc
        if host not in self.host_list:
            continue
        print("crawl {}".format(url))
        yield scrapy.Request(url, callback=self.parse)
import yql
import urllib

con = Connection()
db = con.learner
news = db.news
yqlpublic = yql.Public()
rsscfg = open("rss.cfg", "r")
for line in rsscfg.readlines():
    category = line.split(":")
    query = 'select link,pubDate from rss where url ="http://rss.news.yahoo.com/rss/%s"' % category[0]
    results = yqlpublic.execute(query)
    for result in results.rows:
        html = urllib.urlopen(result["link"]).read()
        doc = Document(html)
        news.insert(
            {
                "art_id": 0,
                "content": doc.summary(),
                "description": "",
                "hardness": 0,
                "pub_date": result["pubDate"],
                "ranking": 0,
                "rss_id": category[1],
                "time_index": 1,
                "tiny_image": "",
                "title": doc.short_title(),
                "url": result["link"],
            }
        )
def extract_with_python_readability(raw_content):
    doc = Document(raw_content)
    return [u'' + doc.short_title().strip(),
            u'' + doc.summary().strip()]
def extract(self, body, url):
    doc = Document(body)
    title = doc.short_title()
    content = doc.summary()
    self.save_article(url, title, content)
    self.logger.info('Article extract, title=%s, content=%s' % (title, content[:100]))
class Page(object): """ Basic Page """ def __init__(self, item, config): """ Mapping item and config """ for k, v in item.items(): if k in ["url", "source_url", "depth", "type"]: setattr(self, k, v) for k, v in config.items(): if k in ['filter_lang','max_depth', "query", "directory", "filter", "short_export", "date"]: setattr(self, k, v) ##logger.debug("Page Init") self.status = True self.load_default() def load_default(self): self.msg = "" self.code = 100 self.status = True return self def process(self, filter_text=True): self.check_depth() self.valid_url() self.fetch() self.clean_article() self.extract() self.check_lang() if filter_text: if self.filter is not False: self.filter_text() return self.status @check_status #@debug def check_depth(self): '''checking depth''' if self.depth is False or self.depth is None: self.depth = 0 #logger.debug("Page check depth") if self.depth > self.max_depth: self.code = "102" self.msg = "Depth exceed max_depth for page" %(self.max_depth) self.status = False return self.status else: return self @check_status #@debug def valid_url(self): '''checking url format and validity''' for k, v in self.parse_link(self.url).items(): if k is None: continue if v is None or v == "": setattr(self, k, "") else: setattr(self, k, v) try: if self.scheme not in ACCEPTED_PROTOCOL: self.msg = 'URL: Wrong protocol %s' % self.scheme self.status = False self.code = 804 return self.status except Exception as e: logging.warning("%s" %str(e)) pass try: if self.filetype in BAD_TYPES: self.msg = 'URL: Invalid webpage type %s' % self.filetype self.status = False self.code = 806 return self.status except Exception as e: pass try: if self.domain in BAD_DOMAINS: self.msg = 'URL: Bad domain %s' % self.domain self.status = False self.code = 807 return self.status except Exception as e: logging.warning("%s" %str(e)) pass try: if self.subdomain in BAD_DOMAINS: self.msg = 'URL: Bad subdomain %s' % self.subdomain self.status = False self.code = 807 return self.status except Exception as e: logging.warning("%s" %str(e)) pass try: if self.path in BAD_PATHS: self.msg = 'URL: Bad path %s' % self.path self.status = False self.code = 807 return self.status except Exception as e: logging.warning("%s" %str(e)) pass if filter.match(self.url): self.msg = 'URL: Blacklisted url' self.status = False self.code = 808 return self.status return self @check_status def fetch(self): '''downloading page''' try: req = requests.get(self.url, allow_redirects=True, timeout=3) req.raise_for_status() try: self.html = req.text self.content_type = req.headers['content-type'] if 'text/html' not in self.content_type: self.msg ="Control: Content type is not TEXT/HTML" self.code = 404 self.status = False return self.status #Error on ressource or on server elif req.status_code in range(400,520): self.code = int(req.status_code) self.msg = "Control: Request error on connexion no ressources or not able to reach server" self.status = False return self.status else: if self.html == "" or self.html is None: self.msg = "Error loading HTML from request" self.code = 405 self.status = False return self.status try: self.html = self.html self.tree = lxml.html.document_fromstring(self.html) #cleaning with lxml it's fun! 
self.tree = cleaner.clean_html(self.tree) self.tree.make_links_absolute(self.url) self.doc = lxml.html.tostring(self.tree) self.doc = (self.doc).replace(unichr(160), " ") self.doc = re.sub(re.compile("\r+|\n+|\t+|\s+")," ",self.doc) if self.doc == "" or self.doc is None: self.msg = "Error loading HTML from request" self.code = 405 self.status = False return self.status else: return self except Exception as e: self.msg = "Error loading HTML: "+str(e) self.code = 405 self.status = False return self.status except Exception as e: self.msg = "Requests: answer was not understood %s" %e self.code = 400 self.status = False return self.status except Exception as e: #logger.warning(e) self.msg = "Incorrect link url" try: self.code = req.status_code self.status = False return self.status except Exception as e: self.code = 400 self.status = False return self.status @check_status def clean_article(self): try: self.clean_doc = Document(self.doc,url = self.url, positive_keywords= "entry-content,post,main,content,container,blog,article*,post,entry", negative_keywords="like*,ad*,comment.*,comments,comment-body,about,access,navigation, sidebar.*?,share.*?,relat.*?,widget.*?") self.article = self.clean_doc.summary() self.text = re.sub(" |\t", " ",bs(self.article).get_text()) self.title = self.clean_doc.short_title() if self.text == "" or self.text == u'': self.msg = "Error extracting Article and cleaning it" self.code = 700 self.status = False return self.status if self.title == '': self.title = u'' return self except AttributeError as e: self.msg = "Error loading HTML: %s" %str(e) self.code = 400 self.status = False return self.status @check_status #@debug def extract(self): '''extracting info from page''' if self.doc is not None: links = list(set([n.get('href') for n in bs(self.article).find_all("a")])) links = [n for n in links if n != self.url] #get links, cited_links, cited_links_ids, cited_domains self.outlinks = self.parse_outlinks(links) self.get_meta() return self else: #~ #self.msg = str(#logger.debug("ParserError")) self.msg = "Extract Error" self.code = 701 self.status = False return self.status def parse_link(self, url): '''parsing link info''' link = {"url":url} parsed_url = urlparse(url) for k in ["scheme", "netloc", "path", "params", "query", "fragment"]: if k == "query": link["url_query"] = getattr(parsed_url,k) else: link[k] = getattr(parsed_url,k) tld_dat = tldextract.extract(url) for k in ["domain", "subdomain", "suffix"]: link[k] = getattr(tld_dat,k) #~ link["subdomain"] = tld_dat.subdomain #~ link["domain"] = tld_dat.domain.lower() if link["subdomain"] not in ["www", "ww1", "ww2", ""]: link["url_id"] = link["subdomain"]+"_"+link["domain"] else: link["url_id"] = link["domain"] link["extension"] = link["suffix"] del link["suffix"] link["chunks"] = [x for x in link["path"].split('/') if len(x) > 0] link["internal_depth"] = len(link["chunks"]) link["filetype"] = re.split(".", link['netloc'])[-1] return link def parse_outlinks(self, links): '''creating outlinks from page''' self.links = [self.parse_link(url) for url in set(links) if url is not None and url != ""] self.cited_links = [n["url"] for n in self.links] self.cited_links_ids = [n["url_id"] for n in self.links] self.cited_domains = [n["domain"] for n in self.links] self.outlinks = [{"url": n["url"], "url_id":n["url_id"], "source_url": self.url, "depth": self.depth+1, "type":"page"} for n in self.links] return self.outlinks def get_meta(self): self.generators = [] self.meta = {} for n in bs(self.doc).find_all("meta"): name = 
n.get("name") prop = n.get("property") content = n.get("content") if name is not None and name not in ["type", "viewport"]: if name.lower() in ["generator"]: self.generators.append(content) else: self.meta[re.sub("og:|DC.", "", name)] = content #~ if prop is not None: self.meta[re.sub("og:|DC.", "", prop)] = content try: self.keywords = self.meta["keywords"] except KeyError: self.keywords = [""] return self.meta @check_status #@debug def check_lang(self): '''checking lang''' try: self.lang = detect(self.text) except Exception as e: logging.warning("No lang detected in article") try: self.lang = detect(self.title) except Exception as e: logging.warning("No lang detected in title") self.lang = None if self.filter_lang is not False: if self.lang == self.filter_lang: return self else: self.status = False return self.status @check_status #@debug def filter_text(self): '''filter_text: checking relevancy''' q = Query(self.query, self.directory) #print "Debug query", q.query doc = {"content": self.text, "title": self.title} relevant = q.match(doc) if relevant is False: self.code = 800 self.msg = "Article Query Filter: text not relevant" self.status = False return self.status else: self.status = True return self def format_export(self): '''format export''' #for n in ["url_id","url", "cited_links", "cited_links_ids","source_url", "cited_domains", "title", "text", "keywords", "generators", "extension", "filetype", "depth", "crawl_nb", "status", "msg", "date", "code", "nb", "total"]: pass #@debug def set_data(self): '''Set data : creating default page info''' data = {} for n in ["date", "url", "url_id","url", "cited_links", "cited_links_ids","source_url", "cited_domains", "title", "text","html", "keywords", "generators", "extension", "filetype", "depth", "crawl_nb", "status", "msg", "date", "code", "lang"]: #unique info if n in ["url_id","url","extension", "filetype", "depth", "crawl_nb", "source_url", "type", "lang"]: if n in ["type"]: if self.depth == 0: data["type"] = "source" else: if self.status is True: data["type"] = "page" else: data["type"] = "log" else: try: #conserver le type de donnée data[n] = self.__dict__[n] except KeyError: if n in ["crawl_nb", "depth"]: data[n] = 0 else: data[n] = None #multiple info else: try: data[n] = [self.__dict__[n]] except KeyError: data[n] = [None] #meta_data #~ for k, v in self.meta.items(): #~ data["meta_"+k] = v return data #@debug def add_data(self): '''Add data : updating values of page_info adding contextual info to existing''' data = {} for n in ["cited_links", "cited_links_ids", "cited_domains", "title", "text","html", "keywords", "generators", "status", "code", "msg", "date"]: try: data[n] = {"$each":[self.__dict__[n]], "$position":0} except KeyError: data[n] = {"$each":[None], "$position":0} return data def get_status(self): data = {} for k in ["status", "date", "code", "msg"]: try: data[k] = self.__dict__[k] except KeyError: data[k] = None return data
# encoding:utf-8
# import html2text
import requests
import time
import re
from readability.readability import Document

url = "http://world.huanqiu.com/exclusive/2016-07/9209839.html"
# res = requests.get('http://finance.sina.com.cn/roll/2019-02-12/doc-ihrfqzka5034116.shtml')
res = requests.get(url)
st = time.time()
d = Document(res.content)
# Get the news title
readable_title = d.short_title()
print(readable_title)
# Get the content and clean it up
readable_article = d.summary()
# print(readable_article)
print(d.get_clean_html())
print("time: {}".format(time.time() - st))
# text_p = re.sub(r'</?div.*?>', '', readable_article)
# text_p = re.sub(r'((</p>)?<a href=.*?>|</a>(<p>)?)', '', text_p)
# text_p = re.sub(r'<select>.*?</select>', '', text_p)
# print(text_p)
class ReadHtml:
    """
    Extract main article content from a webpage
    @see http://github.com/buriy/python-readability
    """

    def __init__(self, url):
        self.url = url
        self.html = self.get_html()
        self.text = ReadHtml.strip_tags(self.html)
        try:
            self.doc = Document(self.html)
        except TypeError as e:
            log.error('{}: {}'.format(self.url, e))
        # Special cases for URLs at these domains:
        #   news.ycombinator.com
        #   slashdot.org
        #   reddit.com
        # - Attempt to download all linked pages
        # If the bookmark is at the root level (/) mirror the entire site
        # Attempt to find the Hacker News comments thread
        # Special cases for images and PDFs
        # - use ReadImg and ReadPdf instead

    def get_html(self):
        """ Download the HTML of the URL """
        try:
            return self.html
        except AttributeError:
            self.request = urllib.request.urlopen(self.url)
            self.html_bytes = self.request.read()
            try:
                return self.html_bytes.decode('utf-8')
            except UnicodeDecodeError:
                return self.html_bytes.decode('iso-8859-1')

    def get_http_code(self):
        return int(self.request.getcode())

    def get_http_headers(self):
        return ReadHtml.convert_http_headers(self.request.info().items())

    @staticmethod
    def convert_http_headers(items):
        headers = {}
        for header in items:
            headers[header[0]] = header[1]
        return headers

    def get_content(self):
        """ Get the readable main content from a webpage """
        try:
            return self.doc.summary()
        except (TypeError, AttributeError, Unparseable) as e:
            log.error('{}: {}'.format(self.url, e))
            return ''

    def get_title(self):
        """ Get the "readable title" from a webpage """
        try:
            return self.doc.short_title()
        except (TypeError, AttributeError, Unparseable) as e:
            log.error('{}: {}'.format(self.url, e))
            return ''

    def get_text(self):
        """ Return the extracted text from the webpage """
        return self.text

    def get_links(self):
        """ Return a list of URL strings captured from the article body """
        pass

    def is_article(self):
        """
        Can the webpage be read as an article?
        TODO: This can be more sophisticated, such as checking for <p> tags,
        setting a higher threshold for number of characters
        TODO: Find out if Readability will return this value in the API,
        even though it misses a lot
        """
        if len(self.text) > 0:
            return True
        return False

    @staticmethod
    def strip_tags(html_string):
        """
        Strip tags and extract the text from the HTML string
        @see http://stackoverflow.com/a/925630
        """
        s = MLStripper()
        s.feed(str(html_string))
        return s.get_data()