def test_best_elem_is_root_and_passing(self):
    sample = (
        '<html class="article" id="body">'
        '  <body>'
        '    <p>1234567890123456789012345</p>'
        '  </body>'
        '</html>'
    )
    doc = Document(sample)
    doc.summary()
def test_si_sample(self):
    """Using the si sample, load article with only opening body element"""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample)
    doc.parse(["summary"])
    res = doc.summary()
    self.assertEqual('<html><body><h1>Tigers-Roya', res[0:27])
def convert(link):
    """Use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it.
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def process_item(self, article, spider):
    doc = Document(article['text'])
    article['text'] = strip_tags(doc.summary())
    # hashlib.sha256 needs bytes on Python 3, so encode the url first
    article['hash'] = hashlib.sha256(article['url'].encode('utf-8')).hexdigest()
    return article
def get(self):
    url = self.get_argument("url", None)  # e.g. https://www.ifanr.com/1080409
    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if doc:
        self.res = dict(doc)
        return self.write_json()
    try:
        sessions = requests.session()
        sessions.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/34.0.1847.131 Safari/537.36')
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)
        doc = Document(response.text)
        title = doc.title()
        summary = doc.summary()
        markdown = html2text.html2text(summary)
        markdown = markdown.replace('-\n', '-')
        markdown = markdown.strip()
        res = {
            'url': url,
            'title': title,
            'markdown': markdown,
        }
        if title and markdown:
            webcache = Webcache
            webcache.new(res)
        self.res = res
        self.write_json()
    except Exception as e:
        print(e)
def test_lxml_obj_result(self):
    """Feed Document an lxml object instead of an HTML string.
    Expect an lxml response."""
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    sample = lxml.html.document_fromstring(
        load_sample('nyt-article-video.sample.html'), parser=utf8_parser)
    doc = Document(sample, url='http://nytimes.com/')
    res = doc.summary()
    self.assertFalse(isinstance(res, basestring))
def test_correct_cleanup(self):
    sample = """
    <html>
        <body>
            <section>test section</section>
            <article class="">
                <p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
                <p>More text is written here, and contains punctuation and dots.</p>
            </article>
            <aside id="comment1"/>
            <div id="comment2">
                <a href="asd">spam</a>
                <a href="asd">spam</a>
                <a href="asd">spam</a>
            </div>
            <div id="comment3"/>
            <aside id="comment4">A small comment.</aside>
            <div id="comment5"><p>The comment is also helpful, but it's still not the correct item to be extracted.</p>
            <p>It's even longer than the article itself!"</p></div>
        </body>
    </html>
    """
    doc = Document(sample)
    s = doc.summary()
    # print(s)
    assert 'punctuation' in s
    assert 'comment' not in s
    assert 'aside' not in s
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample)
    doc.parse(["summary"], html_partial=True)
    res = doc.summary()
    self.assertEqual('<div><h1>Tigers-R', res[0:17])
def test_many_repeated_spaces(self):
    long_space = ' ' * 1000000
    sample = '<html><body><p>foo' + long_space + '</p></body></html>'
    doc = Document(sample)
    s = doc.summary()
    assert 'foo' in s
def test_si_sample(self):
    """Using the si sample, load article with only opening body element"""
    sample = load_sample('si-game.sample.html')
    doc = Document(
        sample,
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary()
    self.assertEqual('<html><body><div><div class', res[0:27])
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample("si-game.sample.html")
    doc = Document(
        sample,
        url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html")
    res = doc.summary(enclose_with_html_tag=True)
    self.assertEqual('<div><div class="', res[0:17])
def get(self):
    urls = self.get_query_arguments('url')
    if urls and len(urls) == 1:
        url = urls[0]
        doc = Document(requests.get(url).text)
        self.write(smartypants(doc.summary()))
        self.write(STYLE)
    else:
        self.write("Please provide ?url=[your-url]")
def transform(self, row, chan):
    row['response'] = resolve_future(row['response'])
    doc = Document(row['response'].content)
    row['title'] = doc.title()
    summary = doc.summary()
    row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()
    yield row
def extract_article(url, ip):
    """Extracts the article using readability."""
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        summary = unicode(doc.summary())
        title = unicode(doc.title())
        return title, summary
    else:
        # return a pair so callers can always unpack (title, summary)
        return None, None
def extract_article(html, title=None):
    """Wraps readability.Document and returns the article's title and content."""
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # invoke the summary method to trigger readability's magic
    doc.summary(html_partial=True)
    # obtain the article as an HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    bad_attr = elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS)
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # invoke the summary method to trigger readability's magic
        doc.summary(html_partial=True)
        # obtain the article as an HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # If the extracted title is not a subset of the given title, use the
        # given title instead (we assume it is more accurate, though possibly
        # with some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive + '/*.html'):
        fname = os.path.basename(html) + '.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
        with open(savepath, 'w') as saving:
            json.dump(data, saving)
def preliminary_parse(self):
    if not self.is_downloaded:
        raise Exception("not downloaded")
    try:
        d = Document(self.html)
        self._readability_title = d.short_title()
        self._readability_text = d.summary()
        logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
        logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
        if self._readability_title and self._readability_text:
            return
    except Exception as e:
        logging.warning("error while doing readability parse: {0}".format(str(e)))
    logging.debug("falling back to newspaper parse")
    self.newspaper_article.parse()
    logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
    logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
def get(self):
    sharetype = self.get_argument("sharetype", "goodlink")
    link = self.get_argument("link", '')
    user_id = self.current_user["user_id"]
    assert link
    url = link
    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if not doc:
        sessions = requests.session()
        sessions.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/34.0.1847.131 Safari/537.36')
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)
        logger.info('response.encoding {}'.format(response.encoding))
        doc = Document(response.text)
        doc_title = doc.title()
        summary = doc.summary()
        _markdown = html2text.html2text(summary)
        _markdown = _markdown.replace('-\n', '-').strip()
        res_webcache = {
            'url': url,
            'title': doc_title,
            'markdown': _markdown,
        }
        if _markdown:
            webcache = Webcache
            webcache.new(res_webcache)
    else:
        logger.info('already cached')
        doc_title = doc.title
    res = {
        'title': doc_title,
        'sharetype': sharetype,
        'link': link,
        'user_id': user_id,
    }
    share = Share.new(res)
    user = User.by_sid(user_id)
    user.user_leaf += 10
    user.save()
    self.redirect("/share/" + str(share.id))
def complement(self):
    for entry in self.entries:
        try:
            response = requests.get(entry.url, timeout=10)
        except requests.RequestException as excp:
            logger.warn('Exception requesting article %s: %s',
                        entry.url, excp.message)
            continue
        document = Document(response.content, url=response.url)
        # Image extraction first
        document._html()  # trigger parsing
        images = document.html.xpath('//meta[@property="og:image"]/@content')
        images += document.html.xpath('//meta[@name="twitter:image:src"]/@content')
        # Content extraction second
        entry.url = response.url
        entry.image = (images or [''])[0]
        entry.title = document.short_title()
        entry.content = document.summary()
        yield entry
def extract(self, item):
    """Creates a readability document and returns an ArticleCandidate
    containing the article title and text.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    doc = Document(deepcopy(item['spider_response'].body))
    description = doc.summary()

    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name
    article_candidate.title = doc.short_title()
    article_candidate.description = description
    article_candidate.text = self._text(item)
    article_candidate.topimage = self._topimage(item)
    article_candidate.author = self._author(item)
    article_candidate.publish_date = self._publish_date(item)
    article_candidate.language = self._language(item)

    return article_candidate
def parse_web_page(text):
    """Generic web page parser using readability. Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')
    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(e.message)
    else:
        return doc.short_title(), doc.summary(True)
def remove_tags(text):
    return TAG_RE.sub('', text)

superbowl_media = get_media_data('tweets_#superbowl')
# superbowl_media.to_pickle('superbowl_media')
# superbowl_media = pd.read_pickle('superbowl_media')
target_day = superbowl_media[superbowl_media['day'] == 1]
target_time = target_day[target_day['hour'] == 19]
target = target_time[target_time['name'] == 'YahooSports']
url = target.iloc[0]['media'][0]['expanded_url']

import requests
from readability import Document

response = requests.get(url)
doc = Document(response.text)
print(remove_tags(doc.summary()))

t = open('article.txt', 'w')
t.write(remove_tags(doc.summary()))
t.close()

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

file = "article.txt"
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer = LexRankSummarizer()
summary = summarizer(parser.document, 5)

print(doc.title())
for sentence in summary:
    print(sentence)
def predict():
    df = pd.read_csv('../webapp/revised_rating_data')
    lemmatized = df['lemmatized'].tolist()
    X_class = df['lemmatized']
    y_class = df['point_non-bad']
    X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
        X_class, y_class, test_size=0.25, random_state=42)
    tvec_class = TfidfVectorizer(stop_words='english')
    tvec_class.fit(X_train_class.values.astype('U'))
    X_train_class = tvec_class.transform(X_train_class.values.astype('U'))
    lr_class = LogisticRegression()
    lr_class.fit(X_train_class, y_train_class)

    data = pd.read_csv('../webapp/revised_data')
    X = df['lemmatized']
    y = data['topics']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    tvec = TfidfVectorizer(stop_words='english')
    tvec.fit(X_train.values.astype('U'))
    X_train = tvec.transform(X_train.values.astype('U'))
    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    if request.method == 'POST':
        message = request.form['message']
        data = message
        response = requests.get(data)
        doc = Document(response.text)
        full_text = doc.summary(html_partial=True)
        full_text = full_text.replace(r"\n", " ")
        full_text = full_text.replace(r"\t", " ")
        full_text = full_text.replace(r"/", " ")
        full_text = full_text.replace(r"<p>", " ")
        full_text = normalize('NFKD', full_text)
        full_text = full_text.split('< p>')

        TAG_RE = re.compile(r'<[^>][^>]+>')

        def remove_tags(text):
            return TAG_RE.sub(' ', text)

        term_text = list(map(remove_tags, full_text))
        term_frame = pd.DataFrame(np.array(term_text), columns=['quoteText'])

        def text_to_words(titletext):
            letters_only = re.sub("[^a-zA-Z]", " ", titletext)
            words = letters_only.lower().split()
            lemmatizer = WordNetLemmatizer()
            tokens_lem = [lemmatizer.lemmatize(i) for i in words]
            return ' '.join(tokens_lem)

        lemm_text = []
        for text in term_frame['quoteText']:
            lemm_text.append(text_to_words(text))

        vect_class = tvec_class.transform(lemm_text).toarray()
        prediction_class = pd.DataFrame(lr_class.predict_proba(vect_class),
                                        columns=['warning', 'non-warning'])
        vect = tvec.transform(lemm_text).toarray()
        prediction = pd.DataFrame(lr.predict(vect), columns=['pred_topic'])

        results = pd.merge(term_frame, prediction,
                           left_index=True, right_index=True)
        results = pd.merge(results, prediction_class,
                           left_index=True, right_index=True)
        results = results.sort_values('non-warning')
        my_prediction = results["warning"].mean()
        # results = results[results['warning'] > 0.3]

        topics = []
        topicIndx = []
        topicContent = []
        for i in results['pred_topic']:
            if i not in topics:
                topics.append(i)
        for i in topics:
            topic = results[results['pred_topic'] == i]
            for j in topic.index:
                topicContent.append(topic.quoteText[j])
                topicIndx.append(i)
        df1 = pd.DataFrame({'topic': topicIndx, 'content': topicContent})
        df1 = df1.replace('\n', '', regex=True)
        df1 = df1.replace('<i>', '', regex=True)
        df1 = df1.replace(' ', '', regex=True)
        return render_template('result-Copy1.html', prediction=my_prediction,
                               df1=df1.to_html())
def test_utf8_kanji(self):
    """Using the UTF-8 kanji sample, load article which is written in kanji"""
    sample = load_sample("utf-8-kanji.sample.html")
    doc = Document(sample)
    res = doc.summary()
def format_html(cls, row, media_path, content=None, custom_html=False):
    media_dir, file_path = os.path.split(media_path)
    resource_dir = os.path.join(settings.ARCHIVE_LOCATION, 'resources', str(row.id))
    resource_link = '/{}/{}/{}/{}'.format(row.usr.username, row.directory,
                                          str(row.id), 'resources')
    if not os.path.exists(resource_dir):
        os.makedirs(resource_dir)
    if not content:
        content = ""
        with open(media_path, encoding='utf-8', mode='r') as fd:
            content = fd.read()
    soup = BeautifulSoup(content, 'lxml')
    for script in soup.find_all('script'):
        script.decompose()
    url_path = row.url
    ourl = urlparse(url_path)
    ourld = ourl.scheme + '://' + ourl.netloc
    link_list = soup.find_all(['a', 'link', 'img'])
    for link in link_list:
        if link.name == 'img':
            lnk = link.get('src', '')
        else:
            lnk = link.get('href', '')
        if lnk and lnk != '#':
            if link.name == 'img' or (link.name == 'link' and '.css' in lnk):
                lnk = dbxs.format_link(lnk, url_path)
                lnk_bytes = bytes(lnk, 'utf-8')
                h = hashlib.sha256(lnk_bytes)
                lnk_hash = h.hexdigest()
                if link.name == 'img':
                    link['src'] = resource_link + '/' + lnk_hash
                    if custom_html:
                        link['class'] = 'card-img-top'
                else:
                    lnk_hash = lnk_hash + '.css'
                    link['href'] = resource_link + '/' + lnk_hash
                file_image = os.path.join(resource_dir, lnk_hash)
                if not os.path.exists(file_image):
                    cls.vnt_noblock.get(lnk, out=file_image)
                    logger.info('getting file: {}, out: {}'.format(lnk, file_image))
            elif lnk.startswith('http'):
                pass
            else:
                nlnk = dbxs.format_link(lnk, url_path)
                if link.name == 'img':
                    link['src'] = nlnk
                    if custom_html:
                        link['class'] = 'card-img-top'
                else:
                    link['href'] = nlnk
    if custom_html:
        ndata = soup.prettify()
        if soup.title:
            title = soup.title.text
        else:
            title = row.url.rsplit('/')[-1]
        data = Document(ndata)
        data_sum = data.summary()
        if data_sum:
            nsoup = BeautifulSoup(data_sum, 'lxml')
            if nsoup.text.strip():
                data = cls.custom_template(title, nsoup.prettify(), row)
            else:
                data = cls.custom_soup(ndata, title, row)
        else:
            data = cls.custom_soup(ndata, title, row)
    else:
        data = soup.prettify()
    return bytes(data, 'utf-8')
def selectalgo(search_name, _PATH):
    jenableparallel = True
    try:
        jieba.enable_parallel(2)
    except Exception:
        jenableparallel = False
        print("This env can't enable jieba parallel")

    link = "https://zh.wikipedia.org/wiki/" + search_name
    site = requests.get(link)
    text = BeautifulSoup(site.content, "html.parser")
    wikiTitle = text.find(id="firstHeading").getText()
    text = text.find(id="mw-content-text").extract()
    decolist = [
        "hatnote", "infobox", "navbox", "vertical-navbox", "toc",
        "mw-editsection", "reference", "plainlist", "plainlists",
        "references-column-width", "refbegin"
    ]
    # decompose nodes matching the class blacklist above
    for deco in decolist:
        for s in text.find_all(class_=deco):
            s.decompose()
    for s in text.find_all("sup"):
        s.decompose()
    if text.find(id="noarticletext"):
        print("noarticletext")
        return "noarticletext", None

    selectpos = ["l", "n", "nr", "v", "vn", "eng"]  # POS tags to keep
    tags = jieba.analyse.extract_tags(OpenCC('tw2sp').convert(text.getText()),
                                      topK=20, withWeight=True,
                                      allowPOS=selectpos)
    bantag = ["編輯", "條目"]  # banned wiki tags
    taglist = Taglist()
    for tag, wei in tags:
        if (OpenCC('s2twp').convert(tag) in bantag
                or OpenCC('s2twp').convert(tag) in search_name):
            continue
        print(tag, wei)
        taglist.append(Tag(tag, wei))

    header = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) "
                      "AppleWebKit/525.13 (KHTML, like Gecko) "
                      "Chrome/0.2.149.27 Safari/525.13"
    }
    search_results = google.search(search_name)
    banword = [
        "ppt", "slide", "pdf", "news", "tv", "facebook.com", "平台", "平臺",
        "books.com", "course", "課程", "偽基", "youtube.com", "cw.com",
        "www.104.com", "udn.com", "KKTIX", "pcschool.com"
    ]
    selectsite = []
    opcc = OpenCC('tw2sp')
    for i, res in enumerate(search_results):
        print(res.name, "{}/{}".format(i + 1, len(search_results)))
        print(res.link)
        banflag = False
        for bw in banword:
            if bw in res.name or bw in res.link:
                print("<{}>".format(bw))
                banflag = True
                break
        if banflag:
            continue
        try:
            response = requests.get(res.link, headers=header)
        except Exception:
            print("something went wrong")
        else:
            if "wikipedia" in res.link and False:  # disabled branch
                print("iswiki")
                soup = text
            else:
                doc = Document(response.text)
                newhtml = doc.summary()
                converted = opcc.convert(newhtml)
                soup = BeautifulSoup(converted, "html.parser")
            words = jbps.cut(soup.get_text())
            score = 0
            tagset = set()
            for word, _ in words:
                index = taglist.isInName(word)
                if index >= 0 and index not in tagset:
                    score += taglist[index].weight
                    tagset.add(index)
            if score > 0:
                webname = ""
                offset = 7
                if res.link[offset] == '/':
                    offset += 1
                for c in res.link[offset:]:
                    if c != '/':
                        webname += c
                    else:
                        break
                print(webname)
                selectsite.append(
                    Selected(res.name, webname, res.link, score,
                             res.description, soup))
    if jenableparallel:
        jieba.enable_parallel()
    return wikiTitle, sorted(selectsite, key=lambda s: s.score,
                             reverse=True)[:5]
def valid_request(request):
    # "text/plain" is 10 characters, so the original [:9] slice could never
    # match it; startswith checks both prefixes correctly
    if not request.headers["content-type"].startswith(("text/html", "text/plain")):
        return False
    return True


def get_site_content(link):
    """Try to extract site content from a url."""
    rv = ""
    try:
        r = requests.get(link, timeout=15.0)
    except requests.exceptions.RequestException as e:
        logger.warning("Failed loading URL '{}': {}".format(link, e))
    else:
        if valid_request(r):
            # extract the (most likely) main content
            doc = Document(r.text, url=link)
            content = doc.summary(html_partial=True)
            rv = remove_html(content)
        else:
            logger.info("Invalid request {} for url '{}'".format(r, link))
    return rv


def repeated_func_schedule(time, func):
    spawn_later(0, func)
    spawn_later(time, repeated_func_schedule, time, func)
def html_read_to_text(html):
    doc = Document(html)
    print(doc.title())
    print(doc.summary())
def clean_content(url_content):
    # tuple-unpacking parameters were removed in Python 3, so unpack inside
    url, content = url_content
    try:
        doc = Document(content)
        yield url, doc.summary()
    except Unparseable:
        pass
def clean_html(epub, epub_path, source_code, url, file_idx):
    # activated from fetch_page
    blacklist = ['script', 'style', 'dd', 'em', 'text', 'blockquote']
    graylist = ['div', 'h1', 'h2', 'h3', 'h4', 'h5', 'span']
    doc = Document(source_code.text)
    # boilerpipe alternatives: DefaultExtractor, ArticleExtractor,
    # ArticleSentencesExtractor, KeepEverythingExtractor,
    # NumWordsRulesExtractor, CanolaExtractor,
    # KeepEverythingWithMinKWordsExtractor, LargestContentExtractor
    # extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    # extracted_html = extractor.getHTML()
    base_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    soup = BeautifulSoup(doc.summary(), "lxml")
    for tag in soup.findAll():
        del tag['srcset']
        del tag['align']
        del tag['data-file-height']
        del tag['data-file-width']
        del tag['role']
        # strip ':', ',' and '.' from ids so they remain valid anchors
        id = str(tag.get('id'))
        for ch in (':', ',', '.'):
            pos = id.find(ch)
            if pos > -1:
                id = id[:pos] + id[pos + 1:]
        tag['id'] = id
        if tag.name.lower() in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif tag.name.lower() in graylist:
            tag.attrs = {}
    for tag in soup.findAll('a'):
        # make all external links absolute and complete
        href = tag.get('href')
        if href:
            if href.startswith('http'):
                pass
            elif href.startswith('//'):
                href = 'http:' + href
            elif href.startswith('/'):
                href = base_url + href
            elif href.startswith('#'):
                pass  # relative link to #id
            else:
                href = url + '/' + href
            tag['href'] = href
    idx = 0
    for tag in soup.findAll('img'):
        src = tag.get('src')
        ext = src[-3:]
        if ext == 'png' or ext == 'jpg':
            if src.startswith('http'):
                pass
            elif src.startswith('//'):
                src = 'http:' + src
            elif src.startswith('/'):
                src = base_url + src
            else:
                src = url + '/' + src
            img_name = 'img_' + str(file_idx) + '_' + str(idx) + '.' + ext
            # format: images/img_0_0.png
            tag['src'] = '../' + get_img(epub, epub_path, src, img_name)
            del tag['srcset']
            idx += 1
    html = str(soup)
    body = re.compile(r'<body\b[^>]*>', re.I)  # <body attributes> tag
    html = body.sub('<body><h1>' + doc.title() + '</h1>', html)
    head = re.compile(r'<html\b[^>]*>', re.I)  # <html attributes> tag
    html = head.sub(
        '<html xmlns="http://www.w3.org/1999/xhtml"><head><title>'
        + doc.title()
        + '</title><link href="../css/epub.css" rel="stylesheet" type="text/css"/></head>',
        html)
    doctype = '''<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
'''
    html = doctype + html
    html = html.encode('utf-8')
    return html, doc.title()
def searchOndDay(curDate, howManyNewsOneDay=5, fiddler=None):
    # Search engine (do not change: the full url below is hand-crafted for it)
    searchEngine = 'bing.com'
    # Build the URL
    delta = datetime3.date(2019, 11, 1) - curDate
    deltaNum = 18201 - delta.days
    print('Search date:', curDate)
    searchUrl = (r"https://cn.bing.com/search?q=737max%e7%a9%ba%e9%9a%be&filters=ex1%3a%22ez5_"
                 + str(deltaNum) + "_" + str(deltaNum)
                 + r"%22&redir=2&frb=1&qpvt=737max%e7%a9%ba%e9%9a%be")
    print('Search url:', searchUrl)
    # Send an http request and receive the result
    r = requests.getPlus(searchUrl, verify=fiddler)
    # Make sure the http request returned correctly
    if r.status_code != 200:
        print('error: unexpected status code on the search page')
        return 0
    # Get the returned html text
    searchHtml = r.text
    # Check whether the response contains results, i.e. whether we got banned
    t = re.findall(r'条结果', searchHtml, re.I)
    if t == []:
        print('error: we got banned')
        return 0
    else:
        t = re.findall(r'\d+(?= 条结果)', searchHtml, re.I)
        t = t[0]
        print('Total results:', t)
    # Parse searchHtml
    tree = etree.HTML(searchHtml)
    # Count the genuinely useful news items (excluding video and image sets)
    newsList = tree.xpath('/html/body[1]/div[1]/main[1]/ol[1]/li[@class="b_algo"]')
    newsNum = len(newsList)
    print('Genuinely useful news items:', newsNum)
    # Save the search page
    file = open("./corpora/" + searchEngine + '_' + str(curDate) + '.html', "wb")
    file.write(searchHtml.encode('utf-8'))
    file.close()
    # Loop over up to howManyNewsOneDay genuinely useful news items
    newsIndex = 0  # effectively starts at 1, since it is incremented first thing
    howManyNewsSaved = 0
    while howManyNewsSaved < howManyNewsOneDay:
        newsIndex += 1
        # Bail out if there aren't that many items in total
        if newsIndex > newsNum:
            break
        print(' News item #%d' % newsIndex)
        # Pull out the current item's details
        news = newsList[newsIndex - 1]
        titleElement = news.xpath('./h2/a')
        # Skip non-web-page items (could be ppt or pdf files)
        if titleElement == []:
            print(' Probably a file, does not count')
            continue
        titleElement = titleElement[0]
        newsUrl = titleElement.attrib['href']
        print(' Url:', newsUrl)
        newsTitle = titleElement.text
        print(' Title:', newsTitle)
        introduction = news.xpath('string(./div[1]/p[1])')
        print(' Introduction:', end='')
        print(indent(introduction, length=40, fIndent=0, lIndent=10))
        newsTime = re.findall(r'^\d+-\d+-\d+', introduction, re.I)[0]
        newsTimeYear = int(re.findall(r'^\d+(?=-)', newsTime, re.I)[0])
        newsTimeMonth = int(re.findall(r'(?<=-)\d+(?=-)', newsTime, re.I)[0])
        newsTimeDay = int(re.findall(r'(?<=-)\d+$', newsTime, re.I)[0])
        print(' Published:', newsTime)
        newsId = searchEngine + '_' + str(curDate) + '_' + str(newsIndex)
        print(' Id:', newsId)
        # Skip hosts that are not text news
        host = re.search('(?<=://)\S+?(?=/)', newsUrl).group()
        if host in [
                'www.yunjuu.com',
                'v.qq.com',
                'www.bilibili.com',
                'v.youku.com',
                'haokan.baidu.com',
        ]:
            print(' Not a qualified news page, does not count')
            continue
        # Fetch the news page
        try:
            r = requests.getPlus(newsUrl, verify=fiddler)
        except Exception as e:
            print(' This news site is down, does not count:', e)
            continue
        # Did it return successfully?
        if r.status_code != 200:
            print(' error: status code is not 200, does not count')
            continue
        # Get the returned html text
        newsHtml = r.text
        # Strip newlines and redundant spaces from the html
        newsHtml = newsHtml.replace('\n', '')
        newsHtml = newsHtml.replace(' ', '')
        # Extract the main content with readability
        newsdoc = Document(newsHtml)
        newsTitle = newsdoc.title()
        print(' Title:', newsTitle)
        newsContentWithTags = newsdoc.summary()  # readability keeps html tags
        # Strip the html tags to get plain text
        newsContent = html2text(newsContentWithTags)
        # Print the content
        print(' Content:', end='')
        print(indent(newsContent, length=40, fIndent=0, lIndent=10))
        # Skip pages whose text is too short to be a real news article
        if len(newsContent) < 270:
            print(' Not a qualified news page, does not count')
            continue
        # Insert into the database
        SysDb.insertRow(
            'websiteTabel', {
                '搜索引擎': searchEngine,
                '搜索日期年': curDate.year,
                '搜索日期月': curDate.month,
                '搜索日期日': curDate.day,
                '搜索网址': searchUrl,
                '搜索html': searchHtml,
                '新闻序号': newsIndex,
                '新闻ID': newsId,
                '新闻网址原': newsUrl,
                '新闻网址真': r.url,
                '新闻html': newsHtml,
                '新闻标题': newsTitle,
                # '新闻作者': {'类型': '文本', '初始值': None, '主键否': '非主键'},
                # '新闻机构': {'类型': '文本', '初始值': None, '主键否': '非主键'},
                '新闻日期年': newsTimeYear,
                '新闻日期月': newsTimeMonth,
                '新闻日期日': newsTimeDay,
                '新闻正文': newsContent
            })
        # Saved one more item
        howManyNewsSaved += 1
def parseitem(self, response):
    ':type response: Response'
    if 'Please turn on JavaScript' in response.body:
        body = response.body
        body = re.sub('<p class="caption"[^<]+', '', body)
        body = re.sub('<noscript>(.|\r|\n)*?</noscript>', '', body)
        response = response.replace(body=body)
    sel = Selector(response)
    item = NewsscraperItem()
    # store the URL and source name in the item dictionary
    item['url'] = response.url
    item['source'] = self.name
    # record the time the data was scraped
    item['dateScraped'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    # check the url against the category defined in the allowed domains;
    # if response.url contains the string then proceed, otherwise the else
    # branch handles it
    try:
        if 'www.bbc.co.uk' in response.url:
            # extract the title, trying different xpaths
            title = sel.xpath("//h1[starts-with(@class,'story')]/text()").extract()
            if title:
                item['title'] = title[0].strip()
                # extract the date from the page
                d = sel.xpath("//span[@class='date']/text()").extract()[0].strip()
                # string to datetime conversion
                f = strptime(d, '%d %B %Y')
                # format the date as configured in the config file
                item['date'] = strftime(Config['dateformat'], f)
                # extract the content from the page
                x = sel.xpath("(//div[@class='story-body']//*[self::p or self::strong]/text()) |(//span[@class='cross-head']/text())|(//div[@class='story-body']/p/a/text())").extract()
                if len(x) > 1:
                    st = "\n"
                    p = st.join(x)
                    # collapse runs of whitespace in the content into single spaces
                    item['content'] = re.sub(r"[ \t\n]+", " ", p)
                else:
                    # Could not extract the article content via xpath.
                    # Fall back to readability.
                    try:
                        html = sel.xpath("//div[@class = 'story-body']").extract()
                        doc = Document(html)
                        doc.options['debug'] = False
                        try:
                            logging.basicConfig(level=logging.CRITICAL)
                            htmlContent = doc.summary()
                            content = html2texthandler(htmlContent)
                        except Exception as e:
                            pass
                        finally:
                            logging.basicConfig(level=logging.INFO)
                        item['content'] = re.sub(r"[ \t\n\"]", " ", content)
                    except Exception:
                        return
import requests
from readability import Document  # https://github.com/buriy/python-readability

# response = requests.get('http://example.com')
# response = requests.get('http://usosdelasticsenlaadministracion.blogspot.com/')
# https://williamjturkel.net/2013/06/15/basic-text-analysis-with-command-line-tools-in-linux/
response = requests.get('http://www.eumed.net/ce/2015/1/tecnologia.html')
doc = Document(response.text)
# print(doc.title())
content = doc.summary()
# print(doc.summary())

file = open('tecnologia.html', 'w')
file.write(content)
file.close()
def get_summary(content):
    doc = Document(content)
    summary = doc.summary(html_partial=True)
    return summary
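# The snippets in this collection call summary() both with and without
# html_partial. A minimal sketch of the difference, using a made-up HTML
# string (the exact output prefixes follow the test assertions above:
# '<html><body>...' for the full document vs '<div>...' for a fragment):
from readability import Document

sample_html = ('<html><body><p>'
               + 'Readable article text, long enough to keep. ' * 20
               + '</p></body></html>')

print(Document(sample_html).summary()[:12])                  # '<html><body>'
print(Document(sample_html).summary(html_partial=True)[:5])  # '<div>'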
def html_select(raw_html, xpath_lan):
    doc = Document(raw_html)
    summary_html = doc.summary()
    # print(summary_html)
    selector = etree.HTML(summary_html)
    return selector.xpath(xpath_lan)
def getContent():
    """Collect content."""
    # Your APPID / AK / SK
    APP_ID = '14658509'
    API_KEY = 'C14bCL7NkReQpak382maUYXi'
    SECRET_KEY = '8vWAXHBTmfL3r96PlKIggpwuXwdNl4wz'
    client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    # Each row: [1 url, 2 title, 3 content, 4 sentiment items dict,
    #            5 organization list, 6 comment/opinion list,
    #            7 article category, 8 article tags]
    # e.g. http://linyi.iqilu.com/caijing/2018/1117/4113682.shtml
    for news_url in urls:
        one_monitor = []
        one_monitor.append(news_url)  # 1 url
        try:  # make sure a news item is complete
            # with a timeout set, urlopen won't wait forever and hang
            news = urlopen(news_url, timeout=15)
            news_html = news.read()  # page source as str
        except Exception:
            one_monitor.append("urlopen_error")
            monitor_result.append(one_monitor)
            success_num += 1
            print("error opening the url")
            continue
        try:
            # 3 content; opinion extraction takes at most 3000 characters
            news_contents = Document(news_html)
            # 2 title; the default ascii codec would raise here, hence utf-8;
            # strip() removes whitespace ('\n', '\r', '\t', ' ')
            news_title = news_contents.title().strip(" ")[:39].encode("utf-8")
            one_monitor.append(news_title)
            news_content = BeautifulSoup(
                news_contents.summary()).get_text().strip(" ")[:2000].encode("utf-8")
            one_monitor.append(news_content)
            # avoid slicing a fixed number of bytes and splitting a multi-byte character
            emotion_content = news_content.decode("utf-8")[:500].encode("utf-8")
        except Exception:
            one_monitor.append("extract_error")
        try:
            # 4 sentiment
            emotion = client.sentimentClassify(emotion_content)["items"]
            one_monitor.append(emotion)
        except Exception:
            one_monitor.append("emotion_error")
        try:
            # 5 organization names
            orgs = [item["item"].encode("utf-8")
                    for item in client.lexer(news_content)["items"]
                    if item["ne"] == "ORG"]
            one_monitor.append(";".join(list(set(orgs))))
        except Exception:
            one_monitor.append("org_error")
        try:
            # 6 comment/opinion list
            conments = [item['abstract'].encode("utf-8")
                        for item in client.commentTag(news_content)['items']]
            one_monitor.append(";".join(list(set(conments))))
        except Exception:
            one_monitor.append("comment_error")
        try:
            # 7 article category
            group = client.topic(news_title, news_content)["item"].values()  # [[dict], [dict]]
            value_list = [dic[u'tag'] for dic_list in group for dic in dic_list]  # floats can't be joined
            one_monitor.append(u";".join(value_list).encode("utf-8"))
        except Exception:
            one_monitor.append("topic_error")
        try:
            # 8 article tags
            keyword = client.keyword(news_title, news_content)["items"]  # [dict]
            key_list = [dic[u'tag'] for dic in keyword]
            one_monitor.append(u";".join(key_list).encode("utf-8"))
            print("succeeded: %s" % success_num)
        except Exception:
            one_monitor.append("keyword_error")
            error_num += 1
            print("errors so far: %s" % error_num)
        monitor_result.append(one_monitor)
        success_num += 1
        if success_num % 200 == 0:
            # save periodically so a crash doesn't lose everything
            with open("./temp/risk_monitoring%s.csv" % index, "w") as reader:
                writer = csv.writer(reader)
                writer.writerows(monitor_result)
def content(link):
    target = urllib.urlopen(link)
    d = Document(input=target)
    # catching if not u''
    return d.summary()
def ndtv_anti_ad_block_text(article):
    doc = Document(article.html)
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return text_maker.handle(doc.summary())
        # extracting content from the page
        try:
            x = sel.xpath("(//div[@class='story-inner']/p/text()) |(//h2[@class='heading']/text())").extract()
            joiner = "\n"
            p = joiner.join(x)
            # collapse runs of whitespace in the content into single spaces
            item['content'] = re.sub(r"[ \t\n]+", " ", p)
        except:
            try:
                html = sel.xpath("//div[@class = 'story-body']").extract()
                logging.basicConfig(level=logging.CRITICAL)
                doc = Document(html)
                htmlContent = doc.summary()
                content = html2texthandler(htmlContent)
                item['content'] = re.sub(r"[ \t\n\"]", " ", content)
            except:
                print("image content")
                return
            finally:
                logging.basicConfig(level=logging.INFO)
        if item['content'] != "" and item['title'] != "" and item['date'] != "":
            return item
        else:
            return
    # if the url is of the second category, the else branch handles it
    else:
def getAndParse(url):
    # skip urls on ignored hosts
    redUrl = getRedictedUrl(url)
    for ig in ignores['hosts']:
        if ig in url or (redUrl and ig in redUrl):
            return None, None
    # fetch the content, retrying once on failure
    try:
        newContent, redUrl = getContentAndRedictedUrl(url)
    except Exception as e:
        print 'new content1', e
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)
        except Exception as e:
            print 'new content2', e
            return None, None
    if not redUrl:
        return None, None
    urlHost = urlparse(redUrl).hostname
    new2 = newContent.encode('utf-8')
    soup = getSoupByStrEncode(new2, "utf-8")
    # apply the shared cleanup rules first
    for rm in rules['common']['rm']:
        removeNodesFromSoup(rm, soup)  # drop stop nodes
    needAutoExtract = True
    if urlHost in rules:
        contentRule = rules[urlHost]['content']
        if contentRule:
            # a content rule is configured for this host: use it
            specContent = soup.select(contentRule)
            if specContent and len(specContent) > 0:
                del specContent[0].attrs
                soup = specContent[0]
                needAutoExtract = False
                # current rules mostly delete whole tags, so only apply the
                # configured rm selectors once the content node is found
                if rules[urlHost]['rm'] and len(rules[urlHost]['rm']) > 0:
                    for rm in rules[urlHost]['rm']:
                        removeNodesFromSoup(rm, soup)  # drop stop nodes
                unwrapUseless(soup)
                content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
                    .replace(u'</div>', '').replace(u'<div>', '')
    else:
        # no rules configured for this host: try the usual container first
        attemp = soup.select('#content')  # many novel sites put the body in #content
        if attemp and len(attemp):
            # guessed right
            needAutoExtract = False
            unwrapUseless(soup)
            content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
                .replace(u'</div>', '').replace(u'<div>', '')
    if needAutoExtract:
        # auto-extract the body with readability
        unwrapUseless(soup)
        # may raise "Expected a bytes object, not a unicode object"
        doc = Document(unicode(soup).encode('utf-8'))
        content = doc.summary(html_partial=True)
    newContent2 = cleanTailHead(urlHost, content)
    if newContent2 != content:
        content = newContent2
    if content and len(content) < 10:
        return None, None
    content = content.replace(u'�', u'')
    content = content.replace(u'\'', r'\'')
    return content, urlHost
three = "https://article.hareruyamtg.com/article/48018/?lang=en"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/50.0.2661.102 Safari/537.36'
}
# the request must use the url defined above ("two" was undefined)
response = requests.get(three, headers=headers)
doc = Document(response.text)

h = html2text.HTML2Text()
h.ignore_links = True
text = h.handle(response.text)
print(text)

cleaned_content = cleanhtml(doc.summary())

sentences = text.split(".")
for sent in sentences:
    for card in CardNames:
        match = sent.find(card)
        if match != -1:
            le = len(card)
            x = match
            print(f"\n{10*'#'}")
            print("Match:", f"{Fore.RED}{sent[x:x + le]}{Style.RESET_ALL}")
            print(
                "Corpus:", sent[:x - 1],
                f"{Fore.RED}{sent[x:x + le]}{Style.RESET_ALL} {sent[x+le+1:]}.",
                "\n")
            print(f"{10*'#'}\n")
def test_not_self_closing(self):
    sample = '<h2><a href="#"></a>foobar</h2>'
    doc = Document(sample)
    assert (
        '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
        == doc.summary())
def link2html(link):
    response = requests.get(link)
    doc = Document(response.text)
    return doc.title(), doc.summary()
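# A hedged usage sketch for link2html above; the URL is a placeholder.
# Converting the returned summary with html2text mirrors what several other
# snippets in this collection do.
import html2text

title, article_html = link2html('http://example.com/')
print(title)
print(html2text.html2text(article_html))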
def test_too_many_images_sample_html_partial(self):
    """Using the too-many-images sample, make sure we still get the article."""
    sample = load_sample("too-many-images.sample.html")
    doc = Document(sample)
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="post-body', res[0:26])
            publication = field.text
    if (body
            and re.search('Redistribution rights for this field are unavailable', body)
            and len(body) < 100):
        print("  Warning: no redistribution rights available for that article")
        body = "<p><b>Redistribution rights for this article were not available.</b></p>"
except (ArticleMissing, ArticleAccessDenied) as e:
    print("  Warning: couldn't fetch that article")
    headline = link_text
    body = "<p><b>The Guardian Open Platform returned an error for that article: {0}</b></p>".format(e)
    body += '<p>You can still try <a href="{0}">the original article link</a></p>'.format(link_url)
    force_request = requests.get(link_url)
    force_article = Document(force_request.text)
    body += cleaner.clean_html(force_article.summary())

page_filename = "{0:03d}.html".format(page_number)

html_body = E.body(E.h3(headline))
if byline:
    html_body.append(E.h4('By ' + byline))
html_body.append(E.p('[{s}]'.format(s=section)))
if standfirst:
    standfirst_fragments = fragments_fromstring(standfirst)
    standfirst_element = E.p(E.em(*standfirst_fragments))
    html_body.append(standfirst_element)
if thumbnail:
from email.mime.multipart import MIMEMultipart  # needed for the message below
from email.mime.text import MIMEText
from email.utils import formatdate
from email import encoders

# Get the html
response = requests.get("http://URL/TO/CONVERT")

# Clean up the html using readability
doc = Document(response.text)

# Use the webpage title as the base file name
file_name = re.sub(r'[^a-zA-Z0-9]+', '-', doc.title())

# Write the html response to a local file
f = open(file_name + '.html', 'w')
f.write(doc.summary())
f.close()

# Convert the local html file to .mobi
call(["./kindlegen", file_name + '.html'])

# Send the document as an email attachment
msg = MIMEMultipart()
send_from = msg['From'] = '*****@*****.**'
send_to = msg['To'] = '*****@*****.**'  # can be a 'Send to Kindle' email
msg['Date'] = formatdate(localtime=True)
msg['Subject'] = file_name + ".mobi"

# Attach the email body
msg.attach(MIMEText('Want to write a customized email body? Then put it here.'))
def get_content_from_url(url):
    def srcrepl(base_url, match):
        absolute_link = urljoin(base_url, match.group(3))
        absolute_link = '/link?url=' + absolute_link
        return ("<" + match.group(1) + match.group(2) + "=" + "\""
                + absolute_link + "\"" + match.group(4) + ">")

    def relative_to_absolute_urls(fragment, base_url):
        p = re.compile(r"<(.*?)(src|href)=\"(?!http)(.*?)\"(.*?)>")
        absolute_fragment = p.sub(partial(srcrepl, base_url), fragment)
        return absolute_fragment

    file_cache = f'./cache/sites/{get_cache_key(url)}.html'
    if not path.exists(file_cache):
        response = requests.get(url)
        text = response.text
        with open(file_cache, 'w') as f:
            f.write(text)
    else:
        with open(file_cache) as f:
            text = str(f.read())

    doc = Document(text)
    summary = doc.summary(html_partial=True)

    if 'wikipedia.org' in url:
        d = pq(summary)
        to_remove = [
            "#External_links", "#General_information", "#Experiments",
            "#Online_lectures", '.spoken-wikipedia', '#Bibliography', '.book',
            '.refbegin', '.shortdescription', '.reference', '.infobox',
            '.reflist', '#References', '#Further_reading', '#See_also',
            '.mw-editsection', '.tright'
        ]

        def check_link(index, a):
            da = pq(a)
            if da.attr('href') and '#cite_' in da.attr('href'):
                da.remove()

        d('a').each(check_link)
        for selector in to_remove:
            d(selector).remove()
        summary = d.html()

    try:
        parsed_url = urlparse(url)
        base_url = parsed_url.scheme + '://' + parsed_url.netloc
        summary = relative_to_absolute_urls(summary, base_url)
    except Exception:
        pass

    soup = BeautifulSoup(summary, features="lxml")
    content = soup.get_text().rstrip('\n')
    content = re.sub(r'\n+', '\n', content).strip()
    return summary, content, doc.title()
#!/usr/bin/python
import sys

import requests
from readability import Document
from markdownify import markdownify as md

bookmarkDir = "/home/ironman/obsidians/personalObsidian/bookmarks/"

if len(sys.argv) > 1:
    url = sys.argv[1]
    response = requests.get(url)
    doc = Document(response.text)
    fileName = doc.title() + ".md"
    fileName = fileName.replace('/', ' ')
    markdownSummery = md(doc.summary())
    markdown = "# {} \n\n *{}* \n\n {}".format(doc.title(), url, markdownSummery)
    with open(bookmarkDir + fileName, 'w') as the_file:
        the_file.write(markdown)
else:
    print("please enter a url to make article view")
for row in results:
    # content = row[1]
    # content = row[4].replace('mi', 'mo')
    id = row[0]
    # url = row[1]
    url = 'http://www.3dllc.com/html/37/37023/9515879.html'
    # if not u'easou' in url:
    #     continue
    newContent = getContent(url)
    doc = Document(newContent)
    content = doc.summary(html_partial=True)
    # soup = getSoupByStr(newContent)
    # ps = soup.select('#chapterContent')[0]
    # ps.select('div')[0].unwrap()
    # ps.unwrap()
    # for water in soup.select('.watermark'):
    #     water.extract()
    # t = soup.select('p')[0]
    # title = t.get_text()
    # if re.match('\d+.*', title):
    #     # if id < 1766:
def handle_data():
    def cleancap(raw_cap):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_cap)
        tmp = cleantext.split('\n')
        cap = list()
        pre = ''
        for line in tmp:
            if line.replace(' ', '') and line != pre:
                if '-->' in line:
                    cap.append('')
                else:
                    pre = line
                cap.append(line)
        tmp = set()
        for idx in range(len(cap)):
            if '-->' in cap[idx] and (idx >= len(cap) - 2 or '-->' in cap[idx + 2]):
                tmp.add(idx)
                tmp.add(idx + 1)
        final = list()
        for idx in range(len(cap)):
            if idx not in tmp:
                final.append(cap[idx])
        return '\n'.join(final)

    user_level = request.form['user_level']
    title = ''
    publish_date = ''
    text = request.form['text']
    if (text.startswith('http://www.youtube.com')
            or text.startswith('http://youtube.com')
            or text.startswith('http://youtu.be')
            or text.startswith('https://www.youtube.com')
            or text.startswith('https://youtube.com')
            or text.startswith('https://youtu.be')):
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'skip_download': True,        # we just want to extract the info
            'outtmpl': 'download/target'  # file_path/target
        }
        file = ''
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([text])
        dirPath = "download"
        fileList = os.listdir(dirPath)
        if 'target.en.vtt' in fileList:
            file = cleancap(open('download/target.en.vtt').read())
        else:
            file = 'There is no english subtitle in this video!'
        for fileName in fileList:
            if os.path.isfile(os.path.join(dirPath, fileName)):
                os.remove(os.path.join(dirPath, fileName))
        v_id = text.split('=')[-1]
        content = [v_id, file]
        type_ = 'youtube'
        r = requests.get(text)
        if r.status_code < 400:
            title = BeautifulSoup(r.text, 'html.parser').find('title').text
            publish_date = BeautifulSoup(r.text, 'html.parser').find(
                'meta', itemprop="datePublished")['content']
    elif text.startswith('http://') or text.startswith('https://'):
        response = requests.get(text, headers=headers)
        doc = Document(remove_sometag(response.text))
        title = doc.short_title()
        publish_date = getPublishDate(response.content.decode('UTF-8'))
        content = doc.summary()
        type_ = 'url'
    else:
        content = text
        type_ = 'text'
    content = clean_content(content, type_)
    new, pure_text, vocab_dict = create_article(
        title, user_level, content, type_ == 'youtube',
        set(dictWord['V'].keys()), set(dictWord['N'].keys()),
        set(dictWord['ADJ'].keys()))
    store(pure_text, vocab_dict, user_level)
    return render_template('format.html', title=title,
                           publish_date=publish_date,
                           user_level=user_level, content=new)
def get_article_body(article, feed):
    body = ""
    # If scraping is enabled, fetch the article and extract it with readability
    if feed["scrape"]:
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
        }
        response = requests.get(article.link, headers=headers)
        doc = Document(response.text)
        body = doc.summary()
    # Else construct the body from the article object
    else:
        # Add all content to the body
        if hasattr(article, "content"):
            for c in article.content:
                if c.type == "text/html" or c.type == "text/plain":
                    body += c.value
        # Use the summary as a fallback
        elif hasattr(article, "summary"):
            body += article.summary

    # Replace relative links with absolute ones, using beautifulsoup
    try:
        splitted_url = urlsplit(article.link)
    except Exception:
        splitted_url = urlsplit(feed["url"])
    soup = BeautifulSoup(body, features="lxml")
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        splitted_src = urlsplit(src)
        constructed_src = [
            splitted_src.scheme,
            splitted_src.netloc,
            splitted_src.path,
            splitted_src.query,
            splitted_src.fragment,
        ]
        if constructed_src[0] == "":
            constructed_src[0] = splitted_url.scheme
        if constructed_src[1] == "":
            constructed_src[1] = splitted_url.netloc
        new_src = urlunsplit(constructed_src)
        if new_src.startswith("http"):
            body = body.replace('"{}"'.format(src), '"{}"'.format(new_src), 1)
    for a in soup.find_all("a", href=True):
        href = a.get("href")
        splitted_href = urlsplit(href)
        constructed_href = [
            splitted_href.scheme,
            splitted_href.netloc,
            splitted_href.path,
            splitted_href.query,
            splitted_href.fragment,
        ]
        if constructed_href[0] == "":
            constructed_href[0] = splitted_url.scheme
        if constructed_href[1] == "":
            constructed_href[1] = splitted_url.netloc
        new_href = urlunsplit(constructed_href)
        if new_href.startswith("http"):
            body = body.replace('"{}"'.format(href), '"{}"'.format(new_href), 1)
    return body
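# The loop above only fills in the scheme and netloc that a relative URL is
# missing, then reassembles it. A standalone sketch of that single step
# (the URLs here are illustrative, not from the source):
from urllib.parse import urlsplit, urlunsplit

base = urlsplit('https://example.com/feed/article')
src = urlsplit('/images/photo.jpg')  # a relative image source from the body

absolute = urlunsplit((src.scheme or base.scheme,
                       src.netloc or base.netloc,
                       src.path, src.query, src.fragment))
print(absolute)  # https://example.com/images/photo.jpg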
def open_anything(source, type_arr, encode=None, logger=None):
    """URI, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file) and deal with it in a
    uniform manner. The returned object is guaranteed to have all the
    basic stdio read methods (read, readline, readlines). Just .close()
    the object when you're done with it.

    Examples:
    >>> from xml.dom import minidom
    >>> sock = open_anything("http://localhost/kant.xml")
    >>> doc = minidom.parse(sock)
    >>> sock.close()
    >>> sock = open_anything("c:\\inetpub\\wwwroot\\kant.xml")
    >>> doc = minidom.parse(sock)
    >>> sock.close()

    This function is part of "Dive Into Python", a free Python book for
    experienced programmers. Visit http://diveintopython.org/ for the
    latest version.
    """
    print('\nopen_anything ({})...'.format(source))
    if logger is None:
        logger = create_log(log_name="util")  # , level=loglevel

    if hasattr(source, "read"):
        print("Dealing with text...")
        type_arr[0] = "text"
        return source

    if source == "-":
        return sys.stdin

    if (source.startswith('http://') or source.startswith('https://')
            or source.endswith('.html') or source.endswith('.htm')):
        # handle http(s) URLs and html files
        print("Dealing with html...")
        type_arr[0] = "html"
        driver = None
        h = 'http'
        if not os.path.exists(h):
            os.mkdir(h)
        head = source.split("/")
        head = head[len(head) - 1]
        head = ''.join([h, '/', os.path.splitext(head)[0]])
        txtname = ''.join([head, '.txt'])
        if os.path.exists(txtname):
            print("{0} already exists,".format(txtname),
                  "\njust read from it; stop getting http")
            f = open(txtname, 'r', encoding='utf-8')
            text = f.read()
            f.close()
            return text
        try:
            response = requests.get(source, timeout=50)
            len_text = len(response.text)
            if len_text > 0:
                doc = Document(response.text)
                res = doc.summary()
                res = cleanhtml(res)
                len_res = len(res)
                if len_res > 0:
                    f = open(txtname, 'w', encoding='utf-8')
                    f.write(res)
                    f.close()
                    print("http to txt, save in {0}".format(txtname))
                else:
                    print("Something wrong!")
                    print("len_text:", len_text)
                    print("len_res:", len_res)
                return res
            # fall back to a real browser when requests got nothing
            driver = webdriver.Chrome()
            # if platform == 'darwin':
            #     driver = webdriver.Safari()
            # elif platform == 'win32':
            #     driver = webdriver.Chrome()
            driver.set_page_load_timeout(50)
            print("\ngetting http : ", source)
            driver.get(source)
            time.sleep(6)
            res = driver.page_source
            driver.close()
            f = open(txtname, 'w', encoding='utf-8')
            f.write(res)
            f.close()
            print("http to txt, save in {0}".format(txtname))
            return res
        except TimeoutException as e:
            msg = "too much time to load html: {0}, info:{1}".format(source, e)
            print(msg)
            logger.info(msg)
            time.sleep(2)
            driver.get(source)
            time.sleep(13)
            res = driver.page_source
            driver.close()
            f = open(txtname, 'w', encoding='utf-8')
            f.write(res)
            f.close()
            print("http to txt, save in {0}".format(txtname))
            return res
        except (IOError, OSError) as e:
            msg = "failed to load html: {0}, info: {1}".format(source, e)
            logger.info(msg)
        except Exception as e:
            msg = "failed to load html: {0}, err: {1}".format(source, e)
            print(msg)
            logger.error(msg)
        finally:
            if driver is not None:
                driver.quit()
        # Opening pdfs via PyPDF2's PdfFileReader was tried here and failed; see
        # http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to-text

    if not os.path.exists(source):
        print('\nfile not exist: {}'.format(source))
        return None

    if source.endswith('.pdf'):
        # try to open with PyPDF2 (if source is pdf)
        print("Dealing with pdf...")
        type_arr[0] = "pdf"
        try:
            return get_pdf_io(source)
        except (IOError, OSError) as e:
            msg = "failed to load pdf: {0}, info:{1}".format(source, e)
            logger.info(msg)

    # @todo use pywin32 to open .doc files

    # try to open with the native open function (if source is a pathname)
    type_arr[0] = "text"
    try:
        return open(source, encoding=encode)
    except (IOError, OSError) as e:
        msg = "failed to load txt: {0}, info:{1}".format(source, e)
        logger.info(msg)

    print('\nFailed to open_anything ({})'.format(source))
    return None
def test_wrong_link_issue_49(self):
    """We shouldn't break on bad HTML."""
    sample = load_sample('the-hurricane-rubin-carter-denzel-washington.html')
    doc = Document(sample)
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="content__article-body ', res[0:39])
def get_main_html(html):
    doc = Document(html)
    return doc.summary()
def test_nyt_sample_html_iframe(self):
    """Using the nyt sample, make sure the summary holds an <iframe>
    element (youtube video)."""
    sample = load_sample('nyt-article-video.sample.html')
    doc = Document(sample, url='http://nytimes.com/')
    res = doc.summary()
    self.assertTrue('<iframe ' in res)
def save(self, *args, **kwargs):
    if self.description:
        document = Document(self.description)
        self.readable_description = document.summary(html_partial=True)
    return super(FeedItem, self).save(*args, **kwargs)
def get_article_body(url):
    page = requests.get(url, timeout=(3.05, 10))
    doc = Document(page.text)
    soup = BeautifulSoup(doc.summary(), 'html.parser')
    return soup.get_text()
import xml.etree.ElementTree

from readability import Document
from langdetect import detect

from models import Visit
from database import db_session

LANG = {
    'en': 'english',
    'fr': 'french'
}


def remove_tags(text):
    return ''.join(xml.etree.ElementTree.fromstring(text).itertext())


for v in Visit.query.all():
    doc = Document(v.raw_dom)
    v.extacted_text = remove_tags(doc.summary())
    v.lang = LANG.get(detect(v.extacted_text), 'simple')

db_session.commit()