def convert(self):
    if self.url:
        response = requests.get(self.url, headers=self.headers)
        content = response.content
    elif self.path:
        with open(self.path, 'rb') as f:
            content = f.read()
    soup = BeautifulSoup(content, 'html.parser')
    try:
        self.encoding = get_encoding(soup)
    except ValueError:
        self.encoding = soup.original_encoding
    doc = Document(content.decode(self.encoding, "ignore"))
    self.title = doc.title() if len(doc.title()) > 0 else "Awesome article"
    self.soup = BeautifulSoup(doc.summary(), 'html.parser')
    self.process_images()
    self.add_head()
    if len(self.soup.find_all("h1")) == 0:
        self.insert_title()
    if self.url:
        self.insert_link()
    self.save_html()
    self.convert_to_mobi()
    if self.send_by_mail:
        self.send_to_kindle()
    if self.clean:
        self.do_cleaning()
def get_url(url_var):
    response = requests.get(url_var)
    tree = html.fromstring(response.content)
    doc = Document(response.text)
    tree_text = tree.xpath(
        '//p/text() | //p/a/text() | //p/b/text() | //div/text() | //h2/text() | '
        '//h1/text() | //h2/a/text() | //h3/text() | //h3/a/text()'
    )

    # Dump the extracted text nodes. Encoding to bytes leaves b'...' artifacts
    # in the file, which are stripped out in the next step.
    with open('text_with_b.txt', 'w+') as file:
        for x in tree_text:
            file.write(str(x.encode("utf-8")))

    s0 = '"b"'
    s1 = "'b'"
    s2 = "'b"
    s3 = "b'"
    with open('text_with_b.txt', 'r') as infile, \
            open('text_document.txt', 'w') as outfile:
        data = infile.read()
        data = data.replace(s0, "")
        data = data.replace(s1, "")
        data = data.replace(s2, "")
        data = data.replace(s3, "")
        outfile.write(data)

    print(doc.title())
    print()

    # Summarize the cleaned text and write the result.
    fs = FrequencySummarizer()
    with open('text_document.txt', 'r') as file:
        text = file.read()
    with open('result.txt', 'w+') as f:
        f.write(doc.title() + '\n')
        for s in fs.summarize(text, 4):
            f.write('*' + s)
def parse_text2(self, url, doc):
    '''Parse text data from the page: body text, title, publish time, author, etc.'''
    article = Document(doc)
    try:
        text = article.summary()
        # If the page itself has no title, fall back to the anchor text
        # recorded for this URL.
        title = article.title() if article.title() else self.links_all_dict.get(url, '')
    except Exception as e:
        logger.warning('Could not extract title/text for page {}: {}'.format(url, e))
        title, text = '', ''
    return title, text
def parse_item(self, response):
    if response.status == 200:
        # Extract the content using CSS selectors.
        urls = response.css('.media__link::attr(href)').extract()
        tag_texts = response.css('.media__tag::text').extract()
        tag_urls = response.css('.media__tag::attr(href)').extract()

        urls_cleansed = self.reconcile_url_base(urls)
        tag_urls_cleansed = self.reconcile_url_base(tag_urls)
        if len(urls_cleansed) != len(tag_urls_cleansed):
            raise Exception('Length mismatch between article urls and tag urls')

        for url, tag_url, tag_text in zip(urls_cleansed, tag_urls_cleansed, tag_texts):
            url_response = requests.get(url)
            doc = Document(url_response.text)
            soup = BeautifulSoup(url_response.text, 'html.parser')

            # Cannot scrape the created time with scrapy or readability,
            # so use BeautifulSoup for that.
            date_info = soup.find('div', attrs={'class': 'date date--v2'})
            if date_info:
                created_time_epoch = int(date_info['data-seconds'])
                created_time_datetime = datetime.fromtimestamp(created_time_epoch)
            else:
                created_time_datetime = None

            cleansed_body = doc.summary()
            body_soup = BeautifulSoup(cleansed_body, 'html.parser')
            cleansed_article_text = ' '.join(
                x.get_text().replace('\n', ' ') for x in body_soup.find_all('p'))
            cleansed_article_text = self.clean_article_text(cleansed_article_text)

            itemm = BbcArticlesItem()
            itemm['title'] = doc.title()
            itemm['url'] = url
            # itemm['time'] = created_time_datetime
            # itemm['type'] = tag_url
            itemm['related_topics'] = tag_text
            itemm['article_text'] = cleansed_article_text
            yield itemm
def get(self):
    url = self.get_argument("url", None)  # e.g. https://www.ifanr.com/1080409
    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if doc:
        self.res = dict(doc)
        return self.write_json()
    try:
        sessions = requests.session()
        sessions.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36')
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)

        doc = Document(response.text)
        title = doc.title()
        summary = doc.summary()
        markdown = html2text.html2text(summary)
        markdown = markdown.replace('-\n', '-')
        markdown = markdown.strip()

        res = {}
        res['url'] = url
        res['title'] = title
        res['markdown'] = markdown
        if title and markdown:
            webcache = Webcache
            webcache.new(res)
        self.res = res
        self.write_json()
    except Exception as e:
        print(e)
def getAutoDetail(cls, contentPageNumber, html, enableDownloadImage=False, enableSnapshot=False):
    autoDetail = {}
    try:
        doc = Document(html)
        if contentPageNumber <= 1:
            autoDetail["title"] = ArticleUtils.cleanHeadTitle(doc.title())
            # autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
            # autoDetail["html"] = html
        contentSnapshot = doc.summary()
        if StringUtils.isNotEmpty(ArticleUtils.removeAllTag(contentSnapshot)):
            if enableSnapshot:
                autoDetail["contentSnapshot"] = contentSnapshot.replace(
                    "<html>", "").replace("</html>", "").replace(
                    "<body>", "").replace("</body>", "")
            autoDetail["content"] = ArticleUtils.removeTag4Content(contentSnapshot)
            if enableDownloadImage:
                # Note: `response` is not defined in this function's scope; the
                # surrounding code must make the page URL available here
                # (compare the response-based variant below).
                autoDetail["contentImages"] = ArticleUtils.get_content_image_urls(
                    contentSnapshot, response.url)
    except Exception as e:
        return autoDetail
    return autoDetail
def getAutoDetail(cls, response, enableDownloadImage=False, enableSnapshot=False, isFirstPage=True):
    autoDetail = {}
    try:
        html = "".join(response.xpath("//html").extract())
        doc = Document(html)
        if isFirstPage:
            autoDetail["title"] = doc.title()
            autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
        contentSnapshot = doc.summary()
        if enableSnapshot:
            autoDetail["contentSnapshot"] = contentSnapshot.replace(
                "<html>", "").replace("</html>", "").replace(
                "<body>", "").replace("</body>", "")
        autoDetail["content"] = ArticleUtils.removeTag4Content(contentSnapshot)
        if enableDownloadImage:
            autoDetail["contentImages"] = ArticleUtils.get_content_image_urls(
                contentSnapshot, response.url)
    except Exception as e:
        return autoDetail
    return autoDetail
def get_text(url):
    response = get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    body = bs4.BeautifulSoup(summary, features="lxml").get_text()
    return f"{title} : {body}"
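# A minimal usage sketch for get_text() above (not from the original source).
# It assumes the module already has `import bs4`, `from requests import get`,
# and `from readability import Document`; the URL is a placeholder.
if __name__ == "__main__":
    # Print the first few hundred characters of "<title> : <body text>".
    print(get_text("https://example.com/some-article")[:300])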
def extract():
    url = request.form['site']
    response = requests.get(url)
    doc = Document(response.text)

    parser = html2text.HTML2Text()
    parser.ignore_links = True
    parser.ignore_images = True
    parser.ignore_emphasis = True
    parser.ignore_anchors = True
    parser.ignore_tables = True

    title = doc.title()
    # Crop everything after "- " (e.g. a trailing publication name).
    title = re.sub(r' *- [-a-zA-Z0-9 @:%._\+~#=]{1,256}', '', title)
    # Same idea, but for bare website domains appended to the title.
    title = re.sub(r' *- *[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', title)

    article = parser.handle(str(doc.summary()))
    article = fix_article(article)
    article = article.split("<br/>")

    if url != '':
        insert_in_db(analytics_client, container, url, title, article)
        return flask.render_template('index.html', title=title, data=article)
    else:
        error_msg = "Enter a valid URL"
        return flask.render_template('index.html', error=error_msg)
def parse(html):
    doc = Document(html)
    title = doc.title()
    if title == u'[no-title]':
        title = u''
    content_html = doc.summary()
    content_html = content_html.replace(u'<html>', u'').replace(u'</html>', u'') \
        .replace(u'<body>', u'').replace(u'</body>', u'')

    # Strip scripts, images and links, then pick the longest date-like match
    # as the post date.
    clear_paths = [u'//script', u'//img', u'//a']
    body = clearDOM(html, clear_paths)
    match_list = findTimeStr(body)
    post_date = u''
    for match_item in match_list:
        if len(match_item) > len(post_date):
            post_date = match_item

    style_in_list = []
    style_need_replace = []
    content_item = {
        u'title': title,
        u'content_html': content_html,
        u'post_date': post_date,
        u'style_in_list': style_in_list,
        u'style_need_replace': style_need_replace
    }
    return content_item
def get_article(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        # Can raise (timeout, TooManyRedirects, ...) if the server does not respond.
        response = requests.get(url, headers=headers, timeout=4)
    except BaseException:
        # Return an empty pair so callers can always unpack (content, title).
        return '', ''
    readability_doc = Document(response.text)
    # doc.summary() is not really a summary, just the main part of the
    # website's content.
    html_content = readability_doc.summary()
    title = readability_doc.title()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collapse excessive whitespace and blank lines.
    content = re.sub(r"(\s{2,})|(\n{2,})", "\n", soup.get_text(), flags=re.UNICODE)
    if " - " in title:
        title = re.sub(r" - [\s\w]+$", "", title)
    return content, title
def analyze(request):
    'API text analyze view'
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # Catch POST form submissions as well.
            for key in request.POST.dict().keys():
                text = key
        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            page = requests.get(text)
            doc = Document(page.text)
            soup = BeautifulSoup(doc.summary(), 'html.parser')
            text = soup.get_text()
            title = doc.title().strip()
            text = '{0}.\n{1}'.format(title, text)
        if not text:
            response = JsonResponse(
                {'status': 'false', 'message': 'need some text here!'})
            response.status_code = 400
            return response
        # Limit very large inputs.
        text = text[:200000]
        ret = analyze_text(text)
        return JsonResponse(ret)
    else:
        ret = {'methods_allowed': 'POST'}
        return JsonResponse(ret)
def feedtheURLs(url, fileName):
    # Sina blog pages need special handling: cut the article body out of the
    # raw page between its begin/end markers before handing it to readability.
    if ("sina" in str(url)):
        content = urlopen(url).read()
        str_start = '<!--博文正文 begin -->'
        str_end = '<!-- 正文结束 -->'
        start = content.find(str_start)
        end = content.find(str_end)
        con1 = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n'
        con2 = content[start:end]
        body = con1 + con2
        doc = Document(body)
    else:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except Exception:
            return
        doc = Document(response.text)

    try:
        # For Sina pages the title is taken from the first <h2> tag instead of
        # readability's title.
        if ("sina" in str(url)):
            title = str(re.findall(r"<h2.*?/h2>", body)[0]).decode('utf-8')
        else:
            title = doc.title()
        summary = doc.summary()
        title = strip_tags(title)
        summary = strip_tags(summary)
    except Exception:
        return

    with open("Crawler_Output/{}.txt".format(fileName), "a") as my_file:
        # Only keep articles that actually contain Chinese characters.
        if (re.search(u'[\u4e00-\u9fff]', summary)):
            my_file.write("标题:" + title.encode('utf-8'))
            my_file.write("\n链接:" + url)
            my_file.write("文章内容:\n" + summary.encode('utf-8'))
        else:
            my_file.write("标题:" + "该链接是无效链接")
            my_file.write("\n链接:" + url)
            my_file.write("文章内容:\n" + "该文章已经被删除或网络请求错误")

    # Re-read the output file, drop blank lines, and append a separator.
    clean_lines = []
    with open("Crawler_Output/{}.txt".format(fileName), "r") as f:
        lines = f.readlines()
        clean_lines = [l.strip() for l in lines if l.strip()]
    with open("Crawler_Output/{}.txt".format(fileName), "w") as f:
        f.writelines('\n'.join(clean_lines))
        f.write('''
========================================================
''')
def extract_core_html(html: str):
    """Extract the core HTML from an article-type page.

    Args:
        html (str): raw html
    """
    doc = Document(html)
    return doc.title(), doc.summary()
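# A minimal usage sketch for extract_core_html() above, assuming
# `from readability import Document` is imported and that a local file named
# `article.html` (placeholder) holds a raw page.
with open("article.html", "r", encoding="utf-8") as fp:
    raw_html = fp.read()
core_title, core_html = extract_core_html(raw_html)
print(core_title)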
def process_one(self, content):
    try:
        doc = Document(content)
        return ContentResult(doc.title(), doc.summary())
    except Exception as e:
        logger.error(
            f"Readability failed on {content.title} with error {e}")
        return ContentResult('', '')
def extract(html):
    """
    Simply uses regex and readability to get document body from html
    """
    doc = Document(html)
    json_ret = {
        "title": tag_re.sub('', doc.title()),
        "body": tag_re.sub('', doc.summary()).replace("\n", " ").replace("\xa0", " ")
    }
    return json_ret
def transform(self, row, chan):
    row['response'] = resolve_future(row['response'])
    doc = Document(row['response'].content)
    row['title'] = doc.title()
    summary = doc.summary()
    row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()
    yield row
def extract_title_and_summary(content):
    doc = Document(content)
    title = doc.title()
    try:
        lang = detect(title)
    except LangDetectException:
        lang = 'unknown'
    s = TAG_RE.sub('', doc.summary())
    s = ' '.join([x for x in s.split() if x.strip() != ''])
    return title, lang, s
def get_manchete(k):
    soup = BeautifulSoup(k, 'lxml')
    # manchete = soup.findAll('h1', {'class': 'content-head__title'})
    manchete = soup.findAll('h1', {'property': 'na:headline'})
    try:
        manchete_ok = manchete[0].text
    except IndexError:
        # Fall back to readability's title when the headline tag is missing.
        page_content = Document(k)
        manchete_ok = page_content.title()
    return manchete_ok
def parse(self, file_path):
    with open(file_path) as fin:
        content = fin.read()
    doc = Document(content)
    title = doc.title()
    article = doc.summary()
    readable_article = self.strip(article)
    readable_title = self.strip(title)
    return readable_title + " " + readable_article
def extract_article(url, ip):
    """Extracts the article using readability."""
    title, summary = None, None
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        summary = unicode(doc.summary())
        title = unicode(doc.title())
    # Returns (None, None) when the page could not be fetched.
    return title, summary
def parallelizable_request(self, entry):
    req = requests.get(entry["link"])
    if not req.ok:
        print(f"Honk! Couldn't grab content for {self.feed_url}")
        return None
    doc = Document(req.content)
    # Use the second label of the hostname (e.g. "example" in
    # "www.example.com") as the story's byline/source.
    source = entry["link"].split(".")[1]
    story = Story(doc.title(), body_html=doc.summary(), byline=source)
    return story
def compare(request):
    'API compare documents view'
    doc_dicts = []
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # Catch POST form submissions as well.
            for key in request.POST.dict().keys():
                text = key
        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            lines = text_to_list(text)
            i = 0
            for line in lines[:2]:
                if not line.startswith(('http://', 'https://', 'www')):
                    response = JsonResponse(
                        {'status': 'false', 'message': 'need at least 2 urls!'})
                    response.status_code = 400
                    return response
                page = requests.get(line)
                doc = Document(page.text)
                soup = BeautifulSoup(doc.summary(), 'html.parser')
                text = soup.get_text()
                title = doc.title().strip()
                text = '{0}.\n{1}'.format(title, text)
                if not text:
                    response = JsonResponse(
                        {'status': 'false', 'message': 'need some text here!'})
                    response.status_code = 400
                    return response
                # Limit very large inputs.
                text = text[:200000]
                doc = text_to_doc(text)
                language = doc.lang_
                if i > 0 and language != doc_dicts[0]['language']:
                    response = JsonResponse(
                        {'status': 'false', 'message': 'texts must be in same language!'})
                    response.status_code = 400
                    return response
                doc_dicts.append({'language': language, 'doc': doc})
                i += 1
            ret = compare_docs(doc_dicts)
            ret['language'] = language
            ret['text'] = text
            return JsonResponse(ret)
        else:
            response = JsonResponse(
                {'status': 'false', 'message': 'need 2 documents!'})
            response.status_code = 400
            return response
    else:
        return JsonResponse({'methods_allowed': 'POST'})
def get_manchete(k):
    # soup = BeautifulSoup(k, 'lxml')
    soup = k
    # manchete = soup.findAll('h1', {'class': 'content-head__title'})
    manchete = soup.findAll('h1', {'class': 'articulo-titulo'})
    try:
        manchete_ok = manchete[0].text
    except IndexError:
        # Fall back to readability's title when the headline tag is missing.
        page_content = Document(k)
        manchete_ok = page_content.title()
    return manchete_ok
def extract_article(html_content, language="zh", holding_url="http://127.0.0.1/"):
    doc = Document(html_content)
    title = doc.title()
    html = doc.summary(True)

    # Run newspaper's Article parser over the readability-cleaned HTML.
    article = Article(url=holding_url, language=language)
    article.download(input_html=html)
    article.parse()
    return title, article.text
def get_article_content(html_content, title=None):
    doc = Document(html_content)
    # 'html_content' is used for display; existing class attributes are removed.
    html_content = re.sub('class=".*?"', '', doc.summary(html_partial=False))
    # 'content' is used for search.
    content = BeautifulSoup(html_content, "html.parser").text
    return {
        'title': title if title else doc.title(),
        'content': content,
        'html_content': html_content,
    }
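# A hedged usage sketch for get_article_content() above: feed it raw HTML,
# index the plain-text 'content' field, and keep 'html_content' for display.
# Assumes `requests`, `re`, `BeautifulSoup`, and `Document` are imported as in
# the snippet; the URL is a placeholder.
page = requests.get("https://example.com/post/123")
record = get_article_content(page.text)
print(record['title'])
print(record['content'][:200])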
def populate_from_html(self, html):
    doc = Document(html)
    title = doc.title()
    body = doc.summary(html_partial=True)
    body_plain_text = Item.strip_tags(body)

    self.source_response_raw = html
    self.title = title
    self.body = body
    self.body_plain_text = body_plain_text
def view_html(url):
    """Converts an html document to a markdown'd string using my own fork
    of python-readability"""
    try:
        from readability import Document
    except ImportError:
        print("Can't convert document: python-readability is not installed")
        return
    html = urlopen(url).read()
    doc = Document(html)
    print(wrap(asciify(BOLD + doc.title() + RESET + "\n" + doc.markdown(),
                       strip_newlines=False), 80, ''))
def abc(url):
    # url = "https://github.com/codelucas/newspaper"
    r = requests.get(url)
    r.encoding = "utf-8"
    doc = Document(r.text)
    print(doc.title())
    print(doc.summary())

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.bypass_tables = False
    text = text_maker.handle(doc.summary())
    print(text)
def saveHTML(url, img=True, prefix=''):
    if img:
        text = generate(url)
    else:
        response = requests.get(url)
        text = response.text
    doc = Document(text)
    content = fixHeader(doc.summary())
    filename = os.path.join(prefix, doc.title() + '.html')
    with open(filename, 'w') as f:
        f.write(content)
    return filename
def translate(link):
    global url
    global text_nodes
    global text_strings
    global markup

    dest = "en"
    url = link
    parser = HtmlParser()
    response = requests.get(url)
    doc = Document(response.text)
    title = doc.title()

    # Skip articles that already appear to be in English.
    lang = translator.detect(title).lang
    if lang == 'en':
        return 'null'

    title = translator.translate(title).text
    content = doc.summary()
    soup = bs(content, 'lxml')
    text = str(soup.find('body'))

    # Downgrade h1/h2 headings to h3 so they render sensibly when posted.
    repls = ('h1>', 'h3>'), ('h2>', 'h3>'), ('<h1', '<h3'), ('<h2', '<h3')
    text = reduce(lambda a, kv: a.replace(*kv), repls, text)
    text = emoji.get_emoji_regexp().sub(r'', text)  # remove emojis

    parser.feed(text)

    # Translate the collected text nodes and splice them back into the markup.
    translations = translator.translate(text_strings, dest=str(dest))
    final_payload = []
    for translation in translations:
        final_payload.append(translation.text)
    markup = markup.format(*final_payload)
    markup = re.sub(r'\s([?.!"](?:\s|$))', r'\1', markup)

    # Publish the translated article to Telegraph.
    access_token = os.environ.get("access_token")
    t = TelegraphPoster(access_token=access_token)
    article = t.post(title=str(title), author='lulz', text=str(markup))
    article = json.loads(str(article).replace("'", '"'))
    text = "Your article is ready to read! {}".format(article['url'])
    return text
def getContent(self, url, cate):
    try:
        page_source = urllib.request.urlopen(url)
        self.crawledList.append(url)
        html = page_source.read().decode("utf8")
        doc = Document(html)
        bsObj = BeautifulSoup(doc.summary(), "html.parser")
        title = str(doc.title())

        # Process the text: strip BBC boilerplate and non-breaking spaces.
        text = str(bsObj.text)
        text = text.replace("Image copyright", "")
        text = text.replace("\xa0", "")
        text = text.replace("\n\n", "")
        text = text.replace("- BBC News", "")

        # Category counters
        if cate == 'business':
            self.category.append('business')
            self.bu += 1
        if cate == 'entertainment':
            self.category.append('entertainment')
            self.en += 1
        if cate == 'politics':
            self.category.append('politics')
            self.po += 1
        if cate == 'sport':
            self.category.append('sport')
            self.sp += 1
        if cate == 'tech':
            self.category.append('tech')
            self.te += 1

        # Filename
        filename = url[url.rfind("/") + 1:]
        self.filename.append(filename)
        # Title
        self.title.append(title)
        # Content
        self.content.append(text)

        self.getLinkList(url)
        self.saveToTxt(title + "\n\n" + text, cate)
        self.sucess += 1
        return 1
    except Exception as error:
        print(error)
def parse(self, response: scrapy.http.Response, **kwargs):
    """
    Page parser.

    :param response: Page and response object.
    :param kwargs: A dict of parameters.
    :return: Yields page items, attachment items, and follow-up requests.
    """
    # Note: response.headers is a caseless dict.
    content_type: bytes = response.headers.get('Content-Type')
    if not content_type.startswith(b'text/html'):
        return None

    this_page = PageItem()
    article = Document(response.text, handle_failures=None)

    this_page['link_count'] = len(response.css('a[href]'))
    this_page['title'] = article.title()
    this_page['url'] = response.url
    this_page['content'] = article.summary()
    # Submit this_page to the pipeline.
    yield this_page

    # Collect other links from the page and append them to the crawl queue.
    link_list = get_links(response)
    for title, url in link_list:
        title = title.strip() if title else ''
        url = response.urljoin(url)
        if '.sit.edu.cn' not in url:
            continue
        # Separate pages from attachments. We could fetch each url and check
        # the server's 'Content-Type', but that can't be done inside parse();
        # guessing from the path is the simplest way to distinguish them
        # without fetching.
        _, path = divide_url(url)
        link_type = guess_link_type(path)
        if link_type == 'page':
            # Fetch the next page.
            yield scrapy.Request(url=url, callback=self.parse)
        elif link_type == 'attachment':
            item = AttachmentItem()
            item['referer'] = response.url  # URL of the current web page
            item['url'] = url  # URL of the attachment
            # Take the file title from the linking page.
            item['title'] = title.replace('\xa0', '').replace(' ', '')
            yield item
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive + '/*.html'):
        fname = os.path.basename(html) + '.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
        with open(savepath, 'w') as saving:
            json.dump(data, saving)
def get(self):
    sharetype = self.get_argument("sharetype", "goodlink")
    link = self.get_argument("link", '')
    user_id = self.current_user["user_id"]
    assert link
    url = link

    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if not doc:
        sessions = requests.session()
        sessions.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36')
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)
        logger.info('response.encoding {}'.format(response.encoding))

        doc = Document(response.text)
        doc_title = doc.title()
        summary = doc.summary()
        _markdown = html2text.html2text(summary)
        _markdown = _markdown.replace('-\n', '-').strip()

        res_webcache = {}
        res_webcache['url'] = url
        res_webcache['title'] = doc_title
        res_webcache['markdown'] = _markdown
        if _markdown:
            webcache = Webcache
            webcache.new(res_webcache)
    else:
        logger.info('already')
        doc_title = doc.title

    res = {
        'title': doc_title,
        'sharetype': sharetype,
        'link': link,
    }
    share = Share
    res['user_id'] = user_id
    share = share.new(res)

    user = User.by_sid(user_id)
    user.user_leaf += 10
    user.save()

    self.redirect("/share/" + str(share.id))