def gather_informations():
    url = request.args['url']
    g = Goose(config={
        'local_storage_path': './data/' if os.environ.get('BUCKET_HOST') is not None else '.',
        'enable_image_fetching': True
    })
    try:
        goose_response = g.extract(url=url)
        print(goose_response.top_image)
    except Exception as e:
        print('error')
        # str(e): the exception object itself is not JSON serializable
        return jsonify({'error': True, 'message': str(e)}), 500
    response_img = ''
    if goose_response.top_image is not None:
        response_img = goose_response.top_image.src
    return jsonify({
        'title': goose_response.title,
        'urlRequested': url,
        'text': goose_response.cleaned_text[:200],
        'mainImage': response_img
    }), 200
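# A minimal client sketch for the handler above, assuming it is registered on a
# route such as '/gather' of a local dev server; the route and the article URL
# are illustrative assumptions, not taken from the original code.
import requests

resp = requests.get('http://localhost:5000/gather',
                    params={'url': 'https://example.com/some-article'})
print(resp.status_code, resp.json())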
def parse_item(self, response):
    self.iter_count += 1
    html = response.body
    # Goose object used to extract data from the page
    goose_extractor = Goose()
    article = goose_extractor.extract(raw_html=html)
    # Check that the page contains (at least) one h2 header with the word
    # 'Examples', to tell whether it is a trope or not
    if response.css('h2').re('.Examples:.'):
        self.trope_count += 1
        follow = True
        json_file = self.generate_json(article)
        self.create_files(json_file, 'tropo')
        # File used to keep track of the indexed tropes
        # with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
        #     fp.write(response.url + '\n')
    else:
        self.non_trope_count += 1
        if 'Laconic' in response.url:
            print('Found a Laconic!')
            self.laconic_count += 1
            json_file = self.generate_json(article)
            self.create_files(json_file, 'laconic')
        else:
            print('Link ignored! (it was not a trope)')
            follow = False
    # Close the goose object
    goose_extractor.close()
def get_news(self):
    # Actually visits each url and reads the articles; returns the data when
    # searching with only the first category.
    # categories takes the numbers 1, 2, 3 (several can be given).
    print('Starting article extraction')
    for url in self.urls:
        try:
            category = self.categories[self.choose_category - 1]
            g = Goose({'stopwords_class': StopWordsKorean})
            article = g.extract(url=url)
            title = article.title
            # print(title)
            content = self.read_article_contents(url)
            if content == "":
                continue
            print(content)
            self.article_info["category"] = category
            self.article_info["contents"] = content
            self.article_info["title"] = title
            self.article_info["url"] = url
            # Append a copy so earlier entries are not overwritten by later
            # updates to the shared article_info dict
            self.articles.append(dict(self.article_info))
            self.num_article += 1
        except Exception:
            continue
    return self.articles
def gooseChineseExample(self):
    data_list = []
    # Article URLs
    num = 0
    for url in self.Baiduurl:
        # Initialize Goose with Chinese word segmentation
        g = Goose({'stopwords_class': StopWordsChinese})
        # Fetch the article content
        article = g.extract(url=url)
        # Get the title
        title = article.title
        data_list.append('Title: ' + title)
        # Get the source
        source = self.getSource()
        data_list.append('Source: ' + str(source[num]))
        # Publish time
        Time = self.getTime()
        data_list.append('Publish time: ' + str(Time[num]))
        # Body text
        text = article.cleaned_text
        data_list.append('Text: ' + text)
        data_list.append('=============================================================================')
        num += 1
    data_list = '\n'.join(data_list)
    print(data_list)
def parse_detail(self, response):
    # Academic lecture example: http://www.cqupt.edu.cn/cqupt/news_detail.shtml?id=155176964575282691
    # List API: http://www.cqupt.edu.cn/getPublicPage.do plus parameters and a cookie
    # Loaded dynamically via JS; detail API: http://www.cqupt.edu.cn/getPublicNotic.do?id=155176964575282691
    item_loader = CquptSpiderItemLoader(item=CquptSpiderItem(), response=response)
    g = Goose({'stopwords_class': StopWordsChinese})
    content = g.extract(raw_html=response.text)
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_obj_id', response.url)
    item_loader.add_xpath('html_title', '/html/head/title/text()')
    item_loader.add_value('crawl_time', datetime.datetime.now())
    if len(content.cleaned_text) < self.main_content_min_length:
        # The body text is too short, so treat this as a navigation or list page
        # and try to parse the SEO metadata instead
        item_loader.add_xpath(
            'meta_description',
            "/html/head/meta[@name='description']/@content")
        item_loader.add_xpath(
            'meta_keywords',
            "/html/head/meta[@name='keywords']/@content | "
            "/html/head/meta[@name='Keywords']/@content")
        item_loader.add_value('tags', content.title)
    else:
        item_loader.add_value('meta_keywords', content.meta_keywords)
        item_loader.add_value('meta_description', content.meta_description)
        item_loader.add_value('title', content.title)
        item_loader.add_value('create_date', content.publish_date)
        item_loader.add_value('authors', content.authors)
        item_loader.add_value('top_image', content.top_image)
        item_loader.add_value('tags', content.tags)
        item_loader.add_value('content', content.cleaned_text)
    item = item_loader.load_item()
    return item
def gooseChineseExample():
    g = Goose({'stopwords_class': StopWordsChinese})
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text[:150])
def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    try:
        published_time = int(
            parse(article.publish_date).timestamp() * 1000) if article.publish_date else None
    except Exception:
        # Fall back to dates formatted like "2020年01月01日"
        published_time = int(
            time.mktime(time.strptime(article.publish_date, "%Y年%m月%d日")) * 1000) if article.publish_date else None
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        source=self.source,
        published_time=published_time,
        spi_time=int(time.time() * 1000))
    return news_post
def get_content(link):
    g = Goose({
        'use_meta_language': False,
        'target_language': 'id',
        'enable_image_fetching': True,
    })
    extract = g.extract(url=link)
    content = extract.cleaned_text
    content = GetContent.remove_publisher(content)
    content = content.replace('."', '. ')
    # Normalize whitespace and strip stray quotes/dashes
    content = content.replace('\n', ' ').replace('  ', ' ').replace('  ', ' ').replace("\'", "").strip('-').strip()
    content = re.sub(r'[^\x00-\x7F]+', '', content)
    content = content.replace(' ...', '.').replace('.. .', '. ')
    content = GetContent.brut_split(content)
    content = content.replace('.CO', '').replace('.COM', '').replace('. CO', '').replace('. COM', '')
    content = content.strip('.').strip() + '.'
    content = GetContent.remove_baca(content)
    spoiler = content[:150] + '...'
    try:
        image = extract.top_image
        image_src = image.src
    except Exception:
        image_src = ''
    if len(content) <= 500:
        return "Not Valid"
    return content, spoiler, image_src
def get_article_content(url):
    try:
        logger.info("Getting article content of " + url + " with Goose")
        goose_config = {
            'browser_user_agent': 'Mozilla',
            'parser_class': 'lxml',  # soup or lxml for parsing xml and html
            # 'enable_image_fetching': True,
            'http_timeout': browser_timeout
        }
        if config["proxy"]["enabled"].lower() == "true":
            goose_config["http_proxy"] = config["proxy"]["http_ip_port"]
            goose_config["https_proxy"] = config["proxy"]["https_ip_port"]
        g = Goose(goose_config)
        logger.debug("Goose current parser is {}".format(g.config.get_parser()))
        article = g.extract(url=url)
        logger.debug("Extracted content of article from {}".format(url))
        content = article.cleaned_text.replace("\n", " ")
        cleaned_text = article.cleaned_text
        paragraphs_list = cleaned_text.split('\n')
        logger.debug(content)
        return {"content": content, "paragraphs_list": paragraphs_list}
    except Exception:
        logging.exception("Error getting article's content from {}".format(url))
        erroneous_urls.append({"url": url, "error": "Unable to get content"})
        content = ""
        return {"content": content, "paragraphs_list": list()}
def __init__(self, html):
    """
    @param html: str
    """
    self.html = html
    self.soup = BeautifulSoup(html, 'lxml')
    self.goose = Goose({'enable_image_fetching': False})
def get_paragrams(search_res):
    """
    Args:
        search_res: a list of search results and their links
    Returns:
        paras: the body paragraphs extracted from every link
    Raises:
        e: exception raised while splitting an article into paragraphs
    """
    paras = []
    # Configure goose: Mozilla user agent, soup parser, Chinese stopwords
    goose = Goose({'browser_user_agent': 'Mozilla',
                   'parser_class': 'soup',
                   'stopwords_class': StopWordsChinese})
    for ind, res_elem in enumerate(search_res):
        try:
            res_herf = res_elem[1]
            if get_access_result(target_url=res_herf) is None:
                # Check whether the site is reachable
                print('Can\'t access to website:' + res_herf)
                continue
            article = goose.extract(url=res_herf)  # body extraction, with exception handling
            paras.extend(list(article.cleaned_text.split()))  # split into paragraphs
        except Exception as e:
            print("Fail to split paragrams in", res_elem[1], end=' ')
            print(e)
            continue
    return paras
def get_links(website_url: str) -> list:
    '''
    Get the links contained in a web page.
    :param website_url: the URL of the website to be extracted
    :return: a list of links, after a rough selection
    '''
    links = set()
    # Create goose and bs4 instances
    g = Goose()
    try:
        main_page = g.extract(url=website_url)
        soup = BeautifulSoup(main_page.raw_html, 'lxml')
        # Collect the links
        for line in soup.find_all('a'):
            link = line.get('href')
            if link is not None:
                # A few links start with whitespace, so strip it
                link = link.strip(' ')
                links.add(link)
        print('Extracted: ', website_url)
    except Exception as e:
        # Print the error message if extraction failed
        print('Fail to extract: ', website_url, ' Error:', str(e))
    if len(links) == 0:
        print('Warning! Function: get_links() output empty list when extracting ', website_url)
    return list(links)
def summarize(url):
    g = Goose()
    article = g.extract(url=url)
    clean = article.cleaned_text
    stopword_set = set(stopwords.words("english"))
    sentence_list = nltk.sent_tokenize(clean)

    # Count how often each non-stopword appears in the article
    word_frequencies = {}
    for word in nltk.word_tokenize(clean):
        if word not in stopword_set:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score sentences (shorter than 30 words) by their word frequencies
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(4, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary
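# A minimal usage sketch for summarize(); it assumes the NLTK 'punkt' and
# 'stopwords' corpora are available locally, and the URL is illustrative.
import nltk

nltk.download('punkt')      # sentence tokenizer used by nltk.sent_tokenize
nltk.download('stopwords')  # English stopword list used above

print(summarize('https://en.wikipedia.org/wiki/Web_scraping'))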
def getArticle(self):
    """\
    """
    # load test case data
    self.loadData()
    self.loadHtml()

    # basic configuration: no image fetching
    config = self.getConfig()
    self.parser = config.get_parser()

    # target language: needed for non-English languages most of the time
    target_language = self.data.get('target_language')
    if target_language:
        config.target_language = target_language
        config.use_meta_language = False

    with requests_mock.Mocker(real_http=True) as m:
        m.get(self.data['url'], text=self.html)
        # run goose
        g = Goose(config=config)
        return g.extract(url=self.data['url'])
def goose_scraper(link):
    '''
    Returns cleaned text using the python goose3 api
    '''
    g = Goose()
    article = g.extract(link)
    return article.cleaned_text
def get_text():
    g = Goose()
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text)
    return article.cleaned_text
def get_news_result_cnt(self, news_url, keyword=''):
    config = Configuration()
    config.http_proxies = {
        'http': self.proxy,
        'https': self.proxy
    }
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    text_html = article.raw_html
    text_tree = etree.HTML(text_html)
    if article.cleaned_text:
        cont = article.cleaned_text
    else:
        # Fall back to pulling the paragraphs straight out of the detail div
        cont = ''.join(text_tree.xpath('//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()')).replace('\xa0', '')
    art_title = article.title
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=art_title,
        platform='news',
        content=cont,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000) if article.publish_date else None,
        spi_time=int(time.time() * 1000)
    )
    return news_post
def textExtractor(urlList):
    """
    Extract text from tweet URLs; returns each tweet ID with its extracted text
    :param urlList: filtered url list
    :return: a list pairing each twitter ID with all text extracted from its url link
    """
    # urlList: list of urls with tid
    print('start text extraction from url')
    g = Goose()
    if urlList:
        textList = []
        time_out = time.process_time() + 5
        while time.process_time() <= time_out:
            for url in urlList:
                print(url[0])
                try:
                    # time-boxed extraction, in case the url is not working properly or taking too long
                    article = g.extract(url=url[1])
                    text = article.cleaned_text
                    textList.append((url[0], text))
                    # with open(r"C:\\Users\\no281\\Documents\\harVeyTwitter\\articalExtracted\\test\\" + str(url[0]) + ".txt", 'w') as outfile:
                    #     outfile.write(text)
                    #     outfile.close()
                except:
                    print('url break, continue')
            return textList
def content_extractor():
    if request.method == 'GET':
        return "<h1>Yes, the server's running</h1>"
    if request.method == 'POST':
        # to handle the absurd CORS problems - figure out how to do JSON
        data = str(request.data, encoding='utf-8')
        # actual content extraction
        url = data
        g = Goose(config={'enable_image_fetching': True})
        article = g.extract(url=url)
        # In the extension form, `data` will be the target site's raw HTML.
        # In that case you would use the following commands instead:
        # raw_html = data
        # article = g.extract(raw_html=raw_html)
        # Right now, though, goose handles getting the html.

        # if an image is available, send that also
        img_src = ""
        if article.top_image:
            img_src = article.top_image.src
        res_dict = {
            'title': article.title,
            'img_src': img_src,
            'content': article.cleaned_text
        }
        response = jsonify(res_dict)
        return response
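# A minimal client sketch for the handler above, assuming it is registered on a
# route such as '/extract' of a local dev server; the route is an illustrative
# assumption. The handler reads the raw request body as the URL, so the body is
# sent as plain data rather than JSON.
import requests

resp = requests.post('http://localhost:5000/extract',
                     data='http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html')
print(resp.json()['title'])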
def extract_article_information_from_html(html):
    """
    Extracts the article information from the HTML of a website, given as a string
    :param html: the raw HTML of the page (e.g. the body of a requests response)
    :return: the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()

    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title
    newspaper_text = article_newspaper.text

    # run with newsplease
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text

    # run with goose
    goose_extractor = Goose()
    goose_extractor = goose_extractor.extract(raw_html=html)
    article_goose = goose_extractor.cleaned_text

    # keep whichever extracted text has more words
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose

    return article_information
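# A minimal usage sketch for extract_article_information_from_html(), assuming
# the HTML is fetched with requests; the URL is illustrative.
import requests

resp = requests.get('https://example.com/some-article', timeout=10)
info = extract_article_information_from_html(resp.text)
print(info['title'])
print(info['text'][:200])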
def get_articles(path, news_website='https://www.yahoo.com/news/', max_articles=150):
    # Articles are saved in the /articles folder.
    # See the Goose and newspaper3k documentation for how to use these packages
    # (tried to use Beautiful Soup for this but it was frustratingly difficult,
    # since the news pages only seem to load fully when someone is actually on
    # the site, instead of automatically loading everything).
    # https://github.com/goose3/goose3 for goose documentation
    os.chdir(path)
    paper = newspaper.build(news_website)
    g = Goose()
    i = 0
    for article in paper.articles:
        if 'html' in article.url:
            i += 1
            print(article.url)
            print(i)
            if i != max_articles:
                url = article.url
                article_extr = g.extract(url=url)
                file = open('title-{}.txt'.format(i), 'w')
                file.write(article_extr.title)
                file.close()
                file = open('article-{}.txt'.format(i), 'w')
                file.write(article_extr.cleaned_text)
                file.close()
                file = open('topic-{}.txt'.format(i), 'w')
                file.write(article_extr.domain)
                file.close()
            else:
                break
        else:
            continue
def read_articles(self, headlines=None, save_continuously=False, save_dir=""):
    if headlines is None:
        headlines = self.headlines
    extractor = Goose()
    for date, daily_news in headlines.items():
        # Shuffle since, if there are too many, some will be ignored,
        # and we want the ignored ones to be deselected at random
        shuffle(daily_news)
        news_read = []
        for new in daily_news:
            try:
                body = extractor.extract(url=new["link"]).cleaned_text
                news_read.append({**new, "body": body})
                if len(self.news) == settings["max_news_per_day"]:
                    break
            except NetworkError:
                logger.error("Page not found in {}".format(new["link"]))
            except MissingSchema:
                logger.warning("Couldn't read link {}".format(new["link"]))
                logger.warning(" Reason: string 'http://' might be missing")
            except Exception as e:
                logger.warning("Unknown exception while trying to read {}".format(new["link"]))
                logger.warning(" {}".format(e))
        if len(news_read) > 0:
            self.news[date] = news_read
            if save_continuously:
                if save_dir == "":
                    logger.warning("Please provide a save directory")
                else:
                    self.save_news(save_dir, {date: news_read})
    logger.info("From {} headlines, {} of their articles were correctly downloaded".format(
        sum([len(headers) for headers in self.headlines.values()]),
        sum([len(day_news) for day_news in self.news.values()])))
    return self.news
def fullNews(link, feed):
    g = Goose()
    try:
        article = g.extract(url=link)
        createfiles(feed, article.title, article.cleaned_text)
    except:
        print('error')
def body(url):
    g = Goose()
    article = g.extract(url=url)
    article = str(article.cleaned_text)
    article = article.replace('"', '')
    article = " ".join(article.split())
    article = str(article)
    return article
def get_reading():
    global body
    try:
        g = Goose({'browser_user_agent': useragent_generator()})
        reading = g.extract(url=BASE_URL)
        body = reading.cleaned_text
    except:
        body = 'None'
def __init__(self, url, feed_id):
    self.goose = Goose({'enable_image_fetching': False})
    self.url = url
    self.feed_id = feed_id
    # TODO race condition overwrites modification but will fix with SQL database
    self.feed_articles = Feeds.get(self.feed_id)
    if self.feed_articles is None:
        self.feed_articles = {}
def extract_article(self):
    '''
    returns a goose article object
    '''
    gooser = Goose()
    article = gooser.extract(url=self.url)
    return article
def _extract_content(self, html):
    # Temporarily swap in the custom extractor methods, run goose on the raw
    # HTML, then restore the original methods (f1/f2)
    ContentExtractor.calculate_best_node = calculate_best_node
    ContentExtractor.post_cleanup = post_cleanup
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    ContentExtractor.calculate_best_node = f1
    ContentExtractor.post_cleanup = f2
    return article.cleaned_text
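# A minimal sketch of the save/override/restore pattern assumed by
# _extract_content above: f1 and f2 are presumed to be the original goose3
# ContentExtractor methods, captured once so they can be restored afterwards.
# The import path shown is goose3's usual location for ContentExtractor; the
# custom calculate_best_node/post_cleanup overrides live elsewhere in the
# original module and are not reproduced here.
from goose3.extractors.content import ContentExtractor

f1 = ContentExtractor.calculate_best_node  # original method objects, saved for restoring
f2 = ContentExtractor.post_cleanup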
def find_time_unsw(url):
    # print('===find_time_unsw===')
    g = Goose()
    page = g.extract(url=url)
    soup = BeautifulSoup(page.raw_html, 'lxml')
    metas = soup.find_all('p')
    # This is text: return the text of the last paragraph
    return metas[-1].get_text()
def find_time_abc(url):
    g = Goose()
    page = g.extract(url=url)
    soup = BeautifulSoup(page.raw_html, 'lxml')
    metas = soup.find_all('meta')
    for meta in metas:
        if meta.get('property') is not None:
            if 'published_time' in meta.get('property'):
                return meta.get('content')
It processes the output from the scraper into the format required by the
Rookie corpus ingestion pipeline
'''
from goose3 import Goose
from bs4 import BeautifulSoup
from collections import defaultdict
import pickle
import json

with open("schneier.com", "rb") as inf:
    schneier = pickle.load(inf)

g = Goose()


def get_headline(article):
    '''example input => '<a name="12">Comments from Readers</a>' '''
    headline = article.split("</h4>")[0]
    headline = BeautifulSoup(headline, 'html.parser').get_text()
    return headline  # e.g. Comments from Readers


def get_pubdate(url_page):
    '''example input => /crypto-gram/archives/2007/0315.html'''
    yyyy, mody = url_page.replace(".html", "").split("/")[-2:]
    mo = mody[0:2]
    dy = mody[2:4]
    return "{}-{}-{}".format(yyyy, mo, dy)
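# A quick sanity check of get_pubdate() against the example path given in its
# docstring; the expected output follows the YYYY-MM-DD format built above.
assert get_pubdate('/crypto-gram/archives/2007/0315.html') == '2007-03-15'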
# coding: utf-8

# In[3]:

from goose3 import Goose
from goose3.text import StopWordsChinese

# Initialize Goose with Chinese word segmentation
g = Goose({'stopwords_class': StopWordsChinese})
# Article URL
url = 'https://mp.weixin.qq.com/s/zflbcF5PS06QC5YJXpiviQ'
# Fetch the article content
article = g.extract(url=url)
# Title
print('Title:', article.title)
# Show the body text
print(article.cleaned_text)

# In[6]:

url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
g = Goose({
    'browser_user_agent': 'Version/5.1.2 Safari/534.52.7',
    'http_timeout': 15
})
article = g.extract(url=url)
print(article.meta_description)
print(article.meta_keywords)