Example #1
def gather_informations():
    url = request.args['url']
    g = Goose(config={
        'local_storage_path': './data/' if os.environ.get('BUCKET_HOST') is not None else '.',
        'enable_image_fetching': True,
    })
    try:
        goose_response = g.extract(url=url)
        print(goose_response.top_image)
    except Exception as e:
        print('error')
        return jsonify({'error': True, 'message': str(e)}), 500

    response_img = ''
    if goose_response.top_image is not None:
        response_img = goose_response.top_image.src
    return jsonify({
        'title': goose_response.title,
        'urlRequested': url,
        'text': goose_response.cleaned_text[:200],
        'mainImage': response_img
    }), 200
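For context, a minimal registration sketch for the handler above; the Flask app instance and the '/gather' route path are assumptions, since the snippet only shows the view function.

from flask import Flask

app = Flask(__name__)
# hypothetical route path; the snippet does not show how the view is registered
app.add_url_rule('/gather', view_func=gather_informations)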
Example #2
	def parse_item(self, response):
	
		self.iter_count += 1
		
		html = response.body
		
		# Goose object to extract data from the page
		goose_extractor = Goose()
		article = goose_extractor.extract(raw_html=html)
		
		# Check that the page contains (at least) one h2 header with the word 'Examples', to tell whether it is a trope or not
		if response.css('h2').re('.Examples:.'):
			self.trope_count += 1
			follow = True
			json_file = self.generate_json(article)
			self.create_files(json_file, 'tropo')
			
			# File used to keep track of the indexed tropes
			#with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
			#	fp.write(response.url+'\n')
			
		else:
			self.non_trope_count += 1
			if 'Laconic' in response.url:
				print('Found a Laconic!')
				self.laconic_count += 1
				json_file = self.generate_json(article)
				self.create_files(json_file, 'laconic')
			else:
				print('Link ignored! (it was not a trope)')
			follow = False
		
		# Close the goose object
		goose_extractor.close()
Example #3
    def get_news(self):
        # actually visits each url and reads the articles; fetches the data when
        # searching with only the first category
        # categories takes the numbers 1, 2, 3 (several are possible)
        print('Starting article extraction')
        for url in self.urls:
            try:
                category = self.categories[self.choose_category - 1]

                g = Goose({'stopwords_class': StopWordsKorean})
                article = g.extract(url=url)
                title = article.title
                #print(title)
                content = self.read_article_contents(url)
                if content == "":
                    continue
                print(content)
                self.article_info["category"] = category
                self.article_info["contents"] = content
                self.article_info["title"] = title
                self.article_info["url"] = url
                self.articles.append(self.article_info)
                self.num_article += 1
            except Exception:
                continue

        return self.articles
Example #4
    def gooseChineseExample(self):

        data_list = []
        # Article URLs
        num = 0
        for url in self.Baiduurl:
            # Initialize goose3 with Chinese stopwords / word segmentation
            g = Goose({'stopwords_class': StopWordsChinese})
            # Extract the article
            article = g.extract(url=url)
            # Get the title
            title = article.title
            data_list.append('Title: ' + title)
            # Get the source
            source = self.getSource()
            data_list.append('Source: ' + str(source[num]))
            # Publish time
            Time = self.getTime()
            data_list.append('Published: ' + str(Time[num]))
            # Body text
            text = article.cleaned_text
            data_list.append('Text: ' + text)
            data_list.append(
                '============================================================================='
            )
            num += 1
        data_list = '\n'.join(data_list)
        print(data_list)
Example #5
 def parse_detail(self, response):
     # Academic lecture page: http://www.cqupt.edu.cn/cqupt/news_detail.shtml?id=155176964575282691
     # List API: http://www.cqupt.edu.cn/getPublicPage.do (plus parameters and a cookie)
     # Loaded dynamically via JS; detail API: http://www.cqupt.edu.cn/getPublicNotic.do?id=155176964575282691
     item_loader = CquptSpiderItemLoader(item=CquptSpiderItem(),
                                         response=response)
     g = Goose({'stopwords_class': StopWordsChinese})
     content = g.extract(raw_html=response.text)
     item_loader.add_value('url', response.url)
     item_loader.add_value('url_obj_id', response.url)
     item_loader.add_xpath('html_title', '/html/head/title/text()')
     item_loader.add_value('crawl_time', datetime.datetime.now())
     if len(content.cleaned_text) < self.main_content_min_length:
         # Body text is too short, so treat this as a navigation or list page
         # and try to parse the SEO information instead
         item_loader.add_xpath(
             'meta_description',
             "/html/head/meta[@name='description']/@content")
         item_loader.add_xpath(
             'meta_keywords',
             "/html/head/meta[@name='keywords']/@content | "
             "/html/head/meta[@name='Keywords']/@content")
         item_loader.add_value('tags', content.title)
     else:
         item_loader.add_value('meta_keywords', content.meta_keywords)
         item_loader.add_value('meta_description', content.meta_description)
         item_loader.add_value('title', content.title)
         item_loader.add_value('create_date', content.publish_date)
         item_loader.add_value('authors', content.authors)
         item_loader.add_value('top_image', content.top_image)
         item_loader.add_value('tags', content.tags)
         item_loader.add_value('content', content.cleaned_text)
     item = item_loader.load_item()
     return item
Example #6
def gooseChineseExample():
    g = Goose({'stopwords_class': StopWordsChinese})
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text[:150])
Example #7
 def get_news_result_cnt(self, news_url):
     config = Configuration()
     config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
     config.stopwords_class = StopWordsChinese
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}
     g = Goose(config)
     article = g.extract(news_url)
     try:
         published_time = int(
             parse(article.publish_date).timestamp() *
             1000) if article.publish_date else None
     except Exception:
         published_time = int(
             time.mktime(time.strptime(article.publish_date, "%Y年%m月%d日")) *
             1000) if article.publish_date else None
     news_post = dict(doc_id=md5(
         article.final_url.encode('utf-8')).hexdigest(),
                      keyword='',
                      url=article.final_url,
                      title=article.title,
                      platform='news',
                      content=article.cleaned_text,
                      author=article.authors,
                      source=self.source,
                      published_time=published_time,
                      spi_time=int(time.time() * 1000))
     return news_post
Example #8
    def get_content(link):
        g = Goose({
            'use_meta_language': False,
            'target_language': 'id',
            'enable_image_fetching': True,
        })
        extract = g.extract(url=link)

        content = extract.cleaned_text
        content = GetContent.remove_publisher(content)
        content = content.replace('."', '. ')
        content = content.replace('\n', ' ').replace('   ', ' ').replace('  ', ' ').replace("\'", "").strip('-').strip()
        content = re.sub(r'[^\x00-\x7F]+', '', content)
        content = content.replace(' ...', '.').replace('.. .', '. ')
        content = GetContent.brut_split(content)
        content = content.replace('.CO', '').replace('.COM', '').replace('. CO', '').replace('. COM', '')
        content = content.strip('.').strip() + '.'
        content = GetContent.remove_baca(content)
        spoiler = content[:150] + '...'
        try:
            image = extract.top_image
            image_src = image.src
        except Exception:
            image_src = ''

        if len(content) <= 500:
            return "Not Valid"
        else:
            return content, spoiler, image_src
Example #9
def get_article_content(url):
    try:
        logger.info("Getting article content of " + url + " with Goose")
        goose_config = {
            'browser_user_agent': 'Mozilla',
            'parser_class': 'lxml',  # soup or lxml for parsing xml and html
            # 'enable_image_fetching': True,
            'http_timeout': browser_timeout
        }

        if config["proxy"]["enabled"].lower() == "true":
            goose_config["http_proxy"] = config["proxy"]["http_ip_port"]
            goose_config["https_proxy"] = config["proxy"]["https_ip_port"]

        g = Goose(goose_config)
        logger.debug("Goose current parser is {}".format(
            g.config.get_parser()))
        article = g.extract(url=url)
        logger.debug("Extracted content of article from {}".format(url))
        content = article.cleaned_text.replace("\n", " ")
        cleaned_text = article.cleaned_text
        paragraphs_list = cleaned_text.split('\n')

        logger.debug(content)

        return {"content": content, "paragraphs_list": paragraphs_list}
    except Exception:
        logger.exception(
            "Error getting article's content from {}".format(url))
        erroneous_urls.append({"url": url, "error": "Unable to get content"})
        content = ""
        return {"content": content, "paragraphs_list": list()}
Example #10
 def __init__(self, html):
     """
     @param html: str
     """
     self.html = html
     self.soup = BeautifulSoup(html, 'lxml')
     self.goose = Goose({'enable_image_fetching': False})
Example #11
def get_paragrams(search_res):
    """
        Args:
            search_res:返回一组搜索结果和链接

        Returns:
            clean_res:返回所有链接的正文段落

        Raises:
            e:文章段落分割异常
    """
    paras = []
    goose = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup',
                   'stopwords_class': StopWordsChinese})  # set goose parameters
    for ind, res_elem in enumerate(search_res):
        try:
            res_herf = res_elem[1]
            if get_access_result(target_url=res_herf) is None:  # check whether the site is reachable
                print('Cannot access website: ' + res_herf)
                continue
            article = goose.extract(url=res_herf)  # extract the body text (errors handled below)
            paras.extend(article.cleaned_text.split('\n'))  # split into paragraphs
        except Exception as e:
            print("Fail to split paragrams in", res_elem[1], end='  ')
            print(e)
            continue
    return paras
Example #12
def get_links(website_url: str) -> list:
    '''
        It is used to get the links in a web page
    :param website_url: the URL of the website that is going to be extracted
    :return: a link list, after a rough selection
    '''
    links = set()
    # create goose and bs4 instances
    g = Goose()
    try:
        main_page = g.extract(url=website_url)
        soup = BeautifulSoup(main_page.raw_html, 'lxml')
        # Get the link
        for line in soup.find_all('a'):
            link = line.get('href')
            if link is not None:
                # a few links start with leading whitespace, so strip it
                link = link.strip(' ')
                links.add(link)
        print('Extracted: ', website_url)
    except Exception as e:
        # Print an error message if extraction failed
        print('Failed to extract: ', website_url, '   Error:', str(e))
    if len(links) == 0:
        print(
            'Warning! get_links() returned an empty list when extracting ',
            website_url)
    return list(links)
Example #13
def summarize(url):
    g = Goose()
    article = g.extract(url=url)
    clean = article.cleaned_text
    stopword_set = set(stopwords.words("english"))
    sentence_list = nltk.sent_tokenize(clean)

    word_frequencies = {}
    for word in nltk.word_tokenize(clean):
        if word not in stopword_set:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequency = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(4, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(summary_sentences)

    return summary
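A minimal usage sketch for summarize(); it assumes the imports the function relies on (goose3, nltk, heapq) and that the NLTK 'punkt' and 'stopwords' data are available. The URL is borrowed from Example #33.

import heapq

import nltk
from goose3 import Goose
from nltk.corpus import stopwords

nltk.download('punkt')      # one-time download of the sentence tokenizer data
nltk.download('stopwords')  # one-time download of the stopword lists

url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
print(summarize(url))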
Example #14
    def getArticle(self):
        """\

        """
        # load test case data
        self.loadData()
        self.loadHtml()

        # basic configuration
        # no image fetching
        config = self.getConfig()
        self.parser = config.get_parser()

        # target language
        # needed for non english language most of the time
        target_language = self.data.get('target_language')
        if target_language:
            config.target_language = target_language
            config.use_meta_language = False

        with requests_mock.Mocker(real_http=True) as m:
            m.get(self.data['url'], text=self.html)
            # run goose
            g = Goose(config=config)
            return g.extract(url=self.data['url'])
Example #15
def goose_scraper(link):
    '''
    Returns cleaned text using the Python goose3 API
    '''
    g = Goose()
    article = g.extract(link)
    return article.cleaned_text
Example #16
def get_text():
    g = Goose()
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text)
    return article.cleaned_text
Example #17
 def get_news_result_cnt(self, news_url, keyword=''):
     config = Configuration()
     config.browser_user_agent = self.ua
     config.stopwords_class = StopWordsChinese
     config.http_proxies = {
         'http': self.proxy,
         'https': self.proxy
     }
     g = Goose(config)
     article = g.extract(news_url)
     text_html = article.raw_html
     text_tree = etree.HTML(text_html)
     if article.cleaned_text:
         cont = article.cleaned_text
     else:
         cont = ''.join(text_tree.xpath('//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()')).replace('\xa0', '')
     art_title = article.title
     news_post = dict(
         doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
         keyword=keyword,
         url=article.final_url,
         title=art_title,
         platform='news',
         content=cont,
         author=article.authors,
         source=self.source,
         published_time=int(parse(article.publish_date).timestamp() * 1000) if article.publish_date else None,
         spi_time=int(time.time() * 1000)
     )
     return news_post
Example #18
def textExtractor(urlList):
    """
    Extract texts from tweets urls, back with tid with extracted text list
    :param urlList: filtered url list
    :return: a list contain twitter ID with all text extracted from url links
    """
    # urlList: list of urls with tid
    print('start text extraction from url')
    g = Goose()
    if urlList:
        textList = []
        time_out = time.process_time() + 5

        while time.process_time() <= time_out:
            for url in urlList:
                print(url[0])
                try:  # 10 min timeout, in case url not working properly or taking too long
                    article = g.extract(url=url[1])
                    text = article.cleaned_text
                    textList.append((url[0], text))
                    # with open(
                    #         r"C:\\Users\\no281\\Documents\\harVeyTwitter\\articalExtracted\\test\\" + str(
                    #             url[0]) + ".txt", 'w') as outfile:
                    #     outfile.write(text)
                    # outfile.close()
                except:
                    print('url break, continue')
    return textList
Example #19
def content_extractor():
    if request.method == 'GET':
        return "<h1>Yes, the server's running</h1>"
    if request.method == 'POST':
        # to handle the absurd CORS problems - figure out how to do JSON
        data = str(request.data, encoding='utf-8')

        # actual content extraction
        url = data
        g = Goose(config={'enable_image_fetching': True})
        article = g.extract(url=url)
        # when you have in extension form, `data` will be the targetSiteURL's
        # raw html. Hence you'll have the following commands:
            # raw_html = data
            # article = g.extract(raw_html=raw_html)
        # Right now, though, goose handles getting the html

        # if image available send that also
        img_src = ""
        if article.top_image:
            img_src = article.top_image.src

        res_dict = {
            'title': article.title,
            'img_src': img_src,
            'content': article.cleaned_text
        }
        response = jsonify(res_dict)
        return response
Example #20
def extract_article_information_from_html(html):
    """
    This methods gets a website the HTML as string and extracts the text of
    the article

    :param html: a HTML object from package requests
    :return: the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()

    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title

    newspaper_text = article_newspaper.text
    # run with newsplease
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text
    # run with goose
    goose_extractor = Goose()
    goose_extractor = goose_extractor.extract(raw_html=html)
    article_goose = goose_extractor.cleaned_text
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose
    return article_information
Example #21
def get_articles(path,
                 news_website='https://www.yahoo.com/news/',
                 max_articles=150):
    # articles should be saved in the /articles folder
    # See the Goose and newspaper3k documentation for an explanation of how to
    # use these packages (tried to use Beautiful Soup for this but it was
    # frustratingly difficult, since it seems that the news pages only load
    # their content when someone is actually on the site, instead of loading
    # everything automatically).
    # https://github.com/goose3/goose3 for goose documentation
    os.chdir(path)
    paper = newspaper.build(news_website)
    g = Goose()
    i = 0
    for article in paper.articles:
        if 'html' in article.url:
            i += 1
            print(article.url)
            print(i)
            if i != max_articles:
                url = article.url
                article_extr = g.extract(url=url)
                with open('title-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.title)
                with open('article-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.cleaned_text)
                with open('topic-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.domain)
            else:
                break
        else:
            continue
Example #22
    def read_articles(self, headlines=None, save_continuously=False, save_dir=""):
        if headlines is None:
            headlines = self.headlines
        extractor = Goose()
        for date, daily_news in headlines.items():
            # Shuffle since if there are too many some will be ignored
            # and we want the ignored ones to be randomly deselected
            shuffle(daily_news)

            news_read = []
            for new in daily_news:
                try:
                    body = extractor.extract(url=new["link"]).cleaned_text
                    news_read.append({**new, "body": body})
                    if len(news_read) == settings["max_news_per_day"]:
                        break
                except NetworkError:
                    logger.error("Page not found in {}".format(new["link"]))
                except MissingSchema:
                    logger.warning("Couldn't read link {}".format(new["link"]))
                    logger.warning("  Reason: string 'http://' might be missing")
                except Exception as e:
                    logger.warning("Unknown exception while trying to read {}".format(new["link"]))
                    logger.warning("   {}".format(e))
            if len(news_read) > 0:
                self.news[date] = news_read
                if save_continuously:
                    if save_dir == "":
                        logger.warning("Please provide a save directory")
                    else:
                        self.save_news(save_dir, {date: news_read})
        logger.info("From {} headlines, {} of their articles where correctly downloaded".format(
            sum([len(headers) for headers in self.headlines.values()]),
            sum([len(day_news) for day_news in self.news.values()])))
        return self.news
Example #23
def fullNews(link, feed):

    g = Goose()
    try:
        article = g.extract(url=link)
        createfiles(feed, article.title, article.cleaned_text)
    except Exception as e:
        print('error:', e)
Example #24
def body(url):
    g = Goose()
    article = g.extract(url=url)
    article = article.cleaned_text
    article = article.replace('"', '')
    article = " ".join(article.split())
    return article
Example #25
def get_reading():
    global body
    try:
        g = Goose({'browser_user_agent': useragent_generator()})
        reading = g.extract(url=BASE_URL)
        body = reading.cleaned_text
    except Exception:
        body = 'None'
Example #26
def body(url):
	g = Goose()
	article = g.extract(url=url)
	article = article.cleaned_text
	article = article.replace('"', '')
	article = " ".join(article.split())
	return article
Example #27
 def __init__(self, url, feed_id):
     self.goose = Goose({'enable_image_fetching': False})
     self.url = url
     self.feed_id = feed_id
     # TODO: a race condition can overwrite modifications; will fix with a SQL database
     self.feed_articles = Feeds.get(self.feed_id)
     if self.feed_articles is None:
         self.feed_articles = {}
Example #28
	def extract_article(self):
		'''
		returns a goose article object
		'''

		gooser = Goose()
		article = gooser.extract(url=self.url)
		return article
Example #29
 def _extract_content(self, html):
     # temporarily swap in the custom extraction hooks
     ContentExtractor.calculate_best_node = calculate_best_node
     ContentExtractor.post_cleanup = post_cleanup
     g = Goose({'enable_image_fetching': False})
     article = g.extract(raw_html=html)
     # restore the previous implementations (f1 and f2 are defined elsewhere in the module)
     ContentExtractor.calculate_best_node = f1
     ContentExtractor.post_cleanup = f2
     return article.cleaned_text
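A plausible sketch of where f1 and f2 could come from (an assumption; the snippet does not show their definition): the unpatched methods are saved at module level so they can be restored after extraction.

# hypothetical module-level setup, not shown in the snippet above
f1 = ContentExtractor.calculate_best_node
f2 = ContentExtractor.post_cleanup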
Example #30
def find_time_unsw(url):
    # print('===find_time_unsw===')
    g = Goose()
    page = g.extract(url=url)
    soup = BeautifulSoup(page.raw_html, 'lxml')
    metas = soup.find_all('p')
    # the publication time is taken from the text of the last <p> on the page
    return metas[-1].get_text()
Example #31
def find_time_abc(url):
    g = Goose()
    page = g.extract(url=url)
    soup = BeautifulSoup(page.raw_html, 'lxml')
    metas = soup.find_all('meta')
    for meta in metas:
        if meta.get('property') is not None:
            if 'published_time' in meta.get('property'):
                return meta.get('content')
Example #32
It processes the output from the scraper into the format required by
the Rookie corpus ingestion pipeline

'''

from goose3 import Goose
from bs4 import BeautifulSoup
from collections import defaultdict

import pickle
import json

with open("schneier.com", "rb") as inf:
    schneier = pickle.load(inf)

g = Goose()


def get_headline(article):
    '''example input => '<a name="12">Comments from Readers</a>'''
    headline = article.split("</h4>")[0]
    headline = BeautifulSoup(headline, 'html.parser').get_text()
    return headline  # e.g. Comments from Readers


def get_pubdate(url_page):
    '''example input => /crypto-gram/archives/2007/0315.html'''
    yyyy, mody = url_page.replace(".html", "").split("/")[-2:]
    mo = mody[0:2]
    dy = mody[2:4]
    return "{}-{}-{}".format(yyyy, mo, dy)
Example #33
# coding: utf-8

# In[3]:


from goose3 import Goose
from goose3.text import StopWordsChinese
# Initialize goose3 with Chinese stopwords / word segmentation
g = Goose({'stopwords_class': StopWordsChinese})
# Article URL
url = 'https://mp.weixin.qq.com/s/zflbcF5PS06QC5YJXpiviQ'
# Extract the article
article = g.extract(url=url)
# Title
print('Title:', article.title)
# Show the body text
print(article.cleaned_text)


# In[6]:


url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
g = Goose({
    'browser_user_agent': 'Version/5.1.2 Safari/534.52.7',
    'http_timeout': 15
})
article = g.extract(url=url)
print(article.meta_description)
print(article.meta_keywords)
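The same settings can also be applied through a Configuration object passed to Goose, as Examples #7 and #17 do; a minimal sketch (assuming Configuration is importable from goose3, and reusing the url defined above):

from goose3 import Goose, Configuration

config = Configuration()                                     # assumed importable from goose3
config.browser_user_agent = 'Version/5.1.2 Safari/534.52.7'  # same values as the dict-based config above
config.http_timeout = 15
article = Goose(config).extract(url=url)
print(article.meta_description)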