def parse_reuters(self):
        try:
            self.newsdata = self.session.get(
                self.url).json()["wireitems"][0]["templates"][0]["story"]

            self.article = newspaper.fulltext(self.newsdata["body"],
                                              language=self.language)
        except Exception as e:
            print(e)
            return []

        try:
            self.caption = self.newsdata["images"][0]["caption"]

            self.picture = self.newsdata["images"][0]["url"] + "&w=200.0"

            self.resize = False
        except Exception:
            pass  # image caption/URL are optional; keep going without them

        try:
            location = self.newsdata["dateline"]

            if " (Reuters)" in location:
                self.location = location.split(" (Reuters)")[0].split("/")[0]
            elif "[" in self.article and "]" in self.article:
                self.location = self.article.split("[")[1].split("]")[0].split(
                    "日 ロイター")[0][:-1]

            if self.location == "":
                self.location = None
        except Exception:
            pass  # dateline missing or in an unexpected format
Example #2
 def search(self, topic, max=1, c=100):
     """
     Takes the input topic and collects next 100 news articles based on max
     paramter. Collects article text and author.
     """
     # Searches for headlines and gets article data
     json = self.api.get_everything(q=topic,
                                    language='en',
                                    sort_by='relevancy',
                                    page=max,
                                    page_size=c)
     articles = json['articles']
     news = list()
     for article in articles:
         # Takes the URL of the article and downloads the full content
         a = Article(article['url'])
         a.download()
         # If download failed, continue to next article
         try:
             # Takes the whole article's text and adds it to output
             text = fulltext(a.html)
             current = dict()
             current['text'] = text
             current['author'] = article['author']
             news.append(current)
         except Exception:
             pass  # text extraction failed; skip this article
     return news, max + 1
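
# Usage sketch (not part of the original snippet): search() returns the next page
# index along with the results, so repeated calls can walk through result pages.
# `client` is a hypothetical instance of the (unshown) class that defines search().
page = 1
collected = []
for _ in range(3):                      # fetch three result pages
    news, page = client.search('climate change', max=page, c=20)
    collected.extend(news)
print(len(collected), 'articles collected')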
def getarticle(readfile):
    ''' get the article and save it in a different file '''
    try:
        fileopen = open(readfile)
    except IOError:
        print "file " + readfile + " not in the location specified"
        return

    i = 1
    for line in fileopen:
        try:
            ua = generate_user_agent()
            head = ua.encode('ascii', 'ignore')
            headers = {'User-Agent': head}

            print "reading article :"
            print line
            html = requests.get(line, headers=headers).text
            tex = fulltext(html)
            writefile = "201604" + str(j) + "_" + str(i) + ".txt"
            with io.open(writefile, encoding='utf-8', mode='w+') as ns:
                strng = ' '.join(tex.split())
                ns.write(strng)
                ns.close()
            i = i + 1
        except:
            pass
def alt_extract_info(tab, driver, url):
    cookies = driver.get_cookies()
    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])
    article = s.get(url)
    text = fulltext(article.text)
Example #5
def clean_article(content):
    """Converts html text into article text"""
    result = ''
    try:
        result = fulltext(content)
    finally:  # Catch-all to ensure all broken html is discarded
        return result
Example #6
    def parse_reuters(self):
        try:
            self.newsdata = self.session.get(
                self.url).json()["wireitems"][0]["templates"][0]["story"]
        except Exception as e:
            print(e)
            return []

        try:
            self.article = newspaper.fulltext(self.newsdata["body"],
                                              language=self.language)

            self.caption = self.newsdata["images"][0]["caption"]

            self.picture = self.newsdata["images"][0]["url"] + "&w=200.0"

            self.resize = False
        except Exception as e:
            print(e)

        try:
            location = self.newsdata["dateline"]

            if location != "(Reuters)":
                self.location = location.split(" (Reuters)")[0].split("/")[0]
            elif "\uff3b" in self.location and "\u3000" in self.location:
                self.location = self.article.split("\uff3b")[1].split(
                    "\u3000")[0]

            print(self.location)
        except Exception as e:
            print(e)
def normalize_text(html):
    try:
        url_re = re.compile("https{0,1}://[^\s]+")
        url2_re = re.compile("[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile("[\s]{2,}")

        html = html.encode("ascii", errors="ignore")
        text = newspaper.fulltext(html)
        
        sent = text.encode('ascii', errors='ignore')
        sent = str(sent).replace("r\\", "")
        sent = str(sent).replace("n\\", "")
        sent = str(sent).replace("\\", "")
        text = sent

        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
            # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
            
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)

        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception as e:
        return ""
Example #8
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    try:
        text = fulltext(htmlstring)  # sanitize(fulltext(htmlstring))
    except AttributeError:
        return ''
    return text
def fetch_bookmarks(urls):
    obj = Scrape_Filter()

    bookmark_data = dict()
    start_scrapy(urls)  # populates the module-level 'fetched' dict (url -> raw html)

    for url in fetched:
        req = fetched[url]
        soup = BeautifulSoup(req, 'html5lib')
        obj.check_lang(soup)

        try:
            text = fulltext(req)
        except:
            article = Article(url)
            article.download()
            article.parse()
            text = article.text
        title = obj.get_title(soup)
        desc_keywords = obj.get_keywords_and_description(soup)
        content = obj.filter_text(text)

        bookmark_data[url] = dict()
        bookmark_data[url]["title"] = title
        bookmark_data[url]["desc"] = desc_keywords
        bookmark_data[url]["content"] = content

    return bookmark_data
Example #11
    def test_parse_html(self):
        self.setup_stage('parse')

        AUTHORS = [
            'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
            'Tom Watkins'
        ]
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        META_SITE_NAME = 'CNN'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))
Example #12
    def test_parse_html(self):
        self.setup_stage('parse')

        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))
Example #13
def extract(html, language='ru'):
    try:
        text = fulltext(html=html, language=language)
    except:
        text = ''

    return text
Example #14
def summarize(url):
    from newspaper import fulltext
    from summarizer import Summarizer  # assuming the bert-extractive-summarizer package
    import requests
    text = fulltext(requests.get(url).text)
    model = Summarizer()
    result = model(text, ratio=0.1)
    full = ''.join(result)
    return full
Example #15
def test_method(url):
    article = get_article(url)
    req = requests.get(url).text
    parser = MyHTMLParser()
    tst = parser.feed(req)

    text = fulltext(req)
Example #16
def attempt2():
    header = {
        "Accept-Encoding": "gzip",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    html = requests.get(url, headers=header).text
    text = fulltext(html)
    print(text)
Example #17
def remove_image(link):
    try:
        time.sleep(5)
        article_soup = get_html(link)
        article_soup.find('figure').decompose()
        return fulltext(str(article_soup.currentTag))
    except Exception as e:
        pass
Example #18
def get_text(url):

    from newspaper import fulltext
    import requests
    url = str(url)
    text = fulltext(requests.get(url).text)

    return text
Example #19
 def test_japanese_fulltext_extract2(self):
     url = 'http://www.afpbb.com/articles/-/3178894'
     article = Article(url=url, language='ja')
     html = mock_resource_with('japanese_article2', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('japanese2', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'ja'))
Example #20
    async def get_url(url, returned_format=None):
        if returned_format == 'html':
            print(
                '[!] HTML support is being refactored. Currently data is being returned plaintext'
            )
        r = requests.get(url)

        b = newspaper.fulltext(r.text)
        return str(b).replace('\n', '<br>') if b else None
Example #21
 def test_latvian_fulltext_extract(self):
     url = 'https://www.lsm.lv/raksts/zinas/arzemes/norvegija-pec-zemes-nogruvuma-pieci-bojagajusie.a387519/'
     article = Article(url=url, language='lv')
     html = mock_resource_with('latvian_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('latvian', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'lv'))
Example #22
 def test_thai_fulltext_extract(self):
     url = 'https://prachatai.com/journal/2019/01/80642'
     article = Article(url=url, language='th')
     html = mock_resource_with('thai_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('thai', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'th'))
Example #23
 def test_japanese_fulltext_extract(self):
     url = 'https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001'
     article = Article(url=url, language='ja')
     html = mock_resource_with('japanese_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('japanese', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'ja'))
Example #25
 def test_chinese_fulltext_extract(self):
     url = 'http://news.sohu.com/20050601/n225789219.shtml'
     article = Article(url=url, language='zh')
     html = mock_resource_with('chinese_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('chinese', 'txt')
     assert article.text == text
     assert fulltext(article.html, 'zh') == text
Example #26
 def test_chinese_fulltext_extract(self):
     url = 'http://news.sohu.com/20050601/n225789219.shtml'
     article = Article(url=url, language='zh')
     html = mock_resource_with('chinese_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('chinese', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'zh'))
Example #27
def get_html():
    news_url = request.args.get("url")
    try:
        if (news_url and len(news_url)):
            html = requests.get(news_url).text
            return render_template('news-summary.html',
                                   news_html=fulltext(html))
    except Exception as e:
        return render_template('news-summary.html', news_html=e)
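
# Wiring sketch (not part of the original snippet): get_html() reads its URL from
# request.args, so it is evidently a Flask view. The app name and route path below
# are assumptions.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/news-summary', view_func=get_html)

if __name__ == '__main__':
    app.run(debug=True)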
Example #29
 def newspaper_extractor(self,html):
     try:
         content = fulltext(html)
         if content and content != "":
             return content
         else:
             return self.readability_extractor(html)
     except:
         return self.readability_extractor(html)
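
# The readability_extractor() fallback is not shown above; this is a hedged sketch of
# what it might look like using the readability-lxml package (the package choice and
# method body are assumptions, not the original implementation).
 def readability_extractor(self, html):
     from readability import Document
     import lxml.html
     # Document.summary() returns the cleaned article HTML; lxml reduces it to text.
     doc = Document(html)
     return lxml.html.fromstring(doc.summary()).text_content()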
Example #30
 async def get_url(self, url, returned_format=None):
     if returned_format == 'html':
         logging.info(
             '[!] HTML support is being refactored. Currently data is being returned plaintext'
         )
     r = self.get_response_from_url(url)
     # Use the response text to get contents for this url
     b = newspaper.fulltext(r.text)
     return str(b).replace('\n', '<br>') if b else None
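
# Driver sketch (not part of the original snippet): get_url() is a coroutine, so it
# needs an event loop. `scraper` is a hypothetical instance of the class that defines it.
import asyncio

async def main():
    text = await scraper.get_url('https://example.com/article')
    print(text)

asyncio.run(main())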
Example #32
 def analysis_news_content(self, html, obj, newspaper=True):
     if newspaper:
         text = fulltext(html).split('\n')
         txt = list(filter(lambda x: x.strip() != '', text))
         content = '<p>'.join(txt)
         return content
     else:
         content_list= obj.xpath('//div[@class="section-content"]//text()')[7:]
         content = '<p>'.join([i.replace("\n", '').strip() for i in content_list]).replace("<p><p>", '<p>')
         return content
Example #33
 def test_spanish_fulltext_extract(self):
     url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal' \
           'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
     article = Article(url=url, language='es')
     html = mock_resource_with('spanish_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('spanish', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'es'))
Example #34
 def test_spanish_fulltext_extract(self):
     url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'\
           'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
     article = Article(url=url, language='es')
     html = mock_resource_with('spanish_article', 'html')
     article.download(html)
     article.parse()
     text = mock_resource_with('spanish', 'txt')
     assert article.text == text
     assert fulltext(article.html, 'es') == text
Example #35
 def test_arabic_fulltext_extract(self):
     url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'\
           'index.html'
     article = Article(url=url)
     html = mock_resource_with('arabic_article', 'html')
     article.download(html)
     article.parse()
     assert article.meta_lang == 'ar'
     text = mock_resource_with('arabic', 'txt')
     assert article.text == text
     assert fulltext(article.html, 'ar') == text
Example #36
 def analysis_news_content(self, html, html_obj, newspaper=False):
     if newspaper:
         text = fulltext(html).split('\n')
         txt = list(filter(lambda x: x.strip() != '', text))
         content = '<p>'.join(txt)
     else:
         content_list = html_obj.xpath('//div[@id="content"]//p//text()')
         content = '<p>'.join([
             i.replace("\n", '').strip() for i in content_list
         ]).replace("<p><p>", '<p>')
     return content
Example #37
 def test_arabic_fulltext_extract(self):
     url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/' \
           'index.html'
     article = Article(url=url)
     html = mock_resource_with('arabic_article', 'html')
     article.download(html)
     article.parse()
     self.assertEqual('ar', article.meta_lang)
     text = mock_resource_with('arabic', 'txt')
     self.assertEqual(text, article.text)
     self.assertEqual(text, fulltext(article.html, 'ar'))
def googleparser(topic, depth):
    '''
    topic - the news topic to search for
    depth - the number of search-result pages to query

    Returns two lists: one with the news texts and one with the problem links.

    '''
    # 1. build the list of links
    gnews_links = []
    gnews = []
    googlenews.search(topic)

    start_time = time.time()

    print('--- Building the list of links... ---')

    for i in range(1, depth):
        googlenews.clear()
        googlenews.getpage(i)
        for j in range(0, len(googlenews.gettext())):
            gnews.append(googlenews.gettext()[j])
            gnews_links.append(googlenews.getlinks()[j])

    print("--- Building the list took %s seconds ---" %
          (time.time() - start_time))
    print('--- Done. Collected %s links ---' % len(gnews_links))

    # 2. download the news and build the list of texts

    body = []
    count = 0
    error_link = []  # list of broken links

    # time the download
    start_time = time.time()

    print('--- Downloading the news... ---')

    for url in gnews_links:
        try:
            html = requests.get(url).text
            text = fulltext(html)
            body.append(text)
        except:
            error_link.append(
                gnews_links[count]
            )  # some links turn out to be broken; collect them here
            pass
        count += 1

    print("--- Done. Downloading took %s seconds ---" %
          (time.time() - start_time))
    return body, error_link
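
# Usage sketch (not part of the original snippet): googleparser() relies on a
# module-level GoogleNews() instance named `googlenews`, as its body implies,
# and on requests, time, and fulltext being imported.
from GoogleNews import GoogleNews

googlenews = GoogleNews(lang='en')
texts, failed_links = googleparser('economy', depth=3)
print(len(texts), 'articles downloaded;', len(failed_links), 'links failed')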
Example #39
 def test_japanese_fulltext_extract(self):
     try:
         url = 'http://www.cnn.co.jp/tech/35087106.html'
         article = Article(url=url, language='ja')
         html = mock_resource_with('japanese_article', 'html')
         article.download(html)
         article.parse()
         text = mock_resource_with('japanese', 'txt')
         self.assertEqual(text, article.text)
         self.assertEqual(text, fulltext(article.html, 'ja'))
     except Exception as e:
         print('ERR', str(e))
Example #40
    def test_parse_html(self):
        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        assert self.article.text == text
        assert fulltext(self.article.html) == text

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        assert self.article.top_img == TOP_IMG

        assert sorted(self.article.authors) == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG
        assert str(self.article.publish_date) == '2013-11-27 00:00:00'
Example #41
article.summary

## THINGS WE NEED
# source 
# url
# title
# date
# summary
# keywords

import requests
import newspaper
from newspaper import Article

cnn_paper = newspaper.build(u'http://cnn.com')

for article in cnn_paper.articles:
    print(article.url)

for category in cnn_paper.category_urls():
    print(category)

cnn_article = cnn_paper.articles[0]
cnn_article.download()
cnn_article.parse()
cnn_article.nlp()

from newspaper import fulltext

html = requests.get(...).text
text = fulltext(html)
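
# Sketch of collecting the fields listed under "THINGS WE NEED" above; the dict layout
# is an assumption, while the attributes are standard newspaper Article/Source fields.
record = {
    'source': cnn_paper.brand,          # Source objects expose the site brand
    'url': cnn_article.url,
    'title': cnn_article.title,
    'date': cnn_article.publish_date,
    'summary': cnn_article.summary,     # populated by .nlp()
    'keywords': cnn_article.keywords,   # populated by .nlp()
}
print(record)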
Example #42
        text = None
        if not os.path.exists(target_hthml_file):
            article = Article(url)
            article.download()
            print(f"url: {article.url}")
            print(f"source_url: {article.source_url}")
            print(f"message: {article.download_exception_msg}")
            if article.download_exception_msg is None:
                html = article.html
                write_text_to_file(target_hthml_file, html)

                article.parse()
                print(f"authors: {article.authors}")
                print(f"publish_date: {article.publish_date}")
                print(f"text: {article.text}")
                article.nlp()
                print(f"key words: {article.keywords}")
                print(f"summary: {article.summary}")
                text = article.text
        else:
            article = Article(url)
            html_text = read_text_from_file(target_hthml_file)            
            text = fulltext(html_text,'en')

        if text is not None:
            # print(f"text: {text}")
            # removing stop words and punctuation
            all_article_words = nltk.word_tokenize(text)
            article_tokens = [w for w in all_article_words if not w in stop_words]
            print(f"loaded: {len(article_tokens)} actual from {len(all_article_words)} total words")
            # TODO stemming
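
# A minimal sketch of the stemming step the TODO above refers to; the choice of
# NLTK's PorterStemmer is an assumption. article_tokens comes from the snippet above.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(w) for w in article_tokens]
print(f"sample stems: {stemmed_tokens[:10]}")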
Example #43
def get_newspaper_text(html):
    return fulltext(html)
Example #44
# The start of the KafkaConsumer(...) call was truncated in this snippet; it
# presumably looked like the line below (any topic arguments were lost in the cut).
consumer = KafkaConsumer(bootstrap_servers=['172.16.129.43:9092'])
producer = KafkaProducer(bootstrap_servers=['172.16.129.43:9092'])

 
 
# test run
print "start newspaper parser!"


for message in consumer:
    if message is not None:
        print "xxx"
        try:
            jsonValue = json.loads(message.value)
            html = jsonValue["html"]
            contentWithOutTag = fulltext(html, language="zh")
            for useParser in ["lxml"]:
                # pass the tag-free body text back in alongside the html for parsing
                parseHtml = extractHtml(html, contentWithOutTag, useParser)
                parseTitle = parseHtml.title()
                parsePublishDate = parseHtml.publishDate()
                parseContent = parseHtml.mainContent()
                if len(parseContent[0]) == 0:
                    parseContent[0] = contentWithOutTag
                if len(parsePublishDate) == 0 or parsePublishDate == " " or parsePublishDate is None:
                    # if no publish date can be parsed, fall back to the crawl time
                    parsePublishDate = time.asctime(time.localtime(time.time()))
               
                print "\n-----------------------------------------------------------------------------\n"
                print "url:\t", jsonValue["url"]
                print "标题:\t", parseTitle