Example #1
    def download(self):
        try:
            logger.info('Downloading article for {}'.format(
                self._pocket_item.url))
            article = Article(self._pocket_item.url)
            article.download()
            logger.info('Parsing article for {}'.format(self._pocket_item.url))
            article.parse()
            logger.info('Performing NLP on article for {}'.format(
                self._pocket_item.url))
            article.nlp()

            article.tags = list(article.tags)
            if article.publish_date:
                article.publish_date = article.publish_date.timestamp()

            article.images = list(article.images)

            self._pocket_item.article = dict(
                (k, v) for k, v in article.__dict__.items()
                if k in self.ARTICLE_ATTRIBUTES_TO_KEEP)
        except ArticleException:
            logger.warning('Could not download article for {}'.format(
                self._pocket_item.url))
            return {}
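
For reference, a minimal standalone sketch of the same idea: fetch_article_dict and the ARTICLE_ATTRIBUTES_TO_KEEP whitelist below are assumed names (the original is a method on a class that wraps a Pocket item), and the set/datetime conversions are what make the resulting dict JSON-serializable.

import json
from newspaper import Article, ArticleException

# Hypothetical whitelist; the original class defines its own ARTICLE_ATTRIBUTES_TO_KEEP.
ARTICLE_ATTRIBUTES_TO_KEEP = {'title', 'text', 'authors', 'keywords',
                              'summary', 'tags', 'images', 'publish_date'}

def fetch_article_dict(url):
    """Download, parse and run NLP on one article; return a JSON-serializable dict."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # needs NLTK's 'punkt' tokenizer (see Example #6)
    except ArticleException:
        return {}
    article.tags = list(article.tags)        # set -> list
    article.images = list(article.images)    # set -> list
    if article.publish_date:
        article.publish_date = article.publish_date.timestamp()  # datetime -> float
    return {k: v for k, v in article.__dict__.items()
            if k in ARTICLE_ATTRIBUTES_TO_KEEP}

if __name__ == '__main__':
    print(json.dumps(fetch_article_dict('http://www.bbc.com/news/world-europe-35828810'),
                     ensure_ascii=False))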
Example #2
def test_save_article_function():
    import time
    import datetime
    from newspaper import Article

    # Current time as a datetime, used as a fallback publish date.
    today = datetime.datetime.fromtimestamp(time.time())
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    #print (a.title, a.publish_date)

    # If the article has no publish_date, fall back to today.
    if a.publish_date is None:
        a.publish_date = today

    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data=data_a)
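
The helpers get_path_to_save, get_serialized_article_obj, and create_file are not shown in this example. A rough sketch of what they might look like; the file layout, field selection, and naming scheme below are assumptions, not the original implementations.

import json
import os
import re

def get_path_to_save(article):
    # Assumed naming scheme: articles/<date>_<slugified-title>.json
    slug = re.sub(r'[^\w]+', '-', article.title.lower()).strip('-')
    return os.path.join('articles',
                        '{}_{}.json'.format(article.publish_date.date(), slug))

def get_serialized_article_obj(article):
    # Assumed field selection; serialize the parts of the Article that matter here.
    return json.dumps({
        'url': article.url,
        'title': article.title,
        'authors': article.authors,
        'publish_date': article.publish_date.isoformat(),
        'text': article.text,
    }, ensure_ascii=False)

def create_file(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)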
Example #3
    def startParsingNews(self):

        # Load the JSON file that lists the news sites
        # (a sample NewsPapers.json is sketched after this example).
        data = {}
        data['newspapers'] = {}
        with open('NewsPapers.json') as data_file:
            companies = json.load(data_file)

        count = 1

        nowDay = datetime.now().strftime('%Y-%m-%d')
        nowTime = datetime.now().strftime("_%Y-%m-%d_%H-%M-%S")
        # "YYYY-MM-DDTHH" prefix for the previous hour; using timedelta (from the
        # datetime module) avoids the negative hour the old "hour - 1" arithmetic
        # produced around midnight.
        grabedTimeCheck = (datetime.now() - timedelta(hours=1)).strftime("%Y-%m-%dT%H")

        pathlib.Path('news/' + nowDay).mkdir(parents=True, exist_ok=True)

        htmlFile = self.createHTML(nowTime)
        htmlTailContent = self.getHTMLTail().read()

        # Iterate through each news company
        for company, value in companies.items():
            # If an RSS link is provided in the JSON file, it is used first,
            # because RSS feeds tend to give more consistent and correct data.
            # To skip the RSS feed for a site, leave the "rss" attribute out of the JSON file.
            if 'rss' in value:
                d = fp.parse(value['rss'])
                print("Downloading articles from ", company)
                newsPaper = {
                    "rss": value['rss'],
                    "link": value['link'],
                    "articles": []
                }

                # TODO: dump data to each company file
                outputFile = open('news/' + nowDay + '/' + company + nowTime +
                                  "_rss.txt",
                                  "a",
                                  encoding='utf8')
                if htmlFile:
                    htmlFile.write('\n<h1>' + company + '</h1>\n')
                    htmlFile.write(
                        '  <div class="ui segment" style=" color: #FFF4E0; background-color: #393E46">\n'
                    )
                    htmlFile.write(
                        '    <div class="ui accordion" style="width:600px; color: #FFF4E0; background-color: #393E46">\n'
                    )
#               outputFile = open( 'index_', "a", encoding='utf8')
                for entry in d.entries:
                    # Check whether a publish date is provided; if not, the article is skipped.
                    # This keeps the data consistent and keeps the script from crashing.
                    if hasattr(entry, 'published'):
                        if count > self.LIMIT:
                            break
                        article = {}
                        article['link'] = entry.link
                        date = entry.published_parsed
                        print('origin:' +
                              datetime.fromtimestamp(mktime(date)).isoformat())
                        newTimeOffset = self.getTimeOffset(
                            datetime.fromtimestamp(mktime(date)).isoformat())
                        article['published'] = newTimeOffset

                        # Skip articles outside the wanted time window (today / the previous hour).
                        if not str(article['published']).startswith(nowDay):
                            print('Got a news but not today: {}, {}'.format(
                                nowDay, article['published']))
                            continue
                        if not str(article['published']).startswith(
                                grabedTimeCheck):
                            print(
                                'Got a news but not this hour: {}, {}'.format(
                                    grabedTimeCheck, article['published']))
                            #count = count - 1
                            continue
                        try:
                            content = Article(entry.link)
                            content.download()
                            content.parse()
                        except Exception as e:
                            # If the download fails for some reason (e.g. 404), the
                            # script continues with the next article.
                            print(e)
                            print("continuing...")
                            continue
                        article['title'] = content.title
                        article['text'] = content.text
                        newsPaper['articles'].append(article)
                        print(count, "articles downloaded from", company,
                              ", url: ", entry.link)
                        count = count + 1
                        # TODO: dump data to each company file
                        if outputFile:
                            outputFile.write('-----' + str(count - 1) +
                                             '-----\n')
                            outputFile.write('***** ' + content.title +
                                             ' *****\n\n')
                            outputFile.write(content.text + '\n\n')
                            outputFile.write(entry.link + '\n\n')
                            outputFile.write(article['published'] + '\n\n')
                        if htmlFile:
                            htmlFile.write(
                                '      <div class="title" style="color: #FFF4E0;"><i class="dropdown icon" style="color: #f8b500;"></i>'
                                + content.title + '</div>\n')
                            htmlFile.write('      <div class="content">\n')
                            htmlFile.write(
                                '        <a href="' + entry.link +
                                '" target="_blank"><blockquote>NEWS link</blockquote></a>\n'
                            )
                            htmlFile.write(
                                '        <p class="transition hidden"><blockquote>Published time: '
                                + article['published'] + '</blockquote></p>\n')
                            formattedText = content.text.replace("\n", "<br>")
                            htmlFile.write(
                                '        <p class="transition hidden"><blockquote>'
                                + formattedText + '</blockquote></p>\n')
                            htmlFile.write('      </div>\n')
                if htmlFile:
                    htmlFile.write('    </div>')
                    htmlFile.write('  </div>')
            else:
                # This is the fallback when no RSS feed link is provided:
                # the Python newspaper library is used to extract the articles.
                print("Building site for ", company)
                #self.get_web_page(value['link'])

                if company == 'setn':
                    news_list_url = self.parsing_SETN_news(value['link'])
                elif company == 'cna':
                    news_list_url = self.parsing_CNA_news(value['link'])
                elif company == 'ltn':
                    news_list_url = self.parsing_LTN_news(value['link'])
                elif company == 'chinatimes':
                    news_list_url = self.parsing_CHINATIMES_news(value['link'])
                else:
                    # No dedicated parser for this site; skip it instead of reusing
                    # news_list_url from a previous company.
                    print('No parser implemented for', company)
                    continue

#             soup = BeautifulSoup(self.get_web_page(value['link']), 'html.parser')
#             news_list_div = soup.find_all('div', class_="NewsList")

#paper = newspaper.build(value['link'])
                newsPaper = {"link": value['link'], "articles": []}
                noneTypeCount = 0

                # TODO: dump data to each company file
                outputFile = open('news/' + nowDay + '/' + company + nowTime +
                                  "_sites.txt",
                                  "a",
                                  encoding='utf8')

                if htmlFile:
                    htmlFile.write('\n<h1>' + company + '</h1>\n')
                    htmlFile.write(
                        '  <div class="ui segment" style=" color: #FFF4E0; background-color: #393E46">\n'
                    )
                    htmlFile.write(
                        '    <div class="ui accordion" style="width:600px; color: #FFF4E0; background-color: #393E46">\n'
                    )
                for news in news_list_url:
                    if count > self.LIMIT:
                        break
                    try:
                        content = Article(news)
                        content.download()
                        content.parse()
                    except Exception as e:
                        # If the download fails for some reason (e.g. 404), the script
                        # continues with the next article.
                        print(e)
                        print("continuing...")
                        continue
                    # Again, for consistency, if no publish date is found the article is
                    # skipped. After too many dateless articles from the same newspaper,
                    # the rest of that company is skipped.
                    if content.publish_date is None:
                        print(count, " Article has date of type None...")
                        noneTypeCount = noneTypeCount + 1
                        if noneTypeCount > 30:
                            print("Too many noneType dates, aborting...")
                            noneTypeCount = 0
                            break
                        continue
                    article = {}
                    article['title'] = content.title
                    article['text'] = content.text
                    article['link'] = content.url
                    print('origin:' + content.publish_date.isoformat())
                    newTimeOffset = self.getTimeOffset(
                        content.publish_date.isoformat())
                    article['published'] = newTimeOffset

                    # if the time is out of date, ignore it.
                    #if not str(article['published']).startswith( nowDay ):
                    #    print( 'Got a news but not today: {}, {}'.format(nowDay, article['published']) )
                    #count = count - 1
                    #    continue
                    #if not str(article['published']).startswith( grabedTimeCheck ):
                    #    print( 'Got a news but not this hour: {}, {}'.format(grabedTimeCheck, article['published']) )
                    #    #count = count - 1
                    #    continue
                    newsPaper['articles'].append(article)
                    print(count, "articles downloaded from", company,
                          " using newspaper, url: ", content.url)
                    count = count + 1
                    noneTypeCount = 0
                    # TODO: dump data to each company file
                    if outputFile:
                        outputFile.write('-----' + str(count - 1) + '-----\n')
                        outputFile.write('***** ' + content.title +
                                         ' *****\n\n')
                        outputFile.write(content.text + '\n\n')
                        outputFile.write(content.url + '\n\n')
                        outputFile.write(content.publish_date.isoformat() +
                                         '\n\n')
                    if htmlFile:
                        htmlFile.write(
                            '      <div class="title" style="color: #FFF4E0;"><i class="dropdown icon" style="color: #f8b500;"></i>'
                            + content.title + '</div>\n')
                        htmlFile.write('      <div class="content">\n')
                        htmlFile.write(
                            '        <a href="' + content.url +
                            '" target="_blank"><blockquote>NEWS link</blockquote></a>\n'
                        )
                        htmlFile.write(
                            '        <p class="transition hidden"><blockquote>Published time: '
                            + content.publish_date.isoformat() +
                            '</blockquote></p>\n')
                        formattedText = content.text.replace("\n", "<br>")
                        htmlFile.write(
                            '        <p class="transition hidden"><blockquote>'
                            + formattedText + '</blockquote></p>\n')
                        htmlFile.write('      </div>\n')
                if htmlFile:
                    htmlFile.write('    </div>')
                    htmlFile.write('  </div>')
            count = 1
            data['newspapers'][company] = newsPaper

        if not htmlTailContent:
            htmlFile.write('</body>')
            htmlFile.write('</html>')
        else:
            htmlFile.write(htmlTailContent)
        # Finally, save the scraped articles as a JSON file.
        try:
            with open('news/' + nowDay + '/' + 'scraped_articles' + nowTime +
                      '.json',
                      'w',
                      encoding='utf8') as outfile:
                json.dump(data, outfile, ensure_ascii=False)
        except Exception as e:
            print(e)
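
The loop above reads NewsPapers.json once at the top; its expected shape can be inferred from the code. Below is a minimal sketch of such a file, written from Python: every URL is an illustrative assumption, companies with an "rss" key go through the feed parser (fp, presumably feedparser), and the rest fall back to the site-specific parsing_*_news helpers.

import json

# Illustrative config only; the project's real NewsPapers.json is not shown here.
sample_config = {
    "cnn": {
        "rss": "http://rss.cnn.com/rss/cnn_topstories.rss",
        "link": "https://edition.cnn.com/",
    },
    # No "rss" key: these fall back to the dedicated parsers in the method above.
    "setn": {"link": "https://www.setn.com/"},
    "cna": {"link": "https://www.cna.com.tw/"},
    "ltn": {"link": "https://news.ltn.com.tw/"},
    "chinatimes": {"link": "https://www.chinatimes.com/"},
}

with open('NewsPapers.json', 'w', encoding='utf8') as f:
    json.dump(sample_config, f, ensure_ascii=False, indent=2)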
Example #4
# 'https://www.scmp.com/economy/china-economy/article/3075133/chinas-inbound-foreign-direct-investment-plunges-february',
# 'https://www.scmp.com/news/china/science/article/3079879/chinas-initial-coronavirus-outbreak-wuhan-spread-twice-fast-we',
# 'https://www.scmp.com/economy/china-economy/article/3077760/coronavirus-chinas-march-pmi-steadies-economy-not-out-woods']


df = pd.DataFrame(columns=['author', 'publish_date', 'title', 'text', 'source', 'url'])
for idx, url in enumerate(urls):
    if url:
        a_row = []
        article = Article(url)

        article.download()
        article.parse()

        if idx == 0:
            article.publish_date = "2020-04-15"
        if idx == 1:
            article.publish_date = "2020-04-19"
        if idx == 2:
            article.publish_date = "2020-01-31"
        if idx == 3:
            article.publish_date = "2020-01-31"
        if idx == 4:
            article.publish_date = "2020-03-30"

        # Just take the first author since
        # some articles tag on more than needed.
        a_row.append(article.authors[0])
        if not isinstance(article.publish_date, str):
            a_row.append(article.publish_date.date())
        else:
            # publish_date was set manually above as a string; keep it as-is.
            a_row.append(article.publish_date)
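
The example is cut off at this point. One way the loop body might continue, under the assumption that article, url, and df are the objects defined above: fill a row keyed by the df columns (which avoids ordering mistakes when appending item by item) and concatenate it. Deriving 'source' from the URL's domain is an assumption, not taken from the original snippet.

import pandas as pd
from urllib.parse import urlparse

row = {
    'author': article.authors[0] if article.authors else None,
    'publish_date': article.publish_date,
    'title': article.title,
    'text': article.text,
    'source': urlparse(url).netloc,   # domain of the URL as the 'source' (assumption)
    'url': url,
}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)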
Example #5
    def newspaper_parser(self, sleep_time=5):
        logging.debug('running newspaper_parser() for secure sites...')
        results = []
        count = 0

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(executable_path=r'gecko\geckodriver.exe')
        credential_names = list(self.credentials.keys())

        browser.get(self.login_url)
        cred1 = browser.find_element_by_id(credential_names[0])
        cred2 = browser.find_element_by_id(credential_names[1])
        cred1.send_keys(self.credentials[credential_names[0]])
        cred2.send_keys(self.credentials[credential_names[1]])
        time.sleep(10)
        browser.find_element_by_class_name(self.submit_id).click()
        time.sleep(10)

        cookies = browser.get_cookies()
        browser.close()

        s = requests.Session()
        for cookie in cookies:
            s.cookies.set(cookie['name'], cookie['value'])

        for l in self.links:
            try:
                page = s.get(l)
            except Exception as e:
                logging.error("issue bundling {} for {}, {}".format(
                    l, self.searchTerm, e))
                print(e)
                time.sleep(20)
                continue

            soup = BeautifulSoup(page.content, features="lxml")
            article = Article(url=l)
            article.set_html(str(soup))

            article.parse()
            article.nlp()
            up_date = article.publish_date
            if self.newspaper == 'Wall Street Journal':
                soup = BeautifulSoup(article.html, features="lxml")
                # WSJ exposes its published/updated dates in meta tags.
                pub_date = soup.find("meta", {
                    "name": "article.published"
                }).get("content", None)
                up_date = soup.find("meta", {
                    "name": "article.updated"
                }).get("content", None)
                article.publish_date = pub_date

            data = {
                'search': self.searchTerm,
                'title': article.title,
                'date_published': article.publish_date,
                'date_updated': up_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                # 'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                # 'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html,
            }
            results.append(data)
            time.sleep(sleep_time)

            count += 1
        print("done for ", self.searchTerm)
        return results
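
The core trick in this example is logging in once with Selenium, copying the session cookies into requests, and handing the fetched HTML to newspaper through Article.set_html(). A stripped-down sketch of just that hand-off follows; the login URL, element identifiers, credentials, and article URL are placeholders, and the find_element_by_* calls assume Selenium 3 as in the example above.

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from selenium import webdriver

LOGIN_URL = 'https://example.com/login'            # placeholder
ARTICLE_URL = 'https://example.com/some-article'   # placeholder

browser = webdriver.Firefox()
browser.get(LOGIN_URL)
browser.find_element_by_id('username').send_keys('user')       # placeholder field ids
browser.find_element_by_id('password').send_keys('secret')
browser.find_element_by_class_name('submit-button').click()    # placeholder class

# Copy the authenticated cookies into a plain requests session.
session = requests.Session()
for cookie in browser.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])
browser.close()

# Fetch the page with requests and let newspaper parse the HTML we already have.
page = session.get(ARTICLE_URL)
soup = BeautifulSoup(page.content, features='lxml')
article = Article(url=ARTICLE_URL)
article.set_html(str(soup))
article.parse()
print(article.title)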
Example #6
# Install the dependencies first (run in a shell, not in Python):
#   pip install nltk
#   pip install newspaper3k

import nltk
from newspaper import Article

url = 'https://www.cnbc.com/2020/03/19/ecb-launches-new-820-billion-coronavirus-package.html'
article = Article(url)

article.download()
article.parse()

# nlp() needs NLTK's 'punkt' sentence tokenizer, so download it once first.
nltk.download('punkt')
article.nlp()

# get the authors
print(article.authors)

# get the publish date (an attribute, not a method)
print(article.publish_date)

# get the URL of the top image (also an attribute)
print(article.top_image)

# get the full text of the article
print(article.text)

# get a summary of the article
print(article.summary)
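
After nlp(), newspaper also fills in article.keywords; printing it is a quick way to inspect them (this simply continues the snippet above).

# keywords extracted by nlp()
print(article.keywords)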