def download(self):
    try:
        logger.info('Downloading article for {}'.format(
            self._pocket_item.url))
        article = Article(self._pocket_item.url)
        article.download()
        logger.info('Parsing article for {}'.format(self._pocket_item.url))
        article.parse()
        logger.info('Performing NLP on article for {}'.format(
            self._pocket_item.url))
        article.nlp()
        article.tags = list(article.tags)
        if article.publish_date:
            article.publish_date = article.publish_date.timestamp()
        article.images = list(article.images)
        self._pocket_item.article = dict(
            (k, v) for k, v in article.__dict__.items()
            if k in self.ARTICLE_ATTRIBUTES_TO_KEEP)
    except ArticleException:
        logger.warning('Could not download article for {}'.format(
            self._pocket_item.url))
        return {}
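For context, download() leans on several names defined elsewhere in its class. A minimal sketch of that scaffolding, assuming a plausible attribute whitelist (everything below except the newspaper imports is an illustrative guess, not the original code):

import logging

from newspaper import Article
from newspaper.article import ArticleException

logger = logging.getLogger(__name__)


class PocketItemDownloader:
    # Hypothetical whitelist; the real ARTICLE_ATTRIBUTES_TO_KEEP is not
    # shown in the snippet above.
    ARTICLE_ATTRIBUTES_TO_KEEP = {
        'title', 'authors', 'text', 'summary', 'keywords',
        'tags', 'images', 'top_image', 'publish_date',
    }

    def __init__(self, pocket_item):
        self._pocket_item = pocket_item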
def test_save_article_function():
    from newspaper import Article
    today = time.time()
    today = datetime.datetime.fromtimestamp(today)
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    #print(a.title, a.publish_date)
    # If the article has no publish_date, set it to today.
    if a.publish_date is None:
        a.publish_date = today
    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data=data_a)
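The test depends on three helpers that are not shown anywhere in the snippet. A minimal sketch of what they might look like, assuming a date-based directory layout and JSON serialization (all three bodies are illustrative, not the originals):

import json
import os


def get_path_to_save(article):
    # Assumed layout: articles/<publish date>/<sanitized title>.json
    day = article.publish_date.strftime('%Y-%m-%d')
    safe_title = ''.join(c if c.isalnum() else '_' for c in article.title)
    return os.path.join('articles', day, safe_title + '.json')


def get_serialized_article_obj(article):
    # Assumed format: a JSON string of a few JSON-friendly fields.
    return json.dumps({
        'url': article.url,
        'title': article.title,
        'text': article.text,
        'publish_date': article.publish_date.isoformat(),
    }, ensure_ascii=False)


def create_file(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)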
def startParsingNews(self):
    # Load the JSON file that lists the news sites.
    data = {}
    data['newspapers'] = {}
    with open('NewsPapers.json') as data_file:
        companies = json.load(data_file)
    count = 1
    nowDay = datetime.now().strftime('%Y-%m-%d')
    nowTime = datetime.now().strftime("_%Y-%m-%d_%H-%M-%S")
    grabedTimeCheck = datetime.now().strftime("%Y-%m-%dT")
    grabedTimeCheckHour = int(datetime.now().strftime("%H")) - 1
    grabedTimeCheck += '%02d' % grabedTimeCheckHour
    pathlib.Path('news/' + nowDay).mkdir(parents=True, exist_ok=True)
    htmlFile = self.createHTML(nowTime)
    htmlTailContent = self.getHTMLTail().read()

    # Iterate through each news company.
    for company, value in companies.items():
        # If an RSS link is provided in the JSON file, it is the first
        # choice: RSS feeds usually give more consistent and correct data.
        # If you do not want to scrape from the RSS feed, leave the 'rss'
        # attribute empty in the JSON file.
        if 'rss' in value:
            d = fp.parse(value['rss'])
            print("Downloading articles from ", company)
            newsPaper = {
                "rss": value['rss'],
                "link": value['link'],
                "articles": []
            }
            # TODO: dump data to each company file
            outputFile = open(
                'news/' + nowDay + '/' + company + nowTime + "_rss.txt",
                "a", encoding='utf8')
            if htmlFile:
                htmlFile.write('\n<h1>' + company + '</h1>\n')
                htmlFile.write(
                    '  <div class="ui segment" style=" color: #FFF4E0; background-color: #393E46">\n'
                )
                htmlFile.write(
                    '  <div class="ui accordion" style="width:600px; color: #FFF4E0; background-color: #393E46">\n'
                )
            # outputFile = open('index_', "a", encoding='utf8')
            for entry in d.entries:
                # Check that a publish date is provided; if not, the article
                # is skipped. This keeps the data consistent and the script
                # from crashing.
                if hasattr(entry, 'published'):
                    if count > self.LIMIT:
                        break
                    article = {}
                    article['link'] = entry.link
                    date = entry.published_parsed
                    print('origin:' +
                          datetime.fromtimestamp(mktime(date)).isoformat())
                    newTimeOffset = self.getTimeOffset(
                        datetime.fromtimestamp(mktime(date)).isoformat())
                    article['published'] = newTimeOffset
                    # If the article is out of date, ignore it.
                    if not str(article['published']).startswith(nowDay):
                        print('Got a news item but not today: {}, {}'.format(
                            nowDay, article['published']))
                        continue
                    if not str(article['published']).startswith(
                            grabedTimeCheck):
                        print('Got a news item but not this hour: {}, {}'.format(
                            grabedTimeCheck, article['published']))
                        #count = count - 1
                        continue
                    try:
                        content = Article(entry.link)
                        content.download()
                        content.parse()
                    except Exception as e:
                        # If the download fails for some reason (e.g. a 404),
                        # the script continues with the next article.
                        print(e)
                        print("continuing...")
                        continue
                    article['title'] = content.title
                    article['text'] = content.text
                    newsPaper['articles'].append(article)
                    print(count, "articles downloaded from", company,
                          ", url: ", entry.link)
                    count = count + 1
                    # TODO: dump data to each company file
                    if outputFile:
                        outputFile.write('-----' + str(count - 1) + '-----\n')
                        outputFile.write('***** ' + content.title +
                                         ' *****\n\n')
                        outputFile.write(content.text + '\n\n')
                        outputFile.write(entry.link + '\n\n')
                        outputFile.write(article['published'] + '\n\n')
                    if htmlFile:
                        htmlFile.write(
                            '    <div class="title" style="color: #FFF4E0;"><i class="dropdown icon" style="color: #f8b500;"></i>'
                            + content.title + '</div>\n')
                        htmlFile.write('    <div class="content">\n')
                        htmlFile.write(
                            '      <a href="' + entry.link +
                            '" target="_blank"><blockquote>NEWS link</blockquote></a>\n'
                        )
                        htmlFile.write(
                            '      <p class="transition hidden"><blockquote>Published time: '
                            + article['published'] + '</blockquote></p>\n')
                        formattedText = content.text.replace("\n", "<br>")
                        htmlFile.write(
                            '      <p class="transition hidden"><blockquote>'
                            + formattedText + '</blockquote></p>\n')
                        htmlFile.write('    </div>\n')
            if htmlFile:
                htmlFile.write('  </div>')
                htmlFile.write('  </div>')
        else:
            # Fallback when no RSS feed link is provided: use the python
            # newspaper library and site-specific parsers to extract articles.
            print("Building site for ", company)
            #self.get_web_page(value['link'])
            if company == 'setn':
                news_list_url = self.parsing_SETN_news(value['link'])
            elif company == 'cna':
                news_list_url = self.parsing_CNA_news(value['link'])
            elif company == 'ltn':
                news_list_url = self.parsing_LTN_news(value['link'])
            elif company == 'chinatimes':
                news_list_url = self.parsing_CHINATIMES_news(value['link'])
            # soup = BeautifulSoup(self.get_web_page(value['link']), 'html.parser')
            # news_list_div = soup.find_all('div', class_="NewsList")
            #paper = newspaper.build(value['link'])
            newsPaper = {"link": value['link'], "articles": []}
            noneTypeCount = 0
            # TODO: dump data to each company file
            outputFile = open(
                'news/' + nowDay + '/' + company + nowTime + "_sites.txt",
                "a", encoding='utf8')
            if htmlFile:
                htmlFile.write('\n<h1>' + company + '</h1>\n')
                htmlFile.write(
                    '  <div class="ui segment" style=" color: #FFF4E0; background-color: #393E46">\n'
                )
                htmlFile.write(
                    '  <div class="ui accordion" style="width:600px; color: #FFF4E0; background-color: #393E46">\n'
                )
            # for content in paper.articles:
            for news in news_list_url:
                if count > self.LIMIT:
                    break
                try:
                    content = Article(news)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download fails for some reason (e.g. a 404),
                    # the script continues with the next article.
                    print(e)
                    print("continuing...")
                    continue
                #print(content.title)
                # Again, for consistency, an article with no publish date is
                # skipped. After 30 articles without a publish date from the
                # same newspaper, the company is skipped entirely.
                if content.publish_date is None:
                    print(count, " Article has date of type None...")
                    noneTypeCount = noneTypeCount + 1
                    if noneTypeCount > 30:
                        print("Too many noneType dates, aborting...")
                        noneTypeCount = 0
                        break
                    count = count + 1
                    continue
                article = {}
                article['title'] = content.title
                article['text'] = content.text
                article['link'] = content.url
                print('origin:' + content.publish_date.isoformat())
                newTimeOffset = self.getTimeOffset(
                    content.publish_date.isoformat())
                article['published'] = newTimeOffset
                # If the article is out of date, ignore it.
                #if not str(article['published']).startswith(nowDay):
                #    print('Got a news item but not today: {}, {}'.format(
                #        nowDay, article['published']))
                #    #count = count - 1
                #    continue
                #if not str(article['published']).startswith(grabedTimeCheck):
                #    print('Got a news item but not this hour: {}, {}'.format(
                #        grabedTimeCheck, article['published']))
                #    #count = count - 1
                #    continue
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company,
                      " using newspaper, url: ", content.url)
                count = count + 1
                noneTypeCount = 0
                # TODO: dump data to each company file
                if outputFile:
                    outputFile.write('-----' + str(count - 1) + '-----\n')
                    outputFile.write('***** ' + content.title + ' *****\n\n')
                    outputFile.write(content.text + '\n\n')
                    outputFile.write(content.url + '\n\n')
                    outputFile.write(content.publish_date.isoformat() +
                                     '\n\n')
                if htmlFile:
                    htmlFile.write(
                        '    <div class="title" style="color: #FFF4E0;"><i class="dropdown icon" style="color: #f8b500;"></i>'
                        + content.title + '</div>\n')
                    htmlFile.write('    <div class="content">\n')
                    htmlFile.write(
                        '      <a href="' + content.url +
                        '" target="_blank"><blockquote>NEWS link</blockquote></a>\n'
                    )
                    htmlFile.write(
                        '      <p class="transition hidden"><blockquote>Published time: '
                        + content.publish_date.isoformat() +
                        '</blockquote></p>\n')
                    formattedText = content.text.replace("\n", "<br>")
                    htmlFile.write(
                        '      <p class="transition hidden"><blockquote>'
                        + formattedText + '</blockquote></p>\n')
                    htmlFile.write('    </div>\n')
            if htmlFile:
                htmlFile.write('  </div>')
                htmlFile.write('  </div>')
        count = 1
        data['newspapers'][company] = newsPaper

    if not htmlTailContent:
        htmlFile.write('</body>')
        htmlFile.write('</html>')
    else:
        htmlFile.write(htmlTailContent)

    # Finally, save the articles as a JSON file.
    try:
        with open('news/' + nowDay + '/' + 'scraped_articles' + nowTime +
                  '.json', 'w', encoding='utf8') as outfile:
            json.dump(data, outfile, ensure_ascii=False)
    except Exception as e:
        print(e)
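startParsingNews() reads its site list from NewsPapers.json. Judging from the keys the code accesses ('rss' optional, 'link' required, company names matched against the site-specific parsers), a minimal config could be generated like this (the feed URL is a placeholder, not a real feed):

# Write a minimal NewsPapers.json in the shape startParsingNews() expects.
# Companies without an 'rss' key fall through to the site-specific parsers
# (setn, cna, ltn, chinatimes).
import json

companies = {
    "cna": {
        "rss": "https://example.com/cna/rss.xml",  # placeholder feed URL
        "link": "https://www.cna.com.tw/"
    },
    "setn": {
        "link": "https://www.setn.com/"
    }
}
with open('NewsPapers.json', 'w', encoding='utf8') as f:
    json.dump(companies, f, ensure_ascii=False, indent=4)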
# 'https://www.scmp.com/economy/china-economy/article/3075133/chinas-inbound-foreign-direct-investment-plunges-february',
# 'https://www.scmp.com/news/china/science/article/3079879/chinas-initial-coronavirus-outbreak-wuhan-spread-twice-fast-we',
# 'https://www.scmp.com/economy/china-economy/article/3077760/coronavirus-chinas-march-pmi-steadies-economy-not-out-woods']

df = pd.DataFrame(columns=['author', 'publish_date', 'title', 'text',
                           'source', 'url'])
for idx, url in enumerate(urls):
    if url:
        a_row = []
        article = Article(url)
        article.download()
        article.parse()
        # Hard-code the publish dates that newspaper fails to extract.
        if idx == 0:
            article.publish_date = "2020-04-15"
        if idx == 1:
            article.publish_date = "2020-04-19"
        if idx == 2:
            article.publish_date = "2020-01-31"
        if idx == 3:
            article.publish_date = "2020-01-31"
        if idx == 4:
            article.publish_date = "2020-03-30"
        # Just take the first author since
        # some articles tag on more than needed.
        a_row.append(article.authors[0])
        if not isinstance(article.publish_date, str):
            a_row.append(article.publish_date.date())
        else:
            a_row.append(article.publish_date)
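The snippet above is cut off after the date handling. Presumably the remaining columns are filled and the row appended to the DataFrame, roughly along these lines (the 'SCMP' source label and the exact tail are assumptions):

        # Hypothetical continuation inside the `if url:` block above,
        # following the column order declared for df.
        a_row.append(article.title)
        a_row.append(article.text)
        a_row.append('SCMP')  # assumed source label for these URLs
        a_row.append(url)
        df.loc[len(df)] = a_row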
def newspaper_parser(self, sleep_time=5):
    logging.debug('running newspaper_parser() for secure sites...')
    results = []
    count = 0

    profile = webdriver.FirefoxProfile()
    browser = webdriver.Firefox(executable_path=r'gecko\geckodriver.exe')
    credential_names = list(self.credentials.keys())

    # Log in through the browser, then copy the session cookies over to a
    # requests.Session so the articles can be fetched without Selenium.
    browser.get(self.login_url)
    cred1 = browser.find_element_by_id(credential_names[0])
    cred2 = browser.find_element_by_id(credential_names[1])
    cred1.send_keys(self.credentials[credential_names[0]])
    cred2.send_keys(self.credentials[credential_names[1]])
    time.sleep(10)
    browser.find_element_by_class_name(self.submit_id).click()
    time.sleep(10)
    cookies = browser.get_cookies()
    browser.close()

    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])

    for l in self.links:
        try:
            page = s.get(l)
        except Exception as e:
            logging.error("issue bundling {} for {}, {}".format(
                l, self.searchTerm, e))
            print(e)
            time.sleep(20)
            continue

        soup = BeautifulSoup(page.content, features="lxml")
        article = Article(url=l)
        article.set_html(str(soup))
        article.parse()
        article.nlp()

        up_date = article.publish_date
        if self.newspaper == 'Wall Street Journal':
            soup = BeautifulSoup(article.html, features="lxml")
            # if no articles, stop
            pub_date = soup.find("meta", {
                "name": "article.published"
            }).get("content", None)
            up_date = soup.find("meta", {
                "name": "article.updated"
            }).get("content", None)
            article.publish_date = pub_date

        data = {
            'search': self.searchTerm,
            'title': article.title,
            'date_published': article.publish_date,
            'date_updated': up_date,
            'news_outlet': self.newspaper,
            'authors': article.authors,
            # 'feature_img': article.top_image,
            'article_link': article.canonical_link,
            'keywords': article.keywords,
            # 'movies': article.movies,
            'summary': article.summary,
            'text': article.text,
            'html': article.html,
        }
        results.append(data)

        time.sleep(sleep_time)
        count += 1

    print("done for ", self.searchTerm)
    return results
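One portability note: the `find_element_by_*` helpers and the `executable_path` argument used above are the Selenium 3 API and no longer exist in current Selenium 4 releases. Under Selenium 4 the same calls would look roughly like this (a sketch reusing the names from the method above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service

# Selenium 4 equivalents of the locator and driver-path calls used above.
browser = webdriver.Firefox(service=Service(r'gecko\geckodriver.exe'))
cred1 = browser.find_element(By.ID, credential_names[0])
cred2 = browser.find_element(By.ID, credential_names[1])
browser.find_element(By.CLASS_NAME, self.submit_id).click()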
# Install the dependencies first:
#   pip install nltk
#   pip install newspaper3k

import nltk
from newspaper import Article

url = 'https://www.cnbc.com/2020/03/19/ecb-launches-new-820-billion-coronavirus-package.html'
article = Article(url)
article.download()
article.parse()
nltk.download('punkt')  # needed once before calling nlp()
article.nlp()

# get the authors
article.authors

# get the publish date (an attribute, not a method)
article.publish_date

# get the URL of the top image (an attribute, not a method)
article.top_image

# get the full text
print(article.text)

# get a summary of the article
print(article.summary)
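After nlp() has run, the extracted keywords are available as well:

# get the keywords of the article (populated by nlp())
print(article.keywords)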