def _append_news_text(db):
    """Attach article text to every news document that is still missing it.

    Documents are processed one at a time, oldest ``_id`` first.  The
    outgoing IP is renewed after every 1000 processed documents.

    :param db: Mongo database handle with a ``news`` collection.
    """
    # Make sure the directory used for caching raw HTML pages exists.
    if not os.path.exists('dumps/news'):
        os.makedirs('dumps/news')

    processed = 0
    pending = next(db.news.find({'text': {"$exists": False}}).sort('_id', 1).limit(1), None)
    while pending:
        # Every 1000 documents: report progress and rotate the outgoing IP.
        if processed % 1000 == 0:
            done = db.news.find({'text': {"$exists": True}}).count()
            total = db.news.count()
            logger.info('Current progress %d/%d', done, total)
            renew_ip()
        _append_one_news_text(db, pending['_id'], pending['link'], pending['date_first_published'])
        # Re-query instead of iterating a cursor: the helper just removed
        # this document from the "missing text" set.
        pending = next(db.news.find({'text': {"$exists": False}}).sort('_id', 1).limit(1), None)
        processed += 1
def _append_comments(db, partition_id):
    """Append comments to every news document that should have some but doesn't.

    Only news with a positive ``comment_count`` are considered.  When
    ``partition_id`` is given, only news whose ``_id`` ends with that digit
    are processed, which allows several instances to run in parallel over
    disjoint partitions.  The outgoing IP is renewed after every 1000
    processed documents.

    :param db: Mongo database handle with a ``news`` collection.
    :param partition_id: optional integer partition selector, or None for all.
    """
    # Make sure the directory used for caching raw HTML pages exists.
    if not os.path.exists('dumps/comments'):
        os.makedirs('dumps/comments')

    counter = 0
    news_filter = {'comments': {'$exists': False}, 'comment_count': {'$gt': 0}}
    # PEP 8: identity check against None (was "!= None"); also keeps
    # partition_id == 0 working as a valid partition.
    if partition_id is not None:
        news_filter['_id'] = {'$regex': '%d$' % partition_id}
    news = next(db.news.find(news_filter).sort('_id', -1).limit(1), None)
    while news:
        # Every 1000 documents: report progress and rotate the outgoing IP.
        if counter % 1000 == 0:
            leftover = db.news.find(news_filter).count()
            total = db.news.find({'comment_count': {'$gt': 0}}).count()
            logger.info('Current progress %d/%d', total - leftover, total)
            renew_ip()
        _append_one_news_comments(db, news['_id'], news['link'], news['comment_count'])
        # Re-query: the helper just removed this document from the filter set.
        news = next(db.news.find(news_filter).sort('_id', -1).limit(1), None)
        counter += 1
def _insert_news_metadata(db):
    """Walk through every category and insert its news metadata into Mongo.

    Resumes from the highest category id already stored (that category is
    deliberately scanned again, since it may be only partially fetched).
    The outgoing IP is renewed once per category.

    :param db: Mongo database handle with a ``news`` collection.
    """
    # Make sure the directory used for caching raw HTML pages exists.
    if not os.path.exists('dumps/news_metadata'):
        os.makedirs('dumps/news_metadata')

    start_category_id = 1
    # Try to resume from the highest category id seen so far.  This means
    # the last existing category is always revisited once more.
    newest_row = db.news.find_one(sort=[('category_ids', -1)])
    if newest_row:
        start_category_id = max(newest_row['category_ids'])
    logger.info('Current maximum category is %d', start_category_id)

    # These categories are empty/broken on the site and are skipped outright.
    skipped_categories = (31, 32, 35, 96, 1683, 1692)
    for category_id in range(start_category_id, 2000):
        if category_id in skipped_categories:
            continue
        renew_ip()
        _insert_news_metadata_category(db, category_id)
def _append_one_news_text(db, news_id, news_link, news_date_published):
    """
    Reads the article text for one news item and updates it in Mongo.

    Also corrects the news item's first date of publish when the page shows a
    "HH:MM -> HH:MM" time change (we don't have the original time on metadata
    fetch).  Simple retry logic is applied on HTTP errors.  Every fetched page
    is cached under dumps/news/ and the cached copy is reused when present.

    :param db: Mongo database handle with a ``news`` collection.
    :param news_id: document ``_id`` of the news item.
    :param news_link: link as stored in metadata; may be relative, absolute,
        or a bare query string -- see URL normalization below.
    :param news_date_published: datetime currently stored for the item.
    """
    # Prefer the locally cached HTML dump if one exists.
    if os.path.exists('dumps/news/n%s.html' % news_id):
        logger.info('Found cached file %s, reusing it', news_id)
        f = open('dumps/news/n%s.html' % news_id, 'r')
        html_content = f.read()
        f.close()
    else:
        # Normalize the stored link into a full URL.  Links come in several
        # historical shapes: bare query string, absolute URL, 'vesti.php...'
        # relative path, or an absolute path on www.b92.net.
        if news_link.startswith('?'):
            url = 'http://www.b92.net/info/vesti/index.php%s' % news_link
        elif news_link.startswith('http'):
            url = news_link
        elif news_link.startswith('vesti.php'):
            url = 'http://www.b92.net/%s' % news_link
        else:
            url = 'http://www.b92.net%s' % news_link
        logger.info('Fetching url %s', url)
        retries = 1
        while(True):
            try:
                request = urllib2.Request(url, None, headers)
                response = urllib2.urlopen(request)
                html_content = response.read()
                # Cache the raw page so re-runs don't hit the network again.
                f = open('dumps/news/n%s.html' % (news_id), 'w')
                f.write(html_content)
                f.close()
                break
            except HTTPError as e:
                # Gone articles: mark the document as processed (empty text)
                # so the outer loop doesn't pick it up forever.
                if e.code == 404 or e.code == 403:
                    logger.warning('Article doesn\'t exist anymore, error %d', e.code)
                    db.news.update_one({'_id': news_id}, {'$set': {'text': '', 'html_text': ''}})
                    return
                logger.warning('Error during fetching')
                # Give up after 10 attempts; rotate the IP every 3rd attempt
                # (likely to dodge rate limiting -- TODO confirm).
                if retries % 10 == 0:
                    raise e
                elif retries % 3 == 0:
                    renew_ip()
                pass
            retries = retries + 1
    # Pages are windows-1250 encoded (Serbian site).
    soup = BeautifulSoup(html_content, 'html.parser', from_encoding='cp1250')
    article_header = soup.select('div.article-header')
    if len(article_header) != 1:
        logger.warning('Something wrong with article header')
    else:
        time_element = article_header[0].select('time')
        if len(time_element) == 1:
            pub_time_text = time_element[0].text
            if '->' in pub_time_text:
                # If we see "->" that in B92 means that news was updated (football matches...)
                # We need to be careful if we cross day boundary
                logger.info('Time change detected, previously set %s and now it says %s', unicode(news_date_published), pub_time_text)
                # re_changing_publishing_time is a module-level regex (defined
                # elsewhere in this file) extracting the original HH and MM.
                hourstr, minutestr = re.search(re_changing_publishing_time, pub_time_text).groups()
                date_first_published = news_date_published.replace(hour=int(hourstr), minute=int(minutestr))
                # If the original time-of-day is later than the stored update
                # timestamp, publication must have happened the previous day.
                if date_first_published > news_date_published:
                    date_first_published = date_first_published - timedelta(days=1)
                    logger.info('Moved date to previous day')
                db.news.update_one({'_id': news_id}, {'$set': {'date_first_published': date_first_published}})
                logger.info('Updated first published date to %s', unicode(date_first_published))
    article = soup.select('article.item-page')
    if len(article) != 1:
        # No recognizable article body: store empty text so the document is
        # not retried endlessly by the outer loop.
        logger.warning('Something wrong with article')
        db.news.update_one({'_id': news_id}, {'$set': {'text': '', 'html_text': ''}})
    else:
        paragraphs = article[0].findChildren('p')
        # Remove first paragraph as it is same as excerpt
        # Remove also all empty paragraphs
        # What remains is concatenated
        html_text = ''.join([unicode(p) for p in paragraphs[1:] if p.text != ''])
        text = ''.join([unicode(p.text) for p in paragraphs[1:] if p.text != ''])
        db.news.update_one({'_id': news_id}, {'$set': {'text': text, 'html_text': html_text}})
    # Be polite to the server between consecutive fetches.
    sleep(0.1)
import sys
from urllib2 import HTTPError
import urllib2

from tools import renew_ip, setup_logger, exception_hook, headers

logger = setup_logger('b92statistike-dump_categories.log')

if __name__ == '__main__':
    """
    This program reads all pages where categories are defined and prints them out.
    TODO: I used this only when data was in SQL, but even then I manually created
    inserts out of this dumped output. Should be more automatic, but it was
    one-time thing.
    """
    sys.excepthook = exception_hook
    renew_ip()
    for i in range(1, 2000):
        url = 'http://www.b92.net/info/vesti/index.php?&nav_category={}'.format(i)
        logger.info('Fetching url %s', url)
        retries = 0
        # Bug fix: the original had no retry loop -- on an HTTPError it
        # incremented 'retries' once, then fell through and used the unbound
        # 'response' variable.  Now we retry up to 3 times before re-raising,
        # and only parse after a successful fetch.
        while True:
            try:
                request = urllib2.Request(url, None, headers)
                response = urllib2.urlopen(request)
                break
            except HTTPError as e:
                logger.warning('Error during fetching')
                if retries == 3:
                    raise e
                retries = retries + 1
        # NOTE(review): BeautifulSoup is not imported in this chunk --
        # presumably imported elsewhere in the file; verify.
        soup = BeautifulSoup(response.read(), 'html5lib', from_encoding='cp1250')