def main():
    """Collect RSS feed metadata and images and store them in MongoDB.

    Reads the feed list and database settings from the ``config``
    directory, upserts the feed URLs and the scraped entry metadata into
    their collections, builds the search indexes, downloads the entry
    images, and records the image directory path.
    """
    feeds_file_input = os.path.abspath('config/feed_list.json')
    db_config_file = os.path.abspath('config/db_config.json')
    config_handler = ConfigHandler(feeds_file_input, db_config_file)
    uri, db_name, feeds_name_collection, feeds_collection, image_collection = \
        config_handler.get_db_config()
    db_context = DbContext(uri, db_name, feeds_name_collection,
                           feeds_collection, image_collection)

    Logger.log('reading {0} ...'.format(feeds_file_input))
    feed_list, feed_list_jsondata = config_handler.load_feed_list()
    Logger.log('collecting from {0} feeds'.format(len(feed_list)))

    Logger.log('inserting the feed list into the database...')
    for feed in feed_list_jsondata:
        # upsert keyed on the feed URL so re-runs do not create duplicates
        db_context.feeds_name_collection.update_one(
            {'url': feed['url']}, {'$set': feed}, upsert=True)

    images_path_file = 'config/image_collection.json'
    images_path = config_handler.load_image_collection_path(images_path_file)
    scraper = Scraper(feed_list, images_path)
    entries = scraper.get_entries()

    # get the metadata in interest and the images (scraped in parallel)
    with Pool() as pool:
        metadata = pool.map(scraper.get_metadata, entries)

    Logger.log('inserting metadata into the database...')
    for feed_data in metadata:
        # upsert keyed on the entry link so re-runs update in place
        db_context.feeds_collection.update_one(
            {'link': feed_data['link']}, {'$set': feed_data}, upsert=True)

    # count_documents({}) replaces Cursor.count(), which was deprecated
    # and removed in PyMongo 4
    metadata_number = db_context.feeds_collection.count_documents({})
    Logger.log('{0} metadata inserted'.format(metadata_number))

    Logger.log('creating indexes...')
    # compound text index over title + summary for full-text search
    db_context.feeds_collection.create_index(
        [('title', pymongo.TEXT), ('summary', pymongo.TEXT)],
        default_language='english',
        name='title_summary_index')
    db_context.feeds_collection.create_index(
        [("image_path", pymongo.ASCENDING)])

    Logger.log('downloading the images...')
    scraper.download_images(metadata, images_path)

    Logger.log('inserting image collection path into the database...')
    full_img_path = scraper.download_dir
    data = {'path': full_img_path}
    db_context.image_collection.update_one(data, {'$set': data}, upsert=True)
    Logger.log('all done.\n')
class UpdateMonitor:
    """Tracks how often each feed publishes so reloads can be scheduled.

    Reads the feed list and database settings via :class:`ConfigHandler`
    and stores each feed's ``update_interval`` (minutes) and
    ``build_date`` in the feed-name collection.
    """

    def __init__(self, feeds_file_input, db_config_file):
        """Load DB settings and open the database context.

        :param feeds_file_input: path to the feed-list JSON file
        :param db_config_file: path to the DB-config JSON file
        """
        self.config_handler = ConfigHandler(feeds_file_input, db_config_file)
        URI, db_name, feeds_name_collection, feeds_collection, image_collection = \
            self.config_handler.get_db_config()
        self.db_context = DbContext(URI, db_name, feeds_name_collection,
                                    feeds_collection, image_collection)

    def get_intervals(self, feed_url):
        """Return ``(update_interval, build_date)`` for *feed_url*.

        ``update_interval`` is in minutes (0 when unknown); ``build_date``
        is a ``datetime``, the raw feed string when it cannot be parsed,
        or 0 when the feed exposes no build date.
        """
        # NOTE: simple HTTP request-header checking (Last-Modified /
        # If-Modified-Since) does not work for the update_interval
        # calculation, so the feed's own metadata is inspected instead.
        feed = feedparser.parse(feed_url)
        if 'updated' in feed['channel']:
            try:
                build_date = datetime.strptime(feed['channel']['updated'],
                                               '%a, %d %b %Y %H:%M:%S %z')
            except ValueError:
                # keep the raw string when the date format is unexpected
                build_date = feed['channel']['updated']
            update_interval = 0
        elif 'sy_updateperiod' in feed['channel']:
            period = feed['channel']['sy_updateperiod']
            if period == 'hourly':
                update_interval = 60
            else:
                update_interval = 0
            response = requests.get(feed_url)
            if 'Last-Modified' in response.headers:
                build_date = datetime.strptime(
                    response.headers['Last-Modified'],
                    '%a, %d %b %Y %H:%M:%S %z')
            else:
                build_date = 0
        else:
            # if no updatePeriod is defined then the feed's build date
            # refreshes with every request
            build_date = 0
            update_interval = 0
        if 'ttl' in feed['channel']:
            # ttl - time to live (minutes):
            # http://www.rssboard.org/rss-specification#ltttlgtSubelementOfLtchannelgt
            update_interval = int(feed['channel']['ttl'])
        print('\tlast updated on: {0}'.format(build_date))
        print('\tupdate interval: {0}'.format(update_interval))
        return update_interval, build_date

    def check_feeds(self):
        """Refresh the stored update interval and build date of every feed."""
        feed_list, feeds_data = self.config_handler.load_feed_list()
        for feed in feed_list:
            print(feed)
            update_interval, build_date = self.get_intervals(feed)
            self.db_context.feeds_name_collection.update_one(
                {'url': feed},
                {'$set': {
                    'update_interval': update_interval,
                    'build_date': build_date
                }},
                upsert=True)
            print()

    def reload_feed(self, feed_name):
        """ should the feed be reloaded

        Returns True when the feed's stored build date plus its update
        interval (minutes) is still in the future.

        NOTE(review): the original added a plain int of minutes to a
        datetime via bogus ``datetime(...)`` calls (TypeError on every
        call); fixed with a ``timedelta``. The comparison direction is
        kept from the original — confirm it matches the intended
        "should reload" semantics.
        """
        # local import: the module's top-of-file import block is outside
        # this view, so timedelta is brought into scope here
        from datetime import timedelta

        feed = self.db_context.feeds_name_collection.find_one(
            {'url': feed_name})
        build_date = feed['build_date']
        update_interval = feed['update_interval']
        # get_intervals can store 0 or a raw string as build_date; treat
        # an unknown build date as "reload now"
        if not isinstance(build_date, datetime):
            return True
        time_now = datetime.utcnow()
        # next expected rebuild: last build plus the interval in minutes
        return build_date + timedelta(minutes=update_interval) > time_now