Exemple #1
0
def main():
    """Collect the configured feeds, store their metadata and download images.

    The steps run in this order:

    1. Read the feed list and the database settings from the ``config``
       directory and open the database context.
    2. Upsert every feed description into the feed-name collection, keyed
       by its ``url``.
    3. Scrape the feed entries and extract their metadata in a process pool.
    4. Upsert the metadata into the feeds collection, keyed by ``link``.
    5. Create the text index on ``title``/``summary`` and the ascending
       index on ``image_path``.
    6. Download the images and record the download directory in the image
       collection.
    """
    feeds_file_input = os.path.abspath('config/feed_list.json')
    db_config_file = os.path.abspath('config/db_config.json')
    config_handler = ConfigHandler(feeds_file_input, db_config_file)
    (uri, db_name, feeds_name_collection, feeds_collection,
     image_collection) = config_handler.get_db_config()
    db_context = DbContext(uri, db_name, feeds_name_collection,
                           feeds_collection, image_collection)

    Logger.log('reading {0} ...'.format(feeds_file_input))
    feed_list, feed_list_jsondata = config_handler.load_feed_list()
    Logger.log('collecting from {0} feeds'.format(len(feed_list)))

    Logger.log('inserting the feed list into the database...')
    for feed in feed_list_jsondata:
        # Upsert keyed by URL so re-running the script does not duplicate
        # feeds that are already stored.
        db_context.feeds_name_collection.update_one({'url': feed['url']},
                                                    {'$set': feed},
                                                    upsert=True)

    images_path_file = 'config/image_collection.json'
    images_path = config_handler.load_image_collection_path(images_path_file)
    scraper = Scraper(feed_list, images_path)

    entries = scraper.get_entries()
    # Extract the metadata of interest (and the image references) for every
    # entry in parallel worker processes.
    with Pool() as pool:
        metadata = pool.map(scraper.get_metadata, entries)

    Logger.log('inserting metadata into the database...')
    for feed_data in metadata:
        # Upsert keyed by the entry link for the same idempotency reason.
        db_context.feeds_collection.update_one({'link': feed_data['link']},
                                               {'$set': feed_data},
                                               upsert=True)
    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # count_documents() is the supported replacement.
    # NOTE: this is the total number of documents in the collection, not
    # only the ones written by this run.
    metadata_number = db_context.feeds_collection.count_documents({})
    Logger.log('{0} metadata inserted'.format(metadata_number))

    Logger.log('creating indexes...')
    # Compound text index over the title and the summary.
    db_context.feeds_collection.create_index(
        [('title', pymongo.TEXT), ('summary', pymongo.TEXT)],
        default_language='english',
        name='title_summary_index')
    db_context.feeds_collection.create_index(
        [("image_path", pymongo.ASCENDING)])

    Logger.log('downloading the images...')
    scraper.download_images(metadata, images_path)

    Logger.log('inserting image collection path into the database...')
    full_img_path = scraper.download_dir
    data = {'path': full_img_path}
    db_context.image_collection.update_one(data, {'$set': data}, upsert=True)

    Logger.log('all done.\n')
Exemple #2
0
class UpdateMonitor:
    """Record how often each configured feed updates and when it was built.

    The results are stored per feed URL in the feed-name collection of the
    database context created from the given configuration files.
    """

    def __init__(self, feeds_file_input, db_config_file):
        """Load the configuration and open the database context.

        Args:
            feeds_file_input: path of the feed list file, passed to
                ``ConfigHandler``.
            db_config_file: path of the database configuration file, passed
                to ``ConfigHandler``.
        """
        self.config_handler = ConfigHandler(feeds_file_input, db_config_file)
        (uri, db_name, feeds_name_collection, feeds_collection,
         image_collection) = self.config_handler.get_db_config()
        self.db_context = DbContext(uri, db_name, feeds_name_collection,
                                    feeds_collection, image_collection)

    @staticmethod
    def _parse_feed_date(raw_date):
        """Parse an RFC 822/2822 date string such as an HTTP ``Last-Modified``.

        ``strptime`` with ``%z`` only accepts numeric UTC offsets, so it
        rejects the literal ``GMT`` that HTTP dates end with;
        ``parsedate_to_datetime`` handles both forms.

        Returns:
            The parsed ``datetime``, or ``raw_date`` unchanged when it cannot
            be parsed.
        """
        from email.utils import parsedate_to_datetime

        try:
            return parsedate_to_datetime(raw_date)
        except (TypeError, ValueError):
            # Older Python versions raise TypeError, newer ones ValueError.
            return raw_date

    def get_intervals(self, feed_url):
        """Determine the update interval and the last build date of a feed.

        NOTE: simple HTTP request header checking does not work for the
        update_interval calculation, so the interval is taken from the feed
        itself (``sy:updatePeriod`` / ``ttl``).

        Args:
            feed_url: URL of the feed to inspect.

        Returns:
            A ``(update_interval, build_date)`` tuple. ``update_interval`` is
            a number of minutes (``0`` when unknown). ``build_date`` is a
            ``datetime`` when a date could be parsed, the raw date string
            when parsing failed, or ``0`` when no date is available.
        """
        feed = feedparser.parse(feed_url)
        channel = feed['channel']
        if 'updated' in channel:
            build_date = self._parse_feed_date(channel['updated'])
            update_interval = 0
        elif 'sy_updateperiod' in channel:
            update_interval = 60 if channel['sy_updateperiod'] == 'hourly' else 0
            response = requests.get(feed_url)
            if 'Last-Modified' in response.headers:
                build_date = self._parse_feed_date(
                    response.headers['Last-Modified'])
            else:
                build_date = 0
        else:
            # if no updatePeriod is defined then the feed's build date
            # refreshes with every request
            build_date = 0
            update_interval = 0
        if 'ttl' in channel:
            # ttl - time to live (minutes):
            # http://www.rssboard.org/rss-specification#ltttlgtSubelementOfLtchannelgt
            update_interval = int(channel['ttl'])

        print('\tlast updated on: {0}'.format(build_date))
        print('\tupdate interval: {0}'.format(update_interval))

        return update_interval, build_date

    def check_feeds(self):
        """Store the update interval and build date of every configured feed."""
        feed_list, _ = self.config_handler.load_feed_list()
        for feed in feed_list:
            print(feed)
            update_interval, build_date = self.get_intervals(feed)
            self.db_context.feeds_name_collection.update_one(
                {'url': feed},
                {'$set': {'update_interval': update_interval,
                          'build_date': build_date}},
                upsert=True)
            print()

    def reload_feed(self, feed_name):
        """Tell whether the feed should be reloaded.

        A feed is due for a reload once its stored build date plus its update
        interval (in minutes) lies in the past. When nothing usable is
        stored (unknown feed, missing fields, or a build date that is not a
        ``datetime``) the freshness cannot be judged, so the feed is reported
        as due.

        Args:
            feed_name: URL of the feed, as stored under the ``url`` key.

        Returns:
            ``True`` when the feed should be reloaded, ``False`` otherwise.
        """
        from datetime import timedelta, timezone

        feed = self.db_context.feeds_name_collection.find_one(
            {'url': feed_name})
        if feed is None:
            return True

        build_date = feed.get('build_date')
        if not isinstance(build_date, datetime):
            # get_intervals stores 0 or the raw string when no date is known.
            return True
        if build_date.tzinfo is None:
            # Naive values are treated as UTC, matching the UTC "now" they
            # are compared against.
            build_date = build_date.replace(tzinfo=timezone.utc)

        try:
            interval = timedelta(minutes=int(feed.get('update_interval') or 0))
        except (TypeError, ValueError):
            interval = timedelta(0)

        return datetime.now(timezone.utc) >= build_date + interval