import logging

#util.stream_fetch, FEED_MAX_RETRY / ASSET_MAX_RETRY, inc_fetch_retry and the
#process_feed_info / process_asset_info helpers are provided elsewhere in this module/package.

def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        #urls_data maps each fetched URL to a (success, data_or_error) tuple
        logging.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            #logging.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url']
                if info_url not in urls_data:
                    logging.warn("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (info_url, len(urls_data)))
                    continue
                if not urls_data[info_url][0]:  #request was not successful
                    inc_fetch_retry(db, feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logging.warn("Fetch for feed at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(db, feed, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logging.info("Processing for feed at %s successful" % info_url)

    #compose and fetch all info URLs in all feeds with them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(('http://' + feed['info_url'])
            if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url'])
    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (feed_info_urls_str[:2000] + ' ...') if len(feed_info_urls_str) > 2000 else feed_info_urls_str  #truncate if necessary
    if len(feed_info_urls):
        logging.info("Fetching enhanced feed info for %i feeds: %s" % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(feed_info_urls, feed_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug("Feed at %s retrieved, result: %s" % (url, data)))
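#Both fetchers delegate retry bookkeeping to inc_fetch_retry, which is referenced above but not
#defined in this section. The sketch below is a hypothetical, minimal version of such a helper,
#assuming pymongo-style collections and the fetch_info_retry / info_status / errors fields the
#callers rely on; the collection dispatch on the presence of an 'asset' key is a guess, not
#something this module specifies.
def inc_fetch_retry(db, entry, max_retry=3, errors=None):
    #Hypothetical sketch, not the module's actual implementation.
    errors = errors or []
    entry['fetch_info_retry'] = entry.get('fetch_info_retry', 0) + 1
    entry['errors'] = errors
    if entry['fetch_info_retry'] >= max_retry:
        entry['info_status'] = 'error'  #give up: the document is no longer re-queued as 'needfetch'
    #asset documents carry an 'asset' key (see fetch_all_asset_info); feed documents do not
    collection = db.asset_extended_info if 'asset' in entry else db.feeds
    collection.replace_one({'_id': entry['_id']}, entry)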
def fetch_all_asset_info(db):
    assets = list(db.asset_extended_info.find({'info_status': 'needfetch'}))
    asset_info_urls = []

    def asset_fetch_complete_hook(urls_data):
        #urls_data maps each fetched URL to a (success, data_or_error) tuple
        logging.info("Enhanced asset info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        for asset in assets:
            logging.debug("Looking at asset %s: %s" % (asset, asset['info_url']))
            if asset['info_url']:
                info_url = ('http://' + asset['info_url']) \
                    if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url']
                if info_url not in urls_data:  #guard against URLs the fetcher never returned (mirrors the feed hook)
                    logging.warn("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (info_url, len(urls_data)))
                    continue
                if not urls_data[info_url][0]:  #request was not successful
                    inc_fetch_retry(db, asset, max_retry=ASSET_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logging.warn("Fetch for asset at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], asset['fetch_info_retry'], ASSET_MAX_RETRY))
                else:
                    result = process_asset_info(db, asset, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for asset %s at %s not successful: %s" % (asset['asset'], info_url, result[1]))
                    else:
                        logging.info("Processing for asset %s at %s successful" % (asset['asset'], info_url))

    #compose and fetch all info URLs in all assets with them
    for asset in assets:
        if not asset['info_url']:
            continue
        if asset.get('disabled', False):
            logging.info("ExtendedAssetInfo: Skipping disabled asset %s" % asset['asset'])
            continue
        #may or may not end with .json. may or may not start with http:// or https://
        asset_info_urls.append(('http://' + asset['info_url'])
            if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url'])
    asset_info_urls_str = ', '.join(asset_info_urls)
    asset_info_urls_str = (asset_info_urls_str[:2000] + ' ...') if len(asset_info_urls_str) > 2000 else asset_info_urls_str  #truncate if necessary
    if len(asset_info_urls):
        logging.info('Fetching enhanced asset info for %i assets: %s' % (len(asset_info_urls), asset_info_urls_str))
        util.stream_fetch(asset_info_urls, asset_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4 * 1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug("Asset info URL %s retrieved, result: %s" % (url, data)))
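#The scheme-prefixing conditional above is inlined four times across the two functions. If this
#were refactored, it could collapse into a small helper along these lines (normalize_info_url is
#our name for it, not something this module defines); each call site would then read
#info_url = normalize_info_url(feed['info_url']) or normalize_info_url(asset['info_url']).
def normalize_info_url(info_url):
    #Prefix bare host/path info URLs with http://; leave explicit http(s) URLs untouched.
    if info_url.startswith('http://') or info_url.startswith('https://'):
        return info_url
    return 'http://' + info_url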
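#Both completion hooks assume util.stream_fetch invokes them with a dict mapping each unique URL
#to a (success, data_or_error) tuple. util.stream_fetch itself is not part of this section; the
#stand-in below is a simplified, synchronous illustration of that contract using requests. It
#ignores the group pacing parameters, skips any body parsing the real implementation may do, and
#the shape of the per-request callback's data argument is assumed.
import requests

def stream_fetch_stub(urls, complete_hook, fetch_timeout=10, max_fetch_size=4 * 1024,
                      urls_group_size=20, urls_group_time_spacing=20,
                      per_request_complete_callback=None):
    #Illustrative stand-in only -- not the real util.stream_fetch.
    urls_data = {}
    for url in set(urls):  #the hooks report how many *unique* URLs were fetched
        try:
            resp = requests.get(url, timeout=fetch_timeout, stream=True)
            body = resp.raw.read(max_fetch_size, decode_content=True)  #cap the download size
            urls_data[url] = (True, body)
        except Exception as e:
            urls_data[url] = (False, str(e))
        if per_request_complete_callback:
            per_request_complete_callback(url, urls_data[url][1])
    complete_hook(urls_data)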