Example #1
def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({"info_status": "needfetch"}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        logging.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = db.feeds.find({"info_status": "needfetch"})
        for feed in feeds:
            # logging.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed["info_url"]:
                info_url = (
                    ("http://" + feed["info_url"])
                    if not feed["info_url"].startswith("http://") and not feed["info_url"].startswith("https://")
                    else feed["info_url"]
                )
                if info_url not in urls_data:
                    logging.warning(
                        "URL %s not properly fetched (not one of %i entries in urls_data), skipping..."
                        % (info_url, len(urls_data))
                    )
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]:  # request was not successful
                    inc_fetch_retry(db, feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logging.warning(
                        "Fetch for feed at %s not successful: %s (try %i of %i)"
                        % (info_url, urls_data[info_url][1], feed["fetch_info_retry"], FEED_MAX_RETRY)
                    )
                else:
                    result = process_feed_info(db, feed, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logging.info("Processing for feed at %s successful" % info_url)

    # compose the info URLs for all feeds that have them, then fetch them
    for feed in feeds:
        assert feed["info_url"]
        feed_info_urls.append(
            ("http://" + feed["info_url"])
            if not feed["info_url"].startswith("http://") and not feed["info_url"].startswith("https://")
            else feed["info_url"]
        )
    feed_info_urls_str = ", ".join(feed_info_urls)
    feed_info_urls_str = (
        (feed_info_urls_str[:2000] + " ...") if len(feed_info_urls_str) > 2000 else feed_info_urls_str
    )  # truncate if necessary
    if len(feed_info_urls):
        logging.info("Fetching enhanced feed info for %i feeds: %s" % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(
            feed_info_urls,
            feed_fetch_complete_hook,
            fetch_timeout=10,
            max_fetch_size=4 * 1024,
            urls_group_size=20,
            urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug(
                "Feed at %s retrieved, result: %s" % (url, data)
            ),
        )
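
Every example builds the fetch URL with the same scheme-prefixing ternary, once inside the completion hook and once in the URL-composition loop. As a minimal sketch (the helper name is not part of the original code), that logic could be factored out:

def normalize_info_url(info_url):
    """Hypothetical helper mirroring the ternary used above: values that
    already start with http:// or https:// are returned unchanged,
    anything else gets an http:// prefix."""
    if info_url.startswith("http://") or info_url.startswith("https://"):
        return info_url
    return "http://" + info_url

# e.g. feed_info_urls.append(normalize_info_url(feed["info_url"]))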
Example #2
def fetch_all_asset_info(db):
    assets = list(db.asset_extended_info.find({'info_status': 'needfetch'}))
    asset_info_urls = []

    def asset_fetch_complete_hook(urls_data):
        logging.info("Enhanced asset info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        for asset in assets:
            logging.debug("Looking at asset %s: %s" % (asset, asset['info_url']))
            if asset['info_url']:
                info_url = ('http://' + asset['info_url']) \
                    if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url']
                assert info_url in urls_data
                if not urls_data[info_url][0]: #request was not successful
                    inc_fetch_retry(db, asset, max_retry=ASSET_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logging.warn("Fetch for asset at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], asset['fetch_info_retry'], ASSET_MAX_RETRY))
                else:
                    result = process_asset_info(db, asset, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for asset %s at %s not successful: %s" % (asset['asset'], info_url, result[1]))
                    else:
                        logging.info("Processing for asset %s at %s successful" % (asset['asset'], info_url))
        
    #compose the info URLs for all assets that have them, then fetch them
    for asset in assets:
        if not asset['info_url']: continue
        
        if asset.get('disabled', False):
            logging.info("ExtendedAssetInfo: Skipping disabled asset %s" % asset['asset'])
            continue

        #may or may not end with .json. may or may not start with http:// or https://
        asset_info_urls.append(('http://' + asset['info_url']) \
            if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url'])

    asset_info_urls_str = ', '.join(asset_info_urls)
    asset_info_urls_str = (asset_info_urls_str[:2000] + ' ...') if len(asset_info_urls_str) > 2000 else asset_info_urls_str #truncate if necessary
    if len(asset_info_urls):
        logging.info('Fetching enhanced asset info for %i assets: %s' % (len(asset_info_urls), asset_info_urls_str))
        util.stream_fetch(asset_info_urls, asset_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4*1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug("Asset info URL %s retrieved, result: %s" % (url, data)))
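
util.stream_fetch itself is not shown in these snippets. Judging from how the hooks index urls_data, each entry appears to map a URL to a (success, payload_or_error) pair. A minimal, synchronous stand-in under that assumption (stream_fetch_stub and its behavior are illustrative, not the real util module) could look like this:

import urllib.request

def stream_fetch_stub(urls, complete_hook, fetch_timeout=10, max_fetch_size=4 * 1024, **kwargs):
    """Illustrative stand-in for util.stream_fetch: fetch each unique URL,
    record (success, body_or_error_message), then call the completion hook
    once with the whole mapping. Grouping/spacing options are ignored."""
    urls_data = {}
    for url in set(urls):
        try:
            with urllib.request.urlopen(url, timeout=fetch_timeout) as resp:
                body = resp.read(max_fetch_size)  # honor the size cap
            urls_data[url] = (True, body.decode("utf-8", errors="replace"))
        except Exception as e:
            urls_data[url] = (False, str(e))
    complete_hook(urls_data)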
Example #3
def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        logging.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            #logging.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url']
                if info_url not in urls_data:
                    logging.warn("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]: #request was not successful
                    inc_fetch_retry(db, feed, max_retry=FEED_MAX_RETRY, errors=[urls_data[info_url][1]])
                    logging.warn("Fetch for feed at %s not successful: %s (try %i of %i)" % (
                        info_url, urls_data[info_url][1], feed['fetch_info_retry'], FEED_MAX_RETRY))
                else:
                    result = process_feed_info(db, feed, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logging.info("Processing for feed at %s successful" % info_url)
        
    #compose the info URLs for all feeds that have them, then fetch them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(('http://' + feed['info_url']) \
            if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url'])
    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (feed_info_urls_str[:2000] + ' ...') if len(feed_info_urls_str) > 2000 else feed_info_urls_str #truncate if necessary
    if len(feed_info_urls):
        logging.info('Fetching enhanced feed info for %i feeds: %s' % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(feed_info_urls, feed_fetch_complete_hook,
            fetch_timeout=10, max_fetch_size=4*1024, urls_group_size=20, urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug("Feed at %s retrieved, result: %s" % (url, data)))
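
The log-string truncation is the same inline expression in every example. A tiny, hypothetical refactor (the helper name is an assumption) keeps the behavior while making it reusable:

def truncate_for_log(text, limit=2000, suffix=" ..."):
    """Truncate long log payloads, matching the inline expression above."""
    return text[:limit] + suffix if len(text) > limit else text

# e.g. feed_info_urls_str = truncate_for_log(", ".join(feed_info_urls))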
Example #4
def fetch_all_feed_info(db):
    feeds = list(db.feeds.find({'info_status': 'needfetch'}))
    feed_info_urls = []

    def feed_fetch_complete_hook(urls_data):
        logging.info("Enhanced feed info fetching complete. %s unique URLs fetched. Processing..." % len(urls_data))
        feeds = db.feeds.find({'info_status': 'needfetch'})
        for feed in feeds:
            #logging.debug("Looking at feed %s: %s" % (feed, feed['info_url']))
            if feed['info_url']:
                info_url = ('http://' + feed['info_url']) \
                    if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url']
                if info_url not in urls_data:
                    logging.error("URL %s not properly fetched (not one of %i entries in urls_data), skipping..." % (info_url, len(urls_data)))
                    continue
                assert info_url in urls_data
                if not urls_data[info_url][0]: #request was not successful
                    max_retry = 3
                    inc_fetch_retry(db, feed, max_retry=max_retry, errors=[urls_data[info_url][1]])
                    logging.error("Fetch for feed at %s not successful: %s (try %i of %i)" % (info_url, urls_data[info_url][1], feed['fetch_info_retry'], max_retry))
                else:
                    result = process_feed_info(db, feed, urls_data[info_url][1])
                    if not result[0]:
                        logging.info("Processing for feed at %s not successful: %s" % (info_url, result[1]))
                    else:
                        logging.info("Processing for feed at %s successful" % info_url)
        
    #compose the info URLs for all feeds that have them, then fetch them
    for feed in feeds:
        assert feed['info_url']
        feed_info_urls.append(('http://' + feed['info_url']) \
            if not feed['info_url'].startswith('http://') and not feed['info_url'].startswith('https://') else feed['info_url'])
    feed_info_urls_str = ', '.join(feed_info_urls)
    feed_info_urls_str = (feed_info_urls_str[:2000] + ' ...') if len(feed_info_urls_str) > 2000 else feed_info_urls_str #truncate if necessary
    if len(feed_info_urls):
        logging.info('Fetching enhanced feed info for %i feeds: %s' % (len(feed_info_urls), feed_info_urls_str))
        util.stream_fetch(feed_info_urls, feed_fetch_complete_hook,
            fetch_timeout=5, max_fetch_size=4*1024, urls_group_size=50, urls_group_time_spacing=6)
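
Example #4 batches more aggressively than the others: groups of 50 URLs every 6 seconds with a 5-second timeout and no per-request callback, versus groups of 20 every 20 seconds elsewhere. Assuming urls_group_time_spacing is the delay in seconds between successive groups (an assumption; the parameter is defined in util.stream_fetch, which is not shown), the rough dispatch rates compare like this:

def approx_urls_per_minute(urls_group_size, urls_group_time_spacing):
    """Rough upper bound, assuming the spacing is the per-group delay in
    seconds (an assumption about util.stream_fetch's semantics)."""
    return urls_group_size * 60 / urls_group_time_spacing

print(approx_urls_per_minute(50, 6))   # Example #4: ~500 URLs per minute
print(approx_urls_per_minute(20, 20))  # other examples: ~60 URLs per minute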
Example #5
def fetch_all_asset_info(db):
    assets = list(db.asset_extended_info.find({'info_status': 'needfetch'}))
    asset_info_urls = []

    def asset_fetch_complete_hook(urls_data):
        logging.info(
            "Enhanced asset info fetching complete. %s unique URLs fetched. Processing..."
            % len(urls_data))
        for asset in assets:
            logging.debug("Looking at asset %s: %s" %
                          (asset, asset['info_url']))
            if asset['info_url']:
                info_url = ('http://' + asset['info_url']) \
                    if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url']
                assert info_url in urls_data
                if not urls_data[info_url][0]:  #request was not successful
                    inc_fetch_retry(db,
                                    asset,
                                    max_retry=ASSET_MAX_RETRY,
                                    errors=[urls_data[info_url][1]])
                    logging.warning(
                        "Fetch for asset at %s not successful: %s (try %i of %i)"
                        % (info_url, urls_data[info_url][1],
                           asset['fetch_info_retry'], ASSET_MAX_RETRY))
                else:
                    result = process_asset_info(db, asset,
                                                urls_data[info_url][1])
                    if not result[0]:
                        logging.info(
                            "Processing for asset %s at %s not successful: %s"
                            % (asset['asset'], info_url, result[1]))
                    else:
                        logging.info(
                            "Processing for asset %s at %s successful" %
                            (asset['asset'], info_url))

    #compose the info URLs for all assets that have them, then fetch them
    for asset in assets:
        if not asset['info_url']: continue

        if asset.get('disabled', False):
            logging.info("ExtendedAssetInfo: Skipping disabled asset %s" %
                         asset['asset'])
            continue

        #may or may not end with .json. may or may not start with http:// or https://
        asset_info_urls.append(('http://' + asset['info_url']) \
            if not asset['info_url'].startswith('http://') and not asset['info_url'].startswith('https://') else asset['info_url'])

    asset_info_urls_str = ', '.join(asset_info_urls)
    asset_info_urls_str = (
        asset_info_urls_str[:2000] + ' ...'
    ) if len(asset_info_urls_str
             ) > 2000 else asset_info_urls_str  #truncate if necessary
    if len(asset_info_urls):
        logging.info('Fetching enhanced asset info for %i assets: %s' %
                     (len(asset_info_urls), asset_info_urls_str))
        util.stream_fetch(
            asset_info_urls,
            asset_fetch_complete_hook,
            fetch_timeout=10,
            max_fetch_size=4 * 1024,
            urls_group_size=20,
            urls_group_time_spacing=20,
            per_request_complete_callback=lambda url, data: logging.debug(
                "Asset info URL %s retrieved, result: %s" % (url, data)))