def main():
    cache = get_cache()
    failed_uris = get_failed()

    # don't check uris that have enough tags in the cache
    done = set()
    needed = set(NEEDED)
    for key, value in cache.iteritems():
        if not needed - set(value.keys()):
            done.add(key)

    # also don't check failed uris (to allow multiple partial runs)
    done |= failed_uris

    # get uris for the rest
    uris_todo = set(cache.keys()) - done

    # get tags and merge them into the existing results
    new_result, new_failed = get_all_tags(uris_todo)
    for uri, tags in new_result.iteritems():
        if uri in cache:
            cache[uri].update(tags)
        else:
            cache[uri] = tags

    set_failed(failed_uris | set(new_failed))
    set_cache(cache)
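
# A minimal sketch of what get_all_tags() could look like, assuming a
# hypothetical fetch_tags(uri) helper that returns a (uri, {tag: value})
# tuple for one stream (an empty dict on error).  The real implementation
# is defined elsewhere and may differ.
def get_all_tags(uris_todo):
    from multiprocessing import Pool

    result = {}
    failed = []
    pool = Pool(PROCESSES)
    try:
        for uri, tags in pool.imap_unordered(fetch_tags, uris_todo):
            if tags:
                result[uri] = tags
            else:
                failed.append(uri)
    finally:
        pool.terminate()
        pool.join()
    return result, failed
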
def main():
    cache = get_cache()
    failed_uris = get_failed()
    parse_failed_uris = get_parse_failed()

    # only fetch peak listener counts for uris that don't have one yet
    # and that haven't failed before
    uris = cache.keys()
    peak_missing = [uri for uri in uris if LISTENERPEAK not in cache[uri]]
    peak_missing = set(peak_missing) - failed_uris

    # XXX: fetch_stream_infos is the same for each root url
    peak_missing = {get_root(uri) for uri in peak_missing}
    peak_missing = set(peak_missing) - parse_failed_uris

    pool = Pool(PROCESSES)
    try:
        pfunc = fetch_stream_infos
        for i, res in enumerate(pool.imap_unordered(pfunc, peak_missing)):
            uri, streams = res

            # save the cache every 1000 processed uris
            if (i + 1) % 1000 == 0:
                set_cache(cache)

            print "%d/%d " % (i + 1, len(peak_missing)) + uri + " -> ",
            print "%d new streams" % len(streams)

            if not streams:
                parse_failed_uris.add(uri)

            # add newly found uris to the cache, together with their
            # peak/current listener counts
            for stream in streams:
                peak = str(int(stream.peak))
                current = str(int(stream.current))
                stream_uri = stream.stream

                if stream_uri not in cache:
                    cache[stream_uri] = {}

                if LISTENERPEAK in cache[stream_uri]:
                    cache[stream_uri][LISTENERPEAK].append(peak)
                else:
                    cache[stream_uri][LISTENERPEAK] = [peak]

                if LISTENERCURRENT in cache[stream_uri]:
                    cache[stream_uri][LISTENERCURRENT].append(current)
                else:
                    cache[stream_uri][LISTENERCURRENT] = [current]
    except Exception as e:
        print e
    finally:
        set_parse_failed(parse_failed_uris)
        set_cache(cache)
        pool.terminate()
        pool.join()
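
# A minimal sketch of the persistence helpers both mains rely on
# (get_cache, set_cache, get_failed, set_failed, get_parse_failed,
# set_parse_failed), assuming the cache is a JSON file mapping
# uri -> {tag: value(s)} and the failed-uri sets are stored as JSON lists.
# The file names used here are assumptions, not taken from the original code.
import json
import os

CACHE_PATH = "cache.json"
FAILED_PATH = "failed.json"
PARSE_FAILED_PATH = "parse_failed.json"


def _load_json(path, default):
    # return the parsed file contents, or the default if the file is missing
    if not os.path.exists(path):
        return default
    with open(path, "r") as handle:
        return json.load(handle)


def _save_json(path, data):
    with open(path, "w") as handle:
        json.dump(data, handle)


def get_cache():
    return _load_json(CACHE_PATH, {})


def set_cache(cache):
    _save_json(CACHE_PATH, cache)


def get_failed():
    return set(_load_json(FAILED_PATH, []))


def set_failed(failed_uris):
    _save_json(FAILED_PATH, sorted(failed_uris))


def get_parse_failed():
    return set(_load_json(PARSE_FAILED_PATH, []))


def set_parse_failed(parse_failed_uris):
    _save_json(PARSE_FAILED_PATH, sorted(parse_failed_uris))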