def main():
    cache = get_cache()
    failed_uris = get_failed()

    # don't check uris that have enough tags in the cache
    done = set()
    tags = set(NEEDED)
    for key, value in cache.iteritems():
        if not tags - set(value.keys()):
            done.add(key)

    # also don't check failed (to allow multiple partial runs)
    done |= failed_uris

    # get uris for the rest
    uris_todo = set(cache.keys()) - done

    # get tags and replace all results
    new_result, new_failed = get_all_tags(uris_todo)
    for uri, tags in new_result.iteritems():
        if uri in cache:
            cache[uri].update(tags)
        else:
            cache[uri] = tags

    set_failed(failed_uris | set(new_failed))
    set_cache(cache)
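
# The persistence helpers used throughout these scripts (get_cache,
# set_cache, get_failed, set_failed) are not shown above. A minimal
# sketch, assuming a pickled dict keyed by stream uri and a pickled set
# of failed uris (the CACHE and FAILED file names are hypothetical):
import pickle

def get_cache():
    # hypothetical CACHE path; returns {} on first run
    try:
        with open(CACHE, "rb") as h:
            return pickle.load(h)
    except IOError:
        return {}

def set_cache(cache):
    with open(CACHE, "wb") as h:
        pickle.dump(cache, h)

def get_failed():
    # hypothetical FAILED path; returns an empty set on first run
    try:
        with open(FAILED, "rb") as h:
            return pickle.load(h)
    except IOError:
        return set()

def set_failed(failed):
    with open(FAILED, "wb") as h:
        pickle.dump(failed, h)
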
import pickle

def main():
    # crawl and cache
    try:
        with open(XIPH_CACHE, "rb") as h:
            result = pickle.load(h)
    except IOError:
        result = crawl_xiph()
        with open(XIPH_CACHE, "wb") as h:
            pickle.dump(result, h)

    cache = get_cache()

    # add new streams and listener counts to the cache
    for pl in result:
        for stream in pl.streams:
            if stream not in cache:
                cache[stream] = {}
            if LISTENERCURRENT in cache[stream]:
                cache[stream][LISTENERCURRENT].append(str(pl.listeners))
            else:
                cache[stream][LISTENERCURRENT] = [str(pl.listeners)]

    set_cache(cache)
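
# crawl_xiph() is not shown; the loop above assumes it returns one
# record per directory playlist, each carrying the stream uris it
# points to and a current listener count. A minimal sketch of that
# assumed shape (the XiphPlaylist name is hypothetical):
from collections import namedtuple

XiphPlaylist = namedtuple("XiphPlaylist", ["streams", "listeners"])

# e.g. crawl_xiph() would produce entries like:
#   XiphPlaylist(streams=["http://example.com:8000/stream"], listeners=42)
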
from multiprocessing import Pool

def main():
    cache = get_cache()
    failed_uris = get_failed()
    parse_failed_uris = get_parse_failed()

    uris = cache.keys()

    peak_missing = [uri for uri in uris if LISTENERPEAK not in cache[uri]]
    peak_missing = set(peak_missing) - failed_uris

    # XXX: fetch_stream_infos is the same for each root url
    peak_missing = {get_root(uri) for uri in peak_missing}
    peak_missing = set(peak_missing) - parse_failed_uris

    pool = Pool(PROCESSES)
    try:
        pfunc = fetch_stream_infos
        for i, res in enumerate(pool.imap_unordered(pfunc, peak_missing)):
            uri, streams = res

            # save the cache every 1000 processed uris
            if (i + 1) % 1000 == 0:
                set_cache(cache)

            print "%d/%d " % (i + 1, len(peak_missing)) + uri + " -> ",
            print "%d new streams" % len(streams)

            if not streams:
                parse_failed_uris.add(uri)

            # add newly found uris to the cache, with listener counts
            for stream in streams:
                peak = str(int(stream.peak))
                current = str(int(stream.current))
                uri = stream.stream
                if uri not in cache:
                    cache[uri] = {}

                if LISTENERPEAK in cache[uri]:
                    cache[uri][LISTENERPEAK].append(peak)
                else:
                    cache[uri][LISTENERPEAK] = [peak]

                if LISTENERCURRENT in cache[uri]:
                    cache[uri][LISTENERCURRENT].append(current)
                else:
                    cache[uri][LISTENERCURRENT] = [current]
    except Exception as e:
        print e
    finally:
        set_parse_failed(parse_failed_uris)
        set_cache(cache)
        pool.terminate()
        pool.join()
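
# get_root() and fetch_stream_infos() are project helpers not shown
# here. Given the XXX note above, get_root presumably collapses a
# stream uri to its scheme://host:port root so each server is queried
# only once; a minimal sketch of that assumption:
from urlparse import urlsplit

def get_root(uri):
    # e.g. "http://example.com:8000/stream.mp3" -> "http://example.com:8000"
    parts = urlsplit(uri)
    return "%s://%s" % (parts.scheme, parts.netloc)
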
def main():
    cache = {}
    try:
        with open(URILIST, "rb") as h:
            for uri in h.read().splitlines():
                cache[uri] = {}
    except IOError:
        pass
    set_cache(cache)

import re

import requests

def main():
    uris = []
    r = requests.get("https://somafm.com/listen/")
    # collect all .pls playlist links on the page (dot escaped so only
    # real ".pls" suffixes match)
    playlists = re.findall('[^"\']*?\\.pls', r.text)
    for i, pls in enumerate(playlists):
        print "%d/%d" % (i + 1, len(playlists))
        uris.extend(get_pls(pls))

    cache = get_cache()
    for uri in uris:
        if uri not in cache:
            cache[uri] = {}
    set_cache(cache)
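
# get_pls() is not shown above; it is assumed to download one .pls
# playlist and pull the stream uris out of it. A minimal sketch,
# relying on the ini-style "FileN=" lines of the pls format (this
# implementation and its error handling are assumptions, not the
# original helper):
def get_pls(url):
    """Return all stream uris listed in the .pls playlist at `url`."""
    uris = []
    try:
        r = requests.get(url)
    except requests.RequestException:
        return uris
    for line in r.text.splitlines():
        # pls entries look like "File1=http://example.com:8000/stream"
        if line.lower().startswith("file") and "=" in line:
            uris.append(line.split("=", 1)[-1].strip())
    return uris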