Example #1
def main():
    cache = get_cache()
    failed_uris = get_failed()

    # skip uris that already have all needed tags in the cache
    done = set()
    tags = set(NEEDED)
    for key, value in cache.items():
        if not tags - set(value.keys()):
            done.add(key)

    # also don't check failed (to allow multiple partial runs)
    done |= failed_uris

    # get uris for the rest
    uris_todo = set(cache.keys()) - done

    # fetch tags for the remaining uris and merge the results into the cache
    new_result, new_failed = get_all_tags(uris_todo)
    for uri, tags in new_result.items():
        if uri in cache:
            cache[uri].update(tags)
        else:
            cache[uri] = tags

    set_failed(failed_uris | set(new_failed))
    set_cache(cache)
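All of these examples share a handful of helpers (get_cache, set_cache, get_failed, set_failed and friends) that live in the surrounding project and are not shown here. A minimal sketch of what a pickle-backed version could look like; the file names and exact semantics are assumptions, not the project's actual code:

import os
import pickle

CACHE_PATH = "cache.pickle"    # hypothetical paths; the real project
FAILED_PATH = "failed.pickle"  # defines its own storage locations

def _load(path, default):
    # return the unpickled object at path, or default if the file is missing
    if not os.path.exists(path):
        return default
    with open(path, "rb") as h:
        return pickle.load(h)

def _save(path, obj):
    with open(path, "wb") as h:
        pickle.dump(obj, h)

def get_cache():
    # maps uri -> {tag_name: tag_value}
    return _load(CACHE_PATH, {})

def set_cache(cache):
    _save(CACHE_PATH, cache)

def get_failed():
    # set of uris for which tag fetching failed
    return _load(FAILED_PATH, set())

def set_failed(failed):
    _save(FAILED_PATH, failed)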
Example #2
import pickle

def main():

    # crawl and cache
    try:
        with open(XIPH_CACHE, "rb") as h:
            result = pickle.load(h)
    except IOError:
        result = crawl_xiph()
        with open(XIPH_CACHE, "wb") as h:
            pickle.dump(result, h)

    cache = get_cache()

    # add new streams and their listener counts to the cache
    for pl in result:
        for stream in pl.streams:
            if stream not in cache:
                cache[stream] = {}

            if LISTENERCURRENT in cache[stream]:
                cache[stream][LISTENERCURRENT].append(str(pl.listeners))
            else:
                cache[stream][LISTENERCURRENT] = [str(pl.listeners)]

    set_cache(cache)
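The append-or-create branch at the end can be collapsed with dict.setdefault, which returns the existing value or inserts and returns the given default. An equivalent rewrite of the loop above, using the same names:

for pl in result:
    for stream in pl.streams:
        entry = cache.setdefault(stream, {})
        entry.setdefault(LISTENERCURRENT, []).append(str(pl.listeners))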
Example #3
from multiprocessing import Pool

def main():
    cache = get_cache()
    failed_uris = get_failed()
    parse_failed_uris = get_parse_failed()

    uris = cache.keys()

    peak_missing = [uri for uri in uris if LISTENERPEAK not in cache[uri]]
    peak_missing = set(peak_missing) - failed_uris

    # XXX: fetch_stream_infos gives the same result for every uri under the
    # same root url, so only query each root once
    peak_missing = {get_root(uri) for uri in peak_missing}
    peak_missing -= parse_failed_uris

    pool = Pool(PROCESSES)
    try:
        pfunc = fetch_stream_infos
        for i, res in enumerate(pool.imap_unordered(pfunc, peak_missing)):
            uri, streams = res

            # checkpoint the cache every 1000 results so partial runs survive
            if (i + 1) % 1000 == 0:
                set_cache(cache)

            print "%d/%d " % (i+1, len(peak_missing)) + uri + " -> ",
            print "%d new streams" % len(streams)

            if not streams:
                parse_failed_uris.add(uri)

            # add newly found uris to the cache along with their listener counts
            for stream in streams:
                peak = str(int(stream.peak))
                current = str(int(stream.current))
                uri = stream.stream

                if uri not in cache:
                    cache[uri] = {}

                if LISTENERPEAK in cache[uri]:
                    cache[uri][LISTENERPEAK].append(peak)
                else:
                    cache[uri][LISTENERPEAK] = [peak]

                if LISTENERCURRENT in cache[uri]:
                    cache[uri][LISTENERCURRENT].append(current)
                else:
                    cache[uri][LISTENERCURRENT] = [current]

    except Exception as e:
        print(e)
    finally:
        set_parse_failed(parse_failed_uris)
        set_cache(cache)
        pool.terminate()
        pool.join()
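The combination above, pool.imap_unordered plus a periodic set_cache checkpoint, is what makes interrupted crawls resumable: results are consumed in completion order, and the on-disk cache is never more than 1000 results stale. A self-contained sketch of the same pattern, with a trivial worker standing in for fetch_stream_infos:

from multiprocessing import Pool

def work(n):
    # stand-in for fetch_stream_infos: any picklable function works
    return n, n * n

def run():
    results = {}
    with Pool(4) as pool:
        # results arrive in completion order, not submission order
        for i, (key, value) in enumerate(pool.imap_unordered(work, range(5000)), 1):
            results[key] = value
            if i % 1000 == 0:
                pass  # checkpoint here, e.g. set_cache(results)
    return results

if __name__ == "__main__":
    print(len(run()))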
Example #4
def main():
    cache = {}
    try:
        with open(URILIST, "r", encoding="utf-8") as h:
            for uri in h.read().splitlines():
                cache[uri] = {}
    except IOError:
        pass

    set_cache(cache)
Example #5
import re

import requests

def main():
    uris = []
    r = requests.get("https://somafm.com/listen/")
    playlists = re.findall(r'[^"\']*?\.pls', r.text)
    for i, pls in enumerate(playlists):
        print "%d/%d" % (i + 1, len(playlists))
        uris.extend(get_pls(pls))

    cache = get_cache()
    for uri in uris:
        if uri not in cache:
            cache[uri] = {}
    set_cache(cache)
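get_pls is another project helper that is not shown; judging from its use it downloads a .pls playlist and returns the stream uris it lists. A sketch under that assumption (PLS files are INI-style, with File1=, File2=, ... entries), where the regex and exact behavior are guesses rather than the project's real code:

import re

import requests

def get_pls(url):
    # fetch a .pls playlist and pull out its FileN= entries
    text = requests.get(url).text
    return re.findall(r"^File\d+\s*=\s*(\S+)", text, re.MULTILINE)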