Example #1
def main():
    cache = get_cache()
    failed_uris = get_failed()

    # Skip URIs whose cache entry already contains all needed tags.
    done = set()
    needed_tags = set(NEEDED)
    for key, value in cache.items():
        if not needed_tags - set(value):
            done.add(key)

    # Also skip URIs that failed before, so multiple partial runs are possible.
    done |= failed_uris

    # Everything else still has to be fetched.
    uris_todo = set(cache) - done

    # Fetch the tags and merge the results into the cache.
    new_result, new_failed = get_all_tags(uris_todo)
    for uri, new_tags in new_result.items():
        if uri in cache:
            cache[uri].update(new_tags)
        else:
            cache[uri] = new_tags

    set_failed(failed_uris | set(new_failed))
    set_cache(cache)
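
None of the helpers the example calls (get_cache, set_cache, get_failed, set_failed, get_all_tags) or the NEEDED constant are shown on this page. Below is a minimal sketch of plausible stand-ins, assuming JSON files as the backing store and a stub tag fetcher; every name in it is an assumption, not the original module's implementation.

import json
import os

NEEDED = ["title", "artist"]       # hypothetical list of required tags
CACHE_FILE = "tags_cache.json"     # hypothetical storage locations
FAILED_FILE = "tags_failed.json"


def _load(path, default):
    # Load JSON from `path`, falling back to `default` if it doesn't exist.
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return default


def get_cache():
    return _load(CACHE_FILE, {})


def set_cache(cache):
    with open(CACHE_FILE, "w") as f:
        json.dump(cache, f)


def get_failed():
    return set(_load(FAILED_FILE, []))


def set_failed(failed_uris):
    with open(FAILED_FILE, "w") as f:
        json.dump(sorted(failed_uris), f)


def get_all_tags(uris):
    # Stub: a real implementation would fetch the tags for each URI and
    # return (results, failed), where results maps uri -> {tag: value}.
    return {uri: {} for uri in uris}, []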
Example #2
from multiprocessing import Pool  # assumed; the snippet uses Pool without showing the import


def main():
    cache = get_cache()
    failed_uris = get_failed()
    parse_failed_uris = get_parse_failed()

    uris = list(cache)

    # Only fetch URIs that still lack a listener peak and haven't failed before.
    peak_missing = [uri for uri in uris if LISTENERPEAK not in cache[uri]]
    peak_missing = set(peak_missing) - failed_uris

    # XXX: fetch_stream_infos is the same for each root URL
    peak_missing = {get_root(uri) for uri in peak_missing}
    peak_missing -= parse_failed_uris

    pool = Pool(PROCESSES)
    try:
        pfunc = fetch_stream_infos
        for i, res in enumerate(pool.imap_unordered(pfunc, peak_missing)):
            uri, streams = res

            # Save intermediate results every 1000 processed URIs.
            if (i + 1) % 1000 == 0:
                set_cache(cache)

            print("%d/%d %s -> %d new streams"
                  % (i + 1, len(peak_missing), uri, len(streams)))

            if not streams:
                parse_failed_uris.add(uri)

            # Add newly found stream URIs to the cache together with their
            # peak and current listener counts.
            for stream in streams:
                peak = str(int(stream.peak))
                current = str(int(stream.current))
                stream_uri = stream.stream

                entry = cache.setdefault(stream_uri, {})
                entry.setdefault(LISTENERPEAK, []).append(peak)
                entry.setdefault(LISTENERCURRENT, []).append(current)

    except Exception as e:
        print(e)
    finally:
        set_parse_failed(parse_failed_uris)
        set_cache(cache)
        pool.terminate()
        pool.join()
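
Here too the helpers (fetch_stream_infos, get_root, get_parse_failed, set_parse_failed) and the constants LISTENERPEAK, LISTENERCURRENT, and PROCESSES come from elsewhere in the module. A minimal sketch of what they might look like, with the fetcher left as a stub; all names and values below are assumptions:

from urllib.parse import urlsplit

LISTENERPEAK = "listenerpeak"        # hypothetical tag keys
LISTENERCURRENT = "listenercurrent"
PROCESSES = 8                        # hypothetical pool size


def get_root(uri):
    # Reduce a stream URI to its root URL; per the XXX comment above,
    # fetch_stream_infos returns the same data for every URI below one
    # root, so roots are deduplicated before fetching.
    parts = urlsplit(uri)
    return "%s://%s/" % (parts.scheme, parts.netloc)


def fetch_stream_infos(uri):
    # Stub: a real implementation would download and parse the metadata
    # behind `uri` and return (uri, streams), where each stream object
    # has `peak`, `current`, and `stream` attributes.
    return uri, []

get_parse_failed and set_parse_failed would mirror get_failed and set_failed from the first sketch, just with a separate file. Note that fetch_stream_infos must be defined at module level so multiprocessing can pickle it when it is handed to pool.imap_unordered.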