Code Example #1
        pool.push(work, link)   # queue a scraping job for this link
        pool.join()             # wait for every queued job to finish

    logger.info('Join complete, updating with results.')
    words.update(*pool.get_results())
    pool.close()

    return words

if __name__ == '__main__':
    import argparse

    from common import OfflineParser

    parser = OfflineParser(
        description='Finds every word that\'s a potential tripcode.',
        epilog='if no links are given all of 4chan is scraped'
    )

    parser.add_argument(
        'outfile',
        type=argparse.FileType('w'),
        help='file to write the words, will be overwritten'
    )

    parser.add_argument(
        'link', nargs='*',
        help='boards/pages/threads, may either be full URLs or names like /g/'
    )

    args = parser.parse_args()
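
Example #1 breaks off right after `parse_args()`. Judging from the pattern in Examples #2 through #4, the rest of the main block plausibly runs the sanity check, pre-processes, calls the word finder, and post-processes; the function name `find_words` and the line-per-word output format below are assumptions, not confirmed by the source.

    # Hedged sketch of the likely continuation; `find_words` and the
    # output format are assumptions based on the other examples.
    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    words = find_words(*args.link)
    parser.post_process(args)

    args.outfile.write('\n'.join(sorted(words)))
    args.outfile.close()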
Code Example #2
    for counter in pool.get_results():
        ngrams.update(counter)

    pool.close()

    return ngrams


if __name__ == "__main__":
    import argparse

    from common import OfflineParser

    parser = OfflineParser(
        description="Collects ngrams where the tokens are words.",
        epilog="if no links are given all of 4chan is scraped",
    )

    parser.add_argument("outfile", type=argparse.FileType("w"), help="file to write the ngrams, will be overwritten")

    parser.add_argument("n", type=int, help="the n in n-gram, 1 gives unigrams, 2 bigrams, ...")

    parser.add_argument("link", nargs="*", help="boards/pages/threads, may either be full URLs or names like /g/")

    args = parser.parse_args()

    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    ngrams = find_ngrams(args.n, *args.link)
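
This example also stops before the results are written out. A hedged sketch of the remaining steps, assuming `ngrams` is a `collections.Counter` (consistent with the `update(counter)` calls above); the tab-separated output format is an invention for illustration:

    # Sketch only: post-process, then dump the counts. The Counter type
    # and the "count<TAB>ngram" format are assumptions.
    parser.post_process(args)

    for ngram, count in ngrams.most_common():
        args.outfile.write("{}\t{}\n".format(count, " ".join(ngram)))
    args.outfile.close()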
Code Example #3
    keys = WebEntity.webcache.keys()
    keys = filter(lambda key: key not in live, keys)

    for key in keys:
        logger.info('pruning %s', key)
        WebEntity.webcache.remove_key(key)

if __name__ == '__main__':
    from common import OfflineParser

    parser = OfflineParser(
        description='Prunes 404ed entries from the web cache.',
        epilog='if no links are given all of 4chan is scraped'
    )

    parser.add_argument(
        'link', nargs='*',
        help='boards/pages/threads, may either be full URLs or names like /g/'
    )

    args = parser.parse_args()

    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    prune_cache(*args.link)
    parser.post_process(args, force_cache_write=True)
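
One subtlety worth noting: in Python 3, `filter()` is lazy, so if `WebEntity.webcache.keys()` hands back a live dict view rather than a copy, calling `remove_key()` inside the loop can fail mid-iteration. A self-contained demonstration, with a plain dict standing in for the cache:

# Deleting from a dict while lazily iterating its live key view fails.
cache = {'a': 1, 'b': 2, 'c': 3}
doomed = filter(lambda key: key != 'b', cache.keys())
try:
    for key in doomed:
        del cache[key]
except RuntimeError as err:
    print(err)  # dictionary changed size during iteration

# Materializing the keys first makes removal safe.
for key in [k for k in cache if k != 'b']:
    del cache[key]

If `webcache.keys()` already returns a fresh list, the pruning loop above is safe as written.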
Code Example #4
    live = map(WebEntity.webcache.url_to_key, live)
    live = set(live)

    keys = WebEntity.webcache.keys()
    keys = filter(lambda key: key not in live, keys)

    for key in keys:
        logger.info('pruning %s', key)
        WebEntity.webcache.remove_key(key)


if __name__ == '__main__':
    from common import OfflineParser

    parser = OfflineParser(
        description='Prunes 404ed entries from the web cache.',
        epilog='if no links are given all of 4chan is scraped')

    parser.add_argument(
        'link',
        nargs='*',
        help='boards/pages/threads, may either be full URLs or names like /g/')

    args = parser.parse_args()

    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    prune_cache(*args.link)
    parser.post_process(args, force_cache_write=True)
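
Both pruning examples depend on `WebEntity.webcache` exposing `keys()`, `url_to_key()`, and `remove_key()`. Below is a minimal dict-backed stand-in with that interface, handy for exercising `prune_cache` without a real cache; the trailing-slash normalization in `url_to_key` is an assumption, not the library's actual behavior.

class FakeWebCache:
    """Dict-backed stand-in for the cache interface used above (assumed shape)."""

    def __init__(self):
        self._store = {}

    def url_to_key(self, url):
        # Assumption: a key is just the URL with any trailing slash dropped.
        return url.rstrip('/')

    def keys(self):
        # Return a snapshot so keys can be removed while iterating.
        return list(self._store)

    def remove_key(self, key):
        self._store.pop(key, None)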