        pool.push(work, link)

    pool.join()
    logger.info('Join complete, updating with results.')
    # Merge every worker's result set into the shared words collection.
    words.update(*pool.get_results())
    pool.close()
    return words


if __name__ == '__main__':
    import argparse
    from common import OfflineParser

    parser = OfflineParser(
        description='Finds every word that\'s a potential tripcode.',
        epilog='if no links are given all of 4chan is scraped')
    parser.add_argument(
        'outfile', type=argparse.FileType('w'),
        help='file to write the words, will be overwritten')
    parser.add_argument(
        'link', nargs='*',
        help='boards/pages/threads, may either be full URLs or names like /g/')

    args = parser.parse_args()
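# --- Hedged sketch (not part of the original scripts) ---
# These scripts rely on a pool object exposing push/join/get_results/close.
# Its real implementation lives in the repo's common module and is not shown
# here; this minimal thread-pool sketch only illustrates the assumed
# interface, not the actual code.
import queue
import threading


class SketchPool:
    def __init__(self, num_workers=4):
        self._tasks = queue.Queue()
        self._results = []
        self._lock = threading.Lock()
        self._workers = [
            threading.Thread(target=self._worker, daemon=True)
            for _ in range(num_workers)
        ]
        for worker in self._workers:
            worker.start()

    def _worker(self):
        # Pull (function, argument) pairs off the queue forever; daemon
        # threads are reaped when the process exits.
        while True:
            func, arg = self._tasks.get()
            try:
                result = func(arg)
                with self._lock:
                    self._results.append(result)
            finally:
                self._tasks.task_done()

    def push(self, func, arg):
        self._tasks.put((func, arg))

    def join(self):
        # Block until every pushed task has been processed.
        self._tasks.join()

    def get_results(self):
        with self._lock:
            return list(self._results)

    def close(self):
        # Nothing to clean up in this sketch; see _worker above.
        pass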
    # Fold each worker's Counter into the aggregate.
    for counter in pool.get_results():
        ngrams.update(counter)
    pool.close()
    return ngrams


if __name__ == "__main__":
    import argparse
    from common import OfflineParser

    parser = OfflineParser(
        description="Collects ngrams where the tokens are words.",
        epilog="if no links are given all of 4chan is scraped",
    )
    parser.add_argument("outfile", type=argparse.FileType("w"),
                        help="file to write the ngrams, will be overwritten")
    parser.add_argument("n", type=int,
                        help="the n in n-gram, 1 gives unigrams, 2 bigrams, ...")
    parser.add_argument("link", nargs="*",
                        help="boards/pages/threads, may either be full URLs or names like /g/")

    args = parser.parse_args()
    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    ngrams = find_ngrams(args.n, *args.link)
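# --- Hedged sketch (not part of the original scripts) ---
# find_ngrams itself is not shown above; this sketch only illustrates the
# sliding-window counting a per-link worker would plausibly perform before
# its Counter is folded into the aggregate, with count_ngrams as a
# hypothetical stand-in for the real tokenising worker.
from collections import Counter


def count_ngrams(tokens, n):
    """Count n-grams by sliding a window of length n over the tokens."""
    return Counter(zip(*(tokens[i:] for i in range(n))))


# Example: count_ngrams("the cat sat on the mat".split(), 2) yields
# Counter({('the', 'cat'): 1, ('cat', 'sat'): 1, ('sat', 'on'): 1,
#          ('on', 'the'): 1, ('the', 'mat'): 1})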
    # Normalise live URLs into cache keys, then drop every cached key
    # that is no longer live.
    live = map(WebEntity.webcache.url_to_key, live)
    live = set(live)

    keys = WebEntity.webcache.keys()
    keys = filter(lambda key: key not in live, keys)
    for key in keys:
        logger.info('pruning %s', key)
        WebEntity.webcache.remove_key(key)


if __name__ == '__main__':
    from common import OfflineParser

    parser = OfflineParser(
        description='Prunes 404ed entries from the web cache.',
        epilog='if no links are given all of 4chan is scraped')
    parser.add_argument(
        'link', nargs='*',
        help='boards/pages/threads, may either be full URLs or names like /g/')

    args = parser.parse_args()
    if parser.sanity_check(args):
        exit(1)

    parser.pre_process(args)
    prune_cache(*args.link)
    parser.post_process(args, force_cache_write=True)
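# --- Hedged sketch (not part of the original scripts) ---
# Self-contained illustration of the pruning pattern above, with a plain
# dict standing in for WebEntity.webcache: keep the keys that are still
# live and delete the rest.
cache = {'key-a': b'...', 'key-b': b'...', 'key-c': b'...'}
live = {'key-a', 'key-c'}

for key in [key for key in cache if key not in live]:
    print('pruning', key)
    del cache[key]

assert set(cache) == live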