def build_cache(*links):
    """
    Builds up the internal WebEntity.webcache with a snapshot of the
    provided URLs. If no URLs are given, it will attempt to update the
    cache with a snapshot of the entirety of 4chan.
    """
    pool = Pool(num_threads=parameters.num_threads)

    def work(unit):
        # Threads are leaf nodes: downloading them populates the cache.
        # Boards and pages expand into further work units.
        logger.info('working %r', unit)
        if isinstance(unit, Thread):
            unit.download()
        else:
            for e in unit.process():
                pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete.')

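# Usage sketch (illustrative only): the board URL below is a placeholder,
# and `parameters.num_threads` is assumed to be configured elsewhere in
# this module before any of these helpers are called.
#
#     build_cache('https://boards.4chan.org/g/')   # snapshot one board
#     build_cache()                                # snapshot every board
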
def find_hashes(*links):
    """
    Finds unique tripcodes. If no URLs are given it will attempt to
    scrape all of 4chan where tripcodes are allowed.
    """
    hashes = set()
    pool = Pool(num_threads=parameters.num_threads)

    def work(unit):
        # Posts are leaf nodes: return the public tripcode cipher, if any.
        # Boards, pages and threads expand into further work units.
        if isinstance(unit, Post):
            if unit.public:
                return unit.public.cipher
            return
        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = boards

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete, updating with results.')

    for e in pool.get_results():
        hashes.add(e)
    pool.close()

    return hashes

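# Usage sketch (illustrative only; the URL is a placeholder):
#
#     ciphers = find_hashes('https://boards.4chan.org/b/')
#     print('%d unique tripcode ciphers' % len(ciphers))
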
def scrape_images(directory, keep_names, *links):
    """ Downloads images from links. """
    pool = Pool(num_threads=parameters.num_threads)

    def work(unit):
        # Posts are leaf nodes: fetch their image unless it is already on
        # disk. Boards, pages and threads expand into further work units.
        if isinstance(unit, Post):
            if not unit.image:
                return
            filename = get_filename(directory, unit, keep_names)
            if not os.path.exists(filename):
                logger.info('downloading %s', unit.image)
                image_data = unit.image.download(bypass_cache=True)
                return filename, image_data
            logger.debug('%s already downloaded', filename)
            return
        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete.')

    downloaded = pool.get_results()
    pool.close()

    logger.info('Setting up directories')
    directories = set(map(lambda t: os.path.split(t[0])[0], downloaded))
    for directory in directories:
        if not os.path.exists(directory):
            logger.debug('making directory %s', directory)
            os.makedirs(directory)

    logger.info('Writing images to disk.')
    for filename, image_data in downloaded:
        with open(filename, 'wb') as outfile:
            outfile.write(image_data)

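# Usage sketch (illustrative only; the directory and URL are placeholders,
# and `keep_names` presumably controls whether the uploader's original
# filenames are kept by get_filename):
#
#     scrape_images('downloads', False, 'https://boards.4chan.org/wg/')
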
def find_ngrams(n, *links):
    """ Finds ngrams. If no URLs are given it will attempt to scrape all of 4chan. """
    import re

    ngrams = collections.Counter()
    pool = Pool(num_threads=parameters.num_threads)
    token_pattern = re.compile(r"([A-Za-z0-9]\S*[A-Za-z0-9]|[A-Za-z0-9])")

    def generate_ngrams(tokens):
        # Zip n staggered copies of the token list to produce n-grams.
        return zip(*[tokens[i:] for i in range(n)])

    def work(unit):
        logger.info("working %r", unit)
        if isinstance(unit, Thread):
            # Count n-grams over the comment field of every post in the thread.
            thread = unit.download_and_decode()
            ngrams = collections.Counter()
            for post in thread["posts"]:
                contents = post.get("com", "")
                contents = sanitize(contents).encode("utf8")
                tokens = token_pattern.findall(contents)
                tokens = [token.lower() for token in tokens]
                ngrams.update(generate_ngrams(tokens))
            return ngrams
        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info("Join complete, updating with results.")

    for counter in pool.get_results():
        ngrams.update(counter)
    pool.close()

    return ngrams

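# A minimal, self-contained sketch of the zip trick used by generate_ngrams
# above: zipping n staggered copies of the token list yields consecutive
# n-grams. `_ngram_demo` is only illustrative and is not used elsewhere.
def _ngram_demo():
    tokens = ['the', 'quick', 'brown', 'fox']
    n = 2
    bigrams = list(zip(*[tokens[i:] for i in range(n)]))
    # -> [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
    return bigrams
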
def prune_cache(*links):
    """
    Prunes 404ed entries from the internal WebEntity.webcache. This
    function accepts only links to boards and pages. If no links are
    given every board on 4chan is checked.
    """
    pool = Pool(num_threads=parameters.num_threads)

    def work(unit):
        # Every reachable unit is returned as "live"; boards and pages
        # also expand into further work units.
        if isinstance(unit, Thread):
            return unit
        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)
        return unit

    if not links:
        links = all_boards

    for link in map(classify, links):
        if isinstance(link, Thread):
            logger.warning('ignoring %s', link)
        pool.push(work, link)

    pool.join()
    logger.info('Join complete, pruning cache.')

    live = pool.get_results()
    pool.close()

    live = map(lambda alive: alive.apiurl, live)
    live = map(WebEntity.webcache.url_to_key, live)
    live = set(live)

    keys = WebEntity.webcache.keys()
    keys = filter(lambda key: key not in live, keys)

    for key in keys:
        logger.info('pruning %s', key)
        WebEntity.webcache.remove_key(key)

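# Usage sketch (illustrative only; the URL is a placeholder): prune dead
# cache entries after a scrape. Thread links are only warned about, since
# this function works on boards and pages.
#
#     build_cache('https://boards.4chan.org/g/')
#     prune_cache('https://boards.4chan.org/g/')
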
def crack(*links):
    """
    Returns a list of Posts with cracked trips.

    Reads 4chan URLs, scrapes contents and attempts to crack the
    tripcodes found. If any posts were cracked the corresponding Post
    object is added to a list that is returned. The list is sorted by
    time of post.
    """
    posts = SortedSet()
    pool = Pool(num_threads=parameters.num_threads)
    pub_solver = SQLSolver(parameters.public_file)
    sec_solver = SQLSolver(parameters.secure_file)

    def work(unit):
        # Posts carrying a public or secure tripcode are collected;
        # boards, pages and threads expand into further work units.
        if isinstance(unit, Post):
            if unit.public or unit.secure:
                return unit
            return
        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete, updating with results.')

    posts.update(pool.get_results())
    pool.close()

    solved = []
    for e in sorted(posts, key=lambda post: post.time):
        if e.public:
            e.public.solve(pub_solver)
        if e.secure:
            e.secure.solve(sec_solver)
        if e.solved():
            solved.append(e)

    return solved

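# Usage sketch (illustrative only; the URL is a placeholder):
# `parameters.public_file` and `parameters.secure_file` are assumed to
# point at existing solver databases before crack() is called.
#
#     for post in crack('https://boards.4chan.org/b/'):
#         print(post)
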
def find_words(*links):
    """ Finds words. If no URLs are given it will attempt to scrape all of 4chan. """
    import re

    words = set()
    pool = Pool(num_threads=parameters.num_threads)
    word_pattern = re.compile(r'([^\s\#]+)')

    def work(unit):
        logger.info('working %r', unit)
        if isinstance(unit, Thread):
            # Collect whitespace-separated words from the text fields of
            # every post in the thread.
            thread = unit.download_and_decode()
            words = set()
            for post in thread['posts']:
                for field in ('name', 'email', 'sub', 'com', 'filename'):
                    contents = post.get(field, '')
                    contents = sanitize(contents).encode('utf8')
                    words.update(word_pattern.findall(contents))
            return words
        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete, updating with results.')

    words.update(*pool.get_results())
    pool.close()

    return words

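# Usage sketch (illustrative only; the URL and output path are placeholders):
#
#     words = find_words('https://boards.4chan.org/g/')
#     with open('wordlist.txt', 'w') as out:
#         out.write('\n'.join(sorted(words)))
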