def find_hashes (*links):
    """
    Finds unique tripcodes.

    If no URLs are given, it will attempt to scrape every board on 4chan where
    tripcodes are allowed.
    """
    hashes = set()
    pool   = Pool(num_threads=parameters.num_threads)

    def work (unit):
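        # Posts are the leaves of the traversal: return the public tripcode
        # cipher if the post has one. Boards, pages and threads are expanded
        # via process() and their children pushed back onto the pool.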
        if isinstance(unit, Post):
            if unit.public:
                return unit.public.cipher
            return

        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = boards

    for link in map(classify, links):
        pool.push(work, link)
        pool.join()

    logger.info('Join complete, updating with results.')

    for e in pool.get_results():
        hashes.add(e)
    pool.close()

    return hashes
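# Usage sketch (illustrative): collect the unique public-tripcode ciphers
# from one board and dump them to a wordlist file. The board URL and the
# output path are assumptions made for the example.
def dump_hashes_example():
    hashes = find_hashes('https://boards.4chan.org/g/')
    with open('hashes.txt', 'w') as outfile:
        for cipher in sorted(hashes):
            outfile.write('%s\n' % cipher)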
def scrape_images (directory, keep_names, *links):
    """
    Downloads images from links.
    """
    pool = Pool(num_threads=parameters.num_threads)

    def work (unit):
        if isinstance(unit, Post):
            if not unit.image:
                return

            filename = get_filename (
                directory, unit, keep_names
            )

            if not os.path.exists(filename):
                logger.info('downloading %s', unit.image)
                image_data = unit.image.download(bypass_cache=True)

                return filename, image_data

            logger.debug('%s already downloaded', filename)

            return

        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    for link in map(classify, links):
        pool.push(work, link)
    pool.join()

    logger.info('Join complete.')

    downloaded = pool.get_results()
    pool.close()

    logger.info('Setting up directories')

    directories = set (
        map (
            lambda t : os.path.split(t[0])[0],
            downloaded
        )
    )

    for directory in directories:
        if not os.path.exists(directory):
            logger.debug('making directory %s', directory)
            os.makedirs(directory)

    logger.info('Writing images to disk.')

    for filename, image_data in downloaded:
        with open(filename, 'wb') as outfile:
            outfile.write(image_data)
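# Usage sketch (illustrative): mirror the images from a single thread into a
# local directory. The thread URL is an assumption made for the example, and
# keep_names=True is assumed to make get_filename keep the original upload
# filenames.
def mirror_thread_example():
    scrape_images('downloads', True,
                  'https://boards.4chan.org/wg/thread/1234567')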
def find_ngrams(n, *links):
    """
    Finds ngrams.

    If no URLs are given it will attempt to scrape all of 4chan.
    """
    import re

    ngrams = collections.Counter()
    pool = Pool(num_threads=parameters.num_threads)

    token_pattern = re.compile(r"([A-Za-z0-9]\S*[A-Za-z0-9]|[A-Za-z0-9])")

    def generate_ngrams(tokens):
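        # zip() over n staggered slices of the token list yields every window
        # of n consecutive tokens; e.g. n=2 on ['a', 'b', 'c'] gives
        # ('a', 'b') and ('b', 'c').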
        return zip(*[tokens[i:] for i in range(n)])

    def work(unit):
        logger.info("working %r", unit)

        if isinstance(unit, Thread):
            thread = unit.download_and_decode()
            ngrams = collections.Counter()

            for post in thread["posts"]:
                contents = post.get("com", "")
                contents = sanitize(contents).encode("utf8")

                tokens = token_pattern.findall(contents)
                tokens = [token.lower() for token in tokens]

                ngrams.update(generate_ngrams(tokens))

            return ngrams

        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)
        pool.join()

    logger.info("Join complete, updating with results.")

    for counter in pool.get_results():
        ngrams.update(counter)

    pool.close()

    return ngrams
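# Usage sketch (illustrative): count bigrams on one board and print the ten
# most common ones. The board URL is an assumption made for the example.
def top_bigrams_example():
    bigrams = find_ngrams(2, 'https://boards.4chan.org/g/')
    for gram, count in bigrams.most_common(10):
        print('%s\t%d' % (' '.join(gram), count))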
def build_cache (*links):
    """
    Builds up the internal WebEntity.webcache with a snapshot of the provided
    URLs.

    If no URLs are given, it will attempt to update the cache with a snapshot
    of the entirety of 4chan.
    """
    pool = Pool(num_threads=parameters.num_threads)

    def work (unit):
        logger.info('working %r', unit)

        if isinstance(unit, Thread):
            unit.download()
        else:
            for e in unit.process():
                pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)
        pool.join()

    logger.info('Join complete.')
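# Usage sketch (illustrative): snapshot two boards so that later passes can
# reuse the cached responses, assuming downloads consult WebEntity.webcache
# by default. The board URLs are assumptions made for the example.
def warm_cache_example():
    build_cache('https://boards.4chan.org/g/', 'https://boards.4chan.org/v/')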
def prune_cache(*links):
    """
    Prunes 404ed entries from the internal WebEntity.webcache.

    This function accepts only links to boards and pages.
    If no links are given every board on 4chan is checked.
    """
    pool = Pool(num_threads=parameters.num_threads)

    def work(unit):
        if isinstance(unit, Thread):
            return unit

        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

        return unit

    if not links:
        links = all_boards

    for link in map(classify, links):
        if isinstance(link, Thread):
            logger.warning('ignoring %s', link)
            continue

        pool.push(work, link)

    pool.join()
    logger.info('Join complete, pruning cache.')

    live = pool.get_results()
    pool.close()

    live = map(lambda alive: alive.apiurl, live)
    live = map(WebEntity.webcache.url_to_key, live)
    live = set(live)

    keys = WebEntity.webcache.keys()
    keys = filter(lambda key: key not in live, keys)

    for key in keys:
        logger.info('pruning %s', key)
        WebEntity.webcache.remove_key(key)
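# Usage sketch (illustrative): refresh the snapshot for one board, then drop
# any cached entries under it that have 404ed. Assumes the cache persists
# between runs; the board URL is an assumption made for the example.
def refresh_cache_example():
    build_cache('https://boards.4chan.org/g/')
    prune_cache('https://boards.4chan.org/g/')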
def crack (*links):
    """
    Returns a list of Posts with cracked trips.

    Reads the given 4chan URLs, scrapes their contents and attempts to crack
    the tripcodes found. Each Post whose tripcode is cracked is added to the
    returned list.

    The list is sorted by time of post.
    """
    posts = SortedSet()
    pool  = Pool(num_threads=parameters.num_threads)

    pub_solver = SQLSolver(parameters.public_file)
    sec_solver = SQLSolver(parameters.secure_file)

    def work (unit):
        if isinstance(unit, Post):
            if unit.public or unit.secure:
                return unit
            return

        logger.info('working %r', unit)
        for e in unit.process():
            pool.push(work, e)

    for link in map(classify, links):
        pool.push(work, link)

    pool.join()
    logger.info('Join complete, updating with results.')

    posts.update(pool.get_results())
    pool.close()

    solved = []

    for e in sorted(posts, key = lambda post : post.time):
        if e.public:
            e.public.solve(pub_solver)
        if e.secure:
            e.secure.solve(sec_solver)
        if e.solved():
            solved.append(e)

    return solved
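# Usage sketch (illustrative): crack whatever tripcodes appear in one thread
# and report the results in posting order. The thread URL is an assumption
# made for the example; time, public and secure are the Post attributes the
# function itself relies on.
def crack_thread_example():
    for post in crack('https://boards.4chan.org/b/thread/1234567'):
        print('%s %s %s' % (post.time, post.public, post.secure))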
def find_words (*links):
    """
    Finds words.

    If no URLs are given it will attempt to scrape all of 4chan.
    """
    import re

    words = set()
    pool  = Pool(num_threads=parameters.num_threads)

    word_pattern = re.compile(r'([^\s\#]+)')

    def work (unit):
        logger.info('working %r', unit)

        if isinstance(unit, Thread):
            thread = unit.download_and_decode()
            words  = set()

            for post in thread['posts']:
                for field in ('name', 'email', 'sub', 'com', 'filename'):
                    contents = post.get(field, '')
                    contents = sanitize(contents).encode('utf8')

                    words.update(word_pattern.findall(contents))

            return words

        for e in unit.process():
            pool.push(work, e)

    if not links:
        links = all_boards

    for link in map(classify, links):
        pool.push(work, link)
        pool.join()

    logger.info('Join complete, updating with results.')
    words.update(*pool.get_results())
    pool.close()

    return words
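# Usage sketch (illustrative): build a wordlist from two boards and write it
# to disk, one word per line. The board URLs and the output path are
# assumptions made for the example.
def dump_wordlist_example():
    words = find_words('https://boards.4chan.org/g/',
                       'https://boards.4chan.org/sci/')
    with open('wordlist.txt', 'w') as outfile:
        for word in sorted(words):
            outfile.write('%s\n' % word)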