Example #1
    def run(self):

        check_or_make_dir(self.deviant_dir)

        # First, fetch the RSS feed that lists the deviations
        rss_xml = yield from self.fetch_rss()

        # Visit each deviation serially and get the page html
        for dev in self.scrape_deviations_list(rss_xml, self._rss_namespaces):
            self.info(dev.url)

            if dev.rating == "adult":
                # TODO: Handle mature deviations
                # Ignore for now
                self.warn("Ignoring mature deviation [%s]" % dev.url)
                continue

            dev_page_html = yield from self.fetch_deviation_page(dev.url)

            if dev.medium == "image":
                image_url = self.scrape_deviation_image_url(dev.guid, dev_page_html)
                image_filename = filename_from_url(image_url)
                yield from self.download_deviation(image_url, image_filename)
            elif dev.medium == "document":
                # TODO: Handle text deviation
                self.warn("Ignoring text deviation %s" % dev.url)
            elif not dev.medium:
                self.warn("Media type not specified %s" % dev.url)
            else:
                raise ScrapingException("Unknown medium type %s for %s" % (dev.medium, dev.url))

        yield from sleep(0.001)

        self.info("Done")
Example #2
    def run(self):

        check_or_make_dir(self.artist_dir)
        # Fetch the artist's project listing
        projects = yield from self.fetch_projects()

        # Download every image from each project, one at a time
        for project in projects:
            self.info(project.title)
            for image_url in (yield from self.fetch_project_image_url(project)):
                filename = filename_from_url(image_url)
                filepath = os.path.join(self.artist_dir, filename)
                yield from self.download(image_url, filepath, self.overwrite)

        yield from sleep(0.1)
Example #3
    def run(self):

        check_or_make_dir(self.tumblr_dir)

        # Walk the blog page by page until a page comes back with no posts
        pagenum = 1
        while True:
            page_html = yield from self.fetch_page(pagenum)

            posts = self.scrape_posts(page_html)
            if not posts:
                # No posts were found; we've probably reached an out-of-range page
                break

            for post in posts:
                post_html = yield from self.fetch_post(post.url)
                for image_url in self.scrape_images(post_html):
                    file_name = filename_from_url(image_url)
                    yield from self.download_image(image_url, file_name)

            pagenum += 1
Example #4
    def run(self):

        check_or_make_dir(self.project_dir)
        projects = yield from self.fetch_projects()

        for project in projects:
            self.info(project.original_image)
            image_url = project.original_image
            filename = filename_from_url(image_url)
            file_path = os.path.join(self.project_dir, filename)
            yield from self.download(image_url, file_path, self.overwrite)

            # Can only guess file extension after file is done downloading
            if "." not in filename:
                file_ext = self.guess_img_ext(file_path)
                move(file_path, file_path + "." + file_ext)

        yield from sleep(0.001)

        self.info("Done")