def __init__(self, album_url):
    """
    Build a downloader for the given Imgur album.

    :param album_url: URL of the Imgur album/gallery to download.
    :raises ImgurAlbumException: if the URL is not a valid Imgur album,
        or if the album listing page cannot be read.
    """
    self.album_url = album_url
    # Check the URL is actually imgur:
    match = re.match(
        r"(https?)://(www\.)?(?:m\.)?imgur\.com/(a|gallery)/([a-zA-Z0-9]+)(#[0-9]+)?",
        album_url)
    if not match:
        raise ImgurAlbumException("URL must be a valid Imgur Album")
    self.protocol = match.group(1)
    self.album_key = match.group(4)
    self.custom_path = None
    # Read the no-script ("/layout/blog") version of the page: it embeds every
    # image's hash and extension as inline JSON, so one request lists the album.
    full_list_url = "https://imgur.com/a/" + self.album_key + "/layout/blog"
    html = http_downloader.page_text(full_list_url)
    if not html:
        raise ImgurAlbumException("Error reading Imgur Album Page: %s" % full_list_url)
    self.imageIDs = re.findall(
        r'.*?{"hash":"([a-zA-Z0-9]+)".*?"ext":"(\.[a-zA-Z0-9]+)".*?', html)
    # De-duplicate while preserving first-seen order. dict.fromkeys keeps
    # insertion order (Python 3.7+) — replaces the previous comprehension that
    # relied on a side-effecting `seen.add(x)` inside the filter expression.
    self.urls = [
        "https://i.imgur.com/" + img_hash + ext
        for img_hash, ext in dict.fromkeys(self.imageIDs)
    ]
def handle(task, progress):
    """
    Generic page handler: parse the page with newspaper's Article and
    download its detected top image.

    :param task: Download task carrying the page `.url` and output `.file`.
    :param progress: Progress reporter used for status updates.
    :return: The download_binary result on success, None if the page has
        no top image, or False if the page could not be fetched.
    """
    url = task.url
    progress.set_status("Requesting page...")
    resp = http_downloader.page_text(url, json=False)
    if not resp:
        return False
    # Quiet, non-caching one-off newspaper configuration.
    config = Config()
    config.memoize_articles = False
    config.verbose = False
    article = Article(url='', config=config)
    # newspaper requires download() before set_html(); we feed it the HTML we
    # already fetched rather than letting it re-request the page itself.
    article.download()
    article.set_html(resp)
    article.parse()
    if not article.top_image:
        return None
    src = article.top_image
    # Fix scheme-less/relative image URLs. BUGFIX: the original tested
    # `'http' not in src` / `'https' in url`, substring checks that misfire on
    # paths merely *containing* "http" — prefix checks are the correct test.
    if not src.startswith('http'):
        if url.startswith('https'):
            src = 'https://' + src.lstrip('/ ').strip()
        else:
            src = 'http://' + src.lstrip('/ ').strip()
    progress.set_status("Downloading image...")
    return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
def handle(task, progress):
    """
    Resolve a gfycat page URL to a direct media file via the gfycat API and
    download it.

    :param task: Download task carrying the page `.url` and output `.file`.
    :param progress: Progress reporter used for status updates.
    :return: The download_binary result, or False when this handler does not
        apply or no usable media format is available.
    """
    page_url = task.url
    if 'gfycat.com/' not in page_url:
        return False
    progress.set_status("Checking for direct gfycat url...")
    id_matches = re.findall(r"com/([a-zA-Z]+)", page_url)
    if not id_matches:
        return False
    gfy_id = id_matches[0]
    info = http_downloader.page_text(
        'https://api.gfycat.com/v1/gfycats/%s' % gfy_id, True)
    if not info:
        return False
    item = info["gfyItem"]
    # Pick the first preferred format the API response actually provides.
    chosen = next((fm for fm in format_opts if item.get(fm)), None)
    if chosen is None:
        return False
    progress.set_status("Downloading gfycat %s..." % chosen)
    return http_downloader.download_binary(item[chosen], task.file, prog=progress, handler_id=tag)
def test_raw(self):
    """ Read raw page text """
    dockerfile_url = (
        "https://raw.githubusercontent.com/shadowmoose/RedditDownloader/master/Dockerfile"
    )
    page = http.page_text(dockerfile_url)
    self.assertIn("python", page, "Downloaded invalid Raw data from url!")