Example #1
    def __init__(self, album_url):
        """
        Constructor. Pass in the album_url that you want to download.
        """
        self.album_url = album_url

        # Check the URL is actually imgur:
        match = re.match(
            r"(https?)://(www\.)?(?:m\.)?imgur\.com/(a|gallery)/([a-zA-Z0-9]+)(#[0-9]+)?",
            album_url)
        if not match:
            raise ImgurAlbumException("URL must be a valid Imgur Album")

        self.protocol = match.group(1)
        self.album_key = match.group(4)
        self.custom_path = None

        # Read the no-script version of the page for all the images:
        full_list_url = "https://imgur.com/a/" + self.album_key + "/layout/blog"

        html = http_downloader.page_text(full_list_url)

        if not html:
            raise ImgurAlbumException("Error reading Imgur Album Page: %s" %
                                      full_list_url)

        # Scrape every image hash + extension pair embedded in the page's JSON:
        self.imageIDs = re.findall(
            r'{"hash":"([a-zA-Z0-9]+)".*?"ext":"(\.[a-zA-Z0-9]+)"', html)
        # Build direct i.imgur.com links, de-duplicating while keeping order.
        # seen.add() always returns None, so "not seen.add(x)" is always True;
        # its only job is to record x as already seen:
        seen = set()
        self.urls = [
            "https://i.imgur.com/" + x[0] + x[1] for x in self.imageIDs
            if x not in seen and not seen.add(x)
        ]
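
The regex above accepts both album (/a/) and gallery links, with optional www. or m. subdomains. A self-contained sketch of that validation against a few dummy URLs (not part of the source file):

import re

ALBUM_RE = re.compile(
    r"(https?)://(www\.)?(?:m\.)?imgur\.com/(a|gallery)/([a-zA-Z0-9]+)(#[0-9]+)?")

for candidate in ("https://imgur.com/a/abc123",        # album link
                  "https://m.imgur.com/gallery/XyZ9",  # mobile gallery link
                  "https://imgur.com/abc123"):         # bare image: rejected
    m = ALBUM_RE.match(candidate)
    print(candidate, "->", m.group(4) if m else "rejected")
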
Example #2
def handle(task, progress):
	url = task.url
	progress.set_status("Requesting page...")
	resp = http_downloader.page_text(url, json=False)
	if not resp:
		return False

	config = Config()
	config.memoize_articles = False
	config.verbose = False
	article = Article(url='', config=config)

	# Feed in the HTML we already fetched instead of letting newspaper
	# re-download it; set_html() marks the article as downloaded:
	article.set_html(resp)
	article.parse()
	if not article.top_image:
		return None

	src = article.top_image
	# Resolve protocol-relative or relative image URLs against the page's scheme:
	if not src.startswith('http'):
		if url.startswith('https'):
			src = 'https://' + src.lstrip('/ ').strip()
		else:
			src = 'http://' + src.lstrip('/ ').strip()

	progress.set_status("Downloading image...")

	return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
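
The handler above uses the newspaper library to find a page's lead image from already-fetched HTML. A standalone sketch of that set_html()/parse() flow, with inline dummy HTML (exact extraction behavior depends on the newspaper version installed):

from newspaper import Article, Config

config = Config()
config.memoize_articles = False
config.verbose = False

html = ('<html><head>'
        '<meta property="og:image" content="https://example.com/lead.jpg">'
        '</head><body><p>Hello</p></body></html>')

article = Article(url='', config=config)
article.set_html(html)  # marks the article as downloaded; no network request
article.parse()
print(article.top_image)  # expected: https://example.com/lead.jpg
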
Example #3
def handle(task, progress):
    url = task.url
    if 'gfycat.com/' not in url:
        return False
    progress.set_status("Checking for direct gfycat url...")
    # Gfycat IDs are the letters-only path segment right after the domain:
    uid = re.findall(r"com/([a-zA-Z]+)", url)
    if not uid:
        return False
    uid = uid[0]

    files = http_downloader.page_text(
        'https://api.gfycat.com/v1/gfycats/%s' % uid, True)
    if not files:
        return False
    files = files["gfyItem"]

    # 'format_opts' is a module-level list of gfycat rendition keys, ordered
    # by preference; pick the first rendition this gfy actually provides:
    opt = None
    for fm in format_opts:
        if fm in files and files[fm]:
            opt = fm
            break

    if not opt:
        return False

    progress.set_status("Downloading gfycat %s..." % opt)
    return http_downloader.download_binary(files[opt],
                                           task.file,
                                           prog=progress,
                                           handler_id=tag)
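
The rendition-selection loop above simply takes the first key from format_opts that is present and non-empty in the API response. A self-contained sketch of that pattern, with an assumed preference list (the real module-level format_opts in the source repo may differ):

format_opts = ['mp4Url', 'webmUrl', 'gifUrl']  # assumed: best quality first

# Trimmed stand-in for the API's "gfyItem" payload:
gfy_item = {
    'webmUrl': 'https://giant.gfycat.com/Example.webm',
    'gifUrl': '',  # present but empty, so it is skipped
}

opt = next((fm for fm in format_opts if gfy_item.get(fm)), None)
print(opt, '->', gfy_item[opt] if opt else None)
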
Example #4
    def test_raw(self):
        """ Read raw page text """
        html = http.page_text(
            "https://raw.githubusercontent.com/shadowmoose/RedditDownloader/master/Dockerfile"
        )
        self.assertIn("python", html, "Downloaded invalid Raw data from url!")
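
On its own, the test method above needs the usual unittest scaffolding to run; a minimal sketch, with the import path for the project's downloader module assumed (the real path inside RedditDownloader may differ):

import unittest

# Assumed alias: the project imports its http_downloader module as 'http'.
from processing import http_downloader as http

class TestPageText(unittest.TestCase):
    def test_raw(self):
        """ Read raw page text """
        html = http.page_text(
            "https://raw.githubusercontent.com/shadowmoose/RedditDownloader/master/Dockerfile"
        )
        self.assertIn("python", html, "Downloaded invalid Raw data from url!")

if __name__ == '__main__':
    unittest.main()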