Example #1
 def test_download_google_results_file(self):
     query = 'Well of Ascension'
     file_name = os.path.join('temp', 'google_results.html')
     if os.path.exists(file_name):
         os.remove(file_name)
     d = Downloader()
     url = d.goodreads_id_query(query)
     result = d.download_file(url, file_name)
     self.assertTrue(os.path.exists(file_name))
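A minimal sketch of how this test method might be hosted; the TestCase class name, the Downloader import path, and the use of unittest are assumptions, since the excerpt shows only the method itself:

import os
import unittest
# from downloader import Downloader  # actual import path is not shown in the excerpt

class DownloaderTests(unittest.TestCase):
    def test_download_google_results_file(self):
        ...  # method body as in the excerpt above

if __name__ == '__main__':
    unittest.main()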
Example #2
def kntu_download(user_name, password, pasted_urls):

    kntu_headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36',
    }

    kntu_login_data = {
        'anchor': '',
        'username': user_name,
        'password': password,
        'rememberusername': '******'
    }

    kntu_downloader = Downloader('https://vc.kntu.ac.ir/login/index.php',
                                 'https://connect.kntu.ac.ir/',
                                 kntu_login_data, kntu_headers, kntu_headers)
    if not kntu_downloader.login({'logintoken'}):
        return

    for url in pasted_urls:
        if re.match(
                r'https://vc\d*\.kntu\.ac\.ir/mod/adobeconnect/joinrecording\.php.*',
                url):
            filename = re.findall(r'recording=(\d+)&', url)[0]
            print('Downloading ' + filename + '...')
            kntu_downloader.set_name_to_save(filename)
            kntu_downloader.set_pasted_url(url)
            kntu_downloader.set_cookies()
            if not kntu_downloader.create_downlaod_link():
                continue
            kntu_downloader.download_file()
            kntu_downloader.save_file()
            kntu_downloader.extract_zip_file()
            kntu_downloader.convert_media()
            kntu_downloader.download_other_files()
            print(filename + ' downloaded!')
        else:
            print('Wrong URL format')
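As a rough usage sketch, the function above could be invoked as below; the credentials and the recording URL are placeholders (only the URL shape follows the regular expressions in the snippet), and the Downloader class is assumed to be importable from the surrounding project:

# Hypothetical invocation of kntu_download; all values below are placeholders.
if __name__ == '__main__':
    recording_urls = [
        'https://vc1.kntu.ac.ir/mod/adobeconnect/joinrecording.php?recording=123456&groupid=0',
    ]
    kntu_download('my_username', 'my_password', recording_urls)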
Example #3
 def download(self, sub, path):
     r = requests.get(sub.page_url)
     if r.status_code == 200:
         soup = self._get_soup(r.text)
         sub.url = self._get_full_url(
             soup.find(u'a', id=u'downloadButton')[u'href'])
         dl = Downloader()
         zip_path = os.path.splitext(path)[0] + u'.zip'
         if dl.download_file(sub.url, zip_path):
             is_extraction_success = self._extract_sub_zip(zip_path, path)
             # Remove the temporary archive regardless of the extraction result
             try:
                 os.remove(zip_path)
             except OSError:
                 pass
             if is_extraction_success:
                 return True
Example #4
class WearCollector(Collector):
    def __init__(self,
                 reporter: Reporter,
                 waiter: Waiter,
                 outdir: str,
                 useragent: str = ''):
        super(WearCollector, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        # Limit the number of concurrent asynchronous connections
        self.semaphore = Semaphore(2)
        # File downloader
        self.downloader = Downloader(self.waiter, self.semaphore,
                                     self.reporter)

    async def download_user_page(self, url: str, page_num):
        url = url + f'?pageno={page_num}'

        # Use the cache if a cached copy exists
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url, headers={'user-agent':
                                             self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })

        # Termination condition: past page 1, a final URL without ?pageno means no more pages
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(parse_user, html):
                await self.add_future(
                    'gallery',
                    self.gallery_collector(url, 1, 501, userdata=data))
            return True

    async def user_collector(self, url: str, pagestart: int, pageend: int):
        await self.queued_paging(
            pagestart, pageend,
            lambda page: self.download_user_page(url, page))

    async def download_gallery_page(self,
                                    url: str,
                                    page_num: int,
                                    userdata=None):
        url = url + f'?pageno={page_num}'
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url, headers={'user-agent':
                                             self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })

        # Termination condition: past page 1, a final URL without ?pageno means no more pages
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(
                    parse_gallely, html, userdata):
                imagefile = urllib.parse.quote(url, safe='')
                tmp_save(os.path.join(self.outdir, imagefile + '.json'),
                         json.dumps(data))
                imagepath = os.path.join(self.outdir, imagefile)
                if not os.path.exists(imagepath):
                    await self.add_future(
                        'image',
                        self.downloader.download_file(
                            url,
                            imagepath,
                            headers={'user-agent': self.useragent}))
            return True

    async def gallery_collector(self,
                                url: str,
                                pagestart: int,
                                pageend: int,
                                userdata=None):
        await self.queued_paging(
            pagestart, pageend, lambda page: self.download_gallery_page(
                url, page, userdata=userdata))
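A minimal driver sketch for the collector above, assuming Reporter and Waiter instances are constructed elsewhere (their constructors are not part of this example); the output directory, user agent, target URL, and page range are placeholders:

import asyncio

async def main(reporter, waiter):
    # Hypothetical set-up; outdir and useragent are placeholder values.
    collector = WearCollector(reporter, waiter, outdir='cache', useragent='example-agent/1.0')
    # Crawl the first pages of a user page; galleries found there are queued
    # via add_future inside download_user_page.
    await collector.user_collector('https://wear.jp/exampleuser/', 1, 11)

# asyncio.run(main(reporter, waiter))  # reporter/waiter construction is not shown here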