def downloadHisa(self):
    self.signals.status.emit("Downloading HiSA, please wait...")
    self.user_dir = CfdTools.runFoamCommand("echo $WM_PROJECT_USER_DIR").rstrip().split('\n')[-1]
    self.user_dir = CfdTools.reverseTranslatePath(self.user_dir)
    try:
        # Workaround for certificate issues in python >= 2.7.9
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            urlrequest._urlopener = urlrequest.FancyURLopener(context=ssl._create_unverified_context())
        else:
            urlrequest._urlopener = urlrequest.FancyURLopener()
        # Download
        (filename, header) = urlrequest.urlretrieve(self.hisa_url, reporthook=self.downloadStatus)
    except Exception as ex:
        raise Exception("Error downloading HiSA: {}".format(str(ex)))
    self.signals.status.emit("Extracting HiSA...")
    CfdTools.runFoamCommand(
        '{{ mkdir -p "$WM_PROJECT_USER_DIR" && cd "$WM_PROJECT_USER_DIR" && unzip -o "{}"; }}'
        .format(CfdTools.translatePath(filename)))
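# Note: urlretrieve calls its reporthook with (blocks transferred so far, block size, total size).
# A status callback compatible with the call above might look roughly like this; the
# percentage handling is an assumption for illustration, not the plugin's actual downloadStatus:
def downloadStatus(self, block_count, block_size, total_size):
    if total_size > 0:
        percent = min(100, 100 * block_count * block_size // total_size)
        self.signals.status.emit("Downloading HiSA: {}%".format(percent))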
def run(self):
    try:
        request = rq.FancyURLopener({})
        with request.open(self.url) as url_opener:
            self.result = url_opener.read().decode('utf-8')
    except urllib.error.URLError:
        self.result = ''
    finally:
        self.Finished.emit(self.ref, self.result)
def run(self):
    try:
        request = rq.FancyURLopener({})
        with request.open(self.url) as url_opener:
            result = json.loads(url_opener.read().decode('utf-8'))
            self.config = result['config']
    except urllib.error.URLError:
        # Fall back to an empty config so the emit in the finally clause does not fail
        self.config = {}
    finally:
        self.Finished.emit(self.prefix, json.dumps(self.config))
def get_emoticons_dict(url="https://pc.net/emoticons/"):
    opener = ureq.FancyURLopener({})
    f = opener.open(url)
    content = f.read()
    soup = BeautifulSoup(content, "html")
    emoticons_html_tags = soup.find_all(class_="smiley")
    emoticons_dict = dict()
    for t in emoticons_html_tags:
        # Map each emoticon's text to the label taken from its link path
        label = t.a.attrs["href"].split("/")[1]
        emoticons_dict[t.text] = label
    return emoticons_dict
def downloadFile(url, dest):
    msg("downloading " + dest)
    try:
        if sys.version_info < (3, 0):
            file = urllib.FancyURLopener()
            file.retrieve(url, dest)
        else:
            from urllib import request
            file = request.FancyURLopener()
            file.retrieve(url, dest)
    except Exception as e:
        msg('could not download "' + dest + '" because of this: ' + str(e), "RED")
        sys.exit(1)
def run(self):
    try:
        request = rq.FancyURLopener({})
        with request.open(self.searchUrl) as url_opener:
            result = json.loads(url_opener.read().decode('utf-8'))
            if 'response' not in result or result['response']['numFound'] == 0:
                answer = ''
            else:
                answer = json.dumps(result['response']['docs'])
    except Exception:
        answer = ''
    finally:
        self.Finished.emit(answer)
def run(self):
    opener = urllib.FancyURLopener()
    opener.http_error_default = self.http_error_default
    self.start_progress_bar()
    try:
        if self.progress:
            filename, info = opener.retrieve(self.url, self.destination, self.progress_bar)
        else:
            filename, info = opener.retrieve(self.url, self.destination)
    except IOError as err:
        self.error_progress_bar()
        log.error(err)
        if not self.ignore_errors:
            raise
    self.success_progress_bar()
def _handle_url_download(self, update_url, pack_basename, temp_packpath):
    if 'github.com/' in update_url:
        owner, repo = _get_github_owner_repo(update_url)
        update_url = '/'.join(['https://github.com', owner, repo, 'archive'])
    update_url = _add_slash(update_url)
    update_url += pack_basename
    from urllib import request
    try:
        request.FancyURLopener().retrieve(update_url, temp_packpath, self._download_status)
    except Exception as error:
        log.error(traceback.format_exc().strip())
        self._error('Error retrieving package...\n' + str(error))
        return False
    return True
def download(self, url, headers, proxy, num_retries):
    print('downloading:', url)
    self.thethrottle.wait(url)
    opener = request.FancyURLopener(proxy)
    opener.addheaders = headers
    try:
        with opener.open(url) as f:
            html = f.read().decode()
    except urllib.error.URLError as e:
        print('downloading failed:', e.reason)
        html = None
        # Retry on server errors (HTTP 5xx) while retries remain
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = self.download(url, headers, proxy, num_retries - 1)
    return html
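# Hypothetical usage of the download method above; the Downloader class name and its
# constructor are assumptions for illustration, not part of the original code:
downloader = Downloader()
html = downloader.download(
    'http://example.com/index.html',
    headers=[('User-agent', 'wswp')],  # addheaders expects (name, value) tuples
    proxy={},                          # empty mapping: no proxy override
    num_retries=2,                     # retry up to twice on HTTP 5xx errors
)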
def get_pdf_info(url):
    password = ''  # If any password is set for opening the pdf
    page_no = set()
    max_pages = 10  # Set the maximum pages to be extracted
    caching = True
    la_params = LAParams()  # Performing layout analysis
    output_fp1 = StringIO()
    # Used to store shared resources such as fonts or images in the pdf
    resource_object = PDFResourceManager(caching=caching)
    # Adding headers for opening the pdf online if needed
    opener = request.FancyURLopener({})
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'),
        ('Connection', 'keep-alive'),
    ]
    fp = opener.open(url)
    device = TextConverter(resource_object, output_fp1, laparams=la_params)
    # Tasks done by process_pdf:
    # 1. Create a parser object for the corresponding pdf
    # 2. Bypass password-protected documents
    # 3. Check whether the given pdf is extractable or not
    # 4. If extractable, parse the pages up to the max_pages limit given
    f = process_pdf(
        resource_object,
        device,
        fp,
        page_no,
        maxpages=max_pages,
        password=password,
        caching=caching,
        check_extractable=True,
    )
    fp.close()
    device.close()
    ctx = get_paragraphs(output_fp1.getvalue().strip())
    return ctx
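# process_pdf has been removed from recent pdfminer.six releases; a roughly equivalent
# page loop (a sketch, assuming pdfminer.six is installed) drives the same TextConverter
# device through PDFPageInterpreter:
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

def extract_pdf_text(fp, resource_object, device, max_pages=10, password=''):
    interpreter = PDFPageInterpreter(resource_object, device)
    for page in PDFPage.get_pages(fp, maxpages=max_pages, password=password,
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)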
def downloadAllAds(self, ad_index, NUMBER_OF_WEBPAGES, NUMBER_OF_ADS, startingwebpage, baseURL):
    # This function has been used with the following parameters:
    #   ad_index: 495
    #   NUMBER_OF_WEBPAGES = 1500
    #   NUMBER_OF_ADS = 15000
    #   startingwebpage = 495
    #   baseURL = "https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag="
    # instantiate the object needed to download the html file given a specific URL
    opener = request.FancyURLopener({})
    # counter of the webpages downloaded so far
    count_webpages = 0
    # for each webpage to download
    for webpageId in tqdm(range(startingwebpage, NUMBER_OF_WEBPAGES + 1)):
        # download the page at https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=[webpageId]
        webPageHtml = self._downloadURL(url=baseURL + str(webpageId), opener=opener)
        # create the html parser using the library BeautifulSoup
        htmlParser = self._createHtmlParser(webPageHtml=webPageHtml)
        # retrieve all the ads
        allAds = self._retrieveListOfAds(htmlParser=htmlParser)
        # retrieve all the links pointing to the single house ads contained inside the html page
        allLinks = [buildLink(link=self._getAdLink(adTag=ad), websiteURL=self.websiteURL) for ad in allAds]
        # for each link which points to a house ad
        for linkAd in allLinks:
            # increase the index of the ad
            ad_index += 1
            # download the html file of the advertisement webpage
            htmlFile = self._downloadURL(url=linkAd, opener=opener)
            # create the html parser using the library BeautifulSoup
            soup = self._createHtmlParser(webPageHtml=htmlFile)
            # write the html
            self._writeHtml(html=soup.prettify(), ad_index=ad_index)
            # update the counter
            count_webpages += 1
            # if the number of webpages downloaded exceeds the number of ads
            # requested as input, then the procedure is stopped early
            if count_webpages >= NUMBER_OF_ADS:
                print("[LOG]: All the necessary webpages have been downloaded")
                return
    print("[LOG]: " + str(count_webpages) + " webpages have been downloaded")
# -*- coding: utf-8 -*-
from urllib import request

proxy_handler = request.ProxyHandler({'http': '10.144.1.10:8080'})
# proxy_auth_handler = request.ProxyBasicAuthHandler()
# proxy_auth_handler.add_password('realm', 'host', 'username', 'password')
opener = request.build_opener(proxy_handler)
with opener.open('http://www.pythonchallenge.com/') as f:
    print('Status:', f.status, f.reason)

proxies = {'http': 'http://10.144.1.10:8080/'}
opener = request.FancyURLopener(proxies)  # Override the current environment settings with the given proxy
with opener.open("http://www.pythonchallenge.com/") as f:
    print('Status:', f.status, f.reason)

with request.urlopen('https://www.bing.com') as f:
    data = f.read().decode('utf-8')
    # The object returned by urlopen supports close, read, readline, readlines and iteration
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    # print('Data:', data.decode('utf-8'))

# ### The standard library urllib module
# - URL handling modules
# - https://docs.python.org/3/library/urllib.html
def __init__(self, api_key, api_secret, *, verbose=False):
    self.verbose = verbose
    self.api_key = api_key
    self.api_secret = api_secret
    self.opener = request.FancyURLopener()
def download_work(self, video_url, video_name):
    opener = request.FancyURLopener()
    opener.retrieve(video_url, video_name)
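# FancyURLopener has been deprecated since Python 3.3; a minimal sketch of the same
# download using the supported urlopen API (the function name mirrors the method
# above for illustration only):
import shutil
from urllib import request

def download_work(video_url, video_name):
    # Stream the response straight into the destination file
    with request.urlopen(video_url) as response, open(video_name, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)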