def get_aid_page(aid_url):
    response = None
    try:
        req = urllib.request.Request(aid_url)
        # Set the request headers
        req.add_header("Host", "www.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Accept-Encoding", "gzip, deflate")
        response = urllib.request.urlopen(req, timeout=10)
        # Decompress the body if the server sent it gzip-encoded
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            aid_page_content = f.read().decode("utf-8")
        else:
            aid_page_content = response.read().decode("utf-8")
        #print(aid_page_content)
        return aid_page_content
    except Exception as e:
        raise e
    finally:
        if response:
            response.close()
def spider(url):
    global sleep_time
    request = urllib.request.Request(url)
    # The header below mimics a mobile browser: the imooc (慕课网) app can play
    # videos without registering, so by making the program look like a browser
    # we can download them directly.
    request.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
    while True:
        try:
            time.sleep(sleep_time)
            response = urllib.request.build_opener().open(request)
            if response.getcode() == 200:
                html = response.read()
                response.close()
                if html is not None:
                    if sleep_time > 5:
                        sleep_time -= 1
                    return html
                else:
                    continue
        except urllib.error.URLError as e:
            print(e.reason, ':', url)
        except socket.timeout as e:
            print("-----socket timeout:", url)
        except:
            if sleep_time < 20:
                sleep_time += 1
            print('********************do not know why it happened!*****************')
            print("************************ now sleep time is: %d *********************" % sleep_time)
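# A minimal usage sketch for spider(): it relies on a module-level sleep_time
# throttle that must be initialized before the first call (the value 5 here is
# an assumption, matching the lower bound the function enforces); the URL is
# only a placeholder.
sleep_time = 5
#html = spider("http://www.imooc.com/")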
def test_get_stream_offset(self):
    response = urllib.request.urlopen(
        "http://localhost:8888/stream?stream_time=1529488179412403&offset=15"
    )
    contents = response.read()
    response.close()
    self.check_archive_with_offset(contents, 15, 15)
def Downloader():
    print("Commencing download")
    inputValue = link.get("1.0", "end-1c")
    print(inputValue)
    x = 'bytes=0-'  # getting wrong file size
    remaining_download_tries = 5
    #url="https://download.netbeans.org/netbeans/8.0.2/final/bundles/netbeans-8.0.2-javase-windows.exe"
    #https://www.codesector.com/files/teracopy.exe
    downloadname = str(inputValue.split('/')[-1])  # gives proper filename
    downloadpath = Dpath + downloadname
    print(downloadpath)
    req = urllib.request.Request(inputValue, headers={'Range': x})
    # Read Content-Length before closing the response, not after
    response = urllib.request.urlopen(req)
    contentlength = int(response.getheader("Content-Length"))
    response.close()
    print(contentlength)
    while remaining_download_tries > 0:
        try:
            print("starting download")
            # NamedTemporaryFile(delete=False): replace open() with Named..() for a temp-file download
            with urllib.request.urlopen(req) as fsrc, open(downloadpath, 'w+b') as fdst:
                copyfileobj(fsrc, fdst, 16 * 1024, contentlength)
            print("complete")
        except:
            remaining_download_tries = remaining_download_tries - 1
            print("retrying download")
            # NamedTemporaryFile(delete=False): replace open() with Named..() for a temp-file download
            with urllib.request.urlopen(inputValue) as fsrc, open(downloadpath, 'w+b') as fdst:
                copyfileobj(fsrc, fdst, 16 * 1024)
            print("complete")
        finally:
            print("D")
            # Note: this break runs after the first attempt, whether it succeeded or not
            break
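# copyfileobj() above is called with a fourth argument, so it is not
# shutil.copyfileobj; a minimal sketch of a compatible helper, assuming the
# fourth parameter is the expected total size and is used only for progress output:
def copyfileobj(fsrc, fdst, length=16 * 1024, total=None):
    copied = 0
    while True:
        buf = fsrc.read(length)
        if not buf:
            break
        fdst.write(buf)
        copied += len(buf)
        if total:
            print("downloaded %d of %d bytes" % (copied, total))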
def test_get_frames_empty(self):
    response = urllib.request.urlopen(
        "http://localhost:8888/stream?stream_time=1529488179412403&frame_time=2581101"
    )
    contents = response.read()
    response.close()
    self.check_archive_with_frames(contents, [], 0)
def test_get_frames(self):
    response = urllib.request.urlopen(
        "http://localhost:8888/stream?stream_time=1529488179412403&"
        "frame_time=0&frame_time=630021&frame_time=1210079&frame_time=2581100&frame_time=2581101"
    )
    contents = response.read()
    response.close()
    self.check_archive_with_frames(contents, [0, 7, 18, 29], 4)
def get_url_content(website):
    headers = {'Accept-Charset': 'utf-8',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'}
    request = urllib.request.Request(website, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    response.close()
    return html
def query(query, useragent='python-duckduckgo ' + str(__version__), safesearch=True, html=False, meanings=True, **kwargs):
    """
    Query DuckDuckGo, returning a Results object.

    Here's a query that's unlikely to change:

    >>> result = query('1 + 1')
    >>> result.type
    'nothing'
    >>> result.answer.text
    '1 + 1 = 2'
    >>> result.answer.type
    'calc'

    Keyword arguments:
    useragent: UserAgent to use while querying. Default: "python-duckduckgo %d" (str)
    safesearch: True for on, False for off. Default: True (bool)
    html: True to allow HTML in output. Default: False (bool)
    meanings: True to include disambiguations in results (bool)
    Any other keyword arguments are passed directly to DuckDuckGo as URL params.
    """
    safesearch = '1' if safesearch else '-1'
    html = '0' if html else '1'
    meanings = '0' if meanings else '1'
    params = {
        'q': query,
        'o': 'json',
        'kp': safesearch,
        'no_redirect': '1',
        'no_html': html,
        'd': meanings,
    }
    params.update(kwargs)
    encparams = urllib.parse.urlencode(params)
    url = 'http://api.duckduckgo.com/?' + encparams
    request = urllib.request.Request(url, headers={'User-Agent': useragent})
    response = urllib.request.urlopen(request)
    json = j.loads(response.read().decode('utf-8'))
    response.close()
    return Results(json)
def test_remote():
    while not web_paths.empty():
        path = web_paths.get()
        url = "%s%s" % (target, path)
        # urllib.request is a module, not a callable: build a Request and open it
        request = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(request)
            content = response.read()
            print("[%d] => %s" % (response.code, path))
            response.close()
        except:
            print("failed")
            pass
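# The globals web_paths and target used by test_remote() are not defined in this
# snippet; a minimal sketch of plausible setup (names and values are assumptions),
# using a thread-safe queue of candidate paths:
import queue

target = "http://localhost:8080"
web_paths = queue.Queue()
web_paths.put("/index.html")
web_paths.put("/robots.txt")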
def Start_split(url, client_count):
    writepath = 'file.txt'
    mode = 'ab' if os.path.exists(writepath) else 'wb+'
    req = HeadRequest(url)
    # Read the headers before closing the response, not after
    response = urllib.request.urlopen(req)
    print("Fileinfo ==>")
    print(response.info())
    strRes = str(response.info())
    contentlength = int(response.getheader("Content-Length"))
    response.close()
    global newLength
    newLength = contentlength
    print("N-Division requests")
    print("\tNo. of clients:", client_count)
    print("\tFileSize in bytes:", contentlength)
    # logging
    app.insert("\nN-Division requests")
    app.insert("\n\tNo. of clients:" + str(client_count))
    app.insert("\n\tFileSize in bytes:" + str(contentlength))
    #seekmer.create(contentlength)
    #print("sample file of content length created")
    urlRangeList = n_division(client_count, contentlength)
    for a in urlRangeList:
        print(a)
        app.insert("\n" + a)
    requests = []
    for x in urlRangeList:
        # Builds the request as a display string; it is never opened here
        ss = "urllib.request.Request(" + url + ", headers={'Range':" + x + "})"
        requests.append(ss)
    # pass urlRangeList[i] to the clients_list[i]
    for i in range(client_count):
        clients = [[url, xx] for xx in urlRangeList]
    for test in clients:
        print(test)
    print("done")
    return clients
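# HeadRequest is referenced above but not defined in this snippet; a minimal
# sketch of the usual idiom (a Request subclass whose get_method() makes
# urlopen issue a HEAD request so only the headers are fetched):
class HeadRequest(urllib.request.Request):
    def get_method(self):
        return "HEAD"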
def _process_uncached_url(self, url, no_description=False):
    request = urllib.request.Request(url, headers={
        "User-Agent": self.userAgent,
        "Accept": self.acceptHeader
    })
    try:
        startTime = time.time()
        response = urllib.request.urlopen(request, timeout=self.timeout)
        timeTaken = time.time() - startTime
    except socket.timeout:
        raise URLLookupError("Timed out")
    except urllib.error.URLError as err:
        raise URLLookupError(err.reason)
    except Exception as err:
        raise URLLookupError(type(err).__name__)
    try:
        newURL = response.geturl()
        if newURL != url and self.showRedirects:
            yield "→ <{0}>".format(newURL)
        self.annotateResponse(response)
        if response.contentLength is not None:
            sizeFormatted = formatBytes(response.contentLength)
        else:
            sizeFormatted = "unknown size"
        responseIter = iter(response.handler.processResponse(response, no_description=no_description))
        firstLine = next(responseIter)
        for line in self.responseFormats:
            yield line.format(time=timeTaken, size=sizeFormatted, plugin=firstLine)
        for line in responseIter:
            yield line
    finally:
        response.close()
        del response.buf
        del response
def get_submit_videos_page(url):
    response = None
    try:
        req = urllib.request.Request(url)
        # Set the request headers
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Host", "space.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept-Encoding", "gzip, deflate, br")
        # Request parameters
        params = {
            "mid": "18199039",
            "page": "1",
            "pagesize": "100"
        }
        data = urllib.parse.urlencode(params).encode("utf-8")
        response = urllib.request.urlopen(req, data, timeout=10)
        # Decompress the body if the server sent it gzip-encoded
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            page_content = f.read().decode("utf-8")
        else:
            page_content = response.read().decode("utf-8")
        #print(page_content)
        return page_content
    except Exception as e:
        raise e
    finally:
        if response:
            response.close()
def download(url, headers={}):
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    response.close()
    return content
import urllib.request

'''
url = 'https://www.baidu.com/s?wd=pycharm%20web%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_spt=1&rsv_iqid=0xb25600d300005801&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=pycharm%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_t=02279cxCOeMjpOzNhCqHcXCv7p2JPX29wSb37PHvoVdhnh1%2FPejpS3fVk84ZcHCYz79c&inputT=595&rsv_pq=966a0ca000007720&rsv_sug3=27&rsv_sug1=7&rsv_sug2=0&rsv_sug7=100&rsv_sug4=1986&rsv_sug=1'
unquoteurl = unquote(url)
urlparses = urlparse(unquoteurl)
print(unquoteurl)
#print urlparse information
for info in urlparses:
    print(info)
'''
#demo for htmlparser
'''
myhtmlparser1 = HtmlParsor()
myhtmlparser1.feed("<html><title>hello</title><body>hello world!!!</body></html>")
myhtmlparser1.close()
'''
#get url's content
#myrequest = urllib.request('http://www.4hb.com/letters/ltrdelacct4.html')
file = "D:\\INFOR.html"
shp = SelfHtmlParser()
response = urllib.request.urlopen('http://www.4hb.com/letters/ltrdelacct4.html')
mycontent = response.read()
reg = shp(mycontent)
openfile = open(file, "a+")
response.close()
print(reg.title)
pass
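# SelfHtmlParser is used above but not defined in this snippet; a minimal,
# purely illustrative sketch of a class compatible with the call pattern
# `reg = shp(mycontent); reg.title` (the real implementation may differ):
from html.parser import HTMLParser

class SelfHtmlParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        # Accumulate text that appears inside the <title> element
        if self._in_title:
            self.title += data

    def __call__(self, content):
        # Accept raw bytes, feed them to the parser, and return self so the
        # caller can read .title from the result
        self.feed(content.decode("utf-8", errors="replace"))
        return self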