import pandas as pd
from bs4 import BeautifulSoup
from urllib import request as urr

def wikiscrape():
    req = urr.Request('https://en.wikipedia.org/wiki/List_of_data_breaches')
    content = urr.urlopen(req)
    cs = content.info().get_content_charset()  # charset of the web page
    html = content.read().decode(cs)           # HTML source code
    print("Downloading...")
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    tabclasses = []
    tables = soup.findAll("table")  # "class": "wikitable"
    for index, tab in enumerate(tables):
        data.append([])
        tabclasses.append(tab.attrs)
        for ind, items in enumerate(tab.find_all("tr")):
            cols = items.find_all(["th", "td"])
            cols = [ele.text.strip() for ele in cols]
            data[index].append([ele for ele in cols if ele])  # drop empty cells
    df = pd.DataFrame()
    for rows in data:
        df = pd.concat([df, pd.DataFrame(rows)])  # DataFrame.append was removed in pandas 2.0
    df = df.replace(r'\n', '', regex=True)
    print('N dimensions of data {}'.format(df.shape))
    print('Function call complete')
    return df
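# A minimal usage sketch for wikiscrape(), assuming the imports above; the
# output filename 'breaches.csv' is a hypothetical choice, not from the source.
if __name__ == "__main__":
    breaches = wikiscrape()
    breaches.to_csv('breaches.csv', index=False)  # persist the scraped tables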
from urllib import request

def download_stock_data(csv_url):
    response = request.urlopen(csv_url)  # stores the connection in response
    csv_bytes = response.read()
    csv_str = csv_bytes.decode('utf-8')  # csv data converted to string
    lines = csv_str.split("\n")          # breaks the string at each new line
    dest_url = r'goog.csv'
    # write the downloaded rows to the destination file (mirrors download_file below)
    with open(dest_url, "w") as fx:
        for line in lines:
            fx.write(line + '\n')
def test_urlwithfrag(self):
    urlwith_frag = "https://docs.python.org/2/glossary.html#glossary"
    with support.transient_internet(urlwith_frag):
        req = request.Request(urlwith_frag)
        res = yield from request.urlopen(req)
        self.assertEqual(res.geturl(),
                         "https://docs.python.org/2/glossary.html#glossary")
def test_redirect_url_withfrag(self):
    redirect_url_with_frag = "http://bit.ly/1iSHToT"
    with support.transient_internet(redirect_url_with_frag):
        req = request.Request(redirect_url_with_frag)
        res = yield from request.urlopen(req)
        self.assertEqual(res.geturl(),
                         "https://docs.python.org/3.4/glossary.html#term-global-interpreter-lock")
def test_iteration(self):
    expected_response = b"pycon 2008..."
    handler = self.start_server([(200, [], expected_response)])
    resp = yield from request.urlopen("http://localhost:%s" % handler.port)
    line = yield from resp.read()
    # for line in data:
    self.assertEqual(line, expected_response)
import sys

def download_page(url):
    version = (3, 0)
    cur_version = sys.version_info
    if cur_version >= version:  # Python 3.0 or above
        import urllib.request   # urllib library for extracting web pages
        try:
            headers = {}
            headers['User-Agent'] = ("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                                     "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36")
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:  # Python 2.x
        import urllib2
        try:
            headers = {}
            headers['User-Agent'] = ("Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 "
                                     "(KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17")
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            page = response.read()
            return page
        except Exception:
            return "Page Not found"
import os
from urllib import request

def download_dataset(url, file, path_dataset):
    if not os.path.exists(path_dataset):
        os.makedirs(path_dataset)
    if not os.path.exists(file):
        data = request.urlopen(url).read()
        with open(file, "wb") as f:
            f.write(data)
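# A usage sketch for download_dataset(); the URL and local paths below are
# illustrative assumptions, not values from the original snippet.
path_dataset = "data"
download_dataset("https://example.com/datasets/mnist.npz",
                 os.path.join(path_dataset, "mnist.npz"),
                 path_dataset)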
def test_basic_auth_success(self):
    ah = request.HTTPBasicAuthHandler()
    ah.add_password(self.REALM, self.server_url, self.USER, self.PASSWD)
    request.install_opener(request.build_opener(ah))
    try:
        _u = yield from request.urlopen(self.server_url)
        self.assertTrue(_u)
    except error.HTTPError:
        self.fail("Basic auth failed for the url: %s" % self.server_url)
def test_sites_no_connection_close(self):
    # Some sites do not send a Connection: close header.
    # Verify that those work properly. (issue #12576)
    URL = 'http://www.imdb.com'  # mangles Connection:close
    with support.transient_internet(URL):
        try:
            res = yield from request.urlopen(URL)
        except ValueError:
            self.fail("urlopen failed for site not sending Connection:close")
        else:
            self.assertTrue(res)
        req = yield from request.urlopen(URL)
        res = yield from req.read()
        self.assertTrue(res)
from urllib import request

def download_file(csv_url):
    # check the internet connection
    response = request.urlopen(csv_url)
    csv_bytes = response.read()
    csv_str = csv_bytes.decode('utf-8')  # decode the bytes; str() would keep the b'...' wrapper
    lines = csv_str.split("\n")
    dest_url = r'goo.csv'
    fx = open(dest_url, "w")
    for line in lines:
        fx.write(line + '\n')
    fx.close()
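# For comparison, a minimal sketch of the same download using the standard
# library's urlretrieve, which streams straight to disk; the URL is hypothetical.
from urllib import request

request.urlretrieve('https://example.com/data.csv', 'goo.csv')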
def test_basic(self):
    handler = self.start_server()
    open_url = yield from request.urlopen("http://localhost:%s" % handler.port)
    for attr in ("read", "close", "info", "geturl"):
        self.assertTrue(hasattr(open_url, attr),
                        "object returned from urlopen lacks the %s attribute" % attr)
    try:
        _r = yield from open_url.read()
        self.assertTrue(_r, "calling 'read' failed")
    finally:
        open_url.close()
import json
from urllib import request

def get_access_token():
    url = ("https://openapi.baidu.com/oauth/2.0/token"
           "?grant_type=client_credentials"
           "&client_id=ZrjLfF5Rh7pOL66gaOmDGnXn"
           "&client_secret=16bac9645093ca2632ebb81015ff7544")
    req = request.Request(url, method="POST")
    resp = request.urlopen(req)
    data = resp.read().decode('utf-8')
    json_data = json.loads(data)
    global bda_access_token
    bda_access_token = json_data['access_token']
    return bda_access_token
import socket
from urllib import error, request

def urlopen_with_retry(*args, **kwargs):
    retry_time = 3
    for i in range(retry_time):
        try:
            return request.urlopen(*args, **kwargs)
        except socket.timeout as e:
            if i + 1 == retry_time:
                raise e
        # try to tackle youku CDN fails
        except error.HTTPError as http_error:
            if i + 1 == retry_time:
                raise http_error
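# A usage sketch for urlopen_with_retry(); the URL and timeout value are
# assumptions. Arguments pass straight through to request.urlopen, so a
# per-call timeout is honored on every retry attempt.
resp = urlopen_with_retry('https://example.com/video', timeout=10)
body = resp.read()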
def label(deqstr):
    from urllib import request
    emailstext = []
    for g in deqstr:
        url = s3_root + g  # s3_root is assumed to be defined at module level
        response = request.urlopen(url)
        raw_text = response.read().decode('utf8')
        emailstext.append(g)
        emailstext.append(raw_text)
    x = "".join(emailstext)
    return x
def test_info(self):
    handler = self.start_server()
    try:
        open_url = yield from request.urlopen(
            "http://localhost:%s" % handler.port)
        info_obj = open_url.info()
        self.assertIsInstance(info_obj, email.message.Message,
                              "object returned by 'info' is not an "
                              "instance of email.message.Message")
        self.assertEqual(info_obj.get_content_subtype(), "plain")
    finally:
        self.server.stop()
def test_line_iteration(self):
    lines = [b"We\n", b"got\n", b"here\n", b"verylong " * 192 + b"\n"]
    expected_response = b"".join(lines)
    handler = self.start_server([(200, [], expected_response)])
    resp = yield from request.urlopen("http://localhost:%s" % handler.port)
    data = yield from resp.readlines(4)
    for index, line in enumerate(data):
        self.assertEqual(line, lines[index],
                         "Fetched line number %s doesn't match expected:\n"
                         "    Expected length was %s, got %s" %
                         (index, len(lines[index]), len(line)))
    self.assertEqual(index + 1, len(lines))
def urlopen(self, url, data=None, **kwargs):
    l = []
    f = yield from request.urlopen(url, data, **kwargs)
    try:
        # Exercise various methods
        # _f = yield from f.readlines(200)
        # l.extend(_f)
        l.append((yield from f.readline()))
        l.append((yield from f.read(1024)))
        l.append((yield from f.read()))
    finally:
        f.close()
    return b"".join(l)
def main(WebUrl):
    # global flist
    if __name__ == "__main__":
        lparser = parserLinks()
        web = request.urlopen(WebUrl)
        # context = web.read()
        for context in web.readlines():
            _str = "%s" % context
            try:
                lparser.feed(_str)
            except parser.HTMLParseError:  # note: HTMLParseError was removed in Python 3.5
                # print("parser error")
                pass
        web.close()
        imagelist = lparser.getfilelist()
        downjpgmutithread(imagelist)
def downjpg(filepath, FileName="default.jpg"):
    try:
        FileName = getFileName(filepath)
        web = request.urlopen(filepath)
        print("Fetching remote file " + filepath + "\n")
        jpg = web.read()
        DstDir = "G:\\happy_python\\"
        print("Saving file " + DstDir + FileName + "\n")
        try:
            File = open(DstDir + FileName, "wb")
            File.write(jpg)
            File.close()
            return
        except IOError:
            print("error\n")
            return
    except Exception:
        print("error\n")
        return
def read_page(url, page_num, keyword):
    # Mimic the browser's POST request and read the page that comes back
    page_headers = {
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Connection': 'keep-alive'
    }
    if page_num == 1:
        boo = 'true'
    else:
        boo = 'false'
    page_data = parse.urlencode([
        # Page analysis shows the browser submits FormData with these fields
        ('first', boo),
        ('pn', page_num),
        ('kd', keyword)
    ])
    req = request.Request(url, headers=page_headers)
    page = request.urlopen(req, data=page_data.encode('utf-8')).read()
    page = page.decode('utf-8')
    return page
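# A hedged call sketch for read_page(); the Lagou endpoint below is an
# assumption inferred from the Host header above, not a verified URL.
page_json = read_page('http://www.lagou.com/jobs/positionAjax.json', 1, 'python')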
def test_geturl(self):
    # Make sure the same URL as opened is returned by geturl.
    handler = self.start_server()
    open_url = yield from request.urlopen("http://localhost:%s" % handler.port)
    url = open_url.geturl()
    self.assertEqual(url, "http://localhost:%s" % handler.port)
def getRequestedTemp():
    # requestedTemperature (URL) and requestOpts are defined elsewhere in the module
    resp = urlopen(Request(requestedTemperature, **requestOpts))
    return json.loads(resp.read().decode("utf-8"))["tempF"]
def test_sending_headers(self):
    handler = self.start_server()
    req = request.Request("http://localhost:%s/" % handler.port,
                          headers={"Range": "bytes=20-39"})
    yield from request.urlopen(req)
    self.assertEqual(handler.headers_received["Range"], "bytes=20-39")
from urllib import request

# Print the HTTP response headers and the JSON data
with request.urlopen('https://api.douban.com/v2/book/2129650') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))

# To simulate a browser sending a GET request, use a Request object and add HTTP headers to it
req = request.Request('http://www.douban.com/')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) '
               'AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))
from urllib import request
from bs4 import BeautifulSoup

# add the correct User-Agent
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}

# the company page you're about to scrape
company_page = 'https://angel.co/uber'
# company_page = 'https://mhcdev.com/login'

# open the page
page_request = request.Request(company_page, headers=headers)
page = request.urlopen(page_request)

# parse the html using beautiful soup
html_content = BeautifulSoup(page, 'html.parser')

# we parse the title
title = html_content.find('h1')
title = title.text.strip()
print(title)

# we parse the description
description = html_content.find('h2', attrs={'class': 'js-startup_high_concept'})
description = description.text.strip()
print(description)
import urllib.request
from urllib.request import urlopen

a = urllib.request.urlopen('https://www.baidu.com').read()
print(len(a))

a1 = urlopen('http://m.baidu.com').read()
print(len(a1))

from urllib import request
a2 = request.urlopen('http://jd.com').read()
print(len(a2))
# In[7]:
from urllib import request

# In[8]:
dir(request)

# In[9]:
html = request.urlopen(url)  # url is assumed to be defined in an earlier cell

# In[10]:
html

# In[11]:
soup = BeautifulSoup(html, 'html.parser')

# In[12]:
from urllib import request

url = "http://www.baidu.com"
res = request.urlopen(url)  # fetch the response
print(res.info())
print(res.getcode())
print(res.geturl())
def setRequestedTemp(f):
    # requestedTemperature (URL) and requestOpts are defined elsewhere in the module
    urlopen(Request(url=requestedTemperature,
                    method="PUT",
                    data=json.dumps({"tempF": f}).encode("utf-8"),
                    **requestOpts))
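# getRequestedTemp/setRequestedTemp rely on module-level names the snippets do
# not define; a hypothetical sketch of what those definitions might look like:
import json
from urllib.request import Request, urlopen

requestedTemperature = "http://thermostat.local/api/requestedTemp"  # assumed endpoint
requestOpts = {"headers": {"Content-Type": "application/json"}}     # assumed options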
from urllib import request

resp = request.urlopen('http://www.baidu.com')
print(resp.read())

headers = []