# Tkinter link scraper: list every link found on a page and open the selected
# one in the browser. The original snippet begins mid-try, so the imports and
# the page fetch below are an assumed reconstruction of the missing prologue.
from tkinter import *
import webbrowser
import requests
from bs4 import BeautifulSoup

url = ""  # target URL (not shown in the original snippet)

try:
    data = requests.get(url).content
    features = 'html.parser'
    soup = BeautifulSoup(data, features)
except:
    pass


def open1():
    # Open the currently selected listbox entry in the default browser.
    a = list1.curselection()
    b = a[0]
    webbrowser.open(list1.get(b))


root = Tk()
root.geometry("400x400")
root.title("web_scrapper")
list1 = Listbox(root, width=300, height=20, selectmode=SINGLE)
list1.pack(fill=BOTH)

x = 0
for link in soup.find_all('a'):
    link1 = str(link.get('href'))
    if link1.startswith('http'):
        list1.insert(x, link1)
    elif link1 == "None" or link1 == "#":
        pass
    else:
        link1 = url + link1  # relative link: prefix it with the page URL
        list1.insert(x, link1)
    x += 1

button1 = Button(root, text="open", command=open1)
button1.pack(fill=BOTH)
root.mainloop()
from bs4 import BeautifulSoup
import requests

url = "https://smartpages.nexpart.com/smartpage.php?mfrlinecode=GKI&partnumber=AF12061"
response = requests.get(url, timeout=5)
content = BeautifulSoup(response.content, "html.parser")

# Iterate over the matching <span> tags and print each one's text.
for data in content.find_all('span', attrs={"class": "lang_en sp_title"}):
    print(data.get_text())
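The same spans can also be picked out with a CSS selector; a minimal equivalent sketch against the same page and class names:

# Equivalent lookup using a CSS selector instead of find_all()/attrs.
for span in content.select("span.lang_en.sp_title"):
    print(span.get_text(strip=True))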
import pynotify
import requests
from bs4 import BeautifulSoup
from time import sleep


def sendmessage(title, message):
    # Show a desktop notification via pynotify.
    pynotify.init("Test")
    notice = pynotify.Notification(title, message)
    notice.show()
    return

# Url contains link to Cricket Scores Website
url = ""

while True:
    r = requests.get(url)
    # Retry until the page answers with HTTP 200.
    while r.status_code != 200:
        r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    # Placeholder: replace with the tag that actually contains the score.
    data = soup.find_all("here the tag which contains the score")
    # score1 holds the first piece of data extracted from that tag.
    score1 = data[0].text
    print score1
    sendmessage("Data:", score1)
    sleep(60)
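The tag name above is only a placeholder; as a purely hypothetical illustration (the span tag and the score class are invented, not taken from any real scoreboard page), the lookup might become:

# Hypothetical example only -- "span" and class_="score" are invented names;
# substitute whatever element actually wraps the score on the target page.
data = soup.find_all("span", class_="score")
if data:
    score1 = data[0].text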
## Earlier attempt using requests, kept commented out:
##from bs4 import BeautifulSoup
##import requests
##
##url = 'http://proxy-hunter.blogspot.com/2010/03/18-03-10-speed-l1-hunter-proxies-310.html'
##r = requests.get(url)
##r.content
##r.encoding
##soup = BeautifulSoup(r.text)
##regex = re.compile(r'^(\d{3}).(\d{1,3}).(\d{1,3}).(\d{1,3}):(\d{2,4})')
##proxylist = soup.find_all(attrs={"class": "Apple-style-span", "style": "color: black;"}, text=regex)
##data = proxylist[0]
##for x in data.split('\n'):
##    print x

# Python 2 proxy-list scraper using urllib.urlopen.
from bs4 import BeautifulSoup as Soup
import re, urllib

url = 'http://proxy-hunter.blogspot.com/2010/03/18-03-10-speed-l1-hunter-proxies-310.html'
document = urllib.urlopen(url)
tree = Soup(document.read())

# Lines that look like IP:port.
regex = re.compile(r'^(\d{3}).(\d{1,3}).(\d{1,3}).(\d{1,3}):(\d{2,4})')
proxylist = tree.find_all(attrs={"class": "Apple-style-span", "style": "color: black;"}, text=regex)
data = proxylist[0]
for x in data.split('\n'):
    print x
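urllib.urlopen only exists on Python 2. As a sketch for Python 3, assuming the page layout is unchanged, the same scrape could go through urllib.request instead; the regex is tightened slightly (escaped dots, line-by-line matching):

# Python 3 sketch of the same proxy scrape (assumes the page layout is
# unchanged); urllib.request.urlopen replaces the Python 2 urllib.urlopen.
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'http://proxy-hunter.blogspot.com/2010/03/18-03-10-speed-l1-hunter-proxies-310.html'
tree = BeautifulSoup(urlopen(url).read(), 'html.parser')

regex = re.compile(r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{2,5})$')
for span in tree.find_all(attrs={"class": "Apple-style-span", "style": "color: black;"}):
    for line in span.get_text().split('\n'):
        line = line.strip()
        if regex.match(line):
            print(line)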
# Scrape hantao888.com: walk every brand on brand.php, then pull each
# product's id, title, price fields, image URL and add-to-cart text.
import json

import requests
from bs4 import BeautifulSoup


# filterbrand() is called below but not defined in this snippet; a passthrough
# stub is assumed here so the code runs.
def filterbrand(s):
    return s


def GetApi():
    u = "http://www.hantao888.com/brand.php"
    resp = requests.get(u)
    resp.encoding = "utf-8"
    html = resp.text
    #print(html)
    AAA = BeautifulSoup(html, 'html.parser')
    filter_brand = AAA.select("div.filter_brand a")
    NO = len(filter_brand)
    print(NO)
    try:
        for q in range(0, NO):
            # Last two characters of the href hold the brand id; the title
            # attribute holds the brand name.
            filter_brand = str(AAA.select("div.filter_brand a")[q]["href"][-2:])
            filter_brandTEST = AAA.select("div.filter_brand a")[q]["title"]
            print(filterbrand(filter_brand))
            print(filter_brandTEST)
            header = {
                "Referer": "http://www.hantao888.com/mobile/brand.php?id=" + str(q),
                'Cache-control': 'private',
                'Connection': 'keep-alive',
                'Content-Encoding': 'gzip',
                'Content-Type': 'text/html; charset=utf-8',
            }
            data = {
                'ast': '0',
                'amount': '10',
            }
            qq = str(q)
            #print(qq)
            # Async product listing for this brand, plus the brand page itself.
            url = ('http://www.hantao888.com/mobile/brand.php?act=asynclist&category=0&brand='
                   + filterbrand(filter_brand)
                   + '&price_min=&price_max=&filter_attr=&page=1&sort=last_update&order=DESC')
            urlS = 'http://www.hantao888.com/mobile/brand.php?id=' + filterbrand(filter_brand)
            print(urlS)
            resp2 = requests.post(url, headers=header, data=data).text
            respS = requests.post(urlS)
            #print(respS)
            #s = json.loads(respS)
            a = json.loads(resp2)
            P = len(a)
            print(P)
            for i in range(0, P):
                b = a[i]
                c = b['pro-inner']
                #Ssoup = BeautifulSoup(respS.text)
                #status = Ssoup.select("a.btn2")
                #print(len(status))
                #print(Ssoup)
                #print(status)
                # Grab the product name
                soup = BeautifulSoup(c, 'html.parser')
                proTitle = soup.select("div.proTitle a")[0]
                #print(title.string)
                #print(PID)
                # Grab the product price
                proPrice = soup.select("div.proPrice span")[0]
                # Grab the photo URL
                imgA = str(soup.select("img")[0]['src'])
                imgurl = 'http://www.hantao888.com' + imgA
                #print(imgurl)
                #img = requests.get(imgurl)
                #print(Getimg)
                #html = requests.get(item.get('src'))  # get() fetches the image link; requests sends the request
                #img_name = folder_path + str(index + 1) + '.png'
                #with open(img_name, 'wb') as file:  # write the image data as bytes
                #    file.write(html.content)
                #    file.flush()
                #    file.close()  # close the file
                # The product id is cut out of the <a> tag's markup.
                PorID = str(soup.select("div.proTitle a")[0])
                PID = str(PorID[22:26])
                header = {
                    "Accept": "*/*",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Connection": "keep-alive",
                    "Cookie": "ECS[display]=grid; zh_choose=t; ECS[history]=1030%2C3194%2C2107%2C636; ECS[visit_times]=9; zh_choose=t; real_ipd=1.200.75.180; ECS_ID=9461df93025d671d8bc7538cb61ea5a466153dac",
                    "Referer": "http://www.hantao888.com/goods.php?id=" + PID,
                    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/12.0 Mobile/15A372 Safari/604.1",
                }
                #print(header['Referer'])
                if PID[3] == '"':
                    # A closing quote as the fourth character means the product
                    # id has only three digits.
                    #print(PID[0:3])
                    uu = "http://www.hantao888.com/mobile/goods.php?id=" + PID + "&attr=&number=1&1616771455463463="
                    PURL = "http://www.hantao888.com/goods.php?act=price&id=" + PID[0:3] + "&attr=&number=1&1616771455463463="
                    print(uu)
                    prodata = requests.get(uu, headers=header).text
                    #print(prodata)
                    #Prodata = json.loads(prodata)
                    Prodatasoup = BeautifulSoup(prodata, 'html.parser')
                    #print(Prodatasoup)
                    # Add-to-cart button text (availability status).
                    prodata = Prodatasoup.select("a.btn-popupSKU-addcart")[0].text
                    #a = Prodatasoup.find_all('br')
                    protext = Prodatasoup.find_all("br")
                    print(protext)
                    getdata = requests.get(PURL, headers=header).text
                    Getdata = json.loads(getdata)
                    #print(filter_brand, proTitle.string, Getdata["result"][6:-7], Getdata["result1"][6:-7])
                    #print(filter_brand)
                    #print(proTitle.string)
                    G = str(Getdata["result"][6:-7])
                    GG = str(Getdata["result1"][6:-7])
                    Edata = [PID[0:3], proTitle.string, imgurl, G, GG, prodata]
                    print(Edata)
                else:
                    #print(PID)
                    uu = "http://www.hantao888.com/mobile/goods.php?id=" + PID
                    PURL = "http://www.hantao888.com/goods.php?act=price&id=" + PID + "&attr=&number=1&1616771455463463="
                    prodata = requests.get(uu, headers=header).text
                    #Prodata = json.loads(prodata)
                    Prodatasoup = BeautifulSoup(prodata, 'html.parser')
                    prodata = Prodatasoup.select("a.btn-popupSKU-addcart")[0].text
                    #protext = Prodatasoup.select("div.desc p")
                    #print(prodata)
                    getdata = requests.get(PURL, headers=header).text
                    Getdata = json.loads(getdata)
                    #print(Getdata)
                    #print(filter_brand, proTitle.string, Getdata["result"][6:-7], Getdata["result1"][6:-7])
                    #print(filter_brand)
                    #print(proTitle.string)
                    G = str(Getdata["result"][6:-7])
                    GG = str(Getdata["result1"][6:-7])
                    Edata = [PID, proTitle.string, imgurl, G, GG, prodata]
                    #print(Edata)
                #print(Edata)
                #wb = openpyxl.load_workbook('EcTestCase.xlsx')
                #sheet = wb["P"]
                #titles = ("品牌名稱", "商品名稱", "商品照片URL", "價格", "價格2")  # brand, product name, photo URL, price, price2
                #sheet.append(titles)
                #sheet.append(Edata)
                #wb.save("EcTestCase.xlsx")
    except:
        pass