Example #1
0
	feature='html.parser'
	soup=BeautifulSoup(data,features)
except:
	pass
def open1():
    """Open the URL currently selected in the links listbox in the browser.

    Reads the selection from the module-level ``list1`` Listbox.  Does
    nothing when no row is selected (clicking "open" with an empty
    selection previously raised IndexError).
    """
    # BUG FIX: the original called list.curselection() on the *builtin*
    # `list` type; the widget is named `list1` (see its .get() call below).
    selection = list1.curselection()
    if selection:
        webbrowser.open(list1.get(selection[0]))

# Build a small Tk window listing every hyperlink scraped from `soup`,
# with an "open" button that launches the selected link in the browser.
root = Tk()
root.geometry("400x400")
root.title("web_scrapper")

# BUG FIX: tkinter's widget is `Listbox`, not `ListBox`.
list1 = Listbox(root, width=300, height=20, selectmode=SINGLE)
list1.pack(fill=BOTH)

x = 0
# BUG FIX: the bs4 method is `find_all`, not `find_All`.
for link in soup.find_all('a'):
    link1 = str(link.get('href'))
    if link1.startswith('http'):
        # Absolute URL: store as-is.
        list1.insert(x, link1)
    elif link1 not in ("None", "#"):
        # Relative URL: resolve against the page URL before storing.
        # BUG FIX: the original wrote `link1==url+link1` (a comparison
        # whose result was discarded) and was missing the `:` after else.
        list1.insert(x, url + link1)
    x += 1

button1 = Button(root, text="open", command=open1)
button1.pack(fill=BOTH)
root.mainloop()
Example #2
0
from bs4 import BeautifulSoup
import requests

# Fetch a part page and print the text of every title span on it.
url = "https://smartpages.nexpart.com/smartpage.php?mfrlinecode=GKI&partnumber=AF12061"
response = requests.get(url, timeout=5)
content = BeautifulSoup(response.content, "html.parser")

# BUG FIX: the original `for data = content.find_All(...).get_text():` was a
# syntax error (`=` instead of `in`), used the nonexistent `find_All`, called
# .get_text() on the result *list*, and printed the whole document instead of
# each match.  Iterate the matches and print each one's text.
for data in content.find_all('span', attrs={"class": "lang_en sp_title"}):
    print(data.get_text())
Example #3
0
def sendmessage(title, message):
    """Show a desktop notification with the given title and body text."""
    pynotify.init("Test")
    pynotify.Notification(title, message).show()
# url contains the link to the cricket-scores website.
url = ""

# Poll the scores page forever, notifying once a minute.
while True:
    r = requests.get(url)

    # Retry until the page responds with HTTP 200.
    # BUG FIX: was `r.status_code is not 200` — an *identity* check against
    # an int, which is unreliable; use value inequality.
    while r.status_code != 200:
        r = requests.get(url)

    soup = BeautifulSoup(r.text, "lxml")

    # BUG FIX: bs4's method is `find_all`, not `find_All`.
    data = soup.find_all("here the tag which contains the score")
    # score1 holds the text of the first matched tag.
    score1 = data[0].text
    # BUG FIX: Python 2 `print score1` statement -> Python 3 function call.
    print(score1)

    sendmessage("Data:", score1)

    sleep(60)
Example #4
0
##from bs4 import BeautifulSoup
##import requests
## 
##url = 'http://proxy-hunter.blogspot.com/2010/03/18-03-10-speed-l1-hunter-proxies-310.html'
##r = requests.get(url)
##r.content
##r.encoding
##soup = BeautifulSoup(r.text)
##regex  = r.compile(r'^(\d{3}).(\d{1,3}).(\d{1,3}).(\d{1,3}):(\d{2,4})')
##proxylist = soup.find_All(attrs = {"class":"Apple-style-span", "style": "color: black;"}, text = regex)
##data = proxylist[0]
##for x in data.split('\n'):
##        print x


from bs4 import BeautifulSoup as Soup
import re
import urllib.request

# Scrape IP:port proxy strings from a proxy-hunter blog post.
url = 'http://proxy-hunter.blogspot.com/2010/03/18-03-10-speed-l1-hunter-proxies-310.html'
# BUG FIX: `urllib.urlopen` is Python 2; use urllib.request.urlopen.
document = urllib.request.urlopen(url)
# Pass an explicit parser so bs4 doesn't warn / guess.
tree = Soup(document.read(), "html.parser")
# BUG FIX: the dots were unescaped, so "." matched ANY character; escape
# them so the pattern matches literal dotted-quad addresses only.
regex = re.compile(r'^(\d{3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{2,4})')
# BUG FIX: bs4's method is `find_all`, not `find_All`.
proxylist = tree.find_all(attrs={"class": "Apple-style-span", "style": "color: black;"}, text=regex)
# Matching on text= yields NavigableStrings (str subclass), so split works.
data = proxylist[0]
for x in data.split('\n'):
    # BUG FIX: Python 2 `print x` -> Python 3 function call.
    print(x)
Example #5
0
def GetApi():
    """Scrape brand listings from hantao888.com and print product records.

    For every brand linked on /brand.php, POSTs to the mobile async-list
    endpoint, parses each returned product card for its title, price span
    and image URL, then fetches the per-product price JSON and prints the
    assembled record list.

    Relies on module-level `requests`, `json`, `BeautifulSoup` and a
    `filterbrand` helper defined elsewhere in the project.
    """
    u = "http://www.hantao888.com/brand.php"
    resp = requests.get(u)
    resp.encoding = ("utf-8")
    html = resp.text
    # Explicit parser keeps results stable across installed bs4 backends.
    AAA = BeautifulSoup(html, "html.parser")
    filter_brand = AAA.select("div.filter_brand a")
    NO = len(filter_brand)
    print(NO)
    try:
        for q in range(0, NO):
            # The last two chars of the href carry the brand id.
            filter_brand = str(
                AAA.select("div.filter_brand a")[q]["href"][-2:])
            filter_brandTEST = AAA.select("div.filter_brand a")[q]["title"]
            print(filterbrand(filter_brand))
            print(filter_brandTEST)

            header = {
                "Referer":
                "http://www.hantao888.com/mobile/brand.php?id=" + str(q),
                'Cache-control': 'private',
                'Connection': 'keep-alive',
                'Content-Encoding': 'gzip',
                'Content-Type': 'text/html; charset=utf-8',
            }
            data = {
                'ast': '0',
                'amount': '10',
            }
            url = 'http://www.hantao888.com/mobile/brand.php?act=asynclist&category=0&brand=' + filterbrand(
                filter_brand
            ) + '&price_min=&price_max=&filter_attr=&page=1&sort=last_update&order=DESC'
            urlS = 'http://www.hantao888.com/mobile/brand.php?id=' + filterbrand(
                filter_brand)

            print(urlS)
            resp2 = requests.post(url, headers=header, data=data).text
            # Kept for its request side effect even though the response
            # is unused (the original inspected it in commented-out code).
            respS = requests.post(urlS)

            a = json.loads(resp2)
            P = len(a)
            print(P)

            for i in range(0, P):

                b = a[i]
                c = b['pro-inner']

                # Parse one product card fragment.
                soup = BeautifulSoup(c, "html.parser")
                # Product name anchor.
                proTitle = soup.select("div.proTitle a")[0]
                # Product price span (unused below, but the lookup also
                # validates the card's structure).
                proPrice = soup.select("div.proPrice span")[0]
                # Photo URL.
                imgA = str(soup.select("img")[0]['src'])
                imgurl = 'http://www.hantao888.com' + imgA

                # The product id is sliced out of the anchor tag's raw
                # markup at a fixed offset — fragile; TODO confirm offsets.
                PorID = str(soup.select("div.proTitle a")[0])
                PID = str(PorID[22:26])
                header = {
                    "Accept":
                    "*/*",
                    "Accept-Encoding":
                    "gzip, deflate",
                    # BUG FIX: "Accept-Language" appeared twice in this
                    # dict literal; the duplicate is removed.
                    "Accept-Language":
                    "zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Connection":
                    "keep-alive",
                    "Cookie":
                    "ECS[display]=grid; zh_choose=t; ECS[history]=1030%2C3194%2C2107%2C636; ECS[visit_times]=9; zh_choose=t; real_ipd=1.200.75.180; ECS_ID=9461df93025d671d8bc7538cb61ea5a466153dac",
                    "Referer":
                    "http://www.hantao888.com/goods.php?id=" + PID,
                    "User-Agent":
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/12.0 Mobile/15A372 Safari/604.1",
                }
                # A quote in slot 3 means the slice caught only a 3-digit
                # id plus the closing quote; trim to the first 3 chars.
                if PID[3] == '"':
                    uu = "http://www.hantao888.com/mobile/goods.php?id=" + PID + "&attr=&number=1&1616771455463463="
                    PURL = "http://www.hantao888.com/goods.php?act=price&id=" + PID[
                        0:3] + "&attr=&number=1&1616771455463463="
                    print(uu)
                    prodata = requests.get(uu, headers=header).text
                    Prodatasoup = BeautifulSoup(prodata, 'html.parser')
                    prodata = Prodatasoup.select(
                        "a.btn-popupSKU-addcart")[0].text
                    # BUG FIX: was `find_All` — no such bs4 method.
                    protext = Prodatasoup.find_all("br")
                    print(protext)
                    getdata = requests.get(PURL, headers=header).text
                    Getdata = json.loads(getdata)

                    # Strip fixed-width markup wrapping the price strings.
                    G = str(Getdata["result"][6:-7])
                    GG = str(Getdata["result1"][6:-7])

                    Edata = [PID[0:3], proTitle.string, imgurl, G, GG, prodata]

                    print(Edata)

                else:
                    uu = "http://www.hantao888.com/mobile/goods.php?id=" + PID
                    PURL = "http://www.hantao888.com/goods.php?act=price&id=" + PID + "&attr=&number=1&1616771455463463="

                    prodata = requests.get(uu, headers=header).text
                    Prodatasoup = BeautifulSoup(prodata, "html.parser")
                    prodata = Prodatasoup.select(
                        "a.btn-popupSKU-addcart")[0].text

                    getdata = requests.get(PURL, headers=header).text
                    Getdata = json.loads(getdata)
                    G = str(Getdata["result"][6:-7])
                    GG = str(Getdata["result1"][6:-7])
                    Edata = [PID, proTitle.string, imgurl, G, GG, prodata]
    except Exception as err:
        # BUG FIX: was a bare `except: pass`, which silently swallowed
        # every error (including KeyboardInterrupt and typos).  Keep the
        # best-effort behaviour, but narrow the catch and report it.
        print(err)