import time
import urllib.request
from bs4 import BeautifulSoup as bt

def GetPageData(self, data_list):
    """Fetch each show page from damai.cn and print its title, venue and show time."""
    base_url = 'https://piao.damai.cn/'        # renamed from `str` to stop shadowing the builtin
    for i in data_list:
        req = urllib.request.Request(base_url + i + ".html", headers=self.header)
        pagedata = urllib.request.urlopen(req, timeout=5000).read()   # note: timeout is in seconds
        page = bt(pagedata, 'lxml')
        showtime = page.find('div', class_='m-sdbox m-showtime').span.string
        venue = page.find('div', class_='m-sdbox m-venue').div.a.string
        title = page.find('h2', class_='tt').span.string
        print(title, venue, showtime, sep='|*-*|')
        time.sleep(2)                          # pause between requests to stay polite
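# A minimal usage sketch for the method above. It assumes GetPageData sits at
# module level in this file so it can be bound to a small class; the class
# name, the header value and the example show id are all assumptions, not part
# of the original code.
class DamaiSpider:
    GetPageData = GetPageData                  # bind the function above as a method

    def __init__(self):
        self.header = {'User-Agent': 'Mozilla/5.0'}   # headers sent with every request


spider = DamaiSpider()
spider.GetPageData(['123456'])                 # '123456' stands in for a real show id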
import urllib2
from bs4 import BeautifulSoup as bt

def fetchtopic():
    """Fetch the cnbeta front page and collect its topic links (Python 2 / urllib2)."""
    res = urllib2.urlopen('http://www.cnbeta.com/#1')
    soup = bt(res, from_encoding='gbk')        # the page is served as GBK
    topics = soup.findAll(attrs={'class': 'topic'})
    l = []
    for one in topics:
        d = {}
        d['url'] = host + one.a['href']        # `host` is expected to be defined at module level
        d['title'] = one.a.strong.string
        l.append(d)
    return l
import requests
from bs4 import BeautifulSoup as bt

response = requests.get('https://github.com/login')
obj = bt(response.text, 'html.parser')

# Pull the CSRF token (authenticity_token) out of the login form
token = obj.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)
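# A hedged sketch of what the token is usually scraped for: submitting the
# login form. The /session endpoint, the field names and the credentials below
# are assumptions based on how the form looked when snippets like this were
# written, and GitHub may have changed the form since. The token is tied to
# the cookies of the request that fetched it, so a Session carries both steps.
import requests
from bs4 import BeautifulSoup as bt

session = requests.Session()
login_page = session.get('https://github.com/login')
form = bt(login_page.text, 'html.parser')
token = form.find(name='input', attrs={'name': 'authenticity_token'}).get('value')

resp = session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'authenticity_token': token,
    'login': 'your-username',                  # placeholder credentials
    'password': 'your-password',
})
print(resp.status_code)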
import requests
from bs4 import BeautifulSoup as bt

item = 'shoes'
url = ("https://www.flipkart.com/search?q=" + item
       + "&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off")

source = requests.get(url).text
soup = bt(source, 'lxml')

# Collect every div carrying the price class used by Flipkart's markup at the time
match = soup.find_all('div', class_='_1vC4OE')
for z in match:
    print(z)
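# Usage sketch: the loop above prints whole <div> elements; reading the element
# text instead yields just the price strings (same class assumption as above,
# and Flipkart's markup changes over time).
prices = [div.text.strip() for div in match]
print(prices)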
import requests
from bs4 import BeautifulSoup as bt

URL = 'https://en.wikipedia.org/wiki/Solar_cell'
page = requests.get(URL)

# Parse the page
soup = bt(page.content, 'html.parser')

# Target the div that contains the article body
content = soup.find(id='mw-content-text')


def get_citations_needed_count():
    """Report how many passages in the article are flagged as needing a citation."""
    citations_needed = content.find_all(
        class_="noprint Inline-Template Template-Fact")
    return f"Number of citations needed: {len(citations_needed)}"


print(get_citations_needed_count())
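# A companion sketch, not part of the original: return the text of the
# paragraphs that carry the "citation needed" flag instead of just counting
# them. It reuses the `content` element and the class string assumed above.
def get_citations_needed_report():
    flagged = content.find_all(class_="noprint Inline-Template Template-Fact")
    paragraphs = []
    for flag in flagged:
        parent = flag.find_parent('p')         # the paragraph the flag sits in, if any
        if parent is not None:
            paragraphs.append(parent.get_text(" ", strip=True))
    return paragraphs


print(get_citations_needed_report())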
def fetchdetail(url):
    """Fetch a single article page and return the parsed soup (Python 2 / urllib2)."""
    res = urllib2.urlopen(url)
    soup = bt(res, from_encoding='gbk')
    # the article body lives in the element with id='news_content'
    # r = soup.find(id='news_content')
    return soup
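# A minimal driver sketch tying the two cnbeta helpers together. It assumes
# fetchtopic() and fetchdetail() above live in the same (Python 2) module, and
# the value assigned to `host` here is only a guess at the module-level
# variable that fetchtopic relies on.
host = 'http://www.cnbeta.com'

for topic in fetchtopic():
    print(topic['title'] + ' ' + topic['url'])
    detail = fetchdetail(topic['url'])
    body = detail.find(id='news_content')      # article body, per the hint in fetchdetail
    if body is not None:
        print(body.get_text()[:200])           # first 200 characters of the article text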