# Assumes module-level: import csv, re, requests; from bs4 import BeautifulSoup as beau;
# csv_name and headers are defined elsewhere.
def parge_url(url):
    with open(csv_name, 'a', newline='', encoding='gbk') as f:  # append mode ('wb' would create a new file)
        writer = csv.writer(f)
        response = requests.get(url, headers=headers)
        res = beau(response.text, 'lxml')
        print(response.status_code)
        for i in res.select('#comments > div.comment-item'):
            try:
                mid = beau(str(i), 'lxml')  # re-parse the single comment item
                name = mid.select('span.comment-info a')[0].text
                star = re.findall('allstar(.*?) rating', str(i))
                time = mid.find_all(class_='comment-time')[0].get('title').strip()
                comment = mid.select('p span.short')[0].text
                img_url = mid.select('div.avatar a img')[0].get('src')
                row = [name, star, time, comment, img_url]
                print(row)
                try:
                    writer.writerow(row)
                except Exception:
                    print('failed to write row *************************')
            except Exception:
                print('failed to parse comment -----------')
# Assumes module-level: import urllib2; from bs4 import BeautifulSoup as beau.
def download_bug(len1, temp_bugid):
    while len1 != 0:
        l1 = temp_bugid           # work on a temporary copy of the list
        print "start of while loop"
        temp_bugid = []           # flush the list; failed ids are collected again below
        for i in l1:
            bugnumber = i + '.html'
            url = 'https://code.google.com/p/chromium/issues/detail?id=' + i
            try:
                page = urllib2.urlopen(url)
                soup = beau(page)
                print bugnumber
                s = open(bugnumber, 'w')   # save the rendered page as <id>.html
                s.write(str(soup))
                s.close()
            except Exception:
                # catch any error, record the bug id, and move on to the next iteration;
                # a more specific handler could catch urllib2.HTTPError / urllib2.URLError
                print "bugnumber = %s" % bugnumber
                temp_bugid.append(i)   # bugs that failed to download are retried on the next pass
        print temp_bugid
        len1 = len(temp_bugid)    # loop until every bug has been downloaded
        print "length of temp_bugid = %s" % len1
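# A minimal usage sketch for download_bug (assumption: bug ids are plain strings and the
# pages are written to the current working directory; the id list below is made up):
if __name__ == '__main__':
    pending = ['100001', '100002', '100003']   # hypothetical bug ids
    download_bug(len(pending), pending)        # retries until every page has been saved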
def droid(url, root):
    htm = urllib2.urlopen(url)   # open the F-Droid app page
    soup = beau(htm)             # feed the html of the page into BeautifulSoup
    elm = soup.findAll('a')      # find all "a" tags
    lst = []
    for i in elm:
        x = i.attrs["href"]      # collect the href attribute of every <a> tag
        lst.append(x)
    # the page contains two GitHub hrefs; the one ending in "issues" is not the one we want
    lis = []
    for i in lst:
        if "github" in i:
            if not re.search(r"issues$", i):   # keep the link only if it does not end with "issues"
                lis.append(i)
    temp = lis[-1]
    if re.search(r"/$", temp):
        temp = temp[:-1]          # drop the trailing slash before cloning
        print(temp)
        gitclone(temp, root)      # hand the fetched url to gitclone to perform the clone
    else:
        print(temp)
        gitclone(temp, root)
def download_bug(bugno, url):
    web_page = urllib2.urlopen(url)
    soup = beau(web_page)
    bugnumber = bugno + '.html'
    print bugnumber
    s = open(bugnumber, 'w')   # save the rendered page as <bugno>.html
    s.write(str(soup))
    s.close()
def __parse_news(self, rss_html_content: str):
    content = {}
    soup = beau(rss_html_content, "html.parser")
    # clean out the last <p> of the item
    soup.find_all("p")[-1].decompose()
    print("rss_Handler")
    # strip the "-<width>x<height>" size suffix from the image filename and any query string,
    # e.g. "photo-300x200.jpg?quality=80" -> "photo.jpg"
    img_url = re.split(r"(-)\d+(x)\d*", soup.find("img").attrs["src"])
    content["img_url"] = img_url[0] + img_url[-1].split("?")[0]
    txt = soup.text.rstrip()
    content["txt"] = self.__split_news(txt)
    return content
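# A small standalone demo of the image-url cleanup used above, on a made-up
# WordPress-style thumbnail url (the input string is an illustration, not taken from a real feed):
import re

src = "https://example.com/wp-content/uploads/photo-300x200.jpg?quality=80"
parts = re.split(r"(-)\d+(x)\d*", src)
print(parts[0] + parts[-1].split("?")[0])   # -> https://example.com/wp-content/uploads/photo.jpg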
def github(url, root):
    htm = urllib2.urlopen(url)
    soup = beau(htm)
    elms = soup.select("h3.repo-list-name a")   # repository links on the GitHub search-results page
    lst = []
    for i in elms:
        x = i.attrs["href"]
        lst.append(x)
    for git in lst:
        git = 'https://github.com' + git + '.git'   # turn the relative href into a clone url
        gitclone(git, root)
        print(git)
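# gitclone is called above and in droid() but is not defined in these snippets; a minimal
# sketch of what it might look like, assuming it simply shells out to `git clone <url>` inside
# the given root directory (this implementation is an assumption, not the original code):
import subprocess

def gitclone(git_url, root):
    # clone the repository into `root`; git creates a sub-directory named after the repo
    subprocess.check_call(['git', 'clone', git_url], cwd=root)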
def bugid(url):
    # temp_bugid and temp4 are assumed to be module-level lists
    htm = urllib2.urlopen(url)
    soup = beau(htm)
    elm = soup.findAll("td", {"class": "vt id col_0"})   # the id column of the issue table
    for i in elm:
        alink = i.findAll("a")
        for a in alink:
            str1 = a.text
            str1 = str1.replace('\n', '')
            str1 = str1.replace(' ', '')
            temp_bugid.append(str1)
    temp = temp_bugid[0:496]                          # split off the first 496 ids
    temp3 = [x for x in temp_bugid if x not in temp]  # everything after the first batch
    temp4.extend(temp3)
def crawl_kuaidaili(self):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Referer': 'https://www.google.com/',
        'User-Agent': random.choice(user_list)
    }
    for i in range(1, 6):   # the free-proxy listing is paginated starting at page 1
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
        time.sleep(2)       # be polite between requests
        res = requests.get(url, headers=headers)
        soup = beau(res.text, 'lxml')
        for j in soup.select('div > table > tbody > tr'):
            ip = re.findall('<td data-title="IP">(.*?)</td>', str(j))[0]
            port = re.findall('<td data-title="PORT">(.*?)</td>', str(j))[0]
            yield ':'.join([ip, port])
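# A minimal sketch of how the generator above might be consumed (ProxyCrawler is a made-up
# wrapper class name; the point is only that crawl_kuaidaili yields "ip:port" strings):
crawler = ProxyCrawler()
for proxy in crawler.crawl_kuaidaili():
    print(proxy)   # e.g. "1.2.3.4:8080"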
def crawl__66(self):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Referer': 'http://www.66ip.cn/2.html',
        'User-Agent': random.choice(user_list)
    }
    for i in range(1, 6):   # pages are numbered from 1.html
        url = 'http://www.66ip.cn/{}.html'.format(i)
        time.sleep(2)
        res = requests.get(url, headers=headers)
        soup = beau(res.text, 'lxml')
        for j in soup.select('div.containerbox > div > table > tr'):
            ip = re.findall('<tr><td>(.*?)</td><td>', str(j))[0]
            if ip == 'ip':
                continue   # skip the header row
            port = re.findall('</td><td>(.*?)</td><td>', str(j))[0]
            yield ':'.join([ip, port])
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open the connection, read the page and close it
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')
contents = soup.title
print(contents)
import sys
from bs4 import BeautifulSoup as beau

soup = beau(open('/home/elsys/PycharmProjects/beautiful_01/bares.html'), "lxml")

filename = "bares.csv"
f = open(filename, "w")
headers = "ENDERECO, NUMERO, BARES\n"
f.write(headers)

for title in soup.findAll('div', {"class": "jet-listing-dynamic-field__inline-wrap"}):
    print(title.text + ",")
    f.write(title.text + "\n")

f.close()
bug_end = bug_end.split("bug_end = ")
for i in bug_end:
    bug_end1 = i
    bug_end1 = bug_end1.rstrip()
# print(bug_end1)

max_timeout_secs = max_timeout_secs.split("max_timeout_secs = ")
for i in max_timeout_secs:
    max_timeout_secs1 = i
    max_timeout_secs1 = max_timeout_secs1.rstrip()
# print(max_timeout_secs1)

htm = urllib2.urlopen(url2)
# htm = urllib2.urlopen("https://f-droid.org/repository/browse/?fdid=uk.org.crimetalk")
soup = beau(htm)
elm = soup.findAll('a')
lst = []
for i in elm:
    x = i.attrs["href"]
    lst.append(x)
# print(lst)

temp = []
for i in lst:
    if "/browse/" in i:   # keep only hrefs that point at a repository browse page
        temp.append(i)
t1 = []
- requests
- requests_html or beautifulsoup4 [optional]
- argparse or sys.argv

Hint:
- ``nargs`` https://docs.python.org/2/library/argparse.html
'''
import requests
import sys
from bs4 import BeautifulSoup as beau

ses = requests.Session()
resp = ses.get('https://ketqua.net')
data = resp.text
data_format = beau(data, 'html.parser')
list_bingo = []
index_special = data[23047:23049]   # fragile: hard-coded character offsets into the raw HTML
# print(data_format.prettify())


def get_list_bingo():
    for i in data_format.find_all('td', class_='chu17 need_blank'):
        if i.text.isdigit():
            list_bingo.append(i.text)
            # print(i.text)
# print(list_bingo)


get_list_bingo()
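# The docstring above hints at argparse's ``nargs``; a minimal sketch of how the numbers to
# check could be taken from the command line (the --numbers option name is an assumption for
# illustration, not part of the original script):
import argparse

parser = argparse.ArgumentParser(description='Check numbers against the ketqua.net results')
parser.add_argument('--numbers', nargs='+', help='one or more numbers to look up')
args = parser.parse_args()
print(args.numbers)   # e.g. ['12', '34'] when run with: --numbers 12 34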
import sys
from bs4 import BeautifulSoup as beau

soup = beau(open('/home/elsys/PycharmProjects/beautiful_01/bares.html'), "html.parser")

# earlier selectors that were tried:
# soup.findAll('div', {"class": "jet-listing-dynamic-field__inline-wrap"})
# soup.findAll('div', {"class": "jet-listing jet-listing-dynamic-field display-inline"})
# soup.findAll('div', {"class": "jet-listing-dynamic-field__content"})

nome_bar = soup.findAll('div', {"class": "elementor-widget-container"})
# print(len(nome_bar))
# print(nome_bar[3].text)

filename = "bares3.csv"
f = open(filename, "w")
headers = "ENDERECO, NUMERO, BARES\n"
f.write(headers)

for item in nome_bar:
    # the original snippet is truncated here; presumably each bar entry is written to the CSV,
    # e.g. f.write(item.text + "\n")
    pass