from bs4 import BeautifulSoup


def webScraper(html):
    # Parse the HTML and concatenate every <p> tag into one string
    html_data = BeautifulSoup(html, "html.parser")
    req_data = html_data.find_all('p')
    res = ""
    for tag in req_data:
        res = res + str(tag)
    return res
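# A minimal usage sketch for webScraper (hypothetical URL; assumes `requests` is installed):
import requests

html = requests.get("https://example.com").text
paragraphs = webScraper(html)
print(paragraphs)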
def comics_parse(self, response):
    content = response.body
    if not content:
        self.log('parse comics body error.')
        return

    soup = BeautifulSoup(content, "html5lib")

    # display current page number
    page_list_tag = soup.find('ul', class_='pagelist')
    current_li = page_list_tag.find('li', class_='thisclass')
    page_num = current_li.string
    self.log('current page = ' + page_num)

    # current page img tag
    li_tag = soup.find('li', id='imgshow')
    img_tag = li_tag.find('img')

    # current img url
    img_url = img_tag['src']
    self.log('img url: ' + img_url)

    # comics title
    title = img_tag['alt']

    # save img to local
    self.save_img(page_num, title, img_url)

    # next page url; when the href attribute is '#', this is the last page
    a_tag_list = page_list_tag.find_all('a')
    next_page = a_tag_list[-1]['href']
    if next_page == '#':
        self.log('parse comics: ' + title + ' finished.')
    else:
        next_page = SITE_NAME + '/' + KEY_WORD + '/' + next_page
        yield scrapy.Request(next_page, callback=self.comics_parse)
def parse(self, response):
    content = response.body
    if not content:
        self.log('parse body error.')
        return

    # use BeautifulSoup with html5lib instead of lxml
    soup = BeautifulSoup(content, "html5lib")

    # get the tag that contains the comics list
    listcon_tag = soup.find('ul', class_='listcon')
    if listcon_tag is None:
        self.log('extract comics list error.')
        return

    # get every <a> with an href attribute inside the list tag
    com_a_list = listcon_tag.find_all('a', attrs={'href': True})
    if len(com_a_list) < 1:
        self.log('can not find <a> that contains an href attribute.')
        return

    # append each comics url to an array
    comics_url_list = []
    for tag_a in com_a_list:
        url = SITE_NAME + tag_a['href']
        comics_url_list.append(url)

    print('\n>>>>>>>>>>>>>>>>>>> current page comics list <<<<<<<<<<<<<<<<<<<<')
    print(comics_url_list)

    # handle each comic on this page
    for url in comics_url_list:
        print('>>>>>>>> parse comics:' + url)
        yield scrapy.Request(url=url, callback=self.comics_parse)

    # to crawl only the current page, uncomment the return below
    #return

    # get the page list for pagination
    page_tag = soup.find('ul', class_="pagelist")
    if page_tag is None:
        self.log('extract page list error.')
        return

    # get next page url
    page_a_list = page_tag.find_all('a', attrs={'href': True})
    if len(page_a_list) < 2:
        self.log('extract page tag a error.')
        return

    # check whether the current page is the last one via the select control
    select_tag = soup.find('select', attrs={'name': 'sldd'})
    option_list = select_tag.find_all('option')

    # the current page is the last one if the selected option is the final option tag
    last_option = option_list[-1]
    current_option = select_tag.find('option', attrs={'selected': True})
    is_last = (last_option.string == current_option.string)

    if not is_last:
        next_page = SITE_NAME + '/' + KEY_WORD + '/' + page_a_list[-2]['href']
        if next_page is not None:
            print('\n------ parse next page --------')
            print(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
    else:
        print('========= Last page ==========')
from bs4 import BeautifulSoup


def get_soup(html):
    # Build a parse tree from raw HTML using the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    return soup
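# A minimal usage sketch for get_soup (hypothetical URL; assumes `requests` and `lxml` are installed):
import requests

html = requests.get("https://example.com").text
soup = get_soup(html)
print(soup.title)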
from bs4 import BeautifulSoup
import requests

# A remote page cannot be opened with open(); fetch it over HTTP instead
response = requests.get(
    "https://musbonrealestate.com/Musbon/Index.aspx?gclid=CjwKCAjw6fCCBhBNEiwAem5SO-Wemf1LQRCWoRKmi84e-ZyZGt6T9w0Hn4tBp_oCShnirLwJhKEJUhoCzP8QAvD_BwE"
)
soup = BeautifulSoup(response.text, 'lxml')
print(soup)
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.google.pl/")
#print(result.status_code)  # check whether the page is available
#print(result.headers)      # check the HTTP headers

src = result.content  # assign the page content to a variable
#print(src)

soup = BeautifulSoup(src, 'lxml')  # create a BeautifulSoup object from the source variable src
links = soup.find_all('a')         # find all <a> tags; the result is a list
print(links)

for link in links:
    if "About" in link.text:
        print(link)
        print(link.attrs['href'])
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all anchor tags and print their href attributes
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
import json
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url_path = 'https://www.pexels.com/search/'
word = input('Please enter the image you want to download: ')
# Call the translation API to translate the Chinese input into English
url_tra = 'http://howtospeak.org:443/api/e2c?user_key=dfcacb6404295f9ed9e430f67b641a8e&notrans=0&text=' + word
english_data = requests.get(url_tra)
# Parse the response as JSON
js_data = json.loads(english_data.text)
# Extract the translated text from the JSON
content = js_data['english']
# Combine the translated English with the search URL format to build the new URL used below
url = url_path + content + '/'
# Use requests to fetch the page that contains the images
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'lxml')
# Get the image URLs on this page
imgs = soup.select('article > a > img')
photo_list = []
# Store the collected URLs in a list
for img in imgs:
    photo = img.get('src')
    photo_list.append(photo)
# Define where to save the images
path = ''
# Request each URL one by one and download the image
for item in photo_list:
    data = requests.get(item, headers=headers)
    # Use part of the URL to build the file name and write the image data
    with open(path + item.split('?')[0][-10:], 'wb') as fp:
        fp.write(data.content)
from bs4 import BeautifulSoup
import requests

url = "https://www.timeanddate.com/weather/netherlands/delft"
response = requests.get(url)
result = BeautifulSoup(response.content, "html5lib")
# The 13th <td> cell on the page holds the current temperature
temperature_raw_data = result.find_all('td')[12]
temperature = temperature_raw_data.get_text()
print('Temperature of Delft city:', temperature)
def parser(self):
    # Fetch the page and parse the response body with lxml
    response = requests.get(url=self.url, headers=self.headers)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup
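# A minimal usage sketch: attach parser() to a small hypothetical class that provides
# the `url` and `headers` attributes it expects (class name and header value are assumptions).
import requests
from bs4 import BeautifulSoup

class PageFetcher:
    def __init__(self, url):
        self.url = url
        self.headers = {'user-agent': 'Mozilla/5.0'}

PageFetcher.parser = parser  # reuse the function above as a method
soup = PageFetcher("https://example.com").parser()
print(soup.title)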
# Writing a CSV from HTML - scraping table data
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://cricclubs.com/NTCA/teamSchedule.do?teamId=1636&clubId=343")
bsobj = BeautifulSoup(html, "lxml")
table = bsobj.find_all("table", {"class": "sortable table"})[0]
rows = table.find_all("tr")

csvFile = open("ntca.csv", 'wt', newline='')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.find_all(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
from bs4 import BeautifulSoup

# Placeholder HTML so the examples below run (the original left sample_obj empty)
sample_obj = "<html><body><b id='b1'>bold text</b><p>a paragraph</p></body></html>"

# This writes a new file with the sample_obj content
with open('index.html', 'w') as f:
    f.write(sample_obj)

soup = BeautifulSoup(sample_obj, "html.parser")

# prettify() renders cleaner, indented markup, e.g. for *.html
#print(soup.prettify())

# Tag access
soup.b  # returns the first <b> tag in the document
soup.p  # does the same with the <p> tag

print(soup.find('b'))      # will find the first <b> tag
print(soup.find_all('b'))  # will find all <b> tags
print(soup.b.name)         # will print the name of the <b> tag

# changing tag names and working with attributes
#tag = soup.b
#tag.name = "blockquote"
#print(tag['id'])
#print(tag['any_attribute'])
#print(tag.attrs)  # returns a dictionary of the attributes
#del tag['id']
from bs4 import BeautifulSoup
import pandas as pd

with open("hu.html") as f:
    page = BeautifulSoup(f, "lxml")

# .get_text() only takes the text, without the h1 / h2 / etc. tags
title = page.select("title")[0].get_text()
print(title)  # .upper() / .lower()

headings = page.select("h2")
len(headings)
for heading in headings:
    print(heading.get_text().strip())

flyer_title = page.select(".flyer_content_title")
flyer_title[0].get_text()
heading_title = flyer_title[0].get_text()
print(heading_title)

# PANDAS
text = page.get_text().strip()  # assumed: the page's body text for the DataFrame row
df = pd.DataFrame([{"title": heading_title, "text": text}])
df
from bs4 import BeautifulSoup

html = '''
<html>
  <head>
    <meta charset="UTF-8">
    <title>我是網頁標題</title>
  </head>
  <body>
    <p id="p1">我是段落一</p>
    <p id="p2" class="red">我是段落二</p>
  </body>
</html>
'''

sp = BeautifulSoup(html, "lxml")
print(sp.find('p'))                                # first <p>
print(sp.find_all('p'))                            # all <p> tags
print(sp.find('p', {'id': 'p2', 'class': 'red'}))  # match by attribute dict
print(sp.find('p', id='p2', class_='red'))         # match by keyword arguments
datas = sp.select('title')                         # CSS selector, returns a list
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

URL = "https://www.indeed.com/jobs?q=Python&l=Chennai&start=2"
page = requests.get(URL)
soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify())
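# A small follow-up sketch: list the first few links on the fetched page.
# Job-card selectors change over time, so only generic <a> tags with an href are shown here.
for a in soup.find_all('a', href=True)[:10]:
    print(a['href'])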