def get_url_set(driver, key_word):
    container_list = []  # temporary result containers
    real_url_set = set()  # set of resolved URLs

    # Baidu search results
    baidu_url = "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word
    driver.get(baidu_url)
    # Collect the container of each search result, page by page
    for page in range(1, MAX_PAGEs + 1):
        BF1 = BF(driver.page_source)
        page_container_list = BF1.findAll(
            "div", {"class": re.compile(".*c-container.*")})
        container_list.extend(page_container_list)
        driver.find_element_by_xpath("//*[text()='下一页>']").click()
        time.sleep(2)
    # Follow each Baidu redirect once to recover the original URL,
    # then add it to real_url_set
    for container in container_list:
        href = container.find('h3').find('a').get('href')
        try:
            baidu_url = requests.get(url=href,
                                     headers=headers,
                                     allow_redirects=False)
        except Exception:
            continue
        real_url = baidu_url.headers['Location']  # original page address
        if real_url.startswith('http'):
            real_url_set.add(real_url + '\n')

    # Bing search results
    bing_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
    try:
        driver.get(bing_url)
    except Exception:
        driver.refresh()  # the page needs a refresh
        time.sleep(2)
        driver.refresh()
        time.sleep(5)
    for page in range(1, MAX_PAGEs + 1):
        BF1 = BF(driver.page_source)
        page_container_list = BF1.find("ol", {"id": "b_results"}).findAll("h2")
        for page_container in page_container_list:
            try:
                real_url_set.add(page_container.find("a").get('href'))
            except Exception:
                break
        driver.find_element_by_xpath(".//*[@title='下一页']").click()
        time.sleep(2)

    # Google is not handled yet
    # google_url = ""
    return real_url_set
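# Hedged sketch (not from the original source): the search-engine scrapers in
# this file assume some module-level setup -- the BeautifulSoup alias BF, a
# page limit MAX_PAGEs, a shared requests `headers` dict, and a Selenium
# driver. The concrete values below (page count, user agent, Chrome driver)
# are assumptions for illustration.
import re
import time

import requests
from bs4 import BeautifulSoup as BF
from selenium import webdriver

MAX_PAGEs = 3  # assumed number of result pages to crawl per engine
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/85.0.4183.102 Safari/537.36'
}

# Example usage (assumed):
# driver = webdriver.Chrome()
# urls = get_url_set(driver, "example keyword")
# driver.quit()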
def get_url_set(self, key_word):
    container_list = []  # temporary result containers
    real_url_set = set()  # set of result URLs

    # CNN search results
    CNN_url = "https://edition.cnn.com/search?size=20&q=" + key_word
    self.driver.get(CNN_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div",
                                 {"class": "cnn-search__result-contents"})
    for container in container_list:
        try:
            href = container.find("h3").find("a").get("href")
        except Exception:
            continue
        real_url_set.add(href)
    return real_url_set
def get_url_set(driver, key_word):
    container_list = []  # temporary result containers
    real_url_set = set()  # set of resolved URLs

    # Baidu web search and Baidu news search
    baidu_url_list = [
        "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word,
        "https://www.baidu.com/s?ie=UTF-8&tn=news&wd=" + key_word
    ]
    for i in range(2):
        driver.get(baidu_url_list[i])
        # Collect the container of each search result, page by page
        for page in range(1, MAX_PAGEs + 1):
            BF1 = BF(driver.page_source, 'lxml')
            if i == 0:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile(".*c-container.*")})
            else:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile("result")})
            container_list.extend(page_container_list)
            driver.find_element_by_xpath("//*[text()='下一页>']").click()
            time.sleep(2)
        if i == 0:
            # Web results: follow the Baidu redirect once to get the real URL
            for container in container_list:
                href = container.find("h3").find("a").get("href")
                try:
                    baidu_url = requests.get(url=href,
                                             headers=headers,
                                             allow_redirects=False)
                except Exception:
                    continue
                real_url = baidu_url.headers['Location']  # original page address
                if real_url.startswith('http'):
                    real_url_set.add(real_url + '\n')
            container_list = []
        else:
            # News results: keep the direct link, skipping baijiahao posts
            for container in container_list:
                href = container.find("h3").find("a").get("href")
                if "baijiahao" not in href:
                    real_url_set.add(href)
    return real_url_set
def main():  # main function
    # Set the url link
    urlMovies = 'https://www.subs4free.club/'
    # Set my user agent. If you don't know your user agent, just google
    # "my user agent" and it will be shown as the first result
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
    # Get the whole page content
    pageMovies = requests.get(urlMovies, headers=headers)
    # Parse the page with html.parser
    soupMovies = BF(pageMovies.content, 'html.parser')
    # DIV attribute that contains info for every movie
    elementListMovies = soupMovies.findAll("div", {"class": "movies-info"})
    createInfoMsgToSend(elementListMovies)
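# Hedged sketch (not part of the original snippet): main() above calls
# createInfoMsgToSend, which is not shown. A hypothetical implementation that
# turns each "movies-info" element into one line of text could look like this;
# the markup inside each element is an assumption.
def createInfoMsgToSend(element_list):
    # Collapse every result element to a single line and join them
    lines = [element.get_text(" ", strip=True) for element in element_list]
    message = "\n".join(lines)
    print(message)
    return message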
def abcNewsGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary result containers
    real_url_set = set()  # set of (url, title) results

    # ABC News search results
    abcNews_url = "https://abcnews.go.com/search?r=week&searchtext=" + key_word
    self.driver.get(abcNews_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div", {"class": re.compile("result.*")})
    for container in container_list:
        try:
            href = container.find("a", {"class": "title"}).get("href")
            title = container.find("a", {"class": "title"}).get_text()
        except Exception:
            continue
        real_url_set.add((href, title))
    return real_url_set
def cnnGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary result containers
    real_url_set = set()  # set of (url, title) results

    # CNN search results
    cnn_url = "https://edition.cnn.com/search?size=20&q=" + key_word
    self.driver.get(cnn_url)
    bf1 = BF(self.driver.page_source, 'lxml')
    container_list = bf1.findAll("div",
                                 {"class": "cnn-search__result-contents"})
    for container in container_list:
        try:
            href = container.find("h3").find("a").get("href")
            title = container.find("h3").find("a").get_text()
        except Exception:
            continue
        real_url_set.add((href, title))
    return real_url_set
def tassGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary result containers
    real_url_set = set()  # set of (url, title) results

    # TASS search results
    tass_url = "https://tass.com/search?sort=date&searchStr=" + key_word
    self.driver.get(tass_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div", {"class": "news-list__item ng-scope"})
    for container in container_list:
        try:
            href = "www.tass.com/" + container.find("a").get("href")
            title = container.find("span", {
                "class": "news-preview__title ng-binding"
            }).get_text()
        except Exception:
            continue
        real_url_set.add((href, title))
    return real_url_set
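# Hedged sketch (assumed context, not from the original source): the
# abcNewsGetUrlSet, cnnGetUrlSet and tassGetUrlSet methods above take `self`,
# so they presumably hang off a crawler class that owns a Selenium driver.
# The class name NewsCrawler and its attributes are assumptions.
from selenium import webdriver


class NewsCrawler:
    def __init__(self):
        self.driver = webdriver.Chrome()  # assumed browser driver

    # abcNewsGetUrlSet, cnnGetUrlSet and tassGetUrlSet would be defined here

    def close(self):
        self.driver.quit()


# Example usage (assumed):
# crawler = NewsCrawler()
# results = crawler.cnnGetUrlSet("climate")  # set of (href, title) tuples
# crawler.close()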
def find_jobs():
    html_text = requests.get(website).text
    soup = BF(html_text, 'lxml')
    jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
    for index, job in enumerate(jobs):
        published_date = job.find('span', class_='sim-posted').span.text
        # Only keep postings published "a few days ago"
        if 'few' in published_date:
            company_name = job.find('h3', class_='joblist-comp-name').text
            skills = job.find('span', class_='srp-skills').text
            experience = job.li.text.replace('card_travel', '')
            more_info = job.header.h2.a['href']
            # Skip jobs that require the skill we want to filter out
            if unfamiliar_skill not in skills:
                with open(f'posts/{company_name.strip()}.txt', 'w') as f:
                    f.write(f"Company name: {company_name.strip()}\n")
                    f.write(f"Required Skills: {skills.strip()}\n")
                    f.write(f"Required Experience: {experience.strip()}\n")
                    f.write(f'More Info: {more_info}\n')
                print(
                    f'The file saved in the posts folder: {company_name.strip()}.txt'
                )
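# Hedged sketch (assumed context, not from the original snippet): find_jobs
# above relies on two module-level names, `website` and `unfamiliar_skill`,
# and writes its results into a posts/ directory. The URL and skill below are
# illustrative placeholders.
import os

website = 'https://www.timesjobs.com/candidate/job-search.html?txtKeywords=python'  # placeholder search URL
unfamiliar_skill = 'django'  # placeholder skill to filter out

if __name__ == '__main__':
    os.makedirs('posts', exist_ok=True)  # make sure the output folder exists
    find_jobs()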
from bs4 import BeautifulSoup as BF

with open('home.html', 'r') as html_file:
    content = html_file.read()
    soup = BF(content, 'lxml')
    course_cards = soup.find_all('div', class_='card')
    for course in course_cards:
        course_name = course.h5.text
        course_price = course.a.text.split()[-1]
        print(f"{course_name} costs {course_price}")
def get_soup(corpus):
    # Return a BeautifulSoup object for the crawled XML file
    with open('%s.xml' % corpus, "r") as original_corpus:
        soup = BF(original_corpus, 'lxml')
    return soup
def getDataFromSite():
    response = requests.get(currentTarget, headers=HEADERS)
    return BF(response.content, 'html.parser')
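# Hedged sketch (assumed context): getDataFromSite above expects module-level
# HEADERS and currentTarget names. The values here are illustrative
# placeholders only.
import requests
from bs4 import BeautifulSoup as BF

HEADERS = {'User-Agent': 'Mozilla/5.0'}   # assumed minimal user agent
currentTarget = 'https://example.com'     # placeholder target URL

# Example usage (assumed):
# soup = getDataFromSite()
# print(soup.title)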
def get_url_set(self, driver, key_word):
    container_list = []  # temporary result containers
    real_url_set = set()  # set of resolved URLs

    # Baidu web search + Baidu news search results
    baidu_url_list = [
        "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word,
        "https://www.baidu.com/s?ie=UTF-8&tn=news&wd=" + key_word
    ]
    for i in range(2):
        driver.get(baidu_url_list[i])
        # Collect the container of each search result, page by page
        for page in range(1, MAX_PAGEs + 1):
            BF1 = BF(driver.page_source, 'lxml')
            if i == 0:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile(".*c-container.*")})
            else:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile("result")})
            container_list.extend(page_container_list)
            driver.find_element_by_xpath("//*[text()='下一页>']").click()
            time.sleep(2)
        if i == 0:
            # Web results: follow the Baidu redirect once to get the real URL
            for container in container_list:
                href = container.find("h3").find("a").get("href")
                try:
                    baidu_url = requests.get(url=href,
                                             headers=headers,
                                             allow_redirects=False)
                except Exception:
                    continue
                real_url = baidu_url.headers['Location']  # original page address
                if real_url.startswith('http'):
                    real_url_set.add(real_url + '\n')
            container_list = []
        else:
            # News results: keep the direct link, skipping baijiahao posts
            for container in container_list:
                href = container.find("h3").find("a").get("href")
                if "baijiahao" not in href:
                    real_url_set.add(href)

    # Bing search results
    bing_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
    try:
        driver.get(bing_url)
    except Exception:
        driver.refresh()  # the page needs a refresh
        time.sleep(2)
        driver.refresh()
        time.sleep(5)
    for page in range(1, MAX_PAGEs + 1):
        BF1 = BF(driver.page_source)
        page_container_list = BF1.find("ol", {
            "id": "b_results"
        }).findAll("h2")
        for page_container in page_container_list:
            try:
                real_url_set.add(page_container.find("a").get('href'))
            except Exception:
                break
        try:
            driver.find_element_by_xpath(".//*[@title='下一页']").click()
        except Exception:
            driver.find_element_by_xpath(".//*[@title='Next page']").click()
        time.sleep(2)

    # Google web search plus Google News (a VPN is required)
    google_url_list = [
        "https://www.google.com.hk/search?q=" + key_word,
        "https://www.google.com/search?q={}&tbm=nws".format(key_word)
    ]
    for google_url in google_url_list:
        try:
            driver.get(google_url)
        except Exception:
            driver.refresh()  # the page needs a refresh
            time.sleep(2)
            driver.refresh()
            time.sleep(5)
            driver.get(google_url)
        for page in range(1, MAX_PAGEs + 1):
            BF1 = BF(driver.page_source)
            page_container_list = BF1.findAll("div", {"class": "g"})
            for page_container in page_container_list:
                try:
                    real_url_set.add(page_container.find("a").get('href'))
                except Exception:
                    break
            driver.find_element_by_xpath("//*[text()='下一页']").click()
            time.sleep(2)

    # Wikipedia (a VPN is required)
    Wikipedia_url = ("https://zh.wikipedia.org/w/index.php?search=" + key_word +
                     "&limit=100&ns0=1")
    driver.get(Wikipedia_url)
    BF1 = BF(driver.page_source)
    page_container_list = BF1.findAll(
        "div", {"class": "mw-search-result-heading"})
    for page_container in page_container_list:
        try:
            real_url_set.add("https://zh.wikipedia.org" +
                             page_container.find("a").get('href'))
        except Exception:
            break
    return real_url_set
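# Hedged usage sketch (assumed, not from the original source): get_url_set
# above is a method that still takes an explicit driver argument, so a caller
# would build both the hosting crawler object and a Selenium driver. The
# helper name save_urls and the output file are assumptions.
from selenium import webdriver


def save_urls(crawler, key_word, out_path="urls.txt"):
    # Collect URLs for one keyword and append them to a text file
    driver = webdriver.Chrome()
    try:
        urls = crawler.get_url_set(driver, key_word)
    finally:
        driver.quit()
    with open(out_path, "a", encoding="utf-8") as f:
        for url in urls:
            f.write(url if url.endswith("\n") else url + "\n")
    return urls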
headers = {
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
real_url_list = []
container_list = []
# Collect the container of each search result, page by page
for page in range(1, max_pages + 1):
    BF1 = BF(driver.page_source)
    page_container_list = BF1.findAll("div",
                                      {"class": re.compile(".*c-container.*")})
    container_list.extend(page_container_list)
    driver.find_element_by_xpath("//*[text()='下一页>']").click()
    time.sleep(2)
# get all URLs
for container in container_list:
    href = container.find('h3').find('a').get('href')
    try:
        baidu_url = requests.get(url=href,
                                 headers=headers,
                                 allow_redirects=False)
    except Exception:
        continue
import os
import requests
from bs4 import BeautifulSoup as BF

if __name__ == "__main__":  # main function
    # Set the url links
    urlGames = 'https://game20.gr/category/news/'
    urlMovies = 'https://www.subs4free.info/'
    urlBeta = 'https://www.allgamesdelta.net/'
    # Set my user agent. If you don't know your user agent, just google
    # "my user agent" and it will be shown as the first result
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
    # Get the whole page content
    pageGames = requests.get(urlGames, headers=headers)
    pageMovies = requests.get(urlMovies, headers=headers)
    pageBeta = requests.get(urlBeta, headers=headers)
    # Parse each page with lxml
    soupGames = BF(pageGames.content, 'lxml')
    soupMovies = BF(pageMovies.content, 'lxml')
    soupBeta = BF(pageBeta.content, 'lxml')
    sel = " body > div.wrapper > div.container > section > div.container-section > div.list-info > div.movies-info > div.movie-cont-right > div.panel-heading-info > a"
    # elementListMovies = soupMovies.select('a.headinglink')
    elementListMovies = soupMovies.findAll("div", {"class": "movie-info"})
    elementListGames = soupGames.findAll('article')
    elementListBeta = soupBeta.findAll("div", {"class": "post hentry"})
def get_url_set(self, key_word):
    container_list = []  # temporary result containers
    real_url_set = set()  # set of result URLs

    # CNN search results
    CNN_url = "https://edition.cnn.com/search?size=20&q=" + key_word
    self.driver.get(CNN_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div",
                                 {"class": "cnn-search__result-contents"})
    for container in container_list:
        try:
            href = container.find("h3").find("a").get("href")
        except Exception:
            continue
        real_url_set.add(href)

    # Baidu web search + Baidu news search results
    baidu_url_list = [
        "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word,
        "https://www.baidu.com/s?ie=UTF-8&tn=news&wd=" + key_word
    ]
    for i in range(2):
        self.driver.get(baidu_url_list[i])
        # Collect the container of each search result, page by page
        for page in range(1, self.MAX_PAGEs + 1):
            BF1 = BF(self.driver.page_source, 'lxml')
            if i == 0:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile(".*c-container.*")})
            else:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile("result")})
            container_list.extend(page_container_list)
            self.driver.find_element_by_xpath("//*[text()='下一页>']").click()
            time.sleep(2)
        if i == 0:
            # Web results: follow the Baidu redirect once to get the real URL
            for container in container_list:
                try:
                    href = container.find("h3").find("a").get("href")
                    baidu_url = requests.get(url=href,
                                             headers=self.headers,
                                             allow_redirects=False)
                except Exception:
                    continue
                real_url = baidu_url.headers['Location']  # original page address
                if real_url.startswith('http'):
                    real_url_set.add(real_url + '\n')
            container_list = []
        else:
            # News results: keep the direct link, skipping baijiahao posts
            for container in container_list:
                href = container.find("h3").find("a").get("href")
                if "baijiahao" not in href:
                    real_url_set.add(href)
    return real_url_set