def parse2(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    item['parcel_status'] = 'sold'
# ch21_27.py
import bs4, requests, re

url = 'http://www.taiwanlottery.com.tw'
html = requests.get(url)
print("Downloading page ...")
html.raise_for_status()                              # verify the page downloaded successfully
print("Page download complete")

objSoup = bs4.BeautifulSoup(html.text, 'lxml')       # create the BeautifulSoup object
dataTag = objSoup.select('.contents_box02')          # find elements whose class is contents_box02
print("List length", len(dataTag))
for i in range(len(dataTag)):                        # list the elements containing contents_box02
    print(dataTag[i])

pattern = r'\d+/\d+/\d+'
# find the balls for drawn order and sorted order
balls = dataTag[0].find_all('div', {'class': 'ball_tx ball_green'})
date = dataTag[0].find('span', {'class': 'font_black15'})
datelist = re.findall(pattern, str(date))
print('Super Lotto draw date: ' + datelist[0])
print("Drawn order : ", end='')
for i in range(6):                                   # the first 6 balls are the drawn order
    print(balls[i].text, end=' ')
print("\nSorted order : ", end='')
for i in range(6, len(balls)):                       # balls from the 7th on are in sorted order
    print(balls[i].text, end=' ')
# find the red ball of the second zone
import requests
import bs4

ress = requests.get('https://loksabha.nic.in/')
soup = bs4.BeautifulSoup(ress.text, 'html.parser')
s = soup.find_all('div', {'class': 'update'})
for count in s:
    print(count.findChild('ul').text)
import urllib.request
import bs4
import time
import sqlite3

## Variable declarations ##
con, cur = None, None
data1, data2, data3 = "", "", ""
sql = ""

## Main code ##
while True:
    url = "http://news.naver.com/"
    html = urllib.request.urlopen(url)
    bs_obj = bs4.BeautifulSoup(html, "html.parser")
    hdline_article_list = bs_obj.find("ul", {"class": "hdline_article_list"})
    lis = hdline_article_list.findAll("li")
    con = sqlite3.connect("../sqlite-tools-win32-x86-3300100/naverDB")
    cur = con.cursor()
    for li in lis:
        a = li.find("a")
        a.text.strip()
        data1 = "정치"                       # news category ("정치" = politics)
        data2 = a.text.strip()
        data2 = data2.replace("'", "\"")     # news title
        data3 = "ydgil"                      # author
def test_selector_build():
    # Simple case
    source = """<html><body><div class="hi"></div></body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select("div")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div"
    assert selector.xpath == "/html/body/div"
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div"
    assert selector.xpath == "/html/body/div"

    # Complex nesting
    source = """<html><body>
    <a></a>
    <div><a></a></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    elements = soup.body.select("a")
    selector = Selector.build(soup, elements[0])
    assert selector.css == "html>body>a"
    assert selector.xpath == "/html/body/a"
    selector = Selector.build(soup, elements[1])
    assert selector.css == "html>body>div>a"
    assert selector.xpath == "/html/body/div/a"

    # Deeply nested
    source = """<html><body>
    <div class="a" wtl-uid="12"><div><div class="b"><div class="c"><div class="d"><div class="e"><div class="f">
    <span>Hi</span>
    </div></div></div></div></div></div></div>
    <div class="a"><div><div class="b"><div class="c"><div class="d"><div class="f">
    <span>Howdy</span>
    </div></div></div></div></div></div></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select(".e span")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(1)>div>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[1]/div/div/div/div/div/div/span"
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(1)>div>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[1]/div/div/div/div/div/div/span"
    element = soup.body.select(".d > .f > span")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>div:nth-of-type(2)>div>div>div>div>div>span"
    assert selector.xpath == "/html/body/div[2]/div/div/div/div/div/span"
    selector = Selector.build(soup, 12)
    assert selector.css == "html>body>div:nth-of-type(1)"
    assert selector.xpath == "/html/body/div[1]"

    # Unsafe names
    source = """<html><body>
    <div:nonstandard><a></a></div>
    </body></html>"""
    soup = bs4.BeautifulSoup(source, "html5lib")
    element = soup.body.select("a")[0]
    selector = Selector.build(soup, element)
    assert selector.css == "html>body>*>a"
    assert selector.xpath == "/html/body/*/a"

    # Invalid WTL-uid
    selector = Selector.build(soup, 23)
    assert selector.css == "bad_wtl_uid_no_matches"
    assert selector.xpath == "bad_wtl_uid_no_matches"
def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    self._inline_scripts = None
    self._scripts = None
def get_soup(self, url):
    # Open the URL once, parse the response, then close that same handle
    # (the original opened the URL a second time just to close it).
    resp = req.urlopen(url)
    ret = bs4.BeautifulSoup(resp.read(), 'lxml')
    resp.close()
    return ret
gross = []  # 19
mv_attributs = (names, years, imdb_ratings, metascores, votes, categories, mv_pages,
                genre1, genre2, genre3, stars1, stars2, stars3, rank, nb_oscar,
                win, nom, runtime, budget, gross)

# TEST POSSIBILITIES :
page_link = "https://www.imdb.com/title/tt7286456/"    # oscar win nom
# page_link = "https://www.imdb.com/title/tt0120903/"  # win nom
# page_link = "https://www.imdb.com/title/tt6914122/"  # nom
# page_link = "https://www.imdb.com/title/tt8201852/"  # Empty
# page_link = "https://www.imdb.com/title/tt2017038/"  # 1 Star

response = requests.get(page_link)
html = bs4.BeautifulSoup(response.text, 'html.parser')

nb_genre = 0
# get the movie genres
div = html.find('div', class_="subtext")
# test_genre = False
for a in div.find_all('a'):
    # test_genre = False
    title = a.get('title')  # there is a title attribute on one link, which we do not want
    if title is None:
        mv_attributs[7 + nb_genre].append(a.text)
        # test_genre = True
        nb_genre += 1
if nb_genre == 1:
    mv_attributs[8].append(None)
import zipfile
comp_file = zipfile.ZipFile('comp_file.zip', 'w')   # note: the class is ZipFile, not Zipfile

import requests
import bs4

result = requests.get("http://example.com")
type(result)
result.text

soup = bs4.BeautifulSoup(result.text, "lxml")
soup
soup.select('title')[0].getText()
site_para = soup.select("p")
site_para[0]

res = requests.get('http://en.wikipedia.org/wiki/Grace_Hopper')
soup = bs4.BeautifulSoup(res.text, "lxml")
first_item = soup.select('.toctext')[0]
def prettify(raw_html: str):
    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    print(soup.prettify())
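# A small, hedged usage sketch for prettify() above: fetching the page with requests and
# the example.com URL are assumptions made here for illustration; any HTML string works.
if __name__ == "__main__":
    import requests

    resp = requests.get("http://example.com")
    resp.raise_for_status()
    prettify(resp.text)   # prints the page's markup with indentation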
def get_splits(
    playerid: str,
    year: Optional[int] = None,
    player_info: bool = False,
    pitching_splits: bool = False
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
    """
    Returns a dataframe of all split stats for a given player.
    If player_info is True, this will also return a dictionary that includes
    player position, handedness, height, weight, position, and team
    """
    soup = get_split_soup(playerid, year, pitching_splits)
    # the splits tables on the bbref site are all within an embedded comment.
    # This finds all the comments
    comment = soup.find_all(text=lambda text: isinstance(text, bs.Comment))
    data = []
    level_data = []
    for i in range(len(comment)):
        commentsoup = bs.BeautifulSoup(comment[i], 'lxml')
        split_tables = commentsoup.find_all("div", {"class": "table_container"})
        splits = [ele for ele in split_tables]
        headings = []
        level_headings = []
        for j in range(len(splits)):
            split_type = splits[j].find_all('caption')[0].string.strip()
            # two types of tables on bref, game level and non-game level
            if split_type[-5:] == 'Level':
                if year is None:
                    # The bbref tables for career splits have one extra preceding th column
                    # labeled 'I' that is not used and is not in the single season records
                    level_headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][1:]
                else:
                    level_headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][:]
                level_headings.append('Split Type')
                level_headings.append('Player ID')
                # singles data isn't included in the tables so this appends the column header
                level_headings.append('1B')
                level_data.append(level_headings)
                rows = splits[j].find_all('tr')
                for row in rows:
                    if year is None:
                        # Career splits have the extra unused 'I' th column
                        level_cols = row.find_all('td')
                    else:
                        level_cols = row.find_all(['th', 'td'])
                    level_cols = [ele.text.strip() for ele in level_cols]
                    if split_type != "By Inning":
                        # bbref added three empty columns to the by inning tables that don't
                        # match the rest of the tables. Not including this split table in results
                        level_cols.append(split_type)
                        level_cols.append(playerid)
                        level_data.append([ele for ele in level_cols])
            else:
                if year is None:
                    # Career splits have the extra unused 'I' th column
                    headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][1:]
                else:
                    headings = [
                        th.get_text() for th in splits[j].find("tr").find_all("th")
                    ][:]
                headings.append('Split Type')
                headings.append('Player ID')
                # singles data isn't included in the tables so this appends the column header
                headings.append('1B')
                data.append(headings)
                rows = splits[j].find_all('tr')
                for row in rows:
                    if year is None:
                        # Career splits have the extra unused 'I' th column
                        cols = row.find_all('td')
                    else:
                        cols = row.find_all(['th', 'td'])
                    cols = [ele.text.strip() for ele in cols]
                    if split_type != "By Inning":
                        # bbref added three empty columns to the by inning tables that don't
                        # match the rest of the tables. Not including this split table in results
                        cols.append(split_type)
                        cols.append(playerid)
                        data.append([ele for ele in cols])
    data = pd.DataFrame(data)
    data = data.rename(columns=data.iloc[0])
    data = data.reindex(data.index.drop(0))
    data = data.set_index(['Player ID', 'Split Type', 'Split'])
    data = data.drop(index=['Split'], level=2)
    data = data.apply(pd.to_numeric, errors='coerce').convert_dtypes()
    data = data.dropna(axis=1, how='all')
    data['1B'] = data['H'] - data['2B'] - data['3B'] - data['HR']
    data = data.loc[playerid]
    if pitching_splits is True:
        # Returns Game Level tables as a second dataframe for pitching splits
        level_data = pd.DataFrame(level_data)
        level_data = level_data.rename(columns=level_data.iloc[0])
        level_data = level_data.reindex(level_data.index.drop(0))
        level_data = level_data.set_index(['Player ID', 'Split Type', 'Split'])
        level_data = level_data.drop(index=['Split'], level=2)
        level_data = level_data.apply(pd.to_numeric, errors='coerce').convert_dtypes()
        level_data = level_data.dropna(axis=1, how='all')
        level_data = level_data.loc[playerid]
        # data = pd.concat([data, level_data])
    if player_info is False:
        if pitching_splits is True:
            return data, level_data
        else:
            return data
    else:
        player_info_data = get_player_info(playerid=playerid, soup=soup)
        if pitching_splits is True:
            return data, player_info_data, level_data
        else:
            return data, player_info_data
def get_id(raw_html: str, tag_id: str) -> bs4.ResultSet:
    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    return soup.find_all("div", id=tag_id)
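# Hedged usage sketch for get_id(): the sample HTML and id below are made up for
# illustration; get_id() returns a bs4.ResultSet of the matching <div> tags.
sample_html = '<div id="content">hello</div><div id="footer">bye</div>'
for tag in get_id(sample_html, "content"):
    print(tag.get_text())   # -> hello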
def call_rec(sub, vid_id, seek_time):
    print("SEEK_TIME:" + seek_time)
    seek_time = int(seek_time)
    topic = sub.split()[0].lower()
    # nltk.download('punkt')
    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('stopwords')
    transcript_entries = YouTubeTranscriptApi.get_transcript(vid_id, languages=['en'])
    transcript = ''
    for i in range(len(transcript_entries)):
        if transcript_entries[i]['start'] < seek_time:
            transcript = transcript + ' ' + transcript_entries[i]['text']
        else:
            break
    print(transcript)

    p = wikipedia.page(sub)
    # print(p.url)
    # print(p.title)
    content = p.content
    stop_words = set(stopwords.words('english'))
    text = content + transcript
    text = ' '.join([word.lower() for word in text.split()
                     if word.lower() not in stop_words and len(word) > 2])
    # print('the' in text.split())
    data = []
    from nltk.tokenize import sent_tokenize, word_tokenize

    # iterate through each sentence in the file
    f = text.replace("\n", " ").replace(",", "").replace("(", "").replace(")", "").replace(";", "")
    for i in sent_tokenize(f):
        temp = []
        # tokenize the sentence into words
        for j in word_tokenize(i):
            if j.isalpha() and j.lower() not in stop_words:
                temp.append(j.lower())
        data.append(temp)
    # print('the' in data)

    # Create CBOW model
    model1 = Word2Vec(data, min_count=1, size=100, window=10)
    model1.train(data, total_examples=1, epochs=50)
    # print("the" in model1.wv.vocab)

    topic_relevant = []
    for t in model1.wv.most_similar(topic):
        topic_relevant.append(t[0])
    # print(topic_relevant)

    about_topics = ''
    for topics in topic_relevant:
        # print("***" + topics)
        response = requests.get("https://en.wikipedia.org/wiki/" + topics)
        about_topics += topics + ' :'
        if response is not None:
            html = bs4.BeautifulSoup(response.text, 'html.parser')
            paragraphs = html.select("p")
            # print(wikipedia.page(topics).content)
            for para in paragraphs:
                # print("##########################")
                # print(para.text)
                if len(para.text.split()) > 20:
                    about_topics = about_topics + para.text
                    break
        about_topics = about_topics + '\n'
        response.close()
    print(topic_relevant)
    return about_topics
import bs4 as bs
import urllib.request

url = 'http://lishi.tianqi.com/shantou/201101.html'
source = urllib.request.urlopen(url)
soup = bs.BeautifulSoup(source, 'html.parser')
uls = soup.find_all('ul')
for ul in uls:
    lis = ul.find_all('li')
    if len(lis[0].text) == 10:
        for li in lis:
            print(li.text)
def _get_image(url: str) -> tuple:
    with session.get(url) as res:
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        img = soup("div", class_="storyContent")[-1].img
        return (img["title"], img["src"])
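# Hedged usage sketch for _get_image(): the function relies on a module-level
# requests.Session named `session`; the Session created here and the story URL are
# illustrative assumptions, and a real page containing a div.storyContent is needed.
import requests

session = requests.Session()                             # assumed shared session
title, src = _get_image("https://example.com/story")     # placeholder URL
print(title, src)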
def try_parse_html(html, **kwargs):
    try:
        return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
    except HTMLParseError:
        return None
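# Hedged usage example for try_parse_html(): html.parser is lenient, so even malformed
# markup usually parses and a soup (rather than None) comes back.
soup = try_parse_html("<p>unclosed paragraph")
if soup is not None:
    print(soup.p.get_text())   # -> unclosed paragraph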
#! python3
# downloadXckd.py - downloads all the comics from a website

import requests, os, bs4

url = 'http://xkcd.com'                  # starting url
os.makedirs('xkcd', exist_ok=True)       # store comics in ./xkcd and make sure the folder exists

while not url.endswith('#'):             # while the url does not end with the string '#'
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()               # make sure there was no error fetching the url
    soup = bs4.BeautifulSoup(res.text, "html.parser")   # parse the html, looking for the parts we need

    comicElem = soup.select('#comic img')    # look for the image url
    if comicElem == []:
        print('Could not find comic image.')  # if nothing is found, report it
    else:
        comicUrl = comicElem[0].get('src')
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()
        # if found, the image is saved into the chosen directory; at the end of each
        # iteration the url is checked with raise_for_status() to avoid errors
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
add_arg('--html-tag', default='a', type=str, help='html tag you want to parse')
add_arg('--unique-id', default='', type=str, help='A common string in your links')
add_arg('--output-file', default='output_file.txt', type=str,
        help='A file to write to when stdout is activated')

args = parser.parse_args()
parsed_url = urlparse(*args.url)
domain = '{uri.netloc}'.format(uri=parsed_url)
url = Request(*args.url, headers=headers)
bs4_data = bs4.BeautifulSoup(urllib.request.urlopen(url), "lxml")
urls_list = []

if args.html_tag == 'a':
    href = 'href'
    for tag in bs4_data.find_all(args.html_tag):
        if args.unique_id:
            # only keep links whose markup contains the requested identifier
            if args.unique_id in str(tag):
                urls_list.append(str(tag['href']))
        else:
            urls_list.append(str(tag['href']))
else:
    for tag in bs4_data.find_all(args.html_tag):
        if args.unique_id in str(tag):
            urls_list.append(str(tag))
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(six.text_type(self._soup))

    # Remove doctype.
    for x in soup.contents:
        if isinstance(x, bs4.Doctype):
            x.extract()

    # Remove declaration.
    for x in soup.contents:
        if isinstance(x, bs4.Declaration):
            x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
        imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
        script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
        script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
        html = controller.GetHTMLForInlineStylesheet(six.text_type(style.string))
        if html:
            ns = soup.new_tag('style')
            ns.append(bs4.NavigableString(html))
            style.replaceWith(ns)
        else:
            style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if html:
            tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
            assert len(tmp) == 1
            stylesheet_link.replaceWith(tmp[0])
        else:
            stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
        comments = soup.findAll(
            text=lambda text: isinstance(text, bs4.Comment))
        for comment in comments:
            comment.extract()

    if prettify:
        return soup.prettify('utf-8').strip()

    # We are done.
    return six.text_type(soup).strip()
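# Hedged sketch of the controller interface GenerateHTML() depends on: only the two hook
# methods used above are modeled, and this stub is illustrative, not the real controller.
class _StubController:
    def GetHTMLForInlineStylesheet(self, css_text):
        # Return the (possibly rewritten) CSS to keep the <style> block, or a falsy
        # value to have the block removed.
        return css_text

    def GetHTMLForStylesheetHRef(self, href):
        # Return inline HTML to replace the <link rel="stylesheet">, or None to drop it.
        return None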
import requests
import bs4

res = requests.get('https://google.com/search?q=' + 'loyalty program million')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
linkElements = soup.select('.r a')
linkToSave = min(10, len(linkElements))
for i in range(linkToSave):
    with open('Links.txt', 'a+') as fo:
        # the with statement closes the file automatically
        fo.write('https://google.com' + linkElements[i].get('href') + '\n')
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.

import requests, os, bs4

url = 'http://xkcd.com'               # starting url
os.makedirs('xkcd', exist_ok=True)    # store comics in ./xkcd
while not url.endswith('#'):
    # Download the page
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    if comicElem == []:
        print('Could not find comic image')
    else:
        comicUrl = 'http:' + comicElem[0].get('src')
        # Download the image
        print('Downloading image %s' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()

        # Save the image to ./xkcd
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
def scraper(self):
    df = pd.read_csv(self.input_file)
    for idx, row in df['Facebook Page ID'].iteritems():
        print(idx)
        dic = {'id': '', 'about': '', 'products': '', 'web_url': '',
               'web_url1': '', 'web_url2': '', 'category': ''}
        fb_id = str(row)
        fb_id = fb_id.split("'")
        fb_id = str(fb_id[1])
        print(fb_id)
        dic['id'] = fb_id
        url = "https://www.facebook.com/{}/about/".format(fb_id)
        header = {'accept-language': 'en-US,en;q=0.9'}
        try:
            resp = requests.get(url, headers=header)
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            obj_list = soup.find_all('div', {'class': ['_50f4', '_3-8w']})
            for val in obj_list:
                try:
                    if 'About'.lower() in val.getText().strip().lower():
                        about_data = val.find_next().getText()
                        print(about_data)
                        dic['about'] = about_data
                except Exception as e:
                    print(e)
            for val1 in obj_list:
                try:
                    if 'Products'.lower() in val1.getText().strip().lower():
                        product_data = val1.find_next().getText()
                        print(product_data)
                        dic['products'] = product_data
                except Exception as e:
                    print(e)
            try:
                data = Selector(text=resp.text)
                web_url = data.xpath('//*[@id="u_0_p"]/div')
                web_url = web_url.get('data')
                web_url = str(web_url)
                web_url = web_url.split(">")
                web_url = str(web_url[1])
                web_url = web_url.split("<")
                web_url = web_url[0]
                print(web_url)
                dic['web_url'] = web_url
            except Exception as e:
                print(e)
            try:
                data1 = Selector(text=resp.text)
                web_url1 = data1.xpath('//*[@id="u_0_q"]/div')
                web_url1 = web_url1.get('data')
                web_url1 = str(web_url1)
                web_url1 = web_url1.split(">")
                web_url1 = str(web_url1[1])
                web_url1 = web_url1.split("<")
                web_url1 = web_url1[0]
                print(web_url1)
                dic['web_url1'] = web_url1
            except Exception as e:
                print(e)
            try:
                data2 = Selector(text=resp.text)
                web_url2 = data2.xpath('//*[@id="u_0_o"]/div')
                web_url2 = web_url2.get('data')
                web_url2 = str(web_url2)
                web_url2 = web_url2.split(">")
                web_url2 = str(web_url2[1])
                web_url2 = web_url2.split("<")
                web_url2 = web_url2[0]
                print(web_url2)
                dic['web_url2'] = web_url2
            except Exception as e:
                print(e)
            try:
                category_regex = r'\/pages\/category\/[0-9A-z-]+'
                regex_compile = re.compile(category_regex)
                search_category = regex_compile.findall(resp.text)[0]
                search_category = str(search_category)
                search_category = search_category.split("/")
                search_category = str(search_category[3])
                print(search_category)
                dic['category'] = search_category
            except Exception as e:
                print(e)
        except Exception as e:
            print(e)
        sleep(randint(5, 8))
        with open('/home/praveen/Working_files/Social_bakers_collection/Indian_top_facebook_brand_output.json', 'a') as output:
            json.dump(dic, output)
            output.write('\n')
async def apk(e):
    approved_userss = approved_users.find({})
    for ch in approved_userss:
        iid = ch["id"]
        userss = ch["user"]
    if e.is_group:
        if await is_register_admin(e.input_chat, e.message.sender_id):
            pass
        elif e.chat_id == iid and e.sender_id == userss:
            pass
        else:
            return
    try:
        app_name = e.pattern_match.group(1)
        remove_space = app_name.split(" ")
        final_name = "+".join(remove_space)
        page = requests.get(
            "https://play.google.com/store/search?q=" + final_name + "&c=apps"
        )
        lnk = str(page.status_code)
        soup = bs4.BeautifulSoup(page.content, "lxml", from_encoding="utf-8")
        results = soup.findAll("div", "ZmHEEd")
        app_name = (
            results[0].findNext("div", "Vpfmgd").findNext("div", "WsMG1c nnK0zc").text
        )
        app_dev = results[0].findNext("div", "Vpfmgd").findNext("div", "KoLSrc").text
        app_dev_link = (
            "https://play.google.com"
            + results[0].findNext("div", "Vpfmgd").findNext("a", "mnKHRc")["href"]
        )
        app_rating = (
            results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "pf5lIe")
            .find("div")["aria-label"]
        )
        app_link = (
            "https://play.google.com"
            + results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "vU6FJ p63iDd")
            .a["href"]
        )
        app_icon = (
            results[0]
            .findNext("div", "Vpfmgd")
            .findNext("div", "uzcko")
            .img["data-src"]
        )
        app_details = "<a href='" + app_icon + "'>📲</a>"
        app_details += " <b>" + app_name + "</b>"
        app_details += (
            "\n\n<code>Developer :</code> <a href='" + app_dev_link + "'>" + app_dev + "</a>"
        )
        app_details += "\n<code>Rating :</code> " + app_rating.replace(
            "Rated ", "⭐ "
        ).replace(" out of ", "/").replace(" stars", "", 1).replace(
            " stars", "⭐ "
        ).replace(
            "five", "5"
        )
        app_details += (
            "\n<code>Features :</code> <a href='" + app_link + "'>View in Play Store</a>"
        )
        app_details += "\n\n===> @MissJuliaRobot <==="
        await e.reply(app_details, link_preview=True, parse_mode="HTML")
    except IndexError:
        await e.reply("No result found in search. Please enter **Valid app name**")
    except Exception as err:
        await e.reply("Exception Occurred:- " + str(err))
import codecs
import googlemaps
import os
import sys
import zipfile
import bs4

# GET NAME OF FILE
name = sys.argv[1]

# EXTRACT KMZ
with zipfile.ZipFile(name, "r") as zipper:
    zipper.extractall("")

# OPEN FILE
data = codecs.open('doc.kml', encoding='utf-8').read()
os.remove('doc.kml')

# PARSING XHTML
doc = bs4.BeautifulSoup(data, 'html.parser')
name = doc.find('placemark').find('name').text
coords = doc.find('placemark').find('point').find('coordinates').text.split(',')
coords = (float(coords[1]), float(coords[0]))

# GETTING ELEVATION
client = googlemaps.Client(key=open(r'Data\elevation key.txt').read())
elevation = googlemaps.elevation.elevation(client, locations=coords)[0]['elevation']

# PRINT RESULT
print("Name: " + name)
print("Elevation: " + str(elevation))
input()
import bs4 as bs
import urllib.request
# MADE BY RANJITH

home = urllib.request.urlopen('http://www.vg.no/').read()
soup = bs.BeautifulSoup(home, 'lxml')

list = []
main_li = []


def scraper():
    def article_list(list):
        for article in soup.find_all('div', class_='article-content'):
            link = article.find('a')
            try:
                link = link.get('href')
                if "http" not in link:
                    if 'nyheter/' in link:
                        list.append("http://www.vg.no" + link)
            except:
                pass

    def article_read(article):
        site = urllib.request.urlopen(article).read()
        soup = bs.BeautifulSoup(site, 'lxml')

        def article_title():
            header = soup.find('div', class_='reg-grid-full')
            title = header.find('h1', class_='main-title')
            title = title.text.strip()
            try:
import bs4 as bs
from urllib.request import Request, urlopen
import pandas as pd
import datetime
import csv

now = str(datetime.datetime.now())[:10]
sauce = Request('https://www.the303columbus.com/floorplans.aspx',
                headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(sauce).read()
soup = bs.BeautifulSoup(webpage, 'lxml')
table = soup.find_all('table')

myrow = []
for tr in table:
    td = tr.find_all('td')
    row = [i.text for i in td]
    myrow.append(row)

myrow = pd.DataFrame(myrow)
myrow = myrow.drop(myrow.columns[[0, 2, 4, 6, 8, 9, 10, 11, 12, 13]], axis=1)
myrow.columns = ['bed', 'bath', 'SQ.FT', 'rent']
myrow.index = ['Glenn', 'Nicklaus', 'Allen', 'Stine', 'Oakley', 'Campbell']
myrow.index.name = now

filename = datetime.datetime.now().strftime('columbus303-%Y-%m-%d.csv')
myrow.to_csv(filename)
def _download_data(self, *, session: Optional[requests.Session] = None) -> _TopcoderData:
    session = session or utils.get_default_session()

    # download HTML
    url = 'https://community.topcoder.com/stat?c=problem_statement&pm={}'.format(self.problem_id)
    resp = utils.request('GET', url, session=session)

    # parse HTML
    soup = bs4.BeautifulSoup(resp.content.decode(resp.encoding), utils.html_parser)
    problem_texts = soup.find_all('td', class_='problemText')
    if len(problem_texts) != 1:
        raise SampleParseError("""<td class="problemText"> is not found or not unique""")
    problem_text = problem_texts[0]

    # parse Definition section
    # format:
    #     <tr>...<h3>Definition</h3>...<tr>
    #     <tr><td>...</td>
    #         <td><table>
    #             ...
    #             <tr><td>Class:</td><td>...</td></tr>
    #             <tr><td>Method:</td><td>...</td></tr>
    #             ...
    #         </table></td></tr>
    log.debug('parse Definition section')
    h3 = problem_text.find('h3', text='Definition')
    if h3 is None:
        raise SampleParseError("""<h3>Definition</h3> is not found""")
    definition = {}
    for text, key in {
        'Class:': 'class',
        'Method:': 'method',
        'Parameters:': 'parameters',
        'Returns:': 'returns',
        'Method signature:': 'method_signature',
    }.items():
        td = h3.parent.parent.next_sibling.find('td', class_='statText', text=text)
        log.debug('%s', td.parent)
        definition[key] = td.next_sibling.string

    # parse Examples section
    # format:
    #     <tr>...<h3>Examples</h3>...<tr>
    #     <tr><td>0)</td><td></td></tr>
    #     <tr><td></td>
    #         <td><table>
    #             ...
    #             <pre>{5, 8}</pre>
    #             <pre>"foo"</pre>
    #             <pre>3.5</pre>
    #             <pre>Returns: 40.0</pre>
    #             ...
    #         </table></td></tr>
    #     <tr><td>1)</td><td></td></tr>
    #     ...
    log.debug('parse Examples section')
    h3 = problem_text.find('h3', text='Examples')
    if h3 is None:
        raise SampleParseError("""<h3>Examples</h3> is not found""")
    raw_sample_cases = []  # type: List[Tuple[List[str], str]]
    cursor = h3.parent.parent
    while True:
        # read the header like "0)"
        cursor = cursor.next_sibling
        log.debug('%s', cursor)
        if not cursor or cursor.name != 'tr':
            break
        if cursor.find('td').string != '{})'.format(len(raw_sample_cases)):
            raise SampleParseError("""<td ...>{})</td> is expected, but not found""".format(len(raw_sample_cases)))

        # collect <pre>s
        cursor = cursor.next_sibling
        log.debug('%s', cursor)
        if not cursor or cursor.name != 'tr':
            raise SampleParseError("""<tr>...</tr> is expected, but not found""")
        input_items = []
        for pre in cursor.find_all('pre'):
            marker = 'Returns: '
            if pre.string.startswith(marker):
                output_item = pre.string[len(marker):]
                break
            else:
                input_items.append(pre.string)
        else:
            raise SampleParseError("""<pre>Returns: ...</pre> is expected, but not found""")
        raw_sample_cases.append((input_items, output_item))

    # convert samples cases to the Greed format
    sample_cases = []
    for i, (input_items, output_item) in enumerate(raw_sample_cases):
        sample_cases.append(TestCase(
            'example-{}'.format(i),
            'input',
            ('\n'.join(map(_convert_to_greed, input_items)) + '\n').encode(),
            'output',
            (_convert_to_greed(output_item) + '\n').encode(),
        ))

    return _TopcoderData(definition=definition, raw_sample_cases=raw_sample_cases, sample_cases=sample_cases)
#! /usr/bin/python3
# Open several Google search results.

import requests, sys, webbrowser, bs4

print('Googling...')    # display text while downloading the Google page
res = requests.get('http://google.com/search?q=' + ' '.join(sys.argv[1:]))
res.raise_for_status()

# Retrieve top search result links.
soup = bs4.BeautifulSoup(res.text, "lxml")

# Open a browser tab for each result.
linkElems = soup.select('.r a')
numOpen = min(5, len(linkElems))
for i in range(numOpen):
    webbrowser.open('http://google.com' + linkElems[i].get('href'))
import pandas as pd
import requests
import bs4

#%%
add_1 = 'https://en.wikipedia.org/wiki/Lists_of_writers'
# page = requests.get('https://www.newswire.ca/news/air-canada?page=1&pagesize=100')
add_2 = 'https://www.newswire.ca/news/air-canada?page=1&pagesize=100'
add_3 = 'https://www.cision.ca/resources/'

page = requests.get(add_2)
# soup = bs4.BeautifulSoup(page)
soup = bs4.BeautifulSoup(page.content, 'html.parser')
names = soup.findAll('a')

#%%
BaseURL = 'https://www.newswire.ca'
Titr = []
URL = []
for i in range(1, 11):
    add = 'https://www.newswire.ca/news/air-canada?page=' + str(i) + '&pagesize=100'
    page = requests.get(add)
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    names = soup.findAll('a')
    print(add)
    for name in names:
        try:
#! /usr/local/bin/python3

import bs4

exampleFile = open('example.html')
exampleSoup = bs4.BeautifulSoup(exampleFile.read(), "lxml")
print(type(exampleSoup))       # <class 'bs4.BeautifulSoup'>

elems = exampleSoup.select('#author')
print(type(elems))             # <class 'list'>
print(len(elems))              # 1
print(type(elems[0]))          # <class 'bs4.element.Tag'>
print(str(elems[0]))           # '<span id="author">Al Sweigart</span>'
print(elems[0].getText())      # 'Al Sweigart'
print(elems[0].attrs)          # {'id': 'author'}
print()