def get_price(code_number, headers, start_year=2000, end_year=2021):
    """Fetch yearly price tables for a stock code from kabuoji3.com.

    Args:
        code_number: Stock ticker code used in the site URL.
        headers: HTTP headers passed to ``requests.get`` (e.g. User-Agent).
        start_year: First year to fetch, inclusive. Defaults to 2000.
        end_year: End of the year range, exclusive. Defaults to 2021.

    Returns:
        One DataFrame concatenating every year that returned data, with the
        numeric columns cast to float and a fresh integer index.

    Raises:
        ValueError: If no year yielded any data (nothing to concatenate).
    """
    dfs = []
    for y in range(start_year, end_year):
        url = "https://kabuoji3.com/stock/{}/{}/".format(code_number, y)
        soup = BeautifulSoup(
            requests.get(url, headers=headers).content, "html.parser")
        tag_tr = soup.find_all("tr")
        # A year with no listing has no table rows; skip it instead of
        # silently swallowing an IndexError around the whole body.
        if not tag_tr:
            continue
        head = [h.text for h in tag_tr[0].find_all("th")]
        rows = [[d.text for d in tr.find_all("td")] for tr in tag_tr[1:]]
        df = pd.DataFrame(rows, columns=head)
        # Numeric columns arrive as text; cast for downstream arithmetic.
        for c in ["始値", "高値", "安値", "終値", "出来高", "終値調整"]:
            df[c] = df[c].astype(float)
        dfs.append(df)
    if not dfs:
        # pd.concat([]) would raise ValueError anyway; raise a clearer one.
        raise ValueError(
            "no price data found for code {}".format(code_number))
    return pd.concat(dfs, axis=0).reset_index(drop=True)
def geturl(self, webpage, key=None):
    """Extract every <a> href from *webpage* and queue it on dlLinksNext.

    The page bytes are transcoded from GBK to UTF-8 before parsing. If
    *key* is given, only links whose tag text contains *key* are queued.

    NOTE(review): Python 2 code — relies on the ``unicode`` builtin and
    the module-global ``dlLinksNext`` queue.

    Args:
        webpage: Raw page bytes, assumed GBK-encoded.
        key: Optional substring filter applied to the whole <a> tag text.
    """
    global dlLinksNext
    try:
        webpage = unicode(webpage, 'gbk').encode('utf-8')
        soup = BeautifulSoup(webpage)
        for link in soup.findAll('a'):
            # No key means take every link; otherwise filter by tag text.
            if not key or key in str(link):
                dlLinksNext.put(link.get('href'))
    except (UnicodeDecodeError, UnicodeEncodeError) as exc:
        # Log the actual exception class: the original handlers were
        # duplicated and mislabeled encode failures as 'UnicodeDecodeError'.
        self.loger.logInfo(type(exc).__name__)
"""Fetch the Yelp SF page and print its status, markup, and links."""
import requests
# Fixed import: the BeautifulSoup 4 package is named 'bs4', not 'bs3'.
from bs4 import BeautifulSoup

url = "https://www.yelp.com/sf"

yelp_r = requests.get(url)
print(yelp_r.status_code)  # should be 200

yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
print(yelp_soup.prettify())

# Dump all anchors once as a list, then one per line.
print(yelp_soup.findAll('a'))
for link in yelp_soup.findAll('a'):
    print(link)
"""Plot October 2019 Wikipedia pageviews for the Jambon government members."""
# Fixed import: the BeautifulSoup 4 package is named 'bs4', not 'bs3'.
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt

raw_html = requests.get('https://nl.wikipedia.org/wiki/Regering-Jambon').text
html = BeautifulSoup(raw_html, 'html.parser')

# The second cell of each wikitable row holds the minister's article title;
# spaces become underscores to form the Wikipedia page name.
leden = [a.text.replace(' ', '_').strip()
         for a in html.select('table.wikitable tr td:nth-child(2)')]

views = []
for lid in leden:
    # Wikimedia REST API: daily per-article pageviews for October 2019
    # (bounds are YYYYMMDD / YYYYMMDDHH timestamps).
    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/nl.wikipedia/all-access/all-agents/{}/daily/20191001/2019103100"
        .format(lid))
    data = r.json()
    views.append(sum(item['views'] for item in data['items']))
    # print("{}".format(lid.replace('_', ' ') + ' ' + str(count)))

plt.style.use('seaborn-poster')
plt.title('Wikipedia page visits in Oct.')
plt.xlabel('Flemish minister')
plt.ylabel('Visits')
# plt.annotate(xy=[0, 1], s=str(14000))
# The original hard-coded the label list and was truncated mid-statement;
# label bars from the page titles actually scraped so both stay in sync.
plt.bar([lid.replace('_', ' ') for lid in leden], views)
plt.show()
class WebScrape:
    """Scraper for Lazada PH category listing pages; writes rows to CSV."""

    # CSV column layout; _parse_card appends values in exactly this order.
    FIELDNAMES = ["url", "product_name", "product_header",
                  "product_category", "product_price", "product_sale",
                  "product_old", "installment", "rating"]

    def __init__(self):
        print("WebScrape Imported")

    def lazada_scrape(self, head, category, url):
        """Scrape every page of a Lazada category into '<category>.csv'.

        Args:
            head: Value stored in the 'product_header' column of each row.
            category: Category label; also names the output CSV file.
            url: Category path segment appended to the Lazada base URL.
        """
        base = "http://www.lazada.com.ph/" + url + "/"
        soup = self._get_soup(base)
        # The sixth pager anchor holds the last page number.
        max_page = int(
            soup.select("span.pages > a:nth-of-type(6)")[0].get_text())

        list_of_rows = []
        for page in range(1, max_page + 1):
            print(page)
            # BUG FIX: the original paginated a hard-coded 'shop-mobiles'
            # URL regardless of the requested category.
            page_soup = self._get_soup(base + "?page=" + str(page))
            for div in page_soup.find_all("div", {"class": "product-card"}):
                list_of_rows.append(self._parse_card(div, head, category))

        # 'with' guarantees the file is closed; the original opened it and
        # never closed it. Header via DictWriter and rows via a QUOTE_ALL
        # writer, matching the original output format exactly.
        with open(category + ".csv", 'w', newline='') as myfile:
            writer = csv.DictWriter(
                myfile, fieldnames=self.FIELDNAMES, delimiter=',')
            writer.writeheader()
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerows(list_of_rows)

    def _get_soup(self, url):
        """Fetch *url* and return the parsed BeautifulSoup document."""
        return BeautifulSoup(requests.get(url).text, 'html.parser')

    def _parse_card(self, div, head, category):
        """Extract one product-card <div> into a CSV row (list of strings).

        Missing optional fields get the same placeholder values the
        original code used ('0%', 'Php 0.00', '(0 reviews)').
        """
        row = []
        for link in div.find_all("a"):
            row.append(str(link.get("href")))
        for title in div.find_all("span", {"class": "product-card__name"}):
            # Strip RTL marks and normalize full-width parentheses.
            row.append(str(title.text).replace("\u200f", " ")
                       .replace("\uFF08", "(").replace("\uff09", ")"))
        row.append(head)
        row.append(category)
        for price in div.find_all("div", {"class": "product-card__price"}):
            row.append(str(price.text.replace("\u20B1", "Php ")))
        sale = div.find_all("div", {"class": "product-card__sale"})
        if not sale:
            row.append("0%")
        else:
            for sales in sale:
                row.append(str(sales.text))
        old = div.find_all("div", {"class": "old-price-wrap"})
        if not old:
            row.append("Php 0.00")
        else:
            for olds in old:
                row.append(str(olds.text).replace("\u20B1", "Php ")
                           .replace("\n", ""))
        installment = div.find_all("span", {"class": "installment-part"})
        if not installment:
            row.append("Php 0.00")
        else:
            for installments in installment:
                row.append(str(installments.text).replace("\u20B1", "Php "))
        rating = div.find_all("span", {"class": "rating__number"})
        if not rating:
            row.append("(0 reviews)")
        else:
            for ratings in rating:
                row.append(str(ratings.text))
        return row