import requests
import pandas as pd
from bs4 import BeautifulSoup


def get_price(code_number, headers):
    """Loop over the years 2000-2020 and scrape each year's price table."""
    dfs = []
    years = range(2000, 2021)
    for y in years:
        try:
            url = "https://kabuoji3.com/stock/{}/{}/".format(code_number, y)
            soup = BeautifulSoup(
                requests.get(url, headers=headers).content, "html.parser")
            tag_tr = soup.find_all("tr")
            head = [h.text for h in tag_tr[0].find_all("th")]
            data = []
            for i in range(1, len(tag_tr)):
                data.append([d.text for d in tag_tr[i].find_all("td")])
            df = pd.DataFrame(data, columns=head)
            # Numeric columns: open, high, low, close, volume, adjusted close
            col = ["始値", "高値", "安値", "終値", "出来高", "終値調整"]
            for c in col:
                df[c] = df[c].astype(float)
            dfs.append(df)
        except IndexError:
            # Years with no price table (e.g. before listing) are skipped
            pass
    data = pd.concat(dfs, axis=0)
    data = data.reset_index(drop=True)
    return data
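# A minimal usage sketch for get_price; the securities code and User-Agent
# below are placeholders, not values from the original source. The headers
# argument presumably exists because the site expects a browser-like UA.
ua_headers = {"User-Agent": "Mozilla/5.0"}
prices = get_price(7203, ua_headers)  # hypothetical securities code
print(prices.head())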
import re

import requests
from bs4 import BeautifulSoup

file_content = ""  # module-level accumulator for the generated report


def book_spider(book_tag):
    global file_content
    url = "http://bbs.csdn.net/topics/310046216"
    source_code = requests.get(url)  # just a plain GET, no headers or anything
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # Section divider plus the tag heading for this batch of books
    title_divide = '\n' + '--' * 30 + '\n' + '--' * 30 + '\n'
    file_content += title_divide + '\t' * 4 + book_tag + ':' + title_divide
    count = 1
    for book_info in soup.findAll('div', {'class': 'info'}):
        title = book_info.findAll(
            'a', {'onclick': re.compile(r"\"moreurl(.+)")})[0].get('title')
        pub = book_info.findAll('div', {'class': 'pub'})[0].string.strip()
        rating = book_info.findAll(
            'span', {'class': 'rating_nums'})[0].string.strip()
        people_num = book_info.findAll(
            'span', {'class': 'pl'})[0].string.strip()
        # One entry per book: index, title, rating, vote count, publisher
        file_content += "*%d\t《%s》\trating: %s %s\n\t%s\n\n" % (
            count, title, rating, people_num, pub)
        count += 1
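# A usage sketch (assumed, not from the original): scrape one tag, then dump
# the accumulated report to a placeholder filename.
book_spider('Python')
with open('book_list.txt', 'w') as f:
    f.write(file_content)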
import requests
from bs4 import BeautifulSoup


def get_single_book_data(book_url):
    source_code = requests.get(book_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # Print the aggregate rating shown on the book's detail page
    for rating in soup.findAll('p', {'class': 'rating_self clearfix'}):
        print(rating.strong.string)
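# Example call with a placeholder URL; the 'rating_self clearfix' selector
# suggests a douban.com book page, but the exact target is an assumption.
get_single_book_data('https://book.douban.com/subject/1084336/')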
import re
import urllib2  # Python 2

from bs4 import BeautifulSoup

# `pattern`, get_home_page(), login() and execute_opener() are helpers
# defined elsewhere in this module.


def fetch_tv_info(username, password):
    match = re.search(pattern, get_home_page())
    challenge = ""
    if match is not None:
        # Extract the login challenge token from the home page
        challenge = match.group(2)
        print("challenge string: " + challenge)
    # Log in, then open the TV page
    response = login(username, password, challenge)
    content = response.read()
    opener = urllib2.build_opener()
    response = execute_opener(opener, "http://iptv.bg/watch")
    content = response.read()
    soup = BeautifulSoup(content, "html.parser", from_encoding='utf-8')
    tvTags = soup.findAll(name='li', attrs={'class': 'listmode_tv'})
    tv_info = []
    for tag in tvTags:
        name = tag.find(name='div', attrs={'class': 'tv_info'}).find(
            name='b').getText()
        logo = tag.find(name='img').get('src', default='')
        url = tag.findAll(
            name='div', attrs={'class': 'noprint player_soft'})[-1].find(
                name='a').get('href', default='')
        info_tag = tag.find(name='div', attrs={'class': 'tv_info'})
        info = ''
        thumbnail = ''
        if info_tag is not None:
            thumbnail = info_tag.find(name='img').get('src', default='')
            detail_tag = info_tag.find(name='em').find(name='abbr')
            if detail_tag is not None:
                info = detail_tag.get('title', default='Unknown')
        tv_info.append({'name': name, 'logo': logo, 'path': url,
                        'thumbnail': thumbnail, 'info': info})
    return tv_info
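# A hypothetical invocation (credentials are placeholders): list every
# channel name with its stream path.
for channel in fetch_tv_info('user@example.com', 'secret'):
    print(channel['name'] + ' -> ' + channel['path'])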
# Method of a crawler class; `self.loger` is its logger.
def geturl(self, webpage, key=None):
    """Queue the href of every <a> on the page, optionally filtered by key."""
    global dlLinksNext
    try:
        # Pages arrive GBK-encoded; re-encode to UTF-8 before parsing
        # (Python 2: relies on the unicode() builtin)
        webpage = unicode(webpage, 'gbk').encode('utf-8')
        soup = BeautifulSoup(webpage, "html.parser")
        for link in soup.findAll('a'):
            if not key:
                dlLinksNext.put(link.get('href'))
            elif key in str(link):
                dlLinksNext.put(link.get('href'))
    except UnicodeDecodeError:
        self.loger.logInfo('UnicodeDecodeError')
    except UnicodeEncodeError:
        self.loger.logInfo('UnicodeEncodeError')
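# geturl expects a module-level queue shared across crawler threads; a
# minimal Python 2 setup might look like this (names taken from the code,
# the Queue choice is an assumption):
import Queue

dlLinksNext = Queue.Queue()  # harvested hrefs, awaiting download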
import requests
from bs4 import BeautifulSoup

url = "https://www.yelp.com/sf"
yelp_r = requests.get(url)
print(yelp_r.status_code)  # should be 200

yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
print(yelp_soup.prettify())
print(yelp_soup.findAll('a'))
for link in yelp_soup.findAll('a'):
    print(link)
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

raw_html = requests.get('https://nl.wikipedia.org/wiki/Regering-Jambon').text
html = BeautifulSoup(raw_html, 'html.parser')

leden = []
views = []
# Second column of the wikitable holds the government members' names
for a in html.select('table.wikitable tr td:nth-child(2)'):
    leden.append(a.text.replace(' ', '_').strip())

# Sum the daily Wikimedia pageviews for each member over October 2019
for lid in leden:
    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/nl.wikipedia/all-access/all-agents/{}/daily/20191001/2019103100"
        .format(lid))
    data = r.json()
    count = 0
    for item in data['items']:
        count += item['views']
    views.append(count)

plt.style.use('seaborn-poster')
plt.title('Wikipedia page visits in Oct.')
plt.xlabel('Flemish minister')
plt.ylabel('Visits')
# Label each bar with the member's name as scraped from the wikitable
plt.bar([lid.replace('_', ' ') for lid in leden], views)
plt.show()
import csv

import requests
from bs4 import BeautifulSoup


class WebScrape:
    def __init__(self):
        print("WebScrape Imported")

    def lazada_scrape(self, head, category, url):
        list_of_rows = []
        base_url = "http://www.lazada.com.ph/" + url + "/"
        source_code = requests.get(base_url)
        soup = BeautifulSoup(source_code.text, 'html.parser')
        # The sixth link in the pager holds the last page number
        max_page = int(
            soup.select("span.pages > a:nth-of-type(6)")[0].get_text())
        page = 1
        myfile = open(category + ".csv", 'w', newline='')
        writer = csv.DictWriter(myfile, fieldnames=[
            "url", "product_name", "product_header", "product_category",
            "product_price", "product_sale", "product_old", "installment",
            "rating"
        ], delimiter=',')
        writer.writeheader()
        while page <= max_page:
            print(page)
            # Page through the requested category (the original hard-coded
            # "shop-mobiles" here, ignoring the url argument)
            page_url = base_url + "?page=" + str(page)
            source_code = requests.get(page_url)
            soup = BeautifulSoup(source_code.text, 'html.parser')
            for div in soup.find_all("div", {"class": "product-card"}):
                mylist = []
                for link in div.find_all("a"):
                    mylist.append(str(link.get("href")))
                for title in div.find_all("span",
                                          {"class": "product-card__name"}):
                    mylist.append(
                        str(title.text).replace("\u200f", " ").replace(
                            "\uFF08", "(").replace("\uff09", ")"))
                mylist.append(head)
                mylist.append(category)
                for price in div.find_all("div",
                                          {"class": "product-card__price"}):
                    mylist.append(str(price.text.replace("\u20B1", "Php ")))
                sale = div.find_all("div", {"class": "product-card__sale"})
                if not sale:
                    mylist.append("0%")
                else:
                    for sales in sale:
                        mylist.append(str(sales.text))
                old = div.find_all("div", {"class": "old-price-wrap"})
                if not old:
                    mylist.append("Php 0.00")
                else:
                    for olds in old:
                        mylist.append(
                            str(olds.text).replace("\u20B1",
                                                   "Php ").replace("\n", ""))
                installment = div.find_all("span",
                                           {"class": "installment-part"})
                if not installment:
                    mylist.append("Php 0.00")
                else:
                    for installments in installment:
                        mylist.append(
                            str(installments.text).replace("\u20B1", "Php "))
                rating = div.find_all("span", {"class": "rating__number"})
                if not rating:
                    mylist.append("(0 reviews)")
                else:
                    for ratings in rating:
                        mylist.append(str(ratings.text))
                list_of_rows.append(mylist)
            page += 1
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(list_of_rows)
        myfile.close()
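# A hypothetical invocation; "shop-mobiles" is the category path segment on
# lazada.com.ph, and the first two arguments are placeholder labels for the
# CSV's product_header and product_category columns.
scraper = WebScrape()
scraper.lazada_scrape("Electronics", "mobiles", "shop-mobiles")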