def get_mulu():  # Fetch the chapter table of contents
    r = requests.get(base_url + "/95_95114/").content.decode("UTF-8")
    soup = be(r, features="lxml")
    for i in soup.find_all("a"):
        if "href" in i.attrs:
            # Chapter links carry "第" (the chapter marker) in their text;
            # i.string is None for tags with nested children, so guard first.
            if i.string and "第" in i.string and i.string not in page_list:
                page_list[i.string.replace("\xa0", " ")] = i.attrs["href"]
    print(page_list)
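# get_mulu() (and read_text() further down) rely on module-level state that
# this section never shows. A hedged sketch of that setup; the host URL is a
# placeholder, not taken from the original.
import bs4
import requests
from bs4 import BeautifulSoup as be

base_url = "https://www.example-novel-site.com"  # placeholder host (assumption)
page_list = {}  # maps chapter title -> relative href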
def listx():
    c()  # clear the screen
    ls = {}
    # The third "buybox-content" div holds a <pre> whose links, after the
    # first two, are the entries we want to list.
    r = be(str(br.open(u).read()), 'html.parser').find_all(
        'div', attrs={'class': "buybox-content"})[2].pre.select('a')[2:]
    for x in r:
        if x.text != '':
            ls[x.text] = x['href']
    print('\n'.join(f'{w[4]}{n}. {w[6]}{name}'
                    for n, name in enumerate(ls, 1)))
    return ls
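# listx() leans on several globals that are never defined in this section.
# A hedged sketch of what they plausibly are; every name and value below is
# an assumption, not the author's code.
import os
import mechanize  # br behaves like a mechanize-style browser object

br = mechanize.Browser()
u = 'http://example.com/booklist'  # placeholder URL
w = ['\033[0m', '\033[31m', '\033[32m', '\033[33m',
     '\033[34m', '\033[35m', '\033[36m', '\033[37m']  # guessed ANSI palette; w[4]/w[6] used above

def c():
    os.system('clear')  # assumed clear-screen helper (Termux/Linux)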
def read_text(path):
    down_page = ""
    if path in page_list.values():
        # Print the chapter title that maps to this path
        print(list(page_list.keys())[list(page_list.values()).index(path)])
        r = requests.get(base_url + path).content.decode("UTF-8")
        soup = be(r, features="lxml")
        for i in soup.find_all("div", id="content"):
            for m in i.contents:
                if isinstance(m, bs4.element.NavigableString) and len(m) > 1:
                    # Page through the text 100 characters at a time,
                    # waiting for Enter between pages
                    for x in cut(m, 100):
                        print(x)
                        input()
        # Look for the "下一章" ("next chapter") link in the page footer
        for i in soup.find_all("div", class_="bottem1"):
            for m in i.children:
                if m.string == "下一章":
                    down_page = m.attrs["href"]
        if len(down_page) <= 1:
            # No next-chapter link found: fall back to the current chapter's href
            down_page = page_list[list(page_list.keys())[list(
                page_list.values()).index(path)]]
        return down_page
    else:
        print("input error")
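# read_text() pages its output through a cut() helper that this section
# never defines. A minimal sketch, assuming it simply slices the text into
# fixed-width chunks:
def cut(text, size):
    # Yield consecutive chunks of at most `size` characters (assumption)
    for start in range(0, len(text), size):
        yield text[start:start + size]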
import sys
import io

# Re-wrap stdout/stderr so Korean text prints correctly on consoles that
# default to a non-UTF-8 encoding
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')

from bs4 import BeautifulSoup as be

with open("food-list.html", encoding="utf-8") as html:
    soup = be(html, 'html.parser')

print("1 ==>", soup.select_one("li:nth-of-type(8)").string)
print("2 ==>", soup.select_one("#ac-list > li:nth-of-type(4)").string)
# select() returns a list, so you have to pick an element by index
print("3 ==>", soup.select("#ac-list > li[data-lo='cn']")[0].string)
print("4 ==>", soup.select("#ac-list > li.alcohol.high")[0].string)  # fixed: was .sting

param = {"data-lo": "cn", "class": "alcohol"}
print("5 ==>", soup.find("li", param).string)
print("6 ==>", soup.find(id="ac-list").find("li", param).string)

for ac in soup.find_all("li"):
    if ac["data-lo"] == "us":
        print("data-lo == us ", ac.string)
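# The select()/select_one() distinction above is worth a self-contained
# check; the markup here is made up, not the real food-list.html.
from bs4 import BeautifulSoup as be

doc = be('<ul id="ac-list"><li data-lo="cn" class="alcohol high">baijiu</li></ul>',
         'html.parser')
print(doc.select("#ac-list > li")[0].string)   # select() -> list, index in
print(doc.select_one("#ac-list > li").string)  # select_one() -> Tag or None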
# Fetch a web page
import requests as re  # note: this alias shadows the stdlib re module
from bs4 import BeautifulSoup as be

web = re.get("https://erichuang950313.github.io/nba_web/main/homepage.html")
print(web.status_code)  # HTTP status of the response; 200 means OK
# https://blog.miniasp.com/post/2009/01/16/Web-developer-should-know-about-HTTP-Status-Code.aspx

a = be(web.text, "html.parser")
print(a.find_all("", {"id": "a"}))
print(a.find("", {"id": "a"}).text)
# The "" cannot be dropped: the first positional argument is the tag name,
# and the attribute dict must stay in the second slot.
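# If you would rather fail fast than eyeball status_code, requests can raise
# on any non-2xx response. A one-line hardening sketch of the fetch above:
import requests

web = requests.get("https://erichuang950313.github.io/nba_web/main/homepage.html")
web.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses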
import sys
import io

sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')

import urllib.request as req
from bs4 import BeautifulSoup as be
import urllib.parse as rep

base = "https://www.inflearn.com/"
quote = rep.quote_plus("추천-강좌")  # Korean text has to be URL-encoded
url = base + quote

res = req.urlopen(url).read()
soup = be(res, "html.parser")

# find_all() does not understand CSS selectors; use select() for
# "span.rank_cont > li"
recommand = soup.select("span.rank_cont > li")
print(recommand)

# for i, e in enumerate(recommand, 1):
#     print(i, e.select_one("h4.block_title > a").string)

i = 1  # fixed: the counter was never initialized in the original
for e in recommand:
    if e.find("a") is not None:
        print(i, " ", e.select_one(".tltle").string)
        i += 1
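# Standalone check of the encoding step above: quote_plus() percent-encodes
# the UTF-8 bytes of the Korean text, leaving the hyphen untouched.
import urllib.parse as rep

print(rep.quote_plus("추천-강좌"))
# -> %EC%B6%94%EC%B2%9C-%EA%B0%95%EC%A2%8C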
import bs4
from bs4 import BeautifulSoup as be

def fillunivlist(ulist, html):
    # Collect the first three cell values of every row in the table body
    soup = be(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):  # skip whitespace NavigableStrings
            tds = tr('td')  # shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[2].string])
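# fillunivlist() is only the parsing half. A hypothetical driver; the URL,
# timeout, and row count below are placeholders, not from the original.
import requests

def demo(url='https://example.com/ranking.html'):
    resp = requests.get(url, timeout=30)
    resp.encoding = resp.apparent_encoding  # guard against mojibake
    ulist = []
    fillunivlist(ulist, resp.text)
    for row in ulist[:10]:
        print(*row)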
# Tail of the spinner function op(); its def line precedes this section.
# sleep and pro are assumed to come from `from time import sleep` and
# `from multiprocessing import Process as pro`.
    sleep(0.3)
    print('\b ', end='')
    for x in range(5):
        print('\b\b', end=' ', flush=True)
        sleep(0.1)
    print('\b', end='')

p = pro(target=op)  # run the spinner in a separate process
p.start()

import requests, os
from bs4 import BeautifulSoup as be

# Make sure we are inside Termux's figlet font directory
uwu = os.getcwd().split('/')
if 'figlet' not in uwu:
    os.chdir('/data/data/com.termux/files/usr/share/figlet')

# Scrape the figlet font index for .flf font names
r = be(requests.get('http://www.figlet.org/fontdb.cgi').text, 'html.parser')
r = [x.text + '.flf' for x in r.find_all('a') if 'flf' in x['href']]
ls = os.listdir()

def down(x):
    # Download a font only if it is not already on disk
    if x not in ls:
        open(x, 'w').write(requests.get('http://www.figlet.org/fonts/' + x).text)
        print('downloaded:', x)

p.kill()  # stop the spinner
print('Ok!\ndownloading')
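# Nothing in this section actually calls down(); a plausible continuation
# (an assumption, not the author's code) maps it over the scraped font list.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=8) as pool:
    pool.map(down, r)  # a plain `for x in r: down(x)` works just as well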
from bs4 import BeautifulSoup as be
import requests
import csv
from datetime import datetime

time_ = 'r604800'  # LinkedIn time filter: last 604800 seconds = 1 week
url = ('https://www.linkedin.com/jobs/search/'
       '?f_TPR={}&keywords=django&location=Mumbai').format(time_)
response = requests.get(url)
content = be(response.content, "html.parser")

csv_file = open('LinkedIn Jobs.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Job_title', 'Company name', 'Location', 'Post_Date'])

jobs_list_container = content.find('ul', 'jobs-search__results-list')
job_title = []
company_name = []
post_date = []
job_location = []

import sqlite3
conn = sqlite3.connect('LinkInscrape.db')
curr = conn.cursor()
'''
curr.execute("""create table linkinjobst(
    job_title text,
    Company_Name text,
    Location text,
    Post_date date