def yahoo_car():
    url = "https://tw.usedcar.yahoo.com"
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = BeautifulSoup(req.text, "html.parser")
    # print(soup.prettify())
    # Car types from the search form's <select name="catb">
    car_type_list = soup.select("form select[name='catb'] option")
    car_type_dict = {t["value"]: t.text for t in car_type_list if len(t["value"]) > 0}
    # Brands from the search form's <select name="catid">
    brand_list = soup.select("form select[name='catid'] option")
    brand_dict = {t["value"]: t.text for t in brand_list if len(t["value"]) > 0}
    # Hidden form fields that must be sent along with the search
    input_data = {i["name"]: i["value"] for i in soup.select("form input[type='hidden']")}
    print(car_type_dict)
    print(brand_dict)
    action = soup.select_one("form")["action"]
    print("input data", input_data)
    for brand in brand_dict:
        search_page("000000515224", input_data, action, url, ss, kw=brand)
def get_new_car_type(url: str):
    header = myutils.get_header()
    header["referer"] = "https://c.8891.com.tw/Models"
    ss = myutils.get_session()
    res = ss.get(url=url, headers=header)
    brandsoup = myutils.get_soup(res.text)
    # Collect the model list: model name -> model page link
    car_type_dict = {
        t.text: t["href"]
        for t in brandsoup.select("div.brand-list-main.IndexKindContent a.brand-list-type")
    }
    return car_type_dict
def get_article():
    ss = myutils.get_session()
    # Build request headers
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    # Ajax Load More endpoint; {} is the page number
    url = ("http://www.carplushk.com/wp-admin/admin-ajax.php"
           "?id=&post_id=4036&slug=review"
           "&canonical_url=http%3A%2F%2Fwww.carplushk.com%2Fcategory%2Freview%2F"
           "&posts_per_page=12&page={}&offset=25&post_type=post&repeater=template_1"
           "&seo_start_page=1&preloaded=false&preloaded_amount=0"
           "&cta[cta]=true&cta[cta_position]=after:12&cta[cta_repeater]=template_3"
           "&cta[cta_theme_repeater]=null&category=review&order=DESC&orderby=date"
           "&action=alm_get_posts&query_type=standard")
    urlajax = url.format("0")
    print(urlajax)
    res = ss.get(url=urlajax, headers=header)
    data_dict = json.loads(res.text)
    try:
        total_post = int(data_dict["meta"]["totalposts"])
    except (KeyError, ValueError):
        print("*" * 50)
        print("total post is not a number")
        return
    # posts_per_page is 12, so walk total_post // 12 + 1 pages
    for i in range(total_post // 12 + 1):
        article_list = []
        urlajax = url.format(i)
        res = ss.get(url=urlajax, headers=header)
        data_dict = json.loads(res.text)
        soup = myutils.get_soup(data_dict["html"])
        # Skip the trailing anchor
        for s in soup.select("div.ajaxmoreblk a")[:-1]:
            a = {"_id": s["href"], "title": s.text,
                 "from": "http://www.carplushk.com", "type": "review"}
            if not mongo_service.is_exist(idd=a["_id"], collection="car_article"):
                article_list.append(a)
            else:
                print(a["_id"], " already in article db")
        print(article_list)
        if len(article_list) > 0:
            result = mongo_service.insert_many("data", "car_article", article_list)
            print(result)
def get_new_car_pic(url: str):
    """
    :param url: e.g. https://c.8891.com.tw/audi/a1-sportback/HDPhoto.html
    :return: all photo URLs for this model
    """
    pic_url_list = []  # collected photo URLs
    ss = myutils.get_session()  # could be swapped for requests.Session()
    res = ss.get(url=url, headers=myutils.get_header())  # header carries only a user agent
    print("get response from", res.url)
    scriptsoup = myutils.get_soup(res.text).find_all("script", type="text/javascript")
    for script in scriptsoup:
        tmp = str(script)
        if tmp.find("InitData") != -1:
            # Pull the photo-id array embedded in the InitData script
            pid_str = tmp[tmp.index(": [") + 3:tmp.index("]")]
            pid_list = pid_str.split(",")
            print(pid_list)
            photo_lib_url = "https://c.8891.com.tw/photoLibrary-ajaxList.html?pid="
            num_of_photo = 7  # how many photo URLs to request per batch
            pidstr = ""
            for idx, pid in enumerate(pid_list):
                pidstr += pid
                # Flush a batch every num_of_photo ids, and on the final id
                if (idx + 1) % num_of_photo == 0 or idx == len(pid_list) - 1:
                    # Request https://c.8891.com.tw/photoLibrary-ajaxList.html;
                    # the commas in the pid list must be URL-encoded
                    r = ss.get(url=photo_lib_url + myutils.url_encoding(pidstr),
                               headers=myutils.get_header())
                    try:
                        json_obj = json.loads(r.text)["data"]
                    except Exception as err:
                        print("error ", "~" * 20)
                        print(err)
                        print(r.text)
                        continue
                    for photo_json in json_obj:
                        # Unescape the backslash-escaped slashes
                        photo_url = photo_json["smallPic"].replace(r"\/", "/")
                        pic_url_list.append(photo_url)
                    pidstr = ""
                else:
                    pidstr += ","
    return pic_url_list
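# A minimal usage sketch for get_new_car_pic, assuming the example URL from
# the docstring above still resolves on the live site:
def _demo_new_car_pic():
    pics = get_new_car_pic("https://c.8891.com.tw/audi/a1-sportback/HDPhoto.html")
    print(len(pics), "photo urls, first three:", pics[:3])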
def get_used_car_page(url):
    logger.info("{} get url:{}".format(__name__, url))
    ss = myutils.get_session()
    res = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    logger.info(str(soup.prettify()))
    car = {}
    # Breadcrumb links carry brand / model information
    car_type = soup.select("div.breadcrumb a.NormalLink")
    print(car_type)
    car["brand"] = car_type[2].text
    if len(car_type) >= 5:
        car["type"] = car_type[4].text
        car["type2"] = car_type[3].text
    car["title"] = soup.select_one(
        "div.right-info.info-right-width div.infos-head-title span").text
    car["price"] = soup.select_one("div.car-price-box div#price b").text
    return car
def get_new_car_brand():
    url = "https://c.8891.com.tw"
    ss = myutils.get_session()
    # GET https://c.8891.com.tw/Models
    res = ss.get(url=url + "/Models", headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    # Collect the list of car brands
    new_car_brand_list = []
    for a in soup.select("div.scroll-area li"):
        new_car_brand = {}
        new_car_brand["country"] = a["country"]
        new_car_brand["brand_id"] = a["id"]
        atag = a.select_one("a")
        new_car_brand["brand"] = atag.text.strip()
        new_car_brand["link"] = url + atag["href"]
        new_car_brand_list.append(new_car_brand)
    return new_car_brand_list
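# A usage sketch chaining get_new_car_brand with get_new_car_type (defined
# above) into a brand -> {model: link} map. It assumes each brand page carries
# the a.brand-list-type anchors that get_new_car_type expects; the limit and
# the sleep are assumptions to keep a quick test polite.
def _demo_brand_models(limit: int = 3):
    brand_models = {}
    for brand in get_new_car_brand()[:limit]:
        brand_models[brand["brand"]] = get_new_car_type(brand["link"])
        time.sleep(random.randint(1, 3))
    return brand_models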
def get_article_content(ss=None):
    conn = mongo_service.get_mongo_conn()
    db = conn["data"]
    coll = db["car_article"]
    cursor = coll.find({})
    if ss is None:
        ss = myutils.get_session()
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    count = 1
    for art_url in cursor:
        art_dict = {}
        print(art_url["_id"], "\n")
        art_dict["_id"] = art_url["_id"]
        res = ss.get(url=art_url["_id"], headers=header)
        soup = myutils.get_soup(res.text)
        content = soup.select_one("div.entry-content.single-page")
        # Parse the post date, e.g. "27 Mar, 2020 By ..."
        pdate = content.select_one("div.postdayau").text
        art_dict["post_time"] = datetime.datetime.strptime(
            pdate.split("By")[0].strip(), "%d %b, %Y")
        print(art_dict["post_time"])
        main_content = ""
        for tag in content:
            if tag.name == "p":
                # Skip the photo-credit paragraph
                if tag.text.find("Text & Photo") == -1:
                    main_content += tag.text
                    main_content += "\n"
            elif tag.name == "h2":
                # Mark headings so they can be recovered from the flat text
                main_content += "=t{}=t\n".format(tag.string)
        art_dict["content"] = main_content
        print(art_dict)
        count += 1
        if count == 5:  # debug limit: only process the first few articles
            break
        time.sleep(random.randint(1, 5))
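# get_article_content builds art_dict but only prints it. Below is a minimal
# sketch of a persistence step, assuming the same "data"/"car_article"
# collection is the intended destination; _demo_save_article is a hypothetical
# helper, not part of the original module.
def _demo_save_article(art_dict: dict):
    conn = mongo_service.get_mongo_conn()
    coll = conn["data"]["car_article"]
    # Upsert by _id so repeated crawls do not duplicate documents
    coll.update_one({"_id": art_dict["_id"]}, {"$set": art_dict}, upsert=True)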
def get_pic_page_url(url: str):
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(req.text)
    url_list = [a["href"] for a in soup.select("div.jp-bg-color.mt10 a")]
    return url_list[1]
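# A sketch tying the two photo helpers together: get_pic_page_url (above)
# extracts the HDPhoto page link from a model page, which feeds directly into
# get_new_car_pic. The model_url argument is assumed to follow the site's
# /<brand>/<model> page pattern.
def _demo_model_photos(model_url: str):
    hd_photo_page = get_pic_page_url(model_url)
    return get_new_car_pic(hd_photo_page)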
from json.decoder import JSONDecodeError
from collections import defaultdict
import myutils
from bs4 import BeautifulSoup
import time
import random
import json
from service import job_service
from urllib import parse
import pandas as pd

ss = myutils.get_session()
first_url = "https://www.104.com.tw/jobs/search/?ro=0&keyword={}&jobsource=2018indexpoc"
keyword = "AI"


def get_page(page_num: int) -> dict:
    header = myutils.get_header()
    header["Accept"] = "application/json, text/javascript, */*; q=0.01"
    header["Accept-Encoding"] = "gzip, deflate, br"
    header["Accept-Language"] = "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    header["Connection"] = "keep-alive"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = first_url + "&order=1"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["X-Requested-With"] = "XMLHttpRequest"
    list_url = ("https://www.104.com.tw/jobs/search/list"
                "?ro=0&kwop=7&keyword={}&order=15&asc=0&page={}&mode=s&jobsource=2018indexpoc")
    list_url = list_url.format(keyword, str(page_num))
    # Assumed completion: the original fragment ends after building list_url.
    # Following the pattern of the other scrapers here, fetch the endpoint
    # and return the parsed JSON payload.
    res = ss.get(url=list_url, headers=header)
    return json.loads(res.text)
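# A usage sketch for get_page: probe the first few result pages. The exact
# JSON schema of the 104 list endpoint is not documented in this fragment,
# so the demo only prints the top-level keys rather than assuming field names.
def _demo_get_page(pages: int = 3):
    for p in range(1, pages + 1):
        payload = get_page(p)
        print("page", p, "keys:", list(payload.keys()))
        time.sleep(random.randint(1, 3))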