def get_new_car_pic(url: str):
    """
    :param url: e.g. https://c.8891.com.tw/audi/a1-sportback/HDPhoto.html
    :return: list of every photo URL for that car model
    """
    pic_url_list = []  # collected photo URLs
    ss = myutils.get_session()  # could also be a plain requests.Session()
    res = ss.get(url=url, headers=myutils.get_header())  # header only carries a user agent
    print("get response from", res.url)
    # print(res.text)
    scriptsoup = myutils.get_soup(res.text).find_all("script", type="text/javascript")
    for script in scriptsoup:
        tmp = str(script)
        if tmp.find("InitData") != -1:
            # the InitData script embeds the photo id list, e.g. ": [123,456,...]"
            pid_str = tmp[tmp.index(": [") + 3:tmp.index("]")]
            pid_list = pid_str.split(",")
            print(pid_list)
            photo_lib_url = "https://c.8891.com.tw/photoLibrary-ajaxList.html?pid="
            pidstr = ""
            num_of_photo = 7  # how many photo ids to send per ajax request
            for idx, pid in enumerate(pid_list):
                pidstr += pid
                if idx % num_of_photo == 0 or idx == len(pid_list) - 1:
                    # query https://c.8891.com.tw/photoLibrary-ajaxList.html;
                    # the commas between ids have to be URL-encoded
                    r = ss.get(url=photo_lib_url + myutils.url_encoding(pidstr),
                               headers=myutils.get_header())
                    try:
                        json_obj = json.loads(r.text)["data"]
                    except Exception as err:
                        print("error ", "~" * 20)
                        print(err)
                        print(r.text)
                        pidstr = ""
                        continue
                    for photo_json in json_obj:
                        photo_url = photo_json["smallPic"].replace(r"\/", "/")  # drop the escaped slashes
                        pic_url_list.append(photo_url)
                    pidstr = ""
                else:
                    pidstr += ","
    return pic_url_list
def meta_search(kw, ss, url, cata, total_car_num):
    search_header = myutils.get_header()
    search_header["Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8"
    search_header["Accept"] = "*/*"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Accept-Encoding"] = "br,gzip,deflate"
    search_header["Origin"] = "https://tw.usedcar.yahoo.com"
    search_header["Referer"] = url
    search_header["Connection"] = "keep-alive"
    search_header["Content-Length"] = "56"
    search_header["X-Requested-With"] = "XMLHttpRequest"
    post_data = {
        "MIME 類型": "application/x-www-form-urlencoded; charset=UTF-8",
        "cata": "000000515224",
        "cateid": cata,
        "action": "dataPrepare"
    }
    req = ss.post(url="https://tw.usedcar.yahoo.com/search/search_services",
                  headers=search_header, data=post_data)
    json_data = json.loads(req.text)
    print("meta search---------------------")
    # print(json_data)
    car_search(ss, url, cata, total_car_num, kw)
def yahoo_car():
    url = "https://tw.usedcar.yahoo.com"
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = BeautifulSoup(req.text, "html.parser")
    # print(soup.prettify())
    # body/model types from the search form
    car_type_list = soup.select("form select[name='catb'] option")
    car_type_dict = {
        t["value"]: t.text
        for t in car_type_list if len(t["value"]) > 0
    }
    # brands from the search form
    brand_list = soup.select("form select[name='catid'] option")
    brand_dict = {
        t["value"]: t.text
        for t in brand_list if len(t["value"]) > 0
    }
    input_data = {
        i["name"]: i["value"]
        for i in soup.select("form input[type='hidden']")
    }
    print(car_type_dict)
    print(brand_dict)
    action = soup.select_one("form")["action"]
    print("input data", input_data)
    for brand in brand_dict:
        search_page("000000515224", input_data, action, url, ss, kw=brand)
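# A hedged sketch of the Yahoo used-car crawl as wired above: yahoo_car()
# scrapes the search form, then for every brand calls search_page(), which
# reads the total listing count and hands off to meta_search() and finally
# car_search(), where each listing page is parsed and written to MongoDB
# through mongo_service. The entry point would simply be:
#
#     yahoo_car()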
def get_job_content(job_url):
    job_id = myutils.get_jobid_by_url(job_url)
    content_url = "https://www.104.com.tw/job/ajax/content/" + job_id
    # build the request header
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Language"] = "zh-tw"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = job_url
    header["Accept-Encoding"] = "br, gzip, deflate"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["Connection"] = "keep-alive"
    req = ss.get(url=content_url, headers=header)
    # print(json.dumps(json.loads(req.text), indent=4, ensure_ascii=False))
    try:
        content_data = json.loads(req.text)
    except JSONDecodeError as err:
        print(err)
        print(job_url)
        print(req.text)
        raise
    job_content = {}
    job_content["id"] = job_id
    job_content["job_name"] = content_data["data"]["header"]["jobName"]
    job_content["url"] = job_url
    job_content["company_name"] = content_data["data"]["header"]["custName"]
    job_content["company_url"] = content_data["data"]["header"]["custUrl"]
    job_content["contact"] = content_data["data"]["contact"]
    job_content["skill"] = content_data["data"]["condition"]["specialty"]
    job_content["job_detail"] = content_data["data"]["jobDetail"]["jobDescription"]
    print("get content url:", job_url, "success")
    return job_content
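# A hedged usage sketch for get_job_content(): the job URL below is a
# hypothetical placeholder, and the module-level session ss plus the myutils
# and job_service helpers are assumed to exist exactly as they are used above.
#
#     job = get_job_content("https://www.104.com.tw/job/xxxxx")  # hypothetical id
#     print(job["job_name"], [s["description"] for s in job["skill"]])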
def get_page(page_num: int) -> dict:
    header = myutils.get_header()
    header["Accept"] = "application/json, text/javascript, */*; q=0.01"
    header["Accept-Encoding"] = "gzip, deflate, br"
    header["Accept-Language"] = "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    header["Connection"] = "keep-alive"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = first_url + "&order=1"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["X-Requested-With"] = "XMLHttpRequest"
    global keyword
    list_url = ("https://www.104.com.tw/jobs/search/list"
                "?ro=0&kwop=7&keyword={}&order=15&asc=0&page={}&mode=s&jobsource=2018indexpoc")
    list_url = list_url.format(keyword, str(page_num))
    print("get page ", list_url)
    req = ss.get(url=list_url, headers=header)
    jd = json.loads(req.text)
    print(list_url, "status", jd["status"])
    # print(jd["data"]["list"])
    job_dict = {
        myutils.get_jobid_by_url(job["link"]["job"]): {
            "job_name": job["jobName"],
            "url": "https:" + job["link"]["job"]
        }
        for job in jd["data"]["list"]
    }
    # print(job_dict)
    return job_dict
def main():
    global first_url
    global keyword
    first_url = first_url.format(keyword)
    page_num = 1  # pages to crawl; total_page below holds the full page count
    req = ss.get(url=first_url, headers=myutils.get_header())
    soup = get_soup(req.text)
    total_page = get_total_page(req.text)
    job_data = {}
    # the first page comes from the rendered HTML
    for idx, bs in enumerate(soup.select("article div.b-block__left")):
        job = bs.select("a.js-job-link")
        for j in job:
            # skip promoted "hotjob" links
            if j["href"].find("hotjob_chr") == -1:
                job_data[myutils.get_jobid_by_url(j["href"])] = {
                    "url": "https:" + j["href"],
                    "job_name": j.text
                }
    print(job_data)
    job_result = []
    for job in job_data:
        job_url = job_data[job]["url"]
        job_content = get_job_content(job_url)
        job_service.add_job(job_content)
        job_result.append(job_content)
    # pages 2..page_num come from the ajax list endpoint
    for i in range(2, page_num + 1):
        job_data = get_page(i)
        for job in job_data:
            job_url = job_data[job]["url"]
            sleep_time = random.uniform(1, 2)
            print("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)
            job_content = get_job_content(job_url)
            job_service.add_job(job_content)
            job_result.append(job_content)
    # count how many times each skill name appears
    skill_dict = defaultdict(int)
    for job in job_result:
        for skill in job["skill"]:
            skill_dict[skill["description"]] += 1
    print(skill_dict)
    with open("./dict/skill.txt", "a") as file:
        file.write(json.dumps(skill_dict))
def get_new_car_type(url: str):
    header = myutils.get_header()
    header["referer"] = "https://c.8891.com.tw/Models"
    ss = myutils.get_session()
    res = ss.get(url=url, headers=header)
    brandsoup = myutils.get_soup(res.text)
    # build the model list for this brand: model name -> model page link
    car_type_dict = {
        t.text: t["href"]
        for t in brandsoup.select("div.brand-list-main.IndexKindContent a.brand-list-type")
    }
    return car_type_dict
def get_article():
    ss = myutils.get_session()
    # make header
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    # ajax endpoint behind the "load more" posts on the review category page
    url = ("http://www.carplushk.com/wp-admin/admin-ajax.php"
           "?id=&post_id=4036&slug=review"
           "&canonical_url=http%3A%2F%2Fwww.carplushk.com%2Fcategory%2Freview%2F"
           "&posts_per_page=12&page={}&offset=25&post_type=post&repeater=template_1"
           "&seo_start_page=1&preloaded=false&preloaded_amount=0"
           "&cta[cta]=true&cta[cta_position]=after:12&cta[cta_repeater]=template_3"
           "&cta[cta_theme_repeater]=null&category=review&order=DESC&orderby=date"
           "&action=alm_get_posts&query_type=standard")
    urlajax = url.format("0")
    print(urlajax)
    res = ss.get(url=urlajax, headers=header)
    data_dict = json.loads(res.text)
    try:
        total_post = int(data_dict["meta"]["totalposts"])
    except ValueError as err:
        print("*" * 50)
        print("total post is not a number")
        print(err)
        return
    # page through the results, 12 posts per page
    for i in range(int(total_post / 12) + 1):
        article_list = []
        urlajax = url.format(i)
        res = ss.get(url=urlajax, headers=header)
        data_dict = json.loads(res.text)
        soup = myutils.get_soup(data_dict["html"])
        for s in soup.select("div.ajaxmoreblk a")[:-1]:
            a = {"_id": s["href"], "title": s.text,
                 "from": "http://www.carplushk.com", "type": "review"}
            if not mongo_service.is_exist(idd=a["_id"], collection="car_article"):
                article_list.append(a)
            else:
                print(a["_id"], " already in article db")
        print(article_list)
        if len(article_list) > 0:
            result = mongo_service.insert_many("data", "car_article", article_list)
            print(result)
def download_pic(ss, car):
    pic_path = "./pic/{}/{}/{}/{}/{}/{}_{}_{}_{}".format(
        car["廠牌"], car.get("型號a", "0"), car.get("型號", "0"),
        car["auto_build_year"], car["mid"],
        car["廠牌"], car.get("型號a", "0"), car.get("型號", "0"),
        car["auto_build_year"])
    car_pic = car.pop("pic")
    car["pic"] = []
    for i, pic in enumerate(car_pic):
        q = ss.get(url=pic, headers=myutils.get_header())
        car["pic"].append({
            "url": pic,
            "file_path": myutils.write_pic_file(pic_path + "_{}.jpg".format(i), q.content)
        })
def get_used_car_page(url):
    logger.info("{} get url:{}".format(__name__, url))
    ss = myutils.get_session()
    res = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    logger.info(str(soup.prettify()))
    car = {}
    # breadcrumb links hold the brand and model names
    car_type = soup.select("div.breadcrumb a.NormalLink")
    print(car_type)
    car["brand"] = car_type[2].text
    if len(car_type) >= 5:
        car["type"] = car_type[4].text
        car["type2"] = car_type[3].text
    car["title"] = soup.select_one(
        "div.right-info.info-right-width div.infos-head-title span").text
    car["price"] = soup.select_one("div.car-price-box div#price b").text
    return car
def do_search(key_word: str, page_num):
    global keyword
    if key_word is None or len(key_word) == 0:
        return "error keyword"
    if page_num is None:
        return "error page_num"
    page_num = int(page_num)
    # fetch the first page of results from the rendered HTML
    first_page_url = first_url.format(key_word)
    req = ss.get(url=first_page_url, headers=myutils.get_header())
    soup = get_soup(req.text)
    job_data = {}
    for idx, bs in enumerate(soup.select("article div.b-block__left")):
        job = bs.select("a.js-job-link")
        for j in job:
            # skip promoted "hotjob" links
            if j["href"].find("hotjob_chr") == -1:
                job_data[myutils.get_jobid_by_url(j["href"])] = {
                    "url": "https:" + j["href"],
                    "job_name": j.text
                }
    job_result = []
    for job in job_data:
        job_url = job_data[job]["url"]
        job_content = get_job_content(job_url)
        job_service.add_job(job_content)
        job_result.append(job_content)
    # fetch page 2 onwards through the ajax list endpoint;
    # get_page() reads the module-level keyword, so set it here (URL-encoded)
    keyword = parse.quote(key_word)
    if page_num != 0:
        for i in range(2, page_num + 1):
            job_data = get_page(page_num=i)
            for job in job_data:
                job_url = job_data[job]["url"]
                sleep_time = random.uniform(1, 2)
                print("sleep {} sec".format(sleep_time))
                time.sleep(sleep_time)
                job_content = get_job_content(job_url)
                job_service.add_job(job_content)
                job_result.append(job_content)
    myutils.write_json_file(job_result, str(int(time.time())) + "job.json")
def get_new_car_brand():
    url = "https://c.8891.com.tw"
    ss = myutils.get_session()
    # get https://c.8891.com.tw/Models
    res = ss.get(url=url + "/Models", headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    # print(soup.select("div.scroll-area"))
    # collect the list of car brands
    new_car_brand_list = []
    for a in soup.select("div.scroll-area li"):
        new_car_brand = {}
        new_car_brand["country"] = a["country"]
        new_car_brand["brand_id"] = a["id"]
        atag = a.select_one("a")
        new_car_brand["brand"] = atag.text.strip()
        new_car_brand["link"] = url + atag["href"]
        new_car_brand_list.append(new_car_brand)
    return new_car_brand_list
def search_page(cata, input_data, action, url, ss, kw=""):
    # only set the category filter when no keyword is given
    if len(kw) == 0:
        input_data["catb"] = cata
    input_data["kw"] = kw
    print(input_data)
    search_header = myutils.get_header()
    search_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    search_header["Accept-Encoding"] = "br, gzip, deflate"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Referer"] = "https://tw.usedcar.yahoo.com/"
    search_header["Connection"] = "keep-alive"
    searchreq = ss.get(url + action, params=input_data, headers=search_header)
    print("search page", searchreq.url)
    soup2 = myutils.get_soup(searchreq.text)
    # total number of listings shown on the result page
    total_car_num = soup2.select_one("div .infol.mei-u em").text
    print("total num:", total_car_num)
    meta_search(kw, ss, searchreq.url, cata, total_car_num)
def get_article_content(url: str, ss):
    conn = mongo_service.get_mongo_conn()
    db = conn["data"]
    coll = db["car_article"]
    cursor = coll.find({})
    # note: a fresh session is created here, so the ss argument is not used
    ss = myutils.get_session()
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    count = 1
    for art_url in cursor:
        art_dict = {}
        print(art_url["_id"], "\n")
        art_dict["_id"] = art_url["_id"]
        res = ss.get(url=art_url["_id"], headers=header)
        soup = myutils.get_soup(res.text)
        content = soup.select_one("div.entry-content.single-page")
        # post date looks like "12 Mar, 2020 By ...", keep only the date part
        pdate = content.select_one("div.postdayau").text
        art_dict["post_time"] = datetime.datetime.strptime(
            pdate.split("By")[0].strip(), "%d %b, %Y")
        print(art_dict["post_time"])
        # stitch paragraphs and sub-headings into one text blob
        main_content = ""
        for tag in content:
            if tag.name == "p":
                if tag.text.find("Text & Photo") == -1:
                    main_content += tag.text
                    main_content += "\n"
            elif tag.name == "h2":
                main_content += "=t{}=t\n".format(tag.string)
        art_dict["content"] = main_content
        print(art_dict)
        count += 1
        if count == 5:
            break  # debug limit: stop after a few articles
        time.sleep(random.randint(1, 5))
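# A minimal driver for the carplushk crawl, assuming the mongo_service helper
# behaves as used above: get_article() stores the review URLs in the
# car_article collection, and get_article_content() then walks that collection
# and scrapes each post body. Its url/ss arguments are unused inside the
# function, so placeholder values are passed here.
#
#     get_article()
#     get_article_content(url="", ss=None)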
def car_search(ss, url, cata, total_car_num, kw):
    each_page = 30
    search_header = myutils.get_header()
    search_header["Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8"
    search_header["Accept"] = "*/*"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Accept-Encoding"] = "br,gzip,deflate"
    search_header["Origin"] = "https://tw.usedcar.yahoo.com"
    search_header["Referer"] = url
    search_header["Connection"] = "keep-alive"
    search_header["Content-Length"] = "268"
    search_header["X-Requested-With"] = "XMLHttpRequest"
    post_data = {
        "MIME 類型": "application/x-www-form-urlencoded; charset=UTF-8",
        "cata": "000000515224",
        "catb": cata,
        "undedup": 0,
        "unspc": 0,
        "areaa": "tw",
        "sort": 3,
        "total": total_car_num,
        "cp": 1,
        "ppa": each_page,
        "pa": 10,
        "type": "srplist",
        "vmode": 0,
        "action": "srplistquery",
        "kw": kw
    }
    if len(kw) > 0:
        post_data["catid"] = "000000515224"
    print("car search---------------------")
    total_page = (int(total_car_num) // each_page) + 1
    for page in range(1, total_page + 1):
        print("total_page:", total_page, "current page :", page)
        post_data["cp"] = page
        try:
            req = ss.post(
                url="https://tw.usedcar.yahoo.com/search/search_services",
                headers=search_header, data=post_data)
            json_data = json.loads(req.text)
        except Exception as err:
            print("-" * 30)
            print(err)
            # keep the raw response for debugging, then skip this page
            file_path = "./err/msg/{}.txt".format(kw + str(page))
            myutils.write_text_file(file_path=file_path, content=req.text)
            continue
        try:
            for car in json_data["data"][1:]:
                if mongo_service.is_exist(car["mid"]):
                    print("car id {} already existed".format(car["mid"]))
                    continue
                url = car["mlink"]
                r = ss.get(url, headers=myutils.get_header())
                print("get car detail : url", url)
                car_soup = myutils.get_soup(r.text)
                # print(car_soup.prettify())
                # condition / model / brand breadcrumb
                car_brand = [a.text for a in car_soup.select("div.itemhd a")]
                car["新舊"] = car_brand[0]
                car["車型"] = car_brand[1]
                car["廠牌"] = car_brand[2].replace("/", "[sl]")
                if len(car_brand) > 3:
                    car["型號a"] = car_brand[3]
                    car["型號"] = "fix_" + car_brand[3]
                if len(car_brand) > 4:
                    car["型號"] = car_brand[4]
                # car spec/status table (label, value pairs)
                car_status = []
                for i in car_soup.select("div#ycoptions ul#itemAttrs li")[0:3]:
                    for j in i:
                        car_status.extend(j.select("td"))
                for i in range(0, len(car_status), 2):
                    if "hide" not in car_status[i]["class"]:
                        car[car_status[i].text] = car_status[i + 1].text
                # car equipment checkboxes
                car_equipment = car_soup.select(
                    "div#ycoptions ul#itemAttrs li.col2 td span")
                print("car_equipment", car_equipment)
                for i in car_equipment:
                    car[i.text] = 1
                # car photos: take the value attribute, swap single quotes for
                # double quotes, then parse the JSON text into a list
                car_pic = car_soup.select_one("div#ycitemslideshow div.sft input")
                car_pic = car_pic["value"].replace("'", '"')
                if len(car_pic) > 0:
                    car_pic = json.loads(car_pic)
                    car_pic = [pic["i"] for pic in car_pic]
                    car["pic"] = car_pic
                # download_pic(ss, car)  # write the images to local disk
                car["_id"] = car.pop("mid")
                mongo_service.insert_data("data", "car", car)
                item_sleep_time = random.uniform(0, 2)
                print("item sleep :", item_sleep_time)
                time.sleep(item_sleep_time)
            page_sleep_time = random.uniform(0, 5)
            print("page sleep :", page_sleep_time)
            time.sleep(page_sleep_time)
        except Exception as err:
            print("-" * 20)
            error_log = {"err": str(err), "data": car}
            mongo_service.insert_data(collection="err", json_data=error_log, db_name="data")
def get_pic_page_url(url: str):
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(req.text)
    url_list = [a["href"] for a in soup.select("div.jp-bg-color.mt10 a")]
    return url_list[1]
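# A hedged sketch of how the 8891 new-car photo crawl could be chained from the
# functions above: get_new_car_brand() lists the brands, get_new_car_type()
# lists the models for a brand page, get_pic_page_url() resolves a model page
# to its photo-gallery page, and get_new_car_pic() collects the photo URLs.
# Whether the model links are absolute or relative is an assumption here.
if __name__ == "__main__":
    for brand in get_new_car_brand():
        for type_name, type_link in get_new_car_type(brand["link"]).items():
            if type_link.startswith("/"):
                # model links may be relative; prefix the site root if so
                type_link = "https://c.8891.com.tw" + type_link
            photo_page = get_pic_page_url(type_link)
            pics = get_new_car_pic(photo_page)
            print(brand["brand"], type_name, len(pics), "photos")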