Example #1
def yahoo_car():
    url = "https://tw.usedcar.yahoo.com"
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = BeautifulSoup(req.text, "html.parser")
    # print(soup.prettify())
    # Car model categories
    car_type_list = soup.select("form select[name='catb'] option")
    car_type_dict = {
        t["value"]: t.text
        for t in car_type_list if len(t["value"]) > 0
    }
    # Brands
    brand_list = soup.select("form select[name='catid'] option")
    brand_dict = {
        t["value"]: t.text
        for t in brand_list if len(t["value"]) > 0
    }

    input_data = {
        i["name"]: i["value"]
        for i in soup.select("form input[type='hidden']")
    }
    print(car_type_dict)
    print(brand_dict)
    action = soup.select_one("form")["action"]
    print("input data", input_data)
    for brand in brand_dict:
        search_page("000000515224", input_data, action, url, ss, kw=brand)
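All of these examples lean on a myutils helper module that is not shown. A minimal sketch of what get_session(), get_header(), and get_soup() presumably look like, assuming they are thin wrappers over requests and BeautifulSoup (the user-agent string below is a placeholder, not the original value):

import requests
from bs4 import BeautifulSoup

def get_session() -> requests.Session:
    # Plain requests session; the real helper may add retries or proxies.
    return requests.Session()

def get_header() -> dict:
    # Example #4's comment says the header only carries a user agent.
    return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def get_soup(html: str) -> BeautifulSoup:
    return BeautifulSoup(html, "html.parser")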
Example #2
def get_new_car_type(url: str):
    header = myutils.get_header()
    header["referer"] = "https://c.8891.com.tw/Models"
    ss = myutils.get_session()
    res = ss.get(url=url, headers=header)
    brandsoup = myutils.get_soup(res.text)
    # Get the list of car models for this brand
    car_type_dict = {
        t.text: t["href"]
        for t in brandsoup.select(
            "div.brand-list-main.IndexKindContent a.brand-list-type")
    }
    return car_type_dict
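A hedged usage sketch for get_new_car_type; the brand URL below is an assumed example following the site's URL pattern (see Example #4's docstring), not taken from the original code:

# Usage sketch (assumed brand-page URL)
models = get_new_car_type("https://c.8891.com.tw/audi")
for name, href in models.items():
    print(name, "->", href)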
Example #3
def get_article():
    ss = myutils.get_session()
    # make header
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"

    url = '''http://www.carplushk.com/wp-admin/admin-ajax.php?id=&post_id=4036&slug=review&canonical_url=http%3A%2F%2Fwww.carplushk.com%2Fcategory%2Freview%2F&posts_per_page=12&page={}&offset=25&post_type=post&repeater=template_1&seo_start_page=1&preloaded=false&preloaded_amount=0&cta[cta]=true&cta[cta_position]=after:12&cta[cta_repeater]=template_3&cta[cta_theme_repeater]=null&category=review&order=DESC&orderby=date&action=alm_get_posts&query_type=standard'''

    urlajax = url.format("0")
    print(urlajax)
    res = ss.get(url=urlajax, headers=header)
    data_dict = json.loads(res.text)
    try:
        total_post = int(data_dict["meta"]["totalposts"])
    except (KeyError, ValueError):
        print("*" * 50)
        print("total post is not a number")
        return  # without a total we cannot page through the results

    # one request per page of 12 posts (posts_per_page=12 in the URL)
    for i in range(total_post // 12 + 1):
        article_list = []
        urlajax = url.format(i)
        res = ss.get(url=urlajax, headers=header)
        data_dict = json.loads(res.text)
        soup = myutils.get_soup(data_dict["html"])
        for s in soup.select("div.ajaxmoreblk a")[:-1]:
            a = {"_id": s["href"], "title": s.text, "from": "http://www.carplushk.com", "type": "review"}
            if not mongo_service.is_exist(idd=a["_id"], collection="car_article"):
                article_list.append(a)
            else:
                print(a["_id"], " already in article db")
        print(article_list)
        if len(article_list) > 0:
            result = mongo_service.insert_many("data", "car_article", article_list)
            print(result)
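Examples #3 and #7 also assume a mongo_service module. A minimal sketch of is_exist and insert_many over pymongo, assuming a local MongoDB and the "data" database that Example #7's get_mongo_conn() implies:

from pymongo import MongoClient

def get_mongo_conn() -> MongoClient:
    # Assumed connection string; the real module is not shown.
    return MongoClient("mongodb://localhost:27017")

def is_exist(idd: str, collection: str, db: str = "data") -> bool:
    # True if a document with this _id is already stored.
    return get_mongo_conn()[db][collection].count_documents({"_id": idd}) > 0

def insert_many(db: str, collection: str, docs: list):
    return get_mongo_conn()[db][collection].insert_many(docs)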
Example #4
def get_new_car_pic(url: str):
    """
    :param url: e.g. https://c.8891.com.tw/audi/a1-sportback/HDPhoto.html
    :return: all image URLs for this car model
    """
    pic_url_list = []  # collected image URLs
    ss = myutils.get_session()  # could be replaced with requests.Session()
    res = ss.get(url=url, headers=myutils.get_header())  # the header only carries a user agent
    print("get response from", res.url)
    # print(req.text)
    scriptsoup = myutils.get_soup(res.text).find_all('script',
                                                     type="text/javascript")
    for script in scriptsoup:
        # print(script)
        tmp = str(script)
        if tmp.find("InitData") != -1:
            # print(tmp.index(": ["), tmp.index("]"))
            pid_str = tmp[tmp.index(": [") + 3:tmp.index("]")]
            pid_list = pid_str.split(",")
            print(pid_list)
            photo_lib_url = "https://c.8891.com.tw/photoLibrary-ajaxList.html?pid="
            pidstr = ""
            for idx, pid in enumerate(pid_list):
                pidstr += pid
                # how many photo URLs to request per batch
                num_of_photo = 7
                # flush a full batch; also flush the remainder on the last pid
                if (idx + 1) % num_of_photo == 0 or idx == len(pid_list) - 1:
                    # print(pidstr)
                    # request https://c.8891.com.tw/photoLibrary-ajaxList.html with this batch
                    r = ss.get(url=photo_lib_url +
                               myutils.url_encoding(pidstr),
                               headers=myutils.get_header())  # commas in the pid list must be percent-encoded
                    # print(r.url, "photo rul result:")
                    # print(r.text)
                    try:
                        json_obj = json.loads(r.text)["data"]
                    except Exception as err:
                        print("error ", "~" * 20)
                        print(err)
                        print(r.text)
                        json_obj = []  # skip this batch instead of crashing below

                    for photo_json in json_obj:
                        photo_url = photo_json["smallPic"].replace(
                            r"\/", "/")  # un-escape the backslashes in the JSON
                        pic_url_list.append(photo_url)
                    pidstr = ""
                else:
                    pidstr += ","
    return pic_url_list
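myutils.url_encoding is only described by the comment above as percent-encoding the commas in the pid string; a plausible one-line implementation, assuming it wraps urllib.parse.quote:

from urllib import parse

def url_encoding(s: str) -> str:
    # Assumed implementation: "123,456" -> "123%2C456" for the query string.
    return parse.quote(s, safe="")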
Example #5
def get_used_car_page(url):
    logger.info("{} get url:{}".format(__name__, url))
    ss = myutils.get_session()
    res = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    logger.info(soup.prettify())  # prettify() already returns a str
    car = {}
    car_type = soup.select("div.breadcrumb a.NormalLink")
    print(car_type)
    car["brand"] = car_type[2].text
    if len(car_type) >= 5:
        car["type"] = car_type[4].text
    car["type2"] = car_type[3].text
    car["title"] = soup.select_one(
        "div.right-info.info-right-width div.infos-head-title span").text
    car["price"] = soup.select_one("div.car-price-box div#price b").text
    return car
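A usage sketch; the listing URL is a made-up placeholder following the site's used-car detail-page pattern, not taken from the original code:

# Usage sketch (hypothetical listing URL)
car = get_used_car_page("https://c.8891.com.tw/usedauto-infos-1234567.html")
print(car.get("brand"), car.get("type2"), car.get("price"))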
Example #6
def get_new_car_brand():
    # print("start")
    url = "https://c.8891.com.tw"
    ss = myutils.get_session()
    # get https://c.8891.com.tw/Models
    res = ss.get(url=url + "/Models", headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    # print(soup.select("div.scroll-area"))
    # Get the list of car brands
    new_car_brand_list = []
    for a in soup.select("div.scroll-area li"):
        new_car_brand = {}
        new_car_brand["country"] = a["country"]
        new_car_brand["brand_id"] = a["id"]
        atag = a.select_one("a")
        new_car_brand["brand"] = atag.text.strip()
        new_car_brand["link"] = url + atag["href"]
        new_car_brand_list.append(new_car_brand)
    return new_car_brand_list
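A short usage sketch printing a few fields from the scraped brand list:

# Usage sketch
brands = get_new_car_brand()
for b in brands[:5]:
    print(b["brand_id"], b["brand"], b["country"], b["link"])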
Example #7
def get_article_content(url: str, ss):
    conn = mongo_service.get_mongo_conn()
    db = conn["data"]
    coll = db["car_article"]
    cursor = coll.find({})

    # ss (a requests session) is passed in by the caller; don't recreate it
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    count = 1
    for art_url in cursor:
        art_dict = {}
        print(art_url["_id"], "\n")
        art_dict["_id"] = art_url["_id"]
        res = ss.get(url=art_url["_id"], headers=header)
        soup = myutils.get_soup(res.text)
        content = soup.select_one("div.entry-content.single-page")
        pdate = content.select_one("div.postdayau").text
        art_dict["post_time"] = datetime.datetime.strptime(pdate.split("By")[0].strip(), "%d %b, %Y")
        print(art_dict["post_time"])
        main_content = ""
        for tag in content:
            if tag.name == "p":
                if tag.text.find("Text & Photo") == -1:
                    main_content += tag.text
                    main_content += "\n"
            elif tag.name == "h2":
                main_content += "=t{}=t\n".format(tag.string)
        art_dict["content"] = main_content
        print(art_dict)
        count += 1
        if count == 5:
            break
        time.sleep(random.randint(1, 5))
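The date parsing above expects strings like "12 Mar, 2020 By ..."; a quick worked example of the "%d %b, %Y" format (the sample string is assumed, modeled on the split("By") call above):

import datetime

pdate = "12 Mar, 2020 By Editor"  # assumed sample of the postdayau text
parsed = datetime.datetime.strptime(pdate.split("By")[0].strip(), "%d %b, %Y")
print(parsed)  # 2020-03-12 00:00:00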
Example #8
def get_pic_page_url(url: str):
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(req.text)
    url_list = [a["href"] for a in soup.select("div.jp-bg-color.mt10 a")]
    return url_list[1]
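get_pic_page_url pairs naturally with Example #4: resolve the photo page for a model, then collect its image URLs. A hedged sketch (the model-page URL is an assumed example, and the hard-coded url_list[1] index is taken on faith from the original):

# Usage sketch chaining Example #8 with Example #4 (assumed model-page URL)
pic_page = get_pic_page_url("https://c.8891.com.tw/audi/a1-sportback")
pics = get_new_car_pic(pic_page)
print(len(pics), "image urls collected")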
Example #9
from json.decoder import JSONDecodeError
from collections import defaultdict
import myutils
from bs4 import BeautifulSoup
import time
import random
import json
from service import job_service
from urllib import parse
import pandas as pd

ss = myutils.get_session()
first_url = "https://www.104.com.tw/jobs/search/?ro=0&keyword={}&jobsource=2018indexpoc"
keyword = "AI"

def get_page(page_num: int) -> dict:
    header = myutils.get_header()
    header["Accept"] = "application/json, text/javascript, */*; q=0.01"
    header["Accept-Encoding"] = "gzip, deflate, br"
    header["Accept-Language"] = "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    header["Connection"] = "keep-alive"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = first_url.format(keyword) + "&order=1"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["X-Requested-With"] = "XMLHttpRequest"

    list_url = "https://www.104.com.tw/jobs/search/list?ro=0&kwop=7&keyword={}&order=15&asc=0&page={}&mode=s&jobsource=2018indexpoc"
    list_url = list_url.format(keyword, str(page_num))