Beispiel #1
0
def get_xxx_detail(text, table_name):
    """Parse a menu-list page and store one row per menu entry.

    Locates the ``div.rstdtl-menu-lst`` container in ``text`` and walks its
    direct children in document order. A child whose first CSS class is
    ``rstdtl-menu-lst__heading`` sets the category ("type") applied to the
    entries that follow it; every other child is treated as a menu entry
    and handed to ``save_data``.

    Args:
        text: HTML source of the page.
        table_name: Table name forwarded unchanged to ``save_data``.

    Returns:
        None. If the page has no menu container, nothing is saved and no
        database connection is opened.
    """
    soup = BeautifulSoup(text, "html.parser")
    menu_lst_tag = soup.find(name="div", attrs={"class": "rstdtl-menu-lst"})
    if menu_lst_tag is None:
        # find() yields None when nothing matches; there is nothing to store.
        return

    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        menu_type = ""
        # Only direct children are inspected: headings and entries are
        # siblings, so one level is enough.
        for entry_tag in menu_lst_tag.findChildren(recursive=False):
            # A tag without a class attribute returns None from get().
            css_classes = entry_tag.get("class") or []
            if css_classes and css_classes[0] == "rstdtl-menu-lst__heading":
                menu_type = my_util.getTagText(entry_tag)
                continue

            info = {}
            info["type"] = menu_type
            info["name"] = my_util.getTagText(
                entry_tag.find(
                    name="p", attrs={"class": "rstdtl-menu-lst__menu-title"}))

            img_tag = entry_tag.find(name="img")
            img_href = img_tag.get("src") if img_tag else ""
            # Truthiness also covers an <img> that has no src attribute
            # (get() returns None), which must not reach rfind().
            if img_href:
                info["img"] = down_file(img_href,
                                        img_href[img_href.rfind("/") + 1:])

            info["price"] = my_util.getTagText(
                entry_tag.find(
                    name="p", attrs={"class": "rstdtl-menu-lst__price"}))
            info["description"] = my_util.getTagText(
                entry_tag.find(
                    name="p", attrs={"class": "rstdtl-menu-lst__ex"}))
            save_data(table_name, info, dbHelp)
    finally:
        # Release the connection even if parsing or saving raised.
        dbHelp.closeDB()
Beispiel #2
0
def get_party_detail(links):
    """Fetch each course page in ``links`` and store it in ``STORE_PARTY``.

    For every link the page is downloaded with ``get_html``; pages for which
    it returns the sentinel ``"EOF"`` or ``"ERR"`` are reported and skipped.
    From each remaining page the title, image, description and selected
    rows of the course data table are collected into a dict and passed to
    ``save_data``.

    Args:
        links: Iterable of course page URLs.

    Returns:
        None.
    """
    # Table header text on the page -> key used in the stored record.
    info_map = {
        "コース料金": "price",
        "品数": "num",
        "滞在可能時間": "free_time",
        "コース内容": "content"
    }

    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        for link in links:
            info = {}
            info["link"] = link
            text = get_html(link)
            if text in ("EOF", "ERR"):
                print("获取失败:" + link)
                continue

            soup = BeautifulSoup(text, "html.parser")
            title_tag = soup.find(name="h3",
                                  attrs={"class": "course-dtl__course-title"})
            info["name"] = my_util.getTagText(title_tag)

            img_div_tag = soup.find(name="div",
                                    attrs={"class": "course-dtl__img"})
            img_tag = img_div_tag.find(name="img") if img_div_tag else None
            img_href = img_tag.get("src") if img_tag else ""
            # Truthiness also covers an <img> without a src attribute
            # (get() returns None), which must not reach rfind().
            if img_href:
                info["img"] = down_file(img_href,
                                        img_href[img_href.rfind("/") + 1:])

            desc_tag = soup.find(name="div",
                                 attrs={"class": "course-dtl__desc"})
            info["description"] = my_util.getTagText(desc_tag)

            table_tag = soup.find(
                name="table",
                attrs={"class": "c-table c-table--form course-dtl__data-table"})
            if table_tag:
                for tr in table_tag.select("tbody tr"):
                    th_text = my_util.getTagText(tr.find(name="th"))
                    if th_text in info_map:
                        info[info_map[th_text]] = my_util.getTagText(
                            tr.find(name="td"))

            save_data("STORE_PARTY", info, dbHelp)
    finally:
        # Release the connection even if a page failed to parse or save.
        dbHelp.closeDB()
Beispiel #3
0
def worker(thread_name, thread_idx):
    """Crawl this thread's share of listing pages and store their details.

    Pages are numbered from 1. The worker starts at page ``thread_idx + 1``
    and advances by ``THREAD_NUM`` pages each iteration, so the workers
    interleave over the listing.

    Progress is checkpointed in ``<thread_name>_err.txt``: when a fetch
    returns ``"ERR"`` or ``get_detail_info`` returns ``-1``, the number of
    items handled so far is written there and the worker stops. On the next
    run that count is read back (and the file removed) to resume from the
    corresponding page and in-page offset. A fetch result of ``"EOF"`` ends
    the loop without writing a checkpoint.

    Args:
        thread_name: Name used in log output and for the checkpoint file.
        thread_idx: Zero-based index of this worker; determines its first
            page.

    Returns:
        None.
    """
    print(thread_name)
    print(thread_idx)
    print("################")

    # page_idx starts from 1.
    page_idx = thread_idx + 1
    handled_item = 0
    start_idx_in_page = 0

    # If an earlier run stopped on an error, resume from where it stopped.
    file_name = thread_name + "_err.txt"
    if os.path.exists(file_name):
        with open(file_name, "r") as f:
            line = f.readline().strip()
        if line:
            handled_item = int(line)
            start_idx_in_page = handled_item % PAGE_ITEM
            page_idx = page_idx + (handled_item // PAGE_ITEM) * THREAD_NUM
        os.remove(file_name)

    # Each worker thread holds its own database object.
    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        while True:
            url = (user_config.ginza_url + str(page_idx) +
                   user_config.search_opt)
            print(thread_name + " Get:" + url)
            text = get_html(url)
            if text == "EOF":
                break
            if text == "ERR":
                with open(file_name, "w") as f:
                    f.write(str(handled_item))
                break

            links = parse_store_link(text, start_idx_in_page)

            result = get_detail_info(links, dbHelp)
            if result == -1:
                with open(file_name, "w") as f:
                    f.write(str(handled_item))
                break
            handled_item += result
            page_idx += THREAD_NUM
            # Only the resumed page starts mid-way; every following page
            # must be parsed from its first item.
            start_idx_in_page = 0
    finally:
        # Release the connection even if fetching or saving raised.
        dbHelp.closeDB()