def get_xxx_detail(text, table_name):
    """Parse a menu-list HTML page and save one row per menu entry.

    The direct children of ``div.rstdtl-menu-lst`` are walked in document
    order.  A child whose first CSS class is ``rstdtl-menu-lst__heading``
    only updates the current menu type; every other child is treated as a
    menu entry and stored through ``save_data`` with the keys ``type``
    (text of the most recent heading, ``""`` before the first one),
    ``name``, ``price``, ``description`` and, when the entry contains an
    ``<img>`` with a non-empty ``src``, ``img`` (whatever ``down_file``
    returns for that URL).

    Args:
        text: HTML source of the page.
        table_name: Passed through to ``save_data`` as the target table.

    Returns:
        None.  When the page has no ``div.rstdtl-menu-lst`` nothing is saved.
    """
    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        soup = BeautifulSoup(text, "html.parser")
        menu_lst_tag = soup.find(name="div",
                                 attrs={"class": "rstdtl-menu-lst"})
        if menu_lst_tag is None:
            # Page without a menu list: nothing to parse.
            return

        menu_type = ""
        # Only direct children are needed, hence recursive=False.
        for child in menu_lst_tag.findChildren(recursive=False):
            # A child without a class attribute yields None here; treat it
            # like any other non-heading child instead of crashing.
            classes = child.get("class") or []
            if classes and classes[0] == "rstdtl-menu-lst__heading":
                menu_type = my_util.getTagText(child)
                continue

            info = {
                "type": menu_type,
                "name": my_util.getTagText(
                    child.find(
                        name="p",
                        attrs={"class": "rstdtl-menu-lst__menu-title"})),
            }

            img_tag = child.find(name="img")
            # An <img> without a src attribute gives None; normalise to "".
            img_href = (img_tag.get("src") or "") if img_tag is not None else ""
            if img_href:
                # Local file name is the last path component of the URL.
                info["img"] = down_file(
                    img_href, img_href[img_href.rfind("/") + 1:])

            info["price"] = my_util.getTagText(
                child.find(name="p",
                           attrs={"class": "rstdtl-menu-lst__price"}))
            info["description"] = my_util.getTagText(
                child.find(name="p",
                           attrs={"class": "rstdtl-menu-lst__ex"}))
            save_data(table_name, info, dbHelp)
    finally:
        # Always release the connection, even if parsing or saving fails.
        dbHelp.closeDB()
def get_party_detail(links):
    """Fetch each course page in *links* and save it to ``STORE_PARTY``.

    For every link the page is downloaded with ``get_html``.  When that
    returns the sentinel ``"EOF"`` or ``"ERR"`` a failure message is printed
    and the link is skipped.  Otherwise the course title, optional image,
    description and selected rows of the course data table are collected
    into a dict and passed to ``save_data``.

    Saved keys: ``link``, ``name``, ``description``; ``img`` when the page
    has an image with a non-empty ``src``; and ``price`` / ``num`` /
    ``free_time`` / ``content`` for the table rows whose header text matches
    the corresponding Japanese label.

    Args:
        links: Iterable of page URLs to fetch.

    Returns:
        None.
    """
    # Table header text -> key under which the cell text is stored.
    info_map = {
        "コース料金": "price",
        "品数": "num",
        "滞在可能時間": "free_time",
        "コース内容": "content"
    }

    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        for link in links:
            text = get_html(link)
            if text == "EOF" or text == "ERR":
                print("获取失败:" + link)
                continue

            info = {"link": link}
            soup = BeautifulSoup(text, "html.parser")

            title_tag = soup.find(name="h3",
                                  attrs={"class": "course-dtl__course-title"})
            info["name"] = my_util.getTagText(title_tag)

            img_div_tag = soup.find(name="div",
                                    attrs={"class": "course-dtl__img"})
            img_tag = img_div_tag.find(name="img") if img_div_tag else None
            # An <img> without a src attribute gives None; normalise to "".
            img_href = (img_tag.get("src") or "") if img_tag is not None else ""
            if img_href:
                # Local file name is the last path component of the URL.
                info["img"] = down_file(
                    img_href, img_href[img_href.rfind("/") + 1:])

            desc_tag = soup.find(name="div",
                                 attrs={"class": "course-dtl__desc"})
            info["description"] = my_util.getTagText(desc_tag)

            table_tag = soup.find(
                name="table",
                attrs={
                    "class": "c-table c-table--form course-dtl__data-table"
                })
            if table_tag:
                for tr in table_tag.select("tbody tr"):
                    th_text = my_util.getTagText(tr.find(name="th"))
                    if th_text in info_map:
                        info[info_map[th_text]] = my_util.getTagText(
                            tr.find(name="td"))

            save_data("STORE_PARTY", info, dbHelp)
    finally:
        # Always release the connection, even if one link raises.
        dbHelp.closeDB()
def worker(thread_name, thread_idx):
    """Crawl this thread's share of the listing pages and store the details.

    The thread starts at page ``thread_idx + 1`` (pages are 1-based) and
    advances by ``THREAD_NUM`` pages per iteration.  Each page is fetched
    with ``get_html``, its store links are extracted with
    ``parse_store_link`` and handed to ``get_detail_info`` together with a
    per-thread ``DBHelper``.

    Resume support: if ``<thread_name>_err.txt`` exists, its first line is
    read as the number of items already handled.  The starting page and the
    offset inside that page are derived from it and the file is removed.
    When ``get_html`` returns ``"ERR"`` or ``get_detail_info`` returns
    ``-1``, the current handled-item count is written back to that file and
    the loop stops.  ``"EOF"`` from ``get_html`` stops the loop without
    writing the file (presumably: no more pages).

    Args:
        thread_name: Name used for log output and for the progress file.
        thread_idx: Zero-based index of this thread.

    Returns:
        None.
    """
    print(thread_name)
    print(thread_idx)
    print("################")

    # page_idx starts at 1.
    page_idx = thread_idx + 1
    handled_item = 0
    start_idx_in_page = 0

    # If a previous run failed, resume from the point where it stopped.
    file_name = thread_name + "_err.txt"
    if os.path.exists(file_name):
        with open(file_name, "r") as f:
            line = f.readline()
            if line:
                handled_item = int(line)
                start_idx_in_page = handled_item % PAGE_ITEM
                page_idx = page_idx + (handled_item // PAGE_ITEM) * THREAD_NUM
        os.remove(file_name)

    def save_progress():
        # Persist how many items were handled so the next run can resume.
        with open(file_name, "w") as f:
            f.write(str(handled_item))

    # Multi-threaded: each thread holds its own database object.
    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    try:
        while True:
            url = (user_config.ginza_url + str(page_idx) +
                   user_config.search_opt)
            print(thread_name + " Get:" + url)
            text = get_html(url)
            if text == "EOF":
                break
            if text == "ERR":
                save_progress()
                break

            links = parse_store_link(text, start_idx_in_page)
            result = get_detail_info(links, dbHelp)
            if result == -1:
                save_progress()
                break

            handled_item += result
            page_idx += THREAD_NUM
            # Only the first page after a resume is entered part-way; every
            # following page must be processed from its first item.
            start_idx_in_page = 0
    finally:
        # Always release the connection, even if crawling raises.
        dbHelp.closeDB()