# -*- coding: utf-8 -*-
import Creep_Tools
# _Analyze_Soup lives in Creep_Tools (see _Main below); _Page_num is assumed to as well
from Creep_Tools import _Analyze_Soup, _Page_num


def _catch_Index_Url():
    """Read category index URLs from a TXT file and collect all product-page URLs."""
    print("Start fetching URLs from the TXT file")
    All_line = Creep_Tools.Read_Line_by_Line._Read_Line_by_Line("JD_index_url.txt")
    for line in All_line:
        fields = line.split("@@@")
        index_name = fields[0]
        index_url = fields[1]
        # Fetch and parse this index URL into a soup
        soup = _Analyze_Soup(index_url)
        # Read the maximum page count under this index URL
        Max_Pag = int(_Page_num(soup))
        print(index_name + " MAX_Pag " + str(Max_Pag))
        if Max_Pag == 0:
            print(Max_Pag)
        elif Max_Pag == 1:
            try:
                soup = _Analyze_Soup(index_url)
                if soup is not None:
                    # Collect every product-page URL on the single listing page
                    url_list = parser_for_one_url(soup)
                    for url in url_list:
                        print(index_name + "@@@" + url + "@@@" + str(Max_Pag))
                        commdity_url = index_name + "@@@" + url
                        file.write(commdity_url + "\n")  # `file` is opened in __main__
            except Exception:
                pass
        else:
            # Inclusive upper bound so the last page is also visited
            for page in range(1, Max_Pag + 1):
                page_param = "&page=" + str(page)
                print(index_name + page_param)
                try:
                    url = index_url + page_param
                    soup = _Analyze_Soup(url)
                    if soup is not None:
                        # Collect every product-page URL on this listing page
                        url_list = parser_for_one_url(soup)
                        for url in url_list:
                            print(index_name + "@@@" + url + "@@@" + page_param)
                            commdity_url = index_name + "@@@" + url + "@@@" + page_param
                            file.write(commdity_url + "\n")
                except Exception:
                    pass
        print("")
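# For reference, minimal sketches of the two helpers _catch_Index_Url leans on.
# Neither is defined in this file; the requests/BeautifulSoup approach and the
# pagination markup (a "span.p-skip em b" node holding the page count) are
# assumptions, not the project's actual Creep_Tools implementation.
import requests
from bs4 import BeautifulSoup


def _Analyze_Soup_sketch(url):
    """Fetch a page and return its BeautifulSoup, or None on any failure."""
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except requests.RequestException:
        return None


def _Page_num_sketch(soup):
    """Read a JD listing's maximum page number; 0 when it cannot be found."""
    if soup is None:
        return 0
    tag = soup.select_one("span.p-skip em b")  # assumed selector for JD listings
    return int(tag.get_text()) if tag and tag.get_text().isdigit() else 0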
def _Main():
    All_Line_Arr = Creep_Tools.Read_Line_by_Line._Read_Line_by_Line("goods_id.log")
    for All_Line in All_Line_Arr:
        Line_arr = All_Line.split('@@@')
        commodity_index = Line_arr[0]
        commodity_url = Line_arr[1]
        # 1. Extract the numeric product ID from a URL like //item.jd.com/<ID>.html
        ID_arr = commodity_url.split('.')
        ID = ID_arr[2]
        ID_arr = ID.split('/')
        ID = ID_arr[1]
        # 2. Create the matching folders
        code_picture_path = "E:/JD/" + commodity_index + "/" + ID + "/code_picture"
        detail_picture_path = "E:/JD/" + commodity_index + "/" + ID + "/detail_picture"
        Creep_Tools._mkdir(code_picture_path)
        Creep_Tools._mkdir(detail_picture_path)
        # 3. Fetch the page HTML and 4. parse it into a soup
        soup = Creep_Tools._Analyze_Soup(commodity_url)
        # Download the product gallery pictures
        ul = soup.find('ul', {'class': "lh"})
        if ul:
            all_li = ul.find_all('li')
            num = 0
            for li in all_li:
                num = num + 1
                img = dict(li.contents[0].attrs)['data-url']
                img_url = 'https://img11.360buyimg.com/popWaterMark/%s' % img
                # Number each picture off the base ID (ID1, ID2, ...) without mutating ID
                picture_name = ID + str(num)
                Creep_Tools._Download_Picture(img_url, picture_name, code_picture_path)
        # Fetch the product details
        product_detail = get_product_detail(soup)
        print(product_detail)
        # Fetch the product price
        price = get_product_price(soup)
        print(price)
        # Fetch the detail pictures
        savePath = get_datail_picture(soup)
        print(savePath)
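# For reference, minimal sketches of the filesystem helpers _Main relies on.
# Creep_Tools' real implementations are not shown here; create-if-missing
# directories and saving each image as <save_path>/<name>.jpg are assumptions.
import os


def _mkdir_sketch(path):
    """Create the directory (and any missing parents) if it does not exist."""
    os.makedirs(path, exist_ok=True)


def _Download_Picture_sketch(img_url, name, save_path):
    """Download img_url and write it to <save_path>/<name>.jpg."""
    resp = requests.get(img_url, timeout=10)  # requests imported above
    if resp.status_code == 200:
        with open(os.path.join(save_path, name + ".jpg"), "wb") as f:
            f.write(resp.content)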
def parser_for_one_url(soup):
    """Extract every product-page URL from a category listing soup."""
    url_list = []
    if soup is None:
        print("soup is empty")
        return url_list
    try:
        lists = soup.find_all("ul", {"class": "gl-warp clearfix"})
        for item in lists:
            # Each product's URL sits on the <a> inside its div.p-name
            names = item.find_all("div", {"class": "p-name"})
            for name in names:
                url = name.a["href"]
                if url is not None:
                    url_list.append(url)
    except Exception:
        pass
    return url_list


if __name__ == "__main__":
    with open("JD_commodity_urls.txt", mode="w", encoding="utf-8") as file:
        _catch_Index_Url()
    # Deduplicate after the file handle is closed
    Creep_Tools._Deduplication("JD_commodity_urls.txt")
    # Test URL
    # soup = _Analyze_Soup("http://list.jd.hk/list.html?cat=1319,1525,7057&go=0&gjz=0")
    # parser_for_one_url(soup)
    print("Run finished")
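# For reference, sketches of the remaining Creep_Tools helpers used above.
# These are assumptions about behavior: read a file's lines with newlines
# stripped, and rewrite a file keeping only the first occurrence of each line.
def _Read_Line_by_Line_sketch(path):
    """Return the file's non-empty lines, trailing whitespace stripped."""
    with open(path, encoding="utf-8") as f:
        return [line.rstrip() for line in f if line.strip()]


def _Deduplication_sketch(path):
    """Rewrite the file with duplicate lines removed, order preserved."""
    lines = _Read_Line_by_Line_sketch(path)
    seen, unique = set(), []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    with open(path, mode="w", encoding="utf-8") as f:
        f.write("\n".join(unique) + "\n")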