def id_page_one(pre_cate, i):
    """Request one s.taobao.com search page and print the raw response text.

    Args:
        pre_cate: Passed through to ``get_params2`` as its first argument.
        i: Passed through to ``get_params2`` as its second argument.
    """
    search_url = "https://s.taobao.com/search"
    # The request below disables certificate verification, so silence the
    # warning urllib3 would otherwise emit for it.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    response = requests.get(
        search_url,
        headers=get_headers2(),
        params=get_params2(pre_cate, i),
        verify=False,
    )
    print(response.text)
def one_detail_info(goods_id):
    """Scrape one item.jd.com product page into a dict.

    Args:
        goods_id: Item id substituted into ``https://item.jd.com/{}.html``.

    Returns:
        dict with the keys ``source``, ``goods_id``, ``title``, ``imgsSrc``,
        ``shop_name``, ``price`` and ``spcification`` (key spelling kept as
        is, since it is part of the returned data). ``videoUrl`` is present
        only when the page contains a ``mainVideoId``.

    Raises:
        AttributeError: When a required pattern (``playUrl``, ``venderId``,
            ``cat``, the ``"p"`` price field or ``colorSize``) is not found
            in the corresponding response.
    """
    url = "https://item.jd.com/{}.html".format(goods_id)
    res = requests.get(url, headers=get_headers3()).text
    page = etree.HTML(res)  # parse once and reuse for every XPath query

    product_info = {}
    product_info['source'] = url
    product_info['goods_id'] = goods_id
    product_info['title'] = ''.join(
        page.xpath("//html[@lang='zh-CN']/head/title/text()"))
    product_info['imgsSrc'] = [
        "https:" + ''.join(li.xpath("./img/@src"))
        for li in page.xpath("//ul[@class='lh']/li")
    ]
    product_info['shop_name'] = ''.join(
        page.xpath("//div[@class='J-hove-wrap EDropdown fr']/div/div/a/@title"))

    # Video id: only present on pages that embed a main video.
    video_match = re.search(r'mainVideoId":.*?"(\d+)"', res, re.S)
    if video_match is not None:
        video_res = requests.get(
            "https://c.3.cn/tencent/video_v3",
            params=get_params2(video_match.group(1)),
            headers=get_headers4(url),
        ).text
        product_info['videoUrl'] = re.search(
            r'playUrl":.*?"(.*?)"', video_res, re.S).group(1)

    # venderId and cat are needed as parameters of the price request.
    venderId = re.search(r'venderId:.*?(\d+)', res, re.S).group(1)
    stock_url = "https://c0.3.cn/stock"
    cat = re.search(r'cat:.*?\[(.*?)\]', res, re.S).group(1)
    price_re = re.compile(r'"p":.*?"(.*?)"', re.S)
    stock_res = requests.get(
        stock_url,
        headers=get_headers4(url),
        params=get_params3(goods_id, venderId, cat),
    ).text
    product_info['price'] = price_re.search(stock_res).group(1)

    # Colour name -> image link of that colour.
    color_dict = {
        ''.join(color.xpath("./@data-value")):
            'https:' + ''.join(color.xpath("./a/img/@src"))
        for color in page.xpath(
            "//div[@id='choose-attr-1']/div[@class='dd']/div")
    }

    # All SKU (specification) entries of the product.
    # NOTE(review): eval() executes text scraped from a remote page; this is
    # unsafe on untrusted input. Consider json.loads/ast.literal_eval once the
    # exact format of the colorSize payload has been confirmed.
    sku_list = eval(
        '[' + re.search(r'colorSize.*?\[(.*?)\]', res, re.S).group(1) + ']')

    specification = {}
    for sku in sku_list:
        sku_res = requests.get(
            stock_url,
            headers=get_headers4(url),
            params=get_params3(sku['skuId'], venderId, cat),
        ).text
        price = price_re.search(sku_res).group(1)
        sku.pop('skuId')
        values = list(sku.values())
        # The result is keyed by the string form of the remaining SKU values;
        # an entry is emitted only when one of the values is a known colour.
        for value in values:
            if value in color_dict:
                specification[str(values)] = {
                    "price": price,
                    'url': color_dict[value],
                }

    product_info['spcification'] = specification
    pprint.pprint(specification)
    return product_info
def one_page(cate, page):
    """Fetch one Suning search result page and request every listed product.

    The search response is pretty-printed (as the reconstructed JSON text)
    and each product's m.suning.com page is requested; the product responses
    are not used and nothing is returned.

    Args:
        cate: Passed through to ``get_params1`` as its first argument.
        page: Passed through to ``get_params1`` as its second argument.

    Raises:
        AttributeError: When the expected ``{ ... jlf_fold_onoff`` span is
            not found in the search response.
    """
    search_url = "http://search.suning.com/emall/mobile/wap/clientSearch.jsonp"
    res_temp = requests.get(
        search_url,
        params=get_params1(cate, page),
        headers=get_headers2(),
    ).text

    # Keep everything from the first "{" up to the "jlf_fold_onoff" key, drop
    # the dangling quote and comma left before that key, and close the object
    # again so the text can be parsed as JSON.
    captured = re.search(r'\{(.*?)jlf_fold_onoff', res_temp, re.S).group(1)
    res_eve = '{' + captured.rstrip('"').rstrip(',') + '}'
    pprint.pprint(res_eve)

    for goods in json.loads(res_eve)['goods']:
        product_url = "https://m.suning.com/product/0000000000/{}.html".format(
            goods['partnumber'])
        requests.get(product_url, params=get_params2(), headers=get_headers3())
def one_id_plus(pre_cate, i):
    """Return the product ids listed in the ``allNids`` array of a search page.

    Args:
        pre_cate: Passed through to ``get_params2`` as its first argument.
        i: Passed through to ``get_params2`` as its second argument.

    Returns:
        list: The evaluated elements of the ``allNids`` array.

    Raises:
        AttributeError: When the response contains no ``allNids`` array.
    """
    url = "https://s.taobao.com/search"
    # The request below disables certificate verification, so silence the
    # warning urllib3 would otherwise emit for it.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(
        url,
        headers=get_headers2(),
        params=get_params2(pre_cate, i),
        verify=False,
    ).text

    nids = re.search(r'allNids.*?\[(.*?)\]', res, re.S).group(1)
    # NOTE(review): eval() executes text scraped from a remote page; this is
    # unsafe on untrusted input. Consider json.loads/ast.literal_eval once the
    # exact format of the allNids payload has been confirmed.
    return list(eval('[' + nids + ']'))
def one_detail(goods_id):
    """Scrape one Suning product (mobile page plus PC page) into a dict.

    Args:
        goods_id: Product id substituted into
            ``https://m.suning.com/product/0000000000/{}.html``.

    Returns:
        dict with the keys ``title``, ``goods_id``, ``shop_name``, ``source``,
        ``spcification`` (key spelling kept as is, since it is part of the
        returned data), ``price`` and ``imgsSrc``. ``videoUrl`` is present
        only when the PC page contains one. ``price`` is the lowest
        specification price as a float, or ``None`` when no specification
        could be matched to a price.

    Raises:
        AttributeError: When ``toPcUrl``, ``clusterMap`` or the JSONP payload
            of the price response is not found.
        IndexError: When the mobile page has fewer than two ``shopName``
            occurrences.
    """
    url2 = "https://m.suning.com/product/0000000000/{}.html".format(goods_id)
    # Every request below disables certificate verification, so silence the
    # warning urllib3 would otherwise emit for it.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res2 = requests.get(url2, params=get_params2(), headers=get_headers3(),
                        verify=False).text
    mobile_page = etree.HTML(res2)  # parse once; reused for the image list

    product_info = {}
    product_info['title'] = ''.join(
        mobile_page.xpath("//span[@id='product-name']/text()"))
    product_info['goods_id'] = goods_id
    # The second shopName occurrence on the page is the one that is used.
    product_info['shop_name'] = re.findall(
        r'shopName:.*?"(.*?)"', res2, re.S)[1]
    # The PC URL is embedded with "/" written as the literal text \u002F.
    product_info['source'] = "https:" + re.search(
        r'toPcUrl.*?"(.*?)"', res2, re.S).group(1).replace('\\u002F', '/')

    res3 = requests.get(product_info['source'], headers=get_headers4(),
                        verify=False).text

    # NOTE(review): eval() executes text scraped from a remote page; this is
    # unsafe on untrusted input. Consider json.loads/ast.literal_eval once the
    # exact format of the clusterMap payload has been confirmed.
    cluster = eval('[' + re.search(
        r'clusterMap":.*?\[(.*?)colorList', res3, re.S
    ).group(1).rstrip('"').strip().rstrip(','))

    part_numbers = []
    part_prefixes = []
    for clu in cluster:
        for item in clu['itemCuPartNumber']:
            part_numbers.append(item['partNumber'])
            # First five characters of the part number, repeated twice.
            part_prefixes.append(item['partNumber'][0:5] * 2)

    # Build the URL of the per-specification price request.
    url4 = ('https://icps.suning.com/icps-web/getVarnishAllPriceNoCache/'
            + ','.join(part_numbers) + '_028_0280199_'
            + ','.join(part_prefixes)
            + '_1_getClusterPrice.jsonp?callback=getClusterPrice')
    res4_temp = requests.get(url4, headers=get_headers5(), verify=False).text
    # The response is JSONP; the JSON payload sits between the parentheses.
    price_items = json.loads(
        re.search(r'\((.*?)\)', res4_temp, re.S).group(1))
    price_dict = {str(item['cmmdtyCode']): item['price']
                  for item in price_items}

    pc_page = etree.HTML(res3)  # parse once for the colour and version lists

    # Colour title -> first image of that colour's own page.
    img_dict = {}
    for color in pc_page.xpath("//dl[@id='colorItemList']/dd/ul/li"):
        url5 = 'https:' + ''.join(color.xpath("./a/@href"))
        res5 = requests.get(url5, headers=get_headers4(), verify=False).text
        imgs = json.loads(''.join(etree.HTML(res5).xpath(
            "//script[@type='application/ld+json']/text()")))
        img_dict[''.join(color.xpath("./@title"))] = imgs['images'][0]

    # Version sku -> version title.
    version_dict = {
        ''.join(version.xpath("./@sku")): ''.join(version.xpath("./@title"))
        for version in pc_page.xpath("//dl[@id='versionItemList']/dd/ul/li")
    }

    # Every "version&colour" combination, remembering the version's sku id.
    merge_dict = {}
    for sku_id, version_title in version_dict.items():
        for color_title, image_url in img_dict.items():
            merge_dict[version_title + '&' + color_title] = {
                'url': str(image_url),
                'id': sku_id,
            }

    # Keep only the combinations whose sku id has a known price.
    merge_price = {
        name: {'price': str(price_dict[entry['id']]), 'url': entry['url']}
        for name, entry in merge_dict.items()
        if entry['id'] in price_dict
    }
    product_info['spcification'] = merge_price

    price_list = [float(entry['price']) for entry in merge_price.values()]
    # min() raises ValueError on an empty sequence; a product without any
    # priced specification gets None instead of aborting the whole scrape.
    product_info['price'] = min(price_list) if price_list else None

    video = re.search(r'videoUrl":.*?"(.*?)"', res3, re.S)
    if video is not None:
        product_info['videoUrl'] = video.group(1)

    product_info['imgsSrc'] = [
        "https:" + ''.join(img.xpath("./@src"))
        for img in mobile_page.xpath("//div[@class='swiper-slide']/img")
    ]
    return product_info
def get_detail_info(cate):
    """Scrape every product listed on the JD listing page of one category.

    Args:
        cate: Category name; looked up in the name -> URL mapping built from
            the JD category API.

    Returns:
        list[dict]: One product dict per id found in the listing page's
        ``wids`` value (see ``_jd_item_info`` for the keys).

    Raises:
        KeyError: When ``cate`` is not a known category name.
        AttributeError: When a required pattern is missing from a response.
    """
    cate_dict = _jd_category_urls()
    listing = requests.get(cate_dict['%s' % cate],
                           headers=get_headers3()).text
    # NOTE(review): eval() executes text scraped from a remote page; this is
    # unsafe on untrusted input. Consider json.loads/ast.literal_eval once the
    # exact format of the wids payload has been confirmed.
    ids = eval(
        '[' + re.search(r"wids.*?'(.*?)'", listing, re.S).group(1) + ']')

    product_info_list = []
    for item_id in ids:
        product_info = _jd_item_info(item_id)
        pprint.pprint(product_info)
        product_info_list.append(product_info)
    return product_info_list


def _jd_category_urls():
    """Return a mapping of JD category name -> listing URL."""
    url = 'https://dc.3.cn/category/get?&callback=getCategoryCallback'
    res = requests.get(url, headers=get_headers1()).text
    # The response is JSONP; the JSON payload sits between the parentheses.
    payload = json.loads(re.search(r'\((.*?)\)', res, re.S).group(1))

    # Each leaf's 'n' field reads "<url>|<name>||...".
    name_re = re.compile(r'\|(.*?)\|\|', re.S)
    url_re = re.compile(r'(.*?)\|', re.S)

    cate_dict = {}
    for level1 in payload['data']:
        for level2 in level1['s']:
            for level3 in level2['s']:
                for leaf in level3['s']:
                    cate_name = name_re.search(leaf['n']).group(1)
                    cate_url = url_re.search(leaf['n']).group(1)
                    if '-' in cate_url:
                        # A dash-separated category id becomes the
                        # comma-separated ``cat`` query value.
                        cate_dict[cate_name] = (
                            'https://list.jd.com/list.html?cat='
                            + cate_url.replace('-', ','))
                    else:
                        cate_dict[cate_name] = 'https://' + cate_url
    return cate_dict


def _jd_item_info(item_id):
    """Scrape one item.jd.com product page into a dict.

    The result has the keys ``source``, ``goods_id``, ``title``, ``imgsSrc``,
    ``shop_name`` and ``spcification``; ``videoUrl`` and ``price`` are added
    only when they are found.
    """
    url = "https://item.jd.com/{}.html".format(item_id)
    print(url)
    res = requests.get(url, headers=get_headers3()).text
    page = etree.HTML(res)  # parse once and reuse for every XPath query

    product_info = {}
    product_info['source'] = url
    product_info['goods_id'] = item_id
    product_info['title'] = ''.join(
        page.xpath("//html[@lang='zh-CN']/head/title/text()"))
    product_info['imgsSrc'] = [
        "https:" + ''.join(li.xpath("./img/@src"))
        for li in page.xpath("//ul[@class='lh']/li")
    ]
    product_info['shop_name'] = ''.join(
        page.xpath("//div[@class='J-hove-wrap EDropdown fr']/div/div/a/@title"))

    # Video id: only present on pages that embed a main video.
    video_match = re.search(r'mainVideoId":.*?"(\d+)"', res, re.S)
    if video_match is not None:
        video_res = requests.get(
            "https://c.3.cn/tencent/video_v3",
            params=get_params2(video_match.group(1)),
            headers=get_headers4(url),
        ).text
        product_info['videoUrl'] = re.search(
            r'playUrl":.*?"(.*?)"', video_res, re.S).group(1)

    # venderId and cat are needed as parameters of the price request.
    venderId = re.search(r'venderId:.*?(\d+)', res, re.S).group(1)
    stock_url = "https://c0.3.cn/stock"
    cat = re.search(r'cat:.*?\[(.*?)\]', res, re.S).group(1)
    price_re = re.compile(r'"p":.*?"(.*?)"', re.S)
    stock_res = requests.get(
        stock_url,
        headers=get_headers4(url),
        params=get_params3(item_id, venderId, cat),
    ).text
    price_match = price_re.search(stock_res)
    if price_match is not None:
        product_info['price'] = price_match.group(1)

    # Colour name -> image link of that colour.
    color_dict = {
        ''.join(color.xpath("./@data-value")):
            'https:' + ''.join(color.xpath("./a/img/@src"))
        for color in page.xpath(
            "//div[@id='choose-attr-1']/div[@class='dd']/div")
    }

    # All SKU (specification) entries of the product.
    # NOTE(review): eval() executes text scraped from a remote page; this is
    # unsafe on untrusted input. Consider json.loads/ast.literal_eval once the
    # exact format of the colorSize payload has been confirmed.
    sku_list = eval(
        '[' + re.search(r'colorSize.*?\[(.*?)\]', res, re.S).group(1) + ']')

    specification = {}
    for sku in sku_list:
        sku_res = requests.get(
            stock_url,
            headers=get_headers4(url),
            params=get_params3(sku['skuId'], venderId, cat),
        ).text
        price = price_re.search(sku_res).group(1)
        sku.pop('skuId')
        values = list(sku.values())
        # The result is keyed by the string form of the remaining SKU values;
        # an entry is emitted only when one of the values is a known colour.
        for value in values:
            if value in color_dict:
                specification[str(values)] = {
                    "price": price,
                    'url': color_dict[value],
                }

    product_info['spcification'] = specification
    return product_info
def one_detail(goods_id):
    """Fetch the m.suning.com product page for ``goods_id`` and pretty-print it.

    NOTE(review): a longer ``one_detail`` is defined earlier in this file;
    this later definition replaces it when the module is loaded. Confirm
    that is intended.

    Args:
        goods_id: Product id substituted into the product page URL.
    """
    # The request below disables certificate verification, so silence the
    # warning urllib3 would otherwise emit for it.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    page_url = "https://m.suning.com/product/0000000000/{}.html".format(goods_id)
    response = requests.get(
        page_url,
        params=get_params2(),
        headers=get_headers3(),
        verify=False,
    )
    pprint.pprint(response.text)
etree.HTML(res).xpath("//html[@lang='zh-CN']/head/title/text()")) lis = etree.HTML(res).xpath("//ul[@class='lh']/li") imgsSrc_list = [] for li in lis: imgsSrc = "https:" + ''.join(li.xpath("./img/@src")) imgsSrc_list.append(imgsSrc) product_info['imgsSrc'] = imgsSrc_list product_info['shop_name'] = ''.join( etree.HTML(res).xpath( "//div[@class='J-hove-wrap EDropdown fr']/div/div/a/@title")) # 获取视频id if re.search('mainVideoId\"\:.*?\"(\d+)\"', res, re.S) != None: mainVideoId = re.search('mainVideoId\"\:.*?\"(\d+)\"', res, re.S).group(1) url1 = "https://c.3.cn/tencent/video_v3" res1 = requests.get(url1, params=get_params2(mainVideoId), headers=get_headers4(url)).text product_info['videoUrl'] = re.search('playUrl\"\:.*?\"(.*?)\"', res1, re.S).group(1) # 获取价格请求参数中的venderId venderId = re.search('venderId\:.*?(\d+)', res, re.S).group(1) url2 = "https://c0.3.cn/stock" goods_id = '70108536727' cat = re.search('cat\:.*?\[(.*?)\]', res, re.S).group(1) res2 = requests.get(url2, headers=get_headers4(url), params=get_params3(goods_id, venderId, cat)).text product_info['price'] = re.search('\"p\"\:.*?\"(.*?)\"', res2, re.S).group(1) # 获取商品的所有规格id color_dict = {} for color in etree.HTML(res).xpath(