def get_ypimg_page(self, albums_href, store_id):
    """Scrape one album page and upsert its metadata into the ``albums`` table.

    The page is fetched through ``request_url`` with the instance headers and
    proxy pool. The album name, the description text and the image ids are
    extracted with XPath, one image URL is built per id via ``parse_url``,
    and a single row is written with ``insert ... on duplicate key update``.

    The statement is parameterized: scraped text is passed to the driver as
    values instead of being interpolated into the SQL string.

    :param albums_href: URL of the album page.
    :param store_id: id of the store the album belongs to.
    :return: None
    """

    def _strip_escapes(text):
        # Earlier versions stripped backslashes and newlines from the whole
        # statement; the same normalisation is applied to each text value so
        # that stored rows keep the same shape.
        return re.sub(r"\\|\n", "", text)

    def _normalise_quotes(text):
        # Straight quotes were historically replaced with a curly quote
        # before storage; kept so new rows match existing ones.
        return re.sub("\"|'", "“", text)

    response = request_url(albums_href, headers=self.headers, proxies_list=self.ip_pool)
    try:
        html = etree.HTML(response.text)
    except Exception:
        # Covers a missing response object as well as parser errors.
        print("请求失败:" + albums_href)
        return

    # NOTE(review): lxml may hand back None when no document tree could be
    # built; treat that like a failed request instead of crashing on .xpath.
    name_nodes = []
    if html is not None:
        name_nodes = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")
    if not name_nodes:
        print("请求失败:" + albums_href)
        return

    albums_name = _normalise_quotes(name_nodes[0])

    # The description is stored as the string form of the XPath result list.
    other_msg = html.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
    other_msg = _normalise_quotes(str(other_msg))

    data_id = html.xpath(
        "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")
    # Appending "&tab=min" to the query would request the thumbnail version.
    img_href = str([parse_url(albums_href, image_id, "uid=1") for image_id in data_id])

    albums_info = (
        _strip_escapes(albums_name),
        store_id,
        _strip_escapes(albums_href),
        _strip_escapes(img_href),
        _strip_escapes(other_msg),
    )
    insert_albums = (
        "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) "
        "values (%s,%s,%s,%s,%s) "
        "on duplicate key update "
        "albums_name=%s,store_id=%s,albums_href=%s,img_url=%s,other_msg=%s;"
    )
    # The same five values feed both the insert and the update clause.
    sql_args = albums_info + albums_info
    print(insert_albums, albums_info)

    try:
        self.crs.execute(insert_albums, sql_args)
    except Exception:
        # Presumably a dropped connection: ping it, take a fresh cursor and
        # retry once. A second failure propagates to the caller.
        self.conn.ping()
        self.crs = self.conn.cursor()
        self.crs.execute(insert_albums, sql_args)
    self.conn.commit()
def get_other_msg(self, cookies, base_url, shop_id="A202002021618192140113358"):
    """Request the single-album JSON once per cookie and print its image fields.

    For every cookie a GET request is sent to the URL built from ``base_url``
    and ``self.server_url``, carrying that cookie as the only ``cookie``
    header. Each entry of ``result.goods_list`` in the JSON reply has its
    ``imgs`` and ``imgsSrc`` values printed.

    :param cookies: iterable of cookie objects exposing ``name`` and ``value``.
    :param base_url: base address; it is lower-cased before being passed to
        ``parse_url``.
    :param shop_id: shop identifier sent in the query string; defaults to the
        value that used to be hard-coded.
    :return: None
    :raises KeyError: if the reply lacks ``result``/``goods_list`` or an item
        lacks ``imgs``/``imgsSrc``.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
    # The target URL does not depend on the cookie, so build it once.
    url = parse_url(base_url.lower(), self.server_url, "")

    for item in cookies:
        headers = {
            "User-Agent": user_agent,
            "cookie": "%s=%s" % (item.name, item.value),
        }
        params = {
            "act": "single_album",
            "shop_id": shop_id,
            # Millisecond timestamp, regenerated for every request.
            "time_stamp": int(time.time() * 1000),
        }
        response = requests.get(url, params=params, headers=headers)
        for img in json.loads(response.text)['result']['goods_list']:
            print(img['imgs'])
            print(img['imgsSrc'])
def other_url_handl(self, url, store_id):
    """Resolve the shop id for a store home page and hand off to ``get_other_msg``.

    The home page is requested through ``request_url``; the shop id is read
    from the ``/shop_detail/<id>`` part of the final (redirected) URL. Cookies
    and a base URL are then obtained from ``get_cookies.get_cookies``, the
    JSON endpoint URL is built with ``parse_url``, and request headers are
    built from the cookies before calling ``self.get_other_msg``.

    :param url: home page address of the store.
    :param store_id: store id, passed through to ``get_other_msg``.
    :return: None
    """
    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    if response is None:
        # Nothing came back, so there is no redirected URL to inspect.
        print(url)
        return

    match = re.search(r"/shop_detail/(\w\d+)", response.url)
    if match is None:
        # The redirect target does not look like a shop page; report it.
        print(response.url)
        return
    shop_id = match.group(1)

    # NOTE(review): `ip_pool` is resolved as a module-level name here while the
    # request above uses `self.ip_pool` -- confirm which one is intended.
    cookies, base_url = get_cookies.get_cookies(url, ip_pool)
    url = parse_url(base_url.lower(), self.server_url, "")

    # Only the last cookie ends up in the headers; with no cookies at all the
    # headers stay None.
    other_headers = None
    for item in cookies:
        other_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
            "cookie": "%s=%s" % (item.name, item.value),
        }
    self.get_other_msg(other_headers, url, store_id, shop_id)
def get_ypimg_page(self, yupoo_href_list):
    """Scrape every album page in ``yupoo_href_list`` into ``self.finally_list``.

    Each page is fetched with ``requests.get`` using the instance headers and
    parsed with lxml. The album title, picture count, description text and
    one image URL per ``data-id`` (built with ``parse_url``) are collected
    into a dict, which is printed and appended to ``self.finally_list``.

    :param yupoo_href_list: iterable of album page URLs.
    :return: None
    """
    for href in yupoo_href_list:
        response = requests.get(href, headers=self.headers)
        html = etree.HTML(response.text)
        if html is None:
            # NOTE(review): lxml may hand back None when no document tree
            # could be built; skip this page rather than abort the batch.
            print(href)
            continue

        albums_title = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")
        pic_count = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[2]/text()")
        other_info = html.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
        data_id = html.xpath("//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")

        # One full-size image URL per image id found on the page.
        pic_href = [parse_url(href, image_id, "uid=1") for image_id in data_id]

        # Keys: album name, picture count, picture URLs, other information.
        temp_dict = {
            "相册名称": albums_title,
            "图片数量": pic_count,
            "图片地址": pic_href,
            "其他信息": other_info,
        }
        print(temp_dict)
        self.finally_list.append(temp_dict)
def other_url_handl(self, url, store_id):
    """Get the shop id, build the JSON endpoint URL and prepare request headers.

    The home page is requested through ``request_url``. If the final URL
    points at ``b.oijgvrq.cn`` the request is treated as failed and the whole
    step is retried. Otherwise the shop id is read from the
    ``/shop_detail/<id>`` part of the redirected URL, cookies are obtained
    from ``get_cookies.get_cookies``, and the headers built from them are
    passed to ``self.get_other_msg`` together with the endpoint URL.

    :param url: home page address of the store.
    :param store_id: store id, passed through to ``get_other_msg``.
    :return: None
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"

    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    if response is None:
        # Nothing came back, so there is no redirected URL to inspect.
        print(url)
        return

    # A redirect to this host means the request went wrong: retry until a
    # proper response arrives. Return afterwards so the bad response is not
    # processed as well. Retry depth is limited only by the interpreter's
    # recursion limit.
    if "b.oijgvrq.cn" in response.url:
        return self.other_url_handl(url, store_id)

    # The id in the redirected home page URL is needed to build later URLs.
    match = re.search(r"/shop_detail/(\w\d+)", response.url)
    if match is None:
        print(response.url)
        return
    shop_id = match.group(1)

    cookies = get_cookies.get_cookies(url)
    url = parse_url(response.url.lower(), self.server_url, "")

    # Build the headers from the cookies the server set for the home page;
    # only the last cookie is kept.
    other_headers = None
    for item in cookies:
        other_headers = {
            "User-Agent": user_agent,
            "cookie": "%s=%s" % (item.name, item.value),
        }

    # Some shops do not get a token cookie from the server (or no cookie at
    # all); fall back to a manually supplied token in that case.
    if other_headers is None or "token" not in other_headers["cookie"]:
        other_headers = {
            "User-Agent": user_agent,
            "cookie": "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD;",
        }
    self.get_other_msg(other_headers, url, store_id, shop_id)
headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36" } with open("./ip_pool", "r") as f: content = f.read() ip_pool = json.loads(content) server_url = "service/album/get_album_themes_list.jsp" for url in url_list: response = requests.get(url, proxies=random.choices(ip_pool)[0], headers=headers) shop_id = re.search(r"/shop_detail/(.*)", response.url).group(1) if response is not None: cookies, base_url = get_cookies(url, ip_pool) url = parse_url(base_url.lower(), server_url, "") for item in cookies: other_headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36", # "cookie": "%s=%s" % (item.name, item.value) "cookie": "UM_distinctid=172fef76482421-09a957e7ea6b66-4353761-144000-172fef764838c7; " "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22A202007011648174290197177%22%2C%22first_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%7D; " "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD; " "CNZZDATA1275056938=120497081-1594357916-%7C1594363316; " "JSESSIONID=B74EC8A017C3DDD861F3E6E17F3D6C3A" } params = { "page_index": 1, "act": "single_album",