Esempio n. 1
0
 def get_ypimg_page(self, albums_href, store_id):
     """Scrape one album page and upsert its details into the ``albums`` table.

     The page is fetched through ``request_url`` and parsed with lxml. The
     album name, the free-text description and the per-image URLs (built
     with ``parse_url`` from each image's ``data-id``) are written with an
     ``insert ... on duplicate key update`` statement and committed.

     Values are bound as query parameters rather than interpolated into the
     SQL text, so scraped page content cannot alter the statement and no
     longer needs backslashes/newlines stripped to keep the SQL valid.

     :param albums_href: URL of the album page.
     :param store_id: identifier of the store the album belongs to.
     :return: None
     """
     response = request_url(albums_href, headers=self.headers, proxies_list=self.ip_pool)
     try:
         # Also covers a failed request: presumably request_url returns None
         # in that case (TODO confirm), making ``response.text`` raise.
         html = etree.HTML(response.text)
     except Exception:
         print("请求失败:" + albums_href)
         return
     if html is None:
         # Nothing parsable came back; there is nothing to extract.
         print("请求失败:" + albums_href)
         return

     name_nodes = html.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")
     if not name_nodes:
         # Without a title node the page is not a usable album page.
         print("请求失败:" + albums_href)
         return
     # Quote characters are still normalised so stored names keep the same
     # form as rows written by earlier runs.
     albums_name = re.sub("\"|'", "“", name_nodes[0])

     other_msg = html.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
     other_msg = re.sub("\"|'", "“", str(other_msg))

     data_id = html.xpath(
         "//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")
     # The list of image URLs is stored as its string representation.
     img_href = str([parse_url(albums_href, image_id, "uid=1") for image_id in data_id])

     albums_info = (albums_name, store_id, albums_href, img_href, other_msg)
     # NOTE(review): ``%s`` placeholders assume a MySQL driver with the
     # "format" paramstyle (e.g. PyMySQL / MySQLdb) - confirm for self.crs.
     insert_albums = (
         "insert into albums (albums_name,store_id,albums_href,img_url,other_msg) "
         "values (%s,%s,%s,%s,%s) "
         "on duplicate key update "
         "albums_name=%s,store_id=%s,albums_href=%s,img_url=%s,other_msg=%s;"
     )
     # The same five values feed both the insert and the update clause.
     params = albums_info + albums_info
     print(insert_albums, params)
     try:
         self.crs.execute(insert_albums, params)
     except Exception:
         # Most likely a dropped connection: ping it, take a fresh cursor and
         # retry once. A second failure propagates to the caller.
         self.conn.ping()
         self.crs = self.conn.cursor()
         self.crs.execute(insert_albums, params)
     self.conn.commit()
Esempio n. 2
0
 def get_other_msg(self, cookies, base_url):
     """Request the album JSON for a shop and print each item's image fields.

     The request URL is built by ``parse_url`` from the lower-cased
     ``base_url`` and ``self.server_url``. The response body is decoded as
     JSON and, for every entry of ``result.goods_list``, the ``imgs`` and
     ``imgsSrc`` values are printed.

     :param cookies: iterable of cookie objects exposing ``name`` and
         ``value``. When several are given, the last one is sent (as before).
         When it is empty, the request is sent without a cookie header
         instead of failing on an unbound ``headers`` variable.
     :param base_url: base URL of the shop.
     :return: None
     """
     headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
     }
     # Each iteration overwrites the previous value, so only the last cookie
     # ends up in the header.
     for item in cookies:
         headers["cookie"] = "%s=%s" % (item.name, item.value)
     params = {
         "act": "single_album",
         "shop_id": "A202002021618192140113358",
         # Current time in milliseconds.
         "time_stamp": int(time.time() * 1000),
     }
     url = parse_url(base_url.lower(), self.server_url, "")
     response = requests.get(url, params=params, headers=headers)
     for img in json.loads(response.text)['result']['goods_list']:
         print(img['imgs'])
         print(img['imgsSrc'])
Esempio n. 3
0
 def other_url_handl(self, url, store_id):
     """Resolve a shop's id and request headers, then fetch its album data.

     The home page is requested and the shop id is read from the
     ``/shop_detail/<id>`` part of the final (post-redirect) URL. Cookies
     and the base URL come from ``get_cookies.get_cookies``; the JSON
     endpoint URL is built with ``parse_url`` and everything is handed to
     ``self.get_other_msg``.

     :param url: home page URL of the shop.
     :param store_id: identifier of the store, passed through unchanged.
     :return: None
     """
     response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
     if response is None:
         # No response object at all: report the URL that failed and stop,
         # instead of crashing on ``response.url``.
         print(url)
         return

     match = re.search(r"/shop_detail/(\w\d+)", response.url)
     if match is None:
         # The final URL carries no shop id; report it and stop.
         print(response.url)
         return
     shop_id = match.group(1)

     # NOTE(review): ``ip_pool`` is resolved as a module-level name here while
     # the request above uses ``self.ip_pool`` - confirm this is intended.
     cookies, base_url = get_cookies.get_cookies(url, ip_pool)
     url = parse_url(base_url.lower(), self.server_url, "")

     # Stays None when no cookies were returned; otherwise the last cookie
     # wins, because every iteration replaces the dict.
     other_headers = None
     for item in cookies:
         other_headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
             "cookie": "%s=%s" % (item.name, item.value)
         }
     self.get_other_msg(other_headers, url, store_id, shop_id)
Esempio n. 4
0
 def get_ypimg_page(self, yupoo_href_list):
     """Scrape every album page in ``yupoo_href_list`` and collect the results.

     For each URL the page is downloaded and parsed, one summary dict is
     built (album title nodes, picture-count nodes, per-image URLs and the
     description nodes), printed, and appended to ``self.finally_list``.

     :param yupoo_href_list: iterable of album page URLs.
     :return: None
     """
     for album_url in yupoo_href_list:
         page = requests.get(album_url, headers=self.headers)
         tree = etree.HTML(page.text)

         title_nodes = tree.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[1]/text()")
         count_nodes = tree.xpath("//div[@class='showalbumheader__gallerydec']/h2/span[2]/text()")
         # Evaluated as in the original, although the result is not used below.
         pic_src = tree.xpath("//div[@class='showalbum__parent showalbum__nor nor']//img/@src")
         info_nodes = tree.xpath("//div[@class='showalbumheader__gallerydec']/div[1]/text()")
         image_ids = tree.xpath("//div[@class='showalbum__parent showalbum__nor nor']/div[@class='showalbum__children image__main']/@data-id")

         album = {
             "相册名称": title_nodes,
             "图片数量": count_nodes,
             # One URL per image id, built relative to the album URL.
             "图片地址": [parse_url(album_url, image_id, "uid=1") for image_id in image_ids],
             "其他信息": info_nodes,
         }
         print(album)
         self.finally_list.append(album)
Esempio n. 5
0
 def other_url_handl(self, url, store_id):
     """
     Get the shop_id, build the URL of the JSON endpoint and prepare the
     request headers, then hand everything to ``self.get_other_msg``.

     :param url: home page URL
     :param store_id: store ID
     :return: None
     """
     user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
     response = request_url(url,
                            headers=self.headers,
                            proxies_list=self.ip_pool)
     if response is None:
         # No response object at all: report the URL that failed and stop,
         # instead of crashing on ``response.url``.
         print(url)
         return
     # A request that ended up on the error host is retried until it lands on
     # a proper page. Returning here keeps this frame from going on to
     # process the bad response once the retry has finished.
     # NOTE: the retry is unbounded; a persistently failing URL ends in a
     # RecursionError.
     if "b.oijgvrq.cn" in response.url:
         return self.other_url_handl(url, store_id)
     # The shop id is taken from the URL the home page redirected to; it is
     # needed later to build the data URL.
     match = re.search(r"/shop_detail/(\w\d+)", response.url)
     if match is None:
         print(response.url)
         return
     shop_id = match.group(1)

     cookies = get_cookies.get_cookies(url)
     url = parse_url(response.url.lower(), self.server_url, "")
     # Build the headers from the cookie the server set when the home page
     # was requested (the token field). The last cookie wins because every
     # iteration replaces the dict.
     other_headers = None
     for item in cookies:
         other_headers = {
             "User-Agent": user_agent,
             "cookie": "%s=%s" % (item.name, item.value)
         }
     # Some shops do not get a token cookie from the server (or get no cookie
     # at all), so a fixed token is supplied by hand.
     if other_headers is None or "token" not in other_headers["cookie"]:
         other_headers = {
             "User-Agent": user_agent,
             "cookie":
             "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD;"
         }
     self.get_other_msg(other_headers, url, store_id, shop_id)
Esempio n. 6
0
# Request headers for the home-page requests in the loop below: only a
# desktop Chrome User-Agent is sent.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
}
# Load the proxy pool from the ./ip_pool file, whose content must be JSON.
# The loop below picks one entry at random and passes it as ``proxies``.
with open("./ip_pool", "r") as f:
    content = f.read()
ip_pool = json.loads(content)
# Path handed to parse_url() together with each shop's base URL in the loop
# below.
server_url = "service/album/get_album_themes_list.jsp"
for url in url_list:
    response = requests.get(url,
                            proxies=random.choices(ip_pool)[0],
                            headers=headers)
    shop_id = re.search(r"/shop_detail/(.*)", response.url).group(1)
    if response is not None:
        cookies, base_url = get_cookies(url, ip_pool)
        url = parse_url(base_url.lower(), server_url, "")
        for item in cookies:
            other_headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
                # "cookie": "%s=%s" % (item.name, item.value)
                "cookie":
                "UM_distinctid=172fef76482421-09a957e7ea6b66-4353761-144000-172fef764838c7; "
                "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22A202007011648174290197177%22%2C%22first_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%7D; "
                "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD; "
                "CNZZDATA1275056938=120497081-1594357916-%7C1594363316; "
                "JSESSIONID=B74EC8A017C3DDD861F3E6E17F3D6C3A"
            }
        params = {
            "page_index": 1,
            "act": "single_album",