Beispiel #1
0
 def parse2(self, response):
     """Process the second half of a listing page and request chat data for its ids.

     Reads ``last_page_pids``, ``sku2title``, ``currentpage`` and ``totalpage``
     from ``response.meta``. When ``self.first_pettern`` matches and its first
     match is non-empty, the titles found by ``self.sku_pattern1`` are merged
     into ``meta["sku2title"]`` and the matched ids are appended to the ids
     carried over in ``last_page_pids``. Otherwise only the carried-over ids
     are queried.

     Yields:
         One Request for the checkChat endpoint, handled by ``self.parse3``
         and carrying ``response.meta``.

     Raises:
         Exception: with the request URL, when the pattern does not match at
             all and ``currentpage <= 2 * totalpage - 1``.
     """
     meta = response.meta
     last_page_pids = meta["last_page_pids"]
     check_chat_url = "https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083"
     matches = self.first_pettern.findall(response.text)
     if matches:
         second_half_pids = matches[0]
         if second_half_pids:
             # Raw string: "\s" / "\S" are regex escapes, not valid Python
             # string escapes, so a plain literal triggers a syntax warning.
             sku2title = {
                 sku[0]: re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])
                 for sku in self.sku_pattern1.findall(response.text)
             }
             meta["sku2title"].update(sku2title)
             self.logger.info("jd_skuids 3")
             yield Request(
                 url=check_chat_url.format(last_page_pids + "," + second_half_pids),
                 callback=self.parse3,
                 meta=meta)
         else:
             self.logger.info(response.request.url)
             # Empty match: there is no second half page, query the first half only.
             yield Request(
                 url=check_chat_url.format(last_page_pids),
                 callback=self.parse3,
                 meta=meta)
     else:
         if meta["currentpage"] <= 2 * meta["totalpage"] - 1:
             self.logger.info(response.request.url)
             raise Exception(response.request.url)
         else:
             self.logger.info(response.request.url)
             # No match at all: there is no second half page, query the first half only.
             yield Request(
                 url=check_chat_url.format(last_page_pids),
                 callback=self.parse3,
                 meta=meta)
Beispiel #2
0
 def make_request_from_data(self, data):
     """Build the initial listing Request from a raw queue entry.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed carries ``(cate_id, brand_id, name)`` and produces a
     list.jd.com request handled by ``self.parse``. A type-3 seed carries a
     serialized Request that is restored with ``Request.deserialize``. Any
     other seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         cate_id, brand_id, name = parsed.value
         cat_query = urllib.parse.urlencode({"cat": cate_id})
         if brand_id:
             # The unpacking requires cate_id to have exactly three
             # comma-separated parts; the parts themselves are not used.
             _cid1, _cid2, _cid3 = re.split(',', cate_id)
             brand_query = urllib.parse.urlencode({"ev": "exbrand_" + name})
             listing_url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                 cat_query, brand_query)
         else:
             listing_url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                 cat_query)
         request_meta = {
             "_seed": raw_seed,
             "headers": {"Referer": "https://www.jd.com/"},
         }
         return Request(url=listing_url,
                        meta=request_meta,
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
 def make_request_from_data(self, data):
     """Build the first folded-comments Request (page 0) for a SKU seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed,
     which is printed. A type-0 seed's value is used as the product id; a
     type-3 seed's value is a serialized Request restored with
     ``Request.deserialize``. Any other seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     print(parsed)
     if parsed.type == 0:
         sku = parsed.value
         comments_url = "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page=0&pageSize=10".format(
             sku)
         request_headers = {
             "Connection": "close",
             "Referer": "https://item.m.jd.com/{0}.html".format(sku),
         }
         request_meta = {
             "_seed": raw_seed,
             "current_page": 0,
             "headers": request_headers,
         }
         return Request(url=comments_url,
                        meta=request_meta,
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #4
0
 def parse2(self, response):
     """Request chat data for the ids of a listing page.

     Starts from ``response.meta["last_page_pids"]`` and, when
     ``self.first_pettern`` matches with a non-empty first match, appends
     that match after a comma. The request URL is printed first.

     Yields:
         One Request for the checkChat endpoint, handled by ``self.parse3``
         and carrying only ``_seed`` in its meta.
     """
     pid_list = response.meta["last_page_pids"]
     found = self.first_pettern.findall(response.text)
     print(response.request.url)
     # No match, or an empty first match, means there is no second half page:
     # in that case only the ids carried over in meta are queried.
     if found and found[0]:
         pid_list = pid_list + "," + found[0]
     yield Request(
         url="https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083".format(pid_list),
         callback=self.parse3,
         meta={"_seed": response.meta["_seed"]})
Beispiel #5
0
 def make_request_from_data(self, data):
     """Build a Request from a raw queue entry whose seed value is the URL.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     Type 0 requests ``seed.value`` directly with ``self.parse`` as callback;
     type 3 restores a serialized Request with ``Request.deserialize``. Any
     other seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     seed_type = parsed.type
     if seed_type == 0:
         request_meta = {"_seed": raw_seed}
         return Request(url=parsed.value,
                        meta=request_meta,
                        priority=0,
                        callback=self.parse)
     if seed_type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #6
0
 def make_request_from_data(self, data):
     """Build the phone-number lookup Request from a raw queue entry.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed's value is stripped and placed into the shouji.xpcha.com
     URL; a type-3 seed's value is a serialized Request restored with
     ``Request.deserialize``. Any other seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         number = parsed.value.strip()
         lookup_url = "http://shouji.xpcha.com/{0}.html".format(number)
         return Request(url=lookup_url,
                        meta={"_seed": raw_seed},
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #7
0
 def parse1(self, response):
     """Request the second half of a listing page from the first half's ids.

     The seed in ``response.meta["_seed"]`` carries
     ``(cate_id, brand_id, page, s)``. When ``self.first_pettern`` matches
     with a non-empty first match, that match is passed as ``show_items`` to
     the follow-up request for ``page + 1`` / ``s + 30``; the request is
     handled by ``self.parse2``. Nothing is yielded otherwise.
     """
     parsed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = parsed.value
     matches = self.first_pettern.findall(response.text)
     if not matches:
         return
     first_half_pids = matches[0]
     if not first_half_pids:
         return

     next_page = page + 1
     next_s = s + 30
     cat_query = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         brand_query = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
         follow_url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             cat_query, brand_query, next_page, next_s, first_half_pids)
         # The Referer points back at the first-half page of the same listing.
         referer = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             cat_query, brand_query, next_page - 1, next_s - 30)
     else:
         follow_url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             cat_query, next_page, next_s, first_half_pids)
         referer = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             cat_query, next_page - 1, next_s - 30)

     follow_up = Request(url=follow_url, callback=self.parse2, priority=2)
     follow_up.headers["Referer"] = referer
     follow_up.meta["_seed"] = str(
         Seed((cate_id, brand_id, next_page, next_s), type=2))
     follow_up.meta["last_page_pids"] = first_half_pids
     yield follow_up
Beispiel #8
0
 def make_request_from_data(self, data):
     """Build the first comment-list Request (wq.jd.com) for a SKU seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed's value is used as the SKU id: the request carries a fixed
     set of mobile-browser headers (including a hard-coded cookie) plus a
     randomised ``dydmc_delay`` between 0.15 and 0.25. A type-3 seed's value
     is a serialized Request restored with ``Request.deserialize``. Any other
     seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         sku = parsed.value
         comments_url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=skuJDEvalB&version=v2&pagesize=10&sceneval=2&skucomment=1&score=0&sku={}&sorttype=6&page=1&t=0.5156075450518778".format(
             sku)
         # NOTE(review): the cookie below is a captured session value baked
         # into the source; presumably it expires - confirm how it is refreshed.
         request_headers = {
             'Connection': 'close',
             'Host': 'wq.jd.com',
             'accept': '*/*',
             'sec-fetch-site': 'same-site',
             'sec-fetch-mode': 'no-cors',
             'sec-fetch-dest': 'script',
             "Referer": "https://item.m.jd.com/ware/view.action?wareId={}&sid=null".format(sku),
             'accept-encoding': 'gzip, deflate, br',
             'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
             'User-Agent': 'Mozilla/5.0 (Linux; Android 10; HRY-AL00a; HMSCore 5.1.1.303) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 HuaweiBrowser/11.0.7.303 Mobile Safari/537.36',
             "cookie": "__jdc=122270672; mba_muid=16087105855231456793479; shshshfpa=b86c237d-b506-9cc9-730d-39db2f5ea48c-1608710586; shshshfpb=aW2xjA0PZevBiTvJrQ6rk4A%3D%3D; retina=1; webp=1; visitkey=31140776387466944; sbx_hot_h=null; deviceVersion=83.0.4103.106; deviceOS=android; deviceOSVersion=10; deviceName=Chrome; rurl=https%3A%2F%2Fwqs.jd.com%2Ffaqs%2Findex.html%3Fsceneval%3D2%26ptag%3D7001.1.124%26productId%3D12991458%26ispg%3D%26_fd%3Djdm%26jxsid%3D16109541564584400343; equipmentId=A75Q6PQS36IHI62HBEUGC44IVLERE7257UWVYTGEXPMR6NOKARSVVF2Q6EBPSVGNR537LK6GQN3ENW47JREOEXNAVI; __jdv=122270672%7Cdirect%7C-%7Cnone%7C-%7C1614224630058; sc_width=360; shshshfp=c6774e911e47825ddd51cefc23f9b157; wxa_level=1; cid=9; jxsid=16145705280303310338; __jda=122270672.16087105855231456793479.1608710585.1614224630.1614570529.10; wq_ug=14; fingerprint=794164a430090764096f40466260c718; mt_xid=V2_52007VwMVU1ReUlsbQB1YBmUDF1ZaXlpYGk8RbFVuBEBVWV9RRkhIGw4ZYlcRWkFQWwlIVR5aAjAAR1BZX1tZHnkaXQZnHxNQQVlSSx9JElgFbAEbYl9oUmoXSB5dDWYKE1BZXlNeF08cVQNvMxJbWV8%3D; wq_logid=1614571192.282863947; wqmnx1=MDEyNjM5M3AuL3d3MiY2NjQ1eGQtTTFBaSBsby8zd3IzZTUyNy00UkghKQ%3D%3D; __jdb=122270672.9.16087105855231456793479|10.1614570529; mba_sid=16145705290954323095988279117.9; __wga=1614571199267.1614570547761.1614225998734.1610954174749.5.6; PPRD_P=UUID.16087105855231456793479-LOGID.1614571199300.300139660; jxsid_s_t=1614571199496; jxsid_s_u=https%3A//item.m.jd.com/ware/view.action; sk_history=70241615154%2C101609%2C615036%2C54761686610%2C1399903%2C10024515889185%2C10381689654%2C12991458%2C100010062010%2C58070892025%2C100007627009%2C; shshshsID=e45b3b58ca53b7ab42489de6ebc02d6b_5_1614571200418",
         }
         request_meta = {
             "_seed": raw_seed,
             "dydmc_delay": 0.15 + random.random() * 0.1,
             "headers": request_headers,
         }
         return Request(url=comments_url,
                        meta=request_meta,
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #9
0
 def make_request_from_data(self, data):
     """Build the brand/publisher list Request for a category seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     For a type-0 seed the value is a comma-separated category id; when its
     first component is ``'1713'`` the URL uses ``md=2`` / ``list_pub``,
     otherwise ``md=1`` / ``list_brand``. A type-3 seed's value is a
     serialized Request restored with ``Request.deserialize``. Any other seed
     type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         top_category = re.split(',', parsed.value)[0]
         if top_category == '1713':
             md, list_kind = 2, "pub"
         else:
             md, list_kind = 1, "brand"
         list_url = 'https://list.jd.com/list.html?cat={0}&trans=1&md={1}&my=list_{2}'.format(
             parsed.value, md, list_kind)
         return Request(url=list_url,
                        meta={"_seed": raw_seed},
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #10
0
 def make_request_from_data(self, data):
     """Build the price-lookup Request (p.3.cn) for a SKU seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed's value is appended after the ``J_`` prefix together with
     ``self.usrid`` as ``pduid``. A type-3 seed's value is a serialized
     Request restored with ``Request.deserialize``. Any other seed type
     returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         price_endpoint = "http://p.3.cn/prices/mgets?&type=1&skuIds=J_"
         price_url = price_endpoint + parsed.value + '&pduid=' + self.usrid
         request_meta = {
             "_seed": raw_seed,
             "headers": {"Connection": "keep-alive"},
         }
         return Request(url=price_url,
                        meta=request_meta,
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #11
0
 def parse(self, response):
     """Extract province/city/carrier for a phone number, or fall back to Baidu.

     The phone number is the value of the seed in ``response.meta["_seed"]``.
     When ``self.pro_city_pattern`` matches and the province is not "未知",
     one item dict is yielded; the city falls back to the province when the
     matched city is empty. In every other case a Request to
     haoma.baidu.com is yielded, handled by ``self.parse1``.
     """
     raw_seed = response.meta["_seed"]
     phonenumber = Seed.parse_seed(raw_seed).value
     regions = self.pro_city_pattern.findall(response.text)
     carriers = self.telcompany_pattern.findall(response.text)

     if regions and regions[0][0] != "未知":
         province, city = regions[0][0], regions[0][1]
         # NOTE(review): carriers[0] raises IndexError when the carrier
         # pattern does not match - confirm that cannot happen here.
         yield {
             "phonenumber": phonenumber,
             "province": province,
             "city": province if city == "" else city,
             "company": carriers[0],
         }
         return

     # No usable region on this page: look the number up at the fallback source.
     fallback_url = "https://haoma.baidu.com/phoneSearch?search={0}".format(
         phonenumber)
     yield Request(
         url=fallback_url,
         meta={
             "_seed": raw_seed,
             "headers": {"Referer": "https://www.baidu.com/"},
         },
         priority=1,
         callback=self.parse1)
 def make_request_from_data(self, data):
     """Build the first comment-list Request (wq.jd.com, page 0) for a SKU seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed's value is used as the SKU id; a type-3 seed's value is a
     serialized Request restored with ``Request.deserialize``. Any other seed
     type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         sku = parsed.value
         comments_url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0".format(
             sku)
         request_headers = {
             "Connection": "close",
             "Referer": "https://item.m.jd.com/{0}.html".format(sku),
         }
         return Request(url=comments_url,
                        meta={"_seed": raw_seed, "headers": request_headers},
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #13
0
 def make_request_from_data(self, data):
     """Build the first product-page comments Request (club.jd.com) for a SKU seed.

     ``data`` is decoded with ``self.redis_encoding`` and parsed into a Seed.
     A type-0 seed's value is used as the product id; a type-3 seed's value
     is a serialized Request restored with ``Request.deserialize``. Any other
     seed type returns ``None``.
     """
     raw_seed = bytes_to_str(data, self.redis_encoding)
     parsed = Seed.parse_seed(raw_seed)
     if parsed.type == 0:
         sku = parsed.value
         comments_url = "https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1".format(sku)
         request_headers = {
             "Connection": "close",
             "Referer": "https://item.m.jd.com/{0}.html".format(sku),
         }
         return Request(url=comments_url,
                        meta={"_seed": raw_seed, "headers": request_headers},
                        priority=0,
                        callback=self.parse)
     if parsed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(parsed.value, self)
     return None
Beispiel #14
0
 def parse3(self, response):
     """Turn the checkChat payload into per-SKU shop info and request prices.

     The JSON captured by ``self.json_pettern`` is decoded; every truthy
     entry becomes a record keyed by its ``pid``, enriched with the category
     and brand from the seed and the title from ``meta["sku2title"]``. The
     records are stored in ``meta["info"]`` and a price Request covering all
     keys of ``meta["sku2title"]`` is yielded, handled by ``self.parse4``.

     Raises:
         Exception: with the request URL, when the decoded payload is falsy.
     """
     parsed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = parsed.value
     payload = json.loads(self.json_pettern.findall(response.text)[0])
     if not payload:
         raise Exception(response.request.url)

     meta = response.meta
     info_by_pid = {}
     for entry in payload:
         if not entry:
             continue
         pid = entry.get("pid")
         seller = entry.get("seller")
         info_by_pid[pid] = {
             "skuid": pid,
             "cate_id": cate_id,
             "brand_id": brand_id,
             "shopid": entry.get("shopId"),
             "venderid": entry.get("venderId", None),
             "shop_name": seller,
             # 1 when the seller name contains "自营", else 0.
             "ziying": 1 if seller and seller.find("自营") != -1 else 0,
             "title": meta["sku2title"][str(pid)],
             # 1 when the title contains "京东超市", else 0.
             "chaoshi": 1 if "京东超市" in meta["sku2title"][str(pid)] else 0,
         }

     meta["info"] = info_by_pid
     meta["dydmc_delay"] = 1
     meta["headers"] = {
         "Connection": "close",
         "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36",
     }
     sku_list = "%2CJ_".join(list(meta["sku2title"]))
     pduid = str(random.randint(100000000, 999999999))
     price_url = "https://p.3.cn/prices/mgets?&type=1&skuIds=J_" + sku_list + '&pduid=' + pduid
     yield Request(url=price_url,
                   callback=self.parse4,
                   meta=meta,
                   priority=4)
 def parse(self, response):
     """Yield the comments of one folded-comments page and request the next page.

     The SKU id is the value of the seed in ``response.meta["_seed"]``. The
     first match of ``self.comments_pattern`` is the JSON comment list; when
     it is the literal ``'[]'`` nothing is yielded. Otherwise one item per
     comment is yielded and, while ``current_page`` is below the last page
     index (capped at 99, ten comments per page), a Request for the following
     page is yielded with ``self.parse`` as callback.
     """
     skuid = Seed.parse_seed(response.meta["_seed"]).value
     count = self.allcnt_pattern.findall(response.text)
     raw_comments = self.comments_pattern.findall(response.text)[0]
     if raw_comments == '[]':
         return

     copied_fields = (
         "id", "creationTime", "isTop", "isMobile", "userLevelName",
         "userClientShow", "plusAvailable", "firstCategory", "secondCategory",
         "thirdCategory", "discussionId", "referenceId", "referenceTime",
         "nickname",
     )
     for comment in json.loads(raw_comments):
         record = {}
         for field in copied_fields:
             value = comment.get(field)
             # isTop is the only field emitted in string form.
             record[field] = str(value) if field == "isTop" else value
         record["commentcout"] = count[0]
         record["current_page"] = response.meta["current_page"]
         yield record

     last_page_index = max(0, min((int(count[0]) - 1) // 10, 99))
     if response.meta["current_page"] < last_page_index:
         next_page = response.meta["current_page"] + 1
         next_url = "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page={1}&pageSize=10".format(
             skuid, next_page)
         yield Request(
             url=next_url,
             meta={
                 "_seed": response.meta["_seed"],
                 "commentcount": count[0],
                 "current_page": next_page,
                 "headers": {
                     "Connection": "close",
                     "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
                 },
             },
             priority=1,
             callback=self.parse)
Beispiel #16
0
 def parse(self, response):
     """Fan out one Request per listing page of a category/brand seed.

     The total page count is the first match of ``self.totalpage_perttern``.
     For page index ``i`` (1-based) the request targets ``page = 2*i - 1`` and
     ``s = 60*(i-1) + 1``; its Referer is the JD home page for the first page
     and the previous listing page otherwise. Each request is handled by
     ``self.parse1`` and carries a type-1 seed
     ``(cate_id, brand_id, name, page, s)``.

     Raises:
         Exception: with the request URL, when no page count is found.
     """
     parsed = Seed.parse_seed(response.meta["_seed"])
     page_matches = self.totalpage_perttern.findall(response.text)
     if not page_matches:
         raise Exception(response.request.url)

     total_pages = int(page_matches[0])
     listing_template = 'https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1'
     for i in range(1, total_pages + 1):
         page = 2 * i - 1
         s = 60 * (i - 1) + 1
         cate_id, brand_id, name = parsed.value
         query = urllib.parse.urlencode({"cat": cate_id})
         if brand_id:
             # Brand-filtered listings add the "ev" parameter right after "cat".
             query = query + "&" + urllib.parse.urlencode({"ev": "exbrand_" + name})
         url = listing_template.format(query, page, s)
         if i == 1:
             refer = "https://www.jd.com/"
         else:
             refer = listing_template.format(
                 query, 2 * (i - 1) - 1, 60 * (i - 2) + 1)
         page_seed = str(Seed((cate_id, brand_id, name, page, s), type=1))
         yield Request(url=url,
                       callback=self.parse1,
                       meta={
                           "dydmc_delay": 2.5,
                           "totalpage": total_pages,
                           "currentpage": page,
                           "_seed": page_seed,
                           "headers": {
                               "Connection": "close",
                               "Referer": refer,
                           },
                       },
                       priority=1)
 def parse(self, response):
     """Yield the comments of the first page and request every comment page.

     The first match of ``self.comments_pattern`` is rewritten with a fixed
     sequence of textual replacements and then evaluated with
     ``literal_eval``. One item per comment is yielded, followed by one
     Request per page index from 0 up to the last page index (capped at 99,
     ten comments per page), each handled by ``self.parse1``.
     """
     skuid = Seed.parse_seed(response.meta["_seed"]).value
     count = self.allcnt_pattern.findall(response.text)

     literal_text = self.comments_pattern.findall(response.text)[0]
     # Order matters: booleans are capitalised first, then quoted when
     # followed by a comma.
     rewrites = (
         ("\t", ""),
         ("\n", ""),
         (",}", "}"),
         ("false", 'False'),
         ("true", 'True'),
         ('False,', '"False",'),
         ('True,', '"True",'),
     )
     for old, new in rewrites:
         literal_text = literal_text.replace(old, new)

     copied_fields = (
         "id", "creationTime", "isTop", "isMobile", "userLevelName",
         "userClientShow", "plusAvailable", "firstCategory", "secondCategory",
         "thirdCategory", "discussionId", "referenceId", "referenceTime",
         "nickname",
     )
     for comment in literal_eval(literal_text):
         record = {}
         for field in copied_fields:
             value = comment.get(field)
             # isTop is the only field emitted in string form.
             record[field] = str(value) if field == "isTop" else value
         record["commentcout"] = count[0]
         yield record

     last_page_index = max(0, min((int(count[0]) - 1) // 10, 99))
     page_template = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page={1}"
     for page_index in range(last_page_index + 1):
         yield Request(
             url=page_template.format(skuid, page_index),
             meta={
                 "_seed": response.meta["_seed"],
                 "commentcount": count[0],
                 "headers": {
                     "Connection": "close",
                     "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
                 },
             },
             priority=1,
             callback=self.parse1)
Beispiel #18
0
    def parse4(self, response):
        """Attach each price entry to the meta and request that SKU's comments.

        The response body is a JSON list of price entries. For every entry
        the first two characters of its ``id`` are dropped, a cleaned price
        from ``self.clean_price`` is added, and the entry is stored in
        ``response.meta["prices"]`` before a comment-list Request handled by
        ``self.parse5`` is yielded.

        Raises:
            Exception: when the decoded body is falsy.
        """
        entries = json.loads(response.text)
        if not entries:
            raise Exception("unvalid error!")

        meta = response.meta
        comments_template = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0"
        for entry in entries:
            # Strip the two-character prefix from the id in place, then work on a copy.
            entry["id"] = entry["id"][2:]
            price = dict(entry)
            price['clean_price'] = self.clean_price(price)
            meta["prices"] = price
            meta["dydmc_delay"] = 0.25
            meta["headers"] = {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(price["id"]),
            }
            yield Request(url=comments_template.format(price["id"]),
                          meta=meta,
                          callback=self.parse5,
                          priority=5)
Beispiel #19
0
 def parse0(self, response):
     """Yield one listing Request per (brand id, name) pair found in the page.

     The category id is the value of the seed in ``response.meta["_seed"]``.
     Each match of ``self.pattern`` supplies the brand id and name; the
     request is handled by ``self.parse`` and carries a new seed
     ``(cate_id, brand_id, name)``.
     """
     parsed = Seed.parse_seed(response.meta["_seed"])
     for match in self.pattern.findall(response.text):
         cate_id = parsed.value
         brand_id = match[0]
         name = match[1]
         query = urllib.parse.urlencode({"cat": cate_id})
         if brand_id:
             # Brand-filtered listings add the "ev" parameter right after "cat".
             query = query + "&" + urllib.parse.urlencode({"ev": "exbrand_" + name})
         listing_url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(query)
         request_meta = {
             "dydmc_delay": 2.5,
             "_seed": str(Seed(value=(cate_id, brand_id, name))),
             "headers": {"Referer": "https://www.jd.com/"},
         }
         yield Request(url=listing_url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)
Beispiel #20
0
 def parse1(self, response):
     """Collect first-half titles and request the second half of a listing page.

     The seed in ``response.meta["_seed"]`` carries
     ``(cate_id, brand_id, name, page, s)``. Titles found by
     ``self.sku_pattern1`` are stripped of tags, tabs and newlines and keyed
     by SKU id. The first match of ``self.first_pettern`` is passed as
     ``show_items`` to the follow-up request for ``page + 1`` / ``s + 30``,
     handled by ``self.parse2``.

     Yields:
         One Request whose meta carries a type-2 seed
         ``(cate_id, brand_id, page + 1, s + 30)``, ``last_page_pids``,
         ``sku2title``, ``totalpage`` and ``currentpage``.

     Raises:
         Exception: with the request URL, when ``self.first_pettern`` does
             not match or its first match is empty.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, name, page, s = seed.value
     # Raw string: "\s" / "\S" are regex escapes, not valid Python string
     # escapes, so a plain literal triggers a syntax warning.
     sku2title = {
         sku[0]: re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])
         for sku in self.sku_pattern1.findall(response.text)
     }
     r1 = self.first_pettern.findall(response.text)
     if not r1 or not r1[0]:
         raise Exception(response.request.url)
     items = r1[0]

     page, s = page + 1, s + 30
     en_cate_id = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         ename = urllib.parse.urlencode({"ev": "exbrand_" + name})
         url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             en_cate_id, ename, page, s, items)
         # The Referer points back at the first-half page of the same listing.
         referer = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             en_cate_id, ename, page - 1, s - 30)
     else:
         url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             en_cate_id, page, s, items)
         referer = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             en_cate_id, page - 1, s - 30)

     request = Request(url=url, callback=self.parse2, priority=2)
     request.headers["Referer"] = referer
     request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
     request.meta["last_page_pids"] = items
     request.meta["sku2title"] = sku2title
     request.meta["totalpage"] = response.meta["totalpage"]
     request.meta["currentpage"] = page - 1
     yield request
Beispiel #21
0
 def parse1(self, response):
     """Collect first-half titles and request either the second half or chat data.

     The seed in ``response.meta["_seed"]`` carries
     ``(cate_id, brand_id, page, s)``. Titles found by ``self.sku_pattern1``
     are stripped of tags, tabs and newlines and keyed by SKU id. When the
     first match of ``self.first_pettern`` lists exactly 30 comma-separated
     ids, the second half of the page (``page + 1`` / ``s + 30``) is
     requested and handled by ``self.parse2``. Otherwise the checkChat
     endpoint is queried directly for the matched ids, handled by
     ``self.parse3``.

     Raises:
         Exception: with the request URL, when ``self.first_pettern`` does
             not match or its first match is empty.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = seed.value
     # Raw string: "\s" / "\S" are regex escapes, not valid Python string
     # escapes, so a plain literal triggers a syntax warning.
     sku2title = {
         sku[0]: re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])
         for sku in self.sku_pattern1.findall(response.text)
     }
     r1 = self.first_pettern.findall(response.text)
     if not r1 or not r1[0]:
         raise Exception(response.request.url)
     items = r1[0]

     if len(items.split(",")) != 30:
         # Fewer than a full half page: there is no second half to load.
         # NOTE(review): sku2title collected above is not attached to the
         # meta forwarded here - confirm parse3 does not rely on it.
         yield Request(
             url="https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083".format(items),
             callback=self.parse3,
             meta=response.meta)
         return

     page, s = page + 1, s + 30
     en_cate_id = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         en_brand_id = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
         url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             en_cate_id, en_brand_id, page, s, items)
         request = Request(url=url, callback=self.parse2, priority=2)
         # The Referer points back at the first-half page of the same listing.
         request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             en_cate_id, en_brand_id, page - 1, s - 30)
         self.logger.info("jd_skuids 6")
     else:
         url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             en_cate_id, page, s, items)
         request = Request(url=url, callback=self.parse2, priority=2)
         request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             en_cate_id, page - 1, s - 30)
         self.logger.info("jd_skuids 7")
     request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
     request.meta["last_page_pids"] = items
     request.meta["sku2title"] = sku2title
     request.meta["totalpage"] = response.meta["totalpage"]
     request.meta["currentpage"] = page - 1
     yield request