Example 1
def get_headers(type=1):
    if type == 1:
        headers1 = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
        headers1 = headers_todict(headers1)
        return headers1
    elif type == 2:
        headers2 = '''accept: */*
        accept-encoding: gzip, deflate, br
        accept-language: zh-CN,zh;q=0.9
        upgrade-insecure-requests: 1
        user-agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
        headers2 = headers_todict(headers2)
        return headers2
    elif type == 3:
        headers3 = '''Accept: application/json
User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1
Content-type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9'''
        headers3 = headers_todict(headers3)
        return headers3
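Every example in this collection calls a headers_todict helper that none of the snippets define. Judging from how it is used, it turns a block of copied "Name: value" request-header lines into the dict that scrapy.Request expects. A minimal sketch under that assumption (the name comes from the snippets; the body is inferred, not taken from the original project):

def headers_todict(headers_str):
    """Parse a raw block of 'Name: value' header lines into a dict (assumed helper)."""
    headers = {}
    for line in headers_str.splitlines():
        line = line.strip()
        if not line:
            continue
        # Split on the first colon only: values such as Referer contain colons.
        key, _, value = line.partition(":")
        headers[key.strip()] = value.strip()
    return headers

For instance, headers_todict("accept: */*\nhost: example.com") would return {"accept": "*/*", "host": "example.com"}.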
Example 2
    def parse_kanleyoukan(self, response):
        text = response.text
        youxiao = '(detail_pine)'  # "youxiao" (valid): marker a good response must contain
        youxiao_m = re.search(youxiao, text)
        if youxiao_m:
            text = text.replace("\n", "")
            text = text.replace("\r", "")
            text = text.replace("\t", "")
            meta = response.meta
            shop_id = meta.get("shop_id")
            item_id = meta.get("item_id")

            url2 = '''https://mdskip.taobao.com/core/initItemDetail.htm?isUseInventoryCenter=false&cartEnable=true&service3C=false&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=false&tryBeforeBuy=false&offlineShop=false&itemId={}&showShopProm=false&isPurchaseMallPage=false&itemGmtModified=1568217644000&isRegionLevel=false&household=false&sellerPreview=false&queryMemberRight=true&addressLevel=2&isForbidBuyItem=false&callback=setMdskip&timestamp=1568612546869'''
            headers2 = '''User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36
            Referer: https://detail.tmall.com/item.htm?id=585627813748
            Cookie:t=771e985ad68867dca634b66c8b52c710; cna=RxThFT+HtVACATyweW2UU3HN; tracknick=%5Cu4E09%5Cu9014%5Cu6CB3%5Cu8FD8%5Cu662F%5Cu5929%5Cu5802; _cc_=URm48syIZQ%3D%3D; enc=5DlFftlD20fPoNrYGejylp4qjVAwaVqHVif222OGfTVcQTqAx2FMz1Zq21yB5qgS%2FwJtxLSsCsnMVvodBHNseg%3D%3D; thw=cn; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; hng=CN%7Czh-CN%7CCNY%7C156; cookie2=1cf7677466b3b4247ee06b58db60af76; _tb_token_=3bebd4e0eb356; v=0; mt=ci%3D-1_1; miid=1514173119299498437; uc1=cookie14=UoTbnKMDmEcUIg%3D%3D; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zZPEr%2FCrvCMS%2BG3sTRRWrQ%2BVVTl09ME1KrXdS91s4jJYL5NA7c2uWyfyw%2F9gYrem7NEW7%2FhRF2%2BDZyzjzJUQM%2B3ajmc%2BqsNRORcaRH3CNHyGPkcc%2F%2BLjMbn8%2FEPT%2FB8BQzJOUzWHeXEfYMACGjSPsry1CB514xsVVKb7xQpGcujL%2FGqsEgBrb1wz3x5x7vG9V5OAdii7QZqPQIrqC92RZGPM2m943EN8TkLKCavVsJtrfVF%2B2rncH0VPQJbqgCp1b7IcFLp4aV1X2Gt2nDZo4%2BPKkcowzbgNV4LUNTu6ynXBPWBq0RDDlOX%2FY1ucI%3D; linezing_session=1jM55aS1rIVc20jqjpDgyWCs_1571208987345cSbM_6; _m_h5_tk=3756ebaa20d617d7e1fc48cec3e82ad6_1571233351866; _m_h5_tk_enc=8a6d0092297b00d8f43562a95fa43ddf; l=dBgt4Jcmq1sEN_vbBOCZnurza779sIRAguPzaNbMi_5IL18suR7OkgjmxeJ6cjWfTlYB4dG4psJ9-etkZ4eT6qM8sxAJNxDc.; isg=BJqaNiC5kzLEHR9lMWc-xLee60C8yx6lNJSpOKQTRy34FzpRjFiZtWll46Mux5Y9'''

            url1 = "https://item.taobao.com/item.htm?id={}"
            headers1 = '''User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
            fenlie = "},{"  # "fenlie" (split): delimiter between the JSON objects
            for i in text.split(fenlie):
                text_i = i
                item_id_s = self.match_zhengze(r'itemId\":(\d+)',
                                               text_i)  # item ID
                seller_id_s = self.match_zhengze(r'sellerId":(\d+)',
                                                 text_i)  # seller ID
                cate_id_s = self.match_zhengze(r'categoryId":(\d+)',
                                               text_i)  # category ID
                item = taobao()
                item["item_id"] = item_id_s
                item["seller_id"] = seller_id_s
                item["cate_id"] = cate_id_s
                item["pipeline_level"] = "看了又看"  # "also viewed" feed (literal kept: it routes the pipeline)
                yield item
                if item_id_s:
                    url = url2.format(item_id_s)
                    headers = headers_todict(headers2)
                    yield scrapy.Request(url=url,
                                         callback=self.parse_goods_xiaoliang,
                                         method="GET",
                                         headers=headers,
                                         meta={"item_id": item_id_s})
                    url = url1.format(item_id_s)
                    headers = headers_todict(headers1)
                    # yield scrapy.Request(url=url, callback=self.parse_good_information, method="GET", headers=headers, meta={"item_id": item_id_s, "seller_id": seller_id_s})  # item detail (disabled)
        else:
            request = self.try_again(response)
            if request:
                yield request
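Example 2 also relies on self.match_zhengze ("zhengze" is pinyin for 正则, "regular expression"), another helper these snippets never define. Several call sites pass patterns whose alternatives use different capture groups, so a plausible reading is "return the first non-empty group of the first match, or an empty string". A sketch under that assumption, not the original implementation:

import re

def match_zhengze(self, pattern, text):
    # Assumed semantics: first non-empty capture group of the first match; "" on no match.
    match = re.search(pattern, text)
    if not match:
        return ""
    for group in match.groups():
        if group is not None:
            return group
    return match.group(0)  # pattern without capture groups: return the whole match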
Example 3
    def get_headers(self, type=1):
        if type == 1:
            headers = '''Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Host: hotels.ctrip.com
Pragma: no-cache
Referer: https://www.ctrip.com/
Sec-Fetch-Mode: no-cors
Sec-Fetch-Site: same-site
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
        else:
            headers = '''accept: application/json
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
content-type: application/json;charset=UTF-8
origin: https://hotels.ctrip.com
pragma: no-cache
sec-fetch-mode: cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
Host: m.ctrip.com
'''
        return headers_todict(headers)
Example 4
    def get_headers(self, type=1):
        if type == 1:
            headers = '''Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Host: webh.huajiao.com
Pragma: no-cache
Referer: https://www.huajiao.com/category/1000
Sec-Fetch-Mode: no-cors
Sec-Fetch-Site: same-site
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
        else:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
        return headers_todict(headers)
Example 5
def get_taobao_headers():
    headers = '''accept: */*
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cache-control: no-cache
    pragma: no-cache
    sec-fetch-mode: no-cors
    sec-fetch-site: same-site
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
    return headers_todict(headers)
Example 6
    def get_headers(self):
        headers = '''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Host: m.aliexpress.com
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
        headers = headers_todict(headers)
        return headers
Example 7
    def parse_shopid(self, response):
        youxiao = 'jsonp102({"SCCP'
        text = response.text
        if youxiao in text:
            text = text.replace("\n", "")
            text = text.replace("\r", "")
            text = text.replace("\t", "")
            meta = response.meta
            shop_id = meta.get("shop_id")

            zhuangtai_s = self.match_zhengze(r'"SCCP_2_[^"]*":([^\}]*)\}', text)  # zhuangtai = status
            item = taobao()
            item["zhuangtai"] = zhuangtai_s
            item["pipeline_level"] = "店铺扫描"  # "shop scan" (literal kept for the pipeline)
            yield item

            if int(zhuangtai_s) > 0:  # proceed only when the counter is positive
                url = "http://shop.m.taobao.com/shop/shop_info.htm?shop_id={}&tbpm=3"
                url = url.format(shop_id)
                headers_str = '''User-Agent:Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'''
                headers = headers_todict(headers_str)
                yield scrapy.Request(url=url,
                                     callback=self.parse_shopxinyong,
                                     method="GET",
                                     headers=headers,
                                     meta={"shop_id": shop_id})  # mobile shop credit

                url_tui = "https://tui.taobao.com/recommend?shop_id={}&floorId=42296&appid=6862"
                url_tui = url_tui.format(shop_id)
                headers_tui_str = "User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
                headers2 = headers_todict(headers_tui_str)
                yield scrapy.Request(url=url_tui,
                                     callback=self.parse_tui_diannao,
                                     method="GET",
                                     headers=headers2,
                                     meta={"shop_id": shop_id})  # shop "tui" (recommendation) feed
        else:
            request = self.try_again(response)
            if request:
                yield request
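The taobao items yielded above (and the GmWorkItem used in later examples) are Scrapy Item subclasses defined elsewhere in the project. A minimal sketch of the shape taobao would need, with field names inferred from the assignments in these snippets (an assumption, and certainly not the complete field list):

import scrapy

class taobao(scrapy.Item):
    # Fields inferred from the assignments in Examples 2 and 7; the real
    # class must declare every key the spiders assign.
    shop_id = scrapy.Field()
    seller_id = scrapy.Field()
    item_id = scrapy.Field()
    cate_id = scrapy.Field()
    zhuangtai = scrapy.Field()       # status
    pipeline_level = scrapy.Field()  # tells the pipelines how to route the item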
Example 8
    def start_requests(self):
        headers_str = '''User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
        headers = headers_todict(headers_str)

        # for i in range(self.id_start,self.id_end+1):
        for i in [101717810, 57472900, 10000366]:
            url = "https://count.taobao.com/counter3?callback=jsonp102&keys=SCCP_2_{}".format(
                i)
            yield scrapy.Request(url=url,
                                 callback=self.parse_shopid,
                                 method="GET",
                                 headers=headers,
                                 meta={"shop_id": i})
Example 9
class AmazonukSpider(RedisSpider):
    name = 'amazon_uk'
    allowed_domains = ['www.amazon.com']
    start_urls = ['http://www.amazon.com/']
    redis_key = "amazon_ph:start_url"
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cache-control: no-cache
    pragma: no-cache
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )
Example 10
 def get_headers(self,type = 1):
     if type == 1:
         headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
         accept-encoding: gzip, deflate, br
         accept-language: zh-CN,zh;q=0.9
         upgrade-insecure-requests: 1
         user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
     else:
         headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                     accept-encoding: gzip, deflate, br
                     accept-language: zh-CN,zh;q=0.9
                     upgrade-insecure-requests: 1
                     user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
     return headers_todict(headers)
Example 11
    def get_headers(self, type=1):
        if type == 1:
            headers = '''accept: */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
referer: https://shopee.com.my/all_categories
sec-fetch-dest: empty
sec-fetch-mode: cors
sec-fetch-site: same-origin
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36
x-api-source: pc
x-requested-with: XMLHttpRequest'''
        else:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                        accept-encoding: gzip, deflate, br
                        accept-language: zh-CN,zh;q=0.9
                        upgrade-insecure-requests: 1
                        user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        return headers_todict(headers)
Example 12
 def get_headers(self, type="1"):
     if type == "1":
         headers = '''Host: feedback.aliexpress.com
         Connection: keep-alive
         Cache-Control: max-age=0
         Origin: https://feedback.aliexpress.com
         Upgrade-Insecure-Requests: 1
         Content-Type: application/x-www-form-urlencoded
         User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36
         Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
         Referer: https://feedback.aliexpress.com/display/productEvaluation.htm?v=2&productId=32996635572&ownerMemberId=230443220&memberType=seller&startValidDate=&i18n=true
         Accept-Encoding: gzip, deflate, br
         Accept-Language: zh-CN,zh;q=0.9'''
     else:
         headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                     accept-encoding: gzip, deflate, br
                     accept-language: zh-CN,zh;q=0.9
                     upgrade-insecure-requests: 1
                     user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
     return headers_todict(headers)
Example 13
    def get_headers(self, type=1):
        if type == 1:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
cookie: akavpau_FRPRD_FNACCOM=1579504176~id=c47606a9252db9ae6b6424077f58d7e4; datadome=Y7k9tjHyvrXoo9-JIl5bDmSAvRSDneVG1e2pruOX.vUcQykUqMFxcqF_W7-lpxQy30ef45kU2gL.z9B_mFruvO5aaeLL81KaW3KQe70COl
pragma: no-cache
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        else:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                        accept-encoding: gzip, deflate, br
                        accept-language: zh-CN,zh;q=0.9
                        upgrade-insecure-requests: 1
                        user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        return headers_todict(headers)
Example 14
 def get_headers(self, type=1):
     if type == 1:
         headers = '''Accept: */*
                     Accept-Encoding: gzip, deflate, br
                     Accept-Language: zh-CN,zh;q=0.9
                     Connection: keep-alive
                     Cookie: QCCSESSID=s840r5rdj2rbck69tc7plc7i40; UM_distinctid=170a9c662216a-0d96b081e7094c-b791237-240000-170a9c66222d11; zg_did=%7B%22did%22%3A%20%22170a9c662b13ee-076880bdb17784-b791237-240000-170a9c662b268f%22%7D; _uab_collina=158339631590820690201933; acw_tc=73dc082415833963157051353e9213409bbbf2078f2453340d24defabe; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1583396316,1583396373,1583459671,1583470013; CNZZDATA1254842228=1004506580-1583391724-https%253A%252F%252Fsp0.baidu.com%252F%7C1583472798; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201583474054515%2C%22updated%22%3A%201583474054687%2C%22info%22%3A%201583396315830%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%229c1d7cbd2e6e44ca53f5b36a49bdee3d%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1583474055
                     Host: www.qichacha.com
                     Referer: https://www.qichacha.com/search?key=92429005MA4DC7B27E
                     Sec-Fetch-Dest: empty
                     Sec-Fetch-Mode: cors
                     Sec-Fetch-Site: same-origin
                     User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36
                     X-Requested-With: XMLHttpRequest'''
     else:
         headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
         accept-encoding: gzip, deflate, br
         accept-language: zh-CN,zh;q=0.9
         upgrade-insecure-requests: 1
         user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
     return headers_todict(headers)
Example 15
    def parse_shopid(self, response):
        youxiao = 'jsonp102({"SCCP'
        text = response.text
        if youxiao in text:
            text = text.replace("\n", "")
            text = text.replace("\r", "")
            text = text.replace("\t", "")
            item_s = taobao()
            item_s["source_code"] = text
            item_s["pipeline_level"] = "店铺扫描"

            yield item_s
            meta = response.meta
            shop_id = meta.get("shop_id")

            zhuangtai_s = self.match_zhengze(r'"SCCP_2_[^"]*":([^\}]*)\}', text)  # status
            item = taobao()
            item["zhuangtai"] = zhuangtai_s
            item["shop_id"] = shop_id
            item["pipeline_level"] = "店铺扫描"
            yield item

            if int(zhuangtai_s) > 0:  # proceed only when the counter is positive
                url = "http://shop.m.taobao.com/shop/shop_info.htm?shop_id={}&tbpm=3"
                url = url.format(shop_id)
                headers_str = '''User-Agent:Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'''
                headers = headers_todict(headers_str)
                yield scrapy.Request(url=url,
                                     callback=self.parse_shopxinyong,
                                     method="GET",
                                     headers=headers,
                                     meta={"shop_id": shop_id})  # mobile shop credit

        else:
            request = self.try_again(response)
            if request:
                yield request
Example 16
class AmazonPhSpider(RedisSpider):
    name = 'duhuang'
    allowed_domains = ['duhuang.com']
    start_urls = ['http://www.duhuang.com/']
    redis_key = "duhuang:start_url"
    headers = headers_todict('''Host: www.dhgate.com
Connection: keep-alive
Cache-Control: max-age=0
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
Sec-Fetch-Mode: navigate
Sec-Fetch-User: ?1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Sec-Fetch-Site: none
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cookie: vid=rBUUMl2DFlQ+oA/mBSOuAg==; NTKF_T2D_CLIENTID=guest14AD0ED5-5CE7-1831-3338-480F3C006C44; cto_lwid=2b0fa1c5-ad41-4220-ab73-8d28fee0f719; gaVisitorUuid=74e7a602-8866-4041-ae8e-b18a061188c5; pw_deviceid=40caaaf1-f170-4c61-adaf-589b66d206ae; pw_status_912a3e66fccf9b4a6ba50e65fd43522dabaecbe69c771596aeead701e85dd0af=deny; vscr_vid=ff3a19737627479e990f4ffb12fba990; smc_uid=1569314979382597; smc_tag=eyJpZCI6NzMzLCJuYW1lIjoiZGhnYXRlLmNvbSJ9; smc_not=default; seller_site_lang=zh_CN; seller_site_region=CN; c=UeHqQFOh-1575546092257-b880d6d2b8bf51942328817; c_haslogined=1; dh_isChange=isChange; gaIsValuable=1; searchinfo=pagesize%3D24%3Bviewtype%3D1%3B; dhc_s=321291c2-c2f6-4d69-a26e-c748d629cdcb; ref_f=seo|seller||organic|baidu||seller.dhgate.com; session=h9FBr1ccCnbexEA1MYiXQQ; _Jo0OQK=66FAFB7B7665388F613191308D62742C63A8C85B45E99FF22161029F88AF1D6D21E0C6E93F2376B923EBA903DA2963E8C4827B1313D3EFD632D94B573A64B71623CEFE3C935BE26F12FF0CFE73051AB9DA509C47B205529371D27B1313D3EFD632D419E13A61F2567F6E7302D862DBB29F0GJ1Z1Xw==; suship=CN; language=en; intl_locale=en; nTalk_CACHE_DATA={uid:dh_1000_ISME9754_ff8080816eea80d9016f83f286d6287f,tid:1578466731910275}; _pk_ses..c028=*; smc_sesn=2; item_recentvisit=446848411%2C409056389%2C453794591; cto_bundle=HhAwFF9mMXBzTFZGOU9PcEZvUmdQSDhiQlBWbXQ2THN4QTElMkIwRWFiS1ZqbE5ERnl6Yml1VHRlR2VWdWJuaHU1OWl0SldSVGR6bmZHRyUyQiUyQkpVJTJGSjZpdVpOQ1NRMDlmdjBUcWM1U2hQTkgxN21jRWpRUG8wdG9XZmwwYnAlMkZad2lwczd4bGpzd21ORHM2UDM1THhaSEVRcWF6VjRxWTJMZUxEc0hXSVNSaVpBZzhSVVYwJTNE; JSESSIONID=Q43UpUYr-5Drsk_EvDQJhAbbebLITvtbNyrzYptz; dht_lot=Public_S0003; b2b_cart_sid=0a593161-b5e6-4018-932b-1de576305240; b2b_ip_country=CN; login_auth_token=ca75df1d-853d-46e4-91ba-8b660fb4b656; __utma=251624089.850967565.1578468269.1578468269.1578468269.1; __utmc=251624089; __utmz=251624089.1578468269.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; _ga=GA1.2.850967565.1578468269; _gid=GA1.2.654880981.1578468273; _fmdata=AuB8%2Bk%2Bmil%2FzIeL26QejkjppmwIFzN2AxdgfZ01aMgeDzzSsSGiKxgFm68VLcxPDjlTtB5cP%2BIhBXVVhxGqyVGDOw7CYeO19jSvEzCz9lro%3D; _xid=qLL57rIGQyu5PK%2FSDHOvtOjWIdYE8tTR2XYYacit1AjDI0CYMjdBhDXX7%2FohBsPqIYMFq22NdMccLf6MOT80Sg%3D%3D; B2BCookie=a5450f13-7766-4749-81ff-447f6ad4054c; b2b_b_t_v2=789dbd93aa6c91eed57d96db0d4b1065c8b64a5497fd49b7449cec61c98e5196272920b2674d30991cd6fe75b8facb477487535198a0fbd420cee0f6ef246cc72f4a4e3f529074c3dd52aa9d7c87f88b10b4947c633a422543bcb9e2a1882c98; _Session_ID=pugdaTSL1we3L4Mfsg3CcT6bDfzBoWhAFLZP6U0m; bc=c|e; b2b_buyer_lv=0; b2b_nick_n=516387331; _b_o0l=8a9a0b754374779ebd197ef8054358bcbc7adae9758e4366d312f63a8a07bcb2; _b_o05="0,0,0,0"; [email protected]; b2b_buyerid=ff8080816eea80d9016f83f286d6287f; b_u_cc=ucc=CN; pvn=96; lastvisittime=1578468295373; vnum=30; __utmb=251624089.2.10.1578468269; smc_spv=2; smc_tpv=3; smct_session={"s":1578466752509,"l":1578468408511,"lt":1578468320333,"t":79,"p":77}''')

    def start_requests(self):
        with open(r"C:\Users\admin\Desktop\{select_敦煌_业务信息URL}[店铺ID,Business_Information_url].txt","r",encoding="utf-8") as f:
            for i in f:
                data = i.strip().split(",")
                id = data[0]
                url = data[1]
                meta = {"key":id}
                yield scrapy.Request(url=url,method="GET",headers=self.headers,dont_filter=True,meta=meta)

    def parse(self, response):
        youxiao = re.search("(Information)",response.text)
        url = response.url
        key = response.meta.get("key")
        if youxiao:
            title = response.css(".b-title").xpath("./text()").get()

            item = GmWorkItem()
            item["key"] = key
            item["url"] = url
            item["company_name"] = title

            yield item

        else:
            print("error")
            try_result = self.try_again(response,key)
            yield try_result

    def try_again(self,rsp,key):
        max_num = 5
        meta = rsp.meta
        try_num = meta.get("try_num",0)
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e
Example 17
class AlibabgjSpider(RedisSpider):
    name = 'alibabgj_shop'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://www.alibaba.com/']
    redis_key = "alibabgj_shop:start_url"
    headers = headers_todict('''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36''')
    def start_requests(self):
        url = "https://www.baidu.com"
        headers = self.get_headers("baidu")
        yield scrapy.Request(url=url,method="GET",callback=self.seed_request,headers=headers,dont_filter=True)

    def seed_request(self, response):
        # with open(r"C:\Users\admin\Desktop\alibabgjtest.txt","r",encoding="utf-8") as f:
        f = ["hwcodec.en"]
        for i in f:
            i = i.strip()
            url = "https://{}.alibaba.com/contactinfo.html".format(i)
            meta = {"key":i}
            yield scrapy.Request(url=url,method="GET",headers=self.headers,dont_filter=True,meta=meta)

    def parse(self, response):
        youxiao = re.search("(HTTP 404|Information|302 Found)",response.text)
        url = response.url
        key = response.meta.get("key")
        if youxiao:
            text = response.text
            # item_s = GmWorkItem()
            # item_s["key"] = key
            # item_s["source_code"] = text
            # yield item_s
            address_detail = ""
            company_name = ""
            val_judge = 0
            contact_table = response.css(".contact-table").xpath("./tr")
            if not contact_table:
                contact_table = response.css(".company-info-data.table").xpath("./tr")
                val_judge = 1
            for i in contact_table:
                name = i.xpath("./th").xpath("string(.)").get()
                if val_judge:
                    value = i.xpath("./td[2]").xpath("string(.)").get()
                else:
                    value = i.xpath("./td").xpath("string(.)").get()
                if name and "Address" in name:
                    address_detail = value
                if name and "Company Name" in name:
                    company_name = value
            country = ""
            province = ""
            city = ""
            address = ""
            zip = ""
            info_table = response.css(".info-table").xpath("./tr")
            if not info_table:
                info_table = response.css(".public-info").xpath("./dl")
                for i in range(len(info_table.xpath("./dt"))):
                    name = info_table.xpath("./dt[{}]".format(i+1)).xpath("string(.)").get()
                    value = info_table.xpath("./dd[{}]".format(i+1)).xpath("string(.)").get()
                    if name and "Country" in name:
                        country = value
                    if name and "Province" in name:
                        province = value
                    if name and "City" in name:
                        city = value
                    if name and "Zip" in name:
                        zip = value
                    if name and "Address" in name:
                        address = value
            else:
                for i in info_table:
                    name = i.xpath("./th").xpath("string(.)").get()
                    value = i.xpath("./td").xpath("string(.)").get()
                    if name and "Country" in name:
                        country = value
                    if name and "Province" in name:
                        province = value
                    if name and "City" in name:
                        city = value
                    if name and "Zip" in name:
                        zip = value
                    if name and "Address" in name:
                        address = value
            contact_people = response.css(".contact-name").xpath("./text()").get()
            if not contact_people:
                contact_people = response.css(".name").xpath("./text()").get()
            companyJoinYears = response.css(".join-year").xpath("./span/text()").get()
            company_type = response.css(".business-type").xpath("./text()").get()
            ordCnt6m = response.css(".transaction-number-value").xpath("./text()").get()
            ordAmt = response.css(".transaction-amount-value").xpath("./text()").get()
            if ordAmt:
                ordAmt = ordAmt.replace(",", "")
                ordAmt = ordAmt.replace("+", "")
            item = GmWorkItem()
            item["key"] = key
            item["url"] = url
            item["company_name"] = company_name
            item["address_detail"] = address_detail
            item["country"] = country
            item["province"] = province
            item["city"] = city
            item["address"] = address
            item["zip"] = zip
            item["contact_people"] = contact_people

            item["sales_money"] = ordAmt
            item["sales_num"] = ordCnt6m
            item["company_type"] = company_type
            item["keep_time"] = companyJoinYears
            yield item
            if response.status == 200:
                bizId = ""
                host_token = ""
                siteId = ""
                pageId = ""

                match = re.search("bizId%22%3A(.*?)%2C%22",text)
                if match:
                    bizId = match.group(1)
                match1 = re.search("host_token:'(.*?)'",text)
                if match1:
                    host_token = match1.group(1)
                match2 = re.search("siteId%22%3A(.*?)%2C%22",text)
                if match2:
                    siteId = match2.group(1)
                match3 = re.search("pageId%22%3A(.*?)%2C%22",text)
                if match3:
                    pageId = match3.group(1)
                language = "en_US"
                envMode = "product"
                renderType = "component"
                componentKeys = "companyCard"
                data = {"bizId": bizId, "language": language,"envMode":envMode,"hostToken":host_token,
                        "siteId":siteId,"pageId":pageId,"renderType":renderType,"componentKeys":componentKeys}
                meta = {"key":key}
                sale_url = "https://{}.alibaba.com/event/app/alisite/render.htm".format(key)
                if bizId and host_token and siteId and pageId:
                    yield scrapy.FormRequest(url=sale_url,callback=self.sale_money,formdata=data,meta=meta)
        else:
            try_result = self.try_again(response,key)
            yield try_result

    def sale_money(self, response):
        effective = '"success":true'
        meta = response.meta
        key = meta.get("key")
        if re.search(effective,response.text):
            companyName = ""
            ordAmt = ""
            ordCnt6m = ""
            company_type = ""
            companyJoinYears = ""
            match = re.search(r'\\"companyName\\":\\"(.*?)\\"',response.text)
            if match:
                companyName = match.group(1)
            match1 = re.search(r'\\"ordAmt\\":\\"(.*?)\\"',response.text)
            if match1:
                ordAmt = match1.group(1)
                ordAmt = ordAmt.replace(",","")
                ordAmt = ordAmt.replace("+","")
            match2 = re.search(r'\\"ordCnt6m\\":(\d*)',response.text)
            if match2:
                ordCnt6m = match2.group(1)
            match3 = re.search(r'\\"value\\":\\"(.*?)\\"', response.text)
            if match3:
                company_type = match3.group(1)
            match3 = re.search(r'\\"companyJoinYears\\":\\"(.*?)\\"', response.text)
            if match3:
                companyJoinYears = match3.group(1)
            item = GmWorkItem()
            item["key"] = key
            item["company_name"] = companyName
            item["sales_money"] = ordAmt
            item["sales_num"] = ordCnt6m
            item["company_type"] = company_type
            item["keep_time"] = companyJoinYears
            item["pipeline_level"] = "销量"  # "sales volume"
            yield item
        else:
            try_result = self.try_again(response, key)
            yield try_result


    def try_again(self,rsp,key):
        max_num = 5
        meta = rsp.meta
        try_num = meta.get("try_num",0)
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e

    def get_headers(self,type="1"):
        if type == "1":
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        elif type == "baidu":
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        else:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
                       accept-encoding: gzip, deflate, br
                       accept-language: zh-CN,zh;q=0.9
                       upgrade-insecure-requests: 1
                       user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        return headers_todict(headers)
Example 18
class AmazongsSpider(RedisSpider):
    name = 'amazon_goodstoshop'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://www.amazon.com/']
    redis_key = "amazon_goodstoshop:start_url"
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )

    def start_requests(self):
        url = "https://www.baidu.com/"
        yield scrapy.Request(url=url,
                             method="GET",
                             callback=self.seed_request,
                             headers=self.headers,
                             dont_filter=True)

    def seed_request(self, response):

        path = r"C:/Users\admin/Desktop/"
        file_name = "{6_1排行榜_offer有效}[goodsid].txt_去重 - 副本.txt"
        with open(path + file_name, "r", encoding="utf-8") as f:
            for i in f:
                i = i.strip()
                page_num = 0
                url = "https://www.amazon.co.uk/gp/aw/ol/{}".format(i)
                yield scrapy.Request(url=url,
                                     method="GET",
                                     headers=self.headers,
                                     meta={
                                         "page_num": page_num,
                                         "key": i
                                     })

    def parse(self, response):
        youxiao = re.search("(olpOfferList|olpProduct)", response.text)
        key = response.meta.get("key")
        if youxiao:
            item_s = GmWorkItem()
            item_s["key"] = key
            item_s["source_code"] = response.text
            yield item_s
            shop_list = response.css(
                ".a-section.a-spacing-double-large").xpath(
                    "./div//h3[@class='a-spacing-none olpSellerName']/a")
            if not shop_list:
                item = GmWorkItem()
                item["key"] = key
                item["name"] = ""
                item["url"] = ""
                item["seller_id"] = ""
                yield item
            for i in shop_list:
                name = i.xpath("./text()").get()
                if name:
                    name = name.strip()
                url = i.xpath("./@href").get()
                seller_id = ""
                match = re.search('(s|seller)=(.*?)($|[&])', url)
                if match:
                    seller_id = match.group(2)
                item = GmWorkItem()
                item["key"] = key
                item["name"] = name
                item["url"] = url
                item["seller_id"] = seller_id
                yield item
            next_url = response.css("li.a-last").xpath("./a/@href").get()
            if next_url:
                next_url = "https://www.amazon.co.uk" + next_url
                yield scrapy.Request(url=next_url,
                                     method="GET",
                                     headers=self.headers,
                                     meta={"key": key})
        else:
            try_result = self.try_again(response, key)
            yield try_result

    def try_again(self, rsp, key):
        max_num = 10
        meta = rsp.meta
        try_num = meta.get("try_num", 0)
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e
Example 19
    def parse_shopxinyong(self, response):
        text = response.text
        youxiao = r'(您浏览店铺不存在|没有找到相应的店铺|店主被删除或冻结了|掌柜|您查看的页面找不到了|Location:http://\.m\.tmall\.com|com/error1\.html|//chaoshi[a-z]*\.m\.tmall|//aliqin\.tmall|//a\.m\.tmall|modbundle-start)'
        youxiao_m = re.search(youxiao, text)
        if youxiao_m:
            text = text.replace("\n", "")
            text = text.replace("\r", "")
            text = text.replace("\t", "")
            meta = response.meta
            shop_id = meta.get("shop_id")

            zhuangtai_s = self.match_zhengze(
                r'(店铺不存在|没有找到|掌柜|删除或冻结|Location:http://[^\.]*\.m\.tmall)',
                text)  # status
            shop_id_s = self.match_zhengze(
                r'shop_id=([^"#;]*)"|shopId = "([^#"]*)";', text)  # shop ID
            sellerid_s = self.match_zhengze(
                r'''data-suid='([^']*)'|seller_id=([^"]*)"''', text)  # seller ID
            zhanggui_s = self.match_zhengze(r'>掌柜ID</label>([\s\S]+?)<div ',
                                            text)  # shopkeeper
            nickurl_s = self.match_zhengze(r'nick = ([^"]*)"', text)  # nick URL
            nick_s = self.match_zhengze(r'"nick":"([^"]*)",', text)  # nick
            shop_name_s = self.match_zhengze(r'title>([\s\S]+?)</titl',
                                             text)  # shop name
            haoping_s = self.match_zhengze(r'好评率:([^<]*)<', text)  # positive-rating rate
            miaoshuxf_s = self.match_zhengze(r'描述相符</label>([^<]*)<',
                                             text)  # matches-description score
            fuwutd_s = self.match_zhengze(r'服务态度</label>([^<]*)<', text)  # service-attitude score
            fahuosd_s = self.match_zhengze(r'发货速度</label>([^<]*)<', text)  # shipping-speed score
            area_s = self.match_zhengze(r'label>地区</label>([\s\S]+?)</li>',
                                        text)  # location
            phone_s = self.match_zhengze(r"客服电话:<[^>]*>([^<]*)<", text)  # customer-service phone
            shopurl_s = self.match_zhengze(r'"shopUrl":"([^"]*)"',
                                           text)  # shopUrl
            item = taobao()
            item["zhuangtai"] = zhuangtai_s
            item["seller_id"] = sellerid_s
            item["shop_id"] = shop_id_s
            item["zhanggui"] = zhanggui_s
            item["nickurl"] = nickurl_s
            item["nick"] = nick_s
            item["shop_name"] = shop_name_s
            item["haoping"] = haoping_s
            item["miaoshuxf"] = miaoshuxf_s
            item["fuwutd"] = fuwutd_s
            item["fahuosd"] = fahuosd_s
            item["area"] = area_s
            item["phone"] = phone_s
            item["shopurl"] = shopurl_s
            item["pipeline_level"] = "手机店铺信用"  # "mobile shop credit"
            yield item

            if sellerid_s:
                url_1 = "https://ext-mdskip.taobao.com/extension/seller_info.htm?user_num_id={}"
                headers = '''User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36
                                    Referer:https://www.taobao.com'''
                url_1 = url_1.format(sellerid_s)
                headers = headers_todict(headers)
                yield scrapy.Request(url=url_1,
                                     callback=self.main_sale,
                                     method="GET",
                                     headers=headers,
                                     meta={"seller_id": sellerid_s})  # main business
                url_2 = "https://count.taobao.com/counter3?keys=SM_368_dsr-{}&callback=jsonp173"
                headers = "User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
                url_2 = url_2.format(sellerid_s)
                headers = headers_todict(headers)
                yield scrapy.Request(url=url_2,
                                     callback=self.parse_shopxinyong_diannao,
                                     method="GET",
                                     headers=headers,
                                     meta={"seller_id":
                                           sellerid_s})  # company credit (desktop)

                asyn_url = "http://hdc1.alicdn.com/asyn.htm?userId={}&pageId=&v=2014"
                headers = "User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
                asyn_url = asyn_url.format(sellerid_s)
                headers = headers_todict(headers)
                yield scrapy.Request(url=asyn_url,
                                     callback=self.parse_asyn_good,
                                     method="GET",
                                     headers=headers,
                                     meta={"shop_id": shop_id})

        else:
            request = self.try_again(response)
            if request:
                yield request
Example 20
    def parse_tui_diannao(self, response):
        text = response.text
        youxiao = r'("itemId"|"result":\[\])'
        youxiao_m = re.search(youxiao, text)

        if youxiao_m:
            text = text.replace("\n", "")
            text = text.replace("\r", "")
            text = text.replace("\t", "")
            meta = response.meta
            shop_id = meta.get("shop_id")

            url1 = "https://tui.taobao.com/recommend?shop_id={}&item_ids={}&floorId=42296&pSize=12&callback=detail_pine&appid=6862&count=12&pNum=0"
            headers1 = '''Referer: https://item.taobao.com/item.htm?id=590354499275
            User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
            fenlie = "},{"
            for i in text.split(fenlie):
                text_s = i
                shop_type_s = self.match_zhengze(
                    '"userType":[\s]*["]{0,1}([^",\}]*)', text_s)
                shop_id_s = self.match_zhengze(
                    '"shopId":[\s]*["]{0,1}([^",\}]*)', text_s)
                seller_id_s = self.match_zhengze(
                    '"sellerId":[\s]*["]{0,1}([^",\}]*)', text_s)
                item_id_s = self.match_zhengze(
                    '"itemId":[\s]*["]{0,1}([^",\}]*)', text_s)
                good_name_s = self.match_zhengze('"itemName":"([\s\S]+?)",',
                                                 text_s)  #UTF8
                price_s = self.match_zhengze('"price":[\s]*["]{0,1}([^",\}]*)',
                                             text_s)
                promotion_price_s = self.match_zhengze(
                    '"promotionPriceRaw":[\s]*["]{0,1}([^",\}]*)', text_s)
                sell_count_s = self.match_zhengze(
                    '"sellCount":[\s]*["]{0,1}([^",\}]*)', text_s)
                mouth_count_s = self.match_zhengze(
                    r'"monthSellCount":[\s]*["]{0,1}([^",\}]*)', text_s)
                quantity_s = self.match_zhengze(
                    '"quantity":[\s]*["]{0,1}([^",\}]*)', text_s)
                favor_count_s = self.match_zhengze(
                    '"favorCount":[\s]*["]{0,1}([^",\}]*)', text_s)
                brand_id_s = self.match_zhengze('"brandId":([0-9]*)', text_s)
                category_id_s = self.match_zhengze(
                    '"categoryId":[\s]*["]{0,1}([^",\}]*)', text_s)
                category_id_lv1_s = self.match_zhengze(
                    '"categoryLv1Id":[\s]*["]{0,1}([^",\}]*)', text_s)
                sub_item_name_s = self.match_zhengze(
                    '"subItemName":[\s]*["]{0,1}([^",\}]*)', text_s)
                pic_s = self.match_zhengze('"pic":[\s]*["]{0,1}([^",\}]*)',
                                           text_s)
                item = taobao()
                item["shop_type"] = shop_type_s
                item["shop_id"] = shop_id_s
                item["seller_id"] = seller_id_s
                item["good_name"] = good_name_s
                item["price"] = price_s
                item["promotion_price"] = promotion_price_s
                item["sell_count"] = sell_count_s
                item["mouth_count"] = mouth_count_s
                item["quantity"] = quantity_s
                item["favor_count"] = favor_count_s
                item["brand_id"] = brand_id_s
                item["category_id"] = category_id_s
                item["category_id_lv1"] = category_id_lv1_s
                item["sub_item_name"] = sub_item_name_s
                item["pic"] = pic_s
                item["pipeline_level"] = "tui店铺"  # "tui shop" (recommendation feed)
                yield item
                if item_id_s:
                    url = url1.format(shop_id, item_id_s)
                    headers = headers_todict(headers1)
                    yield scrapy.Request(url=url,
                                         callback=self.parse_kanleyoukan,
                                         method="GET",
                                         headers=headers,
                                         meta={
                                             "shop_id": shop_id,
                                             "item_id": item_id_s
                                         })
        else:
            request = self.try_again(response)
            if request:
                yield request
Example 21
class AmazonPhSpider(RedisSpider):
    name = 'amazon_ph'
    allowed_domains = ['www.amazon.com']
    start_urls = ['http://www.amazon.com/']
    redis_key = "amazon_ph:start_url"
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cache-control: no-cache
    pragma: no-cache
    upgrade-insecure-requests: 1
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )

    def start_requests(self):
        url = "https://www.amazon.com/销售排行榜/zgbs/ref=zg_bs_unv_0_1045024_4?language=zh_CN"
        deep = 1
        yield scrapy.Request(url=url,
                             method="GET",
                             headers=self.headers,
                             dont_filter=True,
                             meta={"deep": deep})

    def parse(self, response):
        catelog = response.meta.get("catelog_name")
        deep = response.meta.get("deep", 1)
        catelog_url = response.url
        xpath_str = ".{}/li".format("/ul" * deep)
        url_catalog = response.css("#zg_browseRoot").xpath(xpath_str)
        second_url = response.css(".a-last").xpath("./a/@href").get()
        if second_url:
            yield scrapy.Request(url=second_url,
                                 method="GET",
                                 headers=self.headers,
                                 meta={
                                     "catelog_name": catelog,
                                     "deep": deep
                                 })
        for i in url_catalog:
            url_next = i.xpath("./a/@href").get()
            catelog_name = i.xpath("./a/text()").get()

            if url_next:
                deep_next = deep + 1
                yield scrapy.Request(url=url_next,
                                     method="GET",
                                     headers=self.headers,
                                     meta={
                                         "catelog_name": catelog_name,
                                         "deep": deep_next
                                     })
        goods_list = response.css("#zg-ordered-list").xpath("./li")
        for i in goods_list:
            url = i.xpath("./span/div/span/a/@href").get()
            if url:
                url = "https://www.amazon.com" + url
            good_name = i.xpath("./span/div/span/a/div/text()").get()
            if good_name:
                good_name = good_name.strip()
            level = i.xpath("./span/div/span/div[1]/a[1]/i/span/text()").get()
            if level:
                level = level.replace(" out of 5 stars", "")
            evaluates = i.xpath("./span/div/span/div[1]/a[2]/text()").get()
            if evaluates:
                evaluates = evaluates.replace(",", "")
            price = i.xpath("./span/div/span/div[2]/a/span/span/text()").get()
            if price:
                price = price.replace("$", "")

            item = AmazonItem()
            item["url"] = url
            item["good_name"] = good_name
            item["level"] = level
            item["evaluates"] = evaluates
            item["price"] = price
            item["catelog_name"] = catelog
            item["catelog_url"] = catelog_url
            item["deep"] = deep

            yield item