def get_headers(type=1):
    """Return common request headers for Taobao requests as a dict.

    type 1: mobile-Safari HTML page headers.
    type 2: mobile-Safari asset/API headers (``accept: */*``).
    type 3: JSON / form-POST headers.
    Any other value returns None (original behaviour preserved).
    """
    html_headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
    api_headers = '''accept: */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
    form_headers = '''Accept: application/json
User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1
Content-type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9'''
    templates = {1: html_headers, 2: api_headers, 3: form_headers}
    raw = templates.get(type)
    if raw is None:
        # Unknown type: fall through with None, matching the original's
        # implicit-None return when no branch matched.
        return None
    return headers_todict(raw)
def parse_kanleyoukan(self, response):
    """Parse a Taobao "looked-and-looked" (related items) response.

    Splits the embedded JSON-ish payload into per-product fragments,
    yields one ``taobao`` item per fragment, and schedules a
    sales-volume follow-up request for every product id found.
    Invalid responses are requeued via ``self.try_again``.
    """
    text = response.text
    # Marker proving the response is a valid detail payload.
    youxiao = '(detail_pine)'
    youxiao_m = re.search(youxiao, text)
    if youxiao_m:
        # Flatten whitespace so the regexes below match across lines.
        text = text.replace("\n", "")
        text = text.replace("\r", "")
        text = text.replace("\t", "")
        meta = response.meta
        shop_id = meta.get("shop_id")  # NOTE(review): read but never used below
        item_id = meta.get("item_id")  # NOTE(review): read but never used below
        # Sales/price (mdskip) endpoint template.
        # NOTE(review): 'setMdskip×tamp' looks like an HTML-entity mangling
        # of 'setMdskip&timestamp' — verify against a working request.
        url2 = '''https://mdskip.taobao.com/core/initItemDetail.htm?isUseInventoryCenter=false&cartEnable=true&service3C=false&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=false&tryBeforeBuy=false&offlineShop=false&itemId={}&showShopProm=false&isPurchaseMallPage=false&itemGmtModified=1568217644000&isRegionLevel=false&household=false&sellerPreview=false&queryMemberRight=true&addressLevel=2&isForbidBuyItem=false&callback=setMdskip×tamp=1568612546869'''
    # Headers for the mdskip endpoint. NOTE(review): the hard-coded Cookie
    # and its _m_h5_tk tokens expire — confirm they are refreshed elsewhere.
        headers2 = '''User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36
Referer: https://detail.tmall.com/item.htm?id=585627813748
Cookie:t=771e985ad68867dca634b66c8b52c710; cna=RxThFT+HtVACATyweW2UU3HN; tracknick=%5Cu4E09%5Cu9014%5Cu6CB3%5Cu8FD8%5Cu662F%5Cu5929%5Cu5802; _cc_=URm48syIZQ%3D%3D; enc=5DlFftlD20fPoNrYGejylp4qjVAwaVqHVif222OGfTVcQTqAx2FMz1Zq21yB5qgS%2FwJtxLSsCsnMVvodBHNseg%3D%3D; thw=cn; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; hng=CN%7Czh-CN%7CCNY%7C156; cookie2=1cf7677466b3b4247ee06b58db60af76; _tb_token_=3bebd4e0eb356; v=0; mt=ci%3D-1_1; miid=1514173119299498437; uc1=cookie14=UoTbnKMDmEcUIg%3D%3D; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zZPEr%2FCrvCMS%2BG3sTRRWrQ%2BVVTl09ME1KrXdS91s4jJYL5NA7c2uWyfyw%2F9gYrem7NEW7%2FhRF2%2BDZyzjzJUQM%2B3ajmc%2BqsNRORcaRH3CNHyGPkcc%2F%2BLjMbn8%2FEPT%2FB8BQzJOUzWHeXEfYMACGjSPsry1CB514xsVVKb7xQpGcujL%2FGqsEgBrb1wz3x5x7vG9V5OAdii7QZqPQIrqC92RZGPM2m943EN8TkLKCavVsJtrfVF%2B2rncH0VPQJbqgCp1b7IcFLp4aV1X2Gt2nDZo4%2BPKkcowzbgNV4LUNTu6ynXBPWBq0RDDlOX%2FY1ucI%3D; linezing_session=1jM55aS1rIVc20jqjpDgyWCs_1571208987345cSbM_6; _m_h5_tk=3756ebaa20d617d7e1fc48cec3e82ad6_1571233351866; _m_h5_tk_enc=8a6d0092297b00d8f43562a95fa43ddf; l=dBgt4Jcmq1sEN_vbBOCZnurza779sIRAguPzaNbMi_5IL18suR7OkgjmxeJ6cjWfTlYB4dG4psJ9-etkZ4eT6qM8sxAJNxDc.; isg=BJqaNiC5kzLEHR9lMWc-xLee60C8yx6lNJSpOKQTRy34FzpRjFiZtWll46Mux5Y9'''
        # Desktop item-detail page template (follow-up currently disabled).
        url1 = "https://item.taobao.com/item.htm?id={}"
        headers1 = '''User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
        fenlie = "},{"  # record separator between product fragments
        for i in text.split(fenlie):
            text_i = i
            item_id_s = self.match_zhengze(r'itemId\":(\d+)', text_i)  # product id
            seller_id_s = self.match_zhengze('sellerId":(\d+)', text_i)  # seller id
            cate_id_s = self.match_zhengze('categoryId":(\d+)', text_i)  # category id
            item = taobao()
            item["item_id"] = item_id_s
            item["seller_id"] = seller_id_s
            item["cate_id"] = cate_id_s
            item["pipeline_level"] = "看了又看"
            yield item
            if item_id_s:
                # Schedule the sales-volume follow-up for this product.
                url = url2.format(item_id_s)
                headers = headers_todict(headers2)
                yield scrapy.Request(url=url, callback=self.parse_goods_xiaoliang, method="GET", headers=headers, meta={"item_id": item_id_s})
                url = url1.format(item_id_s)
                # headers = headers_todict(headers1)
                # yield scrapy.Request(url=url, callback=self.parse_good_information, method="GET", headers=headers,meta={"item_id": item_id_s,"seller_id":seller_id_s})  # product detail page (disabled)
    else:
        # Invalid/blocked response: requeue through the retry helper.
        request = self.try_again(response)
        if request:
            yield request
def get_headers(self, type=1):
    """Return Ctrip request headers as a dict.

    type 1 selects the hotels.ctrip.com page headers; any other value
    selects the m.ctrip.com JSON-API headers.
    """
    page_template = '''Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Host: hotels.ctrip.com
Pragma: no-cache
Referer: https://www.ctrip.com/
Sec-Fetch-Mode: no-cors
Sec-Fetch-Site: same-site
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
    api_template = '''accept: application/json
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
content-type: application/json;charset=UTF-8
origin: https://hotels.ctrip.com
pragma: no-cache
sec-fetch-mode: cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
Host: m.ctrip.com'''
    chosen = page_template if type == 1 else api_template
    return headers_todict(chosen)
def get_headers(self, type=1):
    """Return Huajiao request headers as a dict.

    type 1 selects the webh.huajiao.com API headers; any other value
    selects plain desktop-Chrome navigation headers.
    """
    api_template = '''Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Host: webh.huajiao.com
Pragma: no-cache
Referer: https://www.huajiao.com/category/1000
Sec-Fetch-Mode: no-cors
Sec-Fetch-Site: same-site
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
    page_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
    chosen = api_template if type == 1 else page_template
    return headers_todict(chosen)
def get_taobao_headers():
    """Return the default desktop-Chrome headers for Taobao API calls."""
    header_text = '''accept: */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
sec-fetch-mode: no-cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'''
    return headers_todict(header_text)
def get_headers(self):
    """Return mobile-Safari headers for m.aliexpress.com page requests."""
    raw_headers = '''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Host: m.aliexpress.com
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'''
    return headers_todict(raw_headers)
def parse_shopid(self, response):
    """Parse the shop counter (jsonp) response: record the shop status
    and, when the shop is active, schedule the mobile credit-page and
    recommendation follow-up requests.

    Fix vs. original: the status regex is now a raw string so its
    backslash escapes reach ``re`` verbatim (avoids the invalid-escape
    deprecation on newer Pythons; matching behaviour is unchanged).
    """
    youxiao = 'jsonp102({"SCCP'  # marker of a valid counter response
    text = response.text
    if youxiao in text:
        # Flatten whitespace so the regex below matches across lines.
        text = text.replace("\n", "")
        text = text.replace("\r", "")
        text = text.replace("\t", "")
        meta = response.meta
        shop_id = meta.get("shop_id")
        # Counter value for the shop; non-zero means the shop exists.
        zhuangtai_s = self.match_zhengze(r'"SCCP_2_[^"]*":([^\}]*)\}', text)
        item = taobao()
        item["zhuangtai"] = zhuangtai_s
        item["pipeline_level"] = "店铺扫描"
        yield item
        # NOTE(review): int() raises if the counter is missing/non-numeric
        # — presumably match_zhengze always returns digits here; confirm.
        if int(zhuangtai_s) > 0:  # shop considered live
            # Mobile shop credit page follow-up.
            url = "http://shop.m.taobao.com/shop/shop_info.htm?shop_id={}&tbpm=3"
            url = url.format(shop_id)
            headers_str = '''User-Agent:Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'''
            headers = headers_todict(headers_str)
            yield scrapy.Request(url=url, callback=self.parse_shopxinyong, method="GET", headers=headers, meta={"shop_id": shop_id})
            # Shop recommendation feed follow-up.
            url_tui = "https://tui.taobao.com/recommend?shop_id={}&floorId=42296&appid=6862"
            url_tui = url_tui.format(shop_id)
            headers_tui_str = "User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
            headers2 = headers_todict(headers_tui_str)
            yield scrapy.Request(url=url_tui, callback=self.parse_tui_diannao, method="GET", headers=headers2, meta={"shop_id": shop_id})
    else:
        # Invalid/blocked response: requeue through the retry helper.
        request = self.try_again(response)
        if request:
            yield request
def start_requests(self):
    """Seed the crawl with counter requests for a fixed list of shop ids."""
    ua_line = '''User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
    headers = headers_todict(ua_line)
    counter_url = "https://count.taobao.com/counter3?callback=jsonp102&keys=SCCP_2_{}"
    # for i in range(self.id_start,self.id_end+1):
    for shop_id in [101717810, 57472900, 10000366]:
        yield scrapy.Request(url=counter_url.format(shop_id), callback=self.parse_shopid, method="GET", headers=headers, meta={"shop_id": shop_id})
class AmazonukSpider(RedisSpider):
    """Redis-fed Amazon spider.

    NOTE(review): the spider name says 'amazon_uk' but allowed_domains
    targets www.amazon.com and redis_key is 'amazon_ph:start_url' —
    confirm these mismatches are intentional (likely copy-paste from a
    sibling spider) before relying on them.
    """
    name = 'amazon_uk'
    allowed_domains = ['www.amazon.com']
    start_urls = ['http://www.amazon.com/']
    # Redis list the scrapy-redis scheduler pops seed URLs from.
    redis_key = "amazon_ph:start_url"
    # Default desktop-Chrome request headers for all page fetches.
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )
def get_headers(self, type=1):
    """Return default desktop-Chrome HTML request headers as a dict.

    Fix vs. original: the ``type == 1`` branch and the fallback branch
    contained byte-identical header strings, so the dead branching was
    removed. Every ``type`` value yields the same headers, exactly as
    before; the parameter is kept for caller compatibility.
    """
    headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    return headers_todict(headers)
def get_headers(self, type=1):
    """Return Shopee request headers as a dict.

    type 1 selects the XHR/API headers used against shopee.com.my; any
    other value selects plain desktop-Chrome HTML headers.
    """
    api_template = '''accept: */*
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
referer: https://shopee.com.my/all_categories
sec-fetch-dest: empty
sec-fetch-mode: cors
sec-fetch-site: same-origin
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36
x-api-source: pc
x-requested-with: XMLHttpRequest'''
    html_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    chosen = api_template if type == 1 else html_template
    return headers_todict(chosen)
def get_headers(self, type="1"):
    """Return AliExpress feedback request headers as a dict.

    type "1" (string!) selects the feedback.aliexpress.com form-POST
    headers; any other value selects plain desktop-Chrome HTML headers.
    """
    feedback_template = '''Host: feedback.aliexpress.com
Connection: keep-alive
Cache-Control: max-age=0
Origin: https://feedback.aliexpress.com
Upgrade-Insecure-Requests: 1
Content-Type: application/x-www-form-urlencoded
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Referer: https://feedback.aliexpress.com/display/productEvaluation.htm?v=2&productId=32996635572&ownerMemberId=230443220&memberType=seller&startValidDate=&i18n=true
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9'''
    html_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    chosen = feedback_template if type == "1" else html_template
    return headers_todict(chosen)
def get_headers(self, type=1):
    """Return Fnac request headers as a dict.

    type 1 selects the cookie-carrying navigation headers; any other
    value selects plain desktop-Chrome HTML headers.
    NOTE(review): the hard-coded datadome cookie will expire — confirm
    it is refreshed elsewhere.
    """
    cookie_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
cookie: akavpau_FRPRD_FNACCOM=1579504176~id=c47606a9252db9ae6b6424077f58d7e4; datadome=Y7k9tjHyvrXoo9-JIl5bDmSAvRSDneVG1e2pruOX.vUcQykUqMFxcqF_W7-lpxQy30ef45kU2gL.z9B_mFruvO5aaeLL81KaW3KQe70COl
pragma: no-cache
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    html_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    chosen = cookie_template if type == 1 else html_template
    return headers_todict(chosen)
def get_headers(self, type=1):
    """Return Qichacha request headers as a dict.

    type 1 selects the authenticated XHR headers for www.qichacha.com;
    any other value selects plain desktop-Chrome HTML headers.
    NOTE(review): the hard-coded session cookie will expire — confirm it
    is refreshed elsewhere.
    """
    xhr_template = '''Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Cookie: QCCSESSID=s840r5rdj2rbck69tc7plc7i40; UM_distinctid=170a9c662216a-0d96b081e7094c-b791237-240000-170a9c66222d11; zg_did=%7B%22did%22%3A%20%22170a9c662b13ee-076880bdb17784-b791237-240000-170a9c662b268f%22%7D; _uab_collina=158339631590820690201933; acw_tc=73dc082415833963157051353e9213409bbbf2078f2453340d24defabe; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1583396316,1583396373,1583459671,1583470013; CNZZDATA1254842228=1004506580-1583391724-https%253A%252F%252Fsp0.baidu.com%252F%7C1583472798; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201583474054515%2C%22updated%22%3A%201583474054687%2C%22info%22%3A%201583396315830%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%229c1d7cbd2e6e44ca53f5b36a49bdee3d%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1583474055
Host: www.qichacha.com
Referer: https://www.qichacha.com/search?key=92429005MA4DC7B27E
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36
X-Requested-With: XMLHttpRequest'''
    html_template = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    chosen = xhr_template if type == 1 else html_template
    return headers_todict(chosen)
def parse_shopid(self, response):
    """Parse the shop counter (jsonp) response: archive the raw payload,
    record the shop status, and for active shops schedule the mobile
    credit-page follow-up request.

    Fix vs. original: the status regex is now a raw string so its
    backslash escapes reach ``re`` verbatim (avoids the invalid-escape
    deprecation on newer Pythons; matching behaviour is unchanged).
    """
    youxiao = 'jsonp102({"SCCP'  # marker of a valid counter response
    text = response.text
    if youxiao in text:
        # Flatten whitespace so the regex below matches across lines.
        text = text.replace("\n", "")
        text = text.replace("\r", "")
        text = text.replace("\t", "")
        # Archive the raw source for offline re-parsing.
        item_s = taobao()
        item_s["source_code"] = text
        item_s["pipeline_level"] = "店铺扫描"
        yield item_s
        meta = response.meta
        shop_id = meta.get("shop_id")
        # Counter value for the shop; non-zero means the shop exists.
        zhuangtai_s = self.match_zhengze(r'"SCCP_2_[^"]*":([^\}]*)\}', text)
        item = taobao()
        item["zhuangtai"] = zhuangtai_s
        item["shop_id"] = shop_id
        item["pipeline_level"] = "店铺扫描"
        yield item
        # NOTE(review): int() raises if the counter is missing/non-numeric
        # — presumably match_zhengze always returns digits here; confirm.
        if int(zhuangtai_s) > 0:  # shop considered live
            # Mobile shop credit page follow-up.
            url = "http://shop.m.taobao.com/shop/shop_info.htm?shop_id={}&tbpm=3"
            url = url.format(shop_id)
            headers_str = '''User-Agent:Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A403 Safari/8536.25'''
            headers = headers_todict(headers_str)
            yield scrapy.Request(url=url, callback=self.parse_shopxinyong, method="GET", headers=headers, meta={"shop_id": shop_id})
    else:
        # Invalid/blocked response: requeue through the retry helper.
        request = self.try_again(response)
        if request:
            yield request
class AmazonPhSpider(RedisSpider):
    """Scrapes DHgate seller "Business Information" pages for company names.

    NOTE(review): the class is named AmazonPhSpider but everything in it
    targets dhgate/duhuang — presumably copied from a sibling spider;
    confirm before renaming anything callers reference.

    Bug fixed: ``try_again`` retried only when ``try_num > max_num`` —
    inverted relative to every sibling spider's identical helper, so no
    request was ever retried (0 > 5 is false on the first failure). The
    condition is now ``try_num < max_num``, matching the other spiders.
    """
    name = 'duhuang'
    allowed_domains = ['duhuang.com']
    start_urls = ['http://www.duhuang.com/']
    # Redis list the scrapy-redis scheduler pops seed URLs from.
    redis_key = "duhuang:start_url"
    # Authenticated dhgate session headers.
    # NOTE(review): the hard-coded Cookie will expire — confirm refresh.
    headers = headers_todict('''Host: www.dhgate.com
Connection: keep-alive
Cache-Control: max-age=0
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36
Sec-Fetch-Mode: navigate
Sec-Fetch-User: ?1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Sec-Fetch-Site: none
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Cookie: vid=rBUUMl2DFlQ+oA/mBSOuAg==; NTKF_T2D_CLIENTID=guest14AD0ED5-5CE7-1831-3338-480F3C006C44; cto_lwid=2b0fa1c5-ad41-4220-ab73-8d28fee0f719; gaVisitorUuid=74e7a602-8866-4041-ae8e-b18a061188c5; pw_deviceid=40caaaf1-f170-4c61-adaf-589b66d206ae; pw_status_912a3e66fccf9b4a6ba50e65fd43522dabaecbe69c771596aeead701e85dd0af=deny; vscr_vid=ff3a19737627479e990f4ffb12fba990; smc_uid=1569314979382597; smc_tag=eyJpZCI6NzMzLCJuYW1lIjoiZGhnYXRlLmNvbSJ9; smc_not=default; seller_site_lang=zh_CN; seller_site_region=CN; c=UeHqQFOh-1575546092257-b880d6d2b8bf51942328817; c_haslogined=1; dh_isChange=isChange; gaIsValuable=1; searchinfo=pagesize%3D24%3Bviewtype%3D1%3B; dhc_s=321291c2-c2f6-4d69-a26e-c748d629cdcb; ref_f=seo|seller||organic|baidu||seller.dhgate.com; session=h9FBr1ccCnbexEA1MYiXQQ; _Jo0OQK=66FAFB7B7665388F613191308D62742C63A8C85B45E99FF22161029F88AF1D6D21E0C6E93F2376B923EBA903DA2963E8C4827B1313D3EFD632D94B573A64B71623CEFE3C935BE26F12FF0CFE73051AB9DA509C47B205529371D27B1313D3EFD632D419E13A61F2567F6E7302D862DBB29F0GJ1Z1Xw==; suship=CN; language=en; intl_locale=en; nTalk_CACHE_DATA={uid:dh_1000_ISME9754_ff8080816eea80d9016f83f286d6287f,tid:1578466731910275}; _pk_ses..c028=*; smc_sesn=2; item_recentvisit=446848411%2C409056389%2C453794591; cto_bundle=HhAwFF9mMXBzTFZGOU9PcEZvUmdQSDhiQlBWbXQ2THN4QTElMkIwRWFiS1ZqbE5ERnl6Yml1VHRlR2VWdWJuaHU1OWl0SldSVGR6bmZHRyUyQiUyQkpVJTJGSjZpdVpOQ1NRMDlmdjBUcWM1U2hQTkgxN21jRWpRUG8wdG9XZmwwYnAlMkZad2lwczd4bGpzd21ORHM2UDM1THhaSEVRcWF6VjRxWTJMZUxEc0hXSVNSaVpBZzhSVVYwJTNE; JSESSIONID=Q43UpUYr-5Drsk_EvDQJhAbbebLITvtbNyrzYptz; dht_lot=Public_S0003; b2b_cart_sid=0a593161-b5e6-4018-932b-1de576305240; b2b_ip_country=CN; login_auth_token=ca75df1d-853d-46e4-91ba-8b660fb4b656; __utma=251624089.850967565.1578468269.1578468269.1578468269.1; __utmc=251624089; __utmz=251624089.1578468269.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; _ga=GA1.2.850967565.1578468269; _gid=GA1.2.654880981.1578468273; _fmdata=AuB8%2Bk%2Bmil%2FzIeL26QejkjppmwIFzN2AxdgfZ01aMgeDzzSsSGiKxgFm68VLcxPDjlTtB5cP%2BIhBXVVhxGqyVGDOw7CYeO19jSvEzCz9lro%3D; _xid=qLL57rIGQyu5PK%2FSDHOvtOjWIdYE8tTR2XYYacit1AjDI0CYMjdBhDXX7%2FohBsPqIYMFq22NdMccLf6MOT80Sg%3D%3D; B2BCookie=a5450f13-7766-4749-81ff-447f6ad4054c; b2b_b_t_v2=789dbd93aa6c91eed57d96db0d4b1065c8b64a5497fd49b7449cec61c98e5196272920b2674d30991cd6fe75b8facb477487535198a0fbd420cee0f6ef246cc72f4a4e3f529074c3dd52aa9d7c87f88b10b4947c633a422543bcb9e2a1882c98; _Session_ID=pugdaTSL1we3L4Mfsg3CcT6bDfzBoWhAFLZP6U0m; bc=c|e; b2b_buyer_lv=0; b2b_nick_n=516387331; _b_o0l=8a9a0b754374779ebd197ef8054358bcbc7adae9758e4366d312f63a8a07bcb2; _b_o05="0,0,0,0"; [email protected]; b2b_buyerid=ff8080816eea80d9016f83f286d6287f; b_u_cc=ucc=CN; pvn=96; lastvisittime=1578468295373; vnum=30; __utmb=251624089.2.10.1578468269; smc_spv=2; smc_tpv=3; smct_session={"s":1578466752509,"l":1578468408511,"lt":1578468320333,"t":79,"p":77}''')

    def start_requests(self):
        # Seeds come from a local desktop export: one "shop_id,url" per line.
        with open(r"C:\Users\admin\Desktop\{select_敦煌_业务信息URL}[店铺ID,Business_Information_url].txt", "r", encoding="utf-8") as f:
            for i in f:
                data = i.strip().split(",")
                id = data[0]
                url = data[1]
                meta = {"key": id}
                yield scrapy.Request(url=url, method="GET", headers=self.headers, dont_filter=True, meta=meta)

    def parse(self, response):
        """Extract the company name from a Business Information page."""
        # "Information" in the body proves the page rendered.
        youxiao = re.search("(Information)", response.text)
        url = response.url
        key = response.meta.get("key")
        if youxiao:
            title = response.css(".b-title").xpath("./text()").get()
            item = GmWorkItem()
            item["key"] = key
            item["url"] = url
            item["company_name"] = title
            yield item
        else:
            print("错误")
            try_result = self.try_again(response, key)
            yield try_result

    def try_again(self, rsp, key):
        """Requeue the failed request up to max_num times, then emit an
        error item so the key is still accounted for."""
        max_num = 5
        meta = rsp.meta
        try_num = meta.get("try_num", 0)
        # FIX: was `try_num > max_num`, which never retried (see class doc).
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e
class AlibabgjSpider(RedisSpider):
    """Scrapes alibaba.com shop contact-info pages, then the site's
    ``companyCard`` render component for sales figures.

    NOTE(review): reconstructed from a collapsed source line — the
    nesting of the retry branches was inferred from the pattern used by
    the sibling spiders in this project; verify against version control.
    """
    name = 'alibabgj_shop'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://www.alibaba.com/']
    # Redis list the scrapy-redis scheduler pops seed URLs from.
    redis_key = "alibabgj_shop:start_url"
    # Default desktop-Chrome headers for shop pages.
    headers = headers_todict('''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36''')

    def start_requests(self):
        # Warm-up request (baidu); its response only triggers seeding.
        url = "https://www.baidu.com"
        headers = self.get_headers("baidu")
        yield scrapy.Request(url=url, method="GET", callback=self.seed_request, headers=headers, dont_filter=True)

    def seed_request(self, response):
        # with open(r"C:\Users\admin\Desktop\alibabgjtest.txt","r",encoding="utf-8") as f:
        # Hard-coded test seed instead of the file read above.
        f = ["hwcodec.en"]
        for i in f:
            i = i.strip()
            url = "https://{}.alibaba.com/contactinfo.html".format(i)
            meta = {"key": i}
            yield scrapy.Request(url=url, method="GET", headers=self.headers, dont_filter=True, meta=meta)

    def parse(self, response):
        """Extract company/contact fields from a contactinfo page and,
        when render-component tokens are present, request sales data."""
        # Any of these markers means the page rendered (even error pages).
        youxiao = re.search("(HTTP 404|Information|302 Found)", response.text)
        url = response.url
        key = response.meta.get("key")
        if youxiao:
            text = response.text
            # item_s = GmWorkItem()
            # item_s["key"] = key
            # item_s["source_code"] = text
            # yield item_s
            address_detail = ""
            company_name = ""
            val_judge = 0  # 0 -> legacy th/td layout, 1 -> newer layout (td[2])
            contact_table = response.css(".contact-table").xpath("./tr")
            if not contact_table:
                contact_table = response.css(".company-info-data.table").xpath("./tr")
                val_judge = 1
            for i in contact_table:
                name = i.xpath("./th").xpath("string(.)").get()
                if val_judge:
                    value = i.xpath("./td[2]").xpath("string(.)").get()
                else:
                    value = i.xpath("./td").xpath("string(.)").get()
                if name and "Address" in name:
                    address_detail = value
                if name and "Company Name" in name:
                    company_name = value
            country = ""
            province = ""
            city = ""
            address = ""
            zip = ""  # NOTE(review): shadows the builtin zip() in this scope
            info_table = response.css(".info-table").xpath("./tr")
            if not info_table:
                # Fallback: dt/dd definition-list layout.
                info_table = response.css(".public-info").xpath("./dl")
                for i in range(len(info_table.xpath("./dt"))):
                    name = info_table.xpath("./dt[{}]".format(i+1)).xpath("string(.)").get()
                    value = info_table.xpath("./dd[{}]".format(i+1)).xpath("string(.)").get()
                    if name and "Country" in name:
                        country = value
                    if name and "Province" in name:
                        province = value
                    if name and "City" in name:
                        city = value
                    if name and "Zip" in name:
                        zip = value
                    if name and "Address" in name:
                        address = value
            else:
                # th/td table layout.
                for i in info_table:
                    name = i.xpath("./th").xpath("string(.)").get()
                    value = i.xpath("./td").xpath("string(.)").get()
                    if name and "Country" in name:
                        country = value
                    if name and "Province" in name:
                        province = value
                    if name and "City" in name:
                        city = value
                    if name and "Zip" in name:
                        zip = value
                    if name and "Address" in name:
                        address = value
            contact_people = response.css(".contact-name").xpath("./text()").get()
            if not contact_people:
                contact_people = response.css(".name").xpath("./text()").get()
            companyJoinYears = response.css(".join-year").xpath("./span/text()").get()
            company_type = response.css(".business-type").xpath("./text()").get()
            ordCnt6m = response.css(".transaction-number-value").xpath("./text()").get()
            ordAmt = response.css(".transaction-amount-value").xpath("./text()").get()
            if ordAmt:
                # Normalise "1,234+" style amounts to bare digits.
                ordAmt = ordAmt.replace(",", "")
                ordAmt = ordAmt.replace("+", "")
            item = GmWorkItem()
            item["key"] = key
            item["url"] = url
            item["company_name"] = company_name
            item["address_detail"] = address_detail
            item["country"] = country
            item["province"] = province
            item["city"] = city
            item["address"] = address
            item["zip"] = zip
            item["contact_people"] = contact_people
            item["sales_money"] = ordAmt
            item["sales_num"] = ordCnt6m
            item["company_type"] = company_type
            item["keep_time"] = companyJoinYears
            yield item
            if response.status == 200:
                # Pull the render-component tokens (URL-encoded JSON) needed
                # to call the companyCard endpoint for sales figures.
                bizId = ""
                host_token = ""
                siteId = ""
                pageId = ""
                match = re.search("bizId%22%3A(.*?)%2C%22", text)
                if match:
                    bizId = match.group(1)
                match1 = re.search("host_token:'(.*?)'", text)
                if match1:
                    host_token = match1.group(1)
                match2 = re.search("siteId%22%3A(.*?)%2C%22", text)
                if match2:
                    siteId = match2.group(1)
                match3 = re.search("pageId%22%3A(.*?)%2C%22", text)
                if match3:
                    pageId = match3.group(1)
                language = "en_US"
                envMode = "product"
                renderType = "component"
                componentKeys = "companyCard"
                data = {"bizId": bizId, "language": language, "envMode": envMode, "hostToken": host_token, "siteId": siteId, "pageId": pageId, "renderType": renderType, "componentKeys": componentKeys}
                meta = {"key": key}
                sale_url = "https://{}.alibaba.com/event/app/alisite/render.htm".format(key)
                # Only call the endpoint when every token was found.
                if bizId and host_token and siteId and pageId:
                    yield scrapy.FormRequest(url=sale_url, callback=self.sale_money, formdata=data, meta=meta)
        else:
            # Page did not render: requeue through the retry helper.
            try_result = self.try_again(response, key)
            yield try_result

    def sale_money(self, response):
        """Parse the companyCard render response (escaped JSON) and emit
        a sales-figures item."""
        effective = '"success":true'  # marker of a successful render call
        meta = response.meta
        key = meta.get("key")
        if re.search(effective, response.text):
            companyName = ""
            ordAmt = ""
            ordCnt6m = ""
            company_type = ""
            companyJoinYears = ""
            match = re.search(r'\\"companyName\\":\\"(.*?)\\"', response.text)
            if match:
                companyName = match.group(1)
            match1 = re.search(r'\\"ordAmt\\":\\"(.*?)\\"', response.text)
            if match1:
                ordAmt = match1.group(1)
                ordAmt = ordAmt.replace(",", "")
                ordAmt = ordAmt.replace("+", "")
            match2 = re.search(r'\\"ordCnt6m\\":(\d*)', response.text)
            if match2:
                ordCnt6m = match2.group(1)
            # NOTE(review): this generic \"value\" pattern matches the FIRST
            # value field in the payload — confirm it is the business type.
            match3 = re.search(r'\\"value\\":\\"(.*?)\\"', response.text)
            if match3:
                company_type = match3.group(1)
            match3 = re.search(r'\\"companyJoinYears\\":\\"(.*?)\\"', response.text)
            if match3:
                companyJoinYears = match3.group(1)
            item = GmWorkItem()
            item["key"] = key
            item["company_name"] = companyName
            item["sales_money"] = ordAmt
            item["sales_num"] = ordCnt6m
            item["company_type"] = company_type
            item["keep_time"] = companyJoinYears
            item["pipeline_level"] = "销量"
            yield item
        else:
            try_result = self.try_again(response, key)
            yield try_result

    def try_again(self, rsp, key):
        """Requeue the failed request up to max_num times, then emit an
        error item so the key is still accounted for."""
        max_num = 5
        meta = rsp.meta
        try_num = meta.get("try_num", 0)
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e

    def get_headers(self, type="1"):
        """Return desktop-Chrome headers as a dict.

        NOTE(review): all three branches ("1", "baidu", fallback) contain
        identical header strings — likely placeholders for per-site
        variants that were never filled in.
        """
        if type == "1":
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        elif type == "baidu":
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        else:
            headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
        return headers_todict(headers)
class AmazongsSpider(RedisSpider):
    """Maps Amazon UK product ids to the sellers offering them, via the
    offer-listing (``/gp/aw/ol/``) pages, following pagination.

    NOTE(review): allowed_domains is 'alibaba.com' while every request
    targets amazon.co.uk — presumably OffsiteMiddleware is disabled or
    this is a copy-paste leftover; confirm.
    """
    name = 'amazon_goodstoshop'
    allowed_domains = ['alibaba.com']
    start_urls = ['http://www.amazon.com/']
    # Redis list the scrapy-redis scheduler pops seed URLs from.
    redis_key = "amazon_goodstoshop:start_url"
    # Default desktop-Chrome request headers.
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )

    def start_requests(self):
        # Warm-up request (baidu); its response only triggers seeding.
        url = "https://www.baidu.com/"
        yield scrapy.Request(url=url, method="GET", callback=self.seed_request, headers=self.headers, dont_filter=True)

    def seed_request(self, response):
        # Seed product ids come from a local desktop export file.
        path = r"C:/Users\admin/Desktop/"
        file_name = "{6_1排行榜_offer有效}[goodsid].txt_去重 - 副本.txt"
        with open(path + file_name, "r", encoding="utf-8") as f:
            for i in f:
                i = i.strip()
                page_num = 0
                url = "https://www.amazon.co.uk/gp/aw/ol/{}".format(i)
                yield scrapy.Request(url=url, method="GET", headers=self.headers, meta={
                    "page_num": page_num, "key": i
                })

    def parse(self, response):
        """Extract seller name/url/id per offer and follow pagination."""
        # olp markers prove this is a rendered offer-listing page.
        youxiao = re.search("(olpOfferList|olpProduct)", response.text)
        key = response.meta.get("key")
        if youxiao:
            # Archive the raw page source for offline re-parsing.
            item_s = GmWorkItem()
            item_s["key"] = key
            item_s["source_code"] = response.text
            yield item_s
            shop_list = response.css(
                ".a-section.a-spacing-double-large").xpath(
                "./div//h3[@class='a-spacing-none olpSellerName']/a")
            if not shop_list:
                # Emit an empty record so the key is still accounted for.
                item = GmWorkItem()
                item["key"] = key
                item["name"] = ""
                item["url"] = ""
                item["seller_id"] = ""
                yield item
            for i in shop_list:
                name = i.xpath("./text()").get()
                if name:
                    name = name.strip()
                url = i.xpath("./@href").get()
                seller_id = ""
                # Seller id lives in the s=/seller= query parameter.
                match = re.search('(s|seller)=(.*?)($|[&])', url)
                if match:
                    seller_id = match.group(2)
                item = GmWorkItem()
                item["key"] = key
                item["name"] = name
                item["url"] = url
                item["seller_id"] = seller_id
                yield item
            # Follow the "next page" link when present.
            next_url = response.css("li.a-last").xpath("./a/@href").get()
            if next_url:
                next_url = "https://www.amazon.co.uk" + next_url
                yield scrapy.Request(url=next_url, method="GET", headers=self.headers, meta={"key": key})
        else:
            # Page did not render: requeue through the retry helper.
            try_result = self.try_again(response, key)
            yield try_result

    def try_again(self, rsp, key):
        """Requeue the failed request up to max_num times, then emit an
        error item so the key is still accounted for."""
        max_num = 10
        meta = rsp.meta
        try_num = meta.get("try_num", 0)
        if try_num < max_num:
            try_num += 1
            request = rsp.request
            request.dont_filter = True
            request.meta["try_num"] = try_num
            return request
        else:
            item_e = GmWorkItem()
            item_e["error_id"] = 1
            item_e["key"] = key
            return item_e
def parse_shopxinyong(self, response):
    """Parse a mobile Taobao/Tmall shop page into a shop-credit item, then
    fan out to the seller-info, desktop-credit and asyn endpoints.

    Yields a ``taobao`` item (pipeline_level "手机店铺信用") when the page
    matches a known-valid marker, otherwise re-queues via try_again.
    """
    text = response.text
    # Validity markers: either real shop content or a recognizable
    # "shop gone / redirected" page — anything else is treated as a bad fetch.
    youxiao = '(您浏览店铺不存在|没有找到相应的店铺|店主被删除或冻结了|掌柜|您查看的页面找不到了|Location:http://\.m\.tmall\.com|com/error1\.html|//chaoshi[a-z]*\.m\.tmall|//aliqin\.tmall|//a\.m\.tmall|modbundle-start)'
    youxiao_m = re.search(youxiao, text)
    if youxiao_m:
        # Flatten the page so single-line regexes can span原 line breaks.
        text = text.replace("\n", "")
        text = text.replace("\r", "")
        text = text.replace("\t", "")
        meta = response.meta
        shop_id = meta.get("shop_id")
        zhuangtai_s = self.match_zhengze(
            '(店铺不存在|没有找到|掌柜|删除或冻结|Location:http://[^\.]*\.m\.tmall)',
            text)  # shop status (exists / deleted / frozen / redirected)
        shop_id_s = self.match_zhengze(
            'shop_id=([^"#;]*)"|shopId = "([^#"]*)";', text)  # shop id
        sellerid_s = self.match_zhengze(
            '''data-suid='([^']*)'|seller_id=([^"]*)"''', text)  # seller id
        zhanggui_s = self.match_zhengze('>掌柜ID</label>([\s\S]+?)<div ', text)  # shopkeeper id
        # FIX: was 'nick = ([ ^ "]*)"' — stray spaces inside the character
        # class made it match space/^/" instead of "anything but a quote".
        # NOTE(review): sibling patterns quote the value; confirm whether the
        # page emits nick = "..." and tighten to 'nick = "([^"]*)"' if so.
        nickurl_s = self.match_zhengze('nick = ([^"]*)"', text)  # nick url
        nick_s = self.match_zhengze('"nick":"([^"]*)",', text)  # nick
        shop_name_s = self.match_zhengze('title>([\s\S]+?)</titl', text)  # shop name
        haoping_s = self.match_zhengze('好评率:([^<]*)<', text)  # positive-rating rate
        miaoshuxf_s = self.match_zhengze('描述相符</label>([^<]*)<', text)  # description accuracy
        fuwutd_s = self.match_zhengze('服务态度</label>([^<]*)<', text)  # service attitude
        fahuosd_s = self.match_zhengze('发货速度</label>([^<]*)<', text)  # shipping speed
        area_s = self.match_zhengze('label>地区</label>([\s\S]+?)</li>', text)  # location
        phone_s = self.match_zhengze("客服电话:<[^>]*>([^<]*)<", text)  # service phone
        shopurl_s = self.match_zhengze('"shopUrl":"([^"]*)"', text)  # shop url
        item = taobao()
        item["zhuangtai"] = zhuangtai_s
        item["seller_id"] = sellerid_s
        item["shop_id"] = shop_id_s
        item["zhanggui"] = zhanggui_s
        item["nickurl"] = nickurl_s
        item["nick"] = nick_s
        item["shop_name"] = shop_name_s
        item["haoping"] = haoping_s
        item["miaoshuxf"] = miaoshuxf_s
        item["fuwutd"] = fuwutd_s
        item["fahuosd"] = fahuosd_s
        item["area"] = area_s
        item["phone"] = phone_s
        item["shopurl"] = shopurl_s
        item["pipeline_level"] = "手机店铺信用"
        yield item
        if sellerid_s:
            # Main-sale category endpoint.
            url_1 = "https://ext-mdskip.taobao.com/extension/seller_info.htm?user_num_id={}"
            headers = '''User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36
Referer:https://www.taobao.com'''
            url_1 = url_1.format(sellerid_s)
            headers = headers_todict(headers)
            yield scrapy.Request(url=url_1, callback=self.main_sale,
                                 method="GET", headers=headers,
                                 meta={"seller_id": sellerid_s})
            # Desktop shop-credit counter endpoint.
            url_2 = "https://count.taobao.com/counter3?keys=SM_368_dsr-{}&callback=jsonp173"
            # FIX: the raw header string was a bare UA value with no
            # "User-Agent:" key, so headers_todict had no "key:value" line
            # to parse — unlike every other call site in this file.
            headers = "User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
            url_2 = url_2.format(sellerid_s)
            headers = headers_todict(headers)
            yield scrapy.Request(url=url_2, callback=self.parse_shopxinyong_diannao,
                                 method="GET", headers=headers,
                                 meta={"seller_id": sellerid_s})
            # Asynchronous shop widget endpoint.
            asyn_url = "http://hdc1.alicdn.com/asyn.htm?userId={}&pageId=&v=2014"
            headers = "User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36"
            asyn_url = asyn_url.format(sellerid_s)
            headers = headers_todict(headers)
            yield scrapy.Request(url=asyn_url, callback=self.parse_asyn_good,
                                 method="GET", headers=headers,
                                 meta={"shop_id": shop_id})
    else:
        # Unrecognized page: re-queue (this spider's try_again takes only
        # the response — distinct from AmazongsSpider.try_again).
        request = self.try_again(response)
        if request:
            yield request
def parse_tui_diannao(self, response):
    """Parse the tui.taobao.com recommendation JSONP response into one
    ``taobao`` item per recommended product, then request the "看了又看"
    (viewed-also-viewed) feed for each item id.
    """
    text = response.text
    # Valid responses contain either item records or an explicit empty result.
    youxiao = '("itemId"|"result":\[\])'
    youxiao_m = re.search(youxiao, text)
    if youxiao_m:
        # Flatten so the per-record regexes never straddle a line break.
        text = text.replace("\n", "")
        text = text.replace("\r", "")
        text = text.replace("\t", "")
        meta = response.meta
        shop_id = meta.get("shop_id")
        url1 = "https://tui.taobao.com/recommend?shop_id={}&item_ids={}&floorId=42296&pSize=12&callback=detail_pine&appid=6862&count=12&pNum=0"
        headers1 = '''Referer: https://item.taobao.com/item.htm?id=590354499275
User-Agent:Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.6.2000 Chrome/30.0.1599.101 Safari/537.36'''
        # Crude record split: JSON objects are separated by "},{".
        fenlie = "},{"
        for i in text.split(fenlie):
            text_s = i
            shop_type_s = self.match_zhengze(
                '"userType":[\s]*["]{0,1}([^",\}]*)', text_s)
            shop_id_s = self.match_zhengze(
                '"shopId":[\s]*["]{0,1}([^",\}]*)', text_s)
            seller_id_s = self.match_zhengze(
                '"sellerId":[\s]*["]{0,1}([^",\}]*)', text_s)
            item_id_s = self.match_zhengze(
                '"itemId":[\s]*["]{0,1}([^",\}]*)', text_s)
            good_name_s = self.match_zhengze('"itemName":"([\s\S]+?)",',
                                             text_s)  # UTF8
            price_s = self.match_zhengze('"price":[\s]*["]{0,1}([^",\}]*)',
                                         text_s)
            promotion_price_s = self.match_zhengze(
                '"promotionPriceRaw":[\s]*["]{0,1}([^",\}]*)', text_s)
            sell_count_s = self.match_zhengze(
                '"sellCount":[\s]*["]{0,1}([^",\}]*)', text_s)
            # FIX: pattern was '"monthSellCount": [\s]*["]{0,1}([^",\}] * )'
            # — stray spaces quantified a literal space and never matched;
            # normalized to the same shape as the sibling patterns above.
            mouth_count_s = self.match_zhengze(
                '"monthSellCount":[\s]*["]{0,1}([^",\}]*)', text_s)
            quantity_s = self.match_zhengze(
                '"quantity":[\s]*["]{0,1}([^",\}]*)', text_s)
            favor_count_s = self.match_zhengze(
                '"favorCount":[\s]*["]{0,1}([^",\}]*)', text_s)
            brand_id_s = self.match_zhengze('"brandId":([0-9]*)', text_s)
            category_id_s = self.match_zhengze(
                '"categoryId":[\s]*["]{0,1}([^",\}]*)', text_s)
            category_id_lv1_s = self.match_zhengze(
                '"categoryLv1Id":[\s]*["]{0,1}([^",\}]*)', text_s)
            sub_item_name_s = self.match_zhengze(
                '"subItemName":[\s]*["]{0,1}([^",\}]*)', text_s)
            pic_s = self.match_zhengze('"pic":[\s]*["]{0,1}([^",\}]*)', text_s)
            item = taobao()
            item["shop_type"] = shop_type_s
            item["shop_id"] = shop_id_s
            item["seller_id"] = seller_id_s
            item["good_name"] = good_name_s
            item["price"] = price_s
            item["promotion_price"] = promotion_price_s
            item["sell_count"] = sell_count_s
            item["mouth_count"] = mouth_count_s
            item["quantity"] = quantity_s
            item["favor_count"] = favor_count_s
            item["brand_id"] = brand_id_s
            item["category_id"] = category_id_s
            item["category_id_lv1"] = category_id_lv1_s
            item["sub_item_name"] = sub_item_name_s
            item["pic"] = pic_s
            item["pipeline_level"] = "tui店铺"
            yield item
            if item_id_s:
                # Follow the recommendation feed keyed by this item id.
                url = url1.format(shop_id, item_id_s)
                headers = headers_todict(headers1)
                yield scrapy.Request(url=url, callback=self.parse_kanleyoukan,
                                     method="GET", headers=headers,
                                     meta={"shop_id": shop_id,
                                           "item_id": item_id_s})
    else:
        request = self.try_again(response)
        if request:
            yield request
class AmazonPhSpider(RedisSpider):
    """Walk the Amazon best-seller ("zgbs") category tree.

    Each response both descends one level deeper into the sidebar category
    list and emits one AmazonItem per ranked product on the current page;
    pagination within a category is followed at the same depth.
    """

    name = 'amazon_ph'
    allowed_domains = ['www.amazon.com']
    start_urls = ['http://www.amazon.com/']
    redis_key = "amazon_ph:start_url"
    # Browser-like default headers, parsed from "key: value" lines.
    headers = headers_todict(
        '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9
cache-control: no-cache
pragma: no-cache
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'''
    )

    def start_requests(self):
        # Seed at the root best-seller listing; depth 1 of the category tree.
        seed = "https://www.amazon.com/销售排行榜/zgbs/ref=zg_bs_unv_0_1045024_4?language=zh_CN"
        yield scrapy.Request(url=seed, method="GET", headers=self.headers,
                             dont_filter=True, meta={"deep": 1})

    def parse(self, response):
        catelog = response.meta.get("catelog_name")
        deep = response.meta.get("deep", 1)
        catelog_url = response.url

        # Pagination: stay in the same category at the same depth.
        next_page = response.css(".a-last").xpath("./a/@href").get()
        if next_page:
            yield scrapy.Request(url=next_page, method="GET",
                                 headers=self.headers,
                                 meta={"catelog_name": catelog, "deep": deep})

        # Descend: the sidebar nests one extra <ul> per depth level, so the
        # selector grows with `deep`.
        sidebar_xpath = ".{}/li".format("/ul" * deep)
        for node in response.css("#zg_browseRoot").xpath(sidebar_xpath):
            sub_url = node.xpath("./a/@href").get()
            sub_name = node.xpath("./a/text()").get()
            if not sub_url:
                continue
            yield scrapy.Request(url=sub_url, method="GET",
                                 headers=self.headers,
                                 meta={"catelog_name": sub_name,
                                       "deep": deep + 1})

        # Products: one <li> per ranked entry in the ordered list.
        for node in response.css("#zg-ordered-list").xpath("./li"):
            url = node.xpath("./span/div/span/a/@href").get()
            if url:
                url = "https://www.amazon.com" + url
            good_name = node.xpath("./span/div/span/a/div/text()").get()
            if good_name:
                good_name = good_name.strip()
            level = node.xpath(
                "./span/div/span/div[1]/a[1]/i/span/text()").get()
            if level:
                level = level.replace(" out of 5 stars", "")
            evaluates = node.xpath("./span/div/span/div[1]/a[2]/text()").get()
            if evaluates:
                evaluates = evaluates.replace(",", "")
            price = node.xpath(
                "./span/div/span/div[2]/a/span/span/text()").get()
            if price:
                price = price.replace("$", "")
            item = AmazonItem()
            item["url"] = url
            item["good_name"] = good_name
            item["level"] = level
            item["evaluates"] = evaluates
            item["price"] = price
            item["catelog_name"] = catelog
            item["catelog_url"] = catelog_url
            item["deep"] = deep
            yield item