def sort_second(self, response): youxiao = re.search("(category-menu)", response.text) #判断是否出现左边导航列表 youxiao1 = re.search("(js-Search-titleCount)", response.text) #判断是否出现good列表 sort = response.meta.get("sort") if youxiao1: self.get_list(response, youxiao1) item = GmWorkItem() item["sort"] = str(sort) item["url"] = response.url else: url = response.url if youxiao: headers = self.get_headers(1) sort_list = response.css(".category-menu").xpath("./dl/dd//a") for i in sort_list: sort_copy = sort.copy() url = i.xpath("./@href").get() name = i.xpath("./text()").get() if re.search("\w", name) and name not in sort_copy: sort_copy.append(name) item = GmWorkItem() item["sort"] = str(sort_copy) item["url"] = url yield item yield scrapy.Request(url=url, method="GET", callback=self.sort_second, headers=headers, dont_filter=True, meta={"sort": sort_copy}) else: try_result = self.try_again(response, key=str(sort), url=url) yield try_result
def parse(self, response):
    """Parse a fruugo.co.uk search-result page: emit the raw source, one item
    per listed shop link, and (on the first page) requests for all other pages."""
    youxiao = re.search("(information-holder|results)", response.text)
    url_key = response.request.url
    id = response.meta.get("id")
    category = response.meta.get("category")
    first_page = response.meta.get("first_page")
    page_num = response.meta.get("page_num", 1)
    if youxiao:
        # Archive the raw page source.
        item_s = GmWorkItem()
        item_s["url"] = url_key
        item_s["source_code"] = response.text
        yield item_s
        goods_num = response.css(".results").xpath("./text()").get()  # total goods count text
        if goods_num:
            # e.g. "1-64 of 1,234 results" -> "1234"
            match = re.search("of ([^ ]+) results", goods_num)
            if match:
                goods_num = match.group(1)
                goods_num = goods_num.replace(",", "")
        shop_list = response.css(".row.information-holder").xpath("./a")
        if not shop_list:
            print("shop_list有url没有选取", id)
        for i in shop_list:
            url = i.xpath("./@href").get()
            name = i.xpath("./span/text()").get()
            price = i.xpath("./strong/text()").get()
            url = "https://www.fruugo.co.uk" + url
            item = GmWorkItem()
            item["key"] = url_key
            item["name"] = name
            item["url"] = url
            item["price"] = price
            item["goods_num"] = goods_num
            item["category"] = category
            yield item
        # Only the first page fans out pagination requests, capped at limitnum pages.
        if first_page and goods_num:
            headers = self.get_headers(1)
            limitnum = 1000
            per_page = 64
            num = self.get_pagenum(int(goods_num), per_page)
            if num > limitnum:
                num = limitnum
            for i in range(2, num + 1):
                next_url = url_key + "?page={}".format(i)
                # NOTE(review): meta forwards the original page_num, not i — confirm intent.
                meta = {
                    "id": id,
                    "category": category,
                    "page_num": page_num
                }
                yield scrapy.Request(url=next_url,
                                     method="GET",
                                     headers=headers,
                                     dont_filter=True,
                                     meta=meta)
    else:
        try_result = self.try_again(response, url=url_key)
        yield try_result
def parse(self, response):
    """Parse an Amazon offer-listing page: archive the source, yield one item
    per seller (with seller_id extracted from the link), and follow the
    'next page' link when present."""
    youxiao = re.search("(olpOfferList|olpProduct)", response.text)
    key = response.meta.get("key")
    if youxiao:
        item_s = GmWorkItem()
        item_s["key"] = key
        item_s["source_code"] = response.text
        yield item_s
        shop_list = response.css(
            ".a-section.a-spacing-double-large").xpath(
            "./div//h3[@class='a-spacing-none olpSellerName']/a")
        if not shop_list:
            # No sellers on a valid page: emit a blank record so the key is tracked.
            item = GmWorkItem()
            item["key"] = key
            item["name"] = ""
            item["url"] = ""
            item["seller_id"] = ""
            yield item
        for i in shop_list:
            name = i.xpath("./text()").get()
            if name:
                name = name.strip()
            url = i.xpath("./@href").get()
            seller_id = ""
            # seller id appears as "s=" or "seller=" in the query string
            match = re.search('(s|seller)=(.*?)($|[&])', url)
            if match:
                seller_id = match.group(2)
            item = GmWorkItem()
            item["key"] = key
            item["name"] = name
            item["url"] = url
            item["seller_id"] = seller_id
            yield item
        next_url = response.css("li.a-last").xpath("./a/@href").get()
        if next_url:
            next_url = "https://www.amazon.co.uk" + next_url
            yield scrapy.Request(url=next_url,
                                 method="GET",
                                 headers=self.headers,
                                 meta={"key": key})
    else:
        try_result = self.try_again(response, key)
        yield try_result
def sort_all(self, response):
    """Flatten a three-level category JSON tree into one item per leaf.

    Each item carries (catid, category) for the top level, (catid_sub,
    category1) for the second level, and (catid_sub2, category2) for the
    third level — blank when the second-level node has no children.
    Falls back to try_again on a bad response.
    """
    youxiao = re.search('("data")', response.text)
    key = "sort"
    if response.status == 200 and youxiao:
        # Archive the raw JSON payload.
        item_s = GmWorkItem()
        item_s["key"] = key
        item_s["source_code"] = response.text
        yield item_s
        json_data = json.loads(response.text)
        data = json_data.get("data")
        for i in data:
            main = i.get("main")
            catid = main.get("catid")
            name = main.get("name")
            sub = i.get("sub")
            for j in sub:
                sub_sub = j.get("sub_sub")
                catid_sub = j.get("catid")
                name_sub = j.get("name")
                if sub_sub:
                    for x in sub_sub:
                        name_sub2 = x.get("display_name")
                        catid_sub2 = x.get("catid")
                        item = GmWorkItem()
                        item["catid"] = catid
                        item["category"] = name
                        item["catid_sub"] = catid_sub
                        item["category1"] = name_sub
                        item["catid_sub2"] = catid_sub2
                        item["category2"] = name_sub2
                        yield item
                else:
                    item = GmWorkItem()
                    item["catid"] = catid
                    item["category"] = name
                    item["catid_sub"] = catid_sub
                    item["category1"] = name_sub
                    item["catid_sub2"] = ""
                    item["category2"] = ""
                    # BUG FIX: the original built this item but never yielded it,
                    # silently dropping every second-level category without children.
                    yield item
    else:
        yield self.try_again(response, key=key)
def sort_all(self, response):
    """Parse a seller 'About' page (English or Polish) into a single seller
    profile item; retry when the page marker is absent."""
    youxiao = re.search("(About seller|Sprzedający)", response.text)
    url = response.request.url
    if youxiao:
        seller_id = ""
        positive_number = ""
        bad_number = ""
        match = re.search('"sellerId":"(.*?)"', response.text)
        if match:
            seller_id = match.group(1)
        # Class names below are obfuscated/hashed CSS — they break when the
        # site redeploys; presumably kept in sync manually.
        positive_feedback = response.css(
            ".a7caa336.d7c56f78._476b319e").xpath("./text()").get()
        number = response.css(".fa4668cc").xpath("./text()").getall()
        if len(number) == 2:
            positive_number = number[0]
            bad_number = number[1]
        year = response.css("._1604f5d6._82f13583").xpath(
            "./div/div/text()").get()
        # Embedded JSON blob holding company registration details.
        match = re.search('({"leftLink".*?"hideContact":.*?})', response.text)
        regon = ""
        nip = ""
        company_data = []
        if match:
            data_str = match.group(1)
            try:
                data = json.loads(data_str)
                company_data = data.get("companyData")
                # Pick out the Polish registry numbers from the free-text lines.
                for i in company_data:
                    if "REGON" in i:
                        regon = i
                    if "NIP" in i:
                        nip = i
            except:
                # Best-effort: malformed blob leaves the registry fields blank.
                pass
            # else:
            #     try_result = self.try_again(response, url=url)
            #     yield try_result
        item = GmWorkItem()
        item["seller_id"] = seller_id
        item["positive_feedback"] = positive_feedback
        item["positive_number"] = positive_number
        item["bad_number"] = bad_number
        item["year"] = year
        item["regon"] = regon
        item["nip"] = nip
        item["company_data"] = str(company_data)
        yield item
    else:
        try_result = self.try_again(response, url=url)
        yield try_result
def get_list(self, response, youxiao=None):
    """Extract goods entries and breadcrumb categories from a search-list page.

    NOTE(review): within this visible span the built item is assigned but
    never yielded/returned — the function may be truncated here, or the
    items are deliberately discarded; confirm against the full file.
    """
    if not youxiao:
        youxiao = re.search("(js-Search-titleCount)", response.text)
    if youxiao:
        url = response.url
        category_list = []
        page_num = response.meta.get("page_num")
        headers = self.get_headers(1)
        category = response.css(".Breadcrumb-list").xpath("./li")
        # The loop variable deliberately reuses `category`; after the loop it
        # holds the LAST breadcrumb text, which is what gets stored below.
        for i in category:
            category = i.xpath("./a/span/text()").get()
            if not category:
                category = i.xpath("./span[2]/text()").get()
            if category:
                category_list.append(category)
        count = response.css(".Search-titleCount.js-Search-titleCount"
                             ).xpath("./text()").get()
        if count:
            # e.g. "(123)" -> "123"; count is unused afterwards in this span.
            match = re.search("\((\d*)\)", count)
            count = 0
            if match:
                count = match.group(1)
        goods_list = response.css(
            ".clearfix.Article-item.js-Search-hashLinkId")
        for i in goods_list:
            good_url = i.css(".Article-desc").xpath("./span/a/@href").get()
            good_name = i.css(".Article-desc").xpath(
                "./span/a/text()").get()
            good_id = i.xpath("./@id").get()
            score = i.css(".Article-rate.js-bestReview").xpath(
                "./span/span[1]/text()").get()
            score_number = i.css(".Article-rate.js-bestReview").xpath(
                "./span/span[2]/text()").get()
            if score_number:
                # e.g. "(45)" -> "45"
                match = re.search("\((.*?)\)", score_number)
                if match:
                    score_number = match.group(1)
            price = i.css(".userPrice").xpath("./text()").get()
            if price:
                price = price.strip()
            goodshop_url = i.css(".OffersSumary.clearfix").xpath(
                "./a/@href").get()
            item = GmWorkItem()
            item["good_url"] = good_url
            item["good_name"] = good_name
            item["good_id"] = good_id
            item["score"] = score
            item["score_number"] = score_number
            item["price"] = price
            item["goodshop_url"] = goodshop_url
            item["category"] = str(category)
def get_detail(self, response):
    """Parse a JSONP (mtopjsonp3) anchor-detail response: aggregate the
    replay list into one item with total viewer count; retry on failure."""
    seller_id = response.meta.get("seller_id")
    youxiao = re.search("(SUCCESS::调用成功)", response.text)
    if youxiao:
        try:
            # Strip the JSONP wrapper " mtopjsonp3(...)" to get the JSON body.
            match = re.search(" mtopjsonp3\((.*)\)", response.text)
            if match:
                json_str = match.group(1)
                json_data = json.loads(json_str)
                data = json_data.get("data")
                anchorId = data.get("anchorId")
                nick = data.get("nick")
                relation = data.get("relation")
                fansCount = relation.get("fansCount")
                followTopCount = relation.get("followTopCount")
                liveCount = relation.get("liveCount")
                replays = data.get("replays")
                info_list = []
                viewer_totle = 0  # [sic] "totle" kept — item field name depends on it
                for i in replays:
                    info_dict = dict()
                    liveId = i.get("liveId")
                    liveTime = i.get("liveTime")
                    roomTypeName = i.get("roomTypeName")
                    title = i.get("title")
                    viewerCount = i.get("viewerCount")
                    info_dict["liveId"] = liveId
                    info_dict["liveTime"] = liveTime
                    info_dict["roomTypeName"] = roomTypeName
                    info_dict["title"] = title
                    info_dict["viewerCount"] = viewerCount
                    info_list.append(info_dict)
                    viewer_totle += int(viewerCount)
                live_info = json.dumps(info_list)
                item = GmWorkItem()
                item["anchor_id"] = anchorId
                item["nick"] = nick
                item["fans_count"] = fansCount
                item["follow_count"] = followTopCount
                item["live_count"] = liveCount
                item["viewer_totle"] = viewer_totle
                item["live_info"] = live_info
                yield item
        except Exception as e:
            # Any parse failure falls back to the retry path.
            print(e)
            try_result = self.try_again(response, seller_id=seller_id)
            yield try_result
    else:
        try_result = self.try_again(response, seller_id=seller_id)
        yield try_result
def sort_all(self, response):
    """Parse the embedded StoreState_base JSON from a listing page and yield
    one item per good across all item groups; retry on parse failure."""
    youxiao = re.search("(StoreState_base)", response.text)
    url = response.url
    if youxiao:
        # The page ships its state as a JS literal inside a <script> tag.
        match = re.search("StoreState_base'] = ({.*?});</script>", response.text)
        if match:
            data_str = match.group(1)
            try:
                data = json.loads(data_str)
                items = data.get("items")
                items_groups = items.get("itemsGroups", {})
                for i in items_groups:
                    good_list = i.get("items")
                    for j in good_list:
                        good_id = j.get("id")
                        good_url = j.get("url")
                        location = j.get("location", {})
                        city = location.get("city")
                        title = j.get("title", {})
                        good_name = title.get("text")
                        status = j.get("type")
                        price_json = j.get("price", {})
                        normal = price_json.get("normal", {})
                        price = normal.get("amount")
                        sales = j.get("bidInfo")
                        seller = j.get("seller", {})
                        shop_id = seller.get("id")
                        shop_super = seller.get("superSeller")
                        shop_name = seller.get("login")
                        sort_id = j.get("categoryPath")
                        item = GmWorkItem()
                        item["id"] = good_id
                        item["goods_url"] = good_url
                        item["city"] = city
                        item["good_name"] = good_name
                        item["status"] = status
                        item["price"] = price
                        item["sales_num"] = sales
                        item["shop_id"] = shop_id
                        item["shop_super"] = shop_super
                        item["shop_name"] = shop_name
                        item["sort_id"] = sort_id
                        yield item
            except:
                # Malformed state blob: schedule a retry instead of crashing.
                try_result = self.try_again(response, url=url)
                yield try_result
    else:
        try_result = self.try_again(response, url=url)
        yield try_result
def get_sellerid(self, response):
    """Emit a (shop_id, seller_id) item for shops whose page lacks
    'allProducts'; log redirected (non-200) responses.

    Fix: the original ended with `yield print(...)`, which yielded the
    None that print() returns — a meaningless value in the item stream.
    The log call is kept, the spurious yield is dropped.
    """
    meta = response.meta
    seller_id = meta.get("seller_id")
    shop_id = meta.get("shop_id")
    status = response.status
    if status == 200:
        if "allProducts" not in response.text:
            print(shop_id, seller_id)
            item = GmWorkItem()
            item["shop_id"] = shop_id
            item["seller_id"] = seller_id
            yield item
    else:
        # Non-200 (e.g. a 302 redirect) — log and move on.
        print("302:", shop_id, seller_id)
def try_again(self, rsp, key):
    """Re-schedule a failed request up to max_num times; once the budget is
    exhausted, return an error item carrying the key instead.

    Fix: the original tested `try_num > max_num`, which is false on the
    first call (try_num=0) and — since try_num only increments inside that
    branch — forever after, so no request was ever retried. The sibling
    try_again implementations in this project use `try_num < max_num`.
    """
    max_num = 5
    meta = rsp.meta
    try_num = meta.get("try_num", 0)
    if try_num < max_num:
        try_num += 1
        request = rsp.request
        # Bypass the dupe filter so the same URL can be fetched again.
        request.dont_filter = True
        request.meta["try_num"] = try_num
        return request
    else:
        item_e = GmWorkItem()
        item_e["error_id"] = 1
        item_e["key"] = key
        return item_e
def parse(self, response):
    """Extract shop links from the marker div on an Allegro page.

    Archives the raw source first, then yields one item per anchor found
    under the data-box-id container. A page without the marker is handed
    to the retry helper.
    """
    page_url = response.url
    if not re.search("(gS4GqiXvRSi8oJgNBVklGA)", response.text):
        # Marker absent — invalid page, let try_again decide what to do.
        yield self.try_again(response, url=page_url)
        return
    source_item = GmWorkItem()
    source_item["url"] = page_url
    source_item["source_code"] = response.text
    yield source_item
    anchors = response.xpath(
        "//div[@data-box-id='gS4GqiXvRSi8oJgNBVklGA==']/div/ul//a")
    if not anchors:
        print("shop_list有url没有选取")
    for anchor in anchors:
        shop_item = GmWorkItem()
        shop_item["name"] = anchor.xpath("./text()").get()
        shop_item["url"] = "https://allegro.pl" + anchor.xpath("./@href").get()
        yield shop_item
def sort_all(self, response):
    """Collect category links from the page header and crawl each one.

    Fix: the original constructed a GmWorkItem for every category URL but
    never yielded it, so the item was silently discarded; the item is now
    yielded before the follow-up request.
    """
    if response.status == 200:
        headers = self.get_headers(1)
        sort_all = response.css(".container-header._1s2v1._n2pii._sdhee")
        for i in sort_all:
            sort_url = i.xpath("./small/a/@href").get()
            if sort_url:
                sort_url = "https://allegro.pl" + sort_url
                item = GmWorkItem()
                item["url"] = sort_url
                yield item  # FIX: previously built but never emitted
                yield scrapy.Request(url=sort_url,
                                     method="GET",
                                     headers=headers,
                                     dont_filter=True)
            else:
                print("sort_all有url没有选取")
def try_again(self, rsp, **kwargs):
    """Retry helper with retries disabled.

    max_num is -1, so the retry branch can never fire; every call returns
    an error item populated with whatever keyword fields the caller passed.
    The dead branch is kept for parity with the other try_again variants.
    """
    max_num = -1
    attempts = rsp.meta.get("try_num", 0)
    if attempts < max_num:  # unreachable while max_num == -1
        retry_request = rsp.request
        retry_request.dont_filter = True
        retry_request.meta["try_num"] = attempts + 1
        return retry_request
    error_item = GmWorkItem()
    error_item["error_id"] = 1
    for field, value in kwargs.items():
        error_item[field] = value
    return error_item
def parse_shop(self, response):
    """Parse a shop-detail JSON API response into a shop-info item
    (pipeline_level '店铺信息'); retry on error payloads or parse failure."""
    url = response.url
    youxiao = re.search('("error_msg":null)', response.text)
    shop_id = response.meta.get("shop_id")
    if youxiao:
        try:
            items = json.loads(response.text)
            data = items.get("data")
            if data:
                name = data.get("name")
                description = data.get("description")
                country = data.get("country")
                place = data.get("place")
                item_count = data.get("item_count")
                rating_star = data.get("rating_star")
                shop_location = data.get("shop_location")
                follower_count = data.get("follower_count")  # follower count
                rating_good = data.get("rating_good")  # positive reviews
                rating_bad = data.get("rating_bad")  # negative reviews
                cancellation_rate = data.get("cancellation_rate")  # cancellation/return rate
                item = GmWorkItem()
                item["shop_id"] = shop_id
                item["name"] = name
                item["description"] = description
                item["country"] = country
                item["place"] = place
                item["follower_count"] = follower_count
                item["rating_good"] = rating_good
                item["rating_bad"] = rating_bad
                item["cancellation_rate"] = cancellation_rate
                item["url"] = url
                item["item_count"] = item_count
                item["rating_star"] = rating_star
                item["shop_location"] = shop_location
                item["pipeline_level"] = "店铺信息"
                yield item
        except Exception as e:
            print(e)
            yield self.try_again(response, shop_id=shop_id,
                                 pipeline_level="店铺信息")
    else:
        print("无效:{}".format(url))
        yield self.try_again(response, shop_id=shop_id,
                             pipeline_level="店铺信息")
def try_again(self, rsp, shop_id, seller_id, page_num):
    """Re-queue the failed request up to five times; after the budget is
    spent, return an error item identifying the shop/seller/page."""
    attempts = rsp.meta.get("try_num", 0)
    if attempts >= 5:
        # Retry budget exhausted — emit an error marker instead.
        error_item = GmWorkItem()
        error_item["error_id"] = 1
        error_item["shop_id"] = shop_id
        error_item["seller_id"] = seller_id
        error_item["page_num"] = page_num
        return error_item
    retry_request = rsp.request
    retry_request.dont_filter = True  # allow the dupe filter to pass it again
    retry_request.meta["try_num"] = attempts + 1
    return retry_request
def parse(self, response):
    """Yield the company title found on an 'Information' page, or hand the
    response to the retry helper when the marker is absent."""
    key = response.meta.get("key")
    if not re.search("(Information)", response.text):
        print("错误")
        yield self.try_again(response, key)
        return
    record = GmWorkItem()
    record["key"] = key
    record["url"] = response.url
    record["company_name"] = response.css(".b-title").xpath("./text()").get()
    yield record
def sale_money(self, response):
    """Scrape sales figures from an escaped-JSON company-card response
    (fields appear as \\"name\\":\\"value\\" inside the page source)."""
    effective = '"success":true'
    meta = response.meta
    key = meta.get("key")
    if re.search(effective, response.text):
        companyName = ""
        ordAmt = ""
        ordCnt6m = ""
        company_type = ""
        companyJoinYears = ""
        match = re.search(r'\\"companyName\\":\\"(.*?)\\"', response.text)
        if match:
            companyName = match.group(1)
        match1 = re.search(r'\\"ordAmt\\":\\"(.*?)\\"', response.text)
        if match1:
            # Normalize e.g. "1,000+" -> "1000"
            ordAmt = match1.group(1)
            ordAmt = ordAmt.replace(",", "")
            ordAmt = ordAmt.replace("+", "")
        match2 = re.search(r'\\"ordCnt6m\\":(\d*)', response.text)
        if match2:
            ordCnt6m = match2.group(1)
        match3 = re.search(r'\\"value\\":\\"(.*?)\\"', response.text)
        if match3:
            company_type = match3.group(1)
        # match3 is deliberately reused for the next field.
        match3 = re.search(r'\\"companyJoinYears\\":\\"(.*?)\\"',
                           response.text)
        if match3:
            companyJoinYears = match3.group(1)
        item = GmWorkItem()
        item["key"] = key
        item["company_name"] = companyName
        item["sales_money"] = ordAmt
        item["sales_num"] = ordCnt6m
        item["company_type"] = company_type
        item["keep_time"] = companyJoinYears
        item["pipeline_level"] = "销量"
        yield item
    else:
        try_result = self.try_again(response, key)
        yield try_result
def parse(self, response):
    """Parse a company-search hit page (keyMoveItem marker) into a single
    item with company, legal person, address, and matched id; retry when
    the marker is missing."""
    record_id = response.meta.get("id")
    if not re.search('(keyMoveItem)', response.text):
        print("{}错误了".format(record_id))
        yield self.try_again(response, id=record_id)
        return
    result = GmWorkItem()
    result["id"] = record_id
    result["company"] = response.css(".zx-list-item-url").xpath("./text()").get()
    result["legal_person"] = response.css(".legal-txt").xpath("./text()").get()
    result["area"] = response.css(".zx-ent-props").xpath(
        "./span/span[contains(text(),'地址')]/../text()").get()
    result["id_s"] = response.css(".zx-ent-hit-reason-text").xpath(
        "./em/text()").get()
    yield result
def detail_data(self, response):
    """Parse an uploader profile page: read gift/received/likes/fans counters
    out of the '.handle' stat list and combine them with the meta fields."""
    uid = response.meta.get("uid")
    nickname = response.meta.get("nickname")
    signature = response.meta.get("signature")
    labels = response.meta.get("labels")
    match = re.search('handle', response.text)
    if match:
        info_list = response.css(".handle").xpath("./div/ul/li")
        # Counters default to "0" when the corresponding stat is absent.
        gift_num = "0"
        getgift_num = "0"
        praise_num = "0"
        fans_num = "0"
        for i in info_list:
            name = i.xpath("./p/text()").get()
            value = i.xpath("./h4/text()").get()
            # NOTE(review): `name` is assumed non-None here; a missing <p>
            # text would raise TypeError — confirm against live pages.
            if "送礼" in name:  # gifts sent
                gift_num = value.strip()
            elif "收礼" in name:  # gifts received
                getgift_num = value.strip()
            elif "赞" in name:  # likes
                praise_num = value.strip()
            elif "粉丝" in name:  # fans
                fans_num = value.strip()
        item = GmWorkItem()
        item["up_id"] = uid
        item["nick"] = nickname
        item["signature"] = signature
        item["labels"] = str(labels)
        item["gift_num"] = gift_num
        item["getgift_num"] = getgift_num
        item["ol"] = praise_num
        item["fans"] = fans_num
        yield item
    else:
        try_result = self.try_again(response, url=response.url)
        yield try_result
def baidu_second(self, response):
    """Turn a Baidu reverse-geocode JSON reply into a province/city/district
    item for the (company, address) pair carried in meta; retry on a
    non-zero status payload."""
    company = response.meta.get("company")
    address = response.meta.get("address")
    if not re.search('("status":0)', response.text):
        print("百度第一步{}错误了".format(company))
        yield self.try_again(response, company=company, address=address)
        return
    payload = json.loads(response.text)
    component = payload.get("result").get("addressComponent")
    record = GmWorkItem()
    record["company"] = company
    record["address"] = address
    record["province"] = component.get("province")
    record["city"] = component.get("city")
    record["district"] = component.get("district")
    yield record
def parse(self, response):
    """Parse an AMap geocode JSON reply: take the first geocode entry and
    yield province/city/district for the (company, address) pair; retry
    when the API status is not "1"."""
    company = response.meta.get("company")
    address = response.meta.get("address")
    if not re.search('("status":"1")', response.text):
        print("{}错误了".format(company))
        yield self.try_again(response, company=company, address=address)
        return
    geocodes = json.loads(response.text).get("geocodes")
    if geocodes:
        first = geocodes[0]
        record = GmWorkItem()
        record["company"] = company
        record["address"] = address
        record["province"] = first.get("province")
        record["city"] = first.get("city")
        record["district"] = first.get("district")
        yield record
def parse(self, response):
    """Parse a supplier contact page (two alternate layouts) into a company
    profile item, then extract the embedded site tokens and POST a
    follow-up request for the sales figures (handled by sale_money)."""
    youxiao = re.search("(HTTP 404|Information|302 Found)", response.text)
    url = response.url
    key = response.meta.get("key")
    if youxiao:
        text = response.text
        # item_s = GmWorkItem()
        # item_s["key"] = key
        # item_s["source_code"] = text
        # yield item_s
        address_detail = ""
        company_name = ""
        # val_judge flags which of the two table layouts is in use.
        val_judge = 0
        contact_table = response.css(".contact-table").xpath("./tr")
        if not contact_table:
            contact_table = response.css(".company-info-data.table").xpath("./tr")
            val_judge = 1
        for i in contact_table:
            name = i.xpath("./th").xpath("string(.)").get()
            if val_judge:
                value = i.xpath("./td[2]").xpath("string(.)").get()
            else:
                value = i.xpath("./td").xpath("string(.)").get()
            if name and "Address" in name:
                address_detail = value
            if name and "Company Name" in name:
                company_name = value
        country = ""
        province = ""
        city = ""
        address = ""
        zip = ""
        info_table = response.css(".info-table").xpath("./tr")
        if not info_table:
            # Alternate layout: <dl> with parallel <dt>/<dd> pairs.
            info_table = response.css(".public-info").xpath("./dl")
            for i in range(len(info_table.xpath("./dt"))):
                name = info_table.xpath("./dt[{}]".format(i+1)).xpath("string(.)").get()
                value = info_table.xpath("./dd[{}]".format(i+1)).xpath("string(.)").get()
                if name and "Country" in name:
                    country = value
                if name and "Province" in name:
                    province = value
                if name and "City" in name:
                    city = value
                if name and "Zip" in name:
                    zip = value
                if name and "Address" in name:
                    address = value
        else:
            for i in info_table:
                name = i.xpath("./th").xpath("string(.)").get()
                value = i.xpath("./td").xpath("string(.)").get()
                if name and "Country" in name:
                    country = value
                if name and "Province" in name:
                    province = value
                if name and "City" in name:
                    city = value
                if name and "Zip" in name:
                    zip = value
                if name and "Address" in name:
                    address = value
        contact_people = response.css(".contact-name").xpath("./text()").get()
        if not contact_people:
            contact_people = response.css(".name").xpath("./text()").get()
        companyJoinYears = response.css(".join-year").xpath("./span/text()").get()
        company_type = response.css(".business-type").xpath("./text()").get()
        ordCnt6m = response.css(".transaction-number-value").xpath("./text()").get()
        ordAmt = response.css(".transaction-amount-value").xpath("./text()").get()
        if ordAmt:
            # Normalize e.g. "1,000+" -> "1000"
            ordAmt = ordAmt.replace(",", "")
            ordAmt = ordAmt.replace("+", "")
        item = GmWorkItem()
        item["key"] = key
        item["url"] = url
        item["company_name"] = company_name
        item["address_detail"] = address_detail
        item["country"] = country
        item["province"] = province
        item["city"] = city
        item["address"] = address
        item["zip"] = zip
        item["contact_people"] = contact_people
        item["sales_money"] = ordAmt
        item["sales_num"] = ordCnt6m
        item["company_type"] = company_type
        item["keep_time"] = companyJoinYears
        yield item
        if response.status == 200:
            # URL-encoded JSON fragments embedded in the page hold the
            # tokens required by the render API ("%22%3A" is '":').
            bizId = ""
            host_token = ""
            siteId = ""
            pageId = ""
            match = re.search("bizId%22%3A(.*?)%2C%22", text)
            if match:
                bizId = match.group(1)
            match1 = re.search("host_token:'(.*?)'", text)
            if match1:
                host_token = match1.group(1)
            match2 = re.search("siteId%22%3A(.*?)%2C%22", text)
            if match2:
                siteId = match2.group(1)
            match3 = re.search("pageId%22%3A(.*?)%2C%22", text)
            if match3:
                pageId = match3.group(1)
            language = "en_US"
            envMode = "product"
            renderType = "component"
            componentKeys = "companyCard"
            data = {"bizId": bizId, "language": language, "envMode": envMode,
                    "hostToken": host_token, "siteId": siteId, "pageId": pageId,
                    "renderType": renderType, "componentKeys": componentKeys}
            meta = {"key": key}
            sale_url = "https://{}.alibaba.com/event/app/alisite/render.htm".format(key)
            if bizId and host_token and siteId and pageId:
                yield scrapy.FormRequest(url=sale_url,
                                         callback=self.sale_money,
                                         formdata=data,
                                         meta=meta)
            else:
                # Token extraction failed — retry the page.
                # NOTE(review): original indentation was lost; this else is
                # paired with the token check (not with `if youxiao`) —
                # confirm against the original file.
                try_result = self.try_again(response, key)
                yield try_result
def data_parse(self, response):
    """Parse a Ctrip HotelSearch API response: on page 1, POST requests for
    every remaining result page (20 hotels per page), then yield one item
    per hotel in this page's list; retry on a non-'Success' payload."""
    jiaoyan = "Success"
    meta = response.meta
    pinyin = meta.get("pinyin")
    name = meta.get("name")
    city_id = meta.get("city_id")
    page = meta.get("page")
    if jiaoyan in response.text:
        data_json = json.loads(response.text)
        Response = data_json.get("Response")
        resultTitle = Response.get("resultTitle")
        # Keep only the digits, e.g. "共123家酒店" -> "123"
        hotel_num = re.sub("\D", "", resultTitle)
        # Schedule the remaining result pages (first page only).
        if page == "1" and hotel_num:
            # Ceiling division by the 20-per-page size.
            totle_num = int(int(hotel_num) / 20) + 1 if int(hotel_num) % 20 else int(
                int(hotel_num) / 20)
            # if totle_num>2:#test
            #     totle_num = 2
            headers = self.get_headers(2)
            home_url = "https://hotels.ctrip.com/hotels/listPage?cityename={}&city={}".format(
                pinyin, city_id)
            url = "https://m.ctrip.com/restapi/soa2/16709/HotelSearch"
            headers["Referer"] = home_url
            for i in range(2, totle_num + 1):
                # Request-body template; the Chinese placeholders (页数=page
                # number, 城市名称=city name, 城市id=city id, 酒店url=hotel url)
                # are substituted below. The literal newline inside the "P"
                # value is part of the original template.
                data = '''{"meta":{"fgt":"","hotelId":"","priceToleranceData":"","priceToleranceDataValidationCode":"","mpRoom":[],"hotelUniqueKey":"","shoppingid":""},"seqid":"","deduplication":[],"filterCondition":{"star":[],"rate":"","rateCount":[],"priceRange":{"lowPrice":0,"highPrice":-1},"priceType":"","breakfast":[],"payType":[],"bedType":[],"bookPolicy":[],"bookable":[],"discount":[],"zone":[],"landmark":[],"metro":[],"airportTrainstation":[],"location":[],"cityId":[],"amenty":[],"promotion":[],"category":[],"feature":[],"brand":[],"popularFilters":[]},"searchCondition":{"sortType":"1","adult":1,"child":0,"age":"","pageNo":页数,"optionType":"","optionId":"","lat":0,"destination":"","keyword":"","cityName":"城市名称","lng":0,"cityId":城市id,"checkIn":"入住时间","checkOut":"离店时间","roomNum":1,"mapType":"gd","travelPurpose":0,"countryId":1,"url":"酒店url","pageSize":20,"timeOffset":28800,"radius":0,"directSearch":0},"queryTag":"NORMAL","genk":true,"genKeyParam":"a=0,b=入住时间,c=离店时间,d=zh-cn,e=2","webpSupport":true,"platform":"online","pageID":"102002","head":{"Version":"","userRegion":"CN","Locale":"zh-CN","LocaleController":"zh-CN","TimeZone":"8","Currency":"CNY","PageId":"102002","webpSupport":true,"userIP":"","P":"
","ticket":"","clientID":"","Union":{"AllianceID":"","SID":"","Ouid":""},"HotelExtension":{"group":"CTRIP","hasAidInUrl":false,"Qid":"","hotelUuidKey":"","hotelUuid":""}}}'''
                data = data.replace("页数", str(i))
                data = data.replace("城市名称", name)
                data = data.replace("城市id", city_id)
                data = data.replace("酒店url", home_url)
                meta = {
                    "pinyin": pinyin,
                    "name": name,
                    "city_id": city_id,
                    "page": str(i)
                }
                yield scrapy.Request(url=url,
                                     callback=self.data_parse,
                                     method="POST",
                                     body=data,
                                     headers=headers,
                                     dont_filter=True,
                                     meta=meta)
        hotelList = Response.get("hotelList", dict())
        list_data = hotelList.get("list")
        for i in list_data:
            base = i.get("base")
            hotelId = base.get("hotelId")
            hotelEnName = base.get("hotelEnName")
            hotelName = base.get("hotelName")
            tags = str(base.get("tags", ""))
            comment = i.get("comment")
            content = comment.get("content", "")
            comment_num = re.sub("\D", "", content)
            money = i.get("money")
            price = money.get("price")
            position = i.get("position")
            cityName = position.get("cityName")
            area = position.get("area")
            address = position.get("address")
            score = i.get("score")
            number = score.get("number")
            item = GmWorkItem()
            item["hotel_num"] = hotel_num
            item["hotel_id"] = hotelId
            item["hotel_name"] = hotelName
            item["hotel_enname"] = hotelEnName
            item["tag"] = tags
            item["price"] = price
            item["city"] = cityName
            item["area"] = area
            item["address"] = address
            item["comment_num"] = comment_num
            item["comment"] = number
            yield item
    else:
        try_result = self.try_again(response,
                                    pinyin=pinyin,
                                    name=name,
                                    id=city_id,
                                    page=page)
        yield try_result
def parse(self, response):
    """Parse a Shopee search_items API page.

    On page 1, fan out requests for the remaining pages (50 items each,
    capped at 5000 results). Then yield one item per product plus a
    follow-up request for the product's shop page.

    Fix: the original guarded the item loop with
    `if items != None or items != []:` — always true (one side of the
    `or` always holds), so `items is None` crashed the loop and was only
    rescued by the broad except, and the `为空` branch was unreachable.
    Replaced with a plain truthiness test.
    """
    youxiao = re.search('("error":null)', response.text)
    meta = response.meta
    match_id = meta.get("match_id")
    page = meta.get("page")
    url = response.request.url
    headers = self.get_headers(1)
    if youxiao:
        try:
            data = json.loads(response.text)
            if page == 1:
                total_count = data.get("total_count")
                if total_count > 5000:
                    total_count = 5000  # API hard limit
                # Round total_count up to the next multiple of 50.
                pages = int(total_count / 50) * 50 if total_count % 50 == 0 else int(
                    total_count / 50) * 50 + 50
                for i in range(50, pages, 50):
                    page_num_r = i  # "newest" offset for the API
                    new_page = int((page_num_r + 50) / 50)
                    meta_r = {"match_id": match_id, "page": new_page}
                    url_r = 'https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id={}&newest={}&order=desc&page_type=search&version=2'.format(
                        match_id, page_num_r)
                    yield scrapy.Request(url=url_r, headers=headers, meta=meta_r)
            items = data['items']
            if items:  # FIX: was `items != None or items != []` (always true)
                for i in items:
                    shop_id = i.get("shopid")
                    goods_id = i.get("itemid")
                    name = i.get("name")
                    price = i.get("price")  # price, scaled by 100000
                    if price:
                        price = price / 100000
                    currency = i.get("currency")  # currency code
                    historical_sold = i.get("historical_sold")  # lifetime sales
                    sales_num = i.get("sold")
                    stock = i.get("stock")  # stock on hand
                    item_rating = i.get("item_rating")
                    rating_star = ""
                    if item_rating:
                        rating_star = item_rating.get('rating_star')  # rating score
                    item_status = i.get("item_status")  # listing status
                    show_free_shipping = i.get("show_free_shipping")  # free shipping flag
                    brand = i.get("brand")  # brand
                    location = i.get("shop_location")
                    view_count = i.get("view_count")
                    item = GmWorkItem()
                    item["shop_id"] = shop_id
                    item["goods_id"] = goods_id
                    item["name"] = name
                    item["price"] = price
                    item["currency"] = currency
                    item["totle_num"] = historical_sold  # lifetime sales count
                    item["sales_num"] = sales_num
                    item["stock"] = stock
                    item["rating_star"] = rating_star
                    item["item_status"] = item_status
                    item["show_free_shipping"] = show_free_shipping
                    item["brand"] = brand
                    item["url"] = url
                    item["location"] = location
                    item["view_count"] = view_count
                    item["pipeline_level"] = "list"
                    yield item
                    # Follow the shop detail page for this product's shop.
                    shop_url = self.shop_url.format(shop_id)
                    yield scrapy.Request(url=shop_url,
                                         headers=headers,
                                         callback=self.parse_shop,
                                         meta={'shop_id': shop_id})
            else:
                print("为空:{}".format(url))
        except Exception as e:
            print(e)
            yield self.try_again(response, match_id=match_id, page=page)
    else:
        print("无效:{}".format(url))
        yield self.try_again(response, match_id=match_id, page=page)
def get_detail(self, response):
    """Parse an AliExpress store-search API page: archive the JSON, fan out
    requests for the remaining pages (20 items per page, driven by the
    trace.page.aem_count total), and yield one item per product."""
    meta = response.meta
    json_str = response.text
    req_url = response.url
    seller_id = meta.get("seller_id")
    shop_id = meta.get("shop_id")
    page_id = meta.get("page_id")
    # Cheap JSON sniff — a valid payload starts with '{"'.
    if json_str.startswith('{"'):
        item_s = GmWorkItem()
        item_s["source_code"] = json_str
        yield item_s
        json_data = json.loads(json_str)
        # success = json_data.get("success")
        data = json_data.get("data")
        # nextUrl = data.get("nextUrl")
        items = data.get("items")
        # if not items:
        #     print("item为空",shop_id,req_url)
        trace = data.get("trace")
        page = trace.get("page")
        # Total product count for the store, per the trace block.
        aem_count = int(
            page.get("aem_count")) if page.get("aem_count") else 0
        if aem_count:
            # Running global counter; log every 100k goods seen.
            self.goods_num += aem_count
            if self.goods_num % 100000 == 1:
                print(self.goods_num)
            # Schedule the remaining pages; 'start' advances in steps of 20.
            for i in range(20, aem_count, 20):
                url = "https://m.aliexpress.com/api/search/products/items?pageId={}&searchType=storeSearch&sellerAdminSeq={}&storeId={}&infiniteScroll=true&start={}&shipToCountry=US&__amp_source_origin=https%3A%2F%2Fm.aliexpress.com"
                Referer_str = "https://m.aliexpress.com/storesearch/list/.html?sortType=TC3_D&searchType=storeSearch&trace=store2mobilestoreNew&storeId={}"
                # NOTE(review): hard-coded session cookie — presumably expires; confirm.
                cookies = "aefeMsite=amp--wRru0loiCNZjcQEqYc1Ew; ali_apache_id=11.180.122.26.1575437527682.392996.5; isg=BDEx-5kOyCf7m2SmkQaxvTBcQL0LtqIM-G1_rBNGL_giOlOMW256Y8wcWIj58j3I"
                Referer = Referer_str.format(shop_id)
                url = url.format(page_id, seller_id, shop_id, i)
                headers = self.get_headers()
                headers["Cookie"] = cookies
                headers["Referer"] = Referer
                meta = {
                    "page_id": page_id,
                    "seller_id": seller_id,
                    "shop_id": shop_id
                }
                yield scrapy.Request(url=url,
                                     callback=self.get_detail,
                                     method="GET",
                                     headers=headers,
                                     meta=meta)
        for good in items:
            item = GmWorkItem()
            goods_url = good.get("action")
            averageStarStr = good.get("averageStarStr")
            imgUrl = good.get("imgUrl")
            price = good.get("price")
            price1 = price.get("price")
            price_currency = price1.get("currency")
            price_value = price1.get("value")
            productId = good.get("productId")
            subject = good.get("subject")
            item["shop_id"] = shop_id
            item["seller_id"] = seller_id
            item["goods_url"] = goods_url
            item["average_score"] = averageStarStr
            item["img_url"] = imgUrl
            item["currency"] = price_currency
            item["price"] = price_value
            item["goods_id"] = productId
            item["subject"] = subject
            item["shop_id"] = shop_id  # duplicate assignment kept from original
            item["aem_count"] = aem_count
            item["pipeline_level"] = "smt商品列表"
            yield item
    else:
        yield self.try_again(response)
def parse(self, response):
    """Parse a fruugo product page: archive the source, read the title/
    shop/price, then walk the attribute table (brand, category, size,
    Fruugo ID, EAN, VRN, colour) into one item; retry on invalid pages."""
    youxiao = re.search("(product-title|no longer available)", response.text)
    url_key = response.request.url
    if youxiao:
        item_s = GmWorkItem()
        item_s["url"] = url_key
        item_s["source_code"] = response.text
        yield item_s
        good_name = response.css(".mb-8.js-product-title").xpath(
            "./text()").get()
        shop_name = response.css(".Product__Title.js-break-md-right"
                                 ).xpath("./p/a/text()").get()
        shop_url = response.css(".Product__Title.js-break-md-right").xpath(
            "./p/a/@href").get()
        price = response.css(".price.js-meta-price").xpath(
            "./text()").get()
        if price:
            # Strip currency symbols etc., keep digits and the decimal point.
            price = re.sub("[^\d\.]", "", price)
        product = response.css(
            ".table.table-striped.a11y-text-width").xpath("./tr")
        brand = ""
        category = ""
        size = ""
        fruugo_id = ""
        ean = ""
        retailer_vrn = ""
        colour = ""
        for i in product:
            name_product = i.xpath("./th/text()").get()
            value = i.xpath("./td/text()").get()
            value = value.strip() if value else None
            if not value:
                # Some rows wrap the value in a link.
                value = i.xpath("./td/a/text()").get()
                value = value.strip() if value else None
            # NOTE(review): name_product is assumed non-None below; a
            # header-less row would raise TypeError — confirm.
            if "Brand" in name_product:
                brand = value
            if "Category" in name_product:
                category = value
            if "Size" in name_product:
                size = value
            if "Fruugo ID" in name_product:
                fruugo_id = value
            if "EAN" in name_product:
                ean = value
            if "Retailer VRN" in name_product:
                retailer_vrn = value
            if "Colour" in name_product:
                colour = value
        description = response.css(".js-product-description").xpath(
            "./text()").get()
        item = GmWorkItem()
        item["key"] = url_key
        item["good_name"] = good_name
        item["price"] = price
        item["shop_name"] = shop_name
        item["shop_url"] = shop_url
        item["brand"] = brand
        item["category"] = category
        item["size"] = size
        item["goods_id"] = fruugo_id
        item["ean"] = ean
        item["retailer_vrn"] = retailer_vrn
        item["colour"] = colour
        item["description"] = description
        yield item
    else:
        try_result = self.try_again(response, url=url_key)
        yield try_result
def parse(self, response):
    """Parse one page of AliExpress product feedback (reviews).

    Yields one item per feedback entry and, while the comments on the
    page are recent enough (current or previous month) and more pages
    remain, a request for the next page; on an unrecognised page layout
    yields a retry request instead.
    """
    youxiao = re.search("feedback-container|feedbackServer", response.text)
    meta = response.meta
    seller_id = meta.get("ownerMemberId")
    goods_id = meta.get("productId")
    page = meta.get("page") or "1"  # 1-based page number, kept as str
    if youxiao:
        # Total comment count, e.g. shown as "Feedback (123)".
        comment_num = 0
        comment_num_str = response.css(".fb-star-selector").xpath(
            "./em/text()").get()
        if comment_num_str:
            match = re.search(r"(\d+)", comment_num_str)
            if match:
                comment_num = match.group(1)
        # Star-rating distribution: one count per star level.
        rate = [li.xpath("./span[3]/text()").get()
                for li in response.css(".rate-list").xpath("./li")]
        comment_distribution = str(rate)
        month = ""  # month name of the last dated comment on the page
        for block in response.css(".feedback-list-wrap").xpath("./div"):
            user_name = block.css(".user-name").xpath("./a/text()").get()
            if not user_name:
                user_name = block.css(".user-name").xpath("./text()").get()
            country = block.css(".user-country").xpath("./b/text()").get()
            # The score is encoded in the star-view span's inline style.
            comment_score = block.css(".star-view").xpath(
                "./span/@style").get()
            buyer_feedback = block.css(".buyer-feedback")
            comment = buyer_feedback.xpath("./span[1]/text()").get()
            time = buyer_feedback.xpath("./span[2]/text()").get()
            item = GmWorkItem()
            item["seller_id"] = seller_id
            item["goods_id"] = goods_id
            item["current_page"] = page
            item["comment_num"] = comment_num
            item["comment_distribution"] = comment_distribution
            item["user_name"] = user_name
            item["country"] = country
            item["comment_score"] = comment_score
            item["comment"] = comment
            item["time"] = time
            yield item
            if time:
                # Dates look like "12 Mar 2021"; remember the month name.
                month_match = re.search(r"\d+ ([a-z]+) \d{4}", time,
                                        flags=re.I)
                if month_match:
                    month = month_match.group(1)
        page_month = self.mouth_dict.get(month.upper())
        # BUGFIX (year wrap, previously flagged in a comment): the month
        # before January is December, not month 0, so "current - 1" is
        # wrapped back into the 1..12 range.
        previous_month = self.current_mouth - 1 or 12
        # 10 comments per page: keep paging while recent comments remain.
        if (page_month in (self.current_mouth, previous_month)
                and int(page) * 10 < int(comment_num)):
            page = int(page) + 1
            request = self.request(seller_id, goods_id, str(page),
                                   str(page - 1))
            yield request
    else:
        try_result = self.try_again(response, seller_id=seller_id,
                                    goods_id=goods_id, current_page=page)
        yield try_result
def get_detail(self, response):
    # Parse one page of an AliExpress shop's product-list JSON: yield one
    # GmWorkItem per product plus a source-archive item, then schedule the
    # next page unless a stop condition was reached.
    meta = response.meta
    totle_num = meta.get("totle_num")  # total page count (None on the first call); [sic] "totle"
    page_num = meta.get("page_num")    # current 1-based page number
    shop_id = meta.get("shop_id")
    seller_id = meta.get("seller_id")
    judge = 0  # becomes 1 when pagination should stop
    try:
        json_str = json.loads(response.text)
        data = json_str.get("data")
        if not totle_num:
            # First page: derive page count from the total item count,
            # 20 items per page (ceiling division).
            totle = data.get("total")
            totle_num = int(totle / 20) + 1 if totle % 20 else int(totle / 20)
        ret = data.get("ret")  # list of product dicts on this page
        for i in ret:
            item = GmWorkItem()
            id = i.get("id")
            orders = i.get("orders")
            salePrice = i.get("salePrice")
            maxPrice = salePrice.get("maxPrice")
            minPrice = salePrice.get("minPrice")
            pcDetailUrl = i.get("pcDetailUrl")
            subject = i.get("subject")
            averageStar = i.get("averageStar")  # rating score
            feedbacks = i.get("feedbacks")  # feedback count
            mediaId = i.get("mediaId")  # media id
            image350Url = i.get("image350Url")  # image url
            tagResult = i.get("tagResult")  # tags
            item["shop_id"] = shop_id
            item["seller_id"] = seller_id
            item["totle_num"] = totle_num
            item["id"] = id
            item["orders"] = orders
            item["max_price"] = maxPrice
            item["min_price"] = minPrice
            item["goods_url"] = pcDetailUrl
            item["average_score"] = averageStar
            item["goods_name"] = subject
            item["comment_num"] = feedbacks
            item["media_id"] = mediaId
            item["img_url"] = image350Url
            item["tag"] = tagResult
            yield item
            if orders == 0:
                # NOTE(review): stops paging once a zero-order product
                # appears — presumably the listing is sorted by orders
                # descending; confirm against the request that feeds this.
                judge = 1
        # Archive the full decoded JSON for this page.
        item_s = GmWorkItem()
        item_s["shop_id"] = shop_id
        item_s["source_code"] = json_str
        yield item_s
        if page_num >= totle_num or len(ret) < 20:
            judge = 1  # last page reached (by page count or a short page)
        if judge == 0:
            page_num += 1
            url = "https://{}.aliexpress.com/{}".format(shop_id, page_num)
            meta = {
                "totle_num": totle_num,
                "page_num": page_num,
                "shop_id": shop_id,
                "seller_id": seller_id
            }
            yield scrapy.Request(url=url, callback=self.get_detail,
                                 method="GET", meta=meta, dont_filter=True)
    except Exception as e:
        # Any failure (bad JSON, missing keys, None meta values) falls
        # through to a retry of this page.
        try_result = self.try_again(response, shop_id, seller_id, page_num)
        yield try_result
def parse_goodinfo(self, response):
    """Parse an eBay item page into a flat GmWorkItem.

    Missing fields default to a single space; ASCII commas in free-text
    fields are swapped for full-width commas so values cannot break
    comma-separated downstream output.
    """

    def _clean(value):
        # Strip and replace ASCII commas with full-width ones;
        # falsy/missing values become a single space.
        return value.strip().replace(',', ',') if value else ' '

    match = re.search(r'itm.+/(\d+)', response.url)
    good_id = match.group(1) if match is not None else ' '
    html = response.body.decode()
    good_name = _clean(response.xpath('//h1[@id="itemTitle"]/text()').get())
    price_dollar = response.xpath('//span[@id="prcIsum"]/@content').get()
    # Prices drop thousands separators outright (no full-width swap).
    price_dollar = price_dollar.strip().replace(',', '') if price_dollar else ' '
    price_RMB = response.xpath('//div[@id="prcIsumConv"]/span/text()').get()
    if price_RMB is not None:
        # Text looks like "CN ¥1,234.56"; the amount is the second token.
        # BUGFIX: guard the index so a malformed string no longer raises
        # IndexError.
        parts = price_RMB.split()
        price_RMB = parts[1].strip().replace(',', '') if len(parts) > 1 else ' '
    else:
        price_RMB = ' '
    project_location = _clean(response.xpath(
        '//span[@itemprop="availableAtOrFrom"]/text()').get())
    brands = response.xpath('//span[@itemprop="name"]/text()').getall()
    # The brand is the last breadcrumb-style name node.
    brand = brands[-1].strip().replace(',', ',') if brands else ' '
    seller_name = _clean(response.xpath(
        '//span[@class="mbg-nw"]/font/font/text()|//span[@class="mbg-nw"]/text()').get())
    sales_count = response.xpath('//a[@class="vi-txt-underline"]/text()').get()
    # e.g. "42 sold" -> "42"; guarded against empty/whitespace-only text.
    count_parts = sales_count.split() if sales_count is not None else []
    sales_count = count_parts[0] if count_parts else ' '
    cats = response.xpath('//li[@class="bc-w"]//span/text()').getall()
    # Normalise the breadcrumb trail to exactly six levels (pad with
    # spaces, drop anything past the sixth) — replaces the former
    # six-branch elif ladder that duplicated the same strip/replace.
    cats = [_clean(c) for c in cats[:6]]
    cats += [' '] * (6 - len(cats))
    cat_1, cat_2, cat_3, cat_4, cat_5, cat_6 = cats
    item = GmWorkItem(good_id=good_id, good_name=good_name,
                      price_dollar=price_dollar, price_RMB=price_RMB,
                      project_location=project_location, brand=brand,
                      seller_name=seller_name, sales_count=sales_count,
                      cat_1=cat_1, cat_2=cat_2, cat_3=cat_3, cat_4=cat_4,
                      cat_5=cat_5, cat_6=cat_6, html=html)
    yield item