def parse2(self, response):
    # Food (食品) top-level category.
    typename = '食品'
    html = Selector(response)
    a = ','.join(html.xpath('/html/body/p/text()').extract())
    b = str(re.findall(r'data: .*\}', a)).replace("['data: [", "").replace("]}']", "")
    childrens = str(re.findall(
        r'"bgColor":"rgba\(246,246,246,1\)"\},"childrens":(.*)\]',
        b)).replace("['", "").replace("']", "")
    dataSource = str(re.findall('"dataSource":(.*?),"tabDashType"',
                                childrens)).replace("['", "").replace("']", "")
    jl = json.loads(dataSource)
    for c in jl:
        children = c['children']
        for m in children:
            children2 = m['children']
            for n in children2:
                if len(children2) < 7:
                    # Leaf sub-category: name and URL sit directly on this node.
                    itemname = n['name']
                    itemurl = n['link']
                    item = JingdongspiderItem(itemname=itemname,
                                              typename=typename,
                                              itemurl=itemurl)
                    yield scrapy.Request(url=itemurl,
                                         callback=self.parse_url,
                                         headers=self.header,
                                         meta={'item': item})
                else:
                    # One more nesting level before the leaf sub-categories.
                    for p in n['children']:
                        # Name and URL of each leaf sub-category.
                        itemname = p['name']
                        itemurl = p['link']
                        item = JingdongspiderItem(itemname=itemname,
                                                  typename=typename,
                                                  itemurl=itemurl)
                        yield scrapy.Request(url=itemurl,
                                             callback=self.parse_url,
                                             headers=self.header,
                                             meta={'item': item})
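# The str(re.findall(...)).replace(...) chains above depend on Python's
# list repr and break as soon as the payload itself contains "['" or
# "']".  A more robust sketch (hypothetical helper, not part of the
# original spider) that assumes the page embeds a '"dataSource": ...'
# pair somewhere in the extracted text, and lets
# json.JSONDecoder.raw_decode parse exactly one JSON value starting at
# that offset:
@staticmethod
def extract_embedded_json(text, key):
    """Return the JSON value that follows '"<key>":' in text, or None."""
    marker = '"%s":' % key
    start = text.find(marker)
    if start == -1:
        return None
    idx = start + len(marker)
    while idx < len(text) and text[idx] in ' \t\r\n':
        idx += 1  # raw_decode rejects leading whitespace
    value, _end = json.JSONDecoder().raw_decode(text, idx)
    return value

# Usage sketch: data_source = self.extract_embedded_json(a, 'dataSource')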
def parse_product(self, response):
    product_name = response.xpath(
        '//ul[@class="parameter2 p-parameter-list"]/li[1]/@title'
    ).extract_first()
    product_id = response.meta['product_id']
    print(product_name, product_id)
    phone_url = 'http://item.jd.com/' + product_id + '.html'
    # Skip products that are already in MongoDB.
    if self.collection.find({'url': phone_url}).count() > 0:
        return
    item = JingdongspiderItem()
    item['phone_name'] = product_name
    item['url'] = phone_url
    item['brand'] = response.meta['brand']
    phone_reviews = []
    post_url = 'https://club.jd.com/comment/productPageComments.action'
    data_form = {
        'callback': 'fetchJSON_comment98vv61',
        'productId': str(product_id),
        'score': 0,
        'sortType': 5,
        'pageSize': 10,
        'isShadowSku': 0,
        'page': 0
    }
    s = requests.session()
    while True:
        t = s.get(post_url, params=data_form).text
        # The endpoint wraps its JSON in fetchJSON_comment98vv61(...);
        # strip the JSONP wrapper before parsing.
        m = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)', t)
        if m is None:
            break
        comment_list = json.loads(m.group(0))['comments']
        if len(comment_list) == 0:
            break
        for comment in comment_list:
            phone_reviews.append({
                'user_name': comment['nickname'],
                'comment': comment['content'],
                'comment_time': comment['referenceTime'],
                'score': comment['score']
            })
        sleep(random.random())  # polite delay between review pages
        data_form['page'] += 1
    s.close()
    item['phone_reviews'] = phone_reviews
    item['source_platform'] = '京东'
    item['domain'] = 'www.jd.com'
    item['record_date'] = str(datetime.date.today())
    yield item
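# parse_product reads product_id and brand from response.meta, but its
# caller is not part of this section.  A minimal sketch of a feeder
# (hypothetical method name; headers and meta keys are taken from
# parse_product above):
def request_products(self, sku_brand_pairs):
    # sku_brand_pairs: iterable of (product_id, brand) tuples.
    for product_id, brand in sku_brand_pairs:
        yield scrapy.Request(url='https://item.jd.com/' + product_id + '.html',
                             callback=self.parse_product,
                             headers=self.header,
                             meta={'product_id': product_id, 'brand': brand})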
def parse1(self, response):
    # Specialty-products (特产) top-level category.
    typename = '特产'
    html = Selector(response)
    a = ','.join(html.xpath('/html/body/p/text()').extract())
    dataSource = re.findall('"dataSource":(.*?),"datapool"', a)[0]
    js = json.loads(dataSource)
    for c in js:
        children = c['children']
        children1 = children[1:]
        for i in children1:
            for m in i['children']:
                for n in m['children']:
                    # Name and URL of each leaf sub-category.
                    itemname = n['title']
                    itemurl = n['url']
                    item = JingdongspiderItem(itemname=itemname,
                                              typename=typename,
                                              itemurl=itemurl)
                    yield scrapy.Request(url=itemurl,
                                         callback=self.parse_url,
                                         headers=self.header,
                                         meta={'item': item})
def parse4(self, response):
    # Fresh food (生鲜) top-level category; its sub-categories are
    # embedded in an inline script, so they are scraped differently
    # from the other three categories.
    typename = '生鲜'
    html = Selector(response)
    script = str(html.xpath('//*[@id="J_container"]/script[1]').extract())
    jl = re.findall(r'children:\[\{ NAME.*?o2:1\}\]', script)
    for i in jl:
        for n in re.findall(r'\{ NAME.*?\}', i):
            itemname = str(re.findall("NAME:(.*?),URL", n)).replace(
                "\\\\'", "").replace("\\\\',", "").replace('["', '').replace('"]', '')
            itemurl = str(re.findall("URL(.*?)\\',id:", n)).replace(
                '\\\\"]', '').replace('[":', '').replace(' ', '').replace("\\\\'", '')
            if 'http' not in itemurl:
                # Some URLs are protocol-relative (start with //); give
                # them a scheme.
                itemurl = itemurl.replace('//', 'https://', 1)
            item = JingdongspiderItem(itemname=itemname,
                                      typename=typename,
                                      itemurl=itemurl)
            yield scrapy.Request(url=itemurl,
                                 callback=self.parse_url,
                                 headers=self.header,
                                 meta={'item': item})
def parse_brand_list(self, response):
    data = json.loads(response.text)
    search_data = json.loads(data['searchData'])
    ware_list = search_data['wareList']
    for ware in ware_list['wareList']:
        print(ware['wareId'] + ' ' + ware['wname'])
        phone_url = 'http://item.jd.com/' + ware['wareId'] + '.html'
        # Skip products that are already in MongoDB.
        if self.collection.find({'url': phone_url}).count() > 0:
            continue
        item = JingdongspiderItem()
        item['phone_name'] = ware['wname']
        item['url'] = phone_url
        item['brand'] = response.meta['brand']
        phone_reviews = []
        post_url = 'https://club.jd.com/comment/productPageComments.action'
        data_form = {
            'callback': 'fetchJSON_comment98vv61',
            'productId': str(ware['wareId']),
            'score': 0,
            'sortType': 5,
            'pageSize': 10,
            'isShadowSku': 0,
            'page': 0
        }
        s = requests.session()
        while True:
            t = s.get(post_url, params=data_form).text
            # Strip the fetchJSON_comment98vv61(...) JSONP wrapper.
            m = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)', t)
            if m is None:
                break
            comment_list = json.loads(m.group(0))['comments']
            if len(comment_list) == 0:
                break
            for comment in comment_list:
                phone_reviews.append({
                    'user_name': comment['nickname'],
                    'comment': comment['content'],
                    'comment_time': comment['referenceTime'],
                    'score': comment['score']
                })
            sleep(random.random())  # polite delay between review pages
            data_form['page'] += 1
        s.close()
        item['phone_reviews'] = phone_reviews
        item['source_platform'] = '京东'
        item['domain'] = 'www.jd.com'
        yield item
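# parse_product and parse_brand_list repeat the same review-paging loop
# verbatim.  A sketch of factoring it out (hypothetical method name;
# assumes the spider's existing imports of requests, re, json, random
# and time.sleep, plus the same club.jd.com endpoint and JSONP wrapper
# as above):
@staticmethod
def fetch_comments(product_id):
    post_url = 'https://club.jd.com/comment/productPageComments.action'
    data_form = {
        'callback': 'fetchJSON_comment98vv61',
        'productId': str(product_id),
        'score': 0,
        'sortType': 5,
        'pageSize': 10,
        'isShadowSku': 0,
        'page': 0
    }
    reviews = []
    with requests.session() as s:
        while True:
            t = s.get(post_url, params=data_form).text
            m = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)', t)
            if m is None:  # blocked, or the response format changed
                break
            comments = json.loads(m.group(0))['comments']
            if not comments:  # past the last page of reviews
                break
            for c in comments:
                reviews.append({'user_name': c['nickname'],
                                'comment': c['content'],
                                'comment_time': c['referenceTime'],
                                'score': c['score']})
            sleep(random.random())  # polite delay between pages
            data_form['page'] += 1
    return reviews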
def parse_url(self, response):
    # Walks a sub-category: follows every listed product page, then
    # schedules the sub-category's remaining listing pages.
    item = response.meta['item']
    html = Selector(response)
    ids = html.xpath('//*[@id="plist"]/ul/li/div/@data-sku').extract()
    if len(ids) == 0:
        ids = html.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
    if len(ids) > 0:
        # Only sub-categories that list products can paginate and
        # descend to product pages.
        for i in ids:
            foodurl = 'https://item.jd.com/' + i + '.html'
            # Build a fresh item per product; reusing one item caused
            # duplicate rows in the database.
            item = JingdongspiderItem(itemname=item['itemname'],
                                      typename=item['typename'],
                                      foodurl=foodurl,
                                      itemurl=item['itemurl'])
            yield scrapy.Request(url=foodurl,
                                 callback=self.parse_food,
                                 headers=self.header,
                                 meta={'item': item})
        # Schedule the sub-category's other listing pages.
        itemurl = str(item['itemurl'])
        page = str(html.xpath('//*[@id="J_topPage"]/span/i/text()')
                   .extract()).replace("['", "").replace("']", "")
        page = int(page)  # total number of listing pages
        a = str(re.findall(r'http.*?#J', itemurl)).replace(
            '#J', '').replace("['", "").replace("']", "")
        if len(a) < 3:
            # No '#J' fragment: pages are numbered 1, 2, 3, ...
            a = itemurl
            for i in range(1, page + 1):
                next_url = a + '&page=' + str(i)
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_url,
                                     headers=self.header,
                                     meta={'item': item})
        else:
            # list.jd.com URLs number their pages 1, 3, 5, ...
            for i in range(1, page + 1):
                next_url = a + '&page=' + str(2 * i - 1)
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_url,
                                     headers=self.header,
                                     meta={'item': item})
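# parse_url mixes two pagination schemes: plain channel URLs take a
# sequential page parameter, while list.jd.com URLs (the ones carrying
# a '#J...' fragment) number their pages 1, 3, 5, ...  The same logic
# as a standalone sketch (hypothetical helper name):
@staticmethod
def iter_page_urls(itemurl, pages):
    base, sep, _fragment = itemurl.partition('#J')
    step_odd = bool(sep)  # a '#J' fragment means odd page numbers
    for i in range(1, pages + 1):
        n = 2 * i - 1 if step_odd else i
        yield base + '&page=' + str(n)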
def parse3(self, response):
    # Drinks (酒水) top-level category.
    typename = '酒水'
    html = Selector(response)
    a = ','.join(html.xpath('/html/body/p/text()').extract())
    b = str(re.findall(r'data: .*\}', a)).replace("['data: [", "").replace("]}']", "")
    childrens = str(re.findall('"dataSource":(.*?),"tabDashType"',
                               b)).replace("['", "").replace("']", "")
    jl = json.loads(childrens)
    for m in jl:
        for n in m['children']:
            children2 = n['children']
            if len(children2) > 2:
                # The recommendation block duplicates categories that
                # appear again below, so skip it.
                continue
            for p in children2:
                for q in p['children']:
                    # Name and URL of each leaf sub-category.
                    itemname = q['name']
                    itemurl = q['link']
                    item = JingdongspiderItem(itemname=itemname,
                                              typename=typename,
                                              itemurl=itemurl)
                    yield scrapy.Request(url=itemurl,
                                         callback=self.parse_url,
                                         headers=self.header,
                                         meta={'item': item})
def get_media_requests(self, item, info):
    # Issue one download request per image URL; carry a fresh item in
    # meta so results can be matched back without clobbering the item
    # being iterated.
    for i_url in item['item_url']:
        img_item = JingdongspiderItem(i_url=i_url,
                                      typename=item['typename'])
        yield scrapy.Request(url=i_url, meta={'item': img_item})
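# get_media_requests is a hook of scrapy.pipelines.images.ImagesPipeline
# and only runs inside a pipeline class enabled in settings.py.  A
# minimal sketch of the assumed surrounding wiring (the class name and
# setting values are hypothetical; the ImagesPipeline API is real):
#
#     from scrapy.pipelines.images import ImagesPipeline
#
#     class JingdongImagesPipeline(ImagesPipeline):
#         def get_media_requests(self, item, info):
#             ...  # the method above
#
#     # settings.py
#     ITEM_PIPELINES = {'JingdongSpider.pipelines.JingdongImagesPipeline': 300}
#     IMAGES_STORE = './images'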