def poor_comment(self, response):
    """Scrape 'poor' review entries from the current review page.

    Yields one BugsItem per comment whose type tag equals
    ``self.goods_type[0]`` — or whose tag cannot be read at all (the
    original code yielded from its except branch unconditionally) —
    then schedules the next review page while ``self.PoorCount <= 2``.
    """
    items = []
    # The comments container holds some non-comment <div>s before the real
    # comments. NOTE(review): the "- 4" offset is layout-derived — confirm
    # against the live page markup.
    comment_num = len(
        response.xpath('//*[@id="comments-list"]/div/@id').extract()) - 4
    for i in range(comment_num):
        try:
            tag = response.xpath(
                '//*[@id="comment-' + str(i) + '"]/div/div[2]/div[2]/div/dl/dd/text()'
            ).extract_first().replace("\r\n", '')
            if tag != self.goods_type[0]:
                continue  # a different comment type: skip this entry
        except Exception:
            # Missing/odd markup for the type tag: keep the comment anyway,
            # matching the original behavior of yielding from `except`.
            pass
        item = self._build_poor_item(response, i)
        items.append(item)
        yield item
    self.PoorCount += 1
    url = 'http://club.jd.com/review/{}-1-{}-1.html'.format(
        str(self.gid), str(self.PoorCount))
    time.sleep(5)  # crude politeness delay; consider DOWNLOAD_DELAY instead
    if self.PoorCount <= 2:
        yield Request(url, callback=self.poor_comment, dont_filter=True)

def _build_poor_item(self, response, i):
    """Assemble one poor-review BugsItem for comment index *i* on *response*."""
    base = '//*[@id="comment-' + str(i) + '"]'
    item = BugsItem()
    item['key'] = 'c'
    item['goods_id'] = str(self.gid)
    item['goods_name'] = self.data['goods_name']
    comment_id = response.xpath(
        base + '/div/div[1]/div[2]/text()').extract_first()
    # Guard: the original called .replace() on None when the node was missing.
    item['comment_id'] = comment_id.replace("\r\n", '') if comment_id else comment_id
    item['comment_index'] = str(self.PoorCount)
    item['poor_content'] = response.xpath(
        base + '/div/div[2]/div[2]/dl/dd/text()').extract_first()
    raw_time = response.xpath(
        base + '/div/div[2]/div[1]/span[2]/a/text()').extract_first()
    raw_time = raw_time.replace("\r\n", '') if raw_time else ''
    # YYYY-MM-DD with validated month/day pairs (February capped at 28, so
    # leap days are rejected — presumably acceptable for this source).
    date_re = re.compile(
        '([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))')
    # Guard: the original called .group() on None when the date didn't match.
    match = date_re.match(raw_time)
    item['comment_time'] = match.group() if match else None
    return item
def __init__(self, **kwargs):
    """Initialise the spider; ``kwargs['url']`` becomes the sole start URL."""
    super(JDSpoder, self).__init__(**kwargs)
    start = kwargs.get('url')
    self.start_urls = [start]
    self.url_num = 0
    self.gid = '0'
    self.data = BugsItem()
def price(self, response):
    """Merge the fetched price into the accumulated product data and yield
    one complete product record.

    The price endpoint returns a one-element JSON array whose first object
    carries the price under key ``'p'``.
    """
    datas = json.loads(response.body)
    datas = datas[0]
    self.data['price'] = datas['p']
    item = BugsItem()
    # Copy the accumulated product fields verbatim onto a fresh item.
    for field in ('key', 'goods_id', 'shop_name', 'goods_name',
                  'CommentCount', 'GoodCount', 'GoodRate', 'GeneralCount',
                  'GeneralRate', 'PoorCount', 'PoorRate', 'DefaultGoodCount',
                  'price', 'goods_brands', 'goods_effect', 'goods_local'):
        item[field] = self.data[field]
    # Local-time timestamp with a hard-coded +0800 offset — assumes the
    # host runs in China Standard Time; TODO confirm.
    item['data_time'] = datetime.now().strftime(
        "%Y-%m-%dT%H:%M:%S.000+0800")
    yield item
    # NOTE(review): this URL is built but never requested — a
    # `yield Request(url, callback=...)` appears to be missing here.
    url = 'http://club.jd.com/review/{}-1-1-0.html'.format(self.gid)
def __init__(self, **kwargs):
    """Initialise the spider; ``kwargs['url']`` becomes the sole start URL.

    ``goods_effect`` and ``goods_local`` start as ``None`` until scraped.
    """
    super(JDSpoder, self).__init__(**kwargs)
    start = kwargs.get('url')
    self.start_urls = [start]
    self.url_num = 0
    self.gid = '0'
    self.data = BugsItem()
    self.data['goods_effect'] = None
    self.data['goods_local'] = None
def get_next(self, response):
    """Yield an id-only item for every SKU on the lazy-loaded result page,
    then queue the next search page while pages remain.
    """
    skus = response.xpath('//li/@data-sku').extract()
    for sku in skus:
        # Fresh item per SKU: the original reused one mutable BugsItem for
        # every yield, handing downstream consumers the same object
        # repeatedly. (`sku` also avoids shadowing the `id` builtin.)
        item = BugsItem()
        item['key'] = 'i'
        item['goods_id'] = sku
        yield item
    if self.index < self.index_num:
        self.index += 1
        url = self.url.format(page=self.index)
        yield Request(url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Yield an id-only item per product on the search page, then request
    the companion lazy-load endpoint (``s_new.php``) for the same SKUs.
    """
    goos_id_topirty = []
    ids = response.xpath(
        '//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
    for sku in ids:
        # Fresh item per SKU: the original reused one mutable BugsItem for
        # every yield, handing downstream consumers the same object.
        item = BugsItem()
        item['key'] = 'i'
        item['goods_id'] = sku
        goos_id_topirty.append(sku)
        yield item
    self.index += 1
    url = 'https://search.jd.com/s_new.php?keyword=%E5%90%8C%E4%BB%81%E5%A0%82&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&cod=1&psort=3&ev=exbrand_%E5%90%8C%E4%BB%81%E5%A0%82%EF%BC%88TRT%EF%BC%89%5E&page={page}&s=91&scrolling=y&log_id=1517755388.58478&tpl=1_M&show_items='.format(
        page=self.index)
    # Same trailing-comma format as the original per-id concatenation.
    url += ''.join(str(sku) + ',' for sku in goos_id_topirty)
    yield Request(url, callback=self.get_next, dont_filter=True,
                  headers=self.headers)
def parse(self, response):
    """Yield an id-only item per product on the search page, then request
    the companion lazy-load endpoint (``s_new.php``) for the same SKUs.
    """
    goos_id_topirty = []
    ids = response.xpath(
        '//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
    for sku in ids:
        # Fresh item per SKU: the original reused one mutable BugsItem for
        # every yield, handing downstream consumers the same object.
        item = BugsItem()
        item['key'] = 'i'
        item['goods_id'] = sku
        goos_id_topirty.append(sku)
        yield item
    self.index += 1
    url = 'https://search.jd.com/s_new.php?keyword=%E7%B2%BE%E6%B2%B9&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.rem.0.V09&wq=%E7%B2%BE%E6%B2%B9&psort=3&stock=1&page={page}&s=31&scrolling=y&log_id=1524571725.40460&tpl=1_M&show_items='.format(
        page=self.index)
    # Same trailing-comma format as the original per-id concatenation.
    url += ''.join(str(sku) + ',' for sku in goos_id_topirty)
    yield Request(url, callback=self.get_next, dont_filter=True,
                  headers=self.headers)