Ejemplo n.º 1
0
    def poor_comment(self, response):
        """Yield one item per usable comment on a review page, then paginate.

        The number of comments is taken as the count of ``@id`` attributes
        directly under ``#comments-list`` minus 4 (NOTE(review): the offset
        presumably discounts non-comment ``div`` elements -- confirm against
        the page markup).  For each index ``i`` the ``#comment-<i>`` element
        is inspected; a comment is emitted when its goods-type text equals
        ``self.goods_type[0]`` or when that type cannot be determined.

        Comments whose id or date cannot be extracted are skipped with a
        warning instead of raising, so that one malformed comment no longer
        aborts the generator before the next-page request is issued.

        After the page is processed ``self.PoorCount`` is incremented and,
        while it is still <= 2, a request for the next review page is
        yielded with this method as its callback.

        :param response: response object exposing ``xpath()``.
        :returns: generator of ``BugsItem`` objects, optionally followed by
            a single ``Request``.
        """
        comment_num = len(
            response.xpath('//*[@id="comments-list"]/div/@id').extract()) - 4
        for i in range(comment_num):
            base = '//*[@id="comment-' + str(i) + '"]'
            if not self._matches_goods_type(response, base):
                continue
            item = self._poor_comment_item(response, base)
            if item is not None:
                yield item

        self.PoorCount += 1
        if self.PoorCount <= 2:
            url = 'http://club.jd.com/review/{}-1-{}-1.html'.format(
                str(self.gid), str(self.PoorCount))
            # Throttle only when another page is actually requested.
            # NOTE(review): time.sleep blocks the calling thread; if this
            # runs inside Scrapy, a download delay setting is the usual
            # alternative -- confirm before changing.
            time.sleep(5)
            yield Request(url, callback=self.poor_comment, dont_filter=True)

    def _matches_goods_type(self, response, base):
        """Return True when the comment at *base* should be emitted.

        The comment is wanted when its goods-type text (line breaks removed)
        equals ``self.goods_type[0]``.  When the text is absent or
        ``self.goods_type[0]`` cannot be read, True is returned as well,
        which keeps the previous behaviour of emitting such comments.
        """
        type_text = response.xpath(
            base + '/div/div[2]/div[2]/div/dl/dd/text()').extract_first()
        try:
            return type_text.replace("\r\n", '') == self.goods_type[0]
        except (AttributeError, LookupError, TypeError):
            return True

    def _poor_comment_item(self, response, base):
        """Build the ``BugsItem`` for the comment at *base*.

        Returns None (after logging a warning) when the comment id or the
        date link text is missing, or when the date text does not start
        with a ``YYYY-MM-DD`` date accepted by the pattern below.
        """
        import logging  # local import: only needed on the skip path

        # NOTE: the February branch only accepts days 01-28, so a comment
        # dated 02-29 does not match and is skipped.
        date_pattern = (
            '([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-'
            '(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|'
            '((0[469]|11)-(0[1-9]|[12][0-9]|30))|'
            '(02-(0[1-9]|[1][0-9]|2[0-8])))')

        comment_id = response.xpath(
            base + '/div/div[1]/div[2]/text()').extract_first()
        date_text = response.xpath(
            base + '/div/div[2]/div[1]/span[2]/a/text()').extract_first()
        if comment_id is None or date_text is None:
            logging.getLogger(__name__).warning(
                'Skipping comment %s: id or date element not found', base)
            return None

        date_match = re.match(date_pattern, date_text.replace("\r\n", ''))
        if date_match is None:
            logging.getLogger(__name__).warning(
                'Skipping comment %s: unrecognised date %r', base, date_text)
            return None

        item = BugsItem()
        item['key'] = 'c'
        item['goods_id'] = str(self.gid)
        item['goods_name'] = self.data['goods_name']
        item['comment_id'] = comment_id.replace("\r\n", '')
        # PoorCount still holds the current page number here; it is only
        # incremented after every comment on the page has been handled.
        item['comment_index'] = str(self.PoorCount)
        item['poor_content'] = response.xpath(
            base + '/div/div[2]/div[2]/dl/dd/text()').extract_first()
        item['comment_time'] = date_match.group()
        return item
Ejemplo n.º 2
0
 def __init__(self, **kwargs):
     """Initialise the spider's start URL and per-crawl state.

     All keyword arguments are forwarded unchanged to the parent
     initialiser.  The ``url`` keyword, when given, becomes the single
     entry of ``start_urls``; when absent the entry is ``None``.
     """
     super(JDSpoder, self).__init__(**kwargs)
     start_url = kwargs.get('url')
     # Per-crawl state: goods id placeholder, shared item data, URL counter.
     self.gid = '0'
     self.data = BugsItem()
     self.url_num = 0
     self.start_urls = [start_url]
Ejemplo n.º 3
0
 def price(self, response):
     """Record the price from a JSON response and yield the goods item.

     The response body is parsed as JSON; the ``'p'`` entry of its first
     element is stored in ``self.data['price']``.  A new ``BugsItem`` is
     then filled from ``self.data`` plus a ``data_time`` stamp and yielded.
     """
     payload = json.loads(response.body)
     self.data['price'] = payload[0]['p']

     copied_fields = (
         'key', 'goods_id', 'shop_name', 'goods_name',
         'CommentCount', 'GoodCount', 'GoodRate',
         'GeneralCount', 'GeneralRate', 'PoorCount', 'PoorRate',
         'DefaultGoodCount', 'price',
         'goods_brands', 'goods_effect', 'goods_local',
     )
     item = BugsItem()
     for field in copied_fields:
         item[field] = self.data[field]
     # NOTE(review): the "+0800" suffix is a literal while now() is the
     # host's local time -- correct only if the host runs at UTC+8; confirm.
     stamp_format = "%Y-%m-%dT%H:%M:%S.000+0800"
     item['data_time'] = datetime.now().strftime(stamp_format)
     yield item
     # NOTE(review): this URL is built but not used in the code shown here;
     # presumably a follow-up review request was intended -- confirm.
     url = 'http://club.jd.com/review/{}-1-1-0.html'.format(self.gid)
Ejemplo n.º 4
0
 def __init__(self, **kwargs):
     """Initialise the spider's start URL and per-crawl state.

     All keyword arguments are forwarded unchanged to the parent
     initialiser.  The ``url`` keyword, when given, becomes the single
     entry of ``start_urls``; when absent the entry is ``None``.  The
     shared item data starts with ``goods_effect`` and ``goods_local``
     set to ``None``.
     """
     super(JDSpoder, self).__init__(**kwargs)
     self.gid = '0'
     self.url_num = 0
     self.start_urls = [kwargs.get('url')]
     shared = BugsItem()
     # Pre-set these two fields so they exist even if never filled later.
     for field in ('goods_effect', 'goods_local'):
         shared[field] = None
     self.data = shared
Ejemplo n.º 5
0
 def get_next(self, response):
     """Yield one item per ``data-sku`` found, then request the next page.

     Each ``data-sku`` attribute of an ``li`` element in the response
     produces its own ``BugsItem`` with ``key`` set to ``'i'`` and
     ``goods_id`` set to the sku.  While ``self.index`` is below
     ``self.index_num`` the index is incremented and a request for
     ``self.url`` formatted with that page is yielded, with ``self.parse``
     as callback.

     :param response: response object exposing ``xpath()``.
     :returns: generator of ``BugsItem`` objects, optionally followed by
         a single ``Request``.
     """
     for sku in response.xpath('//li/@data-sku').extract():
         # A fresh item per sku: re-yielding one shared, mutated instance
         # would make every previously yielded item change along with it.
         item = BugsItem()
         item['key'] = 'i'
         item['goods_id'] = sku
         yield item
     if self.index < self.index_num:
         self.index += 1
         url = self.url.format(page=self.index)
         yield Request(url, callback=self.parse, dont_filter=True)
Ejemplo n.º 6
0
 def parse(self, response):
     """Yield one item per listed sku, then request the follow-up page.

     Every ``data-sku`` attribute under ``#J_goodsList`` produces its own
     ``BugsItem`` with ``key`` set to ``'i'`` and ``goods_id`` set to the
     sku.  Afterwards ``self.index`` is incremented and a request is
     yielded for the search URL below, formatted with the new page number
     and with each sku appended to ``show_items=`` followed by a comma.
     ``self.get_next`` is the callback.

     :param response: response object exposing ``xpath()``.
     :returns: generator of ``BugsItem`` objects followed by one
         ``Request``.
     """
     skus = response.xpath(
         '//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
     for sku in skus:
         # A fresh item per sku: re-yielding one shared, mutated instance
         # would make every previously yielded item change along with it.
         item = BugsItem()
         item['key'] = 'i'
         item['goods_id'] = sku
         yield item
     self.index += 1
     url = 'https://search.jd.com/s_new.php?keyword=%E5%90%8C%E4%BB%81%E5%A0%82&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&cod=1&psort=3&ev=exbrand_%E5%90%8C%E4%BB%81%E5%A0%82%EF%BC%88TRT%EF%BC%89%5E&page={page}&s=91&scrolling=y&log_id=1517755388.58478&tpl=1_M&show_items='.format(
         page=self.index)
     # Each sku keeps its trailing comma, including the last one.
     url += ''.join(str(sku) + ',' for sku in skus)
     yield Request(url,
                   callback=self.get_next,
                   dont_filter=True,
                   headers=self.headers)
Ejemplo n.º 7
0
 def parse(self, response):
     """Yield one item per listed sku, then request the follow-up page.

     Every ``data-sku`` attribute under ``#J_goodsList`` produces its own
     ``BugsItem`` with ``key`` set to ``'i'`` and ``goods_id`` set to the
     sku.  Afterwards ``self.index`` is incremented and a request is
     yielded for the search URL below, formatted with the new page number
     and with each sku appended to ``show_items=`` followed by a comma.
     ``self.get_next`` is the callback.

     :param response: response object exposing ``xpath()``.
     :returns: generator of ``BugsItem`` objects followed by one
         ``Request``.
     """
     skus = response.xpath(
         '//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
     for sku in skus:
         # A fresh item per sku: re-yielding one shared, mutated instance
         # would make every previously yielded item change along with it.
         item = BugsItem()
         item['key'] = 'i'
         item['goods_id'] = sku
         yield item
     self.index += 1
     url = 'https://search.jd.com/s_new.php?keyword=%E7%B2%BE%E6%B2%B9&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.rem.0.V09&wq=%E7%B2%BE%E6%B2%B9&psort=3&stock=1&page={page}&s=31&scrolling=y&log_id=1524571725.40460&tpl=1_M&show_items='.format(
         page=self.index)
     # Each sku keeps its trailing comma, including the last one.
     url += ''.join(str(sku) + ',' for sku in skus)
     yield Request(url,
                   callback=self.get_next,
                   dont_filter=True,
                   headers=self.headers)