Example #1
 def parse2(self, response):
     # Top-level category: 食品 (food)
     # Assumes: import json, re, scrapy; from scrapy.selector import Selector;
     # from ..items import JingdongspiderItem
     typename = '食品'
     html = Selector(response)
     a = ','.join(html.xpath('/html/body/p/text()').extract())
     # print(a)
     b = str(re.findall("data: .*\}", a)).replace("['data: [",
                                                  "").replace("]}']", "")
     # print(b)
     childrens = str(
         re.findall(
             '"bgColor":"rgba\(246,246,246,1\)"\},\"childrens":(.*)\]',
             b)).replace("['", "").replace("']", "")
     dataSource = str(
         re.findall('"dataSource":(.*?),"tabDashType"',
                    childrens)).replace("['", "").replace("']", "")
     print(dataSource)
     jl = json.loads(dataSource)
     for c in jl:
         # print(c)
         children = c['children']
         for m in children:
             children2 = m['children']
             # print(children2)
             # print(len(children2))
             # print('\n')
             for n in children2:
                 if len(children2) < 7:
                     itemname = n['name']
                     itemurl = n['link']
                     print(itemname, itemurl)
                     item = JingdongspiderItem(itemname=itemname,
                                               typename=typename,
                                               itemurl=itemurl)
                     yield scrapy.Request(url=itemurl,
                                          callback=self.parse_url,
                                          headers=self.header,
                                          meta={'item': item})
                 else:
                     children3 = n['children']
                     # print(children3)
                     for p in children3:
                         # print(p)
                         itemname = p['name']
                         itemurl = p['link']
                         # print(itemname, itemurl)
                         # sub-category name and URL
                         item = JingdongspiderItem(itemname=itemname,
                                                   typename=typename,
                                                   itemurl=itemurl)
                         yield scrapy.Request(url=itemurl,
                                              callback=self.parse_url,
                                              headers=self.header,
                                              meta={'item': item})
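The category parsers above all reduce to pulling a JSON fragment out of inline
script text, and the str(re.findall(...)).replace("['", "")... idiom they use is
fragile because it round-trips a Python list through str() and breaks whenever a
match contains quotes or brackets. A minimal sketch of a safer helper
(first_match is a name I introduce, not part of the spider):

import re

def first_match(pattern, text, default=''):
    # Return the first capture group of pattern in text, or default.
    m = re.search(pattern, text, re.S)
    return m.group(1) if m else default

# e.g. dataSource = first_match(r'"dataSource":(.*?),"tabDashType"', b)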
Example #2
    def parse_product(self, response):
        # Assumes: import datetime, json, random, re, requests, scrapy;
        # from time import sleep; from ..items import JingdongspiderItem
        product_name = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]/li[1]/@title'
        ).extract_first()
        product_id = response.meta['product_id']
        print(product_name, product_id)

        item = JingdongspiderItem()
        phone_url = 'http://item.jd.com/' + product_id + '.html'
        if self.collection.count_documents({'url': phone_url}) > 0:
            # skip products already stored (Cursor.count() is deprecated in pymongo)
            return
        item['phone_name'] = product_name
        item['url'] = phone_url
        item['brand'] = response.meta['brand']

        phone_reviews = []
        post_url = 'https://club.jd.com/comment/productPageComments.action'
        data_form = {
            'callback': 'fetchJSON_comment98vv61',
            'productId': str(product_id),
            'score': 0,
            'sortType': 5,
            'pageSize': 10,
            'isShadowSku': 0,
            'page': 0
        }
        s = requests.session()
        while True:
            t = s.get(post_url, params=data_form).text
            try:
                t = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)',
                              t).group(0)
            except AttributeError:
                # re.search returned None: no JSONP wrapper, stop paging
                break
            j = json.loads(t)
            comment_list = j['comments']
            if len(comment_list) == 0:
                break
            for comment in comment_list:
                review = {
                    'user_name': comment['nickname'],
                    'comment': comment['content'],
                    'comment_time': comment['referenceTime'],
                    'score': comment['score']
                }
                phone_reviews.append(review)
                print(review)
            sleep(random.random())
            data_form['page'] += 1
        s.close()
        item['phone_reviews'] = phone_reviews
        item['source_platform'] = '京东'
        item['domain'] = 'www.jd.com'
        item['record_date'] = str(datetime.date.today())
        yield item
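The comment endpoint replies in JSONP, so the callback wrapper has to be peeled
off before json.loads; the regex above hard-codes the callback name. A hedged
sketch of a generic unwrapping helper (strip_jsonp is a name I introduce here):

import json
import re

def strip_jsonp(text):
    # Accept 'anyCallback({...});' and return the parsed payload, else None.
    m = re.search(r'^\s*[\w$.]+\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1)) if m else None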
Example #3
 def parse1(self, response):
     # Top-level category: 特产 (regional specialties)
     typename = '特产'
     html = Selector(response)
     a = ','.join(html.xpath('/html/body/p/text()').extract())
     dataSource = re.findall('"dataSource":(.*?),"datapool"', a)[0]
     js = json.loads(dataSource)
     # print(js)
     for c in js:
         children = c["children"]
         children1 = children[1:]
         for i in children1:
             children2 = i['children']
             # print(children2)
             for m in children2:
                 children3 = m['children']
                 # print(children3)
                 for n in children3:
                     itemname = n['title']
                     itemurl = n['url']
                     # print(itemname, itemurl)
                      # sub-category name and URL
                     item = JingdongspiderItem(itemname=itemname,
                                               typename=typename,
                                               itemurl=itemurl)
                     yield scrapy.Request(url=itemurl,
                                          callback=self.parse_url,
                                          headers=self.header,
                                          meta={'item': item})
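parse1, parse2, and parse3 all descend a fixed number of 'children' levels before
reading the leaf name/URL pairs. A recursive walker can express the same traversal
without hard-coding the nesting depth; a sketch (iter_leaves is my name, the leaf
key names still differ per category, e.g. title/url here vs name/link elsewhere,
and parse1's children[1:] slice would still be applied by the caller):

def iter_leaves(nodes, depth):
    # Yield the dicts `depth` levels below each node in `nodes`.
    if depth == 0:
        yield from nodes
        return
    for node in nodes:
        yield from iter_leaves(node.get('children', []), depth - 1)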
Example #4
    def parse4(self, response):
        # Top-level category: 生鲜 (fresh food); its sub-categories are scraped differently from the other three
        typename = '生鲜'
        html = Selector(response)

        script = str(html.xpath('//*[@id="J_container"]/script[1]').extract())
        # print(script)
        jl = re.findall('children:\[\{ NAME.*?o2:1\}\]', script)
        for i in jl:
            m = re.findall('\{ NAME.*?\}', i)
            for n in m:
                # print(n)
                itemname = str(re.findall("NAME:(.*?),URL", n)).replace(
                    "\\\\'", "").replace("\\\\',",
                                         "").replace('["',
                                                     '').replace('"]', '')
                itemurl = str(re.findall("URL(.*?)\\',id:", n)).replace(
                    '\\\\"]', '').replace('[":',
                                          '').replace(' ',
                                                      '').replace("\\\\'", '')
                if 'http' not in itemurl:
                    # some URLs are protocol-relative (//...), so prepend the scheme
                    itemurl = itemurl.replace('//', 'https://')
                # print(itemname, itemurl)
                item = JingdongspiderItem(itemname=itemname,
                                          typename=typename,
                                          itemurl=itemurl)
                yield scrapy.Request(url=itemurl,
                                     callback=self.parse_url,
                                     headers=self.header,
                                     meta={'item': item})
Example #5
    def parse_brand_list(self, response):
        data = json.loads(response.text)
        search_data = json.loads(data['searchData'])
        ware_list = search_data['wareList']
        # print(type(ware_list))
        for ware in ware_list['wareList']:
            print(ware['wareId'] + ' ' + ware['wname'])
            item = JingdongspiderItem()
            phone_url = 'http://item.jd.com/' + ware['wareId'] + '.html'
            if self.collection.count_documents({'url': phone_url}) > 0:
                # skip products already stored
                continue
            item['phone_name'] = ware['wname']
            item['url'] = phone_url
            item['brand'] = response.meta['brand']

            phone_reviews = []
            post_url = 'https://club.jd.com/comment/productPageComments.action'
            data_form = {
                'callback': 'fetchJSON_comment98vv61',
                'productId': str(ware['wareId']),
                'score': 0,
                'sortType': 5,
                'pageSize': 10,
                'isShadowSku': 0,
                'page': 0
            }
            s = requests.session()
            while True:
                t = s.get(post_url, params=data_form).text
                try:
                    t = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)',
                                  t).group(0)
                except AttributeError:
                    # re.search returned None: no JSONP wrapper, stop paging
                    break
                j = json.loads(t)
                comment_list = j['comments']
                if len(comment_list) == 0:
                    break
                for comment in comment_list:
                    review = {
                        'user_name': comment['nickname'],
                        'comment': comment['content'],
                        'comment_time': comment['referenceTime'],
                        'score': comment['score']
                    }
                    phone_reviews.append(review)
                    print(review)
                sleep(random.random())
                data_form['page'] += 1
            s.close()
            item['phone_reviews'] = phone_reviews
            item['source_platform'] = '京东'
            item['domain'] = 'www.jd.com'
            yield item
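parse_product and parse_brand_list both dedupe by querying MongoDB for the URL
before building an item. A unique index makes that check atomic and lets inserts
fail fast instead; a sketch, assuming pymongo and hypothetical database and
collection names:

from pymongo import ASCENDING, MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient('localhost', 27017)    # assumed connection settings
collection = client['jd']['phones']         # hypothetical db/collection names
collection.create_index([('url', ASCENDING)], unique=True)

def store(doc):
    # Insert once; a second insert with the same 'url' is rejected by the index.
    try:
        collection.insert_one(doc)
    except DuplicateKeyError:
        pass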
Example #6
    def parse_url(self, response):
        # loop a sub-category over its own pages, then follow through to individual product pages
        item = response.meta['item']
        html = Selector(response)
        sku_ids = html.xpath('//*[@id="plist"]/ul/li/div/@data-sku').extract()
        if len(sku_ids) == 0:
            sku_ids = html.xpath(
                '//*[@id="J_goodsList"]/ul/li/@data-sku').extract()
        if len(sku_ids) > 0:
            # only sub-categories that have products get paginated and followed
            for i in sku_ids:
                foodurl = 'https://item.jd.com/' + i + '.html'
                # print(foodurl)
                itemname = item['itemname']
                typename = item['typename']
                itemurl = item['itemurl']
                item = JingdongspiderItem(itemname=itemname,
                                          typename=typename,
                                          foodurl=foodurl,
                                          itemurl=itemurl)
                # without rebuilding the item here, duplicate rows appear in the database
                # print(i)
                yield scrapy.Request(url=foodurl,
                                     callback=self.parse_food,
                                     headers=self.header,
                                     meta={'item': item})
            # follow through to the individual product pages

            itemurl = str(item['itemurl'])
            # print(itemurl)
            page = str(
                html.xpath(
                    '//*[@id="J_topPage"]/span/i/text()').extract()).replace(
                        "['", "").replace("']", "")
            # number of pages in this sub-category
            # print(page)
            page = int(page)
            a = str(re.findall('http.*?#J', itemurl)).replace(
                '#J', '').replace("['", "").replace("']", "")
            if len(a) < 3:
                a = itemurl
                for i in range(1, page + 1):
                    n = str(i)
                    next_url = a + '&page=' + n
                    # print(next_url)
                    yield scrapy.Request(url=next_url,
                                         callback=self.parse_url,
                                         headers=self.header,
                                         meta={'item': item})
            # next page
            else:
                for i in range(1, page + 1):
                    n = 2 * i - 1
                    n = str(n)
                    next_url = a + '&page=' + n
                    # print(next_url)
                    yield scrapy.Request(url=next_url,
                                         callback=self.parse_url,
                                         headers=self.header,
                                         meta={'item': item})
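The two pagination branches above differ only in the page index: when itemurl
carries a '#J...' fragment the spider requests n = 2*i - 1, presumably because
those JD list pages lazy-load each visual page in two halves, so only the odd
indices return full pages; otherwise pages are numbered 1..page directly. A
sketch of the shared URL construction (page_urls is a helper I introduce, not
part of the spider):

def page_urls(base, pages, doubled=False):
    # Yield one list-page URL per visual page; doubled=True requests only
    # the odd indices used on lazy-loading list pages.
    for i in range(1, pages + 1):
        n = 2 * i - 1 if doubled else i
        yield '%s&page=%d' % (base, n)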
Example #7
 def parse3(self, response):
     # Top-level category: 酒水 (drinks)
     typename = '酒水'
     html = Selector(response)
     a = ','.join(html.xpath('/html/body/p/text()').extract())
     # print(a)
     b = str(re.findall("data: .*\}", a)).replace("['data: [",
                                                  "").replace("]}']", "")
     # print(b)
     childrens = str(re.findall('"dataSource":(.*?),"tabDashType"',
                                b)).replace("['", "").replace("']", "")
     jl = json.loads(childrens)
     for m in jl:
         children1 = m['children']
         # print(children1)
         for n in children1:
             children2 = n['children']
             # print(children2)
             # print(len(children2))
             for p in children2:
                 if len(children2) > 2:
                     pass
                     # itemname = p['name']
                     # itemurl = p['link']
                     # print(itemname, itemurl)
                 # the recommended block duplicates later categories, so skip it
                 else:
                     children3 = p['children']
                     # print(children3)
                     for q in children3:
                         # print(p)
                         itemname = q['name']
                         itemurl = q['link']
                         # print(itemname, itemurl)
                         # sub-category name and URL
                         item = JingdongspiderItem(itemname=itemname,
                                                   typename=typename,
                                                   itemurl=itemurl)
                         yield scrapy.Request(url=itemurl,
                                              callback=self.parse_url,
                                              headers=self.header,
                                              meta={'item': item})
Example #8
 def get_media_requests(self, item, info):
     # Media-pipeline hook: yield one download request per image URL.
     # Note: after the first iteration `item` is rebound to a fresh
     # JingdongspiderItem, so only the first i_url is written back onto
     # the item that entered the pipeline.
     for i_url in item['item_url']:
         item['i_url'] = i_url
         typename = item['typename']
         item = JingdongspiderItem(i_url=i_url, typename=typename)
         yield scrapy.Request(url=i_url, meta={'item': item})
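get_media_requests pairs with item_completed in Scrapy's media pipelines: the
requests yielded above are downloaded by the pipeline and their results handed
back per item. A minimal companion sketch, assuming the stock ImagesPipeline
result format (the class name and image_paths field are hypothetical):

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class JingdongImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples
        paths = [res['path'] for ok, res in results if ok]
        if not paths:
            raise DropItem('no images downloaded')
        item['image_paths'] = paths   # hypothetical item field
        return item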