Example #1
 def parse(self, response):
     print('1,=====================',response.url)
     # print(response.text)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     bid = response.url.replace('http://www.shuqi.com/cover.php?bid=','')
     print('bid:',bid)
     timestamp = str(int(time.time()))[0:10]
     print('timestamp:',timestamp)
     pageKey = "f2850e634f85f485d719314ae3cfe252"
     s = bid + timestamp + pageKey
     sign = hashlib.md5(s.encode(encoding='UTF-8')).hexdigest()
     print('sign:',sign)
     formdata = {
         'bid': bid,
         'timestamp': timestamp,
         'sign': sign,
     }
     link = 'https://ognv1.sqreader.com/index.php?r=pcapi/pcbook/bookinfo'
     yield scrapy.FormRequest(
         url=link,
         formdata=formdata,
         callback=self.parse_page_p,
         meta={'bid': bid,'item': item},
         dont_filter=True,
     )
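A minimal standalone sketch of the signing scheme above: sign is the MD5 hex digest of bid + 10-digit unix timestamp + pageKey (the bid value below is hypothetical):

import hashlib
import time

bid = '12345'  # hypothetical book id
timestamp = str(int(time.time()))[0:10]  # 10-digit unix timestamp
pageKey = "f2850e634f85f485d719314ae3cfe252"
sign = hashlib.md5((bid + timestamp + pageKey).encode('utf-8')).hexdigest()
print('sign:', sign)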
Example #2
 def parse(self, response):
     print('1,==========================', response.url)
     if response.status != 200:
         yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)
     else:
         print('request succeeded >>>')
         item = TCommentsPubItem()
         src_url = response.url
         item["src_url"] = src_url
         print('src_url:', src_url)
         product_number = ' '.join(response.xpath('//div[@class=" fr worksLR"]/h4/text()').extract()).strip()
         print('product_number:', product_number)
         product_number = get_product_number(product_number)
         print('product_number:', product_number)
         item["product_number"] = product_number
         plat_number = 'P35'
         print('plat_number:', plat_number)
         item["plat_number"] = plat_number
         bookId = ''.join(re.findall(r'bookid\=(\d+)', src_url, re.I|re.M))
         print('bookId:',bookId)
         formdata = {
             'type': '1',
             'bookid': bookId,
             'page': '1',
         }
         link = 'http://www.xiang5.com/comment.php?a=allchangping'
         yield scrapy.FormRequest(url=link, formdata=formdata, callback=self.parse_page_max, meta={'item': item, 'bookId': bookId}, dont_filter=True)
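Two caveats about the retry above: Scrapy only delivers non-200 responses to the callback if the spider opts in (e.g. via handle_httpstatus_list or the HTTPERROR_ALLOWED_CODES setting), and re-yielding response.url unconditionally loops forever on a persistent error. A hedged variant with a retry cap, using a hypothetical retry_times meta key:

def parse(self, response):
    if response.status != 200:
        retries = response.meta.get('retry_times', 0)
        if retries < 3:  # give up after three attempts
            yield response.request.replace(
                meta={**response.meta, 'retry_times': retries + 1},
                dont_filter=True,
            )
        return
    # ... parse the 200 response as above ...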
Example #3
 def parse(self, response):
     print('1,========================', response.url)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath('//h3/a/text()').extract()).strip()
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P17'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     bid = ''.join(
         response.xpath(
             '//*[@id="addToBookshelfBtn"]/@bid').extract()).strip()
     print('bid:', bid)
     formdata = {
         'bid': bid,
         'pageIndex': '1',
     }
     link = 'http://book.qq.com/intro/getComment.html'
     yield scrapy.FormRequest(
         url=link,
         formdata=formdata,
         callback=self.parse_page_p,
         meta={
             'item': item,
             'bid': bid
         },
         dont_filter=True,
     )
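parse_page_p itself is not part of the snippet; the recurring pattern in these spiders is that the half-filled item rides along in request.meta and is completed in the next callback. A hedged sketch of that hand-off (the remaining field names are assumptions):

def parse_page_p(self, response):
    item = response.meta['item']  # the partially-filled TCommentsPubItem
    bid = response.meta['bid']
    # ... fill the remaining comment fields from the response, then:
    yield item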
Example #4
 def parse(self, response):
     print('1,==========================', response.url)
     # print(response.text)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath('//h3[@title]/em/text()').extract()).strip()
     print('product_number:', product_number)
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P34'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     bookid = ''.join(re.findall(r'source\/(.*)', src_url, re.I | re.M))
     link = 'http://yuedu.163.com/snsComment.do?operation=get&type=2&id={}&page=1'.format(
         bookid)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_max,
                          meta={
                              'item': item,
                              'bookid': bookid
                          },
                          dont_filter=True)
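parse_page_max is not shown either; presumably it reads the total page count from the JSON and fans out one request per page. A hedged sketch under that assumption (the pageCount field and the parse_page callback are guesses, not from the original):

import json

def parse_page_max(self, response):
    item = response.meta['item']
    bookid = response.meta['bookid']
    data = json.loads(response.text)
    max_page = int(data.get('pageCount', 1))  # assumed field name
    for page in range(1, max_page + 1):
        link = ('http://yuedu.163.com/snsComment.do'
                '?operation=get&type=2&id={}&page={}').format(bookid, page)
        yield scrapy.Request(url=link, callback=self.parse_page,
                             meta={'item': item}, dont_filter=True)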
Example #5
 def parse(self, response):
     print('1,========================', response.url)
     # print(response.text)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath(
             '//img[@class="qqredaer_tit"]/@title').extract()).strip()
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P17'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     # The tab whose text contains '评论' ("comments") links to the comment page.
     link = ''.join(
         response.xpath(
             '//div[@class="tablist"]/ul/li/a[contains(text(),"评论")]/@href'
         ).extract()).strip()
     print('link:', link)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_link,
                          meta={'item': item},
                          dont_filter=True)
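The @href scraped from the tab list may be site-relative; a defensive one-liner absolutizes it before the request:

link = response.urljoin(link)  # no-op if the href is already absolute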
Example #6
 def parse(self, response):
     print('1,=======================',response.url)
     url = response.url
     item = TCommentsPubItem()
     src_url = url
     item["src_url"] = src_url
     print('src_url:', src_url)
     bookId = ''.join(re.findall(r'book\/(\d+)\/',url, re.I|re.M))
     print('bookId:',bookId)
     link = 'http://www.hongshu.com/comment/{}/index.do'.format(bookId)
     yield scrapy.Request(url=link, callback=self.parse_page_link, meta={'item':item, 'bookId': bookId}, dont_filter=True)
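A quick worked check of the bookId regex above against a hypothetical hongshu URL:

import re

url = 'http://www.hongshu.com/book/12345/'
assert ''.join(re.findall(r'book\/(\d+)\/', url, re.I | re.M)) == '12345'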
Example #7
 def parse_page(self, response):
     print('1,=====================', response.url)
     # print(response.text)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     bookId = src_url.replace('https://book.qidian.com/info/', '')
     print('bookId:', bookId)
     link = 'https://book.qidian.com/ajax/book/GetBookForum?_csrfToken=HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j&authorId=0&bookId={}&chanId=0&pageSize=15'.format(
         bookId)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_link,
                          meta={'item': item},
                          dont_filter=True)
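The _csrfToken above is hardcoded and will eventually expire. A hedged helper that pulls it from the Set-Cookie headers of an earlier response instead (the cookie name is assumed to match the query parameter):

import re

def get_csrf_token(response):
    for cookie in response.headers.getlist('Set-Cookie'):
        m = re.search(rb'_csrfToken=([^;]+)', cookie)
        if m:
            return m.group(1).decode()
    return ''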
Example #8
    def parse(self, response):
        print('1,=====================', response.url)
        item = TCommentsPubItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        novelid = src_url.replace('http://www.jjwxc.net/onebook.php?novelid=',
                                  '')
        print('novelid:', novelid)
        product_number = ''.join(
            response.xpath(
                '//h1[@itemprop="name"]/span/text()').extract()).strip()
        print('product_number:', product_number)
        # Normalize fullwidth CJK brackets before cleaning the title.
        if '【' in product_number and '】' in product_number:
            product_number = product_number.replace('【', '[').replace('】', ']')
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P16'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        j_url = 'http://s8-static.jjwxc.net/comment_json.php?novelid={}'.format(
            novelid)
        # print('j_url:',j_url)
        yield scrapy.Request(url=j_url,
                             callback=self.parse_page,
                             meta={'item': item},
                             dont_filter=True)

        link = 'http://www.jjwxc.net/comment.php?novelid={}&huati=1&page=1'.format(
            novelid)
        print('link:', link)
        yield scrapy.Request(url=link,
                             callback=self.parse_page_url,
                             meta={'item': item},
                             dont_filter=True)
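Both requests above carry the same item instance in meta, so the two callbacks mutate one shared object. A defensive variant (inside parse) hands each branch its own copy:

import copy

yield scrapy.Request(url=j_url,
                     callback=self.parse_page,
                     meta={'item': copy.deepcopy(item)},
                     dont_filter=True)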
Example #9
 def parse(self, response):
     print('1,=========================',response.url)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     # print('src_url:', src_url)
     product_number = ''.join(re.findall(r'av\d+',src_url))
     # print('product_number:',product_number)
     item["product_number"] = product_number
     html = response.text
     # print(html)
     cid = ''.join(re.findall(r'cid=(\d+)&amp',html))
     # print('cid:',cid)
     link = 'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(cid)
     # print('link:',link)
     yield scrapy.Request(url=link, callback=self.parse_page, meta={'item': item}, dont_filter=True)
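The list.so endpoint returns danmaku XML in which each <d> element is one bullet comment. A hedged sketch of the parse_page side, with the structure assumed from the public format:

def parse_page(self, response):
    item = response.meta['item']
    for text in response.xpath('//d/text()').getall():
        row = dict(item)  # copy per comment so rows don't overwrite each other
        row['comments'] = text
        yield row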
Example #10
 def parse(self, response):
     print('1,========================', response.url)
     item = TCommentsPubItem()
     url = response.url
     src_url = url
     item["src_url"] = src_url
     # print('src_url:', src_url)
     html = response.text
     # print(html)
     bookId = url.replace('https://www.hongxiu.com/book/', '')
     print('bookId:', bookId)
     link = 'https://www.hongxiu.com/ajax/comment/pageList?_csrfToken=9bx7F3sUwWZoBWjQrEImCBarm6KMDNYhoG8EtVSc&pageNum=1&pageSize=10&bookId={}'.format(
         bookId)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_link,
                          meta={
                              'item': item,
                              'bookId': bookId
                          },
                          dont_filter=True)
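If the cover URL carries a trailing slash or query string, the .replace above leaves it in bookId; a hedged cleanup extracts just the digits instead:

import re

bookId = ''.join(re.findall(r'/book/(\d+)', url))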
Example #11
 def parse(self, response):
     print('1,=======================', response.url)
     # print(response.text)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     product_number = ''.join(
         response.xpath('//h2/a/text()').extract()).strip()
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P18'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     link = ''.join(
         response.xpath(
             '//p[@class="loadMore"]/a/@href').extract()).strip()
     # print('url:',url)
     yield scrapy.Request(url=link,
                          callback=self.parse_page,
                          meta={'item': item},
                          dont_filter=True)
Example #12
 def parse(self, response):
     print('1,===============', response.url)
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath(
             '//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()').
         extract()).strip()
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P22'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     link = ''.join(
         response.xpath('//div[@class="MORE"]/a[@id="comment_more2"]/@href'
                        ).extract()).strip()
     # print('link:',link)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_link,
                          meta={'item': item},
                          dont_filter=True)
Example #13
 def parse(self, response):
     print('1,===============',response.url)
     html = response.text
     item = TCommentsPubItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(re.findall(r'bookName="(.*)"', html)).strip().replace('"', '')
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P21'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     long_comment = 0
     item["long_comment"] = long_comment
     last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     item["last_modify_date"] = last_modify_date
     print('last_modify_date:', last_modify_date)
     bookId = ''.join(re.findall(r'bookId="(\d+)"', html))
     # print('bookId:',bookId)
     url = 'http://book.zongheng.com/api/book/comment/getThreadL1st2.htm'
     formdata = {
         'bookId': bookId,
         'pagebar': '1',
         'pageNum': '1',
         'pageSize': '30',
     }
     # FormRequest is how Scrapy sends a POST request
     yield scrapy.FormRequest(
         url=url,
         formdata=formdata,
         callback=self.parse_page_p,
         meta={'item': item, 'bookId': bookId},
         dont_filter=True,
     )
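For reference, the same POST written as a plain scrapy.Request with an explicit form-encoded body, which is what FormRequest builds under the hood:

from urllib.parse import urlencode

yield scrapy.Request(
    url=url,
    method='POST',
    body=urlencode(formdata),
    headers={'Content-Type': 'application/x-www-form-urlencoded'},
    callback=self.parse_page_p,
    meta={'item': item, 'bookId': bookId},
    dont_filter=True,
)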
Example #14
    def parse_page(self, response):
        print('2,==================', response.url)
        # print(response.text)
        item = TCommentsPubItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        uids = response.xpath(
            '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id]/@data-feed-id'
        ).extract()
        # print('uids:',uids)
        for uid in uids:
            print('uid:', uid)
            # The page title ends with '的评论' ("comments on ..."); strip it.
            product_number = ''.join(
                response.xpath('//h1[@class="page-title"]/text() | //h2/text()'
                               ).extract()).strip().replace('的评论', '')
            product_number = get_product_number(product_number)
            print('product_number:', product_number)
            item["product_number"] = product_number
            plat_number = 'P31'
            print('plat_number:', plat_number)
            item["plat_number"] = plat_number
            nick_name = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="left"]/a[@class="name"]/text()'
                    .format(uid)).extract()).strip()
            item["nick_name"] = nick_name
            print('nick_name:', nick_name)
            cmt_date = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/div[@class="controls"]/span[@class="time"]/text()'
                    .format(uid)).extract()).strip()
            item["cmt_date"] = cmt_date
            print('cmt_date:', cmt_date)
            # The same <span class="time"> holds both the date and the time.
            cmt_time = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/div[@class="controls"]/span[@class="time"]/text()'
                    .format(uid)).extract()).strip()
            item["cmt_time"] = cmt_time
            print('cmt_time', cmt_time)
            comments = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/p[@class="summary"]//text()'
                    .format(uid)).extract()).strip()
            if comments:
                item["comments"] = comments
                print('comments:', comments)
            else:
                comments = ''.join(
                    response.xpath(
                        '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/h3/a/text()'
                        .format(uid)).extract()).strip()
                item["comments"] = comments
                print('comments:', comments)
            like_cnt = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="left"]/p/img/@title'
                    .format(uid)).extract()).strip()
            if like_cnt:
                # '粉丝值' is the "fan value" label in the image's title attribute.
                like_cnt = ''.join(re.findall(r'粉丝值:\d+',
                                              like_cnt)).replace('粉丝值:', '')
                item["like_cnt"] = like_cnt
                print('like_cnt:', like_cnt)
            else:
                like_cnt = 0
                item["like_cnt"] = like_cnt
                print('like_cnt:', like_cnt)
            cmt_reply_cnt = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/div[@class="controls"]/a[@action-type="comment"]/span[@class="num"]/text()'
                    .format(uid)).extract()).strip()
            if cmt_reply_cnt:
                item["cmt_reply_cnt"] = cmt_reply_cnt
                print('cmt_reply_cnt:', cmt_reply_cnt)
            else:
                cmt_reply_cnt = 0
                item["cmt_reply_cnt"] = cmt_reply_cnt
                print('cmt_reply_cnt:', cmt_reply_cnt)
            long_comment = 0
            item["long_comment"] = long_comment
            last_modify_date = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item["last_modify_date"] = last_modify_date
            print('last_modify_date:', last_modify_date)
            yield item

        next_page_url = ''.join(
            response.xpath(
                '//div[@class="pagination"]/p/a[@class="btn next"]/@href').
            extract()).strip()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_page,
                                 dont_filter=True)
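One hedged hardening note for the loop above: every iteration mutates and yields the same item instance, so a pipeline that defers processing can see later rows overwrite earlier ones. Yielding a copy per comment avoids that; likewise, wrapping the pagination href in response.urljoin guards against relative links:

yield item.copy()  # scrapy.Item.copy(): each yielded row gets its own object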