def parse(self, response):
    """Entry point for a shuqi.com book cover page.

    Derives the book id from the URL, builds the signed form payload the
    PC API expects (md5 of bid + timestamp + fixed key) and POSTs it to
    the bookinfo endpoint.

    Cleanup vs. original: removed the unused local ``url`` and the dead
    commented-out execjs scaffolding; the md5 input is built inline.
    """
    print('1,=====================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    # The bid is whatever follows the fixed cover-page prefix.
    bid = response.url.replace('http://www.shuqi.com/cover.php?bid=', '')
    print('bid:', bid)
    # Seconds-resolution unix time; the slice caps it at 10 digits to
    # match what the site's own JS signs.
    timestamp = str(int(time.time()))[0:10]
    print('timestamp:', timestamp)
    pageKey = "f2850e634f85f485d719314ae3cfe252"  # fixed API signing key
    # sign = md5(bid + timestamp + key), hex digest — mirrors the site JS.
    sign = hashlib.md5(
        (bid + timestamp + pageKey).encode(encoding='UTF-8')).hexdigest()
    print('sign:', sign)
    formdata = {
        'bid': bid,
        'timestamp': timestamp,
        'sign': sign,
    }
    link = 'https://ognv1.sqreader.com/index.php?r=pcapi/pcbook/bookinfo'
    yield scrapy.FormRequest(
        url=link,
        formdata=formdata,
        callback=self.parse_page_p,
        meta={'bid': bid, 'item': item},
        dont_filter=True,
    )
def parse(self, response):
    """First-stage parser for a xiang5.com works page.

    Retries the same URL on any non-200 status; on success records the
    work title and platform code, then POSTs the book id to the
    comment-listing endpoint.
    """
    print('1,==========================', response.url)
    if response.status != 200:
        # Keep re-issuing the identical request until it answers 200.
        yield scrapy.Request(url=response.url,
                             callback=self.parse,
                             dont_filter=True)
        return
    print('请求成功>>>')
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    raw_title = ' '.join(
        response.xpath(
            '//div[@class=" fr worksLR"]/h4/text()').extract()).strip()
    print('product_number:', raw_title)
    product_number = get_product_number(raw_title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P35'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    # The numeric book id lives in the page URL's query string.
    bookId = ''.join(re.findall(r'bookid\=(\d+)', src_url, re.I | re.M))
    print('bookId:', bookId)
    formdata = {
        'type': '1',
        'bookid': bookId,
        'page': '1',
    }
    link = 'http://www.xiang5.com/comment.php?a=allchangping'
    yield scrapy.FormRequest(url=link,
                             formdata=formdata,
                             callback=self.parse_page_max,
                             meta={'item': item, 'bookId': bookId},
                             dont_filter=True)
def parse(self, response):
    """Parse a QQ-books intro page.

    Records title and platform code, reads the ``bid`` attribute off the
    add-to-bookshelf button and POSTs it for the first comment page.
    """
    print('1,========================', response.url)
    html = response.text  # kept (as in siblings) though unused below
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    title = ''.join(response.xpath('//h3/a/text()').extract()).strip()
    product_number = get_product_number(title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P17'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    bid = ''.join(
        response.xpath('//*[@id="addToBookshelfBtn"]/@bid').extract()).strip()
    print('bid:', bid)
    formdata = {
        'bid': bid,
        'pageIndex': '1',
    }
    link = 'http://book.qq.com/intro/getComment.html'
    yield scrapy.FormRequest(
        url=link,
        formdata=formdata,
        callback=self.parse_page_p,
        meta={'item': item, 'bid': bid},
        dont_filter=True,
    )
def parse(self, response):
    """Parse a NetEase Yuedu (yuedu.163.com) book page.

    Records title and platform code, pulls the book id from the URL tail
    after ``source/`` and requests page 1 of the comment JSON.
    """
    print('1,==========================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    raw_title = ''.join(
        response.xpath('//h3[@title]/em/text()').extract()).strip()
    print('product_number:', raw_title)
    product_number = get_product_number(raw_title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P34'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    # Everything after "source/" in the page URL is the book id.
    bookid = ''.join(re.findall(r'source\/(.*)', src_url, re.I | re.M))
    link = 'http://yuedu.163.com/snsComment.do?operation=get&type=2&id={}&page=1'.format(
        bookid)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_max,
                         meta={'item': item, 'bookid': bookid},
                         dont_filter=True)
def parse(self, response):
    """Parse a QQ-reader book page and follow its comment-tab link.

    Records title and platform code, then requests the href of the tab
    whose text contains "评论".
    """
    print('1,========================', response.url)
    html = response.text  # kept (as in siblings) though unused below
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    title = ''.join(
        response.xpath(
            '//img[@class="qqredaer_tit"]/@title').extract()).strip()
    product_number = get_product_number(title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P17'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    link = ''.join(
        response.xpath(
            '//div[@class="tablist"]/ul/li/a[contains(text(),"评论")]/@href'
        ).extract()).strip()
    print('link:', link)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_link,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """Hongshu book page: derive the numeric book id from the URL path
    and fetch the book's comment index page."""
    print('1,=======================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    # The id is the digit run between "book/" and the next "/".
    bookId = ''.join(re.findall(r'book\/(\d+)\/', src_url, re.I | re.M))
    print('bookId:', bookId)
    link = 'http://www.hongshu.com/comment/{}/index.do'.format(bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_link,
                         meta={'item': item, 'bookId': bookId},
                         dont_filter=True)
def parse_page(self, response):
    """Qidian book info page: strip the fixed URL prefix to get the book
    id and request the book's forum listing via the ajax endpoint."""
    print('1,=====================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    bookId = src_url.replace('https://book.qidian.com/info/', '')
    print('bookId:', bookId)
    # NOTE(review): the _csrfToken below is hard-coded — presumably a
    # captured session value; confirm it is still accepted.
    link = 'https://book.qidian.com/ajax/book/GetBookForum?_csrfToken=HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j&authorId=0&bookId={}&chanId=0&pageSize=15'.format(
        bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_link,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """jjwxc.net novel page: normalise the title, then fan out to both
    the comment-count JSON and the first HTML comment page.

    Bug fix: the original guard ``if '【' and '】' in product_number:``
    parses as ``'【' and ('】' in …)`` — the left operand is a constant
    truthy string, so only '】' was ever tested and a title containing
    just '【' was never normalised.  ``str.replace`` is a no-op when the
    substring is absent, so the replacement now runs unconditionally,
    which also removes the duplicated else branch.
    """
    print('1,=====================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    novelid = src_url.replace('http://www.jjwxc.net/onebook.php?novelid=', '')
    print('novelid:', novelid)
    product_number = ''.join(
        response.xpath(
            '//h1[@itemprop="name"]/span/text()').extract()).strip()
    print('product_number:', product_number)
    # Normalise full-width CJK brackets to ASCII before the lookup.
    product_number = product_number.replace('【', '[').replace('】', ']')
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P16'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    # Comment-count JSON endpoint.
    j_url = 'http://s8-static.jjwxc.net/comment_json.php?novelid={}'.format(
        novelid)
    yield scrapy.Request(url=j_url,
                         callback=self.parse_page,
                         meta={'item': item},
                         dont_filter=True)
    # First page of the HTML comment listing.
    link = 'http://www.jjwxc.net/comment.php?novelid={}&huati=1&page=1'.format(
        novelid)
    print('link:', link)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_url,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """Bilibili video page: the avXXXX token is the product number; the
    cid embedded in the page keys the danmaku (comment) XML feed."""
    print('1,=========================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    # "avNNN" in the URL identifies the video.
    item["product_number"] = ''.join(re.findall(r'av\d+', src_url))
    page_html = response.text
    # The numeric cid appears as a query parameter inside the page body.
    cid = ''.join(re.findall(r'cid=(\d+)&', page_html))
    link = 'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(cid)
    yield scrapy.Request(url=link,
                         callback=self.parse_page,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """Hongxiu book page: strip the fixed URL prefix to get the book id
    and request page 1 of the ajax comment listing."""
    print('1,========================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    html = response.text  # kept (as in siblings) though unused below
    bookId = src_url.replace('https://www.hongxiu.com/book/', '')
    print('bookId:', bookId)
    # NOTE(review): the _csrfToken below is hard-coded — presumably a
    # captured session value; confirm it is still accepted.
    link = 'https://www.hongxiu.com/ajax/comment/pageList?_csrfToken=9bx7F3sUwWZoBWjQrEImCBarm6KMDNYhoG8EtVSc&pageNum=1&pageSize=10&bookId={}'.format(
        bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_link,
                         meta={'item': item, 'bookId': bookId},
                         dont_filter=True)
def parse(self, response):
    """Book page parser (plat P18): record title/platform, then follow
    the "load more" comments link."""
    print('1,=======================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    title = ''.join(response.xpath('//h2/a/text()').extract()).strip()
    product_number = get_product_number(title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P18'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    link = ''.join(
        response.xpath('//p[@class="loadMore"]/a/@href').extract()).strip()
    yield scrapy.Request(url=link,
                         callback=self.parse_page,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """Book page parser (plat P22): record title/platform, then follow
    the "more comments" link."""
    print('1,===============', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    title = ''.join(
        response.xpath(
            '//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()'
        ).extract()).strip()
    product_number = get_product_number(title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P22'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    link = ''.join(
        response.xpath(
            '//div[@class="MORE"]/a[@id="comment_more2"]/@href'
        ).extract()).strip()
    yield scrapy.Request(url=link,
                         callback=self.parse_page_link,
                         meta={'item': item},
                         dont_filter=True)
def parse(self, response):
    """Zongheng book page: scrape title and book id out of the page's
    inline JS and POST to the comment-thread API."""
    print('1,===============', response.url)
    page_html = response.text
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    # Title is embedded as bookName="..." in the page's inline script.
    raw_title = ''.join(
        re.findall(r'bookName\=\"(.*)\"', page_html)).strip().replace(
            'bookName="', '').replace('"', '')
    product_number = get_product_number(raw_title)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P21'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    item["long_comment"] = 0
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    # bookId="123" — the regex has no capture group, so strip the
    # surrounding text afterwards.
    bookId = ''.join(
        re.findall(r'bookId=\"\d+\"', page_html)).strip().replace(
            'bookId="', '').replace('"', '')
    url = 'http://book.zongheng.com/api/book/comment/getThreadL1st2.htm'
    formdata = {
        'bookId': bookId,
        'pagebar': '1',
        'pageNum': '1',
        'pageSize': '30',
    }
    # FormRequest issues a POST.
    yield scrapy.FormRequest(
        url=url,
        formdata=formdata,
        callback=self.parse_page_p,
        meta={'item': item, 'bookId': bookId},
        dont_filter=True,
    )
def parse_page(self, response):
    """Second-stage parser for a reviews listing page.

    Walks every comment <li> (keyed by its data-feed-id attribute),
    fills one TCommentsPubItem per comment, then follows the "next"
    pagination link when present.
    """
    print('2,==================', response.url)
    item = TCommentsPubItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    feed_ids = response.xpath(
        '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id]/@data-feed-id'
    ).extract()
    for feed_id in feed_ids:
        print('uid:', feed_id)
        # Page title doubles as the work title; drop the "的评论" suffix.
        product_number = ''.join(
            response.xpath('//h1[@class="page-title"]/text() | //h2/text()'
                           ).extract()).strip().replace('的评论', '')
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P31'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        nick_name = ''.join(
            response.xpath(
                '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="left"]/a[@class="name"]/text()'
                .format(feed_id)).extract()).strip()
        item["nick_name"] = nick_name
        print('nick_name:', nick_name)
        # Date and time both come from the same <span class="time"> node;
        # extract once and store in both fields (original did two
        # identical extractions).
        time_text = ''.join(
            response.xpath(
                '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/div[@class="controls"]/span[@class="time"]/text()'
                .format(feed_id)).extract()).strip()
        item["cmt_date"] = time_text
        print('cmt_date:', time_text)
        item["cmt_time"] = time_text
        print('cmt_time', time_text)
        comments = ''.join(
            response.xpath(
                '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/p[@class="summary"]//text()'
                .format(feed_id)).extract()).strip()
        if not comments:
            # No summary body — fall back to the headline link text.
            comments = ''.join(
                response.xpath(
                    '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/h3/a/text()'
                    .format(feed_id)).extract()).strip()
        item["comments"] = comments
        print('comments:', comments)
        like_title = ''.join(
            response.xpath(
                '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="left"]/p/img/@title'
                .format(feed_id)).extract()).strip()
        if like_title:
            # Badge title looks like "粉丝值:123"; keep the number (str).
            like_cnt = ''.join(
                re.findall(r'粉丝值:\d+', like_title)).replace('粉丝值:', '')
        else:
            # NOTE(review): int 0 here vs. str when present — preserved
            # from the original; the pipeline presumably coerces.
            like_cnt = 0
        item["like_cnt"] = like_cnt
        print('like_cnt:', like_cnt)
        reply_text = ''.join(
            response.xpath(
                '//div[@class="mod comments reviews"]/div[@class="bd"]/ul/li[@data-feed-id="{}"]/div[@class="right"]/div[@class="controls"]/a[@action-type="comment"]/span[@class="num"]/text()'
                .format(feed_id)).extract()).strip()
        # Same int-0-vs-str asymmetry as like_cnt, preserved.
        cmt_reply_cnt = reply_text if reply_text else 0
        item["cmt_reply_cnt"] = cmt_reply_cnt
        print('cmt_reply_cnt:', cmt_reply_cnt)
        item["long_comment"] = 0
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        yield item
    next_page_url = ''.join(
        response.xpath(
            '//div[@class="pagination"]/p/a[@class="btn next"]/@href'
        ).extract()).strip()
    if next_page_url:
        yield scrapy.Request(url=next_page_url,
                             callback=self.parse_page,
                             dont_filter=True)