def parse(self, response):
    print('1,=======================', response.url)
    # print(response.text)
    # url = response.url
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    bid = response.url.replace('http://www.shuqi.com/cover.php?bid=', '')
    print('bid:', bid)
    timestamp = str(int(time.time()))[0:10]
    print('timestamp:', timestamp)
    pageKey = "f2850e634f85f485d719314ae3cfe252"
    # jsstr = self.get_js()
    # ctx = execjs.compile(jsstr)
    # Sign the API request: plain MD5 over bid + timestamp + pageKey.
    s = bid + timestamp + pageKey
    sign = hashlib.md5(s.encode(encoding='UTF-8')).hexdigest()
    print('sign:', sign)
    formdata = {
        'bid': bid,
        'timestamp': timestamp,
        'sign': sign,
    }
    link = 'https://ognv1.sqreader.com/index.php?r=pcapi/pcbook/bookinfo'
    yield scrapy.FormRequest(
        url=link,
        formdata=formdata,
        callback=self.parse_page_p,
        meta={'bid': bid, 'item': item},
        dont_filter=True,
    )
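# The bookinfo endpoint above is signed with a plain MD5 over
# bid + timestamp + pageKey. A minimal standalone sketch of the same
# handshake, useful for testing the endpoint outside Scrapy; it assumes
# the hard-coded pageKey is still the one used by the site's JS:
import hashlib
import time

import requests


def fetch_bookinfo(bid, page_key='f2850e634f85f485d719314ae3cfe252'):
    timestamp = str(int(time.time()))
    sign = hashlib.md5((bid + timestamp + page_key).encode('utf-8')).hexdigest()
    resp = requests.post(
        'https://ognv1.sqreader.com/index.php?r=pcapi/pcbook/bookinfo',
        data={'bid': bid, 'timestamp': timestamp, 'sign': sign})
    return resp.json()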
def parse(self, response):
    print('1,=========================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(response.xpath('//h2/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P31'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(response.xpath('//h4/a/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第([\u4e00-\u9fa5]{1,10})章', Chapter_num_update, re.I | re.M))
    Chapter_num_update = chinese_to_arabic(Chapter_num_update)
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath('//h4/span[@class="time"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath('//span[@class="words"]/text()').extract()).strip()
    words = ''.join(re.findall(r'(\d+)字', words, re.I | re.M))
    item["words"] = words
    print('words:', words)
    tickets_num = None
    item["tickets_num"] = tickets_num
    score = None
    item["score"] = score
    reward_num = None
    item["reward_num"] = reward_num
    bookId = ''.join(re.findall(r'book\/(\d+)', src_url, re.I | re.M))
    link = 'http://a.heiyan.com/ajax/book/extend/{}/detail'.format(bookId)
    # print('link:', link)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_click_num,
                         meta={'item': item, 'bookId': bookId},
                         dont_filter=True)
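# chinese_to_arabic() is imported from a project helper that is not shown
# here. A plausible sketch (an assumption, not the project's code): it
# converts numerals such as '三百二十一' to 321, which is enough for
# chapter counts below the 万 range.
CN_DIGITS = {'零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
             '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
CN_UNITS = {'十': 10, '百': 100, '千': 1000}


def chinese_to_arabic(text):
    total, current = 0, 0
    for ch in text:
        if ch in CN_DIGITS:
            current = CN_DIGITS[ch]
        elif ch in CN_UNITS:
            # A bare unit ('十五') implies a leading 一.
            total += (current or 1) * CN_UNITS[ch]
            current = 0
    return total + current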
def parse(self, response):
    print('1,==========================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//h1[@class="RecArticle"]//text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P30'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath(
            '//*[@id="con_ListStyleBTab2C_1"]/div[@class="area"]/span/a/b/text()'
        ).extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'(\d+)', Chapter_num_update, re.I | re.M))
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//*[@id="con_ListStyleBTab2C_1"]/div[@class="area"]/span/text()'
        ).extract()).strip()
    update_date = ''.join(
        re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', update_date,
                   re.I | re.M))
    item["update_date"] = update_date
    print('update_date:', update_date)
    tickets_num = ''.join(
        response.xpath(
            '//div[@class="inter_con"]/div[@class="give"]/ul/li[@id="Interbt3"]/p/span/text()'
        ).extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)
    reward_num = ''.join(
        response.xpath(
            '//div[@class="inter_con"]/div[@class="give"]/ul/li[@id="Interbt1"]/p/span/text()'
        ).extract()).strip()
    item["reward_num"] = reward_num
    print('reward_num:', reward_num)
    score = None
    item["score"] = score
    book_id = ''.join(re.findall(r'book\/(\d+)', src_url, re.I | re.M))
    print('book_id:', book_id)
    link = 'http://www.3gsc.com.cn/BookLazyload/getstatis?book_id={}'.format(book_id)
    yield scrapy.Request(url=link,
                         callback=self.parse_page,
                         meta={'item': item, 'book_id': book_id},
                         dont_filter=True)
    comment_num_link = ''.join(
        response.xpath(
            '//div[@class="forum_stati"]/div[@class="opt"]/a/@href').extract()).strip()
    if comment_num_link:
        comment_num_link = 'http://www.3gsc.com.cn' + comment_num_link
        yield scrapy.Request(url=comment_num_link,
                             callback=self.parse_page_comment_num,
                             meta={'item': item},
                             dont_filter=True)
    else:
        print('comment link not found')
def parse(self, response):
    print('1,======================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P33'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath('//div[@class="update"]/p/a/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.S))
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath('//div[@class="update"]/p/span/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath(
            '//div[@class="book-info"]/p[@class="total"]//text()').extract()).strip()
    words = ''.join(re.findall(r'(.*?)字\|', words, re.I | re.S))
    words = process_number(words)
    item["words"] = words
    print('words:', words)
    click_num = ''.join(
        response.xpath(
            '//div[@class="book-info"]/p[@class="total"]//text()').extract()).strip()
    click_num = ''.join(re.findall(r'([0-9]+\.[0-9]+万)总点击', click_num))
    print('click_num:', click_num)
    click_num = process_number(click_num)
    item["click_num"] = click_num
    print('click_num:', click_num)
    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)
    comment_num = ''.join(
        response.xpath(
            '//div[@class="lbf-pagination"]/ul/li[last()-1]/a/text()').extract()).strip()
    # Estimate the comment count from the last pagination link (10 per page).
    if comment_num:
        comment_num = int(comment_num) * 10
    else:
        comment_num = 10
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)
    score = None
    item["score"] = score
    collect_num = None
    item["collect_num"] = collect_num
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    print(item)
    yield item
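# process_number() is another helper defined elsewhere. A hedged sketch of
# what it presumably does: normalise counts such as '12.5万' or '3w+' into
# plain integers (the real helper may handle more suffixes).
def process_number(value):
    value = value.strip().rstrip('+')
    if value and value[-1] in ('万', 'w', 'W'):
        return int(float(value[:-1]) * 10000)
    return int(float(value)) if value else None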
def parse(self, response):
    print('1,=================', response.url)
    text = response.text
    # print(text)
    url = response.url
    item = TNovelSummaryItem()
    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    item["product_number"] = product_number
    print('product_number:', product_number)
    plat_number = 'P20'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    update_date = ''.join(
        response.xpath('//p[@class="cf"]/em[@class="time"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    # words = ''.join(response.xpath('//div[@class="book-info "]/p[3]/em[1]/span/text()').extract()).strip()
    # item["words"] = words
    # print('words:', words)
    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)
    score_s = ''.join(
        response.xpath('//*[@id="j_bookScore"]//text()').extract()).strip()
    # '暂无评分' means the book has no rating yet.
    if '暂无评分' in score_s:
        score = 0
    else:
        score = score_s
    item["score"] = score
    print('score:', score)
    collect_num = None
    item["collect_num"] = collect_num
    print('collect_num:', collect_num)
    reward_num = ''.join(
        response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
    item["reward_num"] = reward_num
    print('reward_num:', reward_num)
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    authorId = ''.join(
        response.xpath('//*[@id="authorId"]/@data-authorid').extract()).strip()
    print('authorId:', authorId)
    chanId = re.findall(r'chanId\=(\d+)', text)[0]
    print('chanId:', chanId)
    bookId = ''.join(
        re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url, re.I | re.M))
    print('bookId:', bookId)
    _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
    link = 'https://book.qidian.com/ajax/book/category?_csrfToken={}&bookId={}'.format(
        _csrfToken, bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_Chapter_num,
                         meta={'item': item,
                               'authorId': authorId,
                               'chanId': chanId,
                               'bookId': bookId},
                         dont_filter=True)
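# parse_time() is also a shared helper. The listing pages mix relative
# stamps ('今天 12:30', '昨天 08:00', '3小时前') with absolute ones, so a
# sketch under that assumption (not necessarily the project's version):
import datetime
import re


def parse_time(raw):
    now = datetime.datetime.now()
    raw = raw.strip()
    hours_ago = re.search(r'(\d+)小时前', raw)
    if hours_ago:
        then = now - datetime.timedelta(hours=int(hours_ago.group(1)))
        return then.strftime('%Y-%m-%d %H:%M:%S')
    if raw.startswith('今天'):
        return now.strftime('%Y-%m-%d') + raw[2:] + ':00'
    if raw.startswith('昨天'):
        yesterday = now - datetime.timedelta(days=1)
        return yesterday.strftime('%Y-%m-%d') + raw[2:] + ':00'
    return raw  # already an absolute timestamp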
def parse(self, response):
    print('1,=========================', response.url)
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "UM_distinctid=164daf563af44a-0b77993c46f21-6114147a-100200-164daf563b0422; canal=0; schannelm=0; www_say=775abff326250295f011c827045e4f45; PHPSESSID=imbduief49qgu8e9mttjtb03u1; Hm_lvt_688746b9e4f9d33e0e2ce6aeffb4fa58=1535520731,1535597551; counter=zixing; countertime=2018/8/30; _jzqc=1; _qzjc=1; CNZZDATA1253179669=891460335-1532681545-%7C1535606694; Hm_lpvt_688746b9e4f9d33e0e2ce6aeffb4fa58=1535610428; uuid=2AE001D147E7F1C7E3026160C9234536; marks=13; _qzja=1.754836888.1532681874609.1535597551629.1535610427783.1535602634139.1535610427783.0.0.0.22.6; _qzjto=18.2.0; _jzqa=1.4298357099084273000.1532681875.1535597552.1535610428.6; _jzqx=1.1535610428.1535610428.1.jzqsr=xiang5%2Ecom|jzqct=/.-; _jzqckmp=1; _jzqb=1.1.10.1535610428.1; _qzjb=1.1535610427783.1.0.0.0",
        "Host": "www.xiang5.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }
    if response.status != 200:
        # Retry with browser-like headers when the site rejects the request.
        yield scrapy.Request(url=response.url,
                             headers=headers,
                             callback=self.parse,
                             dont_filter=True)
    else:
        print('request succeeded >>>')
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ' '.join(
            response.xpath('//div[@class=" fr worksLR"]/h4/text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P35'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        Chapter_num_update = ''.join(
            response.xpath(
                '//div[@class="worksL2"]/h2/b[@class="colR"]/a/text()').extract()).strip()
        Chapter_num_update = ''.join(
            re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        update_date = ''.join(
            response.xpath('//div[@class="worksL2"]/h2/text()').extract()).strip()
        # print('update_date:', update_date)
        update_date = re.match(
            r'([\d+]{4}\-[\d+]{2}\-[\d+]{2}\s[\d+]{2}\:[\d+]{2}\:[\d+]{2})',
            update_date, re.I | re.S).group()
        item["update_date"] = update_date
        print('update_date:', update_date)
        words = ''.join(
            response.xpath('//div[@class="workSecHit"]/span/text()').extract()).strip()
        words = ' '.join(re.findall(r'字数:(.*)', words, re.I | re.M))
        words = process_number(words)
        item["words"] = words
        print('words:', words)
        click_num = ''.join(
            response.xpath('//div[@class="workSecHit"]/span/text()').extract()).strip()
        click_num = ''.join(re.findall(r'点击:(\d+)\s', click_num, re.I | re.M))
        item["click_num"] = click_num
        print('click_num:', click_num)
        collect_num = ''.join(
            response.xpath('//div[@class="workSecHit"]/span/text()').extract()).strip()
        collect_num = ''.join(re.findall(r'收藏:(\d+)\s', collect_num, re.I | re.M))
        item["collect_num"] = collect_num
        print('collect_num:', collect_num)
        tickets_num = None
        item["tickets_num"] = tickets_num
        comment_num = ''.join(
            response.xpath(
                '//*[@id="pinglun"]/h4/span[@class="fl"]/b/text()').extract()).strip()
        comment_num = ''.join(re.findall(r'(\d+)', comment_num, re.I | re.M))
        item["comment_num"] = comment_num
        print('comment_num:', comment_num)
        score = None
        item["score"] = score
        reward_num = None
        item["reward_num"] = reward_num
        last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        print(item)
        yield item
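# Scrapy's HttpErrorMiddleware filters out non-2xx responses by default, so
# the response.status != 200 branch above is dead code unless the spider
# opts in. A minimal sketch (the class name here is hypothetical):
import scrapy


class Xiang5Spider(scrapy.Spider):
    name = 'xiang5'
    # Let anti-bot rejections reach parse() so the retry branch can fire.
    handle_httpstatus_list = [403, 503]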
def parse(self, response):
    print('1,=======================', response.url)
    text = response.text
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath(
            '//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()').extract()).strip()
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P22'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = None
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date_s = ''.join(
        response.xpath('//dl[@class="Tab"]/dt[@class="tit"]/em/text()'
                       ).extract()).strip().replace('更新:', '').strip()
    # The page only gives minute precision; pad the current seconds so the
    # stamp matches the '%Y-%m-%d %H:%M:%S' layout used elsewhere.
    update_date = update_date_s + datetime.datetime.now().strftime(':%S')
    item["update_date"] = update_date
    print('update_date:', update_date)
    # timeArray = time.strptime(update_date_s, "%Y-%m-%d %H:%M")
    # timeStamp = int(time.mktime(timeArray))
    # print('timeStamp:', timeStamp)
    words = ''.join(
        response.xpath(
            '//div[@class="BookData"]/p[last()-1]/em[@class="red"]/text()'
        ).extract()).strip()
    item["words"] = words
    print('words:', words)
    tickets_num = None
    item["tickets_num"] = tickets_num
    score = None
    item["score"] = score
    collect_num = None
    item["collect_num"] = collect_num
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    authorId = re.findall(r'\/zuozhe\/(\d+)', text, re.I | re.M)[0]
    print('authorId:', authorId)
    bookId = ''.join(re.findall(r'([\d+]{4,6})', src_url, re.I | re.M))
    print('bookId:', bookId)
    click_num_link = ('http://api.ali.17k.com/v2/book/{}/stat_info'
                      '?app_key=3362611833&click_info=1&hb_info=1&flower_info=1'
                      '&stamp_info=1&cps_source=').format(bookId)
    yield scrapy.Request(url=click_num_link,
                         callback=self.parse_page_click_num,
                         meta={'item': item, 'bookId': bookId},
                         dont_filter=True)
def parse(self, response):
    print('1,========================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    url = response.url
    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//img[@class="qqredaer_tit"]/@title').extract()).strip()
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P17'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update_s = ''.join(
        response.xpath(
            '//*[@id="newChapterList"]/div[@class="chaptername"]/b/a[@class="green"]/text()'
        ).extract()).strip()
    print('Chapter_num_update_s:', Chapter_num_update_s)
    # The latest-chapter title comes in several layouts; try them in turn:
    # Chinese numerals, plain '第N章', volume-prefixed numbers, bracketed
    # titles, and parenthesised counters.
    Chapter_num_update = None
    if Chapter_num_update_s:
        Chapter_num_update = ''.join(
            re.findall(u'第.*?卷 第([\u4e00-\u9fa5]{4,10})\s',
                       Chapter_num_update_s, re.I | re.M))
        if Chapter_num_update:
            Chapter_num_update = chinesedigits(Chapter_num_update)
        else:
            for pattern in (r'第(\d+)章', r'第.*?卷 (\d+)', r'第.*?卷 【(.*?)】',
                            r'第.*?卷 [\u4e00-\u9fa5]{0,6}\((\d+)\)'):
                Chapter_num_update = ''.join(
                    re.findall(pattern, Chapter_num_update_s, re.I | re.M)) or None
                if Chapter_num_update:
                    break
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//*[@id="newChapterList"]/div[@class="chaptername"]/text()'
        ).extract()).strip()
    if update_date:
        update_date = update_date.replace('(更新时间:', '').replace(')', '')
    else:
        update_date = datetime.datetime.now().strftime('%Y-%m-%d')
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ' '.join(
        response.xpath('//div[@class="num"]/table/tr/td/text()').extract()).strip()
    if words:
        words = ''.join(re.findall(r'总字数:(\d+)', words, re.I | re.M))
    else:
        words = None
    item["words"] = words
    print('words:', words)
    click_num = ' '.join(
        response.xpath('//div[@class="num"]/table/tr/td/text()').extract()).strip()
    if click_num:
        click_num = ''.join(re.findall(r'阅文点击:(\d+)', click_num, re.I | re.M))
    else:
        click_num = None
    item["click_num"] = click_num
    print('click_num:', click_num)
    score = None
    item["score"] = score
    collect_num = None
    item["collect_num"] = collect_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    bid = ''.join(re.findall(r'\/(\d+).html', url, re.I | re.M))
    print('bid:', bid)
    link = 'http://yunqi.qq.com/novelcomment/index.html?bid={}'.format(bid)
    yield scrapy.Request(url=link,
                         callback=self.parse_page,
                         meta={'item': item, 'bid': bid},
                         dont_filter=True)
def parse(self, response):
    print('1,========================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    url = response.url
    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(response.xpath('//h3/a/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P17'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath(
            '//div[@class="new_list"]/dl[1]/dd[1]/a/text()').extract()).strip()
    if Chapter_num_update:
        Chapter_num_update = ''.join(
            re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
    else:
        Chapter_num_update = None
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//div[@class="new_list"]/dl[1]/dd[@class="gray"]/text()').extract()).strip()
    if not update_date:
        update_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath(
            '//div[@id="bookinfo"]/div[@class="book_info"]/dl[1]/dd[last()]/text()'
        ).extract()).strip()
    # The '字' suffix is optional in the source markup; strip it before parsing.
    words = process_number(words.replace('字', ''))
    item["words"] = words
    print('words:', words)
    click_num = None
    item["click_num"] = click_num
    tickets_num = None
    item["tickets_num"] = tickets_num
    comment_num = ''.join(
        response.xpath('//*[@id="commentCount"]/text()').extract()).strip()
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)
    score = ''.join(
        response.xpath(
            '//*[@id="StarIco"]/span[@id="StarIcoValue"]/b//text()').extract()).strip()
    item["score"] = score
    print('score:', score)
    collect_num = None
    item["collect_num"] = collect_num
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    yield item
def parse(self, response):
    print('1,=======================', response.url)
    # print(response.text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//h3[@title]/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P34'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath(
            '//div[@class="contents"]/div[@class="tab-item crt"]/a[@class="m-newupdate"]/h4/text()'
        ).extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第(.*?)章', Chapter_num_update, re.I | re.S))
    Chapter_num_update = chinese_to_arabic(Chapter_num_update)
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//div[@class="contents"]/div[@class="tab-item crt"]/span[@class="updatetime"]/text()'
        ).extract()).replace('更新时间:', '').strip()
    print('update_date:', update_date)
    update_date = parse_date(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath('//div[@class="m-bookstatus"]/table/tr//text()').extract()).strip()
    words = ''.join(re.findall(r'字数:(.*?)\n', words, re.I | re.M))
    words = process_number(words)
    item["words"] = words
    print('words:', words)
    click_num = ''.join(
        response.xpath('//div[@class="m-bookstatus"]/table/tr//text()').extract()).strip()
    click_num = ''.join(re.findall(r'点击:(.*?)\n', click_num, re.I | re.M))
    print('click_num:', click_num)
    click_num = process_number(click_num)
    item["click_num"] = click_num
    print('click_num:', click_num)
    score = ''.join(
        response.xpath(
            '//div[@class="starlevel"]/span[@class="score"]/text()').extract()).strip()
    item["score"] = score
    print('score:', score)
    collect_num = None
    item["collect_num"] = collect_num
    bookid = ''.join(re.findall(r'source\/(.*)', src_url, re.I | re.M))
    link = 'http://yuedu.163.com/snsComment.do?operation=get&type=2&id={}'.format(bookid)
    # print(link)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_comment_num,
                         meta={'item': item, 'bookid': bookid},
                         dont_filter=True)
def parse(self, response):
    print('1,=======================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//div[@class="bookname"]/h2/a/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P18'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath(
            '//div[@class="bookTit"]/div[@class="new"]/p/a/text()').extract()).strip()
    if Chapter_num_update:
        try:
            # Chinese-numeral chapter titles first; fall back to any digits.
            Chapter_num_update = ''.join(re.findall(u'第(.*?)章', Chapter_num_update))
            Chapter_num_update = chinesedigits(Chapter_num_update)
        except:
            Chapter_num_update = ''.join(re.findall(r'(\d+)', Chapter_num_update))
    else:
        Chapter_num_update = None
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//div[@class="bookTit"]/div[@class="time"]/p/text()').extract()).strip()
    if not update_date:
        update_date = datetime.datetime.now().strftime('%Y-%m-%d')
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath('//div[@class="bookinf01"]/p/span[2]/text()').extract()).strip()
    words = process_number(words)
    item["words"] = words
    print('words:', words)
    click_num = None
    item["click_num"] = click_num
    print('click_num:', click_num)
    tickets_num = None
    item["tickets_num"] = tickets_num
    comment_num = ''.join(
        response.xpath(
            '//div[@class="bookCir"]/div[@class="title"]/p/span[last()]/text()'
        ).extract()).strip()
    if '条' in comment_num:
        # Keep the count after the full-width comma, drop the '条' suffix.
        comment_num = comment_num.split(',')[1]
        comment_num = ''.join(re.findall(r'(.*?)条', comment_num, re.I | re.M))
        if 'w+' in comment_num:
            comment_num = process_number(comment_num)
    else:
        comment_num = 0
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)
    score = ''.join(
        response.xpath(
            '//div[@class="bookinf01"]/div[@class="bookname"]/span/text()'
        ).extract()).strip()
    if not score:
        score = 0
    item["score"] = score
    print('score:', score)
    collect_num = ''.join(
        response.xpath('//div[@class="bookinf01"]/p/span[3]/b/text()').extract()).strip()
    print('collect_num:', collect_num)
    if collect_num:
        collect_num = process_number(collect_num)
    else:
        collect_num = 0
    item["collect_num"] = collect_num
    print('collect_num:', collect_num)
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    yield item
def parse(self, response):
    print('1,=======================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/h1/a/text()').extract()).strip()
    # Normalise full-width brackets before extracting the product number
    # (replace() is a no-op when they are absent).
    product_number = product_number.replace('【', '[').replace('】', ']')
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P21'
    item["plat_number"] = plat_number
    print('plat_number:', plat_number)
    Chapter_num_update = ''.join(
        response.xpath(
            '//div[@class="update box"]/div[@class="cont"]/a/text()').extract()).strip()
    if Chapter_num_update:
        Chapter_num_update = ''.join(
            re.findall(u'第(.*?)部', Chapter_num_update, re.I | re.M))
        Chapter_num_update = chinesedigits(Chapter_num_update)
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath(
            '//div[@class="update box"]/div[@class="uptime"]/text()'
        ).extract()).strip().split('\n')[0].replace('·', '')
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ''.join(
        response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/div[@class="booksub"]/span[@title]/text()'
        ).extract()).strip()
    item["words"] = words
    print('words:', words)
    # Click, comment and collect counts all live in the same stats block.
    stats = ' '.join(
        response.xpath('//div[@class="vote_info"]/p//text()').extract()).strip()
    click_num = ''.join(
        re.findall(r'总点击: (\d+)', stats, re.I | re.M)) if stats else None
    item["click_num"] = click_num
    print('click_num:', click_num)
    comment_num = ''.join(
        re.findall(r'评论数: (\d+)', stats, re.I | re.M)) if stats else None
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)
    score = None
    item["score"] = score
    collect_num = ''.join(
        re.findall(r'总收藏: (\d+)', stats, re.I | re.M)) if stats else None
    item["collect_num"] = collect_num
    print('collect_num:', collect_num)
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    bookId = ''.join(re.findall(r'bookId=\"(\d+)\"', text, re.I | re.M))
    print('bookId:', bookId)
    link = 'http://book.zongheng.com/book/async/info.htm'
    formdata = {"bookId": bookId}
    yield scrapy.FormRequest(
        url=link,
        formdata=formdata,
        callback=self.parse_page,
        meta={'item': item},
        dont_filter=True,
    )
def parse(self, response):
    print('1,=================', response.url)
    text = response.text
    # with open('qidian.txt', "wb") as f:  # dump the raw body as a binary file
    #     f.write(response.body)
    url = response.url
    item = TNovelSummaryItem()
    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    item["product_number"] = product_number
    print('product_number:', product_number)
    plat_number = 'P20'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)
    reward_num = ''.join(
        response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
    item["reward_num"] = reward_num
    print('reward_num:', reward_num)
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    # Word and click counts are rendered with a per-request anti-spider
    # font; the span's class name doubles as the font file name.
    font_type = ''.join(
        response.xpath(
            '//div[@class="book-info "]/p/em[1]/span[@class]/@class').extract()).strip()
    print('font_type:', font_type)
    font_url = 'https://qidian.gtimg.com/qd_anti_spider/{}.woff'.format(font_type)
    print('font_url:', font_url)
    words = re.findall(
        r'</style><span class="{}">(.*)</span></em><cite>万字</cite><i>|</i><em><style>'
        .format(font_type), response.text, re.I | re.M)[0]
    print('words:', words)
    words = get_words(words, font_url)
    words = int(float(words) * 10000)
    item["words"] = words
    print('words:', words)
    click_num = re.findall(
        r'</style><span class="{}">(.*)</span></em><cite>万总会员点击<span>'
        .format(font_type), response.text, re.I | re.M)[0]
    click_num = re.findall(
        r'</style><span class="{}">(.*)'.format(font_type), click_num,
        re.I | re.M)[0]
    print('click_num:', click_num)
    click_num = get_words(click_num, font_url)
    click_num = int(float(click_num) * 10000)
    item["click_num"] = click_num
    print('click_num:', click_num)
    update_date = ''.join(
        response.xpath(
            '//li[@class="update"]/div[@class="detail"]/p[@class="cf"]/em/text()'
        ).extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    collect_num = None
    item["collect_num"] = collect_num
    authorId = ''.join(
        response.xpath('//*[@id="authorId"]/@data-authorid').extract()).strip()
    print('authorId:', authorId)
    chanId = re.findall(r'chanId\=(\d+)', text)[0]
    print('chanId:', chanId)
    bookId = ''.join(
        re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url, re.I | re.M))
    print('bookId:', bookId)
    _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
    link = ('https://book.qidian.com/ajax/comment/index'
            '?_csrfToken={}&bookId={}&pageSize=15').format(_csrfToken, bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_score,
                         meta={'item': item,
                               'authorId': authorId,
                               'chanId': chanId,
                               'bookId': bookId},
                         dont_filter=True)
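# get_words() has to decode Qidian's per-request anti-spider WOFF font.
# One plausible implementation with fontTools, assuming (as these fonts
# usually do) that the cmap maps each obfuscated codepoint to a glyph
# named after an English digit word, and that the page embeds the digits
# as numeric character references (&#xxxxx;); the real helper may differ:
import io
import re

import requests
from fontTools.ttLib import TTFont

GLYPH_TO_DIGIT = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
    'period': '.',
}


def get_words(obfuscated, font_url):
    # Download the one-off font and map each codepoint back to a digit.
    font = TTFont(io.BytesIO(requests.get(font_url).content))
    cmap = font.getBestCmap()  # codepoint -> glyph name
    codepoints = re.findall(r'&#(\d+);', obfuscated)
    return ''.join(GLYPH_TO_DIGIT.get(cmap[int(cp)], '') for cp in codepoints)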
def parse(self, response):
    print('1,==========================', response.url)
    text = response.text
    # print(text)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//h1[@class="fllf"]/a[@title]/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P32'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath('//h3[@class="bom10"]/a[@class="cboy"]/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)
    update_date = ''.join(
        response.xpath('//h3[@class="bom10"]/span[@class="lf10"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)
    words = ' '.join(
        response.xpath(
            '//div[@class="right"]/p[@class="infor bom10"]/span/text()').extract()).strip()
    words = ''.join(re.findall(r'总字数:(.*?)\s', words, re.I | re.M))
    words = process_number(words)
    item["words"] = words
    print('words:', words)
    click_num = ' '.join(
        response.xpath(
            '//div[@class="right"]/p[@class="infor bom10"]/span/text()').extract()).strip()
    click_num = ''.join(re.findall(r'点击:(.*?)\s ', click_num, re.I | re.M))
    print('click_num:', click_num)
    if '万' in click_num:
        click_num = int(atof(click_num.replace('万', '')) * 10000)
    else:
        click_num = int(atof(click_num))
    item["click_num"] = click_num
    print('click_num:', click_num)
    tickets_num = None
    item["tickets_num"] = tickets_num
    comment_num = ''.join(
        response.xpath(
            '//div[@category="comment"]/a[@class="tabfmbtn cboy"]/text()').extract()).strip()
    comment_num = ''.join(
        re.findall(r'最新书评(.*)', comment_num,
                   re.I | re.S)).replace('(', '').replace(')', '')
    comment_num = int(atof(comment_num))
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)
    score = None
    item["score"] = score
    collect_num = None
    item["collect_num"] = collect_num
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    print(item)
    yield item
def parse(self, response):
    print('1,================', response.url)
    item = TNovelSummaryItem()
    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)
    product_number = ''.join(
        response.xpath('//h1[@itemprop="name"]/span/text()').extract()).strip()
    # Normalise full-width brackets (a no-op when they are absent).
    product_number = product_number.replace('【', '[').replace('】', ']')
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number
    plat_number = 'P16'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number
    Chapter_num_update = ''.join(
        response.xpath(
            '//*[@id="oneboolt"]/tbody/tr[last()-1]/td[1]//text()').extract()).strip()
    print('Chapter_num_update:', Chapter_num_update)
    item["Chapter_num_update"] = Chapter_num_update
    update_date = ''.join(
        response.xpath(
            '//*[@id="oneboolt"]/tbody/tr[last()-1]/td[last()]/span[1]/text()'
        ).extract()).strip()
    print('update_date:', update_date)
    item["update_date"] = update_date
    words = ''.join(
        response.xpath(
            '//*[@class="righttd"]/ul[@class="rightul"]/li/span[@itemprop="wordCount"]/text()'
        ).extract()).strip().replace('字', '')
    print('words:', words)
    item["words"] = words
    tickets_num = None
    item["tickets_num"] = tickets_num
    comment_num = ''.join(
        response.xpath(
            '//*[@id="oneboolt"]/tbody/tr[last()]/td/div/span[@itemprop="reviewCount"]/text()'
        ).extract())
    print('comment_num:', comment_num)
    item["comment_num"] = comment_num
    score = None
    item['score'] = score
    collect_num = ''.join(
        response.xpath(
            '//*[@id="oneboolt"]/tbody/tr[last()]/td/div/span[@itemprop="collectedCount"]//text()'
        ).extract()).strip()
    print('collect_num:', collect_num)
    item["collect_num"] = collect_num
    reward_num = None
    item["reward_num"] = reward_num
    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)
    # The click counter is rendered by JS, so a plain download can't see it;
    # an earlier attempt drove Selenium/PhantomJS directly:
    # driver = webdriver.PhantomJS(executable_path='D:\WGSruanjian\phantomjs-2.1.1-windows/bin/phantomjs.exe')
    # driver = webdriver.Chrome()
    # socket.setdefaulttimeout(20)
    # driver.get(src_url)
    # click_num = driver.find_element_by_xpath('//*[@id="totleclick"]').text
    # driver.close()
    # print('click_num:', click_num)
    # item["click_num"] = click_num
    yield SplashRequest(url=src_url,
                        callback=self.parse_page,
                        args={'wait': 0.5},
                        meta={'item': item},
                        dont_filter=True)
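# The SplashRequest above only works with scrapy-splash wired into the
# project settings. The standard configuration from the scrapy-splash
# README looks like this (SPLASH_URL assumes a local Splash container,
# e.g. started with `docker run -p 8050:8050 scrapinghub/splash`):
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'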