def parse(self, response):
    """Parse a heiyan.com book page (platform P31) into a
    TNovelSummaryItem, then request the click-count detail endpoint.

    Fix: dropped the unused ``text = response.text`` local.
    """
    print('1,=========================', response.url)
    item = TNovelSummaryItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    # Book title from <h2>, normalised into a product number.
    product_number = ''.join(response.xpath('//h2/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P31'  # platform code for heiyan.com
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    # Latest chapter number, written in Chinese numerals ("第...章").
    Chapter_num_update = ''.join(
        response.xpath('//h4/a/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第([\u4e00-\u9fa5]{1,10})章', Chapter_num_update,
                   re.I | re.M))
    Chapter_num_update = chinese_to_arabic(Chapter_num_update)
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)

    update_date = ''.join(
        response.xpath('//h4/span[@class="time"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    words = ''.join(
        response.xpath('//span[@class="words"]/text()').extract()).strip()
    words = ''.join(re.findall(r'(\d+)字', words, re.I | re.M))
    item["words"] = words
    print('words:', words)

    # These metrics are not available on this page.
    item["tickets_num"] = None
    item["score"] = None
    item["reward_num"] = None

    bookId = ''.join(re.findall(r'book\/(\d+)', src_url, re.I | re.M))
    link = 'http://a.heiyan.com/ajax/book/extend/{}/detail'.format(bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_click_num,
                         meta={'item': item, 'bookId': bookId},
                         dont_filter=True)
def parse(self, response):
    """Parse a book page (platform P33) into a TNovelSummaryItem and
    yield it directly.

    Fix: dropped the unused ``text = response.text`` local.
    """
    print('1,======================', response.url)
    item = TNovelSummaryItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P33'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    # Latest chapter number, e.g. "第123章 ..." -> "123".
    Chapter_num_update = ''.join(
        response.xpath('//div[@class="update"]/p/a/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.S))
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)

    update_date = ''.join(
        response.xpath(
            '//div[@class="update"]/p/span/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    words = ''.join(
        response.xpath(
            '//div[@class="book-info"]/p[@class="total"]//text()').extract(
            )).strip()
    words = ''.join(re.findall(r'(.*?)字\|', words, re.I | re.S))
    words = process_number(words)
    item["words"] = words
    print('words:', words)

    click_num = ''.join(
        response.xpath(
            '//div[@class="book-info"]/p[@class="total"]//text()').extract(
            )).strip()
    click_num = ''.join(re.findall(r'([0-9]+\.[0-9]+万)总点击', click_num))
    print('click_num:', click_num)
    click_num = process_number(click_num)
    item["click_num"] = click_num
    print('click_num:', click_num)

    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)

    # Approximate comment count: last pagination page number * 10
    # (10 comments per page); no pagination -> assume a single page of 10.
    comment_num = ''.join(
        response.xpath(
            '//div[@class="lbf-pagination"]/ul/li[last()-1]/a/text()').
        extract()).strip()
    if comment_num:
        comment_num = int(comment_num) * 10
    else:
        comment_num = 10
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)

    # Not available on this page.
    item["score"] = None
    item["collect_num"] = None
    item["reward_num"] = None

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    print(item)
    yield item
def parse(self, response):
    """Parse a qidian.com book info page (platform P20), then request
    the chapter-list AJAX endpoint.

    Fix: the previously unused ``_csrfToken`` local is now interpolated
    into the AJAX URL instead of being duplicated as a hard-coded string
    (the resulting URL is byte-identical).
    """
    print('1,=================', response.url)
    text = response.text
    url = response.url
    item = TNovelSummaryItem()

    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    item["product_number"] = product_number
    print('product_number:', product_number)

    plat_number = 'P20'  # platform code for qidian.com
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    update_date = ''.join(
        response.xpath(
            '//p[@class="cf"]/em[@class="time"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)

    # "暂无评分" means "no rating yet" -> store 0; otherwise keep the raw text.
    score_s = ''.join(
        response.xpath('//*[@id="j_bookScore"]//text()').extract()).strip()
    if '暂无评分' in score_s:
        score = 0
    else:
        score = score_s
    item["score"] = score
    print('score:', score)

    collect_num = None  # not available on this page
    item["collect_num"] = collect_num
    print('collect_num:', collect_num)

    reward_num = ''.join(
        response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
    item["reward_num"] = reward_num
    print('reward_num:', reward_num)

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    authorId = ''.join(
        response.xpath(
            '//*[@id="authorId"]/@data-authorid').extract()).strip()
    print('authorId:', authorId)
    # Channel id only appears in inline script, not in the DOM.
    chanId = re.findall(r'chanId\=(\d+)', text)[0]
    print('chanId:', chanId)
    bookId = ''.join(
        re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url,
                   re.I | re.M))
    print('bookId:', bookId)

    _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
    link = ('https://book.qidian.com/ajax/book/category'
            '?_csrfToken={}&bookId={}').format(_csrfToken, bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_Chapter_num,
                         meta={'item': item, 'authorId': authorId,
                               'chanId': chanId, 'bookId': bookId},
                         dont_filter=True)
def parse(self, response):
    """Parse a zongheng.com book page (platform P21), then POST to the
    async info endpoint for the remaining fields.

    Fix: the original guard ``if '【' and '】' in product_number:`` only
    tested for '】' — the '【' literal is always truthy, so ``and`` returns
    the membership test on '】' alone. Both full-width brackets are now
    normalised unconditionally; ``str.replace`` is a no-op when the
    character is absent, so behaviour is otherwise unchanged.
    Also hoisted the vote_info text, which was extracted three times.
    """
    print('1,=======================', response.url)
    text = response.text
    item = TNovelSummaryItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/h1/a/text()').
        extract()).strip()
    # Normalise full-width brackets before deriving the product number.
    product_number = product_number.replace('【', '[').replace('】', ']')
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P21'  # platform code for zongheng.com
    item["plat_number"] = plat_number
    print('plat_number:', plat_number)

    # Latest instalment, written as "第...部" in Chinese numerals.
    # Field is only set when the page actually shows an update line.
    Chapter_num_update = ''.join(
        response.xpath(
            '//div[@class="update box"]/div[@class="cont"]/a/text()').
        extract()).strip()
    if Chapter_num_update:
        Chapter_num_update = ''.join(
            re.findall(u'第(.*?)部', Chapter_num_update, re.I | re.M))
        Chapter_num_update = chinesedigits(Chapter_num_update)
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)

    # Keep only the first line; strip the decorative '·' bullet.
    update_date = ''.join(
        response.xpath(
            '//div[@class="update box"]/div[@class="uptime"]/text()').
        extract()).strip().split('\n')[0].replace('·', '')
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    words = ''.join(
        response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/div[@class="booksub"]/span[@title]/text()'
        ).extract()).strip()
    item["words"] = words
    print('words:', words)

    # Clicks / comments / collections all live in the same vote_info block.
    vote_text = ' '.join(
        response.xpath('//div[@class="vote_info"]/p//text()').extract()).strip()

    if vote_text:
        click_num = ''.join(
            re.findall(r'总点击: (\d+)', vote_text, re.I | re.M))
    else:
        click_num = None
    item["click_num"] = click_num
    print('click_num:', click_num)

    if vote_text:
        comment_num = ''.join(
            re.findall(r'评论数: (\d+)', vote_text, re.I | re.M))
    else:
        comment_num = None
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)

    item["score"] = None  # not available on this page

    if vote_text:
        collect_num = ''.join(
            re.findall(r'总收藏: (\d+)', vote_text, re.I | re.M))
    else:
        collect_num = None
    item["collect_num"] = collect_num
    print('collect_num:', collect_num)

    item["reward_num"] = None  # not available on this page

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    # bookId only appears in inline script, not in the DOM.
    bookId = ''.join(re.findall(r'bookId=\"(\d+)\"', text, re.I | re.M))
    print('bookId:', bookId)
    link = 'http://book.zongheng.com/book/async/info.htm'
    formdata = {"bookId": bookId}
    yield scrapy.FormRequest(url=link,
                             formdata=formdata,
                             callback=self.parse_page,
                             meta={'item': item},
                             dont_filter=True)
def parse(self, response):
    """Parse a qidian.com book page whose numbers are obfuscated with a
    per-request anti-spider web font; decode them via get_words(), then
    request the comment AJAX endpoint.

    Fix: the previously unused ``_csrfToken`` local is now interpolated
    into the AJAX URL instead of being duplicated as a hard-coded string
    (the resulting URL is byte-identical).
    """
    print('1,=================', response.url)
    text = response.text
    url = response.url
    item = TNovelSummaryItem()

    src_url = url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath('//h1/em/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    item["product_number"] = product_number
    print('product_number:', product_number)

    plat_number = 'P20'  # platform code for qidian.com
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    tickets_num = ''.join(
        response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
    item["tickets_num"] = tickets_num
    print('tickets_num:', tickets_num)

    reward_num = ''.join(
        response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
    item["reward_num"] = reward_num
    print('reward_num:', reward_num)

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    # The anti-spider font name doubles as the CSS class on the number
    # spans; the matching .woff is fetched to decode the glyphs.
    font_type = ''.join(
        response.xpath(
            '//div[@class="book-info "]/p/em[1]/span[@class]/@class').
        extract()).strip()
    print('font_type:', font_type)
    font_url = 'https://qidian.gtimg.com/qd_anti_spider/{}.woff'.format(
        font_type)
    print('font_url:', font_url)

    # Word count, given in units of 万 (10,000) characters.
    # NOTE(review): the '|' alternation makes the second branch yield an
    # empty capture if it matches first — presumably a fallback for a
    # page variant; verify against live markup.
    words = re.findall(
        r'</style><span class="{}">(.*)</span></em><cite>万字</cite><i>|</i><em><style>'
        .format(font_type), response.text, re.I | re.M)[0]
    print('words:', words)
    words = get_words(words, font_url)
    words = int(float(words) * 10000)
    item["words"] = words
    print('words:', words)

    # Total member clicks, also font-obfuscated and in units of 万.
    click_num = re.findall(
        r'</style><span class="{}">(.*)</span></em><cite>万总会员点击<span>'.
        format(font_type), response.text, re.I | re.M)[0]
    click_num = re.findall(
        r'</style><span class="{}">(.*)'.format(font_type), click_num,
        re.I | re.M)[0]
    print('click_num:', click_num)
    click_num = get_words(click_num, font_url)
    click_num = int(float(click_num) * 10000)
    item["click_num"] = click_num
    print('click_num:', click_num)

    update_date = ''.join(
        response.xpath(
            '//li[@class="update"]/div[@class="detail"]/p[@class="cf"]/em/text()'
        ).extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    item["collect_num"] = None  # not available on this page

    authorId = ''.join(
        response.xpath(
            '//*[@id="authorId"]/@data-authorid').extract()).strip()
    print('authorId:', authorId)
    # Channel id only appears in inline script, not in the DOM.
    chanId = re.findall(r'chanId\=(\d+)', text)[0]
    print('chanId:', chanId)
    bookId = ''.join(
        re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url,
                   re.I | re.M))
    print('bookId:', bookId)

    _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
    link = ('https://book.qidian.com/ajax/comment/index'
            '?_csrfToken={}&bookId={}&pageSize=15').format(_csrfToken, bookId)
    yield scrapy.Request(url=link,
                         callback=self.parse_page_score,
                         meta={'item': item, 'authorId': authorId,
                               'chanId': chanId, 'bookId': bookId},
                         dont_filter=True)
def parse(self, response):
    """Parse a book page (platform P32) into a TNovelSummaryItem and
    yield it directly.

    Fix: dropped the unused ``text = response.text`` local.
    """
    print('1,==========================', response.url)
    item = TNovelSummaryItem()

    src_url = response.url
    item["src_url"] = src_url
    print('src_url:', src_url)

    product_number = ''.join(
        response.xpath(
            '//h1[@class="fllf"]/a[@title]/text()').extract()).strip()
    print('product_number:', product_number)
    product_number = get_product_number(product_number)
    print('product_number:', product_number)
    item["product_number"] = product_number

    plat_number = 'P32'
    print('plat_number:', plat_number)
    item["plat_number"] = plat_number

    # Latest chapter number, e.g. "第123章 ..." -> "123".
    Chapter_num_update = ''.join(
        response.xpath(
            '//h3[@class="bom10"]/a[@class="cboy"]/text()').extract()).strip()
    Chapter_num_update = ''.join(
        re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
    item["Chapter_num_update"] = Chapter_num_update
    print('Chapter_num_update:', Chapter_num_update)

    update_date = ''.join(
        response.xpath(
            '//h3[@class="bom10"]/span[@class="lf10"]/text()').extract()).strip()
    update_date = parse_time(update_date)
    item["update_date"] = update_date
    print('update_date:', update_date)

    words = ' '.join(
        response.xpath(
            '//div[@class="right"]/p[@class="infor bom10"]/span/text()').
        extract()).strip()
    words = ''.join(re.findall(r'总字数:(.*?)\s', words, re.I | re.M))
    words = process_number(words)
    item["words"] = words
    print('words:', words)

    click_num = ' '.join(
        response.xpath(
            '//div[@class="right"]/p[@class="infor bom10"]/span/text()').
        extract()).strip()
    click_num = ''.join(re.findall(r'点击:(.*?)\s ', click_num, re.I | re.M))
    print('click_num:', click_num)
    # A "万" suffix means tens of thousands; scale accordingly.
    if '万' in click_num:
        click_num = click_num.replace('万', '')
        click_num = int(atof(click_num) * 10000)
    else:
        click_num = int(atof(click_num))
    item["click_num"] = click_num
    print('click_num:', click_num)

    item["tickets_num"] = None  # not available on this page

    # Comment count appears as "最新书评(N)" on the comment tab; strip
    # the full-width parentheses around N.
    comment_num = ''.join(
        response.xpath(
            '//div[@category="comment"]/a[@class="tabfmbtn cboy"]/text()').
        extract()).strip()
    comment_num = ''.join(
        re.findall(r'最新书评(.*)', comment_num,
                   re.I | re.S)).replace('(', '').replace(')', '')
    comment_num = int(atof(comment_num))
    item["comment_num"] = comment_num
    print('comment_num:', comment_num)

    # Not available on this page.
    item["score"] = None
    item["collect_num"] = None
    item["reward_num"] = None

    last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    item["last_modify_date"] = last_modify_date
    print('last_modify_date:', last_modify_date)

    print(item)
    yield item