예제 #1
0
 def parse(self, response):
     """Build a signed POST to the Shuqi PC API for the book on this cover page.

     The API requires sign = md5(bid + timestamp + page key); the response
     is handled by parse_page_p with the bid and partial item in meta.
     """
     print('1,=======================', response.url)
     item = TNovelSummaryItem()
     item["src_url"] = response.url
     print('src_url:', response.url)
     # The book id is whatever follows the cover-page query prefix.
     book_id = response.url.replace('http://www.shuqi.com/cover.php?bid=', '')
     print('bid:', book_id)
     ts = str(int(time.time()))[0:10]
     print('timestamp:', ts)
     page_key = "f2850e634f85f485d719314ae3cfe252"
     # API signature: md5 over the concatenation bid + timestamp + key.
     digest = hashlib.md5(
         (book_id + ts + page_key).encode(encoding='UTF-8')).hexdigest()
     print('sign:', digest)
     yield scrapy.FormRequest(
         url='https://ognv1.sqreader.com/index.php?r=pcapi/pcbook/bookinfo',
         formdata={
             'bid': book_id,
             'timestamp': ts,
             'sign': digest,
         },
         callback=self.parse_page_p,
         meta={
             'bid': book_id,
             'item': item
         },
         dont_filter=True,
     )
예제 #2
0
 def parse(self, response):
     """Parse a heiyan.com book page into a TNovelSummaryItem.

     Fills in source URL, title, platform code, latest chapter number,
     update time and word count, then chains a request to the book's
     extend/detail AJAX endpoint so parse_page_click_num can add the
     click count before the item is emitted.
     """
     print('1,=========================', response.url)
     text = response.text
     # print(text)
     item = TNovelSummaryItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     # Book title sits in the page's <h2>; get_product_number (project
     # helper) normalises it.
     product_number = ''.join(
         response.xpath('//h2/text()').extract()).strip()
     print('product_number:', product_number)
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     # Hard-coded platform identifier for this source site.
     plat_number = 'P31'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     # Latest chapter link text carries "第<Chinese numeral>章"; extract the
     # numeral and convert it to an Arabic number.
     Chapter_num_update = ''.join(
         response.xpath('//h4/a/text()').extract()).strip()
     Chapter_num_update = ''.join(
         re.findall(r'第([\u4e00-\u9fa5]{1,10})章', Chapter_num_update,
                    re.I | re.M))
     Chapter_num_update = chinese_to_arabic(Chapter_num_update)
     item["Chapter_num_update"] = Chapter_num_update
     print('Chapter_num_update:', Chapter_num_update)
     # Update time shown next to the latest chapter; parse_time (project
     # helper) normalises the raw text.
     update_date = ''.join(
         response.xpath(
             '//h4/span[@class="time"]/text()').extract()).strip()
     update_date = parse_time(update_date)
     item["update_date"] = update_date
     print('update_date:', update_date)
     # Word count appears as "<digits>字"; keep just the digits.
     words = ''.join(
         response.xpath('//span[@class="words"]/text()').extract()).strip()
     words = ''.join(re.findall(r'(\d+)字', words, re.I | re.M))
     item["words"] = words
     print('words:', words)
     # These metrics are not scraped from this page; stored as None.
     tickets_num = None
     item["tickets_num"] = tickets_num
     score = None
     item["score"] = score
     reward_num = None
     item["reward_num"] = reward_num
     # Book id is the numeric segment after "book/" in the page URL.
     bookId = ''.join(re.findall(r'book\/(\d+)', src_url, re.I | re.M))
     link = 'http://a.heiyan.com/ajax/book/extend/{}/detail'.format(bookId)
     # print('link:',link)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_click_num,
                          meta={
                              'item': item,
                              'bookId': bookId
                          },
                          dont_filter=True)
예제 #3
0
    def parse(self, response):
        """Parse a 3gsc.com.cn book page into a TNovelSummaryItem.

        Scrapes the summary fields visible on the page, then chains a
        stats request (handled by parse_page) and, when a forum link is
        present, a comment-count request (parse_page_comment_num).
        """
        print('1,==========================', response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//h1[@class="RecArticle"]//text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        # Hard-coded platform identifier for this source site.
        plat_number = 'P30'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        # Latest chapter number: digits found in the chapter-tab header.
        Chapter_num_update = ''.join(
            response.xpath(
                '//*[@id="con_ListStyleBTab2C_1"]/div[@class="area"]/span/a/b/text()'
            ).extract()).strip()
        Chapter_num_update = ''.join(
            re.findall(r'(\d+)', Chapter_num_update, re.I | re.M))
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        # Update time: every "YYYY-MM-DD HH:MM:SS" stamp in the tab text,
        # concatenated (normally there is just one).
        update_date = ''.join(
            response.xpath(
                '//*[@id="con_ListStyleBTab2C_1"]/div[@class="area"]/span/text()'
            ).extract()).strip()
        update_date = ''.join(
            re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', update_date,
                       re.I | re.M))
        item["update_date"] = update_date
        print('update_date:', update_date)
        # Ticket and reward counts come from the interaction panel
        # (li ids Interbt3 / Interbt1) — presumably vote tickets and
        # rewards; verify against the live page.
        tickets_num = ''.join(
            response.xpath(
                '//div[@class="inter_con"]/div[@class="give"]/ul/li[@id="Interbt3"]/p/span/text()'
            ).extract()).strip()
        item["tickets_num"] = tickets_num
        print('tickets_num:', tickets_num)
        reward_num = ''.join(
            response.xpath(
                '//div[@class="inter_con"]/div[@class="give"]/ul/li[@id="Interbt1"]/p/span/text()'
            ).extract()).strip()
        item["reward_num"] = reward_num
        print('reward_num:', reward_num)
        # No score is scraped from this page.
        score = None
        item["score"] = score
        # Book id is the numeric segment after "book/" in the page URL.
        book_id = ''.join(re.findall(r'book\/(\d+)', src_url, re.I | re.M))
        print('book_id:', book_id)
        link = 'http://www.3gsc.com.cn/BookLazyload/getstatis?book_id={}'.format(
            book_id)
        yield scrapy.Request(url=link,
                             callback=self.parse_page,
                             meta={
                                 'item': item,
                                 'book_id': book_id
                             },
                             dont_filter=True)

        # Comment counts live on a separate forum page; follow its link
        # when present, otherwise just log that it was missing.
        comment_num_link = ''.join(
            response.xpath(
                '//div[@class="forum_stati"]/div[@class="opt"]/a/@href').
            extract()).strip()
        if comment_num_link:
            comment_num_link = 'http://www.3gsc.com.cn' + comment_num_link
            yield scrapy.Request(url=comment_num_link,
                                 callback=self.parse_page_comment_num,
                                 meta={'item': item},
                                 dont_filter=True)
        else:
            print('没找到连接////')
예제 #4
0
    def parse(self, response):
        """Parse a book detail page into a complete TNovelSummaryItem.

        Every field this spider needs is available on this one page, so
        the finished item is yielded directly instead of chaining
        further requests.
        """
        print('1,======================', response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath('//h1/em/text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        # Hard-coded platform identifier for this source site.
        plat_number = 'P33'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        # Latest chapter: the N out of "第N章" in the update line.
        Chapter_num_update = ''.join(
            response.xpath(
                '//div[@class="update"]/p/a/text()').extract()).strip()
        Chapter_num_update = ''.join(
            re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.S))
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        # parse_time (project helper) normalises the raw site timestamp.
        update_date = ''.join(
            response.xpath(
                '//div[@class="update"]/p/span/text()').extract()).strip()
        update_date = parse_time(update_date)
        item["update_date"] = update_date
        print('update_date:', update_date)
        # Word count is the text preceding "字|" in the stats line;
        # process_number (project helper) normalises it.
        words = ''.join(
            response.xpath(
                '//div[@class="book-info"]/p[@class="total"]//text()').extract(
                )).strip()
        words = ''.join(re.findall(r'(.*?)字\|', words, re.I | re.S))
        words = process_number(words)
        item["words"] = words
        print('words:', words)
        # Click count precedes "总点击" in the same stats line, e.g. "1.2万".
        click_num = ''.join(
            response.xpath(
                '//div[@class="book-info"]/p[@class="total"]//text()').extract(
                )).strip()
        click_num = ''.join(re.findall(r'([0-9]+\.[0-9]+万)总点击', click_num))
        print('click_num:', click_num)
        click_num = process_number(click_num)
        item["click_num"] = click_num
        print('click_num:', click_num)
        tickets_num = ''.join(
            response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
        item["tickets_num"] = tickets_num
        print('tickets_num:', tickets_num)
        # Comment total is approximated from the pagination widget: the
        # second-to-last <li> holds the last page number, multiplied by an
        # assumed 10 comments per page; one page's worth otherwise.
        comment_num = ''.join(
            response.xpath(
                '//div[@class="lbf-pagination"]/ul/li[last()-1]/a/text()').
            extract()).strip()
        if comment_num:
            comment_num = int(comment_num) * 10
        else:
            comment_num = 10
        item["comment_num"] = comment_num
        print('comment_num:', comment_num)
        # These metrics are not scraped from this page.
        score = None
        item["score"] = score
        collect_num = None
        item["collect_num"] = collect_num
        reward_num = None
        item["reward_num"] = reward_num
        # Record when this spider produced the item.
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)

        print(item)
        yield item
예제 #5
0
 def parse(self, response):
     """Parse a qidian.com book info page.

     Scrapes the directly-visible summary fields, then chains to the
     book/category AJAX endpoint so parse_page_Chapter_num can count
     chapters before the item is finished.
     """
     print('1,=================', response.url)
     text = response.text
     # print(text)
     url = response.url
     item = TNovelSummaryItem()
     src_url = url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath('//h1/em/text()').extract()).strip()
     print('product_number:', product_number)
     product_number = get_product_number(product_number)
     item["product_number"] = product_number
     print('product_number:', product_number)
     # Hard-coded platform identifier for this source site.
     plat_number = 'P20'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     # parse_time (project helper) normalises the raw update-time text.
     update_date = ''.join(
         response.xpath('//p[@class="cf"]/em[@class="time"]/text()').
         extract()).strip()
     update_date = parse_time(update_date)
     item["update_date"] = update_date
     print('update_date:', update_date)
     # words = ''.join(response.xpath('//div[@class="book-info "]/p[3]/em[1]/span/text()').extract()).strip()
     # item["words"] = words
     # print('words:',words)
     tickets_num = ''.join(
         response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
     item["tickets_num"] = tickets_num
     print('tickets_num:', tickets_num)
     # Page shows '暂无评分' ("no rating yet") for unrated books → score 0.
     score_s = ''.join(
         response.xpath(
             '//*[@id="j_bookScore"]//text()').extract()).strip()
     if '暂无评分' in score_s:
         score = 0
     else:
         score = score_s
     item["score"] = score
     print('score:', score)
     collect_num = None
     item["collect_num"] = collect_num
     print('collect_num:', collect_num)
     reward_num = ''.join(
         response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
     item["reward_num"] = reward_num
     print('reward_num:', reward_num)
     # Record when this spider produced the item.
     last_modify_date = datetime.datetime.now().strftime(
         '%Y-%m-%d %H:%M:%S')
     item["last_modify_date"] = last_modify_date
     print('last_modify_date:', last_modify_date)
     authorId = ''.join(
         response.xpath(
             '//*[@id="authorId"]/@data-authorid').extract()).strip()
     print('authorId:', authorId)
     # NOTE(review): [0] raises IndexError when "chanId=" is absent from
     # the page source — confirm every book page embeds it.
     chanId = re.findall(r'chanId\=(\d+)', text)[0]
     print('chanId:', chanId)
     bookId = ''.join(
         re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url,
                    re.I | re.M))
     print('bookId:', bookId)
     # NOTE(review): hard-coded CSRF token (duplicated inside the URL
     # below) will break once the site rotates it.
     _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
     link = 'https://book.qidian.com/ajax/book/category?_csrfToken=HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j&bookId={}'.format(
         bookId)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_Chapter_num,
                          meta={
                              'item': item,
                              'authorId': authorId,
                              'chanId': chanId,
                              'bookId': bookId
                          },
                          dont_filter=True)
예제 #6
0
    def parse(self, response):
        """Parse a www.xiang5.com book page into a TNovelSummaryItem.

        Non-200 responses are retried once with full browser-like
        headers (the site rejects bare requests).  On a 200 response all
        summary fields are scraped and the finished item is yielded
        directly.
        """
        print('1,=========================', response.url)
        # Browser-impersonating headers (session cookie included) used
        # only for the retry below.
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # "Accept-Encoding": "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "UM_distinctid=164daf563af44a-0b77993c46f21-6114147a-100200-164daf563b0422; canal=0; schannelm=0; www_say=775abff326250295f011c827045e4f45; PHPSESSID=imbduief49qgu8e9mttjtb03u1; Hm_lvt_688746b9e4f9d33e0e2ce6aeffb4fa58=1535520731,1535597551; counter=zixing; countertime=2018/8/30; _jzqc=1; _qzjc=1; CNZZDATA1253179669=891460335-1532681545-%7C1535606694; Hm_lpvt_688746b9e4f9d33e0e2ce6aeffb4fa58=1535610428; uuid=2AE001D147E7F1C7E3026160C9234536; marks=13; _qzja=1.754836888.1532681874609.1535597551629.1535610427783.1535602634139.1535610427783.0.0.0.22.6; _qzjto=18.2.0; _jzqa=1.4298357099084273000.1532681875.1535597552.1535610428.6; _jzqx=1.1535610428.1535610428.1.jzqsr=xiang5%2Ecom|jzqct=/.-; _jzqckmp=1; _jzqb=1.1.10.1535610428.1; _qzjb=1.1535610427783.1.0.0.0",
            "Host":
            "www.xiang5.com",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        }
        if response.status != 200:
            # Retry the same URL with headers; dont_filter lets the
            # duplicate URL through scrapy's dedup filter.
            yield scrapy.Request(url=response.url,
                                 headers=headers,
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            print('请求成功>>>')
            item = TNovelSummaryItem()
            src_url = response.url
            item["src_url"] = src_url
            print('src_url:', src_url)
            product_number = ' '.join(
                response.xpath('//div[@class=" fr worksLR"]/h4/text()').
                extract()).strip()
            print('product_number:', product_number)
            product_number = get_product_number(product_number)
            print('product_number:', product_number)
            item["product_number"] = product_number
            # Hard-coded platform identifier for this source site.
            plat_number = 'P35'
            print('plat_number:', plat_number)
            item["plat_number"] = plat_number
            # Latest chapter: the N out of "第N章" in the header link.
            Chapter_num_update = ''.join(
                response.xpath(
                    '//div[@class="worksL2"]/h2/b[@class="colR"]/a/text()').
                extract()).strip()
            Chapter_num_update = ''.join(
                re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
            item["Chapter_num_update"] = Chapter_num_update
            print('Chapter_num_update:', Chapter_num_update)
            update_date = ''.join(
                response.xpath(
                    '//div[@class="worksL2"]/h2/text()').extract()).strip()
            # print('update_date:', update_date)
            # BUGFIX: the original called re.match(...).group() unguarded
            # and crashed with AttributeError whenever the <h2> text did
            # not start with a "YYYY-MM-DD HH:MM:SS" stamp; guard the
            # match and fall back to an empty string.  Also tightened the
            # sloppy [\d+] character classes (which matched '+' as well
            # as digits) to plain \d.
            date_match = re.match(
                r'(\d{4}\-\d{2}\-\d{2}\s\d{2}\:\d{2}\:\d{2})',
                update_date, re.I | re.S)
            update_date = date_match.group() if date_match else ''
            item["update_date"] = update_date
            print('update_date:', update_date)
            # Word/click/collect counts share the workSecHit stat spans,
            # each tagged with its own "字数:"/"点击:"/"收藏:" label.
            words = ''.join(
                response.xpath('//div[@class="workSecHit"]/span/text()').
                extract()).strip()
            words = ' '.join(re.findall(r'字数:(.*)', words, re.I | re.M))
            words = process_number(words)
            item["words"] = words
            print('words:', words)
            click_num = ''.join(
                response.xpath('//div[@class="workSecHit"]/span/text()').
                extract()).strip()
            click_num = ''.join(
                re.findall(r'点击:(\d+)\s', click_num, re.I | re.M))
            item["click_num"] = click_num
            print('click_num:', click_num)
            collect_num = ''.join(
                response.xpath('//div[@class="workSecHit"]/span/text()').
                extract()).strip()
            collect_num = ''.join(
                re.findall(r'收藏:(\d+)\s', collect_num, re.I | re.M))
            item["collect_num"] = collect_num
            print('collect_num:', collect_num)
            # Not available on this page.
            tickets_num = None
            item["tickets_num"] = tickets_num
            # Comment count: digits in the comment-section header.
            comment_num = ''.join(
                response.xpath(
                    '//*[@id="pinglun"]/h4/span[@class="fl"]/b/text()').
                extract()).strip()
            comment_num = ''.join(
                re.findall(r'(\d+)', comment_num, re.I | re.M))
            item["comment_num"] = comment_num
            print('comment_num:', comment_num)
            score = None
            item["score"] = score
            reward_num = None
            item["reward_num"] = reward_num
            # Record when this spider produced the item.
            last_modify_date = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item["last_modify_date"] = last_modify_date
            print('last_modify_date:', last_modify_date)

            print(item)
            yield item
예제 #7
0
    def parse(self, response):
        """Parse a 17k.com book page into a TNovelSummaryItem.

        Scrapes the on-page summary fields, then chains to the 17k stat
        API so parse_page_click_num can add the click count.
        """
        print('1,=======================', response.url)
        text = response.text
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()').
            extract()).strip()
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        # Hard-coded platform identifier for this source site.
        plat_number = 'P22'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        # Chapter number is not scraped on this site.
        Chapter_num_update = None
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        # The site shows the update time to minute precision prefixed
        # with "更新:"; the current seconds are appended to round it out
        # to a full HH:MM:SS-style timestamp.
        update_date_s = ''.join(
            response.xpath('//dl[@class="Tab"]/dt[@class="tit"]/em/text()').
            extract()).strip().replace('更新:', '').strip()
        update_date = update_date_s + datetime.datetime.now().strftime(':%S')
        item["update_date"] = update_date
        print('update_date:', update_date)
        # timeArray = time.strptime(update_date_s, "%Y-%m-%d %H:%M")
        # timeStamp = int(time.mktime(timeArray))
        # print('timeStamp:', timeStamp)
        words = ''.join(
            response.xpath(
                '//div[@class="BookData"]/p[last()-1]/em[@class="red"]/text()'
            ).extract()).strip()
        item["words"] = words
        print('words:', words)
        # These metrics are not scraped from this page.
        tickets_num = None
        item["tickets_num"] = tickets_num
        score = None
        item["score"] = score
        collect_num = None
        item["collect_num"] = collect_num
        reward_num = None
        item["reward_num"] = reward_num
        # Record when this spider produced the item.
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)

        # NOTE(review): [0] raises IndexError when no /zuozhe/<id> author
        # link exists in the page — confirm every book page has one.
        authorId = re.findall(r'\/zuozhe\/(\d+)', text, re.I | re.M)[0]
        print('authorId:', authorId)
        # NOTE(review): fragile — [\d+]{4,6} also matches '+', and joining
        # findall concatenates EVERY 4-6 char digit run in the URL, not
        # just the book id.
        bookId = ''.join(re.findall(r'([\d+]{4,6})', src_url, re.I | re.M))
        print('bookId:', bookId)
        click_num_link = 'http://api.ali.17k.com/v2/book/{}/stat_info?app_key=3362611833&click_info=1&hb_info=1&flower_info=1&stamp_info=1&cps_source='.format(
            bookId)
        yield scrapy.Request(url=click_num_link,
                             callback=self.parse_page_click_num,
                             meta={
                                 'item': item,
                                 'bookId': bookId
                             },
                             dont_filter=True)
예제 #8
0
    def parse(self, response):
        """Parse a yunqi.qq.com book page into a TNovelSummaryItem.

        Scrapes title, latest chapter number (see _extract_chapter_num),
        update time, word and click counts, then chains a request to the
        novel's comment page so parse_page can finish the item.
        """
        print('1,========================',response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        url = response.url
        src_url = url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(response.xpath('//img[@class="qqredaer_tit"]/@title').extract()).strip()
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        # Hard-coded platform identifier for this source site.
        plat_number = 'P17'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        Chapter_num_update_s = ''.join(response.xpath('//*[@id="newChapterList"]/div[@class="chaptername"]/b/a[@class="green"]/text()').extract()).strip()
        print('Chapter_num_update_s:', Chapter_num_update_s)
        # REFACTOR: the original walked five copy-pasted nested if/else
        # levels to try each title pattern in turn; the cascade now lives
        # in _extract_chapter_num with identical pattern order.
        Chapter_num_update = self._extract_chapter_num(Chapter_num_update_s)
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)

        update_date = ''.join(response.xpath('//*[@id="newChapterList"]/div[@class="chaptername"]/text()').extract()).strip()
        if update_date:
            # Shown wrapped as "(更新时间:...)"; strip the wrapper text.
            update_date = update_date.replace('(更新时间:','').replace(')','')
        else:
            # No chapter list: fall back to today's date.
            update_date = datetime.datetime.now().strftime('%Y-%m-%d')
        item["update_date"] = update_date
        print('update_date:',update_date)
        # Word and click counts share the stats table; each value follows
        # its "总字数:" / "阅文点击:" label.
        words = ' '.join(response.xpath('//div[@class="num"]/table/tr/td/text()').extract()).strip()
        if words:
            words = ''.join(re.findall(r'总字数:(\d+)',words, re.I|re.M))
        else:
            words = None
        item["words"] = words
        print('words:',words)
        click_num = ' '.join(response.xpath('//div[@class="num"]/table/tr/td/text()').extract()).strip()
        if click_num:
            click_num = ''.join(re.findall(r'阅文点击:(\d+)',click_num, re.I|re.M))
        else:
            click_num = None
        item["click_num"] = click_num
        print('click_num:',click_num)
        # Not available on this page.
        score = None
        item["score"] = score
        collect_num = None
        item["collect_num"] = collect_num
        # Record when this spider produced the item.
        last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        # Book id is the numeric ".html" stem of the page URL.
        bid = ''.join(re.findall(r'\/(\d+).html',url, re.I|re.M))
        print('bid:',bid)
        link = 'http://yunqi.qq.com/novelcomment/index.html?bid={}'.format(bid)
        yield scrapy.Request(url=link, callback=self.parse_page, meta={'item': item,'bid':bid}, dont_filter=True)

    def _extract_chapter_num(self, title):
        """Return the latest chapter number parsed from *title*, or None.

        Patterns are tried in the original priority order:
        1. Chinese numerals after '第…卷 第' (converted via chinesedigits),
        2. plain '第N章',
        3. digits right after the volume marker,
        4. a value inside 【】 brackets,
        5. a parenthesised number after a short Chinese word.
        """
        if not title:
            return None
        # Only the Chinese-numeral form needs numeric conversion.
        num = ''.join(
            re.findall(u'第.*?卷 第([\u4e00-\u9fa5]{4,10})\s', title,
                       re.I | re.M))
        if num:
            return chinesedigits(num)
        for pattern in (r'第(\d+)章',
                        r'第.*?卷 (\d+)',
                        r'第.*?卷 \【(.*?)\】',
                        r'第.*?卷 [\u4e00-\u9fa5]{0,6}\((\d+)\)'):
            num = ''.join(re.findall(pattern, title, re.I | re.M))
            if num:
                return num
        return None
예제 #9
0
 def parse(self, response):
     """Scrape a book summary page and yield the finished item directly.

     All fields are available on this one page; metrics the page does
     not expose are stored as None.
     """
     print('1,========================', response.url)
     text = response.text
     # print(text)
     page_url = response.url
     item = TNovelSummaryItem()
     item["src_url"] = page_url
     print('src_url:', page_url)
     title = ''.join(response.xpath('//h3/a/text()').extract()).strip()
     print('product_number:', title)
     title = get_product_number(title)
     print('product_number:', title)
     item["product_number"] = title
     # Hard-coded platform identifier for this source site.
     plat = 'P17'
     print('plat_number:', plat)
     item["plat_number"] = plat
     latest = ''.join(
         response.xpath('//div[@class="new_list"]/dl[1]/dd[1]/a/text()').
         extract()).strip()
     # "第N章" -> N; None when the newest-chapter list is empty.
     chapter_num = ''.join(
         re.findall(r'第(\d+)章', latest, re.I | re.M)) if latest else None
     item["Chapter_num_update"] = chapter_num
     print('Chapter_num_update:', chapter_num)
     updated = ''.join(
         response.xpath(
             '//div[@class="new_list"]/dl[1]/dd[@class="gray"]/text()').
         extract()).strip()
     if not updated:
         # Missing on the page: fall back to the current timestamp.
         updated = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     item["update_date"] = updated
     print('update_date:', updated)
     word_count = ''.join(
         response.xpath(
             '//div[@id="bookinfo"]/div[@class="book_info"]/dl[1]/dd[last()]/text()'
         ).extract()).strip()
     # Drop the '字' unit (a no-op when absent) before normalising.
     word_count = process_number(word_count.replace('字', ''))
     item["words"] = word_count
     print('words:', word_count)
     item["click_num"] = None
     item["tickets_num"] = None
     comment_count = ''.join(
         response.xpath('//*[@id="commentCount"]/text()').extract()).strip()
     item["comment_num"] = comment_count
     print('comment_num:', comment_count)
     rating = ''.join(
         response.xpath(
             '//*[@id="StarIco"]/span[@id="StarIcoValue"]/b//text()').
         extract()).strip()
     item["score"] = rating
     print('score:', rating)
     item["collect_num"] = None
     item["reward_num"] = None
     # Record when this spider produced the item.
     stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     item["last_modify_date"] = stamp
     print('last_modify_date:', stamp)
     yield item
예제 #10
0
 def parse(self, response):
     """Parse a yuedu.163.com book page into a TNovelSummaryItem.

     Scrapes the summary fields, then chains to the snsComment endpoint
     so parse_page_comment_num can add the comment count.
     """
     print('1,=======================', response.url)
     # print(response.text)
     item = TNovelSummaryItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     product_number = ''.join(
         response.xpath('//h3[@title]/em/text()').extract()).strip()
     print('product_number:', product_number)
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     # Hard-coded platform identifier for this source site.
     plat_number = 'P34'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     # Latest chapter: text between 第 and 章 in the active tab's
     # newest-update link, converted to an Arabic number.
     Chapter_num_update = ''.join(
         response.xpath(
             '//div[@class="contents"]/div[@class="tab-item crt"]/a[@class="m-newupdate"]/h4/text()'
         ).extract()).strip()
     Chapter_num_update = ''.join(
         re.findall(r'第(.*?)章', Chapter_num_update, re.I | re.S))
     Chapter_num_update = chinese_to_arabic(Chapter_num_update)
     item["Chapter_num_update"] = Chapter_num_update
     print('Chapter_num_update:', Chapter_num_update)
     # Update time is prefixed with "更新时间:"; parse_date (project
     # helper) normalises the remainder.
     update_date = ''.join(
         response.xpath(
             '//div[@class="contents"]/div[@class="tab-item crt"]/span[@class="updatetime"]/text()'
         ).extract()).replace('更新时间:', '').strip()
     print('update_date:', update_date)
     update_date = parse_date(update_date)
     item["update_date"] = update_date
     print('update_date:', update_date)
     # Word and click counts share the book-status table; each value sits
     # between its "字数:" / "点击:" label and the next newline.
     words = ''.join(
         response.xpath('//div[@class="m-bookstatus"]/table/tr//text()').
         extract()).strip()
     words = ''.join(re.findall(r'字数:(.*?)\n', words, re.I | re.M))
     words = process_number(words)
     item["words"] = words
     print('words:', words)
     click_num = ''.join(
         response.xpath('//div[@class="m-bookstatus"]/table/tr//text()').
         extract()).strip()
     click_num = ''.join(re.findall(r'点击:(.*?)\n', click_num, re.I | re.M))
     print('click_num:', click_num)
     click_num = process_number(click_num)
     item["click_num"] = click_num
     print('click_num:', click_num)
     score = ''.join(
         response.xpath(
             '//div[@class="starlevel"]/span[@class="score"]/text()').
         extract()).strip()
     item["score"] = score
     print('score:', score)
     # Not available on this page.
     collect_num = None
     item["collect_num"] = collect_num
     # Book id is everything after "source/" in the page URL.
     bookid = ''.join(re.findall(r'source\/(.*)', src_url, re.I | re.M))
     link = 'http://yuedu.163.com/snsComment.do?operation=get&type=2&id={}'.format(
         bookid)
     # print(link)
     yield scrapy.Request(url=link,
                          callback=self.parse_page_comment_num,
                          meta={
                              'item': item,
                              'bookid': bookid
                          },
                          dont_filter=True)
예제 #11
0
    def parse(self, response):
        """Parse a novel detail page (platform P18) into a TNovelSummaryItem.

        Scrapes title, latest chapter number, update date, word count,
        comment count, score and collect count, then yields the item.
        """
        print('1,=======================', response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        # Book title, normalised to the project-wide product number.
        product_number = ''.join(
            response.xpath(
                '//div[@class="bookname"]/h2/a/text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P18'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        Chapter_num_update = ''.join(
            response.xpath(
                '//div[@class="bookTit"]/div[@class="new"]/p/a/text()').
            extract()).strip()
        # print('Chapter_num_update:',Chapter_num_update)
        if Chapter_num_update:
            try:
                # Chapter index is usually written in Chinese numerals
                # ("第...章"); convert to an arabic number.
                Chapter_num_update = ''.join(
                    re.findall(u'第(.*?)章', Chapter_num_update))
                Chapter_num_update = chinesedigits(Chapter_num_update)
            except Exception:
                # Conversion failed (was a bare ``except:``, narrowed to
                # Exception): fall back to any arabic digits in the text.
                Chapter_num_update = ''.join(
                    re.findall(r'(\d+)', Chapter_num_update))
        else:
            Chapter_num_update = None
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)

        update_date = ''.join(
            response.xpath(
                '//div[@class="bookTit"]/div[@class="time"]/p/text()').extract(
                )).strip()
        if not update_date:
            # Page carries no timestamp; fall back to today's date.
            update_date = datetime.datetime.now().strftime('%Y-%m-%d')
        item["update_date"] = update_date
        print('update_date:', update_date)
        words = ''.join(
            response.xpath('//div[@class="bookinf01"]/p/span[2]/text()').
            extract()).strip()
        words = process_number(words)
        item["words"] = words
        print('words:', words)
        click_num = None  # not scraped on this site
        item["click_num"] = click_num
        print('click_num:', click_num)
        tickets_num = None  # not scraped on this site
        item["tickets_num"] = tickets_num
        comment_num = ''.join(
            response.xpath(
                '//div[@class="bookCir"]/div[@class="title"]/p/span[last()]/text()'
            ).extract()).strip()
        # print('comment_num:',comment_num)
        if '条' in comment_num:
            # Text looks like "...,N条".  Use [-1] instead of [1] so a
            # missing comma no longer raises IndexError.
            comment_num = comment_num.split(',')[-1]
            comment_num = ''.join(
                re.findall(r'(.*?)条', comment_num, re.I | re.M))
            # print('comment_num:',comment_num)
            if 'w+' in comment_num:
                # "1.2w+" shorthand -> plain integer.
                comment_num = process_number(comment_num)
        else:
            comment_num = 0
        item["comment_num"] = comment_num
        print('comment_num:', comment_num)
        score = ''.join(
            response.xpath(
                '//div[@class="bookinf01"]/div[@class="bookname"]/span/text()'
            ).extract()).strip()
        if not score:
            score = 0
        item["score"] = score
        print('score:', score)
        collect_num = ''.join(
            response.xpath('//div[@class="bookinf01"]/p/span[3]/b/text()').
            extract()).strip()
        print('collect_num:', collect_num)
        if collect_num:
            collect_num = process_number(collect_num)
        else:
            collect_num = 0
        item["collect_num"] = collect_num
        print('collect_num:', collect_num)
        reward_num = None  # not scraped on this site
        item["reward_num"] = reward_num
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        yield item
예제 #12
0
    def parse(self, response):
        """Parse a novel detail page (platform P21, zongheng) and chain a
        POST to the async book-info endpoint for the remaining fields.

        Fills src_url, product_number, chapter/update/words and the
        click/comment/collect counts, then issues a FormRequest handled
        by ``parse_page`` with the partially-filled item in ``meta``.
        """
        print('1,=======================', response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/h1/a/text()').
            extract()).strip()
        # BUG FIX: the original ``if '【' and '】' in s`` only tested for
        # '】' (operator precedence).  replace() is a no-op when the
        # character is absent, so normalise unconditionally.
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P21'
        item["plat_number"] = plat_number
        print('plat_number:', plat_number)
        Chapter_num_update = ''.join(
            response.xpath(
                '//div[@class="update box"]/div[@class="cont"]/a/text()').
            extract()).strip()
        if Chapter_num_update:
            # Chapter index in Chinese numerals ("第...部") -> arabic.
            Chapter_num_update = ''.join(
                re.findall(u'第(.*?)部', Chapter_num_update, re.I | re.M))
            Chapter_num_update = chinesedigits(Chapter_num_update)
        else:
            # Consistent with the other platform parsers: record None
            # instead of leaving the field unset when the page lacks it.
            Chapter_num_update = None
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        update_date = ''.join(
            response.xpath(
                '//div[@class="update box"]/div[@class="uptime"]/text()').
            extract()).strip().split('\n')[0].replace('·', '')
        update_date = parse_time(update_date)
        item["update_date"] = update_date
        print('update_date:', update_date)
        words = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/div[@class="booksub"]/span[@title]/text()'
            ).extract()).strip()
        item["words"] = words
        print('words:', words)
        # The vote_info block packs several stats into one text run; the
        # join separators below must match the regex spacing exactly.
        click_num = ' '.join(
            response.xpath(
                '//div[@class="vote_info"]/p//text()').extract()).strip()
        if click_num:
            click_num = ''.join(
                re.findall(r'总点击: (\d+)', click_num, re.I | re.M))
        else:
            click_num = None
        item["click_num"] = click_num
        print('click_num:', click_num)
        comment_num = '  '.join(
            response.xpath(
                '//div[@class="vote_info"]/p//text()').extract()).strip()
        if comment_num:
            comment_num = ''.join(
                re.findall(r'评论数:  (\d+)', comment_num, re.I | re.M))
        else:
            comment_num = None
        item["comment_num"] = comment_num
        print('comment_num:', comment_num)
        score = None  # not scraped on this site
        item["score"] = score
        collect_num = '  '.join(
            response.xpath(
                '//div[@class="vote_info"]/p//text()').extract()).strip()

        if collect_num:
            collect_num = ''.join(
                re.findall(r'总收藏:  (\d+)', collect_num, re.I | re.M))
        else:
            collect_num = None
        item["collect_num"] = collect_num
        print('collect_num:', collect_num)
        reward_num = None  # not scraped on this site
        item["reward_num"] = reward_num
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        # The numeric bookId is embedded in the page's inline JS.
        bookId = ''.join(re.findall(r'bookId\=\"(\d+)\"', text, re.I | re.M))
        print('bookId:', bookId)
        link = 'http://book.zongheng.com/book/async/info.htm'
        formdata = {"bookId": bookId}
        yield scrapy.FormRequest(
            url=link,
            formdata=formdata,
            callback=self.parse_page,
            meta={'item': item},
            dont_filter=True,
        )
예제 #13
0
    def parse(self, response):
        """Parse a Qidian book page (platform P20) and chain to the
        comment/score AJAX endpoint.

        Qidian obfuscates numbers with a per-page web font: the span's
        class name doubles as the .woff file name, and ``get_words``
        decodes the glyphs back to digits.  The regexes below are tied
        to the exact page markup — NOTE(review): fragile, re-verify
        against the live page if extraction starts returning [].
        """
        print('1,=================', response.url)
        text = response.text
        # with open('qidian.txt', "wb") as f:  # 开始写文件,wb代表写二进制文件
        #     f.write(response.body)
        url = response.url
        item = TNovelSummaryItem()
        src_url = url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath('//h1/em/text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        item["product_number"] = product_number
        print('product_number:', product_number)
        plat_number = 'P20'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        tickets_num = ''.join(
            response.xpath('//*[@id="monthCount"]/text()').extract()).strip()
        item["tickets_num"] = tickets_num
        print('tickets_num:', tickets_num)
        reward_num = ''.join(
            response.xpath('//*[@id="rewardNum"]/text()').extract()).strip()
        item["reward_num"] = reward_num
        print('reward_num:', reward_num)
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        # Anti-spider font: the class name of the obfuscating span names
        # the .woff file used to decode the digit glyphs.
        font_type = ''.join(
            response.xpath(
                '//div[@class="book-info "]/p/em[1]/span[@class]/@class').
            extract()).strip()
        print('font_type:', font_type)
        font_url = 'https://qidian.gtimg.com/qd_anti_spider/{}.woff'.format(
            font_type)
        print('font_url:', font_url)
        # Raises IndexError if the markup changes and nothing matches —
        # intentional: a silent default would poison the data.
        words = re.findall(
            r'</style><span class="{}">(.*)</span></em><cite>万字</cite><i>|</i><em><style>'
            .format(font_type), response.text, re.I | re.M)[0]
        print('words:', words)
        words = get_words(words, font_url)
        words = int(float(words) * 10000)  # page shows 万字 (tens of thousands)
        item["words"] = words
        print('words:', words)
        click_num = re.findall(
            r'</style><span class="{}">(.*)</span></em><cite>万总会员点击<span>'.
            format(font_type), response.text, re.I | re.M)[0]
        click_num = re.findall(
            r'</style><span class="{}">(.*)'.format(font_type), click_num,
            re.I | re.M)[0]
        print('click_num:', click_num)
        click_num = get_words(click_num, font_url)
        click_num = int(float(click_num) * 10000)
        item["click_num"] = click_num
        print('click_num:', click_num)
        update_date = ''.join(
            response.xpath(
                '//li[@class="update"]/div[@class="detail"]/p[@class="cf"]/em/text()'
            ).extract()).strip()
        update_date = parse_time(update_date)
        item["update_date"] = update_date
        print('update_date:', update_date)
        collect_num = None  # filled later / not available here
        item["collect_num"] = collect_num
        authorId = ''.join(
            response.xpath(
                '//*[@id="authorId"]/@data-authorid').extract()).strip()
        print('authorId:', authorId)
        chanId = re.findall(r'chanId\=(\d+)', text)[0]
        print('chanId:', chanId)
        bookId = ''.join(
            re.findall(r'https\:\/\/book\.qidian\.com\/info\/(\d+)', url,
                       re.I | re.M))
        print('bookId:', bookId)
        # NOTE(review): hard-coded CSRF token — presumably long-lived or
        # ignored server-side; confirm it does not expire.
        _csrfToken = 'HUc4mzWMTveOYkK01P9mREV04r5f0zisvnDNZl7j'
        link = ('https://book.qidian.com/ajax/comment/index'
                '?_csrfToken={}&bookId={}&pageSize=15').format(
                    _csrfToken, bookId)

        yield scrapy.Request(url=link,
                             callback=self.parse_page_score,
                             meta={
                                 'item': item,
                                 'authorId': authorId,
                                 'chanId': chanId,
                                 'bookId': bookId
                             },
                             dont_filter=True)
예제 #14
0
    def parse(self, response):
        """Parse a novel detail page (platform P32) into a TNovelSummaryItem.

        Extracts title, latest chapter, update date, word/click/comment
        counts from the page and yields the completed item directly.
        """
        print('1,==========================', response.url)
        text = response.text
        # print(text)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//h1[@class="fllf"]/a[@title]/text()').extract()).strip()
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P32'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        Chapter_num_update = ''.join(
            response.xpath('//h3[@class="bom10"]/a[@class="cboy"]/text()').
            extract()).strip()
        Chapter_num_update = ''.join(
            re.findall(r'第(\d+)章', Chapter_num_update, re.I | re.M))
        item["Chapter_num_update"] = Chapter_num_update
        print('Chapter_num_update:', Chapter_num_update)
        update_date = ''.join(
            response.xpath('//h3[@class="bom10"]/span[@class="lf10"]/text()').
            extract()).strip()
        update_date = parse_time(update_date)
        item["update_date"] = update_date
        print('update_date:', update_date)
        # Word count and click count live in the same stats line; the
        # regex trailing whitespace must match the join separator.
        words = ' '.join(
            response.xpath(
                '//div[@class="right"]/p[@class="infor bom10"]/span/text()').
            extract()).strip()
        words = ''.join(re.findall(r'总字数:(.*?)\s', words, re.I | re.M))
        words = process_number(words)
        item["words"] = words
        print('words:', words)
        click_num = ' '.join(
            response.xpath(
                '//div[@class="right"]/p[@class="infor bom10"]/span/text()').
            extract()).strip()
        click_num = ''.join(re.findall(r'点击:(.*?)\s ', click_num, re.I | re.M))
        print('click_num:', click_num)
        if '万' in click_num:
            # "N万" shorthand -> plain integer.
            click_num = int(atof(click_num.replace('万', '')) * 10000)
        elif click_num:
            click_num = int(atof(click_num))
        else:
            # Field absent from page: avoid atof('') raising ValueError;
            # None matches the other platform parsers' convention.
            click_num = None
        item["click_num"] = click_num
        print('click_num:', click_num)
        tickets_num = None  # not scraped on this site
        item["tickets_num"] = tickets_num
        comment_num = ''.join(
            response.xpath(
                '//div[@category="comment"]/a[@class="tabfmbtn cboy"]/text()').
            extract()).strip()
        # Tab label looks like "最新书评(N)"; keep just the number.
        comment_num = ''.join(re.findall(r'最新书评(.*)',
                                         comment_num, re.I | re.S)).replace(
                                             '(', '').replace(')', '')
        if comment_num:
            comment_num = int(atof(comment_num))
        else:
            # Guard against atof('') when the review tab is missing.
            comment_num = 0
        item["comment_num"] = comment_num
        print('comment_num:', comment_num)
        score = None  # not scraped on this site
        item["score"] = score
        collect_num = None  # not scraped on this site
        item["collect_num"] = collect_num
        reward_num = None  # not scraped on this site
        item["reward_num"] = reward_num
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)

        print(item)
        yield item
예제 #15
0
    def parse(self, response):
        """Parse a jjwxc-style book page (platform P16) and chain a
        SplashRequest for the JS-rendered click count.

        Fills everything that is present in the static HTML, then
        re-requests the same URL through Splash so ``parse_page`` can
        read the JS-populated ``#totleclick`` counter.
        """
        print('1,================', response.url)
        item = TNovelSummaryItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//h1[@itemprop="name"]/span/text()').extract()).strip()
        # BUG FIX: the original ``if '【' and '】' in s`` only tested for
        # '】' (operator precedence).  replace() is a no-op when the
        # character is absent, so normalise unconditionally.
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P16'
        print('plat_number:', plat_number)
        item["plat_number"] = plat_number
        # Last data row of the chapter table (last row is the summary).
        Chapter_num_update = ''.join(
            response.xpath(
                '//*[@id="oneboolt"]/tbody/tr[last()-1]/td[1]//text()').
            extract()).strip()
        print('Chapter_num_update:', Chapter_num_update)
        item["Chapter_num_update"] = Chapter_num_update
        update_date = ''.join(
            response.xpath(
                '//*[@id="oneboolt"]/tbody/tr[last()-1]/td[last()]/span[1]/text()'
            ).extract()).strip()
        print('update_date:', update_date)
        item["update_date"] = update_date
        words = ''.join(
            response.xpath(
                '//*[@class="righttd"]/ul[@class="rightul"]/li/span[@itemprop="wordCount"]/text()'
            ).extract()).strip().replace('字', '')
        print('words:', words)
        item["words"] = words
        tickets_num = None  # not scraped on this site
        item["tickets_num"] = tickets_num
        comment_num = ''.join(
            response.xpath(
                '//*[@id="oneboolt"]/tbody/tr[last()]/td/div/span[@itemprop="reviewCount"]/text()'
            ).extract())
        print('comment_num:', comment_num)
        item["comment_num"] = comment_num
        score = None  # not scraped on this site
        item['score'] = score
        collect_num = ''.join(
            response.xpath(
                '//*[@id="oneboolt"]/tbody/tr[last()]/td/div/span[@itemprop="collectedCount"]//text()'
            ).extract()).strip()
        print('collect_num:', collect_num)
        item["collect_num"] = collect_num
        reward_num = None  # not scraped on this site
        item["reward_num"] = reward_num
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        # click_num requires JS execution; fetched via Splash below
        # (an earlier Selenium/PhantomJS approach was removed).
        yield SplashRequest(url=src_url,
                            callback=self.parse_page,
                            args={'wait': 0.5},
                            meta={'item': item},
                            dont_filter=True)