Example #1
def get_name(self, response):
    myPage = response.body
    unicodePage = myPage.decode('utf-8')
    # print(myPage)
    # # pull all the content out with regular expressions
    # novelsTable = re.findall(r'<ul class="main_con">(.*?)</ul>', unicodePage, re.S)  # grab the listing table on the current page
    # print(novelsTable[0])
    novelsList = re.findall(r'<div class="book-mid-info">(.*?)</div>', unicodePage, re.S)  # grab the book blocks on the current page
    # print(novelsList)
    # nameinfo = novelsList[0]
    for nameinfo in novelsList:
        info = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo, re.S)  # anchor texts: name, author, category
        novel_name = info[0]
        author = info[1]
        category = info[2]
        novelurl = "http:" + re.findall(r'<a href="(.*?)" target.*?', nameinfo, re.S)[0]
        serial = re.findall(r'<span >(.*?)</span>', nameinfo, re.S)
        serialstatus = serial[0]
        serialnumber = serial[1]
        targentcontent = XiaoshuoItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['category'] = category
        targentcontent['serialnumber'] = serialnumber
        targentcontent['serialstatus'] = serialstatus
        # print(targentcontent)
        if novelurl is not None:
            yield Request(str(novelurl), dont_filter=True, callback=self.get_novelcontent, meta={'targentcontent': targentcontent})
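Every example on this page fills in an XiaoshuoItem without showing its definition. For reference, a minimal items.py sketch using the field names from Example #1 (hypothetical; each spider's real item declares whichever fields it assigns):

# items.py -- hypothetical sketch, field names taken from Example #1
import scrapy

class XiaoshuoItem(scrapy.Item):
    novel_name = scrapy.Field()
    author = scrapy.Field()
    novelurl = scrapy.Field()
    category = scrapy.Field()
    serialnumber = scrapy.Field()
    serialstatus = scrapy.Field()

Assigning a key that was not declared as a Field raises a KeyError, which is why the field sets differ from spider to spider below.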
Example #2
def parse(self, response):
    print("==============render over===============================")
    # //*[@id="content"]/dd[1]/table/tbody
    # determine the category
    category = response.xpath('//dl[@id="content"]/dt/h2/text()').re(u'(.+) - 文章列表')[0]  # "<category> - article list"
    for sel in response.xpath('//dl[@id="content"]/dd/table/tr[@bgcolor="#FFFFFF"]'):
        item = XiaoshuoItem()

        name = sel.xpath('./td[1]/a[2]/text()').extract()[0]
        author = sel.xpath('./td[4]/text()').extract()[0]
        novelurl = sel.xpath('./td[2]/a/@href').extract()[0]
        serialstatus = sel.xpath('./td[6]/text()').extract()[0]
        wordsnum = sel.xpath('./td[4]/text()').extract()[0]  # note: reads the same cell as author in the original
        parts = novelurl.split('/')
        serialnum = parts[-2]

        item["name"] = name
        item["author"] = author
        item["novelurl"] = novelurl
        item["serialstatus"] = serialstatus
        item["wordsnum"] = wordsnum
        item["serialnum"] = serialnum
        item["category"] = category

        yield item
        yield scrapy.Request(url=novelurl, callback=self.novel_get_parse, meta={'serial_id': serialnum})

    next_page = response.xpath('//dd[@class="pages"]/div/a[@class="next"]/@href').extract_first()  # next-page URL; extract_first() so the guard below works when there is none
    if next_page:
        yield scrapy.Request(next_page)
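If the extracted next-page href is relative, the bare scrapy.Request(next_page) above raises "Missing scheme in request url"; response.urljoin() resolves it against the current page, and response.follow() (Scrapy >= 1.4) does the join and the request in one step. A sketch of the same pagination tail inside parse:

# same pagination step with relative-URL handling
next_page = response.xpath('//dd[@class="pages"]/div/a[@class="next"]/@href').extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)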
Example #3
def bookParse(self, response):

    item = XiaoshuoItem()
    # novel genre
    item['noveltype'] = response.xpath(
        '//meta[contains(@property, "og:novel:category")]/@content').extract()[0]
    # author
    item['novelauthor'] = response.xpath(
        '//meta[contains(@property, "og:novel:author")]/@content').extract()[0]
    # title
    item['novelname'] = response.xpath(
        '//meta[contains(@property, "og:novel:book_name")]/@content').extract()[0]
    # serialization status
    item['novelstatus'] = response.xpath(
        '//meta[contains(@property, "og:novel:status")]/@content').extract()[0]
    # last update time
    item['updatetime'] = response.xpath(
        '//meta[contains(@property, "og:novel:update_time")]/@content').extract()[0]
    # synopsis
    item['novelsummary'] = "".join(
        response.xpath('//div[contains(@id, "waa")]/text()').extract())
    # link to the reader page
    item['novelurl'] = response.xpath(
        '//a[contains(@class, "reader")]/@href').extract()[0]

    # the original ran the same XPath a second time here; note that it also
    # builds item without ever yielding it or passing it along
    book_url = item['novelurl']

    yield scrapy.Request(url=book_url, callback=self.chapterListParser)
Example #4
def parse(self, response):
    item = XiaoshuoItem()
    item['novel_name'] = response.xpath(
        r'//ul[@class="all-img-list cf"]//li[re:match(@data-rid,"\d")]/div[@class="book-mid-info"]/h4/a/text()'
    ).extract()
    item['novel_writter'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[@class="name"]/text()'
    ).extract()
    item['novel_main_type'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[2]/text()'
    ).extract()
    item['novel_sub_type'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[3]/text()'
    ).extract()
    item['novel_status'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/span/text()'
    ).extract()
    item['novel_img_url'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[1]//img/@src').extract()
    item['novel_real_url'] = response.xpath(
        '//ul[@class="all-img-list cf"]//li/div[2]/h4/a/@href').extract()
    for novel_dir, img_url in zip(item['novel_name'], item['novel_img_url']):
        # full paths instead of the original's os.chdir(), which leaked the
        # working-directory change across iterations
        novel_path = os.path.join('D:\\spider_data', novel_dir)
        if not os.path.exists(novel_path):
            os.mkdir(novel_path)
            # blocking download of the cover image; see the note after this example
            img_resp = requests.get('https:' + img_url, headers=self.headers)
            with open(os.path.join(novel_path, novel_dir + '.jpg'), 'wb') as fp:
                fp.write(img_resp.content)
            sleep(1)
    yield item
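requests.get() inside a callback blocks Scrapy's reactor for the duration of every cover download (note also the sleep(1)). Scrapy's built-in ImagesPipeline is the usual answer; staying inside the spider, the non-blocking variant yields an extra request per image and writes the file in its callback. A sketch under that assumption (save_cover is a hypothetical method; the D:\ path is carried over from the example):

# non-blocking variant: schedule the cover download through Scrapy itself
def parse(self, response):
    names = response.xpath('//ul[@class="all-img-list cf"]//li/div[@class="book-mid-info"]/h4/a/text()').extract()
    img_urls = response.xpath('//ul[@class="all-img-list cf"]//li/div[1]//img/@src').extract()
    for novel_dir, img_url in zip(names, img_urls):
        yield scrapy.Request('https:' + img_url,
                             callback=self.save_cover,
                             meta={'novel_dir': novel_dir})

def save_cover(self, response):
    novel_dir = response.meta['novel_dir']
    novel_path = os.path.join('D:\\spider_data', novel_dir)
    os.makedirs(novel_path, exist_ok=True)
    with open(os.path.join(novel_path, novel_dir + '.jpg'), 'wb') as fp:
        fp.write(response.body)  # the downloaded image bytes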
Example #5
def full_text_parse(self, response):
    html = BeautifulSoup(response.text, 'lxml')
    content = html.find('div', {'id': 'content'}).text
    next_url = html.find('div', {'class': 'bottem2'}).find_all('a')[4]['href']
    chapter = html.find('div', {'class': 'bookname'}).find('h1').text
    name = html.find('div', {'class': 'con_top'}).find_all('a')[1].text
    author = response.meta['author']
    item = XiaoshuoItem()
    item['chapter'] = chapter
    item['name'] = name
    item['content'] = content
    item['author'] = author
    print(item['author'])
    print(item['name'])
    print(item['chapter'])
    print(item['content'])
    yield item
    # the original wrapped this in `while 1: try/except: pass`, which either
    # yields once and breaks or spins forever on error; a single yield is the
    # equivalent of its success path
    yield scrapy.Request(
        url='http://www.biquge.tv{}'.format(next_url),
        callback=self.full_text_parse,
        dont_filter=True,
        meta={'author': author})
Example #6
def parse_item1(self, response):
    item = XiaoshuoItem()
    url = response.xpath('//ul/li/a[@rel="nofollow"]/@href').extract()[1]
    item['file_urls'] = [url]
    item['names'] = response.meta['name']
    item['leibie'] = response.meta['leibie']  # leibie = category
    yield item
Example #7
def parse(self, response):
    lists = response.xpath('//ul[@class="all-img-list cf"]/li')
    for i in lists:
        item = XiaoshuoItem()  # items behave like dicts
        # item['li'] = i.xpath('./@data-rid').extract()[0]
        item['name'] = i.xpath('./div[@class="book-mid-info"]/h4/a/text()').extract()[0]
        item['author'] = i.xpath('./div[@class="book-mid-info"]/p[@class="author"]/a[@class="name"]/text()').extract()[0]
        yield item
Example #8
def parse_detail(self, response):
    # novel name
    name = response.meta['name']
    # chapter title
    chapter_name = response.xpath("//h1/text()").get()
    # chapter content
    content = response.xpath("//div[@id='content']//text()").getall()
    content = "".join(content)
    yield XiaoshuoItem(name=name, chapter_name=chapter_name, content=content)
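.get() and .getall() here are the spellings Scrapy has recommended since 1.8; they are aliases for the .extract_first()/.extract() calls used throughout the other examples, with .get() returning None instead of raising when nothing matches (unlike the bare [0] indexing seen elsewhere on this page). Inside any callback:

# the pairs are interchangeable
response.xpath("//h1/text()").get()             # first match, or None
response.xpath("//h1/text()").extract_first()   # same
response.xpath("//div[@id='content']//text()").getall()   # list of all matches
response.xpath("//div[@id='content']//text()").extract()  # same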
Example #9
def parse(self, response):
    if re.match('https://api.youshuge.com/new_home', response.url):
        s = response.body.decode('unicode_escape')
        l = []  # collected book ids; the original never initialised this list
        while s != "}":
            s = s[s.find('"id":'):]
            if s[5:s.find(',"')]:
                l.append(s[5:s.find(',"')])
            s = s[s.find(',"'):]
        else:
            for i in l[:1]:
                yield scrapy.FormRequest(
                    url="https://api.youshuge.com/getbookinfo",
                    formdata={
                        "token": token,  # token and headers are assumed to be defined elsewhere in the module
                        "id": i
                    },
                    callback=self.parse)
    elif re.match('https://api.youshuge.com/getbookinfo', response.url):
        # massage the escaped response into a Python literal and eval it
        d = eval(
            response.body.decode('unicode_escape').replace('\\', '').replace(
                '\r\n', '').replace('""', '"').replace('null', '""'))
        yield scrapy.FormRequest(url="https://api.youshuge.com/getcontent",
                                 headers=headers,
                                 formdata={
                                     "token": token,
                                     "bookid": str(d['data']['id']),
                                     'chapteid': str(d['data']['read_chapte'])
                                 },
                                 callback=self.parse)
    elif re.match('https://api.youshuge.com/getcontent', response.url):
        d = eval(
            response.body.decode('unicode_escape').replace('\\', '').replace(
                '\r\n', '').replace('""', '"').replace('null', '""'))
        if d['msg'] != '余额不足':  # "insufficient balance"
            item = XiaoshuoItem()
            item['book_id'] = d['data']['book_id']
            item['chapte_id'] = d['data']['chapte']['id']
            item['chapte_name'] = d['data']['chapte_name']
            item['content'] = d['data']['content']
            yield item
            if d['data']['next_chapte']:
                yield scrapy.FormRequest(
                    url="https://api.youshuge.com/getcontent",
                    headers=headers,
                    formdata={
                        "token": token,
                        "bookid": str(d['data']['book_id']),
                        'chapteid': str(d['data']['next_chapte'])
                    },
                    callback=self.parse)
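Both the hand-rolled '"id":' scanner in the first branch and the eval() calls re-implement what a JSON parser already does, and eval() on a network response will execute whatever the server sends back. Assuming the API returns ordinary JSON (which the null/"" replacements suggest), json.loads covers all three branches; a sketch for the first one, with the 'data' layout as an assumption:

import json

data = json.loads(response.text)                    # instead of eval() on a massaged string
book_ids = [str(book['id']) for book in data['data']]  # assumed layout: list of books under 'data'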
Example #10
def dataParse(self, response):
    for i in response.css("html"):
        item = XiaoshuoItem()
        try:
            item["zuozhe"] = i.css(".w2 > a::text")[0].extract()  # zuozhe = author
        except IndexError:
            pass
        try:
            item["timu"] = i.css("h1 > a::text")[0].extract()  # timu = title
        except IndexError:
            pass
        yield item
Example #11
def get_novelcontent(self, response):
    # print(response.body)
    novel_name = response.meta['name']  # novel title
    author = response.meta['author']  # author
    novelurl = response.url  # novel URL
    # print(novelurl)
    click_num_total = response.xpath(
        '//tr[1]/td[1]/text()').extract_first()  # total clicks
    if click_num_total:
        click_num_total = int(click_num_total.split(":")[1])
    collect_num_total = response.xpath(
        '//tr[1]/td[2]/text()').extract_first()  # favorites
    if collect_num_total:
        collect_num_total = int(collect_num_total.split(":")[1])
    # print(collect_num_total)
    click_num_month = response.xpath(
        '//tr[1]/td[3]/text()').extract_first()  # monthly clicks
    if click_num_month:  # the original tested click_num_total here
        click_num_month = int(click_num_month.split(":")[1]) * 4
    # print(click_num_month)

    serialnumber = response.xpath(
        '//tr[1]/td[4]/text()').extract_first()  # serialized word count
    if serialnumber:
        serialnumber = int(serialnumber.split(":")[1])
    # print(serialnumber)
    serialstatus = response.xpath(
        '//div[@class="title"]/i[2]/text()').extract_first()  # serialization status
    category = response.xpath(
        '//div[@class="title"]/a[2]/text()').extract_first()  # category
    # print(category, serialstatus)

    targentcontent = XiaoshuoItem()
    targentcontent['novel_name'] = novel_name.strip()
    targentcontent['author'] = author.strip()
    targentcontent['novelurl'] = novelurl
    targentcontent['serialstatus'] = serialstatus
    targentcontent['serialnumber'] = serialnumber
    # direct assignment: the counts above are already ints when present, and
    # the original's int(None) would raise whenever a field was missing
    targentcontent['category'] = category
    targentcontent['collect_num_total'] = collect_num_total
    targentcontent['click_num_total'] = click_num_total
    targentcontent['click_num_month'] = click_num_month
    # targentcontent['name_id'] = name_id
    # targentcontent['novel_breif'] = novel_breif
    yield targentcontent
Example #12
def parse_item(self, response):
    chap_list = response.xpath(".//*[@class='listmain']/dl/dd")
    novel_name = response.xpath(
        ".//div[@id='book']//div[@id='info']/h1/text()").get()
    for chapter in chap_list:
        c_name = chapter.xpath('./a/text()').get()
        c_url = chapter.xpath('./a/@href').get()
        if c_name:
            item = XiaoshuoItem(c_name=c_name, novel_name=novel_name)
            url = response.urljoin(c_url)
            request = scrapy.Request(url=url,
                                     callback=self.parse_content,
                                     dont_filter=True)
            request.meta['key'] = item
            yield request
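The parse_content callback that receives the half-filled item through request.meta['key'] is not part of this example; a plausible completion, with the content XPath as a placeholder assumption:

# hypothetical parse_content; the real content selector depends on the site
def parse_content(self, response):
    item = response.meta['key']
    item['content'] = ''.join(
        response.xpath("//div[@id='content']//text()").getall()).strip()
    yield item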
Example #13
def content(self, response):
    # title/author/last-update are page-level values, so hoist them out of the loop
    title = response.xpath('//*[@id="info"]/h1/text()').extract_first()
    author = response.xpath('//*[@id="info"]/p[1]/text()').extract_first()
    last = response.xpath('//*[@id="info"]/p[3]/text()').extract_first()
    urls = response.xpath('//*[@id="list"]/dl/dd')
    for url in urls:
        item = XiaoshuoItem()
        item['title'] = title
        item['author'] = re.sub(r'\xa0|\n|\r', '', author)
        item['last'] = last
        body_url = url.xpath('a/@href').extract_first()
        body_url = 'http://www.biquge.com.tw' + body_url
        request = scrapy.Request(body_url, self.body, dont_filter=True)
        request.meta['item'] = item
        yield request
Example #14
def content_parse(self, response):
    """Parse one chapter's content."""
    if response.status == 200:
        html = etree.HTML(response.text)
        if html:
            title = html.xpath('//em[@class="l"]/text()')
            page = html.xpath('//strong[@class ="l jieqi_title"]/text()')
            content = html.xpath(
                '//div[contains(@class, "mainContenr") and @id="content"]/text()')

            item = XiaoshuoItem()
            item['title'] = title[0] if title else ''
            item['page'] = page[0] if page else ''
            item['content'] = content
            yield item
Example #15
def get_name(self, response):
    myPage = response.body
    unicodePage = myPage.decode('utf-8')
    # print(myPage)
    # # pull all the content out with regular expressions
    # novelsTable = re.findall(r'<ul class="main_con">(.*?)</ul>', unicodePage, re.S)  # grab the listing table on the current page
    # print(novelsTable[0])
    novelsList = re.findall(r'<li>(.*?)</li>', unicodePage,
                            re.S)  # grab the list entries on the current page

    # nameinfo = novelsList[17]
    if novelsList[17:66]:
        for nameinfo in novelsList[17:66]:
            info = re.findall(r'target="_blank">(.*?)</a>', nameinfo,
                              re.S)  # anchor texts: category, name, author
            # print(info[1])
            category = info[0]
            novel_name = info[1]
            author = info[3]
            # print(author)
            novelurl = re.findall(r'<a class="fs14" href="(.*?)" title.*?',
                                  nameinfo, re.S)[0]
            serialnumber = re.findall(r'<span class="number">(.*?)</span>',
                                      nameinfo, re.S)[0]
            # print(serialnumber)
            # category = nameinfo.xpath('li/span[1]/a/text()').extract()[0]
            # print(category)
            # novel_name = nameinfo.xpath('li/span[2]/a[1]/text()').extract()[0]
            # print(novel_name)
            # novelurl = nameinfo.xpath('li/span[2]/a[1]/@href').extract()[0]
            # print(novelurl)
            # serialnumber = nameinfo.xpath('li/span[3]/text()').extract()[0]
            # print(int(serialnumber))
            # author = nameinfo.xpath('li/span[4]/a/text()').extract()[0]
            # print(author)
            targentcontent = XiaoshuoItem()
            targentcontent['novel_name'] = novel_name.strip()
            targentcontent['author'] = author.strip()
            targentcontent['novelurl'] = novelurl
            targentcontent['category'] = category
            targentcontent['serialnumber'] = int(serialnumber)
            # print(targentcontent)
            if novelurl is not None:
                yield Request(str(novelurl),
                              dont_filter=True,
                              callback=self.get_novelcontent,
                              meta={'targentcontent': targentcontent})
Example #16
File: 17k.py Project: wsqat/308
def get_name(self, response):
    myPage = response.body
    unicodePage = myPage.decode('utf-8')
    # print(myPage)
    # # pull all the content out with regular expressions
    # novelsTable = re.findall(r'<ul class="main_con">(.*?)</ul>', unicodePage, re.S)  # grab the listing table on the current page
    # print(novelsTable[0])
    novelsList = re.findall(r'<tr class=.*?>(.*?)</tr>', unicodePage,
                            re.S)  # grab the table rows on the current page
    # print(len(novelsList))
    # nameinfo = novelsList[0]
    if novelsList:
        for nameinfo in novelsList:
            # print(nameinfo)
            novelurl = re.findall(r'<a .*? href="(.*?)" target.*?',
                                  nameinfo, re.S)[0]
            # print(novelurl)
            info = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo,
                              re.S)  # anchor texts: category, name, ..., author
            category = info[0]
            novel_name = info[1]
            author = info[-1]
            # print(novel_name + " " + author + " " + category)
            serialnumber = re.findall(r'<td class="td5">(.*?)</td>',
                                      nameinfo, re.S)[0]
            # print(serialnumber)
            serialstatus = re.findall(r'<em class="fc2">(.*?)</em>',
                                      nameinfo, re.S)[0]
            serialstatus = serialstatus.strip()
            targentcontent = XiaoshuoItem()
            targentcontent['novel_name'] = novel_name.strip()
            targentcontent['author'] = author.strip()
            targentcontent['novelurl'] = novelurl
            targentcontent['category'] = category
            targentcontent['serialnumber'] = serialnumber
            targentcontent['serialstatus'] = serialstatus
            # return ""
            # print(targentcontent)
            # novelurl = "http://www.17k.com/book/1893454.html"
            # print(novelurl)
            if novelurl is not None:
                yield Request(str(novelurl),
                              dont_filter=True,
                              callback=self.get_novelcontent,
                              meta={'targentcontent': targentcontent})
Example #17
def parse(self, response):
    kind_urls = response.xpath(
        "//div[@class='menu']/ul/li/a/@href").extract()[1:8]
    kind_names = response.xpath(
        "//div[@class='menu']/ul/li/a/span/text()").extract()[1:8]
    items = []
    for kind_url, kind_name in zip(kind_urls, kind_names):
        if not os.path.exists(kind_name):
            os.makedirs(kind_name)
            # queue pages 1..10 of this category
            for offset in range(1, 11):
                item = XiaoshuoItem()
                item['kind_name'] = kind_name
                item['kind_url'] = kind_url + str(offset) + '.htm'
                items.append(item)
    for item in items:
        yield scrapy.Request(item['kind_url'],
                             meta={'meta': item},
                             callback=self.parse_second)
Example #18
def parse(self, response):
    '''
    book names
    '''
    items = []

    # the original wrote xpath("//*[@id="hotcontent"]/div[1]"), a syntax error
    # from nesting double quotes inside a double-quoted string
    book_names_As = response.xpath('//*[@id="hotcontent"]/div[1]')

    for book_name_A in book_names_As:
        item = XiaoshuoItem()

        item['novel_name'] = book_name_A.xpath('./div[1]/dl/dt/a/text()').extract()[0]
        item['author'] = book_name_A.xpath('./div[1]/dl/dt/span/text()').extract()[0]
        # .extract() was missing, so the field held a SelectorList rather than strings
        item['novel_name_urls'] = book_name_A.xpath('//*[@id="hotcontent"]/div[1]/div[1]/dl/dt/a/@href').extract()

        items.append(item)
    return items
Example #19
def parse_second(self, response):
    meta = response.meta['meta']
    items = []
    novel_urls = response.xpath("//li[@class='conter1']/a/@href").extract()
    novel_names = response.xpath("//li[@class='conter1']/a/text()").extract()
    for novel_url, novel_name in zip(novel_urls, novel_names):
        item = XiaoshuoItem()
        novel_url = 'http://www.530p.com' + novel_url
        file_novel = meta['kind_name'] + '\\' + novel_name + '.txt'
        item['kind_name'] = meta['kind_name']
        item['kind_url'] = meta['kind_url']
        item['novel_name'] = novel_name
        item['novel_url'] = novel_url
        item['file_novel'] = file_novel
        items.append(item)
    for item in items:
        yield scrapy.Request(item['novel_url'],
                             meta={'meta1': item},
                             callback=self.parse_third)
Example #20
def get_name(self, response):
    myPage = response.body
    unicodePage = myPage.decode('utf-8')
    # print(myPage)
    # # pull all the content out with regular expressions
    # novelsTable = re.findall(r'<ul class="main_con">(.*?)</ul>', unicodePage, re.S)  # grab the listing table on the current page
    # print(novelsTable[0])
    # max_num = response.xpath('//div[@class="topbox"]/span/text()').extract_first().split(u"本")[0]
    novelsList = re.findall(r'<div class="bookdetail bg">(.*?)</div>',
                            unicodePage, re.S)  # grab the book blocks on the current page
    # print(len(novelsList))
    if novelsList:
        for nameinfo in novelsList[1:31]:
            # print(nameinfo)
            info = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo,
                              re.S)  # anchor texts: name, author, category
            novel_name = info[0]
            author = info[1]
            category = info[2]
            novelurl = re.findall(r'<a href="(.*?)" target.*?', nameinfo,
                                  re.S)[0]
            # serial = re.findall(r'<span >(.*?)</span>', nameinfo, re.S)
            # serialstatus = serial[0]
            # serialnumber = serial[1]
            targentcontent = XiaoshuoItem()
            targentcontent['novel_name'] = novel_name.strip()
            targentcontent['author'] = author.strip()
            targentcontent['novelurl'] = novelurl
            targentcontent['category'] = category
            # <span class="book_click">2013-10-21</span>
            update = re.findall(r'<span class="book_click">(.*?)</span>',
                                nameinfo, re.S)[0].split("-")[0]
            if int(update) > 2017:  # the original compared str with int, a TypeError on Python 3
                targentcontent['serialstatus'] = u"连载中"  # "serializing"
            else:
                targentcontent['serialstatus'] = u"已完结"  # "completed"
            if novelurl is not None:
                yield Request(str(novelurl),
                              dont_filter=True,
                              callback=self.get_novelcontent,
                              meta={'targentcontent': targentcontent})
Example #21
def get_name(self, response):
    baseurl = response.url  # note: unused; self.baseurl is what gets joined below
    # print(baseurl)
    myPage = response.body
    unicodePage = myPage.decode('utf-8')
    novelsList = re.findall(r'<li>(.*?)</li>', unicodePage,
                            re.S)  # grab the list entries on the current page
    print(len(novelsList))
    # nameinfo = novelsList[10]  # entries 10-29 are books
    # nameinfo2 = novelsList[1]
    # print(nameinfo)
    for nameinfo in novelsList[10:30]:
        novel_name = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo,
                                re.S)[1]  # novel name
        # print(novel_name)
        novelInfo = re.findall(r'target="blank".*?>(.*?)</a>', nameinfo,
                               re.S)  # author and category anchors
        # print(novelInfo)
        author = novelInfo[0].split(">")[2]
        category = novelInfo[1]
        novelurl = re.findall(r'href="(.*?)"', nameinfo, re.S)[0]
        novelurl = self.baseurl + novelurl
        info = re.findall(r'<span>(.*?)</span>', nameinfo, re.S)  # status / click counts / word count
        serialstatus = info[0]
        click_num_month = int(info[1].split(":")[1])
        serialnumber = int(info[5].split(":")[1])

        targentcontent = XiaoshuoItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['click_num_month'] = int(click_num_month)

        if novelurl is not None:
            yield Request(str(novelurl),
                          dont_filter=True,
                          callback=self.get_novelcontent,
                          meta={'targentcontent': targentcontent})
Example #22
def parse(self, response):
    item = XiaoshuoItem()
    sel = Selector(response)  # response.xpath() would work directly; Selector(response) is redundant but harmless

    oldUrl = sel.xpath('//div[@class = "fanye_cen"]/a/@href').extract()

    newUrl = "http://www.uukanshu.com" + oldUrl[0]

    if newUrl[-1] != "/":
        item['page'] = oldUrl[0]
        yield Request(newUrl, callback=self.parse)
    else:
        item['page'] = response.url[23:]
        print('download finished')

    item['content'] = sel.xpath(
        '//div[@id="contentbox"]/text()|//div[@id="contentbox"]/p/text()').extract()
    item['title'] = sel.xpath(
        '//div[@class = "h1title"]/h1/text()').extract()

    yield item
Example #23
def parse_third(self, response):
    meta1 = response.meta['meta1']
    items = []
    part_urls = response.xpath(
        "//div[@class='clc']/a/@href").extract()[::-1]  # chapter URLs
    part_names = response.xpath(
        "//div[@class='clc']/a/text()").extract()[::-1]  # chapter titles
    for part_url, part_name in zip(part_urls, part_names):
        item = XiaoshuoItem()
        part_url = 'http://www.530p.com' + part_url
        item['kind_name'] = meta1['kind_name']
        item['kind_url'] = meta1['kind_url']
        item['novel_name'] = meta1['novel_name']
        item['novel_url'] = meta1['novel_url']
        item['file_novel'] = meta1['file_novel']
        item['part_url'] = part_url
        item['part_name'] = part_name
        items.append(item)
    for item in items:
        yield scrapy.Request(item['part_url'],
                             meta={'meta2': item},
                             callback=self.parse_forth)
Example #24
def parse(self, response):

    for each in response.xpath("//div[@class='table_con']"):
        # novel genre
        storystyle = each.xpath(
            ".//span[@class='book']/em/a/text()").extract()
        # novel name
        storyname = each.xpath(
            ".//span[@class='book']//a[@class='f14']/text()").extract()
        # latest chapter
        storychapter = each.xpath(
            ".//span[@class='book']/a[@target='_blank'][2]/text()").extract()
        # total clicks
        storyclick = each.xpath(".//span[@class='click']/text()").extract()
        # author
        storyauthor = each.xpath(
            ".//span[@class='author']/a/text()").extract()
        # update time
        storyupdatetime = each.xpath(
            ".//span/span[@class='time']/text()").extract()

        # the original ran this loop outside the for-each block (using only the
        # last block's lists) and mutated a single shared item; one fresh item
        # per row is the safe pattern
        for i in range(0, 50):
            item = XiaoshuoItem()
            item['storyStyle'] = storystyle[i].strip()
            item['storyName'] = storyname[i].strip()
            item['storyChapter'] = storychapter[i].strip()
            item['storyClick'] = storyclick[i].strip()
            item['storyAuthor'] = storyauthor[i].strip()
            item['storyUpdateTime'] = storyupdatetime[i].strip()
            yield item

    if self.index < 5:
        self.index += 1

    yield scrapy.Request("http://huayu.zongheng.com/store/c0/c0/u1/p" +
                         str(self.index) + "/v0/s0/ALL.html",
                         callback=self.parse)