Example #1
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)

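        # Listing page: follow every result link, then queue the other pages from the page-select dropdown.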
        is_lists_page = selector.xpath('//ul[@id="resultsContainer"]')
        if is_lists_page:
            info_lists = is_lists_page.xpath(
                'li/div[@class="item_title"]/strong/h2/a/@href').extract()
            for each in info_lists:
                yield Request(each, callback=self.parse)

            page_lists = is_lists_page.xpath(
                '//select[@name="select"]/option/@value').extract()
            for each_page in page_lists[1:-1]:
                yield Request(self.main_url + each_page, callback=self.parse)

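        # Detail page: take the type from the breadcrumb links and the name, date and author from the tb-detail-hd header, then emit the item.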
        is_info_page = selector.xpath('//div[@id="detail"]')
        if is_info_page:
            item['book_url'] = response.url
            item['book_id'] = get_md5(response.url)
            item['book_downl_url'] = response.url

            types = selector.xpath('//div[@class="posi"]/a/text()').extract()
            type_url = selector.xpath('//div[@class="posi"]/a/@href').extract()
            if "http://www" in type_url[-1]:
                item['book_type'] = types[-2]
            else:
                item['book_type'] = types[-1]

            information = is_info_page.xpath('div[@class="tb-detail-hd"]')
            item['book_name'] = information.xpath('h1/text()').extract()
            time = information.xpath(
                'li[@class="dated"]/span[@class="datetime"]/text()').extract()
            time = ''.join(time).split(':')[-1]
            item['book_time'] = time
            author = information.xpath(
                'li[@class="dated"]/span[@class="author"]/text()').extract()
            item['book_author'] = ''.join(author).replace('\r', '').replace(
                '\n', '')
            yield item
Example #2
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)
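        # Listing page: the detail links are relative, so prefix main_url before re-queuing them into parse().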
        is_lists_page = selector.xpath('//ul[@class="list"]')
        if is_lists_page:
            info_lists = is_lists_page.xpath(
                'li/div[@class="pic_upost"]/a/@href').extract()
            for each in info_lists:
                yield Request(self.main_url + each, callback=self.parse)

            # next_link = selector.xpath('//a[@v="next"]/@href').extract()
            # yield Request(self.main_url + next_link[0], callback=self.parse)

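        # Detail page: title from the box_title heading; type, author, source and page count from the ul.text01 rows.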
        is_info_page = selector.xpath('//div[@class="box_title"]')
        if is_info_page:

            item["book_name"] = selector.xpath(
                '//div[@class="box_title"]/h1/text()').extract()
            info = selector.xpath('//ul[@class="text01"]')
            item["book_type"] = info.xpath('li')[-1].xpath(
                'a/text()').extract()
            author = info.xpath('li/text()').extract()[0]
            item["book_author"] = ''.join(author).split('>')[-1]
            source = info.xpath('li/text()').extract()[2]
            item["book_source"] = ''.join(source).split('>')[-1]
            size = info.xpath('li/text()').extract()[3]
            item["book_size"] = ''.join(size).split('>')[-1] + '页'

            intro = selector.xpath(
                '//div[@class="abut_top_part"]/text()').extract()
            if intro:
                item['book_intro'] = intro

            item["book_url"] = response.url
            item["book_downl_url"] = response.url
            item["book_id"] = get_md5(response.url)
            yield item
Example #3
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)

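        # Listing page: each cell in the book_list table links to a detail page.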
        is_list_page = selector.xpath('//table[@class="book_list"]')
        if is_list_page:
            lists = selector.xpath('//td[@height="200px"]')
            for each in lists:
                href = each.xpath('a/@href').extract()
                yield Request(href[0], callback=self.parse)

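        # Detail page: author, type and date come from the font#status blocks, the intro from #desc_text; the format is fixed to mobi/epub.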
        is_content_page = selector.xpath('//div[@id="nav_left"]')
        if is_content_page:
            name = selector.xpath("//b/text()").extract()
            item['book_name'] = name[0]

            inf_list = selector.xpath('//font[@id="status"]')

            author = inf_list[0].xpath('a/text()').extract()
            item['book_author'] = author

            item['book_type'] = str(inf_list[1].extract()).split('\n')[3].split(':')[-1]

            item['book_time'] = str(inf_list[-1].extract()).split('\n')[2].split(':')[-1]

            item['book_url'] = response.url
            item['book_downl_url'] = response.url

            item['book_intro'] = selector.xpath('//div[@id="desc_text"]/text()').extract()

            item['book_id'] = get_md5(response.url)

            item['book_format'] = "mobi/epub"
            yield item
Example #4
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)
        print response.url
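        # Listing page: queue every article link, then locate the current paginator entry and follow the next page.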
        is_lists_page = selector.xpath('//div[@class="article-list pt10"]')
        if is_lists_page:
            info_lists = is_lists_page.xpath('//h3[@class="article-title_list"]/a/@href').extract()
            for each in info_lists:
                yield Request(self.main_url + each, callback=self.parse)

            page_lists = is_lists_page.xpath('div[@class="page ui-pagination"]/ul/li').extract()
            cur_page = None
            for index, each_page in enumerate(page_lists):
                if "curPage" in each_page:
                    cur_page = index
            next_links = is_lists_page.xpath('div[@class="page ui-pagination"]/ul/li/a/@href').extract()
            # Follow the page after the current one, if there is one.
            if cur_page is not None and cur_page + 1 < len(next_links):
                yield Request(self.main_url + next_links[cur_page + 1], callback=self.parse)

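        # Detail page: type from the breadcrumb, name, source and author from the article-info bar, full text from the article body.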
        is_info_page = selector.xpath('//div[@class="article"]')
        if is_info_page:
            item["book_type"] = selector.xpath('//div[@class="crumb"]/a/text()').extract()[-1]
            item["book_name"] = is_info_page.xpath('h1/text()').extract()
            item["book_source"] = is_info_page.xpath('div[@class="article-info clearfix"]/span[@class="fl"]/text()').extract()
            item["book_author"] = is_info_page.xpath('div[@class="article-info clearfix"]/span[@class="ml15"]/a/text()').extract()
            item["book_content"] = is_info_page.xpath('div[@id="article-main"]/p/text()').extract()
            item["book_url"] = response.url
            item["book_downl_url"] = response.url
            item["book_id"] = get_md5(response.url)
            yield item
Example #5
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

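        # The response is JSON; for each result, fetch its detail page with urllib2 and pull the size and title out with regexes.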
        sites = json.loads(response.body)

        for each in sites['result']:

            item["book_intro"] = each['content']
            item["book_url"] = self.main_url + each['url']
            item["book_downl_url"] = self.main_url + each['url']
            item["book_time"] = each['time']
            item["book_type"] = each['album_title']
            item["book_format"] = 'mobi'

            url = self.main_url + each['url']
            data = urllib2.urlopen(url).read().decode('utf-8')
            reg = r'<span>大小:(.*?)</span>'.decode('utf-8')
            gre = re.compile(reg, re.S)
            size = re.findall(gre, data)
            item["book_size"] = size
            item["book_id"] = get_md5(url)

            reg = r'《(.*?)》'.decode('utf-8')
            gre = re.compile(reg, re.S)
            name = re.findall(gre, data)
            if name:
                item['book_name'] = name[0]
            else:
                item['book_name'] = ''

            yield item
Example #6
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)
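        # Listing page: follow every book link in the all-img-list grid (relative URLs, so prefix main_url).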
        is_lists_page = selector.xpath('//ul[@class="all-img-list cf"]')
        if is_lists_page:
            info_lists = is_lists_page.xpath(
                'li/div[@class="book-mid-info"]/h4/a/@href').extract()
            for each in info_lists:
                yield Request(self.main_url + each, callback=self.parse)

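        # Detail page: name, author, tags, intro and word count from the book-info block; the long description is stripped of whitespace.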
        is_info_page = selector.xpath('//div[@class="book-info "]')
        if is_info_page:
            item["book_name"] = is_info_page.xpath('h1/em/text()').extract()
            item["book_author"] = is_info_page.xpath(
                'h1/span/a/text()').extract()
            types = is_info_page.xpath('p[@class="tag"]/a/text()').extract()
            item["book_type"] = ",".join(types)
            item["book_intro"] = is_info_page.xpath(
                'p[@class="intro"]/text()').extract()
            item["book_size"] = is_info_page.xpath("p")[-2].xpath(
                'em/text()')[0].extract() + '万字'
            item["book_content"] = ''.join(
                selector.xpath(
                    '//div[@class="book-intro"]/p/text()').extract()).replace(
                        " ", "").replace("\n", '').replace("\r", "")
            item["book_url"] = response.url
            item["book_downl_url"] = response.url
            item["book_id"] = get_md5(response.url)
            yield item
Example #7
    def parse(self, response):
        item = XiangmuItem()

        item["book_name"] = ''
        item["book_author"] = ''
        item["book_type"] = ''
        item["book_format"] = ''
        item["book_time"] = ''
        item["book_url"] = ''
        item["book_size"] = ''
        item["book_downl_url"] = ''
        item["book_source"] = ''
        item["book_intro"] = ''
        item["book_content"] = ''
        item["book_zip_pswd"] = ''
        item["book_chinese"] = ''
        item["book_id"] = ''

        selector = Selector(response)

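        # Every response here is a detail page: record the URL, then pull the title, intro, author and type.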
        item['book_url'] = response.url
        item['book_downl_url'] = response.url
        item['book_id'] = get_md5(response.url)

        name = ''.join(selector.xpath('//h5/text()')[0].extract())
        item['book_name'] = name.replace(' ',
                                         '').replace('\n',
                                                     '').replace('\t', '')

        item['book_intro'] = selector.xpath(
            '//div[@id="book_intro_content"]/text()').extract()

        info_list = selector.xpath(
            '//div[@class="col-xs-12 col-sm-4 col-md-4"]/div')

        item['book_author'] = info_list[0].xpath('a/text()')[0].extract()

        item['book_type'] = info_list[1].xpath('a/text()').extract()

        yield item