def parse(self, response):
    """Parse either a results-list page or a book-detail page.

    List pages yield follow-up Requests (one per item link, plus the
    pagination options); detail pages yield a populated XiangmuItem.
    """
    item = XiangmuItem()
    # Pre-fill every field so downstream pipelines never see a missing key.
    for key in ("book_name", "book_author", "book_type", "book_format",
                "book_time", "book_url", "book_size", "book_downl_url",
                "book_source", "book_intro", "book_content",
                "book_zip_pswd", "book_chinese", "book_id"):
        item[key] = ''
    selector = Selector(response)

    is_lists_page = selector.xpath('//ul[@id="resultsContainer"]')
    if is_lists_page:
        info_lists = is_lists_page.xpath(
            'li/div[@class="item_title"]/strong/h2/a/@href').extract()
        for each in info_lists:
            yield Request(each, callback=self.parse)
        # Pagination lives in a <select>; skip the first and last options
        # (original behavior preserved).
        page_lists = is_lists_page.xpath(
            '//select[@name="select"]/option/@value').extract()
        for each_page in page_lists[1:-1]:
            yield Request(self.main_url + each_page, callback=self.parse)

    is_info_page = selector.xpath('//div[@id="detail"]')
    if is_info_page:
        item['book_url'] = response.url
        item['book_id'] = get_md5(response.url)
        item['book_downl_url'] = response.url
        # Renamed from `type` to avoid shadowing the builtin.
        crumbs = selector.xpath('//div[@class="posi"]/a/text()').extract()
        crumb_urls = selector.xpath('//div[@class="posi"]/a/@href').extract()
        # Guard the index access: an empty breadcrumb previously raised
        # IndexError and killed the page.
        if crumb_urls and "http://www" in crumb_urls[-1]:
            item['book_type'] = crumbs[-2]
        elif crumbs:
            item['book_type'] = crumbs[-1]
        information = is_info_page.xpath('div[@class="tb-detail-hd"]')
        item['book_name'] = information.xpath('h1/text()').extract()
        time = information.xpath(
            'li[@class="dated"]/span[@class="datetime"]/text()').extract()
        # The text is "label:value"; keep only the value after the colon.
        item['book_time'] = ''.join(time).split(':')[-1]
        author = information.xpath(
            'li[@class="dated"]/span[@class="author"]/text()').extract()
        item['book_author'] = ''.join(author).replace('\r', '').replace(
            '\n', '')
        yield item
def parse(self, response):
    """Parse a list page (follow each book link) or a detail page (yield item)."""
    item = XiangmuItem()
    # Pre-fill every field so downstream pipelines never see a missing key.
    for key in ("book_name", "book_author", "book_type", "book_format",
                "book_time", "book_url", "book_size", "book_downl_url",
                "book_source", "book_intro", "book_content",
                "book_zip_pswd", "book_chinese", "book_id"):
        item[key] = ''
    selector = Selector(response)

    is_lists_page = selector.xpath('//ul[@class="list"]')
    if is_lists_page:
        info_lists = is_lists_page.xpath(
            'li/div[@class="pic_upost"]/a/@href').extract()
        for each in info_lists:
            yield Request(self.main_url + each, callback=self.parse)

    is_info_page = selector.xpath('//div[@class="box_title"]')
    if is_info_page:
        item["book_name"] = selector.xpath(
            '//div[@class="box_title"]/h1/text()').extract()
        info = selector.xpath('//ul[@class="text01"]')
        item["book_type"] = info.xpath('li')[-1].xpath(
            'a/text()').extract()
        # Hoisted: the original re-ran this extract() three times.
        # Each <li> text is a "label>value" string at a fixed position
        # (0=author, 2=source, 3=page count) — assumed layout; verify
        # against the live site if pages start failing.
        li_texts = info.xpath('li/text()').extract()
        item["book_author"] = li_texts[0].split('>')[-1]
        item["book_source"] = li_texts[2].split('>')[-1]
        item["book_size"] = li_texts[3].split('>')[-1] + '页'
        intro = selector.xpath(
            '//div[@class="abut_top_part"]/text()').extract()
        if intro:
            item['book_intro'] = intro
        item["book_url"] = response.url
        item["book_downl_url"] = response.url
        item["book_id"] = get_md5(response.url)
        yield item
def parse(self, response):
    """Handle both the book-list table pages and the book content pages.

    List pages (table.book_list) yield one Request per 200px cell link;
    content pages (div#nav_left) yield a populated XiangmuItem.
    """
    item = XiangmuItem()
    # Initialize every field up front so no key is ever missing.
    for field in ("book_name", "book_author", "book_type", "book_format",
                  "book_time", "book_url", "book_size", "book_downl_url",
                  "book_source", "book_intro", "book_content",
                  "book_zip_pswd", "book_chinese", "book_id"):
        item[field] = ''
    sel = Selector(response)

    # List page: each tall cell wraps a link to a detail page.
    if sel.xpath('//table[@class="book_list"]'):
        for cell in sel.xpath('//td[@height="200px"]'):
            links = cell.xpath('a/@href').extract()
            yield Request(links[0], callback=self.parse)

    # Content page: metadata lives in the <font id="status"> elements;
    # type/time are parsed positionally out of their raw text.
    if sel.xpath('//div[@id="nav_left"]'):
        item['book_name'] = sel.xpath("//b/text()").extract()[0]
        status = sel.xpath('//font[@id="status"]')
        item['book_author'] = status[0].xpath('a/text()').extract()
        item['book_type'] = str(status[1].extract()).split('\n')[3].split(':')[-1]
        item['book_time'] = str(status[-1].extract()).split('\n')[2].split(':')[-1]
        item['book_url'] = response.url
        item['book_downl_url'] = response.url
        item['book_intro'] = sel.xpath('//div[@id="desc_text"]/text()').extract()
        item['book_id'] = get_md5(response.url)
        item['book_format'] = "mobi/epub"
        yield item
def parse(self, response):
    """Parse article-list pages (follow items and the next page) and
    article pages (yield a populated XiangmuItem).
    """
    item = XiangmuItem()
    # Pre-fill every field so downstream pipelines never see a missing key.
    for key in ("book_name", "book_author", "book_type", "book_format",
                "book_time", "book_url", "book_size", "book_downl_url",
                "book_source", "book_intro", "book_content",
                "book_zip_pswd", "book_chinese", "book_id"):
        item[key] = ''
    selector = Selector(response)
    print(response.url)  # parenthesized: valid on both Python 2 and 3

    is_lists_page = selector.xpath('//div[@class="article-list pt10"]')
    if is_lists_page:
        info_lists = is_lists_page.xpath(
            '//h3[@class="article-title_list"]/a/@href').extract()
        for each in info_lists:
            yield Request(self.main_url + each, callback=self.parse)
        # Locate the current-page marker, then follow the link one slot later.
        page_lists = is_lists_page.xpath(
            'div[@class="page ui-pagination"]/ul/li').extract()
        cur_page = None  # was unbound -> NameError when no "curPage" found
        for idx, each_page in enumerate(page_lists):
            if "curPage" in each_page:
                cur_page = idx
        if cur_page is not None:
            next_links = is_lists_page.xpath(
                'div[@class="page ui-pagination"]/ul/li/a/@href').extract()
            # The last page has no successor; the original `try/finally: pass`
            # never actually suppressed this IndexError, so guard explicitly.
            if cur_page + 1 < len(next_links):
                yield Request(self.main_url + next_links[cur_page + 1],
                              callback=self.parse)

    is_info_page = selector.xpath('//div[@class="article"]')
    if is_info_page:
        item["book_type"] = selector.xpath(
            '//div[@class="crumb"]/a/text()').extract()[-1]
        item["book_name"] = is_info_page.xpath('h1/text()').extract()
        item["book_source"] = is_info_page.xpath(
            'div[@class="article-info clearfix"]/span[@class="fl"]/text()'
        ).extract()
        item["book_author"] = is_info_page.xpath(
            'div[@class="article-info clearfix"]/span[@class="ml15"]/a/text()'
        ).extract()
        item["book_content"] = is_info_page.xpath(
            'div[@id="article-main"]/p/text()').extract()
        item["book_url"] = response.url
        item["book_downl_url"] = response.url
        item["book_id"] = get_md5(response.url)
        yield item
def parse(self, response):
    """Parse the site's JSON search API and yield one item per result.

    Bug fix: the original ended the loop body with ``return item``, which
    aborted the whole callback after the FIRST result; every result is now
    yielded. NOTE(review): the blocking urllib2 fetch per result bypasses
    Scrapy's scheduler — kept as-is to avoid restructuring the crawl.
    """
    sites = json.loads(response.body)
    # Compile both patterns once, outside the loop.
    size_pat = re.compile(r'<span>大小:(.*?)</span>'.decode('utf-8'), re.S)
    name_pat = re.compile(r'《(.*?)》'.decode('utf-8'), re.S)
    for each in sites['result']:
        # Fresh item per result so fields never bleed between yields.
        item = XiangmuItem()
        for key in ("book_name", "book_author", "book_type", "book_format",
                    "book_time", "book_url", "book_size", "book_downl_url",
                    "book_source", "book_intro", "book_content",
                    "book_zip_pswd", "book_chinese", "book_id"):
            item[key] = ''
        url = self.main_url + each['url']
        item["book_intro"] = each['content']
        item["book_url"] = url
        item["book_downl_url"] = url
        item["book_time"] = each['time']
        item["book_type"] = each['album_title']
        item["book_format"] = 'mobi'
        # Fetch the detail page to scrape size and title out of the HTML.
        data = urllib2.urlopen(url).read().decode('utf-8')
        item["book_size"] = size_pat.findall(data)
        item["book_id"] = get_md5(url)
        names = name_pat.findall(data)
        item['book_name'] = names[0] if names else ''
        yield item
def parse(self, response):
    """Crawl list pages (follow every book link) and book-info pages
    (yield a populated XiangmuItem).
    """
    item = XiangmuItem()
    # Initialize every field so downstream consumers always find the key.
    for field in ("book_name", "book_author", "book_type", "book_format",
                  "book_time", "book_url", "book_size", "book_downl_url",
                  "book_source", "book_intro", "book_content",
                  "book_zip_pswd", "book_chinese", "book_id"):
        item[field] = ''
    sel = Selector(response)

    list_root = sel.xpath('//ul[@class="all-img-list cf"]')
    if list_root:
        hrefs = list_root.xpath(
            'li/div[@class="book-mid-info"]/h4/a/@href').extract()
        for href in hrefs:
            yield Request(self.main_url + href, callback=self.parse)

    info_root = sel.xpath('//div[@class="book-info "]')
    if info_root:
        item["book_name"] = info_root.xpath('h1/em/text()').extract()
        item["book_author"] = info_root.xpath('h1/span/a/text()').extract()
        tags = info_root.xpath('p[@class="tag"]/a/text()').extract()
        item["book_type"] = ",".join(tags)
        item["book_intro"] = info_root.xpath(
            'p[@class="intro"]/text()').extract()
        # Word count sits in the second-to-last <p>, inside its first <em>.
        item["book_size"] = info_root.xpath("p")[-2].xpath(
            'em/text()')[0].extract() + '万字'
        # Join intro paragraphs and strip whitespace-like noise.
        content = ''.join(sel.xpath(
            '//div[@class="book-intro"]/p/text()').extract())
        for junk in (" ", "\n", "\r"):
            content = content.replace(junk, "")
        item["book_content"] = content
        item["book_url"] = response.url
        item["book_downl_url"] = response.url
        item["book_id"] = get_md5(response.url)
        yield item
def parse(self, response):
    """Scrape a single book detail page and yield the populated item."""
    item = XiangmuItem()
    # Initialize every field so no key is ever missing downstream.
    for field in ("book_name", "book_author", "book_type", "book_format",
                  "book_time", "book_url", "book_size", "book_downl_url",
                  "book_source", "book_intro", "book_content",
                  "book_zip_pswd", "book_chinese", "book_id"):
        item[field] = ''
    sel = Selector(response)
    item['book_url'] = response.url
    item['book_downl_url'] = response.url
    item['book_id'] = get_md5(response.url)
    # Title is the first <h5>; strip whitespace characters from it.
    raw_title = ''.join(sel.xpath('//h5/text()')[0].extract())
    for ws in (' ', '\n', '\t'):
        raw_title = raw_title.replace(ws, '')
    item['book_name'] = raw_title
    item['book_intro'] = sel.xpath(
        '//div[@id="book_intro_content"]/text()').extract()
    # Sidebar divs: first holds the author link, second the type link(s).
    details = sel.xpath('//div[@class="col-xs-12 col-sm-4 col-md-4"]/div')
    item['book_author'] = details[0].xpath('a/text()')[0].extract()
    item['book_type'] = details[1].xpath('a/text()').extract()
    yield item