def parse(self, response):
    """Parse a book detail page into a single BookItem.

    Returns a one-element list of items. Most text fields are encoded
    to UTF-8 byte-strings (Python 2 style); `date`, `price` and
    `image_urls` are left as extracted.

    Fixes: the itemprop="name" xpath was evaluated twice (author and
    publisher) — it is now queried once; the single-use `items` list
    was dropped in favor of returning `[item]` directly.
    """
    item = BookItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//div[@class = "padding"]//h1/text()')[0].extract().encode("utf-8")
    # Author and publisher both live in itemprop="name" spans; query once.
    names = response.xpath('//span[@itemprop = "name"]/text()')
    item['author'] = names[0].extract().encode("utf-8")
    item['yayinevi'] = names[1].extract().encode("utf-8")  # publisher (Turkish)
    item['summary'] = response.xpath(
        '//span[@itemprop = "description"]/text()')[0].extract().encode("utf-8")
    # Attribute table: the second <span> cell is assumed to be the
    # language — TODO confirm against a live page.
    item['language'] = response.xpath(
        '//table[@class = "attribute"]//tr//td//span/text()'
    )[1].extract().encode("utf-8")
    item['date'] = response.xpath(
        '//table[@class = "attribute"]//tr//td[@itemprop = "datePublished"]/text()'
    )[0].extract()
    # Third breadcrumb link presumed to be the genre — TODO confirm.
    item['genre'] = response.xpath(
        '//div[@class = "grid_6 omega alpha section"]//a/text()'
    )[2].extract().encode("utf-8")
    item['image_urls'] = response.xpath(
        '//div[@class = "image"]//a//img[@itemprop = "image"]/@src').extract()
    item['price'] = response.xpath(
        '//div[@class = "price-sales column-box mg-b-20"]//span[@class = "value"]/text()'
    )[0].extract()
    return [item]
def detail_parse(self, response):
    """Extract one BookItem from a book detail page and yield it.

    Helper methods (_author, _pubdate, _isbn, _description) normalize
    the raw extracted strings.
    """
    book = BookItem()
    book['image_url'] = response.xpath(
        "//img[@id='largePic']/@src").extract_first()
    book['title'] = response.xpath(
        "//div[@class='name_info']/h1[1]/@title").extract_first()
    book['author'] = self._author(
        response.xpath("//span[@id='author']/a[1]/text()").extract())
    book['publisher'] = response.xpath(
        "//div[@class='messbox_info']/span[2]/a[1]/text()").extract_first()
    book['pubdate'] = self._pubdate(response.xpath(
        "//div[@class='messbox_info']/span[3]/text()").extract_first())
    book['isbn'] = self._isbn(response.xpath(
        "//ul[@class='key clearfix']/li[5]/text()").extract_first())
    # Drop the first category link, join the remaining ones with commas.
    book['types'] = ','.join(
        response.xpath("//span[@class='lie']/a/text()").extract()[1:])
    book['description'] = self._description(response.xpath(
        "//div[@class='name_info']/h2/span[1]/text()").extract_first())
    yield book
def parse_item(self, response):
    """Return a BookItem holding the chapter heading and its body text."""
    item = BookItem()
    item['chapter'] = response.xpath(r'//h3/text()').extract()[0]
    # All paragraph text nodes of the reader pane, comma-joined.
    paragraphs = response.xpath(
        "//div[@class='read-content j_readContent']//p/text()").extract()
    item['text'] = ','.join(paragraphs)
    return item
def u_start(self, response):
    """Follow every sub-category link found on the listing page.

    Yields one scrapy.Request per link, handled by self.u_parse.

    Fixes: replaced the C-style ``range(len(...))`` index loop with
    direct iteration and removed a throwaway BookItem that was mutated
    every iteration but never yielded (dead local).
    """
    hrefs = response.xpath('//div[@class="info"]/h2/a/@href').extract()
    for url in hrefs:  # sub-category links
        # Crude politeness delay; note this blocks the Scrapy reactor.
        time.sleep(1)
        yield scrapy.Request(url, callback=self.u_parse)
def parse_item(self, response):
    """Yield one BookItem per listing entry on the page.

    Bug fixed: a single BookItem was created once before the loop and
    mutated on every iteration, so every yielded item aliased the same
    underlying object and later entries overwrote earlier ones. The
    item is now constructed fresh inside the loop.
    """
    for entry in response.xpath('//div[@class="mod-list-item"]'):
        item = BookItem()
        item['title'] = ''.join(
            entry.xpath('.//a[@class="title-link"]/text()').extract())
        item['description'] = ''.join(
            entry.xpath('.//p[@class="smaller pb15"]/text()').extract())
        item['price'] = ''.join(
            entry.xpath('.//p[@class="currentPrice"]/text()').extract())
        yield item
def parse(self, response):
    """Walk the category tree and request each sub-category page.

    For every big category (<dt>) and each of its sub-categories
    (<em> inside the following <dd>), a request is yielded carrying a
    deep-copied snapshot of the item in meta['bigkey'].
    """
    print('我是请求的url地址:', response.url)
    for dt in response.xpath('//*[@id="booksort"]/div[2]/dl/dt'):
        item = BookItem()
        item['big_name'] = dt.xpath('./a/text()').extract_first()
        # The element right after each <dt> holds its sub-categories.
        for em in dt.xpath('./following-sibling::*[1]/em'):
            item['small_name'] = em.xpath('./a/text()').extract_first()
            item['small_url'] = 'https:' + em.xpath('./a/@href').extract_first()
            # deepcopy: each request gets its own copy, since `item`
            # keeps being mutated by later iterations.
            yield scrapy.Request(
                item['small_url'],
                callback=self.parse_small_list,
                meta={'bigkey': deepcopy(item)},
            )
def parse_item(self, response):
    """Split the chapter heading into (prefix, title) and capture the body.

    The <h1> text is split on '章:' or '章;' (full-width variants); if
    neither marker is present the whole heading is used as the title.

    Fixes: replaced direct ``str.__contains__`` calls with the ``in``
    operator, and removed a duplicated ``len(title) > 1`` check that
    made the inner else branch unreachable.
    """
    selector = etree.HTML(response.body)
    book = BookItem()
    titletext = selector.xpath("//h1/text()")[0].strip()
    if '章:' in titletext:
        title = titletext.split("章:")
    elif '章;' in titletext:
        title = titletext.split("章;")
    else:
        # No marker: keep the full heading as the title part.
        title = ['', titletext]
    if len(title) > 1:
        book["content"] = selector.xpath("//div[@id='content']/text()")
        book["title"] = title[1]
        return book
    # Unreachable in practice (title always has >= 2 elements); kept as
    # a diagnostic matching the original's intent.
    print("-----------------------", title)
def parse(self, response):
    """First-level parse: request the book list of the first sub-category.

    NOTE(review): only dt[1] and the first <em> are processed — these
    look like debug limits kept from development; confirm before
    widening the crawl.
    """
    # Category <dt> nodes (restricted to the first one).
    for dt in response.xpath('//*[@id="booksort"]/div[2]/dl/dt[1]'):
        item = BookItem()
        item['big_name'] = dt.xpath('a/text()').extract_first()
        # following-sibling::*[1] is the <dd> holding the concrete
        # sub-categories; only the first <em> is taken.
        for em in dt.xpath('./following-sibling::*[1]/em')[:1]:
            item['small_name'] = em.xpath('a/text()').extract_first()
            small_link = 'https:' + em.xpath('a/@href').extract_first()
            # Second layer: the book list page for this category; the
            # item snapshot travels in meta['book'].
            yield scrapy.Request(small_link,
                                 callback=self.parse_book,
                                 meta={'book': deepcopy(item)})
def m_parse(self, response):
    """Interactive menu: browse Douban tags by category (1) or search
    by book title (2); any other choice re-parses the current page.

    NOTE(review): blocking input()/urlopen()/time.sleep() inside a
    Scrapy callback stalls the reactor — acceptable only for this
    interactive demo spider.

    Bug fixed: the fallback branch passed the Response object itself to
    scrapy.Request, which requires a URL string; it now uses
    ``response.url`` (with dont_filter so the dupe filter does not drop
    the repeated request).
    """
    print('*********' + "1、类型选择 2、自己搜寻" + '*********')
    n = int(input())
    if n == 1:
        print("1、文学 2、流行 3、文化 4、生活 5、经管 6、科技")
        m = int(input("请选择您喜欢的类型:"))
        m_title = ('//div[@class="article"]/div[2]/div[' + str(m) +
                   ']/table/tbody/tr/td/a/text()')
        titles = response.xpath(m_title).extract()
        print(titles)
        item = BookItem()
        choices = []
        for i, value in enumerate(titles):
            item['title'] = value  # sub-category name
            # Link of the sub-category tag page.
            fulurl = 'https://book.douban.com/tag/' + item['title']
            item['href'] = fulurl
            choices.append(fulurl)
            print(i + 1, fulurl)
            time.sleep(1)
        m = int(input("请继续选择您喜欢的种类:"))
        yield scrapy.Request(choices[m - 1], callback=self.u_start)
    elif n == 2:
        m = input("请输入你要查询的书名:")
        query = quote(m)
        url = 'https://api.douban.com/v2/book/search?q=' + query
        std = urllib.request.urlopen(url)
        rs = json.loads(std.read())
        ids = []
        for entry in rs['books']:
            ids.append(entry['id'])
        # Take the first search hit's subject page.
        ull = 'https://book.douban.com/subject/' + ids[0]
        print(ull)
        yield scrapy.Request(ull, callback=self.u_parse)
    else:
        # Invalid choice: re-request the same page to show the menu again.
        yield scrapy.Request(response.url,
                             dont_filter=True,
                             callback=self.m_parse)
def u_parse(self, response):
    """Parse a Douban book detail page, then chain to its comments page.

    Extracts title / author / score / rater count / publisher / summary,
    packs them into a BookItem, and yields a request for
    '<url>/comments/' with the item in meta for self.PLk.
    """
    time.sleep(1)  # crude politeness delay; blocks the Scrapy reactor
    s = response.url
    fulurl = s + '/comments/'  # comments page of this book
    m_ltitle = '//div[@id="wrapper"]/h1/span/text()'
    # //div[@id="info"]/span/a/text()
    m_lauthor = '//div[@id="info"]/a[1]/text()'
    n_lauthor = '//div[@id="info"]/span/a/text()'  # fallback author layout
    m_lscore = '//strong/text()'
    m_lnumber = '//div[@class="rating_sum"]/span/a/span/text()'
    m_lbs = '//div[@id="info"]/text()'  # raw text nodes of the info box
    m_lcontent = '//div[@id="link-report"]/div/div/p[1]/text()'
    # //div[@id="link-report"]/*/div/div/p[1]/text()
    n_lcontent = '//div[@id="link-report"]/*/div/div/p[1]/text()'  # fallback intro layout
    ltitles = response.xpath(m_ltitle).extract()  # title
    lauthors = response.xpath(m_lauthor).extract()  # author
    lscores = response.xpath(m_lscore).extract()  # rating score
    lnumbers = response.xpath(m_lnumber).extract()  # number of raters
    lbss = response.xpath(m_lbs).extract()  # publisher (among raw info text)
    lcontents = response.xpath(m_lcontent).extract()  # book summary
    # Alternate page layouts: retry with the fallback xpaths.
    if len(lauthors) == 0:
        lauthors = response.xpath(n_lauthor).extract()
    if len(lcontents) == 0:
        lcontents = response.xpath(n_lcontent).extract()  # summary fallback
    if len(lcontents) == 0:
        lcontents = ['null']  # placeholder when no summary exists
    item = BookItem()
    # NOTE(review): the parallel lists are indexed in lockstep; any
    # layout change that desynchronizes them raises IndexError here.
    for i in range(0, len(ltitles)):
        item['ltitle'] = ltitles[i]
        item['lauthor'] = lauthors[i]
        item['lscore'] = lscores[i]
        item['lnumber'] = lnumbers[i]
        # i + 4: presumably skips leading whitespace-only text nodes in
        # #info so this lands on the publisher — fragile, TODO confirm.
        item['lbs'] = lbss[i + 4]
        item['lcontent'] = lcontents[i]
        print('**********' + item['ltitle'] + item['lauthor'] +
              item['lscore'] + item['lnumber'] + item['lbs'] +
              item['lcontent'])
    yield scrapy.Request(fulurl, meta={'item': item}, callback=self.PLk)