def parse(self, response):
    # Container holding the news listing on the Geological Museum of China (中国地质博物馆) page.
    news_lists = response.xpath("//div[@class='con2']")[0]
    news_list = news_lists.xpath(".//div[@class='li']")
    for news in news_list:
        title = news.xpath("./a/div/div[@class='t18']/text()")
        time = news.xpath("./a/div/div[@class='time']/text()")
        content = news.xpath("./a/div/div[@class='p']/text()")
        href = news.xpath("./a/@href")
        # Skip entries that are missing any required field.
        if len(title) == 0 or len(time) == 0 or len(content) == 0 or len(href) == 0:
            continue
        title = title[0].extract()
        time = time[0].extract()
        content = content[0].extract()
        href = prefixURL + href[0].extract()
        author = "中国地质博物馆"
        description = "1"
        tag = 1

        item = MuseumnewsItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = href
        item['tag'] = tag
        yield item

    print('page = {}'.format(self.page))
    # Follow the listing pagination up to page 20.
    if self.page < 20:
        self.page += 1
        new_url = URL.format(page=self.page)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
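# Every spider in this section fills the same MuseumnewsItem. Its definition is not
# shown here; the sketch below is inferred from the fields assigned above and may
# differ from the project's actual items.py.
import scrapy


class MuseumnewsItem(scrapy.Item):
    title = scrapy.Field()        # headline text
    author = scrapy.Field()       # museum name or news source
    time = scrapy.Field()         # publish date as scraped (string)
    description = scrapy.Field()  # placeholder, set to "1" by these spiders
    content = scrapy.Field()      # summary text, or the title when none exists
    url = scrapy.Field()          # absolute link to the article
    tag = scrapy.Field()          # numeric category tag, set to 1 here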
def parse(self, response):
    # Right-hand column holding the news list on the China Science and Technology Museum (中国科学技术馆) site.
    news_body = response.xpath("//div[@class='fen-right float-l']")[0]
    news_list = news_body.xpath(".//ul[@class='fen-right-list']")
    for news in news_list:
        title = news.xpath("./li/span/a/text()")
        time = news.xpath("./li/span[@class='fen-right-time']/text()")
        href = news.xpath("./li/span/a/@href")
        # Skip entries that are missing any required field.
        if len(title) == 0 or len(time) == 0 or len(href) == 0:
            continue
        title = title[0].extract()
        time = time[0].extract()
        # The listing page has no summary text, so reuse the title as the content.
        content = title
        href = prefixURL + href[0].extract()
        author = "中国科学技术馆"
        description = "1"
        tag = 1

        item = MuseumnewsItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = href
        item['tag'] = tag
        yield item

    print('page = {}'.format(self.page))
    # Follow the listing pagination up to page 45.
    if self.page <= 45:
        self.page += 1
        new_url = URL.format(page=self.page)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
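# Each parse method above assumes the same module-level scaffolding in its spider
# file: a paginated URL template, a prefixURL for resolving relative links, and a
# page counter on the spider class. The sketch below is illustrative only; the URL
# values are placeholders and the import path is hypothetical.
import scrapy
from scrapy import Request

from museumnews.items import MuseumnewsItem  # hypothetical project path

URL = "https://example.org/news/list_{page}.html"  # placeholder listing-page template
prefixURL = "https://example.org"                  # placeholder site root for relative links


class ExampleMuseumSpider(scrapy.Spider):
    name = "example_museum"
    page = 1  # pagination counter advanced at the end of parse()

    def start_requests(self):
        # Start crawling from the first listing page; parse() handles the rest.
        yield Request(URL.format(page=self.page), callback=self.parse, dont_filter=True)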
def parse(self, response):
    # News list container on the Military Museum of the Chinese People's Revolution (中国人民革命军事博物馆) site.
    news_body = response.xpath("//div[@class='infoDynamicList']")[0]
    news_list = news_body.xpath("./ul//li")
    for news in news_list:
        title = news.xpath("./a/h3/text()")
        time = news.xpath("./a/span/text()")
        content = news.xpath("./a/p/text()")
        href = news.xpath("./a/@href")
        # Skip entries that are missing any required field.
        if len(title) == 0 or len(time) == 0 or len(content) == 0 or len(href) == 0:
            continue
        title = title[0].extract()
        time = time[0].extract()
        content = content[0].extract()
        # Links are relative ("./xxx.html"); drop the leading dot before prepending the site prefix.
        href = prefixURL + href[0].extract()[1:]
        author = "中国人民革命军事博物馆"
        description = "1"
        tag = 1

        item = MuseumnewsItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = href
        item['tag'] = tag
        yield item

    print('page = {}'.format(self.page))
    # Follow the listing pagination up to page 30.
    if self.page < 30:
        self.page += 1
        new_url = URL.format(page=self.page)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    # The Capital Museum (首都博物馆) listing uses an old table layout; each news row is an 85%-wide table.
    news_body = response.xpath("//td[@height='450']")[0]
    news_list = news_body.xpath(".//table[@width='85%']")
    for news in news_list:
        info = news.xpath(".//text()")
        href = news.xpath(".//@href")
        # Need at least a date and a title text node, plus a link, to build an item.
        if len(info) < 2 or len(href) == 0:
            continue
        # The first text node is the publish date (padded with &nbsp;), the second is the title.
        title = info[1].extract()
        time = info[0].extract().replace("\xa0", "")
        # No summary on the listing page, so reuse the title as the content.
        content = title
        href = prefixURL + href[0].extract()
        author = "首都博物馆"
        description = "1"
        tag = 1

        item = MuseumnewsItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = href
        item['tag'] = tag
        yield item

    print('page = {}'.format(self.page))
    # Follow the listing pagination up to page 71.
    if self.page < 71:
        self.page += 1
        new_url = URL.format(page=self.page)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    # Each Baidu News search result sits in a div.result block.
    news_list = response.xpath('//div[@class="result"]')
    # Stop paginating once a results page comes back empty.
    if not news_list:
        self.end = True
        return
    for news in news_list:
        href = news.xpath('./h3[@class="c-title"]/a/@href').extract()
        url = "".join(href).replace("\n", "").replace(" ", "")
        title = news.xpath('./h3[@class="c-title"]/a/text()').extract()
        title = "".join(title).replace("\n", "").replace(" ", "")
        content = news.xpath('./div[@class="c-summary c-row "]/text()').extract()
        content = "".join(content).replace("\n", "").replace(" ", "")
        # Some results wrap the summary in a nested div instead.
        if content == "":
            content = news.xpath('./div[@class="c-summary c-row "]/div[2]/text()').extract()
            content = "".join(content).replace("\n", "").replace(" ", "")
        author_time = news.xpath('./div[@class="c-summary c-row "]//p[@class="c-author"]/text()').extract()
        author_time = "".join(author_time).replace("\n", "").replace(" ", "").split()
        author = ""
        time = ""
        if author_time:  # some results carry no author or timestamp
            author = author_time[0]
            if len(author_time) > 1:
                # Baidu shows relative dates such as "3小时前"; normalize them to an absolute time.
                time = self.parse_time(author_time[1])
        description = "1"
        tag = 1

        item = MuseumnewsItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = url
        item['tag'] = tag
        yield item

    print('page = {}'.format(self.page))
    if not self.end:
        self.page += 1
        # Baidu paginates by result offset, ten results per page.
        new_url = URL.format(museum=self.museum,
                             bt=self.startTime,
                             et=self.endTime,
                             page=self.page * 10)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
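# The search spider above normalizes Baidu's timestamps through self.parse_time(),
# whose implementation is not included in this section. The sketch below shows one
# plausible version; the formats handled ("N分钟前", "N小时前", "YYYY年MM月DD日") are
# assumptions about how Baidu News labels results, not the project's actual code.
import re
from datetime import datetime, timedelta


class TimeParsingSketch:
    def parse_time(self, s_time):
        # Convert a relative or Chinese-formatted timestamp to "YYYY-MM-DD".
        now = datetime.now()
        m = re.match(r"(\d+)分钟前", s_time)   # "N minutes ago"
        if m:
            return (now - timedelta(minutes=int(m.group(1)))).strftime("%Y-%m-%d")
        m = re.match(r"(\d+)小时前", s_time)   # "N hours ago"
        if m:
            return (now - timedelta(hours=int(m.group(1)))).strftime("%Y-%m-%d")
        m = re.match(r"(\d{4})年(\d{1,2})月(\d{1,2})日", s_time)  # "YYYY年MM月DD日"
        if m:
            return "{:0>4}-{:0>2}-{:0>2}".format(*m.groups())
        # Unrecognized format: return the raw string unchanged.
        return s_time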