def parse_item(self, response):
    """Populate a ``NewsItem`` from an article page using ``ChinaLoader``.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    loader = ChinaLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
    # Raw string: \d and \s are regex escapes, not string escapes. A plain
    # literal triggers an invalid-escape warning on recent Python versions.
    loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                     re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    # Both datetime and source are pulled out of the same info <div> text.
    loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()',
                     re='来源:(.*)')
    loader.add_value('website', '中华网')
    yield loader.load_item()
def parse_item2(self, response):
    """Build a ``NewsItem`` field by field, without an item loader.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The populated ``NewsItem``.
    """
    item = NewsItem()
    # extract_first() returns None when nothing matches; fall back to an
    # empty string so a page without <title> text does not raise on .strip().
    item['title'] = (
        response.xpath('//title/text()').extract_first() or '').strip()
    item['url'] = response.url
    item['text'] = ''.join(
        response.xpath('//div[@id="chan_newsDetail"]//p/text()').extract()
    ).strip()

    # The info block feeds both datetime and source; query it once.
    info = response.xpath('//div[@id="chan_newsInfo"]/text()')
    # Raw string: \d and \s are regex escapes, not string escapes.
    item['datetime'] = info.re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    # NOTE(review): this captures from one ';' to the next ';', semicolons
    # included -- confirm that matches the markup this was written for.
    item['source'] = info.re_first('(;.*?;)')
    item['website'] = '中华网'
    yield item
def parse_item(self, response):
    """Populate a ``NewsItem`` from an article page using ``ChinaLoader``.

    Datetime and source are read from dedicated ``<span>`` elements inside
    the ``chan_newsInfo_source`` block.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    loader = ChinaLoader(item=NewsItem(), response=response)
    # Leading '//' added: the bare relative path "h1[...]" only matches an
    # <h1> that is a direct child of the context node, while every other
    # query in this callback searches the whole document.
    loader.add_xpath('title', "//h1[@id='chan_newsTitle']/text()")
    loader.add_value('url', response.url)
    loader.add_xpath('text', "//div[@id='chan_newsDetail']//text()")
    # Raw string: \d and \s are regex escapes, not string escapes.
    loader.add_xpath(
        "datetime",
        "//div[@class='chan_newsInfo_source']/span[@class='time']/text()",
        re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    loader.add_xpath(
        "source",
        "//div[@class='chan_newsInfo_source']/span[@class='source']/text()")
    loader.add_value('website', '中华网')
    yield loader.load_item()
def parse_item(self, response):
    """Populate a ``NewsItem`` from an article page using ``ChinaLoader``.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    loader = ChinaLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
    # Raw string: \d and \s are regex escapes, not string escapes. A plain
    # literal triggers an invalid-escape warning on recent Python versions.
    loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                     re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    # The source label shares the info <div> with the timestamp.
    loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()',
                     re='来源:(.*)')
    loader.add_value('website', '中华网')
    yield loader.load_item()
def parse_item(self, response):
    """Populate a ``NewsItem`` from an article page using ``ChinaLoader``.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    loader = ChinaLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
    # The URL is a literal value, not an XPath expression, so it must go
    # through add_value; add_xpath would try to evaluate it as a selector.
    loader.add_value('url', response.url)
    # Only the <div>'s own text nodes are selected here (single '/').
    loader.add_xpath('text', '//div[@id="chan_newsDetail"]/text()')
    # Raw string: \d and \s are regex escapes, not string escapes.
    loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                     re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    # The pattern intentionally keeps its leading space before the label.
    loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()',
                     re=' 来源:(.*)')
    loader.add_value('website', '中华网')
    yield loader.load_item()
def parse_item(self, response):
    """Extract one article from a tech.china.com page through ``ChinaLoader``.

    Args:
        response: The article page response; it is queried with XPath and
            its ``url`` attribute is stored on the item.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    title_xpath = '//*[@id="chan_newsTitle"]/text()'
    # position() < last() leaves out the final <p> of the detail block.
    body_xpath = '//*[@id="chan_newsDetail"]//p[position() < last()]/text()'
    time_xpath = '//*[@id="js-article-title"]//span[@class="time"]/text()'
    source_xpath = '//*[@id="js-article-title"]//span[@class="source"]/text()'

    loader = ChinaLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', title_xpath)
    loader.add_value('url', response.url)
    loader.add_xpath('text', body_xpath)
    # NOTE(review): the field is spelled 'datatime' here -- presumably it
    # matches the NewsItem declaration; confirm before renaming.
    loader.add_xpath('datatime', time_xpath)
    # Keep only what follows the source label.
    loader.add_xpath('source', source_xpath, re='来源:(.*)')
    loader.add_value('website', 'tech.china.com')

    article = loader.load_item()
    yield article