def get_main_link(self, response):
    channel = response.meta['channel']
    node_list = response.xpath("//*[@class='vervideo-bd']")
    for node in node_list:
        # Build a fresh item per node so every request carries its own copy,
        # instead of all requests sharing (and overwriting) one item instance.
        item = NewsItem()
        item['channel'] = channel
        a_link = node.xpath("./a/@href").extract_first()
        a_link = response.urljoin(a_link)
        title = node.xpath(
            "./a/div[@class='vervideo-title']/text()").extract_first()
        duration = node.xpath(
            "./a/div[@class='vervideo-img']/div[@class='cm-duration']/text()"
        ).extract_first()
        cut_url = node.xpath(
            "./a/div[@class='vervideo-img']/div[@class='verimg-view']/div[@class='img']/@style"
        ).extract_first()
        cut_url = re.findall(r'url\((.*?)\);', cut_url)[0]
        authorname = node.xpath(
            "./div[@class='actcont-auto']/a/text()").extract_first()
        print(title, authorname, duration, cut_url)
        if a_link and title:
            item['title'] = title
            item['duration'] = duration
            item['cut_url'] = cut_url
            item['name'] = authorname
            yield scrapy.Request(url=a_link,
                                 callback=self.parse_detail,
                                 meta={'item': item})
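# NOTE: hypothetical sketch, not part of the original source. get_main_link above
# yields a Request with callback=self.parse_detail and the partially-filled item in
# meta; a minimal detail callback would simply recover that item, record the
# detail-page URL (the 'url' field is used by the other parsers below), and yield it.
# Any further fields would depend on the detail page and are not shown here.
def parse_detail(self, response):
    # Recover the item passed along in the request meta
    item = response.meta['item']
    item['url'] = response.url
    yield item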
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '/html/body/main/section/article/div/div[1]/div[2]/div/div[1]/h1/text()'
    ).extract()[0].strip()
    item['pub_time'] = response.xpath(
        '/html/body/main/section/article/div/div[1]/div[2]/div/div[2]/ul/li[2]/text()'
    ).extract()[0].strip()
    item['content_code'] = response.xpath(
        '/html/body/main/section/article/div/div[2]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '/html/body/div[5]/div/div[1]/div/div[1]/h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '/html/body/div[5]/div/div[1]/div/div[1]/div[5]/span[1]/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '/html/body/div[5]/div/div[1]/div/div[1]/div[6]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="site-content"]/div/div[2]/h1/text()').extract()[0]
    item['pub_time'] = response.xpath(
        '//*[@id="site-content"]/div/div[1]/div[2]/div/div/div/a[2]/text()').extract()[0][4:]
    item['content_code'] = response.xpath(
        '//*[@id="site-content"]/div/div[3]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="wrapper"]/article/div/header/h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="wrapper"]/article/div/header/time/a/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '//*[@id="wrapper"]/article/div/div').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="artibodyTitle"]/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="pub_date"]/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '//*[@id="artibody"]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="main"]/div[2]/h2/text()').extract()[0]
    item['pub_time'] = response.xpath(
        '//*[@id="main"]/div[2]/div[1]/p/span[2]/text()').extract()[0]
    item['content_code'] = response.xpath(
        '//*[@id="main"]/div[2]/div[2]').extract()[0]
    # Yield each item
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//span[@class="time"]/text()').extract()[0][:10]
    item['content_code'] = response.xpath('//article/div[2]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath('//h1').extract()[0].strip()
    item['pub_time'] = "now"
    item['content_code'] = response.xpath(
        '//section[@class="textblock"]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '//*[@id="Cnt-Main-Article-QQ"]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="article"]/div[1]/div[1]/h1/text()').extract()[0].strip()
    item['pub_time'] = response.url.split("/")[-2]
    item['content_code'] = response.xpath(
        '//*[@id="article"]/div[1]/div[2]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="articleTitle"]/text()').extract()[0].strip()
    item['pub_time'] = '2017-08-27'
    item['content_code'] = response.xpath(
        '//*[@id="articleContent"]/div/div[1]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
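# NOTE: hypothetical helper, not part of the original spiders. The parse_item
# callbacks in this listing index .extract()[0] directly, which raises IndexError
# whenever a page's layout changes and the XPath matches nothing. One defensive
# option is to go through extract_first() with a default, along these lines:
def first_or_default(selector_list, default=''):
    """Return the first extracted string, stripped, or a default value."""
    value = selector_list.extract_first()
    return value.strip() if value else default

# Usage sketch (field names as in the items above):
#     item['title'] = first_or_default(response.xpath('//h1/text()'))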
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="artical_topic"]/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="artical_sth"]/p/span[1]/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '//*[@id="main_content"]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//h1/span[2]/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//time[1]/text()').extract()[0].strip()
    item['content_code'] = response.xpath(
        '//main/div/div[1]/div/article/div[1]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="conTit"]/h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="artInfo"]/text()').extract()[0].strip()[:11]
    item['content_code'] = response.xpath(
        '//*[@id="the_content"]/div[3]/div[2]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="activity-name"]/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="post-date"]/text()').extract()[0]
    item['content_code'] = response.xpath(
        '//*[@id="js_content"]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="epContentLeft"]/h1/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="epContentLeft"]/div[1]/text()').extract()[0].strip()[:10]
    item['content_code'] = response.xpath(
        '//*[@id="endText"]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    # response.xpath() is used directly instead of wrapping the response in Selector()
    item['title'] = response.xpath(
        '/html/body/div[1]/div/div[1]/div/h2/text()').extract()[0].strip()
    item['pub_time'] = response.xpath(
        '/html/body/div[1]/div/div[1]/div/div[1]/span[4]/text()').extract()[0].strip()[:11]
    item['content_code'] = response.xpath(
        '//html/body/div[1]/div/div[1]/div/div[2]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="content"]//h1/text()').extract()[0].strip()
    # item['pub_time'] = response.url.split('/')[-3] + '-' + response.url.split('/')[-2]
    item['pub_time'] = response.xpath(
        '//*[@id="content"]/span/text()').re_first(r'\d{4}年\d+月\d+日')
    item['content_code'] = ''.join(
        response.xpath('//*[@id="content"]//div/p//text()').extract())
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//div[@id="singlepost-wrap"]/article/header/div[2]/h1/text()'
    ).extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//div[@id="singlepost-wrap"]/article/header/div[3]/div[1]/div[3]/a/time/text()'
    ).extract()[0].strip()
    item['content_code'] = response.xpath(
        '//div[@id="entry-content"]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="main-container"]/div/div/div/div[1]/div/div[1]/h2/text()'
    ).extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="main-container"]/div/div/div/div[1]/div/div[1]/p/span/span/text()[2]'
    ).extract()[0].strip()[5:15]
    item['content_code'] = response.xpath(
        '//*[@id="contents"]').extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="container"]/section/div/div[1]/div[2]/h1/text()'
    ).extract()[0].strip()
    item['pub_time'] = response.xpath(
        '//*[@id="container"]/section/div/div[1]/div[2]/p/em[5]/text()'
    ).extract()[0].strip()
    item['content_code'] = response.xpath(
        '//*[@id="container"]/section/div/div[1]/div[4]').extract()[0]
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    # Locate the article node by XPath
    node = response.xpath('//*[@id="articles-show"]/article')
    # Extract the content div, title, publication time, and URL
    content_code = response.xpath('//*[@id="js-article-content"]').extract()
    title = node.xpath('./h1/text()').extract()
    pub_time = node.xpath('./div/span/text()').extract()
    item['content_code'] = content_code[0]
    item['title'] = title[0]
    item['pub_time'] = pub_time[0]
    item['url'] = response.url
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
def parse_item(self, response):
    item = NewsItem()
    item['url'] = response.url
    # Get the article id from the URL
    article_id = response.url.split('/')[-1][:6]
    # Build the XPaths that embed the article id
    title_xpath = '//*[@id="article' + article_id + '"]/div[2]/div[2]/h1/text()'
    pub_time_xpath = '//*[@id="article' + article_id + '"]/div[2]/div[2]/div[1]/div/span[1]/text()'
    content_xpath = '//*[@id="article_content' + article_id + '"]'
    item['title'] = response.xpath(title_xpath).extract()[0].strip()
    item['pub_time'] = response.xpath(pub_time_xpath).extract()[0].strip()
    item['content_code'] = response.xpath(content_xpath).extract()[0].strip()
    # Yield each extracted item to the pipeline for processing; execution resumes here afterwards
    yield item
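# NOTE: a minimal sketch of the NewsItem definition these callbacks assume; the
# project's actual items.py is not shown here, so the class below only declares
# the fields that actually appear in the parsers above.
import scrapy

class NewsItem(scrapy.Item):
    url = scrapy.Field()           # article / video page URL
    title = scrapy.Field()         # headline
    pub_time = scrapy.Field()      # publication time string
    content_code = scrapy.Field()  # raw HTML of the article body
    channel = scrapy.Field()       # channel name (video spider)
    duration = scrapy.Field()      # video duration (video spider)
    cut_url = scrapy.Field()       # cover image URL (video spider)
    name = scrapy.Field()          # author name (video spider)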