def parse(self, response): item = MycrawlerItem() for news in response.xpath('//div[@class="region region-promoted"]'): item['URL'] = response.url item['TITLE'] = news.xpath( './/div[@class="views-field views-field-title text-justify"]/span/a/text()' ).extract_first() item['BODY'] = news.xpath( './/div[@class="views-field views-field-body text-justify padding-right-zero"]/span/text()' ).extract_first() item['DATE'] = news.xpath( './/div[@class="views-field views-field-field-date-publishing"]/div/text()' ).extract_first() yield item for news in response.xpath('//div[@id="block-system-main"]//li'): item['URL'] = response.url item['TITLE'] = news.xpath( './div[2]/span/a/text()').extract_first() item['BODY'] = news.xpath('./div[3]/span/text()').extract_first() item['DATE'] = news.xpath('./div[4]/div/text()').extract_first() yield item for news in response.xpath( '//div[@id="content-footer-inside"]//ul[@class="item"]/li'): item['URL'] = response.url item['TITLE'] = news.xpath( './div[2]/span/a/text()').extract_first() item['BODY'] = news.xpath('./div[3]/span/text()').extract_first() item['DATE'] = news.xpath('./div[4]/div/text()').extract() yield item
def parse(self, response): item = MycrawlerItem() all_News = response.xpath('//a[@class="title-link"]/ancestor::div[1]') for news in all_News: relative_url = news.xpath('a/@href').extract_first() absolute_url = response.urljoin(relative_url) title = news.xpath('./a//span/text()').extract_first() date = news.xpath( './/li[@class="mini-info-list__item"]/div[@data-datetime]/text()' ).extract_first() yield Request(absolute_url, callback=self.parse_page, meta={ 'URL': absolute_url, 'TITLE': title, 'DATE': date }) # relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first() # absolute_next_url = "https://newyork.craigslist.org" + relative_next_url next_page_url = response.xpath( '//li[@class="next"]/a/@href').extract_first() if next_page_url is not None: yield scrapy.Request(response.urljoin(next_page_url))
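# On Scrapy 1.7+, cb_kwargs is a cleaner channel for this hand-off than meta;
# a minimal sketch of the same pattern under that API, with the spider
# structure and field names assumed from the snippets above:
def parse(self, response):
    for news in response.xpath('//a[@class="title-link"]/ancestor::div[1]'):
        absolute_url = response.urljoin(news.xpath('./a/@href').extract_first())
        title = news.xpath('./a//span/text()').extract_first()
        # Each cb_kwargs entry arrives as a named argument of the callback.
        yield scrapy.Request(absolute_url, callback=self.parse_page,
                             cb_kwargs={'url': absolute_url, 'title': title})

def parse_page(self, response, url, title):
    item = MycrawlerItem()
    item['URL'] = url
    item['TITLE'] = title
    yield item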
def parse_page(self, response):  # needs self: Scrapy calls this as a bound method
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.meta.get('TITLE')
    item['DATE'] = response.meta.get('DATE')
    item['BODY'] = response.xpath(
        '//div[@class="lts-txt2"]/text() | //div[@class="lts-txt2"]/div/text() | '
        '//div[@class="lts-txt2"]/p/text() | //div[@class="lts-txt2"]/div/p/text()'
    ).extract_first()  # the last alternative now takes the <p> text, not the element itself
    yield item
def parse_sub_news(self, response):
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.xpath(
        '//div[@class="main-news-heading visible-xs visible-sm"]/h1/text()'
    ).extract_first()  # was missing the (); without it the bound method itself was stored
    item['DATE'] = response.meta.get('DATE')
    item['BODY'] = response.xpath(
        '//div[@class="lts-txt2"]/text()').extract_first()
    return item
def parse_main_news(self, response):
    item = MycrawlerItem()
    item['URL'] = response.meta.get('URL')
    item['TITLE'] = response.xpath(
        '//div[@class="main-news-heading visible-xs visible-sm"]/h1/text()'
    ).extract_first()
    item['DATE'] = response.meta.get('DATE')
    item['BODY'] = response.xpath(
        '//div[@class="text-left w-300 editor-styles"]/p/text()'
    ).extract_first()
    return item
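# parse_sub_news() and parse_main_news() above read URL and DATE out of
# response.meta, so some listing callback must schedule them with those keys.
# A minimal sketch of that wiring; the listing XPath below is hypothetical and
# must be adjusted to the real page:
def parse(self, response):
    for link in response.xpath('//div[@class="news-listing"]//a/@href').extract():  # hypothetical selector
        url = response.urljoin(link)
        yield scrapy.Request(url, callback=self.parse_main_news,
                             meta={'URL': url, 'DATE': None})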
def parse(self, response): item = MycrawlerItem() for news in response.xpath('//a[@class="title-link"]/ancestor::div[1]'): item['URL'] = response.url item['TITLE'] = news.xpath('./a/h3[@class="title-link__title"]/span/text()').extract_first() item['BODY'] = news.xpath('./p[@class="eagle-item__summary"]/text()').extract_first() item['DATE'] = news.xpath('.//li[@class="mini-info-list__item"]/div[@data-datetime]/text()').extract_first() yield item # yield: collected as item, and it's come as item of output file, inforamtion need to collected need to include next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first() if next_page_url is not None: yield scrapy.Request(response.urljoin(next_page_url)) return item
def parse(self, response): item = MycrawlerItem() for news in response.xpath('//div[@class="article-content"]'): item['URL'] = response.url item['TITLE'] = news.xpath('./h2/a/text()').extract_first() item['BODY'] = news.xpath('./p/text()').extract_first() item['DATE'] = news.xpath('./span/a/text()').extract() yield item # yield: collected as item, and it's come as item of output file, inforamtion need to collected need to include # next_page_url = response.xpath('//a[contains(text(),"Next Page")]/@href').extract_first() # if next_page_url is not None: # yield scrapy.Request(response.urljoin(next_page_url)) return item
def parse(self, response): item = MycrawlerItem() for news in response.xpath('//div[@class="story-text"]'): item['URL'] = response.url item['TITLE'] = news.xpath('.//h4/a/text()').extract_first() item['BODY'] = news.xpath('.//p/text()').extract_first() item['DATE'] = news.xpath('.//div/span/text()').extract_first() yield item # yield: collected as item, and it's come as item of output file, inforamtion need to collected need to include #nextPages = response.xpath("//ul[@class='pagination']/li/a/@href") #for nextPage in nextPages: # next_page_url = response.urljoin(nextPage.extract()) # if next_page_url is not None: # yield scrapy.Request(response.urljoin(next_page_url)) return item
def parse(self, response): item = MycrawlerItem() for news in response.xpath('//div[@class="col-md-6 cat-ite"]'): item['URL'] = response.url item['TITLE'] = news.xpath('.//div[2]/a/text()').extract_first() item['BODY'] = news.xpath('.//div[3]/text()').extract_first() # item['DATE'] = news.xpath('').extract_first() yield item # yield: collected as item, and it's come as item of output file, inforamtion need to collected need to include nextPages = response.xpath( "//div[@class='page-nation']//li/a[contains(text(),'Next')]/@href") for nextPage in nextPages: next_page_url = response.urljoin(nextPage.extract()) if next_page_url is not None: yield scrapy.Request(response.urljoin(next_page_url)) return item
def parse(self, response):
    for news in response.xpath('//article'):
        item = MycrawlerItem()
        item['URL'] = response.url
        item['TITLE'] = news.xpath('./text()').extract_first()  # direct text nodes of <article>
        item['BODY'] = news.xpath(
            './/div[@class="entry-content"]/text()').extract_first()
        item['DATE'] = news.xpath(
            './/span[@class="entry-date post-date"]/abbr/text()').extract_first()
        # Each yielded item becomes one record in the output file.
        yield item

    # Follow every page link emitted by the WP-PageNavi widget.
    for next_page in response.xpath("//div[@class='wp-pagenavi iegradient']/a/@href"):
        yield scrapy.Request(response.urljoin(next_page.extract()))
def parse(self, response):
    # Absolute XPaths noted while inspecting the page:
    #   /html/body/main/div/div/div[1]/div/div[2]/div[1]
    #   /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[1]/div
    #   /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[4]/div
    #   /html/body/main/div/div/div[1]/div/div[2]/div[1]/div[3]/div/div/div[1]/a
    div_list = response.xpath("/html/body/main/div/div/div[1]/div/div[2]/div[1]")
    for div in div_list:
        # Note: xpath() returns a list of Selector objects; the string data we
        # want lives inside those objects, so an extract() call is needed to
        # pull it out. When xpath() matches several elements, extract() returns
        # the matching string for each of them as a list.
        content = div.xpath("./div/div/div/div[1]/a/text()").extract()
        item = MycrawlerItem()
        item["content"] = content
        print(content)
        print(item)
        yield item
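# Every snippet above assumes a MycrawlerItem declared in the project's
# items.py. A minimal definition covering the fields used here (the class name
# comes from the snippets; the field list is inferred from the assignments):
import scrapy

class MycrawlerItem(scrapy.Item):
    URL = scrapy.Field()
    TITLE = scrapy.Field()
    BODY = scrapy.Field()
    DATE = scrapy.Field()
    content = scrapy.Field()  # used only by the last snippet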