def parse(self, response):
    """Turn the matched blog-post anchors into ``NewsRelease`` items.

    The anchors come from the CSS selector below and are sliced with
    ``[:SCRAPE_LIMIT]``, where the limit is read from the spider settings.
    Each item gets the anchor text as ``title`` and the raw ``href``
    attribute as ``link`` (no prefix is added).

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: One ``NewsRelease`` per selected anchor, in selector order.
    """
    anchors = response.css('div.blog-post-row div.node-internal-blog-post div.group-header div div.field-items div a')
    max_items = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:max_items]:
        release = NewsRelease()
        title_text = anchor.css('::text').extract_first()
        release['title'] = title_text
        target = anchor.css('::attr(href)').extract_first()
        release['link'] = target
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the article title links.

    Selects ``div.node-article div.content a.title-link`` anchors, keeps the
    ``[:SCRAPE_LIMIT]`` slice (limit read from the spider settings), and
    stores the anchor text as ``title`` and ``https://www.energy.gov`` plus
    the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped: ``extract_first()`` returns
    ``None`` for them, and concatenating ``None`` onto the prefix would raise
    ``TypeError`` and lose every item on the page.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('div.node-article div.content a.title-link')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        # presumably hrefs are site-relative paths — confirm against the page
        release['link'] = "https://www.energy.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the custom-title heading links.

    Selects the anchors matched by the CSS selector below, keeps the
    ``[:SCRAPE_LIMIT]`` slice (limit read from the spider settings), and
    stores the anchor text as ``title`` and ``https://www.ed.gov`` plus the
    anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('div.item-list ul li div.views-field-field-custom-title h2 a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.ed.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Collect a ``NewsRelease`` for each ``div.hudpagepad div.genlink a``.

    The matches are sliced with ``[:SCRAPE_LIMIT]`` (limit read from the
    spider settings). ``title`` is the anchor's first text node and ``link``
    is its ``href`` attribute exactly as extracted, with no prefix added.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: One ``NewsRelease`` per selected anchor, in selector order.
    """
    matches = response.css('div.hudpagepad div.genlink a')
    selected = matches[:self.settings.attributes['SCRAPE_LIMIT'].value]

    collected = []
    for node in selected:
        entry = NewsRelease()
        entry['title'] = node.css('::text').extract_first()
        entry['link'] = node.css('::attr(href)').extract_first()
        collected.append(entry)
    return collected
def parse(self, response):
    """Build ``NewsRelease`` items from the ``h2.post-title a`` anchors.

    Keeps the ``[:SCRAPE_LIMIT]`` slice of the matches (limit read from the
    spider settings) and stores the anchor text as ``title`` and
    ``https://blog.dol.gov`` plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('h2.post-title a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://blog.dol.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the press-table row links.

    Uses an XPath query for the anchor in the second cell of each
    ``datarow`` row of the ``t-press`` table, keeps the ``[:SCRAPE_LIMIT]``
    slice (limit read from the spider settings), and stores the anchor text
    as ``title`` and ``https://www.treasury.gov`` plus the anchor ``href``
    as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``xpath()``; presumably the Scrapy
            response for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.xpath("//table[@class='t-press']/tr[contains(@class, 'datarow')]/td[2]/a")
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.treasury.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Queue a follow-up request for the last ``SCRAPE_LIMIT`` event links.

    Selects ``div.event-desc a.read`` anchors and takes the trailing
    ``SCRAPE_LIMIT`` of them (limit read from the spider settings). For each
    one a ``NewsRelease`` carrying only ``link`` is created and attached to a
    ``scrapy.Request`` under ``meta['item']``; the request is handled by
    ``self.parse_title``, which presumably fills in the title (defined
    outside this block — confirm).

    Two edge cases are handled explicitly:

    * A limit of ``0`` yields no requests. A bare ``[-limit:]`` would return
      every anchor in that case because ``-0 == 0``.
    * Anchors without an ``href`` are skipped; passing ``None`` as the
      request URL makes ``scrapy.Request`` raise.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The ``scrapy.Request`` objects to schedule, in selector order.
    """
    anchors = response.css("div.event-desc a.read")
    limit = self.settings.attributes['SCRAPE_LIMIT'].value
    if limit == 0:
        selected = []
    else:
        selected = anchors[-limit:]

    requests = []
    for anchor in selected:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # No URL to request; skip instead of letting Request() raise.
            continue
        release = NewsRelease()
        release['link'] = href
        request = scrapy.Request(release['link'], callback=self.parse_title)
        # Hand the partially-filled item to the callback.
        request.meta['item'] = release
        requests.append(request)
    return requests
def parse(self, response):
    """Queue a follow-up request for every ``div.field-item a`` link.

    No limit is applied here: every matched anchor with an ``href`` produces
    one ``scrapy.Request``. Each request carries a ``NewsRelease`` holding
    only ``link`` under ``meta['item']`` and is handled by
    ``self.parse_west_wing_read_title`` (defined outside this block;
    presumably it fills in the title — confirm).

    Anchors without an ``href`` are skipped; passing ``None`` as the request
    URL makes ``scrapy.Request`` raise and would abort the whole page.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The ``scrapy.Request`` objects to schedule, in selector order.
    """
    requests = []
    for anchor in response.css('div.field-item a'):
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # No URL to request; skip instead of letting Request() raise.
            continue
        release = NewsRelease()
        release['link'] = href
        request = scrapy.Request(
            release['link'], callback=self.parse_west_wing_read_title)
        # Hand the partially-filled item to the callback.
        request.meta['item'] = release
        requests.append(request)
    return requests
def parse(self, response):
    """Queue a follow-up request for every listed ``article`` element.

    Iterates ``div.page-results__wrap article`` matches without applying a
    limit. For each one the first ``href`` found via ``::attr(href)`` becomes
    the ``link`` of a new ``NewsRelease``, which is attached to a
    ``scrapy.Request`` under ``meta['item']`` and handled by
    ``self.whitehouse_news_title`` (defined outside this block; presumably it
    fills in the title — confirm).

    Articles that yield no ``href`` are skipped; passing ``None`` as the
    request URL makes ``scrapy.Request`` raise and would abort the page.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The ``scrapy.Request`` objects to schedule, in selector order.
    """
    requests = []
    for article in response.css('div.page-results__wrap article'):
        # NOTE(review): the href is queried on the <article> node itself, not
        # on a specific anchor — confirm this picks the intended link.
        href = article.css('::attr(href)').extract_first()
        if href is None:
            # No URL to request; skip instead of letting Request() raise.
            continue
        release = NewsRelease()
        release['link'] = href
        request = scrapy.Request(
            release['link'], callback=self.whitehouse_news_title)
        # Hand the partially-filled item to the callback.
        request.meta['item'] = release
        requests.append(request)
    return requests
def parse(self, response):
    """Build ``NewsRelease`` items from the press-release table title links.

    Selects the anchors matched by the CSS selector below, keeps the
    ``[:SCRAPE_LIMIT]`` slice (limit read from the spider settings), and
    stores the anchor text as ``title`` and ``https://www.transportation.gov``
    plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css(
        'div.view-display-id-page_press_releases div table tbody tr td.views-field-title a'
    )
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.transportation.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from ``l-wrap`` anchors targeting ``_self``.

    The XPath is a union of two shapes: anchors directly under a
    ``div.l-wrap`` and anchors inside a ``p`` under a ``div.l-wrap``, both
    with ``target='_self'``. The ``[:SCRAPE_LIMIT]`` slice of the result is
    kept (limit read from the spider settings). ``title`` is the anchor
    text and ``link`` is ``https://www.state.gov`` plus the anchor ``href``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``xpath()``; presumably the Scrapy
            response for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.xpath(
        "//div[@class='l-wrap']/a[@target='_self']|//div[@class='l-wrap']/p/a[@target='_self']"
    )
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.state.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the ``p.speech-title a`` anchors.

    Keeps the ``[:SCRAPE_LIMIT]`` slice of the matches (limit read from the
    spider settings) and stores the anchor text as ``title`` and
    ``https://www.va.gov/opa/pressrel/`` plus the anchor ``href`` as
    ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('p.speech-title a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        # The prefix ends in a directory, so hrefs are presumably relative
        # file names rather than root-relative paths — confirm.
        release['link'] = "https://www.va.gov/opa/pressrel/" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the ``article.press-post h3 a`` links.

    Keeps the ``[:SCRAPE_LIMIT]`` slice of the matches (limit read from the
    spider settings) and stores the anchor text as ``title`` and
    ``https://www.donaldjtrump.com`` plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('article.press-post h3 a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.donaldjtrump.com" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the ``div.blog-header h1.title a`` links.

    Keeps the ``[:SCRAPE_LIMIT]`` slice of the matches (limit read from the
    spider settings) and stores the anchor text as ``title`` and
    ``https://www.transportation.gov`` plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('div.blog-header h1.title a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.transportation.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the newsroom table title links.

    Selects the anchors matched by the CSS selector below, keeps the
    ``[:SCRAPE_LIMIT]`` slice (limit read from the spider settings), and
    stores the anchor text as ``title`` and ``https://www.sba.gov`` plus the
    anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css(
        'div.view-about-newsroom div table tbody tr td.views-field-title a'
    )
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.sba.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the news-release view row links.

    Selects the anchors matched by the CSS selector below, keeps the
    ``[:SCRAPE_LIMIT]`` slice (limit read from the spider settings), and
    stores the anchor text as ``title`` and ``https://www.dhs.gov`` plus the
    anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css(
        'div.view-news-releases-updated div.view-content div.views-row div span a'
    )
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.dhs.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the press-release list row links.

    Selects ``tr.pr-list-page-row td.views-field-field-display-title a``
    anchors, keeps the ``[:SCRAPE_LIMIT]`` slice (limit read from the spider
    settings), and stores the anchor text as ``title`` and
    ``https://www.sec.gov`` plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css(
        'tr.pr-list-page-row td.views-field-field-display-title a'
    )
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.sec.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the ``span.field-content a`` anchors.

    Keeps the ``[:SCRAPE_LIMIT]`` slice of the matches (limit read from the
    spider settings) and stores the anchor text as ``title`` and
    ``https://www.justice.gov`` plus the anchor ``href`` as ``link``.

    Anchors without an ``href`` are skipped, because ``extract_first()``
    returns ``None`` for them and ``str + None`` raises ``TypeError``.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('span.field-content a')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        release = NewsRelease()
        release['title'] = anchor.css('::text').extract_first()
        release['link'] = "https://www.justice.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the ``ul.topic_list li`` entries.

    Unlike a plain anchor listing, each match is a whole list item: the
    ``title`` comes from the text of its ``span.topicDescription`` and the
    ``link`` is ``https://www.cia.gov`` plus the ``href`` of the anchor
    inside its ``span.summary``. The ``[:SCRAPE_LIMIT]`` slice of the list
    items is processed (limit read from the spider settings).

    List items that have no ``span.summary a`` href are skipped:
    ``extract_first()`` returns ``None`` for them, and concatenating ``None``
    onto the prefix would raise ``TypeError`` and lose the whole page.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    entries = response.css('ul.topic_list li')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for entry in entries[:limit]:
        href = entry.css('span.summary a::attr(href)').extract_first()
        if href is None:
            # Entry has no summary link; skip instead of crashing.
            continue
        release = NewsRelease()
        release['title'] = entry.css(
            'span.topicDescription::text').extract_first()
        release['link'] = "https://www.cia.gov" + href
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the tweets in the stream list.

    Processes the ``[:SCRAPE_LIMIT]`` slice of the
    ``ol#stream-items-id li div.tweet`` matches (limit read from the spider
    settings). For each tweet:

    * ``title`` is assembled from the text nodes under the tweet text
      container, excluding nodes that sit inside an anchor whose class
      contains ``u-hidden``. Fragments containing a newline are dropped and
      the remainder is concatenated with no separator.
    * ``link`` is ``https://www.twitter.com`` plus the
      ``data-permalink-path`` attribute.

    Tweets without a ``data-permalink-path`` are skipped:
    ``extract_first()`` returns ``None`` for them, and concatenating ``None``
    onto the prefix would raise ``TypeError`` and lose the whole page.

    Args:
        response: Object exposing ``css()``; presumably the Scrapy response
            for the timeline page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    tweets = response.css('ol#stream-items-id li div.tweet')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value

    releases = []
    for tweet in tweets[:limit]:
        permalink = tweet.css('::attr(data-permalink-path)').extract_first()
        if permalink is None:
            # No permalink to build the link from; skip instead of crashing.
            continue
        fragments = tweet.xpath("div[@class='content']/div[@class='js-tweet-text-container']//text()[not(ancestor-or-self::a[contains(@class, 'u-hidden')])]").extract()
        release = NewsRelease()
        # Fragments containing a newline are discarded whole, not trimmed.
        release['title'] = "".join(
            fragment for fragment in fragments if "\n" not in fragment
        )
        release['link'] = 'https://www.twitter.com' + permalink
        releases.append(release)
    return releases
def parse(self, response):
    """Build ``NewsRelease`` items from the video title links.

    Processes the ``[:SCRAPE_LIMIT]`` slice of the
    ``h3.yt-lockup-title a.yt-uix-tile-link`` matches (limit read from the
    spider settings). ``title`` is the anchor text, prefixed with
    ``**TRUMP TV**`` when ``response.url`` contains the id
    ``UCAql2DyGU2un1Ei2nMYsqOA``. ``link`` is ``https://www.youtube.com``
    plus the anchor ``href``.

    Missing values no longer abort the page:

    * Anchors without an ``href`` are skipped (``str + None`` would raise
      ``TypeError``).
    * An anchor with no text keeps ``title`` as ``None`` instead of crashing
      on the prefix concatenation.

    Args:
        response: Object exposing ``css()`` and ``url``; presumably the
            Scrapy response for the listing page.

    Returns:
        list: The populated ``NewsRelease`` items, in selector order.
    """
    anchors = response.css('h3.yt-lockup-title a.yt-uix-tile-link')
    limit = self.settings.attributes['SCRAPE_LIMIT'].value
    # The URL does not change per anchor, so decide on the prefix once.
    add_prefix = 'UCAql2DyGU2un1Ei2nMYsqOA' in response.url

    releases = []
    for anchor in anchors[:limit]:
        href = anchor.css('::attr(href)').extract_first()
        if href is None:
            # Nothing to link to; skip instead of crashing on str + None.
            continue
        title = anchor.css('::text').extract_first()
        if add_prefix and title is not None:
            title = '**TRUMP TV**' + title
        release = NewsRelease()
        release['title'] = title
        release['link'] = "https://www.youtube.com" + href
        releases.append(release)
    return releases