def parse(self, response):
    """Yield one ``RssItem`` for every ``<div>`` found in *response*.

    For each div, the item's title is the first anchor text beneath it and
    the item's link is the first anchor ``href`` beneath it.
    """
    for div in response.xpath('.//div'):
        entry = RssItem()
        anchor_text = div.xpath('.//a/text()').extract_first()
        anchor_href = div.xpath('.//a/@href').extract_first()
        entry.title = anchor_text
        entry.link = anchor_href
        yield entry
def parse_products(self, response):
    """Yield an ``RssItem`` for every element matched by ``.//*``.

    All items share the same fixed title; the link is the first anchor
    ``href`` found beneath the matched element.
    """
    for node in response.xpath('.//*'):
        entry = RssItem()
        href = node.xpath('.//a/@href').extract_first()
        entry.title = 'prueba linkedin'
        entry.link = href
        yield entry
def parse_products(self, response):
    """Yield one ``RssItem`` per ``<li>`` of the ``itemsList`` product list.

    For every list entry the item receives:

    * ``title``: the stripped text of the ``h3/a`` link, or ``None`` when
      the entry has no such text;
    * ``link``: the ``href`` of the same ``h3/a`` link;
    * ``description``: the text of the ``currencyPrice`` span.
    """
    entries = response.xpath('//ul[contains(@class, "itemsList")]/li')
    for entry in entries:
        raw_title = entry.xpath('.//h3/a/text()').extract_first()
        url = entry.xpath('.//h3/a/@href').extract_first()
        price = entry.xpath(
            './/span[@class="currencyPrice"]/text()').extract_first()

        product = RssItem()
        # An entry without heading text yields no title; keep it unset
        # instead of failing on ``None.strip()`` and losing the whole item.
        product.title = raw_title.strip() if raw_title is not None else None
        product.link = url
        product.description = price
        yield product
def parse(self, response):
    """Yield an ``RssItem`` for each matching feed post in *response*.

    Posts are the elements selected by
    ``.cff-item.author-lnp---liberal-national-party``.  A post that contains
    a ``.cff-video-post`` element takes its title from the poster image's
    ``alt`` attribute and uses the serialised post markup as description;
    any other post takes both from its ``.cff-post-desc`` element.
    """
    posts = response.css(".cff-item.author-lnp---liberal-national-party")
    for post in posts:
        is_video_post = bool(post.css(".cff-video-post"))

        item = RssItem()
        item.author = post.css("div.cff-page-name a::text").get()

        # The date text may be absent; leave pubDate unset rather than
        # failing on ``None.strip()``.
        published = post.css(".cff-date::text").get()
        item.pubDate = published.strip() if published is not None else None

        item.link = post.css(".cff-viewpost-facebook::attr(href)").get()
        # Reads back the value just stored on the link element.
        # NOTE(review): RssItem is defined outside this block; presumably
        # ``item.link.link`` is the URL string -- confirm.
        item.guid = item.link.link

        if is_video_post:
            item.title = post.css(".cff-poster::attr(alt)").get()
            # Serialise the post with .get(), mirroring the non-video
            # branch, so the description is markup text rather than the
            # selector object itself.
            item.description = post.get()
        else:
            item.title = post.css(".cff-post-desc::text").get()
            item.description = post.css(".cff-post-desc").get()
        yield item
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    The title comes from the page ``<title>``, the publication date from the
    ``article:published_time`` meta tag and the description from the first
    ``.entry-content`` element.  Link and guid are both the response URL and
    the author is a fixed string.
    """
    page_url = response.url
    page_title = response.css("title::text").get()
    published = response.css(
        'meta[property="article:published_time"]::attr(content)').get()
    body_html = response.css(".entry-content").get()

    entry = RssItem()
    entry.title = page_title
    entry.author = "Liberal Party of Western Australia"
    entry.link = page_url
    entry.guid = page_url
    entry.pubDate = published
    entry.description = body_html
    yield entry
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    * ``title``: content of the ``og:title`` meta tag;
    * ``link`` / ``guid``: the response URL;
    * ``pubDate``: stripped text of the first ``<h4>``, or ``None`` when
      the page has no such text;
    * ``author``: text of ``div.header h1``;
    * ``description``: all ``<p>`` elements inside
      ``div[style="width:100%"]`` concatenated without a separator.
    """
    item = RssItem()
    item.title = response.css(
        'meta[property="og:title"]::attr(content)').get()
    item.link = response.url
    item.guid = response.url

    # A page without <h4> text must not abort the whole item on
    # ``None.strip()``; leave the date unset in that case.
    date_text = response.css("h4::text").get()
    item.pubDate = date_text.strip() if date_text is not None else None

    item.author = response.css("div.header h1::text").get()
    item.description = "".join(
        response.css('div[style="width:100%"] p').getall())
    yield item
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    The title is the ``og:description`` meta content, the publication date
    is the first ``.timestampContent`` match and the description is every
    ``div[data-testid="post_message"]`` match joined together.  Link and
    guid are both the response URL and the author is a fixed string.
    """
    page_url = response.url
    headline = response.css(
        'meta[property="og:description"]::attr(content)').get()
    timestamp = response.css(".timestampContent").get()
    message_blocks = response.css('div[data-testid="post_message"]').getall()

    entry = RssItem()
    entry.title = headline
    entry.author = "South Australian Labor"
    entry.link = page_url
    entry.guid = page_url
    entry.pubDate = timestamp
    entry.description = "".join(message_blocks)
    yield entry
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    The title is the ``og:title`` meta content, the author is every
    ``.contributor .name`` text joined with ``" & "``, the publication date
    is the ``article:published_time`` meta content and the description is
    the first ``.content`` element.  Link and guid are the response URL.
    """
    page_url = response.url
    headline = response.css('meta[property="og:title"]::attr(content)').get()
    contributor_names = response.css(".contributor .name::text").getall()
    published = response.css(
        'meta[property="article:published_time"]::attr(content)').get()
    body_html = response.css(".content").get()

    entry = RssItem()
    entry.title = headline
    entry.author = " & ".join(contributor_names)
    entry.pubDate = published
    entry.link = page_url
    entry.guid = page_url
    entry.description = body_html
    yield entry
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    * ``title``: the page ``<title>`` text with everything after the last
      ``" - "`` removed, or ``None`` when there is no title text;
    * ``author``: a fixed string;
    * ``link`` / ``guid``: the response URL;
    * ``pubDate``: content of the ``article:published_time`` meta tag;
    * ``description``: the first ``.content-body`` element with the span
      running from its ``<h2>`` heading to a ``<time>`` element removed,
      or ``None`` when the element is missing.
    """
    item = RssItem()

    # Guard the chained call: a missing <title> would otherwise raise on
    # ``None.rsplit`` and discard the whole item.
    raw_title = response.css("title::text").get()
    item.title = (
        raw_title.rsplit(" - ", 1)[0] if raw_title is not None else None
    )

    item.author = "Tasmanian Labor"
    item.link = response.url
    item.guid = response.url
    item.pubDate = response.css(
        'meta[property="article:published_time"]::attr(content)').get()

    description = response.css(".content-body").get()
    if description is not None:
        # re.sub rejects None, so only clean up when there is a body.
        description = re.sub(
            r"<h2>.*?</h2>.*<time>.*?</time>", "", description)
    item.description = description
    yield item
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    * ``title``: the page ``<title>`` text up to the first ``" | "``, or
      ``None`` when there is no title text;
    * ``link`` / ``guid``: the response URL;
    * ``pubDate``: the quoted timestamp following ``field_news_date:`` in
      the page's script text, without the quotes, or ``None`` when no
      script contains such a field;
    * ``description``: all ``<p>`` elements under
      ``div.rpl-markup__inner`` concatenated without a separator;
    * ``author``: a fixed string.
    """
    date_pattern = (
        r'field_news_date:"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}"'
    )
    item = RssItem()

    # Guard the chained call: a missing <title> would otherwise raise on
    # ``None.split`` and discard the whole item.
    raw_title = response.css("title::text").get()
    item.title = raw_title.split(" | ")[0] if raw_title is not None else None

    item.link = response.url
    item.guid = response.url

    date_field = response.css("script::text").re_first(date_pattern)
    if date_field is not None:
        # Split only on the first colon so the colons inside the timestamp
        # survive, then drop the surrounding double quotes.
        item.pubDate = date_field.split(":", maxsplit=1)[-1].strip('"')
    else:
        item.pubDate = None

    item.description = "".join(
        response.css("div.rpl-markup__inner p").getall())
    item.author = "State Government of Victoria"
    yield item
def parse_item(self, response):
    """Build and yield a single ``RssItem`` describing *response*.

    * ``title``: text of the first ``font[size="5"]`` element;
    * ``link`` / ``guid``: the response URL;
    * ``author``: every ``font[size="6"]`` text joined with ``" - "`` and
      stripped;
    * ``pubDate``: the first ``D Month YYYY``-shaped date found in a
      ``.paragraph`` element;
    * ``description``: the first ``div.content-wrap .paragraph`` element
      with the bold title heading and the date line removed, or ``None``
      when the element is missing.
    """
    item = RssItem()
    item.title = response.css('font[size="5"]::text').get()
    item.link = response.url
    item.guid = response.url
    item.author = " - ".join(
        response.css('font[size="6"]::text').getall()).strip()

    re_date = r"\d{1,2} [A-Z][a-z]+ \d{4}"
    item.pubDate = response.css(".paragraph").re_first(re_date)

    description = response.css("div.content-wrap .paragraph").get()
    if description is not None:
        # The title is page content, not a pattern: escape it so characters
        # such as "?", "(" or "+" are matched literally instead of being
        # interpreted as regex syntax (or making the pattern invalid).
        # NOTE(review): RssItem is defined outside this block; presumably
        # ``item.title.title`` is the title string stored above -- confirm.
        title_pattern = re.escape(str(item.title.title))
        heading_pattern = (
            "<strong>(<span .*?>)?<font .*?>"
            + title_pattern
            + "(<br>)?</font>(</span>)?</strong><br>"
        )
        description = re.sub(heading_pattern, "", description)
        description = re.sub(re_date + "<br><br>", "", description)
    item.description = description
    yield item
def parse(self, response):
    """Yield one ``RssItem`` per text node under ``.list-group-item``.

    Only the item's title is set; it holds the extracted text of the node.
    """
    text_nodes = response.css('.list-group-item ::text')
    for text_node in text_nodes:
        entry = RssItem()
        entry.title = text_node.extract()
        yield entry
def __init__(self, *args, **kwargs):
    """Initialise the parent class and attach a fresh ``RssItem``.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser.  The new item is stored on ``self.rss``.
    """
    # Zero-argument super() binds to the class that defines this method.
    # super(self.__class__, self) resolves to the *runtime* class instead,
    # so any subclass would re-enter this __init__ and recurse forever.
    super().__init__(*args, **kwargs)
    self.rss = RssItem()