def parse(self, response):
    """Build a ``SoccerNewsItem`` from an article response and yield it.

    The body is the space-joined text of every paragraph under
    ``.news-text-content``. The space-joined text of the media-description
    paragraphs is then removed from that body.
    """
    select = response.css

    paragraphs = select(".news-text-content p ::text").extract()
    captions = select(
        ".news-text-content .news-media-description p ::text").extract()
    # Drop the media captions, which are also matched by the paragraph query.
    article_text = " ".join(paragraphs).replace(" ".join(captions), "")

    yield SoccerNewsItem(
        headline=select(".news-title ::text").extract_first(),
        subhead=select(".news-summary ::text").extract_first(),
        author=select(".author-name a ::text").extract_first(),
        body_text=article_text,
        url=response.url,
        datetime=select(".news-date ::attr(datetime)").extract_first(),
        source=self.name,
    )
def parse(self, response):
    """Build a ``SoccerNewsItem`` from an article response and yield it.

    The body is the space-joined text of the ``.article-body`` paragraphs
    with three kinds of boilerplate removed: related-link titles
    (``.zopo-title span``), the video fallback message (``p.vjs-no-js``)
    and the ``.mor-link`` text. This parser always emits an empty subhead.

    The author is stripped of surrounding whitespace, or left as ``None``
    when the page has no ``.author`` text.
    """
    url = response.url
    published = response.css(".post-date ::text").extract_first()
    headline = response.css(".post-title ::text").extract_first()
    subhead = ""

    # extract_first() returns None when nothing matches, so only strip
    # an actual string instead of crashing on pages without an author.
    author = response.css(".author ::text").extract_first()
    if author is not None:
        author = author.strip()

    body_text = " ".join(response.css('.article-body p ::text').extract())

    # Related-link titles are removed in runs of three consecutive text
    # nodes joined by spaces.
    # NOTE(review): presumably each related link renders as three text
    # nodes; confirm against the page markup.
    related_parts = response.css('.zopo-title span ::text').extract()
    for start in range(0, len(related_parts), 3):
        related_text = " ".join(related_parts[start:start + 3])
        body_text = body_text.replace(related_text, "")

    video_text = " ".join(response.css("p.vjs-no-js ::text").extract())
    more_text = " ".join(response.css(".mor-link ::text").extract())
    body_text = body_text.replace(video_text, "")
    body_text = body_text.replace(more_text, "")

    # NOTE: text from embedded tweets / Instagram posts is not removed.

    yield SoccerNewsItem(
        headline=headline,
        subhead=subhead,
        author=author,
        body_text=body_text,
        url=url,
        datetime=published,
        source=self.name,
    )
def parse(self, response):
    """Build a ``SoccerNewsItem`` from an article response and yield it.

    The subhead is every ``h2`` text node, stripped and joined with spaces.
    The body is the space-joined ``.editor`` paragraph text, minus the first
    ``.relations`` paragraph text (when present) and the joined
    ``.box-left-55`` paragraph text.
    """
    css = response.css

    subhead = " ".join(piece.strip() for piece in css("h2 ::text").extract())

    text = " ".join(css('.editor p ::text').extract())
    related = css('.relations p ::text').extract_first()
    if related:
        text = text.replace(related, "")
    boxed = " ".join(css('.box-left-55 p ::text').extract())
    text = text.replace(boxed, "")

    # NOTE: text from embedded tweets / Instagram posts is not removed.

    yield SoccerNewsItem(
        headline=css("h1 ::text").extract_first(),
        subhead=subhead,
        author=css(".author-link ::text").extract_first(),
        body_text=text,
        url=response.url,
        datetime=css(".date ::attr(datetime)").extract_first(),
        source=self.name,
    )
def parse(self, response):
    """Build a ``SoccerNewsItem`` from an article response and yield it.

    The body is the space-joined text of the ``.story-leaf-txt-p``
    paragraphs, with the space-joined related-news epigraph text removed.
    """
    def first(query):
        # First match for the CSS query, or None when nothing matches.
        return response.css(query).extract_first()

    def joined(query):
        # All matches for the CSS query, joined with single spaces.
        return " ".join(response.css(query).extract())

    related = joined('p.story-leaf-relatednews-epigraph ::text')
    fields = {
        "headline": first(".story-leaf-title ::text"),
        "subhead": first(".story-leaf-subtitle ::text"),
        "author": first(".story-leaf-author-link ::text"),
        "body_text": joined(".story-leaf-txt-p p ::text").replace(related, ""),
        "url": response.url,
        "datetime": first(".story-leaf-datetime ::attr(datetime)"),
        "source": self.name,
    }
    yield SoccerNewsItem(**fields)
def parse(self, response):
    """Build a ``SoccerNewsItem`` from an article response and yield it.

    The body is built from the ``.articleBodyText`` paragraph text nodes,
    each stripped and then joined with spaces. The subhead comes from
    ``.lead-asset-caption``; when it is present, its text is removed from
    the body.
    """
    url = response.url
    published = response.css(
        ".component-content time::attr(datetime)").extract_first()
    headline = response.css("h1.headline__heading ::text").extract_first()
    subhead = response.css(".lead-asset-caption ::text").extract_first()
    author = response.css(".byline__author-name a ::text").extract_first()

    body_parts = response.css(".articleBodyText p ::text").extract()
    body_text = " ".join(part.strip() for part in body_parts)

    # extract_first() returns None when the caption is missing, and
    # str.replace(None, "") raises TypeError, so only strip a real subhead.
    if subhead:
        body_text = body_text.replace(subhead, "")

    yield SoccerNewsItem(
        headline=headline,
        subhead=subhead,
        author=author,
        body_text=body_text,
        url=url,
        datetime=published,
        source=self.name,
    )