class ExpressSpider(scrapy.Spider):
    """Spider for football news articles from the Daily Express."""

    name = 'express'
    allowed_domains = ['express.co.uk']
    source = 'Daily Express'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("h3 ::text").extract_first(),
            author=css(".author span ::text").extract_first(),
            body_text=" ".join(css(".text-description p ::text").extract()),
            url=response.url,
            datetime=css("time ::attr(datetime)").extract_first(),
            source=self.name)
class ElDesmarqueSpider(scrapy.Spider):
    """Spider for football news articles from El Desmarque."""

    name = 'eldesmarque'
    allowed_domains = ['eldesmarque.com']
    source = 'El Desmarque'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, cutting inline <script> text from the body."""
        paragraphs = " ".join(
            response.css("#cuerpo-noticia p ::text").extract())
        # ::text also picks up script contents inside paragraphs; remove them.
        scripts = " ".join(
            response.css("#cuerpo-noticia p script ::text").extract())
        yield SoccerNewsItem(
            headline=response.css("h1.titulo ::text").extract_first(),
            subhead="",
            author=response.css(".autor span ::text").extract_first(),
            body_text=paragraphs.replace(scripts, ""),
            url=response.url,
            datetime=response.css(".fecha ::text").extract_first(),
            source=self.name)
class TheSunSpider(scrapy.Spider):
    """Spider for football news articles from The Sun."""

    name = 'thesun'
    allowed_domains = ['thesun.co.uk', 'thesun.ie']
    source = 'The Sun'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css(".article__content p ::text").extract()
        ]
        yield SoccerNewsItem(
            headline=response.css(".article__headline ::text").extract_first(),
            subhead=response.css(".article__subdeck p ::text").extract_first(),
            author=response.css(
                "span.article__author-name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css(
                ".article__published span ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://www.thesun.co.uk/sport/football/6976195/chelsea-complete-signing-mateo-kovacic-real-madrid-loan/
class EstadioDeportivoSpider(scrapy.Spider):
    """Spider for football news articles from Estadio Deportivo."""

    name = 'estadiodeportivo'
    allowed_domains = ['estadiodeportivo.com']
    source = 'Estadio Deportivo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, dropping embedded tweet text."""
        body = " ".join(response.css(".cuerpo_noticia p ::text").extract())
        # Embedded tweets are rendered inside the body; strip them out.
        tweet = " ".join(response.css(".twitter-tweet p ::text").extract())
        yield SoccerNewsItem(
            headline=response.css(".noticia h1 ::text").extract_first(),
            subhead="",
            author=response.css("span.autor_sup ::text").extract_first(),
            body_text=body.replace(tweet, ""),
            url=response.url,
            datetime=response.css("span.fecha_hora ::text").extract_first(),
            source=self.name)
class Futebol365Spider(scrapy.Spider):
    """Spider for football news articles from Futebol 365."""

    name = 'futebol365'
    allowed_domains = ['futebol365.pt']
    source = 'Futebol 365'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fixes over the original:
        - AttributeError when the ".data" byline node is missing
          (extract_first() returns None, which has no .split()).
        - IndexError when the byline contains no comma.
        - TypeError when the subhead is absent (None) and was passed
          straight to str.replace().
        """
        url = response.url
        headline = response.css(".titulo ::text").extract_first()
        subhead = response.css(".texto span.negrito ::text").extract_first()
        # Byline looks like "por <author>, <datetime>"; tolerate a missing
        # or malformed node instead of crashing.
        raw_byline = response.css(".data ::text").extract_first() or ""
        date_author = raw_byline.split(",")
        author = date_author[0].replace("por", "")
        datetime = date_author[1] if len(date_author) > 1 else ""
        body_text = " ".join(response.css(".texto p ::text").extract())
        if subhead:
            # The subhead is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text,
                                url=url, datetime=datetime,
                                source=self.name)
        yield notice
class ESPNSpider(scrapy.Spider):
    """Spider for football news articles from ESPN."""

    name = 'espn'
    allowed_domains = ['espn.com', 'espn.com.br', 'espn.co.uk']
    source = 'ESPN'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning related-news and promo text."""
        body = " ".join(response.css(".article-body p ::text").extract())
        # Related-article and inline promotional blocks leak into the body
        # selector; remove their text separately.
        related = " ".join(
            response.css(".article-body .editorial p ::text").extract())
        promo = " ".join(
            response.css(".article-body .inline-track p ::text").extract())
        body = body.replace(related, "")
        body = body.replace(promo, "")
        yield SoccerNewsItem(
            headline=response.css(".article-header h1 ::text").extract_first(),
            subhead="",
            author=response.css(".author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".timestamp ::attr(data-date)").extract_first(),
            source=self.name)
class DeporSpider(scrapy.Spider):
    """Spider for football news articles from Depor."""

    name = 'depor'
    allowed_domains = ['depor.com']
    source = 'Depor'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing media-caption text."""
        body = " ".join(
            response.css(".news-text-content p ::text").extract())
        # Media captions sit inside the body container; strip them out.
        captions = " ".join(
            response.css(".news-text-content .news-media-description p ::text"
                         ).extract())
        yield SoccerNewsItem(
            headline=response.css(".news-title ::text").extract_first(),
            subhead=response.css(".news-summary ::text").extract_first(),
            author=response.css(".author-name a ::text").extract_first(),
            body_text=body.replace(captions, ""),
            url=response.url,
            datetime=response.css(".news-date ::attr(datetime)").extract_first(),
            source=self.name)
class NoticiasAoMinutoSpider(scrapy.Spider):
    """Spider for football news articles from Notícias ao Minuto."""

    name = 'noticiasaominuto'
    allowed_domains = ['noticiasaominuto.com']
    source = 'Notícias ao Minuto'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing inline link ("Pub") text."""
        body = " ".join(response.css(".news-main-text p ::text").extract())
        # Anchor text inside body paragraphs is promotional; remove it.
        links = " ".join(
            response.css(".news-main-text p a ::text ").extract())
        yield SoccerNewsItem(
            headline=response.css(".news-headline ::text").extract_first(),
            subhead=response.css(".news-subheadline ::text").extract_first(),
            author=response.css(".author-hover ::text").extract_first(),
            body_text=body.replace(links, ""),
            url=response.url,
            datetime=response.css(".news-info-time ::text").extract_first(),
            source=self.name)
        # Known quirks (promo links, leading letter):
        # https://www.noticiasaominuto.com/desporto/1068563/fc-porto-prepara-nova-proposta-por-ntcham
class ABolaSpider(scrapy.Spider):
    """Spider for football news articles from A Bola."""

    name = 'abola'
    allowed_domains = ['abola.pt']
    source = 'A Bola'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1.titulo ::text").extract_first(),
            subhead="",
            author=css(".assinatura span ::text").extract_first(),
            body_text=" ".join(css(".corpo-noticia ::text").extract()),
            url=response.url,
            datetime=css(".data-hora span ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://www.abola.pt/Clubes/Noticias/Ver/739766/42
class FoxSportsSpider(scrapy.Spider):
    """Spider for football news articles from Fox Sports (Brazil)."""

    name = 'foxsports'
    allowed_domains = ['foxsports.com.br']
    source = 'Fox Sports'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem; Portuguese-language pages only."""
        if 'foxsports.com.br' not in response.url:
            return
        body = " ".join(response.css(".embed p ::text").extract())
        # Recurring promo sentence and everything after "Saiba mais: "
        # are boilerplate, not article text.
        body = body.replace(
            "Veja as últimas do Mercado da Bola e quem pode chegar ao seu time",
            "")
        body = body.split("Saiba mais: ")[0]
        yield SoccerNewsItem(
            headline=response.css("h1 ::text").extract_first(),
            subhead=response.css("h2 ::text").extract_first(),
            author=response.css(".author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".publish-date ::text").extract_first(),
            source=self.name)
        # Example page with related text:
        # https://www.foxsports.com.br/news/370761-barcelona-anuncia-venda-de-aleix-vidal-para-o-sevilla
class ElPaisSpider(scrapy.Spider):
    """Spider for football news articles from El País."""

    name = 'elpais'
    allowed_domains = ['elpais.com']
    source = 'El País'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css(".articulo-titulo ::text").extract_first(),
            subhead=css(".articulo-subtitulo ::text").extract_first(),
            # Author names may be split across nodes; join them all.
            author=" ".join(css(".autor-nombre ::text").extract()),
            body_text=" ".join(css(".articulo-cuerpo p ::text").extract()),
            url=response.url,
            datetime=css(".articulo-actualizado ::attr(datetime)").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://elpais.com/deportes/2018/08/24/actualidad/1535133090_339849.html
class MundoDeportivoSpider(scrapy.Spider):
    """Spider for football news articles from Mundo Deportivo."""

    name = 'mundodeportivo'
    allowed_domains = ['mundodeportivo.com']
    source = 'Mundo Deportivo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing related-news epigraphs."""
        body = " ".join(
            response.css(".story-leaf-txt-p p ::text").extract())
        related = " ".join(
            response.css('p.story-leaf-relatednews-epigraph ::text').extract())
        yield SoccerNewsItem(
            headline=response.css(".story-leaf-title ::text").extract_first(),
            subhead=response.css(".story-leaf-subtitle ::text").extract_first(),
            author=response.css(".story-leaf-author-link ::text").extract_first(),
            body_text=body.replace(related, ""),
            url=response.url,
            datetime=response.css(
                ".story-leaf-datetime ::attr(datetime)").extract_first(),
            source=self.name)
class BleacherReportSpider(scrapy.Spider):
    """Spider for football news articles from Bleacher Report."""

    name = 'bleacherreport'
    allowed_domains = ['bleacherreport.com']
    source = 'Bleacher Report'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css(".contentStream p ::text").extract()
        ]
        yield SoccerNewsItem(
            headline=response.css("header h1 ::text").extract_first(),
            subhead="",
            author=response.css(".authorInfo .name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css("header .date ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://bleacherreport.com/articles/2788607-arsenal-transfer-news-hector-herrera-dismisses-porto-exit-rumours
class FichajesComSpider(scrapy.Spider):
    """Spider for football news articles from Fichajes.com."""

    name = 'fichajes.com'
    allowed_domains = ['fichajes.com']
    source = 'Fichajes.com'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, dropping embedded tweet text."""
        body = " ".join(response.css(".article-text ::text").extract())
        tweet = " ".join(response.css(".twitter-tweet p ::text").extract())
        yield SoccerNewsItem(
            headline=response.css(".article h1 ::text").extract_first(),
            subhead=response.css(".article h2 ::text").extract_first(),
            author=response.css(".name ::text").extract_first(),
            body_text=body.replace(tweet, ""),
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages with embedded tweets:
        # http://www.fichajes.com/breves/chelsea-las-primeras-palabras-de-mateo-kovacic_138960
        # http://www.fichajes.com/breves/el-atletico-de-madrid-dice-adios-a-andre-moreira_138630
class BesoccerSpider(scrapy.Spider):
    """Spider for football news articles from Be Soccer."""

    name = 'besoccer'
    allowed_domains = ['besoccer.com']
    source = 'Be Soccer'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when "p.teaser" is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            "time span.ni-date::attr(content)").extract_first()
        headline = response.css("h1.ni-title ::text").extract_first()
        subhead = response.css("p.teaser ::text").extract_first()
        author = response.css("a.ni-author ::text").extract_first()
        body_text = " ".join(response.css(".ni-text-body p ::text").extract())
        if subhead:
            # The teaser is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class FootballEspanaSpider(scrapy.Spider):
    """Spider for football news articles from Football Espana."""

    name = 'footballespana'
    allowed_domains = ['football-espana.net']
    source = 'Football Espana'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing a recurring betting promo line."""
        body = " ".join(response.css(".content p ::text").extract())
        body = body.replace(
            "See the latest La Liga predictions and betting tips with Eurotips.co.uk",
            "")
        yield SoccerNewsItem(
            headline=response.css(".title ::text").extract_first(),
            subhead="",
            author=response.css(".submitted ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".date ::text").extract_first(),
            source=self.name)
class TelegraphSpider(scrapy.Spider):
    """Spider for football news articles from The Telegraph."""

    name = 'telegraph'
    allowed_domains = ['telegraph.co.uk']
    source = 'The Telegraph'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the lead-asset caption is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            ".component-content time::attr(datetime)").extract_first()
        headline = response.css("h1.headline__heading ::text").extract_first()
        subhead = response.css(".lead-asset-caption ::text").extract_first()
        author = response.css(".byline__author-name a ::text").extract_first()
        # Whitespace-trim every paragraph fragment before joining.
        bt_lst = [s.strip()
                  for s in response.css(".articleBodyText p ::text").extract()]
        body_text = " ".join(bt_lst)
        if subhead:
            # The caption is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class GoalSpider(scrapy.Spider):
    """Spider for football news articles from Goal."""

    name = 'goal'
    allowed_domains = ['goal.com']
    source = 'Goal'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning inline related-article widgets."""
        body = " ".join(response.css(".body p ::text").extract())
        related = " ".join(
            response.css(".widget-inline-related-articles ::text ").extract())
        yield SoccerNewsItem(
            headline=response.css(".article-headline ::text").extract_first(),
            subhead=response.css(".teaser ::text").extract_first(),
            author=response.css(".name ::text").extract_first(),
            body_text=body.replace(related, ""),
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages with embedded tweets:
        # http://www.goal.com/br/not%C3%ADcias/mateo-kovacic-ja-posa-com-a-camisa-do-chelsea-apos-ser/betcue5ge4391gyhxz5hxmkq0
        # http://www.goal.com/br/not%C3%ADcias/arsenal-prepara-oferta-irrecusavel-a-lucas-vazquez-do-real/ntw6y5r9ygyr1qhvkyk4pw7dp
class BBCSpider(scrapy.Spider):
    """Spider for football news articles from the BBC."""

    name = 'bbc'
    allowed_domains = ['bbc.com']
    source = 'BBC'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the introduction element is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(".abbr-on ::text").extract_first()
        headline = response.css(".story-headline ::text").extract_first()
        subhead = response.css(
            ".sp-story-body__introduction ::text").extract_first()
        author = ""
        body_text = " ".join(response.css(".story-body p ::text").extract())
        # Recurring BBC boilerplate sentences are not article text.
        body_text = body_text.replace(
            "Media playback is not supported on this device", "")
        body_text = body_text.replace(
            " Find all the latest football transfers on our dedicated page.",
            "")
        if subhead:
            # The introduction is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class TalksportSpider(scrapy.Spider):
    """Spider for football news articles from Talksport."""

    name = 'talksport'
    allowed_domains = ['talksport.com']
    source = 'Talksport'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css('.article__content p ::text').extract()
        ]
        yield SoccerNewsItem(
            headline=response.css(".article__headline ::text").extract_first(),
            subhead=response.css(".article__subdeck p ::text").extract_first(),
            author=response.css(
                "span.article__author-name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css(
                ".article__published span ::text").extract_first(),
            source=self.name)
class HitcSpider(scrapy.Spider):
    """Spider for football news articles from HITC."""

    name = 'hitc'
    allowed_domains = ['hitc.com']
    source = 'HITC'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Summary may be split across nodes; join them all.
        summary = " ".join(response.css(".post-summary ::text").extract())
        body = " ".join(response.css(".post-content p ::text").extract())
        # The summary is repeated inside the body; drop the duplicate.
        body = body.replace(summary, "")
        yield SoccerNewsItem(
            headline=response.css("header h1 ::text").extract_first(),
            subhead=summary,
            author=response.css(".post-author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".insidebar time::attr(datetime)").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # http://www.hitc.com/en-gb/2018/07/07/do-weekend-wouldnt-pay-5p-west-ham-fans-react-to-haris-seferovic/
class OJogoSpider(scrapy.Spider):
    """Spider for football news articles from O Jogo."""

    name = 'ojogo'
    allowed_domains = ['ojogo.pt']
    source = 'O Jogo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the intro element is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            ".t-a-info-1 time ::attr(datetime)").extract_first()
        headline = response.css(".t-i h1 ::text").extract_first()
        subhead = response.css(".t-a-c-intro-1 ::text").extract_first()
        author = response.css(".t-a-info-author ::text").extract_first()
        # Whitespace-trim every paragraph fragment before joining.
        bt_lst = [s.strip()
                  for s in response.css(".t-a-c-wrap p ::text").extract()]
        body_text = " ".join(bt_lst)
        if subhead:
            # The intro is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class SportSpider(scrapy.Spider):
    """Spider for football news articles from Sport."""

    name = 'sport'
    allowed_domains = ['sport.es', 'sport-english.com']
    source = 'Sport'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning related-news and side-box text."""
        # Subhead fragments are trimmed individually, then joined.
        subhead = " ".join(
            part.strip() for part in response.css("h2 ::text").extract())
        body = " ".join(response.css('.editor p ::text').extract())
        related = response.css('.relations p ::text').extract_first()
        if related:
            body = body.replace(related, "")
        box = " ".join(response.css('.box-left-55 p ::text').extract())
        body = body.replace(box, "")
        yield SoccerNewsItem(
            headline=response.css("h1 ::text").extract_first(),
            subhead=subhead,
            author=response.css(".author-link ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".date ::attr(datetime)").extract_first(),
            source=self.name)
class MetroSpider(scrapy.Spider):
    """Spider for football news articles from Metro."""

    name = 'metro'
    allowed_domains = ['metro.co.uk']
    source = 'Metro'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing related/video/"more" boilerplate.

        Fix over the original: calling .strip() directly on
        extract_first() raised AttributeError when no ".author" node
        exists; it now defaults to an empty string.
        """
        url = response.url
        datetime = response.css(".post-date ::text").extract_first()
        headline = response.css(".post-title ::text").extract_first()
        subhead = ""
        author = (response.css(".author ::text").extract_first() or "").strip()
        body_text = " ".join(response.css('.article-body p ::text').extract())
        rel_lst = response.css('.zopo-title span ::text').extract()
        vid_text = " ".join(response.css("p.vjs-no-js ::text").extract())
        mor_text = " ".join(response.css(".mor-link ::text").extract())
        # Related-article titles arrive as groups of three text nodes;
        # rebuild each group the way it appears in the body, then remove it.
        for i in range(0, len(rel_lst), 3):
            i_text = " ".join(rel_lst[i:i + 3])
            body_text = body_text.replace(i_text, "")
        body_text = body_text.replace(vid_text, "")
        body_text = body_text.replace(mor_text, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class DNSpider(scrapy.Spider):
    """Spider for football news articles from FC Barcelona Noticias."""

    name = 'fcbn'
    allowed_domains = ['fcbn.pt']
    source = 'FC Barcelona Noticias'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css(".noti_title ::text").extract_first(),
            # Subtitle may be split across nodes; join them all.
            subhead=" ".join(css(".noti_subtitle ::text").extract()),
            author=css(".noti_author ::text").extract_first(),
            body_text=" ".join(css(".noti_body ::text").extract()),
            url=response.url,
            datetime=css(".noti_publish ::text").extract_first(),
            source=self.name)
class MarcaSpider(scrapy.Spider):
    """Spider for football news articles from Marca."""

    name = 'marca'
    allowed_domains = ['marca.com']
    source = 'Marca'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing subtitle items and script text."""
        body = " ".join(response.css(".row p ::text").extract())
        # Subtitle bullet items and inline scripts leak into the broad
        # ".row p" selector; subtract them from the body.
        subtitles = " ".join(response.css(".subtitle-items p ::text").extract())
        body = body.replace(subtitles, "")
        scripts = " ".join(response.css(".row p script ::text").extract())
        body = body.replace(scripts, "")
        yield SoccerNewsItem(
            headline=response.css(".titles h1 ::text").extract_first(),
            subhead=" ".join(
                response.css(".section-title-group ::text").extract()),
            author=response.css(".author-name ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages (embedded tweet; citation block):
        # http://www.marca.com/futbol/barcelona/2018/07/09/5b3fa0b1e5fdeaec3e8b4657.html
        # http://co.marca.com/claro/futbol/colombianos-mundo/2018/07/29/5b5d95aaca474113278b45f9.html
class DiarioAsSpider(scrapy.Spider):
    """Spider for football news articles from Diario AS."""

    name = 'diarioas'
    allowed_domains = ['as.com']
    source = 'Diario AS'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing image captions and summary boxes."""
        # Two byline layouts exist; fall back to the second when the
        # first selector matches nothing.
        author = (response.css("a.art-author ::text").extract_first()
                  or response.css(".info-author ::text").extract_first())
        body = " ".join(response.css(".int-articulo p ::text").extract())
        captions = " ".join(response.css(".txt-img-art ::text").extract())
        summaries = " ".join(response.css(".sumario-ficha p ::text").extract())
        body = body.replace(captions, "")
        body = body.replace(summaries, "")
        yield SoccerNewsItem(
            headline=response.css("h1.titular-articulo ::text").extract_first(),
            subhead=response.css("h2.cont-entradilla-art ::text").extract_first(),
            author=author,
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".art-info time::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages (embedded tweet; multiple images):
        # https://as.com/futbol/2018/06/25/primera/1529918347_010865.html
        # https://en.as.com/en/2018/07/11/football/1531325690_379444.html
class FichajesNetSpider(scrapy.Spider):
    """Spider for football news articles from Fichajes.net."""

    name = 'fichajes.net'
    allowed_domains = ['fichajes.net']
    source = 'Fichajes.net'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("#node-story-full-group-header h2 ::text").extract_first(),
            # Author may be split across nodes; join them all.
            author=" ".join(css(".author ::text").extract()),
            body_text=" ".join(css(".content-body p ::text").extract()),
            url=response.url,
            datetime=css(".md-info-date ::attr(datetime)").extract_first(),
            source=self.name)
class TeamtalkSpider(scrapy.Spider):
    """Spider for football news articles from Teamtalk."""

    name = 'teamtalk'
    allowed_domains = ['teamtalk.com']
    source = 'Teamtalk'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing italic and script text.

        Fix over the original: when no <strong> element exists in the
        body, extract_first() returns None and str.replace(None, "")
        raised TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(".article__header p ::text").extract_first()
        headline = response.css(".article__header h1 ::text").extract_first()
        subhead = response.css(".article__body strong ::text").extract_first()
        author = ""
        body_text = " ".join(response.css(".article__body p ::text").extract())
        # Italic asides and inline scripts leak into the paragraph selector.
        i_text = " ".join(response.css(".article__body p i ::text").extract())
        script_text = " ".join(
            response.css(".article__body p script ::text").extract())
        if subhead:
            # The bolded lead is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        body_text = body_text.replace(script_text, "")
        body_text = body_text.replace(i_text, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
        # Known leftover boilerplate to revisit ("More from Planet Sport",
        # "Paper talk"):
        # https://www.teamtalk.com/news/bruce-admits-it-will-be-difficult-to-keep-grealish-away-from-tottenham
        # https://www.teamtalk.com/news/paper-talk-liverpool-make-sensational-e180m-bid-for-man-utd-target-arsenal-chase-celta-vigo-man
class MaisfutebolSpider(scrapy.Spider):
    """Spider for football news articles from Maisfutebol."""

    name = 'maisfutebol'
    allowed_domains = ['maisfutebol.iol.pt']
    source = 'Maisfutebol'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("h2 ::text").extract_first(),
            author=css(".autores a ::text").extract_first(),
            body_text=" ".join(css(".articleBody p ::text").extract()),
            url=response.url,
            datetime=css(".date ::text").extract_first(),
            source=self.name)