class ExpressSpider(scrapy.Spider):
    """Spider for football news articles from the Daily Express."""

    name = 'express'
    allowed_domains = ['express.co.uk']
    source = 'Daily Express'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("h3 ::text").extract_first(),
            author=css(".author span ::text").extract_first(),
            body_text=" ".join(css(".text-description p ::text").extract()),
            url=response.url,
            datetime=css("time ::attr(datetime)").extract_first(),
            source=self.name)
class ElDesmarqueSpider(scrapy.Spider):
    """Spider for football news articles from El Desmarque."""

    name = 'eldesmarque'
    allowed_domains = ['eldesmarque.com']
    source = 'El Desmarque'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, cutting inline <script> text from the body."""
        paragraphs = " ".join(
            response.css("#cuerpo-noticia p ::text").extract())
        # ::text also picks up script contents inside paragraphs; remove them.
        scripts = " ".join(
            response.css("#cuerpo-noticia p script ::text").extract())
        yield SoccerNewsItem(
            headline=response.css("h1.titulo ::text").extract_first(),
            subhead="",
            author=response.css(".autor span ::text").extract_first(),
            body_text=paragraphs.replace(scripts, ""),
            url=response.url,
            datetime=response.css(".fecha ::text").extract_first(),
            source=self.name)
class TheSunSpider(scrapy.Spider):
    """Spider for football news articles from The Sun."""

    name = 'thesun'
    allowed_domains = ['thesun.co.uk', 'thesun.ie']
    source = 'The Sun'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css(".article__content p ::text").extract()
        ]
        yield SoccerNewsItem(
            headline=response.css(".article__headline ::text").extract_first(),
            subhead=response.css(".article__subdeck p ::text").extract_first(),
            author=response.css(
                "span.article__author-name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css(
                ".article__published span ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://www.thesun.co.uk/sport/football/6976195/chelsea-complete-signing-mateo-kovacic-real-madrid-loan/
class EstadioDeportivoSpider(scrapy.Spider):
    """Spider for football news articles from Estadio Deportivo."""

    name = 'estadiodeportivo'
    allowed_domains = ['estadiodeportivo.com']
    source = 'Estadio Deportivo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, dropping embedded tweet text."""
        body = " ".join(response.css(".cuerpo_noticia p ::text").extract())
        # Embedded tweets are rendered inside the body; strip them out.
        tweet = " ".join(response.css(".twitter-tweet p ::text").extract())
        yield SoccerNewsItem(
            headline=response.css(".noticia h1 ::text").extract_first(),
            subhead="",
            author=response.css("span.autor_sup ::text").extract_first(),
            body_text=body.replace(tweet, ""),
            url=response.url,
            datetime=response.css("span.fecha_hora ::text").extract_first(),
            source=self.name)
class Futebol365Spider(scrapy.Spider):
    """Spider for football news articles from Futebol 365."""

    name = 'futebol365'
    allowed_domains = ['futebol365.pt']
    source = 'Futebol 365'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fixes over the original:
        - AttributeError when the ".data" byline node is missing
          (extract_first() returns None, which has no .split()).
        - IndexError when the byline contains no comma.
        - TypeError when the subhead is absent (None) and was passed
          straight to str.replace().
        """
        url = response.url
        headline = response.css(".titulo ::text").extract_first()
        subhead = response.css(".texto span.negrito ::text").extract_first()
        # Byline looks like "por <author>, <datetime>"; tolerate a missing
        # or malformed node instead of crashing.
        raw_byline = response.css(".data ::text").extract_first() or ""
        date_author = raw_byline.split(",")
        author = date_author[0].replace("por", "")
        datetime = date_author[1] if len(date_author) > 1 else ""
        body_text = " ".join(response.css(".texto p ::text").extract())
        if subhead:
            # The subhead is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text,
                                url=url, datetime=datetime,
                                source=self.name)
        yield notice
class ESPNSpider(scrapy.Spider):
    """Spider for football news articles from ESPN."""

    name = 'espn'
    allowed_domains = ['espn.com', 'espn.com.br', 'espn.co.uk']
    source = 'ESPN'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning related-news and promo text."""
        body = " ".join(response.css(".article-body p ::text").extract())
        # Related-article and inline promotional blocks leak into the body
        # selector; remove their text separately.
        related = " ".join(
            response.css(".article-body .editorial p ::text").extract())
        promo = " ".join(
            response.css(".article-body .inline-track p ::text").extract())
        body = body.replace(related, "")
        body = body.replace(promo, "")
        yield SoccerNewsItem(
            headline=response.css(".article-header h1 ::text").extract_first(),
            subhead="",
            author=response.css(".author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".timestamp ::attr(data-date)").extract_first(),
            source=self.name)
class DeporSpider(scrapy.Spider):
    """Spider for football news articles from Depor."""

    name = 'depor'
    allowed_domains = ['depor.com']
    source = 'Depor'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing media-caption text."""
        body = " ".join(
            response.css(".news-text-content p ::text").extract())
        # Media captions sit inside the body container; strip them out.
        captions = " ".join(
            response.css(".news-text-content .news-media-description p ::text"
                         ).extract())
        yield SoccerNewsItem(
            headline=response.css(".news-title ::text").extract_first(),
            subhead=response.css(".news-summary ::text").extract_first(),
            author=response.css(".author-name a ::text").extract_first(),
            body_text=body.replace(captions, ""),
            url=response.url,
            datetime=response.css(".news-date ::attr(datetime)").extract_first(),
            source=self.name)
class NoticiasAoMinutoSpider(scrapy.Spider):
    """Spider for football news articles from Notícias ao Minuto."""

    name = 'noticiasaominuto'
    allowed_domains = ['noticiasaominuto.com']
    source = 'Notícias ao Minuto'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing inline link ("Pub") text."""
        body = " ".join(response.css(".news-main-text p ::text").extract())
        # Anchor text inside body paragraphs is promotional; remove it.
        links = " ".join(
            response.css(".news-main-text p a ::text ").extract())
        yield SoccerNewsItem(
            headline=response.css(".news-headline ::text").extract_first(),
            subhead=response.css(".news-subheadline ::text").extract_first(),
            author=response.css(".author-hover ::text").extract_first(),
            body_text=body.replace(links, ""),
            url=response.url,
            datetime=response.css(".news-info-time ::text").extract_first(),
            source=self.name)
        # Known quirks (promo links, leading letter):
        # https://www.noticiasaominuto.com/desporto/1068563/fc-porto-prepara-nova-proposta-por-ntcham
class ABolaSpider(scrapy.Spider):
    """Spider for football news articles from A Bola."""

    name = 'abola'
    allowed_domains = ['abola.pt']
    source = 'A Bola'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1.titulo ::text").extract_first(),
            subhead="",
            author=css(".assinatura span ::text").extract_first(),
            body_text=" ".join(css(".corpo-noticia ::text").extract()),
            url=response.url,
            datetime=css(".data-hora span ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://www.abola.pt/Clubes/Noticias/Ver/739766/42
class FoxSportsSpider(scrapy.Spider):
    """Spider for football news articles from Fox Sports (Brazil)."""

    name = 'foxsports'
    allowed_domains = ['foxsports.com.br']
    source = 'Fox Sports'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem; Portuguese-language pages only."""
        if 'foxsports.com.br' not in response.url:
            return
        body = " ".join(response.css(".embed p ::text").extract())
        # Recurring promo sentence and everything after "Saiba mais: "
        # are boilerplate, not article text.
        body = body.replace(
            "Veja as últimas do Mercado da Bola e quem pode chegar ao seu time",
            "")
        body = body.split("Saiba mais: ")[0]
        yield SoccerNewsItem(
            headline=response.css("h1 ::text").extract_first(),
            subhead=response.css("h2 ::text").extract_first(),
            author=response.css(".author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".publish-date ::text").extract_first(),
            source=self.name)
        # Example page with related text:
        # https://www.foxsports.com.br/news/370761-barcelona-anuncia-venda-de-aleix-vidal-para-o-sevilla
class ElPaisSpider(scrapy.Spider):
    """Spider for football news articles from El País."""

    name = 'elpais'
    allowed_domains = ['elpais.com']
    source = 'El País'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css(".articulo-titulo ::text").extract_first(),
            subhead=css(".articulo-subtitulo ::text").extract_first(),
            # Author names may be split across nodes; join them all.
            author=" ".join(css(".autor-nombre ::text").extract()),
            body_text=" ".join(css(".articulo-cuerpo p ::text").extract()),
            url=response.url,
            datetime=css(".articulo-actualizado ::attr(datetime)").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://elpais.com/deportes/2018/08/24/actualidad/1535133090_339849.html
class MundoDeportivoSpider(scrapy.Spider):
    """Spider for football news articles from Mundo Deportivo."""

    name = 'mundodeportivo'
    allowed_domains = ['mundodeportivo.com']
    source = 'Mundo Deportivo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing related-news epigraphs."""
        body = " ".join(
            response.css(".story-leaf-txt-p p ::text").extract())
        related = " ".join(
            response.css('p.story-leaf-relatednews-epigraph ::text').extract())
        yield SoccerNewsItem(
            headline=response.css(".story-leaf-title ::text").extract_first(),
            subhead=response.css(".story-leaf-subtitle ::text").extract_first(),
            author=response.css(".story-leaf-author-link ::text").extract_first(),
            body_text=body.replace(related, ""),
            url=response.url,
            datetime=response.css(
                ".story-leaf-datetime ::attr(datetime)").extract_first(),
            source=self.name)
class BleacherReportSpider(scrapy.Spider):
    """Spider for football news articles from Bleacher Report."""

    name = 'bleacherreport'
    allowed_domains = ['bleacherreport.com']
    source = 'Bleacher Report'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css(".contentStream p ::text").extract()
        ]
        yield SoccerNewsItem(
            headline=response.css("header h1 ::text").extract_first(),
            subhead="",
            author=response.css(".authorInfo .name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css("header .date ::text").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # https://bleacherreport.com/articles/2788607-arsenal-transfer-news-hector-herrera-dismisses-porto-exit-rumours
class FichajesComSpider(scrapy.Spider):
    """Spider for football news articles from Fichajes.com."""

    name = 'fichajes.com'
    allowed_domains = ['fichajes.com']
    source = 'Fichajes.com'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, dropping embedded tweet text."""
        body = " ".join(response.css(".article-text ::text").extract())
        tweet = " ".join(response.css(".twitter-tweet p ::text").extract())
        yield SoccerNewsItem(
            headline=response.css(".article h1 ::text").extract_first(),
            subhead=response.css(".article h2 ::text").extract_first(),
            author=response.css(".name ::text").extract_first(),
            body_text=body.replace(tweet, ""),
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages with embedded tweets:
        # http://www.fichajes.com/breves/chelsea-las-primeras-palabras-de-mateo-kovacic_138960
        # http://www.fichajes.com/breves/el-atletico-de-madrid-dice-adios-a-andre-moreira_138630
class BesoccerSpider(scrapy.Spider):
    """Spider for football news articles from Be Soccer."""

    name = 'besoccer'
    allowed_domains = ['besoccer.com']
    source = 'Be Soccer'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when "p.teaser" is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            "time span.ni-date::attr(content)").extract_first()
        headline = response.css("h1.ni-title ::text").extract_first()
        subhead = response.css("p.teaser ::text").extract_first()
        author = response.css("a.ni-author ::text").extract_first()
        body_text = " ".join(response.css(".ni-text-body p ::text").extract())
        if subhead:
            # The teaser is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class FootballEspanaSpider(scrapy.Spider):
    """Spider for football news articles from Football Espana."""

    name = 'footballespana'
    allowed_domains = ['football-espana.net']
    source = 'Football Espana'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing a recurring betting promo line."""
        body = " ".join(response.css(".content p ::text").extract())
        body = body.replace(
            "See the latest La Liga predictions and betting tips with Eurotips.co.uk",
            "")
        yield SoccerNewsItem(
            headline=response.css(".title ::text").extract_first(),
            subhead="",
            author=response.css(".submitted ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".date ::text").extract_first(),
            source=self.name)
class TelegraphSpider(scrapy.Spider):
    """Spider for football news articles from The Telegraph."""

    name = 'telegraph'
    allowed_domains = ['telegraph.co.uk']
    source = 'The Telegraph'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the lead-asset caption is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            ".component-content time::attr(datetime)").extract_first()
        headline = response.css("h1.headline__heading ::text").extract_first()
        subhead = response.css(".lead-asset-caption ::text").extract_first()
        author = response.css(".byline__author-name a ::text").extract_first()
        # Whitespace-trim every paragraph fragment before joining.
        bt_lst = [s.strip()
                  for s in response.css(".articleBodyText p ::text").extract()]
        body_text = " ".join(bt_lst)
        if subhead:
            # The caption is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class GoalSpider(scrapy.Spider):
    """Spider for football news articles from Goal."""

    name = 'goal'
    allowed_domains = ['goal.com']
    source = 'Goal'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning inline related-article widgets."""
        body = " ".join(response.css(".body p ::text").extract())
        related = " ".join(
            response.css(".widget-inline-related-articles ::text ").extract())
        yield SoccerNewsItem(
            headline=response.css(".article-headline ::text").extract_first(),
            subhead=response.css(".teaser ::text").extract_first(),
            author=response.css(".name ::text").extract_first(),
            body_text=body.replace(related, ""),
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages with embedded tweets:
        # http://www.goal.com/br/not%C3%ADcias/mateo-kovacic-ja-posa-com-a-camisa-do-chelsea-apos-ser/betcue5ge4391gyhxz5hxmkq0
        # http://www.goal.com/br/not%C3%ADcias/arsenal-prepara-oferta-irrecusavel-a-lucas-vazquez-do-real/ntw6y5r9ygyr1qhvkyk4pw7dp
class BBCSpider(scrapy.Spider):
    """Spider for football news articles from the BBC."""

    name = 'bbc'
    allowed_domains = ['bbc.com']
    source = 'BBC'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the introduction element is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(".abbr-on ::text").extract_first()
        headline = response.css(".story-headline ::text").extract_first()
        subhead = response.css(
            ".sp-story-body__introduction ::text").extract_first()
        author = ""
        body_text = " ".join(response.css(".story-body p ::text").extract())
        # Recurring BBC boilerplate sentences are not article text.
        body_text = body_text.replace(
            "Media playback is not supported on this device", "")
        body_text = body_text.replace(
            " Find all the latest football transfers on our dedicated page.",
            "")
        if subhead:
            # The introduction is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class TalksportSpider(scrapy.Spider):
    """Spider for football news articles from Talksport."""

    name = 'talksport'
    allowed_domains = ['talksport.com']
    source = 'Talksport'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Whitespace-trim every paragraph fragment before joining.
        fragments = [
            frag.strip()
            for frag in response.css('.article__content p ::text').extract()
        ]
        yield SoccerNewsItem(
            headline=response.css(".article__headline ::text").extract_first(),
            subhead=response.css(".article__subdeck p ::text").extract_first(),
            author=response.css(
                "span.article__author-name ::text").extract_first(),
            body_text=" ".join(fragments),
            url=response.url,
            datetime=response.css(
                ".article__published span ::text").extract_first(),
            source=self.name)
class HitcSpider(scrapy.Spider):
    """Spider for football news articles from HITC."""

    name = 'hitc'
    allowed_domains = ['hitc.com']
    source = 'HITC'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        # Summary may be split across nodes; join them all.
        summary = " ".join(response.css(".post-summary ::text").extract())
        body = " ".join(response.css(".post-content p ::text").extract())
        # The summary is repeated inside the body; drop the duplicate.
        body = body.replace(summary, "")
        yield SoccerNewsItem(
            headline=response.css("header h1 ::text").extract_first(),
            subhead=summary,
            author=response.css(".post-author ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".insidebar time::attr(datetime)").extract_first(),
            source=self.name)
        # Example page with embedded tweet:
        # http://www.hitc.com/en-gb/2018/07/07/do-weekend-wouldnt-pay-5p-west-ham-fans-react-to-haris-seferovic/
class OJogoSpider(scrapy.Spider):
    """Spider for football news articles from O Jogo."""

    name = 'ojogo'
    allowed_domains = ['ojogo.pt']
    source = 'O Jogo'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page.

        Fix over the original: when the intro element is absent,
        extract_first() returns None and str.replace(None, "") raised
        TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(
            ".t-a-info-1 time ::attr(datetime)").extract_first()
        headline = response.css(".t-i h1 ::text").extract_first()
        subhead = response.css(".t-a-c-intro-1 ::text").extract_first()
        author = response.css(".t-a-info-author ::text").extract_first()
        # Whitespace-trim every paragraph fragment before joining.
        bt_lst = [s.strip()
                  for s in response.css(".t-a-c-wrap p ::text").extract()]
        body_text = " ".join(bt_lst)
        if subhead:
            # The intro is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class SportSpider(scrapy.Spider):
    """Spider for football news articles from Sport."""

    name = 'sport'
    allowed_domains = ['sport.es', 'sport-english.com']
    source = 'Sport'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, pruning related-news and side-box text."""
        # Subhead fragments are trimmed individually, then joined.
        subhead = " ".join(
            part.strip() for part in response.css("h2 ::text").extract())
        body = " ".join(response.css('.editor p ::text').extract())
        related = response.css('.relations p ::text').extract_first()
        if related:
            body = body.replace(related, "")
        box = " ".join(response.css('.box-left-55 p ::text').extract())
        body = body.replace(box, "")
        yield SoccerNewsItem(
            headline=response.css("h1 ::text").extract_first(),
            subhead=subhead,
            author=response.css(".author-link ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css(".date ::attr(datetime)").extract_first(),
            source=self.name)
class MetroSpider(scrapy.Spider):
    """Spider for football news articles from Metro."""

    name = 'metro'
    allowed_domains = ['metro.co.uk']
    source = 'Metro'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing related/video/"more" boilerplate.

        Fix over the original: calling .strip() directly on
        extract_first() raised AttributeError when no ".author" node
        exists; it now defaults to an empty string.
        """
        url = response.url
        datetime = response.css(".post-date ::text").extract_first()
        headline = response.css(".post-title ::text").extract_first()
        subhead = ""
        author = (response.css(".author ::text").extract_first() or "").strip()
        body_text = " ".join(response.css('.article-body p ::text').extract())
        rel_lst = response.css('.zopo-title span ::text').extract()
        vid_text = " ".join(response.css("p.vjs-no-js ::text").extract())
        mor_text = " ".join(response.css(".mor-link ::text").extract())
        # Related-article titles arrive as groups of three text nodes;
        # rebuild each group the way it appears in the body, then remove it.
        for i in range(0, len(rel_lst), 3):
            i_text = " ".join(rel_lst[i:i + 3])
            body_text = body_text.replace(i_text, "")
        body_text = body_text.replace(vid_text, "")
        body_text = body_text.replace(mor_text, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
class DNSpider(scrapy.Spider):
    """Spider for football news articles from FC Barcelona Noticias."""

    name = 'fcbn'
    allowed_domains = ['fcbn.pt']
    source = 'FC Barcelona Noticias'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css(".noti_title ::text").extract_first(),
            # Subtitle may be split across nodes; join them all.
            subhead=" ".join(css(".noti_subtitle ::text").extract()),
            author=css(".noti_author ::text").extract_first(),
            body_text=" ".join(css(".noti_body ::text").extract()),
            url=response.url,
            datetime=css(".noti_publish ::text").extract_first(),
            source=self.name)
class MarcaSpider(scrapy.Spider):
    """Spider for football news articles from Marca."""

    name = 'marca'
    allowed_domains = ['marca.com']
    source = 'Marca'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing subtitle items and script text."""
        body = " ".join(response.css(".row p ::text").extract())
        # Subtitle bullet items and inline scripts leak into the broad
        # ".row p" selector; subtract them from the body.
        subtitles = " ".join(response.css(".subtitle-items p ::text").extract())
        body = body.replace(subtitles, "")
        scripts = " ".join(response.css(".row p script ::text").extract())
        body = body.replace(scripts, "")
        yield SoccerNewsItem(
            headline=response.css(".titles h1 ::text").extract_first(),
            subhead=" ".join(
                response.css(".section-title-group ::text").extract()),
            author=response.css(".author-name ::text").extract_first(),
            body_text=body,
            url=response.url,
            datetime=response.css("time ::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages (embedded tweet; citation block):
        # http://www.marca.com/futbol/barcelona/2018/07/09/5b3fa0b1e5fdeaec3e8b4657.html
        # http://co.marca.com/claro/futbol/colombianos-mundo/2018/07/29/5b5d95aaca474113278b45f9.html
class DiarioAsSpider(scrapy.Spider):
    """Spider for football news articles from Diario AS."""

    name = 'diarioas'
    allowed_domains = ['as.com']
    source = 'Diario AS'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing image captions and summary boxes."""
        # Two byline layouts exist; fall back to the second when the
        # first selector matches nothing.
        author = (response.css("a.art-author ::text").extract_first()
                  or response.css(".info-author ::text").extract_first())
        body = " ".join(response.css(".int-articulo p ::text").extract())
        captions = " ".join(response.css(".txt-img-art ::text").extract())
        summaries = " ".join(response.css(".sumario-ficha p ::text").extract())
        body = body.replace(captions, "")
        body = body.replace(summaries, "")
        yield SoccerNewsItem(
            headline=response.css("h1.titular-articulo ::text").extract_first(),
            subhead=response.css("h2.cont-entradilla-art ::text").extract_first(),
            author=author,
            body_text=body,
            url=response.url,
            datetime=response.css(
                ".art-info time::attr(datetime)").extract_first(),
            source=self.name)
        # Example pages (embedded tweet; multiple images):
        # https://as.com/futbol/2018/06/25/primera/1529918347_010865.html
        # https://en.as.com/en/2018/07/11/football/1531325690_379444.html
class FichajesNetSpider(scrapy.Spider):
    """Spider for football news articles from Fichajes.net."""

    name = 'fichajes.net'
    allowed_domains = ['fichajes.net']
    source = 'Fichajes.net'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("#node-story-full-group-header h2 ::text").extract_first(),
            # Author may be split across nodes; join them all.
            author=" ".join(css(".author ::text").extract()),
            body_text=" ".join(css(".content-body p ::text").extract()),
            url=response.url,
            datetime=css(".md-info-date ::attr(datetime)").extract_first(),
            source=self.name)
class TeamtalkSpider(scrapy.Spider):
    """Spider for football news articles from Teamtalk."""

    name = 'teamtalk'
    allowed_domains = ['teamtalk.com']
    source = 'Teamtalk'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem, removing italic and script text.

        Fix over the original: when no <strong> element exists in the
        body, extract_first() returns None and str.replace(None, "")
        raised TypeError; the replace is now guarded.
        """
        url = response.url
        datetime = response.css(".article__header p ::text").extract_first()
        headline = response.css(".article__header h1 ::text").extract_first()
        subhead = response.css(".article__body strong ::text").extract_first()
        author = ""
        body_text = " ".join(response.css(".article__body p ::text").extract())
        # Italic asides and inline scripts leak into the paragraph selector.
        i_text = " ".join(response.css(".article__body p i ::text").extract())
        script_text = " ".join(
            response.css(".article__body p script ::text").extract())
        if subhead:
            # The bolded lead is repeated inside the body; drop the duplicate.
            body_text = body_text.replace(subhead, "")
        body_text = body_text.replace(script_text, "")
        body_text = body_text.replace(i_text, "")
        notice = SoccerNewsItem(headline=headline, subhead=subhead,
                                author=author, body_text=body_text, url=url,
                                datetime=datetime, source=self.name)
        yield notice
        # Known leftover boilerplate to revisit ("More from Planet Sport",
        # "Paper talk"):
        # https://www.teamtalk.com/news/bruce-admits-it-will-be-difficult-to-keep-grealish-away-from-tottenham
        # https://www.teamtalk.com/news/paper-talk-liverpool-make-sensational-e180m-bid-for-man-utd-target-arsenal-chase-celta-vigo-man
class MaisfutebolSpider(scrapy.Spider):
    """Spider for football news articles from Maisfutebol."""

    name = 'maisfutebol'
    allowed_domains = ['maisfutebol.iol.pt']
    source = 'Maisfutebol'
    start_urls = url_selector.get_urls(source)

    def parse(self, response):
        """Build one SoccerNewsItem from a single article page."""
        css = response.css
        yield SoccerNewsItem(
            headline=css("h1 ::text").extract_first(),
            subhead=css("h2 ::text").extract_first(),
            author=css(".autores a ::text").extract_first(),
            body_text=" ".join(css(".articleBody p ::text").extract()),
            url=response.url,
            datetime=css(".date ::text").extract_first(),
            source=self.name)