Example #1
    def parse_periodista(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1[@itemprop="headline"]/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="per-item-page-part per-article-body"]//p/text()|//div[@class="per-item-page-part per-article-body"]//strong/text()|//div[@class="per-item-page-part per-article-body"]//p/*/text()').getall()
            #join the fragments, then collapse runs of spaces through a
            #temporary "space" marker and strip non-breaking spaces
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            url = response.url

            date = response.xpath('//div[@class="col-md-4 per-color-grey per-font-size-md per-padding-top-20"]/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have videos
            flag = re.search(r"binteo|foto", url)

            #check that we are in an article and that it doesn't have videos
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['SPORT'],
                    "website": PERIODISTA_VARS['WEBSITE'],
                    "title": re.sub(r'\t|\n|\r', "", title),
                    "article_date": final_date,
                    "author": PERIODISTA_VARS['WEBSITE'],
                    "article_body": re.sub(r'\s\s\s', "", clear_characters),
                    "url": url,
                }
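A note on the whitespace dance repeated in these examples: character-joining the fragments and round-tripping through a literal "space" marker does collapse runs of spaces, but it also corrupts any article text that happens to contain the substring "space". A minimal sketch of a marker-free alternative, assuming a hypothetical helper (normalize_whitespace is not part of the original spiders):

    def normalize_whitespace(fragments):
        #join the extracted fragments, then collapse every whitespace run
        #(spaces, \xa0, \n, \t) to a single space; str.split() with no
        #argument splits on arbitrary whitespace, so no marker is needed
        return " ".join(" ".join(fragments).split())

Under that assumption, clear_characters = normalize_whitespace(text) would stand in for the five-line join/re.sub block above, with the difference that newlines and tabs are collapsed here rather than stripped later.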
Example #2
    def parse_cnn(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1[@class="story-title"]/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//p/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = re.sub(r'\n|\t', "", response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get())
            final_date = formatdate(date)

            url = response.url
            yield {
                "subtopic": GENERAL_CATEGORIES['SPORT'],
                "website": CNN_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="story-author"]/text()').get()),
                "article_body": re.sub(r'\n', "", clear_characters),
                "url": url,
            }
Example #3
    def parse_gazzetta(self, response):
        #check if we are on an article's URL
        title = response.xpath('//div[@class="field-item even"]/h1/text()').get()
        if title is not None:
            #extract the subtopic by splitting the URL on '/'
            #and keeping the fourth element of the resulting list
            url = response.url
            subtopic = url.split('/')[3]

            temp = response.xpath('//span[@itemprop="name"]/text()').get()
            #check if there is an author; a value made up only of
            #non-word characters counts as no author
            if isinstance(temp, str):
                author = re.fullmatch(r'\W+', temp)
                if author is None:
                    author = temp
                else:
                    author = GAZZEETTA_VARS['WEBSITE']
            else:
                author = response.xpath('//h3[@class="blogger-social"]/a/text()').get()

            date = response.xpath('//div[@class="article_date"]/text()').get()
            final_date = formatdate(date)

            yield {
                "subtopic": subtopic,
                "website": GAZZEETTA_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": author,
                "article_body": response.xpath('//div[@itemprop="articleBody"]//p/text()|//p/a/text()|//p/strong/text()').getall(),
                "url": url
            }
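Indexing url.split('/')[3] assumes the canonical scheme://host/section/... shape and silently shifts if the scheme is missing or an extra slash appears. A hedged sketch of a more explicit variant using the standard library (first_path_segment is a hypothetical helper, not part of the original spider):

    from urllib.parse import urlsplit

    def first_path_segment(url):
        #urlsplit separates scheme and host from the path, so the first
        #non-empty path component is the section name no matter how the
        #URL is slashed
        segments = [s for s in urlsplit(url).path.split('/') if s]
        return segments[0] if segments else None

With that helper, subtopic = first_path_segment(url) expresses the same intent as url.split('/')[3]; the same applies to the split('/')[3] and split('/')[5] lookups in the other examples.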
Example #4
    def parse_iefimerida(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1/span/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="field--name-body on-container"]//p/text()|//div[@class="field--name-body on-container"]/strong/text()|//div[@class="field--name-body on-container"]//p/*/text()|//div[@class="field--name-body on-container"]//li/text()|//div[@class="field--name-body on-container"]//h2/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            date = response.xpath('//span[@class="created"]/text()').get()
            final_date = formatdate(date)

            #check that we are in an article and that it doesn't have any images
            if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['FOOD'],
                    "website": IEFIMERIDA_VARS['AUTHOR'],
                    "title": title,
                    "article_date": final_date,
                    "author": IEFIMERIDA_VARS['AUTHOR'],
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }
Example #5
    def parse_thetoc(self, response):
        #check if we are on an article's URL
        title = response.xpath('//div[@class="article-title"]//h1/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="article-content articleText"]//p/text()|//div[@class="article-content articleText"]//strong/text()|//div[@class="article-content articleText"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//span[@class="article-date"]/text()').get()
            final_date = THETOC_VARS['full_date'] + formatdate(date)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            #check that we are in an article and that it doesn't have images
            #(compare the cleaned text's length, not the fragment count)
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['SPORT'],
                    "website": THETOC_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="author-social"]//h5/a/span[2]/text()').get()),
                    "article_body": re.sub(r'\n|\t', "", clear_characters),
                    "url": url,
                }
Example #6
    def parse_item(self, response):
        #get the article's text
        text = response.xpath('//div[@class="story-content"]//p/text()|//div[@class="story-content"]//strong/text()|//div[@class="story-content"]//a/text()').getall()
        title = response.xpath('//h1[@class="story-title"]/text()').get()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("  ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)

        url = response.url
        article_type = url.split('/')[5]
        contains_photos = re.search('Photos', final_text)

        date = response.xpath('//div[@class="story-date story-credits icon icon-time"]/text()').get()
        final_date = formatdate(date)

        #yield only if this is an article and not a photo gallery
        if article_type == CNN_VARS['ARTICLE_TYPE'] and contains_photos is None:
            yield {
                "subtopic": GENERAL_CATEGORIES['STYLE'],
                "website": CNN_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="story-author"]/text()').get()),
                "article_body": re.sub(r'\n|\t', "", final_text),
                "url": url,
            }
Example #7
    def parse_in(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1[@class="entry-title black-c"]/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="main-content pos-rel article-wrapper"]//p/text()|//div[@class="main-content pos-rel article-wrapper"]//strong/text()|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #check that we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['SPORT'],
                    "website": IN_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": response.xpath('//span[@class="vcard author"]//a/text()').get(),
                    "article_body": re.sub(r'\s\s\s', "", clear_characters),
                    "url": url,
                }
Example #8
    def parse_sport24(self, response):
        #check if we are on an article's URL
        title = response.xpath('//div[@class="storyContent"]/h1/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@itemprop="articleBody"]//p/text()|//div[@itemprop="articleBody"]//h3/text()|//div[@itemprop="articleBody"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)
            clear_escape = re.sub(r'\n|\t', "", clear_characters)

            date = response.xpath('//span[@class="byline_date"]/b/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_escape)
            url = response.url
            subtopic = url.split('/')[3]
            if flag is None:
                yield {
                    "subtopic": subtopic,
                    "website": SPORT24_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": response.xpath('//span[@class="byline_author"]/b/text()').get(),
                    "article_body": clear_escape,
                    "url": url
                }
Example #9
    def parse_reader_item(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="article-summary"]//p/text()|//div[@class="article-body"]//p/text()|//div[@class="article-body"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #fall back to the site name when no author is given
            author = response.xpath('//p[@class="article-author"]/a/text()').get()
            if author is not None:
                author = re.sub("\xa0", "", author)
            else:
                author = READER_VARS['AUTHOR']

            url = response.url
            yield {
                "subtopic": GENERAL_CATEGORIES['SPORT'],
                "website": READER_VARS['AUTHOR'],
                "title": re.sub(r'\n|\t', "", title),
                "article_date": final_date,
                "author": author,
                "article_body": re.sub(r'\n|\t', "", clear_characters),
                "url": url,
            }
Example #10
    def parse_newpost(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1[@class="article-title"]/text()').get()
        if title is not None:
            #get the article's text
            text = response.xpath('//div[@class="article-main clearfix"]//p/text()|//div[@class="article-main clearfix"]//li/text()|//div[@class="article-main clearfix"]//p/a/strong/text()|//div[@class="article-main clearfix"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            date = response.xpath('//small[@class="article-created-time"]/text()').get().split('/')[0]
            final_date = formatdate(date)

            #check that we are in an article and that it doesn't have any images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['FOOD'],
                    "website": NEWPOST_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": NEWPOST_VARS['WEBSITE'],
                    "article_body": re.sub(r'\s\s\s', "", clear_characters),
                    "url": url,
                }
Example #11
    def parse_thepressproject(self, response):
        #check if we are on an article's URL
        title = response.xpath(
            '//h1[@class="entry-title"]/text()|//h1[@class="entry-title"]/*/text()'
        ).get()
        if title is not None:
            #check if we are in the correct category
            sub = response.xpath(
                '//div[@class="article-categories"]/a/text()').get()
            if sub == PRESSPROJECT_VARS['CATEGORY_WORLD']:

                #check that this article isn't a video article
                video_article = response.xpath(
                    '//i[@class="title-icon video-icon fab fa-youtube"]').get()
                if video_article is None:

                    #fix title's format
                    list_to_string = " ".join(" ".join(title))
                    no_whites = re.sub(r'\t|\n', "", list_to_string)
                    markspaces = re.sub("       ", "space", no_whites)
                    uneeded_spaces = re.sub(" ", "", markspaces)
                    final_title = re.sub("space", " ", uneeded_spaces)
                    delete_front_space = re.sub("    ", "", final_title)
                    final_title = re.sub("   ", "", delete_front_space)

                    #get article's text
                    text = response.xpath(
                        '//div[@id="maintext"]//p/text()|//div[@id="maintext"]//strong/text()|//div[@id="maintext"]//p/*/text()'
                    ).getall()
                    list_to_string = " ".join(" ".join(text))
                    markspaces = re.sub("  ", "space", list_to_string)
                    uneeded_spaces = re.sub(" ", "", markspaces)
                    final_text = re.sub("space", " ", uneeded_spaces)
                    clear_characters = re.sub("\xa0", "", final_text)

                    #flag to see later on if we have tweets etc.
                    flag = re.search(r"@", clear_characters)
                    url = response.url
                    date = response.xpath(
                        '//div[@class="article-date"]/label[1]/text()').get()
                    final_date = formatdate(date)

                    #check if we are in an article and that it doesn't have images
                    if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                        yield {
                            "subtopic": GENERAL_CATEGORIES['WORLD'],
                            "website": PRESSPROJECT_VARS['AUTHOR'],
                            "title": final_title,
                            "article_date": final_date,
                            "author": PRESSPROJECT_VARS['AUTHOR'],
                            "article_body": re.sub(r'\s\s\s', "",
                                                   clear_characters),
                            "url": url,
                        }
Example #12
    def parse_popaganda(self, response):
        #check if we are on an article's URL
        global popaganda_counter
        title = response.xpath('//h1/text()').get()
        if title is not None and popaganda_counter < 300:
            #check if we are in the correct category
            category = response.xpath(
                '//div[@class="category"]/a/text()').get()
            if category == POPAGANDA_VARS['CATEGORY_CULTURE']:

                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                text = response.xpath(
                    '//div[@class="post-content newstrack-post-content"]//p/text()|//div[@class="post-content newstrack-post-content"]/p/strong/text()|//div[@class="post-content newstrack-post-content"]//h3/text()|//div[@class="post-content newstrack-post-content"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                author = response.xpath(
                    '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
                ).get()
                if author is None:
                    author = POPAGANDA_VARS['WEBSITE']

                date = response.xpath(
                    '//div[@class="date"]/text()|//div[@class="fullscreen-date"]/text()'
                ).get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                #(title is already known to be non-None here)
                if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    popaganda_counter += 1
                    yield {
                        "subtopic": POPAGANDA_VARS['CULTURE'],
                        "website": POPAGANDA_VARS['WEBSITE'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": POPAGANDA_VARS['WEBSITE'],
                        "article_body": re.sub(r'\s\s\s|\n', "",
                                               clear_characters),
                        "url": url,
                    }
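The module-level counters capped at 300 (popaganda_counter here, and the naftemporiki, lifo, protagon, thetoc, tovima and cnn counters in later examples) reimplement a stock Scrapy facility. A sketch, assuming the default CloseSpider extension is enabled: setting CLOSESPIDER_ITEMCOUNT in the spider's custom_settings stops the crawl once that many items have been scraped, though the cut-off is approximate because in-flight requests may still finish.

    import scrapy

    class PopagandaSpider(scrapy.Spider):
        #hypothetical spider class around parse_popaganda; only the
        #setting is the point here
        name = "popaganda"
        custom_settings = {
            "CLOSESPIDER_ITEMCOUNT": 300,
        }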
Example #13
    def parse_naftemporiki(self, response):
        global naftemporiki_counter
        #check if we are on an article's URL
        title = response.xpath('//h2[@id="sTitle"]/text()').get()
        if title is not None and naftemporiki_counter < 300:
            #check if we are in the correct category
            subtopic = response.xpath(
                '//span[@itemprop="articleSection"]/text()').get()
            if subtopic == NAFTEMPORIKI_VARS['CATEGORY_ENVIRONMENT']:
                #fix the title's format
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                #get the article's text
                text = response.xpath(
                    '//div[@class="entityMain article"]//p/text()|//div[@class="entityMain article"]/p/strong/text()|//div[@class="entityMain article"]//h3/text()|//div[@class="entityMain article"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                date = response.xpath('//div[@class="Date"]/text()').get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    naftemporiki_counter += 1
                    yield {
                        "subtopic": response.xpath('//div[@class="Breadcrumb"]/a[2]/text()').get(),
                        "website": NAFTEMPORIKI_VARS['AUTHOR'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": NAFTEMPORIKI_VARS['AUTHOR'],
                        "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                        "url": url,
                    }
Example #14
    def parse_efsyn(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1[1]/text()').get()
        if title is not None:
            #check if we are in the correct category
            subtopic = response.xpath('//article/a/@href').get()
            category = subtopic.split('/')[1]
            if category == EFSYN_VARS['CATEGORY_WORLD']:
                #fix title's format
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                #get article's text
                text = response.xpath(
                    '//div[@class="article__body js-resizable"]//p/text()|//div[@class="article__body js-resizable"]/p/strong/text()|//div[@class="article__body js-resizable"]//h3/text()|//div[@class="article__body js-resizable"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                #get author
                author = response.xpath(
                    '//div[@class="article__author"]//a/text()').get()
                if author is None:
                    author = response.xpath(
                        '//div[@class="article__author"]/span/text()').get()

                date = response.xpath('//time/text()').get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    yield {
                        "subtopic": EFSYN_VARS['WORLD'],
                        "website": EFSYN_VARS['WEBSITE'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": author,
                        "article_body": re.sub(r'\s\s\s|\n', "",
                                               clear_characters),
                        "url": url,
                    }
Example #15
    def parse_kathimerini(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h2[@class="item-title"]/text()').get()
        if title is not None:
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            text = response.xpath(
                '//div[@class="freetext"]//p/text()|//div[@class="freetext"]//strong/text()|//div[@class="freetext"]//h3/text()|//div[@class="freetext"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            author = response.xpath(
                '//span[@class="item-author"]/a/text()').get()
            if author == KATHIMERINI_VARS['CATEGORY_AUTHOR']:
                author = KATHIMERINI_VARS['AUTHOR']
            #check if we are in an article and that it doesn't have images
            if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": response.xpath('//span[@class="item-category"]/a/text()').get(),
                    "website": KATHIMERINI_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }
Example #16
    def parse_lifo(self, response):
        global lifo_counter
        #check if we are on an article's URL
        title = response.xpath(
            '//h1[@itemprop="headline"]/text()|//meta[@itemprop="headline"]/text()|//h1/*/text()'
        ).get()
        if title is not None and lifo_counter < 300:
            #fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #get the article's text
            text = response.xpath(
                '//div[@class="clearfix wide bodycontent"]//p/text()|//div[@class="clearfix wide bodycontent"]/p/strong/text()|//div[@class="clearfix wide bodycontent"]//h3/text()|//div[@class="clearfix wide bodycontent"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            author = response.xpath(
                '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
            ).get()
            if author is None:
                author = LIFO_VARS['AUTHOR']

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            #check if we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                lifo_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website": LIFO_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                    "url": url,
                }
Example #17
    def parse_topontiki(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1/text()').get()
        if title is not None:
            sub = response.xpath('//h2/a[1]/text()').get()
            #check if we are in the correct category
            if sub == TOPONTIKI_VARS['CATEGORY_CULTURE']:
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                text = response.xpath(
                    '//div[@class="field-item even"]//p/text()|//div[@class="field-item even"]//p/*/text()|//div[@class="field-item even"]//p//span/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = final_text.replace("\xa0", "")

                date = response.xpath('//span[@class="date"]/text()').get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    yield {
                        "subtopic": GENERAL_CATEGORIES['CULTURE'],
                        "website": TOPONTIKI_VARS['WEBSITE'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": response.xpath('//a[@class="author"]/text()').get(),
                        "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                        "url": url,
                    }
Example #18
    def parse_protagon(self, response):
        global protagon_counter
        #check if we are on an article's URL
        title = response.xpath('//h1[@class="entry-title"]/text()').get()
        if title is not None and protagon_counter < 300:
            #check if we are in the correct category
            sub = response.xpath('//span[@class="s_roumpr"]/a/text()').get()
            if sub == PROTAGON_VARS['CATEGORY_ECONOMICS']:

                text = response.xpath(
                    '//div[@class="left-single-column "]//p/text()|//div[@class="left-single-column "]//strong/text()|//div[@class="left-single-column "]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                #flag to see later on if we have tweets etc.
                flag = re.search(r"@", clear_characters)
                url = response.url

                #the unescaped '.' in the pattern matches the separator
                #(space or period) between the two parts of the byline
                author = re.findall(
                    r"(\w+).(\w+)",
                    response.xpath('//strong[@class="generalbold uppercase"]/a/text()').get())
                list_to_tuple = author[0]
                author = ' '.join(list_to_tuple)

                date = response.xpath(
                    '//span[@class="generalight uppercase"]/text()').get()
                final_date = formatdate(date)

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                    protagon_counter += 1
                    yield {
                        "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                        "website": PROTAGON_VARS['WEBSITE'],
                        "title": title,
                        "article_date": final_date,
                        "author": author,
                        "article_body": re.sub(r'\s\s\s', "",
                                               clear_characters),
                        "url": url,
                    }
Example #19
    def parse_cnn(self, response):
        global cnn_counter
        #check if we are on an article's URL
        title = response.xpath('//h1[@class="story-title"]/text()').get()
        if title is not None and cnn_counter < 300:
            #get the article's text
            text = response.xpath(
                '//div[@class="story-content"]//p/text()|//div[@class="story-content"]//strong/text()|//div[@class="story-content"]//a/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = re.sub(
                r'\n|\t', "",
                response.xpath(
                    '//div[@class="story-date story-credits icon icon-time"]/text()'
                ).get())
            final_date = formatdate(date)

            url = response.url

            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']:
                cnn_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website": CNN_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="story-author"]/text()').get()),
                    "article_body": re.sub(r'\n|\t', "", clear_characters),
                    "url": url,
                }
Example #20
    def parseItemPS(self, response):
        global cnn_counter
        title = response.xpath('//h1[@class="story-title"]/text()').get()

        text = response.xpath(
            '//div[@class="story-content"]//p/text()|//div[@class="story-content"]//strong/text()|//div[@class="story-content"]//a/text()'
        ).getall()
        list_to_string = " ".join(" ".join(text))
        markspaces = re.sub("  ", "space", list_to_string)
        uneeded_spaces = re.sub(" ", "", markspaces)
        final_text = re.sub("space", " ", uneeded_spaces)
        clear_characters = re.sub("\xa0", "", final_text)

        date = response.xpath(
            '//div[@class="story-date story-credits icon icon-time"]/text()'
        ).get()
        final_date = formatdate(date)

        #check that this is an article and not a photo gallery
        url = response.url
        article_type = url.split('/')[5]
        contains_photos = re.search('Photos', clear_characters)
        if article_type == CNN_VARS['ARTICLE_TYPE'] and contains_photos is None and cnn_counter < 300:
            cnn_counter += 1
            yield {
                "subtopic": GENERAL_CATEGORIES['CULTURE'],
                "website": CNN_VARS['WEBSITE'],
                "title": title,
                "article_date": final_date,
                "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="story-author"]/text()').get()),
                "article_body": re.sub(r'\n|\t', "", clear_characters),
                "url": url,
            }
Example #21
    def parse_tanea(self, response):
        #check if we are on an article's URL
        title = response.xpath(
            '//h1[@class="entry-title black-c"]/text()').get()
        if title is not None:
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            text = response.xpath(
                '//div[@class="main-content pos-rel article-wrapper"]//p/text()|//div[@class="main-content pos-rel article-wrapper"]//strong/text()|//div[@class="main-content pos-rel article-wrapper"]//h3/text()|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath(
                '//span[@class="firamedium postdate updated"]/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url
            subtopic = url.split('/')[7]
            if len(subtopic) > 15:
                subtopic = TANEA_VARS['CATEGORY_CULTURE']

            #check if we are in an article and that it doesn't have images
            if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": GENERAL_CATEGORIES['CULTURE'],
                    "website": TANEA_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": TANEA_VARS['AUTHOR'],
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }
Example #22
    def parse_insomnia(self, response):
        #check if we are on an article's URL
        title = response.xpath('//div[@class="container"]//h1/text()').get()
        if title is not None:
            #get article's text
            text = response.xpath(
                '//div[@class="the-content"]//p/text()|//div[@class="the-content"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)

            #find subtopic through url
            url = response.url
            subtopic = url.split('/')[4]

            date = response.xpath('//span[@class="timestamp"]/text()').get()
            final_date = formatdate(date)

            #check if we are in an article and that it doesn't have images
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": subtopic,
                    "website": INSOMNIA_VARS['WEBSITE'],
                    "title": title,
                    "article_date": final_date,
                    "author": response.xpath('//span[@class="author"]/a/text()').get(),
                    "article_body": re.sub(r'\s\s\s|\n|\t', "", clear_characters),
                    "url": response.url,
                }
Example #23
    def parse_thetoc(self, response):
        global thetoc_counter
        #check if we are on an article's URL
        title = response.xpath(
            '//div[@class="article-title"]//h1/text()').get()
        if title is not None and thetoc_counter < 300:
            text = response.xpath(
                '//div[@class="article-content articleText"]//p/text()|//div[@class="article-content articleText"]//strong/text()|//div[@class="article-content articleText"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//span[@class="article-date"]/text()').get()
            final_date = THETOC_VARS['full_date'] + formatdate(date)

            url = response.url
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']:
                thetoc_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                    "website": re.search(r"www.+\.gr", url).group(0),
                    "title": title,
                    "article_date": final_date,
                    "author": re.sub(r'\n|\t', "", response.xpath('//div[@class="author-social"]//h5/a/span[2]/text()').get()),
                    "article_body": re.sub(r'\n|\t', "", clear_characters),
                    "url": url,
                }
Example #24
    def parse_tovima(self, response):
        global tovima_counter
        #check if we are on an article's URL
        title = response.xpath(
            '//h1[@class="entry-title thirty black-c zonabold"]/text()').get()
        if title is not None and tovima_counter < 300:
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            text = response.xpath(
                '//div[@class="main-content pos-rel article-wrapper"]//p/text()|//div[@class="main-content pos-rel article-wrapper"]//strong/text()|//div[@class="main-content pos-rel article-wrapper"]//h3/text()|//div[@class="main-content pos-rel article-wrapper"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//time/span/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            url = response.url

            #check if we are in an article and that it doesn't have any images
            if len(final_text) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                tovima_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ECONOMICS'],
                    "website": TOVIMA_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": TOVIMA_VARS['AUTHOR'],
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }
Example #25
    def parse_popaganda(self, response):
        #check if we are on an article's URL
        title = response.xpath('//h1/text()').get()
        if title is not None:
            #fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            uneeded_escapes = re.sub(r'\n|\s\s\s', "", put_spaces_back)
            final_title = re.sub("\xa0", "", uneeded_escapes)

            #get the article's text
            text = response.xpath('//div[@class="post-content big nxContent"]//p/text()|//div[@class="post-content big nxContent"]//strong/text()|//div[@class="post-content big nxContent"]//span/*/text()|//div[@class="post-content big nxContent"]//em/text()|//div[@class="post-content big nxContent"]//p/*/text()').getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub(r'\s\s\s|\n', "", final_text)

            date = response.xpath('//div[@class="article_date"]/text()|//div[@class="fullscreen-date"]/text()').get()
            final_date = formatdate(date)

            #fall back to the site name when no author is given
            author = response.xpath('//div[@class="author-title"]/a/text()|//div[@itemprop="author-title"]/*/text()|//div[@class="fullscreen-author"]/a/text()').get()
            if author is None:
                author = POPAGANDA_VARS['WEBSITE']

            #flag to see later on if we have tweets etc.
            flag = re.search(r"@", clear_characters)
            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                yield {
                    "subtopic": POPAGANDA_VARS['FOOD'],
                    "website": POPAGANDA_VARS['WEBSITE'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": re.sub(r'\n', "", author),
                    #drop the leading space left over from the join
                    "article_body": clear_characters.replace(" ", "", 1),
                    "url": response.url,
                }