Example #1
    def parse_mihaaru_links(self, response):
        """
        Does not use an RSS Feed.
        Scrapes site for links to articles.
        """
        links = response.xpath('//div[@class="main_news_size_2"]/a')

        for link in links:
            item = NewsbyteItem()
            item['source'] = response.url
            url = link.xpath('@href').extract()
            item['link'] = url[0]
            title = link.xpath('text()').extract()
            item['title'] = title[0]
            item['country'] = '#' if response.meta[
                'country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta[
                'language'] is None else response.meta['language']
            item['region'] = self.region
            pubdate = time.localtime()  # no pubdate on the site; use the scrape time
            item['pubdate'] = time.mktime(pubdate)
            request = Request(item['link'],
                              callback=getattr(self, response.meta['method']),
                              dont_filter=response.meta.get(
                                  'dont_filter', False))
            request.meta['item'] = item

            yield request
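All of these callbacks populate the same NewsbyteItem and hand it to an article callback via request.meta. A minimal sketch of that item class, inferred from the fields the examples assign (hypothetical; the real definition is not shown and may carry more fields):

    import scrapy

    class NewsbyteItem(scrapy.Item):
        # Hypothetical sketch inferred from the assignments in these examples.
        source = scrapy.Field()       # URL of the listing page or feed
        link = scrapy.Field()         # URL of the article itself
        title = scrapy.Field()
        description = scrapy.Field()
        country = scrapy.Field()      # '#' when unknown
        language = scrapy.Field()     # '#' when unknown
        region = scrapy.Field()
        item_id = scrapy.Field()      # uuid4 string
        pubdate = scrapy.Field()      # epoch seconds from time.mktime
        thumbnail = scrapy.Field()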
Example #2
    def parse_vienlinks(self, response):
        """
        There's an issue regarding date published... currently using the date when it's scraped.
        Does not use an RSS Feed.
        """
        for link in response.xpath('//div[@class="list-view"]//a/@href'):
            full_link = response.urljoin(link.extract())
            item = NewsbyteItem()
            item['source'] = response.url
            item['link'] = full_link
            item['country'] = '#' if response.meta[
                'country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta[
                'language'] is None else response.meta['language']
            item['region'] = self.region
            pubdate = time.localtime()
            item['pubdate'] = time.mktime(pubdate)
            request = Request(full_link,
                              callback=getattr(self, response.meta['method']),
                              dont_filter=response.meta.get(
                                  'dont_filter', False))
            request.meta['item'] = item

            yield request
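Every one of these callbacks reads response.meta['country'], response.meta['language'], response.meta['method'], and optionally response.meta['dont_filter'], so the request that fetches the listing page must seed those keys. A hypothetical sketch of such a seeding request (the real start_requests is not part of the excerpt; the URL and meta values are placeholders):

    from scrapy import Request

    def start_requests(self):
        # Hypothetical; written as it might appear inside the spider class.
        yield Request(
            'http://www.example.com/news',   # placeholder listing URL
            callback=self.parse_vienlinks,
            meta={
                'country': 'LA',             # None falls back to '#'
                'language': 'lo',
                'method': 'parse_article',   # article callback, resolved via getattr
                'dont_filter': False,
            },
        )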
Example #3
    def parse_fortunelinks(self, response):
        """
        Does not use an RSS Feed.
        """
        for link in response.xpath('//div[@class="span6"]/h3/a'):
            new_link = ''.join(link.xpath('./@href').extract())
            title = ''.join(link.xpath('./text()').extract())

            item = NewsbyteItem()
            item['source'] = response.url
            item['link'] = new_link
            item['title'] = title
            item['country'] = '#' if response.meta['country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta['language'] is None else response.meta['language']
            item['region'] = self.region

            request = Request(
                new_link,
                callback=getattr(self, response.meta['method']),
                dont_filter=response.meta.get('dont_filter', False)
            )

            request.meta['item'] = item

            yield request
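The `'#' if response.meta['country'] is None else response.meta['country']` ternary recurs in every example; it could be folded into a small helper. A hypothetical refactor, not present in the original code:

    def meta_or_hash(response, key):
        # Hypothetical helper: '#' marks an unknown country/language,
        # mirroring the inline ternaries used throughout these callbacks.
        value = response.meta.get(key)
        return '#' if value is None else value

    # usage inside any of the callbacks:
    #   item['country'] = meta_or_hash(response, 'country')
    #   item['language'] = meta_or_hash(response, 'language')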
Example #4
    def parse_dailystarlinks(self, response):
        """
        Does not use an RSS Feed.
        """
        links = response.xpath(
            '//div[@class="three-33"]/ul[@class="list-border besides"]/li/a/@href'
        ).extract()
        frontpage_links = [link for link in links if 'frontpage' in link]

        for link in frontpage_links:
            item = NewsbyteItem()

            item['source'] = response.url
            item['link'] = response.urljoin(link)

            item['country'] = '#' if response.meta[
                'country'] is None else response.meta['country']
            item['item_id'] = str(uuid4())
            item['language'] = '#' if response.meta[
                'language'] is None else response.meta['language']
            item['region'] = self.region
            request = Request(item['link'],
                              callback=getattr(self, response.meta['method']),
                              dont_filter=response.meta.get(
                                  'dont_filter', False))
            request.meta['item'] = item
            request.meta['xpath'] = response.meta['xpath']

            yield request
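parse_dailystarlinks also forwards response.meta['xpath'], which the article callback named by meta['method'] presumably uses to pull the body text out of the article page. A hypothetical sketch of such a callback (the real target method is not shown in these examples):

    def parse_article(self, response):
        # Hypothetical article callback; the real method named by
        # response.meta['method'] is not part of the excerpt.
        item = response.meta['item']
        paragraphs = response.xpath(response.meta['xpath']).extract()
        item['description'] = ' '.join(p.strip() for p in paragraphs)
        yield item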
Example #5
    def parse_raiamwema_links(self, response):
        """
        Does not use an RSS Feed.
        Scrapes site for links to articles.
        """

        # Gets latest posts
        links = response.xpath('//div[@class="item-details"]')
        for link in links:
            try:
                item = NewsbyteItem()
                item['source'] = response.url
                url = link.xpath('h3/a/@href').extract()
                item['link'] = url[0]
                title = link.xpath('h3/a/text()').extract()
                item['title'] = title[0]
                item['country'] = '#' if response.meta['country'] is None else response.meta['country']
                item['item_id'] = str(uuid4())
                item['language'] = '#' if response.meta['language'] is None else response.meta['language']
                item['region'] = self.region
                pubdate = time.localtime()  # no pubdate on the site; use the scrape time
                item['pubdate'] = time.mktime(pubdate)
                description = link.xpath('div[@class="td-excerpt"]/text()').extract()
                item['description'] = description[0]
                request = Request(
                    item['link'],
                    callback=getattr(self, response.meta['method']),
                    dont_filter=response.meta.get('dont_filter', False)
                )
                request.meta['item'] = item

                yield request
            except Exception as e:
                print e
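The broad try/except here mostly guards the url[0] and title[0] indexing, which raises IndexError whenever an XPath matches nothing. Scrapy selectors also offer extract_first(), which returns None (or a supplied default) on an empty match; a sketch of the same extraction step written that way:

    for link in links:
        # extract_first() returns None instead of raising IndexError.
        url = link.xpath('h3/a/@href').extract_first()
        title = link.xpath('h3/a/text()').extract_first()
        if url is None or title is None:
            continue  # not an article teaser; skip it
        # ... build and yield the NewsbyteItem request as above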
Example #6
    def parse_dominican_today_links(self, response):
        """
        Does not use an RSS Feed.
        Scrapes site for links to articles.
        Only gets news categorized under "Local".
        """
        links = response.xpath(
            '//div[contains(@class, "noticias2-category")]/h2/a')

        for link in links:
            try:
                item = NewsbyteItem()
                item['source'] = response.url
                url = link.xpath('@href').extract()
                item['link'] = url[0]
                title = link.xpath('text()').extract()
                item['title'] = title[0]
                item['country'] = '#' if response.meta[
                    'country'] is None else response.meta['country']
                item['item_id'] = str(uuid4())
                item['language'] = '#' if response.meta[
                    'language'] is None else response.meta['language']
                item['region'] = self.region
                request = Request(item['link'],
                                  callback=getattr(self,
                                                   response.meta['method']),
                                  dont_filter=response.meta.get(
                                      'dont_filter', False))
                request.meta['item'] = item

                yield request
            except Exception as e:
                print e
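The contains(@class, "noticias2-category") test matches the div even when the site tacks extra classes onto the attribute, where an exact @class comparison would silently break. For comparison, the equivalent selection with Scrapy's CSS support (*= is a substring match, like XPath's contains()):

    links = response.css('div[class*="noticias2-category"] > h2 > a')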
Example #7
    def parse_al_ayyam(self, response):
        """
        This function uses feedparser to parse through rss feeds extracting the XML nodes important to present in news articles
         (i.e title of article, link, publication date, etc.) and populates the scrapy.Fields in the NewsbyteItem dictionary with said info.
        """
        feed = feedparser.parse(response.body)
        for entry in feed.entries:
            try:
                item = NewsbyteItem()
                item['source'] = response.url
                item['title'] = lxml.html.fromstring(entry.title).text
                pubdate = entry.published
                if not isinstance(pubdate, unicode):  # already a struct_time
                    # fix wrong dates
                    if pubdate.tm_year < 2000:
                        pubdate = time.localtime()
                    elif time.localtime().tm_yday - pubdate.tm_yday > 7:
                        continue  # skip articles older than 7 days
                    item['pubdate'] = time.mktime(pubdate)
                else:
                    pubdate = parse(pubdate, fuzzy=True, dayfirst=True)
                    item['pubdate'] = time.mktime(pubdate.timetuple())
                item['link'] = 'http://www.al-ayyam.ps/' + entry.link
                item['country'] = '#' if response.meta[
                    'country'] is None else response.meta['country']
                item['language'] = '#' if response.meta[
                    'language'] is None else response.meta['language']
                item['description'] = entry.description
                item['item_id'] = str(uuid4())
                item['region'] = self.region
                request = Request(item['link'],
                                  callback=getattr(self,
                                                   response.meta['method']),
                                  dont_filter=response.meta.get(
                                      'dont_filter', False))

                request.meta['item'] = item
                request.meta['entry'] = entry
                request.meta['xpath'] = response.meta['xpath']
                if 'thumb_xpath' in response.meta:
                    request.meta['thumb_xpath'] = response.meta['thumb_xpath']
                else:
                    item['thumbnail'] = ''

                yield request

            except Exception as e:
                print '%s: %s' % (type(e), e)
                print entry
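The dayfirst=True hint matters for ambiguous numeric dates such as the ones in this feed; a quick illustration of how dateutil resolves them (illustrative values only):

    from dateutil.parser import parse

    parse(u'05/04/2016', dayfirst=True)   # -> datetime(2016, 4, 5): 5 April
    parse(u'05/04/2016', dayfirst=False)  # -> datetime(2016, 5, 4): 4 May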
Example #8
    def parse_abante_links(self, response):
        """
        Does not use an RSS Feed.
        Scrapes site for links to articles.
        """

        # Gets links
        links = response.xpath('//div[contains(@class,"td-animation")]')
        for link in links:
            try:
                item = NewsbyteItem()
                item['source'] = response.url
                url = link.xpath(
                    'div[@class="item-details"]/h3/a/@href').extract()
                item['link'] = url[0]
                title = link.xpath(
                    'div[@class="item-details"]/h3/a/@title').extract()
                item['title'] = title[0]
                item['country'] = '#' if response.meta[
                    'country'] is None else response.meta['country']
                item['item_id'] = str(uuid4())
                item['language'] = '#' if response.meta[
                    'language'] is None else response.meta['language']
                item['region'] = self.region
                pubdate = link.xpath(
                    'div[@class="item-details"]/div/span/time/@datetime'
                ).extract()
                pubdate = parse(pubdate[0], fuzzy=True, dayfirst=False)
                item['pubdate'] = time.mktime(pubdate.timetuple())
                description = link.xpath(
                    'div[@class="item-details"]/div[@class="td-excerpt"]'
                ).extract()
                item['description'] = description[0]
                thumbnail = link.xpath(
                    'div[@class="td-module-thumb"]/a/img/@src').extract()
                item['thumbnail'] = thumbnail[0]
                request = Request(item['link'],
                                  callback=getattr(self,
                                                   response.meta['method']),
                                  dont_filter=response.meta.get(
                                      'dont_filter', False))
                request.meta['item'] = item

                yield request
            except Exception as e:
                print e
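Here the date comes from a <time datetime="..."> attribute, which is normally ISO 8601 and therefore unambiguous; the fuzzy/dayfirst hints are harmless for such values:

    from dateutil.parser import parse

    # A typical ISO 8601 value parses cleanly without any hints.
    parse(u'2017-06-01T08:30:00+08:00')
    # -> datetime.datetime(2017, 6, 1, 8, 30, tzinfo=tzoffset(None, 28800))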
Example #9
    def parse_megi_links(self, response):
        """
        Does not use an RSS Feed.
        Scrapes site for links to articles.
        """

        # Gets latest posts
        latest = response.xpath('//div[@class="latest_widget_blurb_wrapper"]')

        # Get header of lifestyle news
        lifestyle = response.xpath('//div[@class="homepage_article_widget_section_title" and text()="Lifestyle"]')
        # Gets front page posts before lifestyle news
        news = lifestyle.xpath('preceding::div[@class="special_smallstory_white_wrappers"]')

        # Combines the latest posts with the other front-page news
        links = latest + news
        for link in links:
            try:
                item = NewsbyteItem()
                item['source'] = response.url
                url = link.xpath('div/a/@href').extract()
                item['link'] = response.urljoin(url[0])
                title = link.xpath('div/a/text()').extract()
                item['title'] = title[0]
                item['country'] = '#' if response.meta['country'] is None else response.meta['country']
                item['item_id'] = str(uuid4())
                item['language'] = '#' if response.meta['language'] is None else response.meta['language']
                item['region'] = self.region
                pubdate = time.localtime()  # if there is no pubdate the time it is scraped is used
                item['pubdate'] = time.mktime(pubdate)
                description = link.xpath('div/text()').extract()
                item['description'] = description[1]
                request = Request(
                    item['link'],
                    callback=getattr(self, response.meta['method']),
                    dont_filter=response.meta.get('dont_filter', False)
                )
                request.meta['item'] = item

                yield request
            except Exception as e:
                print e
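The preceding:: axis used above selects nodes that occur before the context node in document order, which is how the snippet limits itself to front-page stories that appear before the "Lifestyle" header. A minimal standalone illustration:

    from scrapy import Selector

    sel = Selector(text='<div class="s">one</div>'
                        '<h2 id="stop">Lifestyle</h2>'
                        '<div class="s">two</div>')
    sel.xpath('//h2[@id="stop"]/preceding::div[@class="s"]/text()').extract()
    # -> [u'one']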
Example #10
    def parse_common(self, response):
        """
        This function uses feedparser to parse through rss feeds extracting the XML nodes important to present in news articles
         (i.e title of article, link, publication date, etc.) and populates the scrapy.Fields in the NewsbyteItem dictionary with said info.
        """
        feed = feedparser.parse(response.body)
        for entry in feed.entries:
            try:
                item = NewsbyteItem()

                # Get the published date: prefer the parsed publish time, then
                # the parsed update time, then the raw date string.
                pubdate = (entry.get('published_parsed')
                           or entry.get('updated_parsed')
                           or entry.get('published'))
                if pubdate is None:
                    pubdate = time.localtime()  # no pubdate; use the scrape time
                if not isinstance(pubdate, unicode):  # already a struct_time
                    # Fix Sri Lanka native dates
                    if response.url == 'http://www.lankadeepa.lk/rss/latest_news/1':
                        date = entry.published
                        split_date = date.split()
                        # Create new pubdate from published_parsed and published dates
                        pubdate = time.strptime(
                            '%d %d %s %s' % (pubdate.tm_year, pubdate.tm_mon,
                                             split_date[3], split_date[5]),
                            '%Y %m %d %H:%M')
                    # Fix wrong dates
                    if pubdate.tm_year < 2000:  # guard against obviously bogus (pre-2000) dates
                        pubdate = time.localtime()
                    elif time.localtime().tm_yday - pubdate.tm_yday > 7: # Checks if article is older than 7 days
                        print "outdated"
                        continue
                    else:
                        item['pubdate'] = time.mktime(pubdate)
                else:
                    # Certain feeds use mm/dd/yyyy format instead of dd/mm/yyyy
                    if response.url == 'http://mubasher.aljazeera.net/rss.xml' or \
                        response.url == 'http://www.almadapaper.net/rss/':
                        pubdate = parse(pubdate, fuzzy=True, dayfirst=False)
                    else:
                        pubdate = parse(pubdate, fuzzy=True, dayfirst=True)

                    # Checks if article is older than 7 days
                    if time.localtime().tm_yday - pubdate.timetuple().tm_yday > 7:
                        continue
                    else:
                        pubdate = time.mktime(pubdate.timetuple())
                    item['pubdate'] = pubdate

                item['item_id'] = str(uuid4())
                item['source'] = response.url
                item['link'] = entry.link
                item['country'] = '#' if response.meta['country'] is None else response.meta['country']
                item['language'] = '#' if response.meta['language'] is None else response.meta['language']
                item['title'] = lxml.html.fromstring(entry.title).text
                item['description'] = entry.description
                item['region'] = self.region
                request = Request(
                    entry.link,
                    callback=getattr(self, response.meta['method']),
                    dont_filter=response.meta.get('dont_filter', False)
                )

                request.meta['item'] = item
                request.meta['entry'] = entry
                request.meta['xpath'] = response.meta['xpath']
                if 'thumb_xpath' in response.meta:
                    request.meta['thumb_xpath'] = response.meta['thumb_xpath']
                else:
                    item['thumbnail'] = ''

                yield request

            except Exception as e:
                print '%s: %s' % (type(e), e)
                print entry
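A standalone illustration of what parse_common relies on: feedparser exposes published_parsed and updated_parsed as time.struct_time when the feed's date is machine-readable, and only the raw string in entry.published otherwise (the feed URL below is just an example):

    import feedparser
    import time

    feed = feedparser.parse('http://feeds.bbci.co.uk/news/rss.xml')
    for entry in feed.entries[:3]:
        pubdate = entry.get('published_parsed') or time.localtime()
        print entry.title, time.strftime('%Y-%m-%d', pubdate)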