Example #1
    def parse_news(self, response):
        """
        Extract the data from the news page and if the page is not in cache,
        this HTML request is counted, so the ip should be updated if necessary.
        The update ip needs to stay here unless you don't want HTTPCACHE
        """

        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title',
                         '//span[@class="lede-text-only__highlight"]/text()')
        loader.add_xpath(
            'title', '//span[@class="lede-large-content__highlight"]/text()')
        loader.add_xpath('title', '//article//h1/text()')
        authors = response.xpath('//div[@class="author"]/text()').extract()
        for author in authors:
            author = strip_html5_whitespace(author)
            author = replace_escape_chars(author)
            if len(author) != 0:
                loader.add_value('author', author)
        timestamp = response.xpath(
            '//time[@class="article-timestamp"]/@datetime').extract()[0]
        timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        loader.add_xpath('content', '//div[@class="body-copy fence-body"]')
        loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
        return loader.load_item()
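The docstring above refers to Scrapy's built-in HTTP cache. A minimal sketch of the settings.py entries that enable it (the values shown are illustrative, not taken from the original project):

    # settings.py -- enable Scrapy's HttpCacheMiddleware so repeated requests
    # for the same page are served from disk instead of hitting the site
    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 0        # 0 = cached pages never expire
    HTTPCACHE_DIR = 'httpcache'          # stored under the project's .scrapy dir
    HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503, 504]
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'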
Example #2
    def parse_news(self, response):
        """
        Extract the data from the news page. If the page is not in the cache,
        this HTML request is counted, so the IP should be updated if
        necessary. The IP update needs to stay here unless HTTPCACHE is
        disabled.
        """

        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title',
                         '//article/header[@class = "entry-header"]/a/text()')
        loader.add_xpath(
            'title', '//article/header[@class = "entry-header"]/h1/text()')
        loader.add_xpath(
            'author',
            '//article/header/p/span/a/span[@class = "entry-author-name"]/text()'
        )
        timestamp = response.xpath(
            '//article/header/p/time[@class = "entry-time"]/@datetime'
        ).extract()[0]
        timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        loader.add_xpath('content', '//article/div[@class= "entry-content"]/p')
        loader.add_xpath('tags', '//meta[@name="news_keywords"]/@content')
        return loader.load_item()
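NewsItem and NewsLoader are project-specific classes that are not shown on this page. A plausible minimal sketch, consistent with how the examples use them (several add_xpath fallbacks per field, single-valued output), might look like this:

    # items.py -- hypothetical sketch; the real project may differ
    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import Join, MapCompose, TakeFirst
    from w3lib.html import remove_tags

    class NewsItem(scrapy.Item):
        title = scrapy.Field()
        author = scrapy.Field()
        date = scrapy.Field()
        time = scrapy.Field()
        content = scrapy.Field()
        tags = scrapy.Field()

    class NewsLoader(ItemLoader):
        default_output_processor = TakeFirst()           # keep the first non-empty match
        author_out = Join(', ')                          # collect multiple authors
        content_in = MapCompose(remove_tags, str.strip)  # strip markup from raw HTML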
Example #3
    def parse(self, response):
        '''
        Infinite scroll of MarketWatch news from marketwatch.com/newsviewer
        '''

        display = Display(visible=0, size=(480, 320))
        display.start()
        
        driver = webdriver.Chrome()
        driver.get(response.url)
        
        driver.execute_script("x = document.getElementById('mktwheadlines').getElementsByClassName('viewport')[0];")
        
        # removing unnecessary elements from the page
        driver.execute_script('x=document.getElementById("thirdpartyheadlines"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("rightrail"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("thirdpartycontrols"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("sponsoredlinks"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("below"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("chrome"); x.parentNode.removeChild(x);')
        
        i = 0
        last_timestamp_scraped = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        # Infinite scrolling
        try:   
            while True: 
                time.sleep(SCROLL_PAUSE_TIME)
                driver.execute_script("x = document.getElementById('mktwheadlines').getElementsByClassName('viewport')[0];x.scrollTo(0,x.scrollHeight);")
                i = i + 1
                    
                # Every ten iterations, harvest the <li> tags and then
                # remove them from the web page
                if i % 10 == 0:
                    elements = driver.find_elements_by_xpath('.//div[@id="mktwheadlines"]//ol[@class="viewport"]/li')
                    # yield the retrieved elements as items
                    for elem in elements:
                        try:
                            timestamp = elem.get_attribute("timestamp")
                            timestamp = du.normalize_timestamp(timestamp, timezone='US/Eastern')
                            # only yield headlines newer than the last one seen
                            if du.compare_time(timestamp, last_timestamp_scraped):
                                item = BriefItem()
                                item['title'] = elem.find_element_by_xpath('.//div[@class="nv-text-cont"]').text
                                try:
                                    item['url'] = elem.find_element_by_xpath('.//a[@class="read-more"]').get_attribute("href")
                                except Exception:
                                    item['url'] = ""
                                item['date'] = timestamp.split(' ')[0]
                                item['time'] = timestamp.split(' ')[1]
                                last_timestamp_scraped = timestamp
                                yield item
                        except Exception as e:
                            self.logger.error(e)
                    driver.execute_script('var element = document.getElementsByTagName("li"); var index;for (index = 0; index <= element.length - 2; index++) {element[0].parentNode.removeChild(element[0]);}')
        except Exception as e:
            self.logger.error("Error scraping marketwatch.com")
            self.logger.error(e)
        finally:    
            # need to close the driver
            driver.close()
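du is a project-specific date-utility module used throughout these examples; normalize_timestamp, compare_time, and getCurrentDate are not shown on this page. A plausible sketch of what they do, inferred from the call sites:

    # dateutils.py (imported as ``du``) -- hypothetical sketch
    from datetime import datetime

    import pytz
    from dateutil import parser

    def normalize_timestamp(raw, hasTimezone=False, timezone=None,
                            output_format='%Y-%m-%d %H:%M:%S'):
        """Parse ``raw`` and return it as a string in ``output_format``."""
        dt = parser.parse(raw)
        if timezone is not None:           # naive timestamp in a known zone
            dt = pytz.timezone(timezone).localize(dt)
        if dt.tzinfo is not None:          # hasTimezone: convert aware values to UTC
            dt = dt.astimezone(pytz.utc).replace(tzinfo=None)
        return dt.strftime(output_format)

    def compare_time(ts, reference):
        """True if ``ts`` is strictly newer than ``reference``."""
        return ts > reference              # '%Y-%m-%d %H:%M:%S' strings sort chronologically

    def getCurrentDate():
        return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')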
Example #4
    def parse(self, response):
        '''
        Parse a single page from investing.com/news/stock-market-news and yield
        a Request for the next page.
        '''
        # retrieving the list of articles on the page
        articles = response.xpath("//article[@class='articleItem']")

        for article in articles:
            try:
                item = BriefItem()
                # the XPaths are relative ('.//'); an absolute '//' here would
                # match the first article on the whole page on every iteration
                item['title'] = article.xpath(
                    ".//a[@class='title']/text()").extract()[0]
                # retrieving the date
                date = article.xpath(
                    ".//span[@class='date']/text()").extract()[0]
                # articles from the current day may have their date expressed
                # as 'X minutes/hours ago'
                if 'ago' in date:
                    item['date'] = du.normalize_timestamp(
                        du.getCurrentDate(), output_format="%Y-%m-%d")
                else:
                    # the date may start with a blank space followed by a '-'
                    date = self.fixTimeFormat(date)
                    date = du.normalize_timestamp(date,
                                                  output_format="%Y-%m-%d")
                    item['date'] = date
                item['time'] = ""
                url = article.xpath(".//a[@class='title']/@href").extract()[0]
                if 'http' not in url:
                    url = "https://www.investing.com" + url
                item['url'] = url
                yield item
            except Exception as e:
                self.logger.error(e)
        # moving to the next page
        if self.i != self.page_number:
            self.i = self.i + 1
            yield Request("http://www.investing.com/news/stock-market-news/" +
                          str(self.i))
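fixTimeFormat is another method of this spider that is not shown on this page. Based on the comment above, a plausible sketch is simply stripping the leading ' - ' that precedes absolute dates:

    def fixTimeFormat(self, date):
        # hypothetical sketch: ' - Nov 29, 2017' -> 'Nov 29, 2017'
        return date.strip().lstrip('-').strip()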
Example #5
    def parse_article(self, response):
        '''
        Parse a single article and write the requested info to file.
        '''
        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', './/h1/text()')
        loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
        timestamp = response.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract()[0]
        timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        loader.add_xpath('content', './/div[@id="grantexto"]')
        yield loader.load_item()
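Several of these examples index the result of extract() directly, which raises an IndexError when the XPath matches nothing. Where that is not the intent, Scrapy's extract_first() (or .get() in newer releases) returns a default instead:

    # safer equivalent of ``.extract()[0]``
    timestamp = response.xpath(
        '//meta[@itemprop="datePublished"]/@content').extract_first(default='')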
Example #6
    def parse_news(self, response):
        """
        Parse all the news inside the page
        """

        list_of_news = response.xpath('//div[@class="headlineMed"]').extract()
        for news in list_of_news:
            item = BriefItem()
            item['title'] = Selector(
                text=news).xpath('//a/text()').extract()[0]
            # the date (8 characters) sits at a fixed offset in the URL;
            # the time is the text of the headline's <div>
            timestamp = response.url[45:53] + " " + Selector(
                text=news).xpath('//div/text()').extract()[0]
            timestamp = du.normalize_timestamp(timestamp)
            item['date'] = timestamp.split(' ')[0]
            item['time'] = timestamp.split(' ')[1]
            item['url'] = Selector(text=news).xpath('//a/@href').extract()[0]
            yield item
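BriefItem is also project-specific; a plausible minimal definition, consistent with the fields assigned in these examples:

    # items.py -- hypothetical sketch
    import scrapy

    class BriefItem(scrapy.Item):
        title = scrapy.Field()
        date = scrapy.Field()
        time = scrapy.Field()
        url = scrapy.Field()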
Example #7
    def parse_news(self, response):
        """
        Return a News item with all the content inside the page
        """

        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//div[@id="js-article-text"]//h1/text()')
        loader.add_xpath(
            'author',
            '//div[@id="js-article-text"]//a[@class="author"]/text()')
        timestamp = response.xpath(
            '//meta[@property="article:published_time"][1]/@content'
        ).extract()[0]
        timestamp = du.normalize_timestamp(timestamp, hasTimezone=True)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        list_of_contents = response.xpath(
            '//div[@itemprop="articleBody"]/p/text()').extract()
        content = ' '.join(list_of_contents)
        loader.add_value('content', content)
        loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
        return loader.load_item()
Example #8
    def parse_news(self, response):
        """
        Return a News item with all the content inside the page
        """

        loader = NewsLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//header//h1/text()')
        author = ''.join(response.xpath('//span[@class="byline"]').extract())
        author = remove_tags(author).replace("by", '').replace(' and ', ', ')
        loader.add_value('author', author)
        timestamp = response.xpath(
            '//meta[@name="DC.date.issued"][1]/@content').extract()[0]
        timestamp = du.normalize_timestamp(timestamp)
        loader.add_value('date', timestamp.split(' ')[0])
        loader.add_value('time', timestamp.split(' ')[1])
        list_of_contents = response.xpath(
            '//div[@id="storytext"]/*[not(@class="cnnplayer") and '
            'not(@class="storytimestamp")]').extract()
        content = ' '.join(list_of_contents)
        loader.add_value('content', content)
        loader.add_xpath('tags', '//meta[@name="keywords"]/@content')
        return loader.load_item()
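Note that the .replace("by", '') call above also deletes the substring 'by' inside author names (e.g. 'Abby' becomes 'Ab'). A word-boundary regex is a safer variant, sketched here as an alternative rather than as the original author's code:

    import re

    # remove only a leading 'by'/'By' token, then normalize 'and' separators
    author = remove_tags(author)
    author = re.sub(r'^\s*[Bb]y\s+', '', author)
    author = re.sub(r'\s+and\s+', ', ', author)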
Example #9
    def parse(self, response):
        """
        Parse the start_urls with selenium
        """
        
        display = Display(visible=0, size=(480, 320))
        display.start()
        
        driver = webdriver.Chrome()
        driver.get(response.url)
        
        # removing unnecessary elements from the page
        driver.execute_script('x=document.getElementById("mktwheadlines"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("rightrail"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("mktwcontrols"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("sponsoredlinks"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("below"); x.parentNode.removeChild(x);')
        driver.execute_script('x=document.getElementById("chrome"); x.parentNode.removeChild(x);')

        #remove the tag li:loading from the list
        driver.execute_script(
            'nv_cont_list = document.getElementById("thirdpartyheadlines").getElementsByTagName("ol")[0];'
            'loader = nv_cont_list.getElementsByClassName("loading")[0];'
            'loader.parentNode.removeChild(loader);'
        )
        
        last_timestamp_scraped = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        
        try:
            while True:
                time.sleep(1)

                # executes the script that pulls the news items and then
                # appends the tag li.loading back at the end of the list
                driver.execute_script(self.js_script, ITEMS_TO_PULL_FOR_REQUEST)
                try:
                    # wait until the tag li.loading reappears
                    WebDriverWait(driver, 60).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "loading"))
                    )
                except Exception as e:
                    self.logger.error(e)
                # scroll the <ol> list to more than half of its height to
                # avoid downloading newer news
                driver.execute_script(
                    'x = document.getElementById("thirdpartyheadlines")'
                    '.getElementsByTagName("ol")[0];'
                    'x.scrollTo(0,x.scrollHeight*3/4);'
                )
                # remove the tag li.loading from the list
                driver.execute_script(
                    'nv_cont_list = document.getElementById("thirdpartyheadlines").getElementsByTagName("ol")[0];'
                    'loader = nv_cont_list.getElementsByClassName("loading")[0];'
                    'loader.parentNode.removeChild(loader);'
                )
                # retrieving the list of news
                elements = driver.find_elements_by_xpath('.//div[@id="thirdpartyheadlines"]//ol[@class="viewport"]/li[not(@class="loading")]')
                for elem in elements:
                    try:
                        timestamp = elem.get_attribute("timestamp")
                        timestamp = du.normalize_timestamp(timestamp, timezone='US/Eastern')
                        # only yield headlines newer than the last one seen
                        if du.compare_time(timestamp, last_timestamp_scraped):
                            item = BriefItem()
                            item['title'] = elem.find_element_by_xpath('.//div[@class="nv-text-cont"]').text
                            try:
                                item['url'] = elem.find_element_by_xpath('.//a[@class="read-more"]').get_attribute("href")
                            except Exception:
                                item['url'] = ""
                            item['date'] = timestamp.split(' ')[0]
                            item['time'] = timestamp.split(' ')[1]
                            last_timestamp_scraped = timestamp
                            yield item
                    except Exception as e:
                        self.logger.error(e)
                # delete the newer headlines until only ITEMS_TO_KEEP items remain
                self.logger.info(last_timestamp_scraped)
                driver.execute_script(
                    'var list = document.getElementById("thirdpartyheadlines").getElementsByTagName("ol")[0];'
                    'while(list.childNodes.length > arguments[0]){'
                    'list.removeChild(list.firstChild);}', ITEMS_TO_KEEP)
        finally:
            # the loop only exits via an exception or interrupt;
            # make sure the driver is closed either way
            driver.close()
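The Selenium-based examples (#3 and #9) assume module-level imports and constants that are not shown on this page. A plausible header for the spider module, with illustrative values for the constants:

    import time
    from datetime import datetime

    from pyvirtualdisplay import Display
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    SCROLL_PAUSE_TIME = 2            # seconds between scrolls; actual value not shown
    ITEMS_TO_PULL_FOR_REQUEST = 100  # illustrative
    ITEMS_TO_KEEP = 20               # illustrative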