Example 1
    def __is_spot_im__(self, url: str) -> bool:
        """Check whether the HTML source of the input url contains Spot.IM keywords.

        Args:
            url (str): target page.

        Returns:
            bool: whether the target page has Spot.IM keywords or not.
        """
        headers = base_header()
        # Advertise browser-like content types; 'identity' disables compressed
        # transfer so the raw body bytes can be decoded directly.
        headers[
            'Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        headers['Accept-Encoding'] = 'identity'
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
        except Exception as e:
            self.error(
                "Failed to request HTML source codes from {}: \n{}".format(
                    url, repr(e)))
            return False

        # response.encoding is None when the server omits the charset, which
        # would make decode() raise TypeError; fall back to UTF-8.  Replace
        # undecodable bytes instead of raising — a lossy decode is fine for a
        # keyword search.
        source = response.content.decode(
            encoding=response.encoding or 'utf-8', errors='replace')
        source = source.lower()
        return 'spotim' in source or 'spot-im' in source
Example 2
    def bootstrap(self) -> None:
        """Seed the crawler by paging through each section's listing API.

        Walks every section backwards from the deepest offset the API serves
        down to 0, extracts story-headline links from the returned HTML
        fragment, and hands each URL to ``__process_url__``.
        """
        super().bootstrap()

        headers = base_header()
        headers['Accept'] = '*/*'
        # The endpoint only answers AJAX-style requests.
        headers['X-Requested-With'] = "XMLHttpRequest"
        for section in self.sections:
            # Iterate from the oldest page (offset 10000 - batchSize) down to
            # offset 0 so URLs are processed oldest-first.
            for offset in range(10000 - self.batchSize, -1, -self.batchSize):
                requestURL = self.API_ENDPOINT.format(section=section, offset=offset, limit=self.batchSize)
                try:
                    response = requests.get(requestURL, headers=headers)
                    response.raise_for_status()
                    response = response.json()
                except Exception as e:
                    # Bootstrap cannot continue with a hole in the history;
                    # log and abort.  (Fixed typo: "boostrap" -> "bootstrap".)
                    self.error("Failed to complete the bootstrap process: \n{}".format(repr(e)))
                    exit()
                # 'rendering' holds a bare HTML fragment; wrap it so lxml
                # parses a complete document.
                soup = BeautifulSoup('<html><body>{}</body></html>'.format(response['rendering']), "lxml")
                for a in soup.select('div[class="story-headline"] > h2 > a'):
                    url = a['href']
                    self.__process_url__(url, category=section)
Example 3
    def __request_page_source__(self, url: str) -> str:
        """Request HTML source of the input url.

        Args:
            url (str): target url.

        Returns:
            str: source codes, or None when the request fails.
        """

        headers = base_header()
        # Browser-like Accept header; 'identity' disables compressed transfer
        # so the raw body bytes can be decoded directly.
        headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        headers['Accept-Encoding'] = 'identity'
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
        except Exception as e:
            self.error("Failed to request HTML source codes from {}: \n{}".format(url, repr(e)))
            return None

        # response.encoding is None when the server omits the charset, which
        # would make decode() raise TypeError outside the try block above;
        # fall back to UTF-8 and replace undecodable bytes so this method
        # keeps its "source or None" contract instead of raising.
        return response.content.decode(
            encoding=response.encoding or 'utf-8', errors='replace')
Example 4
    def update(self) -> None:
        """Refresh each section with its newest batch of article URLs.

        Unlike ``bootstrap``, only the first page (offset 0) of every section
        is fetched; a failed section is skipped rather than aborting the run.
        """
        super().update()

        headers = base_header()
        headers['Accept'] = '*/*'
        # The endpoint only answers AJAX-style requests.
        headers['X-Requested-With'] = "XMLHttpRequest"
        for section in self.sections:
            requestURL = self.API_ENDPOINT.format(section=section, offset=0, limit=self.batchSize)
            try:
                response = requests.get(requestURL, headers=headers)
                response.raise_for_status()
                response = response.json()
            except Exception as e:
                # BUG FIX: the original format string had two placeholders but
                # only one argument, so logging itself raised IndexError and
                # masked the real error.  Supply both the URL and the cause.
                self.error("Failed to retrieve part of urls from the Washington Post: {}\n{}".format(requestURL, repr(e)))
                continue
            # 'rendering' holds a bare HTML fragment; wrap it so lxml parses a
            # complete document.
            soup = BeautifulSoup('<html><body>{}</body></html>'.format(response['rendering']), "lxml")
            for a in soup.select('div[class="story-headline"] > h2 > a'):
                url = a['href']
                self.__process_url__(url, category=section)
            self.info('Update finished for section {} in Washington Post.'.format(section))
Example 5
    def request_routine(self, articleURL):
        """Resolve the article's numeric post id and delegate to the parent.

        Fetches the article HTML, extracts the first run of digits following
        the 'post_id' marker, and re-requests the canonical ``?p=<id>`` URL
        through the parent class's routine.

        Args:
            articleURL: URL of the article page to resolve.

        Returns:
            Whatever ``super().request_routine`` returns, or None when no
            post id can be extracted.
        """
        headers = base_header()
        headers["Host"] = "theintercept.com"
        headers[
            "Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        headers['Accept-Encoding'] = 'identity'
        response = requests.get(articleURL, headers=headers)
        # response.encoding may be None (no charset sent); default to UTF-8
        # so decode() does not raise TypeError.
        s = response.content.decode(encoding=response.encoding or 'utf-8')
        idx = s.find('post_id')
        if idx == -1:
            self.error('Cannot find post id in {}.'.format(articleURL))
            return
        # BUG FIX: the original scan 'while not s[idx].isdigit()' had no bounds
        # check and raised IndexError when no digit followed 'post_id'.
        while idx < len(s) and not s[idx].isdigit():
            idx += 1
        postID = []
        while idx < len(s) and s[idx].isdigit():
            postID.append(s[idx])
            idx += 1
        if not postID:
            # Marker present but no digits after it — treat as not found.
            self.error('Cannot find post id in {}.'.format(articleURL))
            return
        postID = ''.join(postID)

        return super().request_routine(
            "https://theintercept.com/?p={}".format(postID))
Example 6
    def __init__(self) -> None:
        """Set up default JSON request headers; no crawl target selected yet."""
        hdrs = base_header()
        hdrs["Content-Type"] = "application/json"
        hdrs["Accept"] = "*/*"
        self.headers = hdrs

        # Target URL is assigned later by the caller.
        self.targetUrl = None