def __is_spot_im__(self, url: str) -> bool:
    """Check whether Spot.IM keywords exist in the HTML source of the input url.

    Args:
        url (str): target page.

    Returns:
        bool: whether the target page has Spot.IM keywords or not.
    """
    headers = base_header()
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    headers['Accept-Encoding'] = 'identity'
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except Exception as e:
        self.error("Failed to request HTML source from {}: \n{}".format(url, repr(e)))
        return False
    # Fall back to the detected encoding when the response header omits a charset.
    source = response.content.decode(encoding=response.encoding or response.apparent_encoding)
    source = source.lower()
    return 'spotim' in source or 'spot-im' in source
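# A minimal, standalone sketch of the keyword check above. The sample markup
# is an illustrative assumption, not taken from a real page.
sample = '<div class="spot-im-frame-inpage" data-post-id="abc123"></div>'
source = sample.lower()
print('spotim' in source or 'spot-im' in source)  # True: matches the 'spot-im' literal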
def bootstrap(self) -> None:
    super().bootstrap()
    headers = base_header()
    headers['Accept'] = '*/*'
    headers['X-Requested-With'] = "XMLHttpRequest"
    for section in self.sections:
        # Page backwards through the archive, from the deepest offset down to 0.
        for offset in range(10000 - self.batchSize, -1, -self.batchSize):
            requestURL = self.API_ENDPOINT.format(section=section, offset=offset, limit=self.batchSize)
            try:
                response = requests.get(requestURL, headers=headers)
                response.raise_for_status()
                response = response.json()
            except Exception as e:
                self.error("Failed to complete the bootstrap process: \n{}".format(repr(e)))
                exit()
            soup = BeautifulSoup('<html><body>{}</body></html>'.format(response['rendering']), "lxml")
            for a in soup.select('div[class="story-headline"] > h2 > a'):
                url = a['href']
                self.__process_url__(url, category=section)
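# How the bootstrap loop walks the archive, sketched with an assumed
# batchSize of 2000 (the real value lives on self.batchSize): offsets run
# from the deepest page back to 0, so the oldest headlines come first.
batchSize = 2000  # hypothetical value for illustration only
print(list(range(10000 - batchSize, -1, -batchSize)))  # [8000, 6000, 4000, 2000, 0]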
def __request_page_source__(self, url: str) -> str:
    """Request the HTML source of the input url.

    Args:
        url (str): target url.

    Returns:
        str: source code, or None on failure.
    """
    headers = base_header()
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    headers['Accept-Encoding'] = 'identity'
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except Exception as e:
        self.error("Failed to request HTML source from {}: \n{}".format(url, repr(e)))
        return None
    # Fall back to the detected encoding when the response header omits a charset.
    return response.content.decode(encoding=response.encoding or response.apparent_encoding)
def update(self) -> None:
    super().update()
    headers = base_header()
    headers['Accept'] = '*/*'
    headers['X-Requested-With'] = "XMLHttpRequest"
    for section in self.sections:
        # Only the newest batch (offset 0) is needed for an incremental update.
        requestURL = self.API_ENDPOINT.format(section=section, offset=0, limit=self.batchSize)
        try:
            response = requests.get(requestURL, headers=headers)
            response.raise_for_status()
            response = response.json()
        except Exception as e:
            self.error("Failed to retrieve part of urls from the Washington Post: {}\n{}".format(
                requestURL, repr(e)))
            continue
        soup = BeautifulSoup('<html><body>{}</body></html>'.format(response['rendering']), "lxml")
        for a in soup.select('div[class="story-headline"] > h2 > a'):
            url = a['href']
            self.__process_url__(url, category=section)
        self.info('Update finished for section {} in Washington Post.'.format(section))
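# A self-contained sketch of how the selector above pulls headline links out
# of the 'rendering' fragment. The markup is a made-up stand-in for the API
# response, not actual Washington Post output.
from bs4 import BeautifulSoup

rendering = (
    '<div class="story-headline"><h2>'
    '<a href="https://www.washingtonpost.com/example-story/">Example</a>'
    '</h2></div>'
)
soup = BeautifulSoup('<html><body>{}</body></html>'.format(rendering), "lxml")
for a in soup.select('div[class="story-headline"] > h2 > a'):
    print(a['href'])  # https://www.washingtonpost.com/example-story/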
def request_routine(self, articleURL):
    headers = base_header()
    headers["Host"] = "theintercept.com"
    headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    headers['Accept-Encoding'] = 'identity'
    response = requests.get(articleURL, headers=headers)
    s = response.content.decode(encoding=response.encoding or response.apparent_encoding)
    idx = s.find('post_id')
    if idx == -1:
        self.error('Cannot find post id in {}.'.format(articleURL))
        return
    # Scan forward from the 'post_id' marker to the first run of digits,
    # guarding against running past the end of the source string.
    postID = []
    while idx < len(s) and not s[idx].isdigit():
        idx += 1
    while idx < len(s) and s[idx].isdigit():
        postID.append(s[idx])
        idx += 1
    postID = ''.join(postID)
    return super().request_routine("https://theintercept.com/?p={}".format(postID))
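# The digit scan above can be expressed as a single regular expression; an
# equivalent sketch, assuming the numeric id follows the 'post_id' marker
# somewhere in the page source (the fragment below is hypothetical).
import re

s = 'var settings = {"post_id": 48215};'
match = re.search(r'post_id\D*(\d+)', s)
if match:
    print(match.group(1))  # 48215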
def __init__(self) -> None:
    self.headers = base_header()
    self.headers["Content-Type"] = "application/json"
    self.headers["Accept"] = "*/*"
    self.targetUrl = None
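# base_header() is defined elsewhere in the repo; a plausible minimal sketch,
# assuming it only seeds a browser-like User-Agent that each caller then
# extends. The exact fields and UA string here are assumptions.
def base_header() -> dict:
    return {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/96.0.4664.110 Safari/537.36'),
    }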