def get_basic_info(self):
    """
    Collects and stores basic information about the target

    The result is cached in ``self.basic_info``; later calls return the
    cached value without issuing another request.

    As a side effect, ``self.name`` and ``self.description`` are set when
    the response carries those keys, and ``self.has_v2`` is set to True
    when 'wp/v2' is listed in the response's 'namespaces'.

    Returns:
        The decoded JSON object returned by the API root.

    Raises:
        NoWordpressApi: the request failed, the status code was >= 400,
            or the body was not a JSON object.
    """
    # Check the cache first so cached calls do no URL work at all.
    if self.basic_info is not None:
        return self.basic_info

    rest_url = url_path_join(self.url, self.api_path)
    try:
        req = self.s.get(rest_url)
    except Exception:
        raise NoWordpressApi
    if req.status_code >= 400:
        raise NoWordpressApi

    # A body that is not JSON (e.g. an HTML page served at the API path)
    # means there is no usable API here; report it the same way as a
    # failed request instead of leaking a decoding error.
    try:
        info = req.json()
    except ValueError:
        raise NoWordpressApi
    # Validate before caching so a bogus payload is never stored.
    if not isinstance(info, dict):
        raise NoWordpressApi

    self.basic_info = info
    if 'name' in info:
        self.name = info['name']
    if 'description' in info:
        self.description = info['description']
    if 'namespaces' in info and 'wp/v2' in info['namespaces']:
        self.has_v2 = True
    return self.basic_info
def crawl_pages(self, url):
    """
    Crawls all pages while there is at least one result for the given
    endpoint

    Args:
        url: endpoint path relative to the API root. It is formatted with
            the page number (``url % page``), so it is expected to hold a
            single integer placeholder such as ``%d``.

    Returns:
        A list with the entries of every page, in page order.

    Raises:
        WordPressApiNotV2: the request failed with anything other than
            HTTPError400.
    """
    page = 1
    total_entries = 0
    total_pages = 0
    more_entries = True
    entries = []

    # Build the search query separately from the page template: the
    # percent-escapes produced by urlencode (e.g. '%2F') must never go
    # through the '%' formatting operator, where they would be read as
    # format specifiers.
    search_query = ''
    if self.search_terms is not None:
        separator = '&' if '?' in url else '?'
        search_query = separator + urlencode({'search': self.search_terms})

    while more_entries:
        rest_url = url_path_join(self.url, self.api_path,
                                 (url % page) + search_query)
        try:
            req = self.s.get(rest_url)
            # Totals are only read from the first page's headers.
            if page == 1 and 'X-WP-Total' in req.headers:
                total_entries = int(req.headers['X-WP-Total'])
                total_pages = int(req.headers['X-WP-TotalPages'])
                print("Number of entries: %d" % total_entries)
        except HTTPError400:
            # Treated as "no more pages".
            break
        except Exception:
            raise WordPressApiNotV2
        try:
            json_content = get_content_as_json(req)
            if isinstance(json_content, list) and len(json_content) > 0:
                entries += json_content
                if total_entries > 0:
                    print_progress_bar(page, total_pages, length=70)
            else:
                # Empty page or non-list payload: stop crawling.
                more_entries = False
        except JSONDecodeError:
            more_entries = False
        page += 1
    return entries
def crawl_single_page(self, url):
    """
    Crawls a single URL

    The given path is joined to the target URL and API path, then
    requested through ``self.s``.

    Returns:
        Whatever get_content_as_json yields for the response, or None
        when the request raises HTTPError400 / HTTPError404 or the body
        cannot be decoded as JSON.

    Raises:
        WordPressApiNotV2: the request failed with any other exception.
    """
    target = url_path_join(self.url, self.api_path, url)
    try:
        response = self.s.get(target)
    except (HTTPError400, HTTPError404):
        return None
    except Exception:
        raise WordPressApiNotV2

    try:
        return get_content_as_json(response)
    except JSONDecodeError:
        # Undecodable body: report "no content" rather than failing.
        return None
def crawl_namespaces(self, ns):
    """
    Crawls all accessible get routes defined for the specified namespace.

    A route is requested only when it:
      * declares both 'namespace' and 'endpoints',
      * has no regex parameter ('(?P<') in its path and is not itself a
        namespace index,
      * belongs to the requested namespace (or ns is "all") and is not in
        the 'wp/v2' or empty namespace,
      * exposes a GET endpoint with no required argument.

    Args:
        ns: namespace to crawl, or "all" for every namespace.

    Returns:
        A dict mapping each crawled route path to its decoded JSON body.
        Routes whose request or JSON decoding fails are left out.

    Raises:
        NSNotFoundException: ns is not "all" and is not a known namespace.
    """
    namespaces = self.get_namespaces()
    routes = self.get_routes()
    if ns != "all" and ns not in namespaces:
        raise NSNotFoundException

    ns_data = {}
    for url, route in routes.items():
        if 'namespace' not in route or 'endpoints' not in route:
            continue
        # Parameterized routes cannot be called blindly, and namespace
        # index routes only describe the namespace itself.
        if '(?P<' in url or url.lstrip('/') in namespaces:
            continue
        route_ns = route['namespace']
        # Explicit grouping: skip routes outside the requested namespace,
        # and always skip the 'wp/v2' and empty namespaces.
        if (ns != 'all' and route_ns != ns) or route_ns in ('wp/v2', ''):
            continue
        for endpoint in route['endpoints']:
            # Use .get() so an incomplete endpoint description is skipped
            # instead of aborting the whole crawl with a KeyError.
            if 'GET' not in endpoint.get('methods', ()):
                continue
            args = endpoint.get('args')
            needs_argument = isinstance(args, dict) and any(
                isinstance(arg, dict) and arg.get('required')
                for arg in args.values()
            )
            if needs_argument:
                continue
            rest_url = url_path_join(self.url, self.api_path, url)
            try:
                ns_data[url] = self.s.get(rest_url).json()
            except Exception:
                # Best effort: an unreachable or non-JSON route is simply
                # left out of the result.
                continue
    return ns_data
def crawl_pages(self, url, start=None, num=None, search_terms=None,
                display_progress=True):
    """
    Crawls all pages while there is at least one result for the given
    endpoint or tries to get pages from start to end

    Args:
        url: endpoint path relative to the API root. It is formatted with
            the page number (``url % page``), so it is expected to hold a
            single integer placeholder such as ``%d``.
        start: index of the first entry to return, or None to begin at
            the first entry. Converted to a page number using a page size
            of 10.
        num: maximum number of entries to return, or None for no limit.
        search_terms: search string to add to the query; falls back to
            ``self.search_terms`` when None.
        display_progress: whether to draw the progress bar.

    Returns:
        A tuple ``(entries, total_entries)``: the collected entries and
        the total read from the 'X-WP-Total' header (0 if never seen).

    Raises:
        WordPressApiNotV2: the request failed with anything other than
            HTTPError400.
    """
    if search_terms is None:
        search_terms = self.search_terms
    page = 1
    total_entries = 0
    total_pages = 0
    more_entries = True
    entries = []
    entries_left = 1
    per_page = 10

    # Build the search query separately from the page template: the
    # percent-escapes produced by urlencode (e.g. '%2F') must never go
    # through the '%' formatting operator, where they would be read as
    # format specifiers.
    search_query = ''
    if search_terms is not None:
        separator = '&' if '?' in url else '?'
        search_query = separator + urlencode({'search': search_terms})

    if start is not None:
        page = math.floor(start / per_page) + 1
    if num is not None:
        entries_left = num

    while more_entries and entries_left > 0:
        rest_url = url_path_join(self.url, self.api_path,
                                 (url % page) + search_query)
        if start is not None:
            # Pin the page size so the start offset maps to a known page;
            # open the query string if the URL does not have one yet.
            rest_url += ('&' if '?' in rest_url else '?') \
                + "per_page=%d" % per_page
        try:
            req = self.s.get(rest_url)
            # Totals are read from the first requested page only.
            if (page == 1 or start is not None and page == math.floor(
                    start / per_page) + 1) and 'X-WP-Total' in req.headers:
                total_entries = int(req.headers['X-WP-Total'])
                total_pages = int(req.headers['X-WP-TotalPages'])
                print("Total number of entries: %d" % total_entries)
                if start is not None and total_entries < start:
                    start = total_entries - 1
        except HTTPError400:
            # Treated as "no more pages".
            break
        except Exception:
            raise WordPressApiNotV2
        try:
            json_content = get_content_as_json(req)
            if type(json_content) is list and len(json_content) > 0:
                if (start is None or page > math.floor(
                        start / per_page) + 1) and num is None:
                    # Unlimited mode, past the start page: take it all.
                    entries += json_content
                    # NOTE(review): with start set and num None,
                    # entries_left begins at 1 and is decremented here, so
                    # the loop appears to stop right after the page that
                    # follows the start page -- confirm this is intended.
                    if start is not None:
                        entries_left -= len(json_content)
                elif start is not None and page == math.floor(
                        start / per_page) + 1:
                    # Start page: drop the entries located before `start`.
                    offset = start % per_page
                    tail = json_content[offset:]
                    if num is None or len(tail) < num:
                        entries += tail
                        if num is not None:
                            entries_left -= len(tail)
                    else:
                        entries += json_content[offset:offset + num]
                        entries_left = 0
                else:
                    # Limited mode on a later page: spend the remaining
                    # budget.
                    if num is not None and entries_left > len(
                            json_content):
                        entries += json_content
                        entries_left -= len(json_content)
                    else:
                        entries += json_content[:entries_left]
                        entries_left = 0

                if display_progress:
                    if num is None and start is None \
                            and total_entries >= 0:
                        print_progress_bar(page, total_pages, length=70)
                    elif num is None and start is not None \
                            and total_entries >= 0:
                        print_progress_bar(
                            total_entries - start - entries_left,
                            total_entries - start, length=70)
                    elif num is not None and total_entries > 0:
                        print_progress_bar(num - entries_left, num,
                                           length=70)
            else:
                # Empty page or non-list payload: stop crawling.
                more_entries = False
        except JSONDecodeError:
            more_entries = False
        page += 1
    return (entries, total_entries)