def _search_get_page(self, point, start_index):
    """Fetch one chunk of search-result HTML.

    The first chunk (start_index == 0) has to come from the regular
    search page; subsequent chunks come from the AJAX endpoint.

    :raise Error: if the results cannot be loaded.
    """
    logging.debug("Loading page from start_index: %d", start_index)

    origin = point.format(None, "", "", "")

    if start_index == 0:
        # first request has to load normal search page
        logging.debug("Using normal search endpoint")
        full_url = self._urls["search"] + "?" + urlencode({"origin": origin})
        try:
            page = self._browser.get(full_url)
        except requests.exceptions.ConnectionError as e:
            raise Error("Cannot load search results.") from e
        return str(page.soup.find(id="geocaches"))

    # other requests can use AJAX endpoint
    logging.debug("Using AJAX search endpoint")
    query = urlencode({
        "inputOrigin": origin,
        "startIndex": start_index,
        "originTreatment": 0,
    })
    full_url = self._urls["search_more"] + "?" + query
    try:
        response = self._browser.get(full_url)
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load search results.") from e
    return response.json()["HtmlString"].strip()
def download(self, get_png_first=False):
    """Download UTFGrid from geocaching.com and yield Cache instances.

    Yielded geocaches are also stored in self.geocaches.

    It appears to be mandatory to first download the map tile (.png
    file) and only then the UTFGrid.  However, this is not enforced all
    the time.  There is probably some time limit from previous loading
    of the same tile and also a general traffic regulator involved.
    Try first to download the grid and, if it does not work, get the
    .png first and then try again.

    :param bool get_png_first: whether to fetch the .png tile before
        requesting the UTFGrid.
    :raise Error: if the UTFGrid cannot be loaded.

    TODO It might be useful to store the time when a tile was last
    downloaded and act based on that.  Logging some statistics (time
    when tile is loaded + received status code + content length + time
    spent on request) might help in algorithm design and in evaluating
    if additional traffic from .png loading is tolerable and if this
    should be done all the time.  Requesting the UTFGrid and waiting
    for a 204 response also takes its time.
    """
    logging.info("Downloading UTFGrid for tile ({}, {}, {})".format(
        self.x, self.y, self.z))
    try:
        if get_png_first:
            logging.debug(".. getting .png file first")
            self._gc._browser.get(self._urls["tile"])
        logging.debug(".. getting UTFGrid")
        res = self._gc._browser.get(self._urls["grid"])
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load UTFgrid.") from e

    if res.status_code == 204:
        if get_png_first:
            logging.debug("There is really no content! Returning 0 caches.")
            return
        logging.debug("Cannot load UTFgrid: no content. "
                      "Trying to load .png tile first")
        # BUG FIX: the recursive generator used to be assigned to a local
        # and then either discarded or re-appended to self.geocaches,
        # duplicating entries; delegate to it so each cache is yielded
        # (and stored) exactly once by the recursive call.
        yield from self.download(get_png_first=True)
        return

    if res.status_code == 200:
        try:
            json_grid = res.json()
        except ValueError as e:
            # This happened during testing, don't know why.
            if get_png_first:
                raise Error("Cannot load UTFgrid.") from e
            logging.debug("JSON parsing failed, trying .png first")
            # BUG FIX: "return self.download(...)" inside a generator
            # silently discards the recursive generator (its value just
            # rides on StopIteration); delegate with "yield from" instead.
            yield from self.download(get_png_first=True)
            return

        for c in self._parse_utfgrid(json_grid):
            self.geocaches.append(c)
            yield c
    # any other status code: yield nothing (previously this path crashed
    # with a NameError on the undefined new_caches local)
def _request(self, url, *, expect="soup", method="GET", login_check=True, **kwargs):
    """
    Do a HTTP request and return a response based on expect param.

    :param str url: Request target.
    :param str method: HTTP method to use.
    :param str expect: Expected type of data (either :code:`soup`,
        :code:`json` or :code:`raw`).
    :param bool login_check: Whether to check if user is logged in or not.
    :param kwargs: Passed to `requests.request
        <http://docs.python-requests.org/en/latest/api/#requests.request>`_ as is.
    :raise NotLoggedInException: if login_check is on and no user is logged in.
    :raise ValueError: if expect is not one of the supported values.
    :raise Error: if the page cannot be loaded.
    """
    # check login unless explicitly turned off
    if login_check and not self._logged_in:
        raise NotLoggedInException("Login is needed.")

    # BUG FIX: an unknown expect value used to fall off the end of the
    # if/elif chain and silently return None; fail fast instead, before
    # wasting a network round trip.
    if expect not in ("soup", "json", "raw"):
        raise ValueError("Unknown expect value: {}".format(expect))

    # absolute URLs pass through; relative ones are joined to the base
    url = url if "//" in url else urljoin(self._baseurl, url)

    try:
        res = self._session.request(method, url, **kwargs)
        res.raise_for_status()

        # return bs4.BeautifulSoup, JSON dict or raw requests.Response
        if expect == "soup":
            return bs4.BeautifulSoup(res.text, "html.parser")
        elif expect == "json":
            return res.json()
        elif expect == "raw":
            return res
    except requests.exceptions.RequestException as e:
        raise Error("Cannot load page: {}".format(url)) from e
def _search_get_page(self, point, page_num):
    """Return one page of caches as a list.

    Searches for caches around a point and returns the N-th page
    (specified by the page_num argument).

    :param point: center of the search (a Point instance).
    :param int page_num: 1-based number of the result page to fetch.
    :raise Error: if the search page cannot be loaded.
    """
    assert isinstance(point, Point)
    assert type(page_num) is int

    logging.info("Fetching page %d.", page_num)

    # assemble request
    params = urlencode({"lat": point.latitude, "lng": point.longitude})
    url = self._urls["caches_nearest"] + "?" + params

    # we have to add POST for other pages than 1st
    if page_num == 1:
        post = None
    else:
        # TODO handle searching on second page without first
        # NOTE(review): relies on self._pagging_helpers and
        # self._pagging_postbacks saved by a previous page-1 call of
        # this method — calling with page_num > 1 first will fail.
        post = self._pagging_helpers
        post["__EVENTTARGET"] = self._pagging_postbacks[page_num]
        post["__EVENTARGUMENT"] = ""

    # make request
    try:
        root = self._browser.post(url, post).soup
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load search page #{}.".format(page_num)) from e

    # root of a few following elements
    widget_general = root.find_all("td", "PageBuilderWidget")

    # parse pagging widget ("Total Records: <b>X</b> ... <b>P</b> of <b>N</b>")
    caches_total, page_num, page_count = [
        int(elm.text) for elm in widget_general[0].find_all("b")
    ]
    logging.debug("Found %d results. Showing page %d of %d.",
                  caches_total, page_num, page_count)

    # save search postbacks for future usage
    if page_num == 1:
        # only anchors with an id are real page links
        pagging_links = [
            _ for _ in widget_general[1].find_all("a") if _.get("id")
        ]
        # map page number -> ASP.NET postback target parsed from the
        # javascript href (text between the first pair of quotes)
        self._pagging_postbacks = {
            int(link.text): link.get("href").split("'")[1]
            for link in pagging_links
        }
        # other necessary fields (hidden ASP.NET form state)
        self._pagging_helpers = {
            field["name"]: field["value"]
            for field in root.find_all("input", type="hidden")
        }

    # parse results table
    data = root.find("table", "SearchResultsTable").find_all("tr", "Data")
    return [self._search_parse_cache(c) for c in data]
def _download_utfgrid(self, *, get_png=False):
    """Load UTFGrid tile from geocaching.com.

    It appears to be mandatory to first download map tile (.png
    file) and only then UTFGrid. However, this is not enforced all
    the time. There is probably some time limit from previous loading
    of the same tile and also a general traffic regulator involved.
    Try first to download grid and if it does not work, get .png and
    then try it again.

    :param bool get_png: Whether to download .png first.
    :return: JSON with raw tile data.
    :rtype: :class:`dict`
    """
    # TODO: It might be useful to store time when tile is last downloaded and act based on that.
    # Logging some statistics (time when tile is loaded + received status code + content length
    # + time spent on request) might help in algorithm design and evaluating if additional
    # traffic from .png loading is tolerable and if this should be done all the time.
    # Requesting for UTFgrid and waiting for 204 response takes also its time.
    logging.debug("Downloading UTFGrid for {}".format(self))

    tile_params = {"x": self.x, "y": self.y, "z": self.z}

    if get_png:
        logging.debug("Getting .png file")
        self.geocaching._request(self._urls["tile"], params=tile_params, expect="raw")

    logging.debug("Getting UTFGrid")
    response = self.geocaching._request(self._urls["grid"], params=tile_params, expect="raw")
    status = response.status_code

    if status == 204:
        if get_png:
            logging.debug(
                "There is really no content! Returning 0 caches.")
            return None
        logging.debug(
            "Cannot load UTFgrid: no content. Trying to load .png tile first"
        )
        return self._download_utfgrid(get_png=True)

    if status != 200:
        # any other status code yields no data (same as the implicit
        # fall-through before)
        return None

    try:
        return response.json()
    except ValueError as e:
        # this happened during testing, don't know why
        if get_png:
            raise Error("Cannot load UTFgrid.") from e
        logging.debug("JSON parsing failed, trying .png first")
        return self._download_utfgrid(get_png=True)
def geocode(self, query):
    """Try to fetch coordinates for the given query.

    :param str query: free-form place name or address.
    :return: location of the first result.
    :rtype: Point
    :raise Error: if the geocode endpoint cannot be loaded.
    :raise GeocodeError: if the service reports a failure.
    """
    assert type(query) is str

    # BUG FIX: the query used to be concatenated raw into the URL, so
    # spaces and special characters produced an invalid request;
    # URL-encode it like every other request in this class does.
    url = self._urls["geocode"] + "?" + urlencode({"q": query})

    try:
        res = self._browser.get(url).json()
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load geocode page.") from e

    if res["status"] != "success":
        raise GeocodeError(res["msg"])

    return Point(float(res["data"]["lat"]), float(res["data"]["lng"]))
def _get_middle_point(self):
    """Get middle point from list of x, y coordinates.

    The points form a rectangular matrix, whose maximum size is
    self.size ** 2, but it can be smaller if the matrix is at the edge
    of UTFGrid.  Investigate the block and return x, y coordinates of
    the uncut square block's middle point.
    """
    status = self._check_block()

    if status == 0:
        # parsing failed entirely
        raise Error("Something went wrong with geocache coordinate "
                    "parsing from UTFGrid. Either the JSON parsing "
                    "failed or Groundspeak has changed something.")

    if status == 1:
        # block is a complete square: average the stored limits directly
        return [sum(limits) / 2 for limits in (self._xlim, self._ylim)]

    # block is cut by the tile edge: determine the limits per axis
    return [sum(self._find_limits(axis)) / 2 for axis in "xy"]
def load_trackable_by_url(self, url, destination=None):
    """Load trackable details from its details page.

    :param str url: URL of the trackable details page.
    :param destination: optional Trackable instance to fill in; a new
        one is created when None.
    :return: populated Trackable.
    :raise Error: if the page cannot be loaded.
    """
    try:
        root = self._browser.get(url).soup
    except requests.exceptions.ConnectionError as e:
        # BUG FIX: message used to wrongly claim a *cache* page failed
        raise Error("Cannot load trackable details page.") from e

    # page title looks like "(TBxxxx) <type> - <name>"; the name itself
    # may contain dashes, so re-join the remaining split parts
    title_tuple = re.split(r"[\(\)-]", root.title.string)
    tid = title_tuple[1]
    trackable_type = title_tuple[2]
    # was a quadratic += loop followed by rstrip("-"); this is equivalent
    name = "-".join(title_tuple[3:]).rstrip("-")

    owner_raw = root.findAll(
        "a", {"id": "ctl00_ContentBody_BugDetails_BugOwner"})
    owner = re.split(r"[\<\>]", str(owner_raw))[2]

    location_raw = root.findAll(
        "a", {"id": "ctl00_ContentBody_BugDetails_BugLocation"})
    location_url = location_raw[0].get('href')
    if 'cache_details' in location_url:
        # trackable currently sits in a cache: load it for coordinates
        location = self.load_cache_by_url(location_url).location
    else:
        location = re.split(r"[\<\>]", str(location_raw))[2]

    description = root.findAll("div", {"id": "TrackableDetails"})[0].text
    goal = root.findAll("div", {"id": "TrackableGoal"})[0].text

    # create trackable object
    t = destination or Trackable(tid, self)
    assert isinstance(t, Trackable)

    t.tid = tid
    t.name = name
    t.owner = owner
    t.location = location
    t.type = trackable_type
    t.description = description
    t.goal = goal
    return t
def load_cache_quick(self, wp, destination=None):
    """Load details from the map server.

    Loads just basic cache details, but very quickly.

    :param str wp: waypoint code ("GC...").
    :param destination: optional Cache instance to fill in; a new one
        is created when None.
    :raise Error: if the map endpoint cannot be loaded.
    :raise LoadError: if the waypoint yields no usable data.
    """
    assert type(wp) is str and wp.startswith("GC")
    logging.info("Loading quick details about %s...", wp)

    # assemble request
    url = "{}?{}".format(self._urls["map"], urlencode({"i": wp}))

    try:
        res = self._browser.get(url).json()
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load quick cache details page.") from e

    if res["status"] == "failed" or len(res["data"]) != 1:
        raise LoadError("Waypoint '{}' cannot be loaded: {}".format(
            wp, res["msg"]))

    data = res["data"][0]

    # create cache object
    c = destination or Cache(wp, self)
    assert isinstance(c, Cache)

    # prettify data
    for attr, value in (
        ("name", data["name"]),
        ("cache_type", data["type"]["text"]),
        ("state", data["available"]),
        ("size", data["container"]["text"]),
        ("difficulty", data["difficulty"]["text"]),
        ("terrain", data["terrain"]["text"]),
        ("hidden", Util.parse_date(data["hidden"])),
        ("author", data["owner"]["text"]),
        ("favorites", int(data["fp"])),
        ("pm_only", data["subrOnly"]),
    ):
        setattr(c, attr, value)

    logging.debug("Cache loaded: %r", c)
    return c
def load_trackable_list(self, url):
    """Load a list of trackables from an inventory page.

    :param str url: URL of the page listing the trackables.
    :return: list of lightweight Trackable objects (name and page URL
        only).
    :raise Error: if the page cannot be loaded.
    """
    try:
        root = self._browser.get(url).soup
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load cache details page.") from e

    trackable_table = root.find_all("table")[1]
    anchors = trackable_table.find_all("a")

    # BUG FIX: the anchor loop variable used to shadow the `url`
    # parameter, and the anchors were filtered twice (once for the
    # URLs, once for the names); do a single pass instead.
    trackables = []
    for anchor in anchors:
        href = anchor.get("href")
        if "track" not in href:
            continue
        name = re.split(r"[\<\>]", str(anchor))[2]
        trackables.append(Trackable(None, self, name=name, trackable_page=href))
    return trackables
def login(self, username, password):
    """Log the user in.

    Downloads the relevant cookies to keep the user logged in.

    :param str username: geocaching.com user name.
    :param str password: matching password.
    :raise Error: if a page cannot be loaded.
    :raise LoginFailedException: if the credentials are rejected.
    """
    logging.info("Logging in...")

    try:
        login_page = self._browser.get(self._urls["login_page"])
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load login page.") from e

    logging.debug("Checking for previous login.")
    logged = self.get_logged_user(login_page)
    if logged:
        if logged == username:
            logging.info("Already logged as %s.", logged)
            self._logged_in = True
            return
        else:
            # a different user is logged in: log them out and continue
            logging.info("Already logged as %s, but want to log in as %s.",
                         logged, username)
            self.logout()

    # continue logging in
    post = {}
    logging.debug("Assembling POST data.")

    # login fields
    # NOTE(review): assumes the form exposes exactly the username,
    # password and "remember me" inputs, in that order, so the
    # positional zip assigns [username, password, 1] correctly —
    # verify against the live form if login breaks.
    login_elements = login_page.soup.find_all(
        "input", type=["text", "password", "checkbox"])
    post.update({
        field["name"]: val
        for field, val in zip(login_elements, [username, password, 1])
    })

    # other necessary fields (hidden ASP.NET state + submit button)
    other_elements = login_page.soup.find_all("input",
                                              type=["hidden", "submit"])
    post.update(
        {field["name"]: field["value"] for field in other_elements})

    # login to the site
    logging.debug("Submiting login form.")
    try:
        after_login_page = self._browser.post(self._urls["login_page"],
                                              post)
    except requests.exceptions.ConnectionError as e:
        raise Error(
            "Cannot load response after submiting login form.") from e

    logging.debug("Checking the result.")
    if self.get_logged_user(after_login_page):
        logging.info("Logged in successfully as %s.", username)
        self._logged_in = True
        return
    else:
        # login failed: drop any partial session state before raising
        self.logout()
        raise LoginFailedException(
            "Cannot login to the site (probably wrong username or password)."
        )
def load_cache(self, wp, destination=None):
    """Load details from the cache page.

    Loads all cache details and returns a fully populated cache object.

    :param str wp: waypoint code ("GC...").
    :param destination: optional Cache instance to fill in; a new one
        is created when None.
    :return: populated Cache.
    :raise Error: if the page cannot be loaded.
    :raise PMOnlyException: for Premium Member only caches.
    :raise LoadError: if the details element is missing for another reason.
    """
    assert type(wp) is str and wp.startswith("GC")
    logging.info("Loading details about %s...", wp)

    # assemble request
    params = urlencode({"wp": wp})
    url = self._urls["cache_details"] + "?" + params

    try:
        root = self._browser.get(url).soup
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load cache details page.") from e

    cache_details = root.find(id="cacheDetails")

    # check for PM only caches if using free account
    if cache_details is None:
        # BUG FIX: select() returns a (possibly empty) list, never None,
        # so the old "is not None" test was always true; test truthiness.
        if root.select(".PMOWarning"):
            raise PMOnlyException("Premium Members only.")
        # BUG FIX: previously fell through to an AttributeError below
        raise LoadError("Waypoint '{}' cannot be loaded.".format(wp))

    # parse raw data
    name = cache_details.find("h2")
    cache_type = cache_details.find("img").get("alt")
    author = cache_details("a")[1]
    hidden = cache_details.find("div", "minorCacheDetails").find_all("div")[1]
    location = root.find(id="uxLatLon")
    state = root.find("ul", "OldWarning")
    found = root.find("div", "FoundStatus")
    D_T = root.find("div", "CacheStarLabels").find_all("img")
    size = root.find("div", "CacheSize").find("img")
    attributes_raw = root.find_all(
        "div", "CacheDetailNavigationWidget")[0].find_all("img")
    user_content = root.find_all("div", "UserSuppliedContent")
    hint = root.find(id="div_hint")
    favorites = root.find("span", "favorite-value")

    # create cache object
    c = destination or Cache(wp, self)
    assert isinstance(c, Cache)

    # prettify data
    c.name = name.text
    c.cache_type = cache_type
    c.author = author.text
    c.hidden = Util.parse_date(hidden.text.split()[2])
    c.location = Point.from_string(location.text)
    c.state = state is None
    # equivalent to the old "found and ... or False" chain, but explicit
    c.found = bool(found and "Found It!" in found.text)
    c.difficulty, c.terrain = [float(_.get("alt").split()[0]) for _ in D_T]
    c.size = " ".join(size.get("alt").split()[1:])
    # each attribute icon filename is "<name>-<yes|no|blank>.<ext>"
    attributes_raw = [
        _.get("src").split('/')[-1].rsplit("-", 1) for _ in attributes_raw
    ]
    c.attributes = {
        attribute_name: appendix.startswith("yes")
        for attribute_name, appendix in attributes_raw
        if not appendix.startswith("blank")
    }
    c.summary = user_content[0].text
    c.description = str(user_content[1])
    c.hint = Util.rot13(hint.text.strip())
    c.favorites = int(favorites.text)
    logging.debug("Cache loaded: %r", c)
    return c
def load_cache_by_url(self, url, destination=None):
    """Load cache details from its details page URL.

    :param str url: URL of the cache details page.
    :param destination: optional Cache instance to fill in; a new one
        is created when None.
    :return: populated Cache (including its trackables).
    :raise Error: if the page cannot be loaded.
    :raise PMOnlyException: for Premium Member only caches.
    :raise LoadError: if the details element is missing for another reason.
    """
    try:
        root = self._browser.get(url).soup
    except requests.exceptions.ConnectionError as e:
        raise Error("Cannot load cache details page.") from e

    cache_details = root.find(id="cacheDetails")

    # check for PM only caches if using free account
    if cache_details is None:
        # BUG FIX: select() returns a (possibly empty) list, never None,
        # so the old "is not None" test was always true; test truthiness.
        if root.select(".PMOWarning"):
            raise PMOnlyException("Premium Members only.")
        # BUG FIX: previously fell through to an AttributeError below
        raise LoadError("Cache at '{}' cannot be loaded.".format(url))

    # parse raw data
    wp = root.title.string.split(' ')[0]
    name = cache_details.find("h2")
    cache_type = cache_details.find("img").get("src")
    author = cache_details("a")[1]
    hidden = cache_details.find("div", "minorCacheDetails").find_all("div")[1]
    location = root.find(id="uxLatLon")
    state = root.find("ul", "OldWarning")
    found = root.find("div", "FoundStatus")
    D_T = root.find("div", "CacheStarLabels").find_all("img")
    size = root.find("div", "CacheSize").find("img")
    attributes_raw = root.find_all(
        "div", "CacheDetailNavigationWidget")[0].find_all("img")
    user_content = root.find_all("div", "UserSuppliedContent")
    hint = root.find(id="div_hint")
    favorites = root.find("span", "favorite-value")

    # check for trackables
    inventory_raw = root.find_all("div", "CacheDetailNavigationWidget")
    inventory_links = inventory_raw[1].find_all("a")
    if len(inventory_links) >= 3:
        trackable_page = self._urls['trackable_base'] + inventory_links[
            -3].get("href")
    else:
        trackable_page = None

    # create cache object
    c = destination or Cache(wp, self)
    assert isinstance(c, Cache)

    # prettify data
    c.name = name.text
    c.cache_type = Cache.get_cache_type_by_img(cache_type)
    c.author = author.text
    c.hidden = Util.parse_date(hidden.text.split(":")[-1])
    c.location = Point.from_string(location.text)
    c.state = state is None
    # equivalent to the old "found and ... or False" chain, but explicit
    c.found = bool(found and "Found It!" in found.text)
    c.difficulty, c.terrain = [float(_.get("alt").split()[0]) for _ in D_T]
    c.size = size.get("src").split("/")[-1].rsplit(
        ".", 1)[0]  # filename of img[src]
    # each attribute icon filename is "<name>-<yes|no|blank>.<ext>"
    attributes_raw = [
        _.get("src").split('/')[-1].rsplit("-", 1) for _ in attributes_raw
    ]
    c.attributes = {
        attribute_name: appendix.startswith("yes")
        for attribute_name, appendix in attributes_raw
        if not appendix.startswith("blank")
    }
    c.summary = user_content[0].text
    c.description = str(user_content[1])
    c.hint = Util.rot13(hint.text.strip())
    c.favorites = 0 if favorites is None else int(favorites.text)
    c.trackables = (self.load_trackable_list(trackable_page)
                    if trackable_page is not None else [])
    logging.debug("Cache loaded: %r", c)
    return c