def test_connection_error(bot_helper, responses):
    """linkinfo should turn a connection failure into an error result."""
    # Sanity check our assumptions first: the "responses" fixture mocks out
    # the network, so a direct fetch must raise a connection error.
    with pytest.raises(requests.ConnectionError):
        simple_http_get('http://example.com/foo/bar')
    # linkinfo must swallow that exception (no exception propagates) and
    # report an error result instead.
    outcome = bot_helper['linkinfo'].get_link_info('http://example.com/foo/bar')
    assert outcome.is_error
def get_yt_json(vid_id):
    """Fetch the (vaguely) relevant parts of the raw JSON for a YouTube video.

    Returns the ``"entry"`` object from the API response, or ``None`` when
    the HTTP request does not succeed.
    """
    # v=2 needed for like count
    url = "https://gdata.youtube.com/feeds/api/videos/{}?alt=json&v=2".format(vid_id)
    response = simple_http_get(url)
    if response.status_code == requests.codes.ok:
        return response.json()["entry"]
    return None
def search_hoogle(self, e):
    """Search Hoogle with a given string and return the first few
    (exact number configurable) results.
    """
    query = e['data']
    hurl = 'http://www.haskell.org/hoogle/?mode=json&hoogle=' + query
    hresp = simple_http_get(hurl)
    if hresp.status_code != requests.codes.ok:
        self.log.warn('request failed for ' + hurl)
        return
    # The Hoogle response JSON is of the following format:
    # {
    #   "version": "<hoogle version>"
    #   "results": [
    #     {
    #       "location": "<link to docs>"
    #       "self": "<name> :: <type>"
    #       "docs": "<short description>"
    #     },
    #     ...
    #   ]
    # }
    maxresults = int(self.config_get('results'))
    # BUG FIX: the original tested ``hresp.json is None`` — that checks the
    # bound *method*, which is never None, so malformed JSON slipped past the
    # guard and crashed on the ``hresp.json()`` call below.  Parse the body
    # once and catch the decode error instead (requests raises ValueError /
    # json.JSONDecodeError on invalid JSON).
    try:
        data = hresp.json()
    except ValueError:
        self.log.warn('invalid JSON received from Hoogle')
        return
    if 'parseError' in data:
        e.reply(data['parseError'].replace('\n', ' '))
        return
    allresults = data['results']
    totalresults = len(allresults)
    results = allresults[0:maxresults]
    niceresults = []
    for result in results:
        niceresults.append(result['self'])
    encqry = urllib.parse.quote(query.encode('utf-8'))
    fullurl = 'http://www.haskell.org/hoogle/?hoogle=' + encqry
    e.reply('Showing {} of {} results: {} ({})'.format(
        maxresults if maxresults < totalresults else totalresults,
        totalresults, '; '.join(niceresults), fullurl))
def get_info(number=None):
    """Return the JSON metadata for an xkcd comic.

    With no ``number`` the latest comic is fetched.  Returns ``None`` when
    the HTTP request fails.
    """
    if number:
        url = "http://xkcd.com/{}/info.0.json".format(number)
    else:
        url = "http://xkcd.com/info.0.json"
    response = simple_http_get(url)
    if response.status_code != requests.codes.ok:
        return None
    # Keep only the fields we actually use.
    raw = response.json()
    info = {field: raw[field] for field in ["title", "alt", "num"]}
    # Unfuck up unicode strings
    info = fix_json_unicode(info)
    info["url"] = "http://xkcd.com/" + str(info["num"])
    return info
def scrape_html_title(self, url):
    """Scrape the ``<title>`` tag contents from an HTML page.

    *url* is a parsed URL (urlsplit/urlparse result).  Returns a
    ``(prefix, nsfw, title)`` tuple on success, or ``None`` when the page
    cannot be fetched, is not HTML, has no usable title, or the title is
    already present in the URL.
    """
    # Let's see what's on the other end...
    r = simple_http_get(url.geturl())
    # Only bother with 200 OK
    if r.status_code != requests.codes.ok:
        self.log.debug('request failed for ' + url.geturl())
        return None
    # requests header lookup is case-insensitive, so 'Content-Type' and
    # 'content-type' below refer to the same header.
    if 'html' not in r.headers['Content-Type']:
        self.log.debug('Content-Type not HTML-ish ({}): {}'
                       .format(r.headers['Content-Type'], url.geturl()))
        return None
    # Attempt to scrape the HTML for a <title>
    if 'charset=' in r.headers['content-type']:
        # If present, HTTP Content-Type header charset takes precedence
        parser = lxml.html.HTMLParser(
            encoding=r.headers['content-type'].rsplit('=', 1)[1])
    else:
        parser = lxml.html.html_parser
    html = lxml.etree.fromstring(r.content, parser)
    title = html.find('.//title')
    # BUG FIX: an empty <title/> element has ``.text is None``, which made
    # ``title.text.strip()`` below raise AttributeError.  Treat it the same
    # as a missing <title>.
    if title is None or title.text is None:
        self.log.debug('failed to find <title>: ' + url.geturl())
        return None
    # Normalise title whitespace
    title = ' '.join(title.text.strip().split())
    nsfw = url.netloc.endswith('.xxx')
    # See if the title is in the URL
    if self._filter_title_in_url(url, title):
        return None
    # Return the scraped title
    return 'Title', nsfw, '"{}"'.format(title)