def parse_content(content, encoding=None):
    html = parse_soup(content, from_encoding=encoding)
    old_encoding = encoding
    encoding = get_encoding(html)
    # re-parse only if the document declares a different encoding
    if encoding is not None and encoding != old_encoding:
        html = parse_soup(content, from_encoding=encoding)

    return html

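# A minimal usage sketch for parse_content, assuming parse_soup and
# get_encoding behave as above; the byte string is a made-up example of a
# page whose real encoding is only discoverable from its own <meta> tag.
page = parse_content(b'<meta charset="iso-8859-1"><p>caf\xe9</p>')
print(page.p.text)  # parsed with the encoding declared in the document
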
def isup(text):
    """<url> - uses isup.me to check if <url> is online or offline

    :type text: str
    """
    url = text.strip()

    # slightly overcomplicated, esoteric URL parsing
    _, auth, path, _, _ = urllib.parse.urlsplit(url)

    domain = auth or path

    try:
        response = requests.get('http://isup.me/' + domain)
        response.raise_for_status()
    except requests.exceptions.ConnectionError:
        return "Failed to get status."

    if response.status_code != requests.codes.ok:
        return "Failed to get status."

    soup = parse_soup(response.text)

    content = soup.find('div', id="domain-main-content").text.strip()

    if "not just you" in content:
        return "It's not just you. {} looks \x02\x034down\x02\x0f from here!".format(url)

    if "is up" in content:
        return "It's just you. {} is \x02\x033up\x02\x0f.".format(url)

    return "Huh? That doesn't look like a site on the interweb."

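# Hypothetical invocation; the actual reply depends on what isup.me reports
# for the example domain at the time of the call.
print(isup("http://example.com"))
# e.g. "It's just you. http://example.com is \x02\x033up\x02\x0f."
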
def parse_page(content):
    """Parse the horoscope page

    >>> parse_page('')
    Traceback (most recent call last):
    [...]
    plugins.horoscope.HoroscopeParseError: Unable to parse horoscope
    >>> parse_page('<div class="main-horoscope"><div>hello world</div></div>')
    Traceback (most recent call last):
    [...]
    plugins.horoscope.HoroscopeParseError: Unable to parse horoscope
    >>> parse_page('<div class="main-horoscope"><p>hello world</p></div>')
    'hello world'
    """
    soup = parse_soup(content)
    container = soup.find("div", class_="main-horoscope")
    if not container:
        raise HoroscopeParseError("Unable to parse horoscope", content)

    para = container.p
    if not para:
        raise HoroscopeParseError("Unable to parse horoscope", content)

    return para.text

def query(endpoint, text):
    params = {'q': " ".join(text.split())}
    with requests.get(
            search_url + "/" + endpoint,
            params=params,
            headers=HEADERS,
            verify=session.verify) as r:
        r.raise_for_status()
        return parse_soup(r.content)

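# Sketch of a call, assuming search_url, HEADERS and session are configured
# at module level as above; "search" is a hypothetical endpoint name. The
# text is whitespace-normalized into a single 'q' parameter.
soup = query("search", "  foo   bar  ")  # GET <search_url>/search?q=foo+bar
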
def mfp(text, reply):
    """<user> - returns macros from the MyFitnessPal food diary of <user>"""
    request = requests.get(scrape_url.format(text))

    try:
        request.raise_for_status()
    except HTTPError as e:
        reply("Failed to fetch info ({})".format(e.response.status_code))
        raise

    if request.status_code != requests.codes.ok:
        return "Failed to fetch info ({})".format(request.status_code)

    output = "Diary for {}: ".format(text)

    try:
        soup = parse_soup(request.text)

        title = soup.find('h1', {'class': 'main-title'})
        if title:
            if title.text == 'This Food Diary is Private':
                return "{}'s food diary is private.".format(text)

            if title.text == 'This Username is Invalid':
                return "User {} does not exist.".format(text)

        # the layout of the table depends on the user's MFP profile configuration
        headers = get_headers(soup)
        totals = get_values(soup, 'total')
        remaining = get_values(soup, 'alt')

        for idx, val in enumerate(headers['captions']):
            kwargs = {
                'caption': val,
                'total': totals[idx],
                'remain': remaining[idx],
                'units': headers['units'][idx],
                'pct': math.floor((totals[idx] / remaining[idx]) * 100)
            }

            output += "{caption}: {total}/{remain}{units} ({pct}%) ".format(**kwargs)

        output += " ({})".format(scrape_url.format(text))
    except Exception:
        reply("Error parsing results.")
        raise

    return output

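# Hypothetical call with print standing in for the bot's reply callback;
# the username is made up, and the macros shown depend on that user's MFP
# diary settings.
print(mfp("example_user", print))
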
def RUADICK(text, message):
    """<username> - checks ruadick.com to see if you're a dick on reddit"""
    user = text.strip()
    response = requests.get('http://www.ruadick.com/user/{}'.format(user))
    response.raise_for_status()
    soup = parse_soup(response.content)
    heading = soup.h2

    # soup.h2 is None when the site has no verdict for the user; reading
    # .text directly replaces the old lstrip('<h2>')/rstrip('</h2>') trick,
    # which stripped characters rather than the literal tags
    if heading is None:
        message("I can't find that user")
    else:
        message('{} {}'.format(heading.text.strip(), response.url))

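# Hypothetical call, again with print as the message callback; the
# username is an example, not a real account.
RUADICK("example_user", print)
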
def steam(text, reply):
    """<query> - Search for specified game/trailer/DLC"""
    params = {"term": text.strip().lower()}

    try:
        request = requests.get("http://store.steampowered.com/search/", params=params)
        request.raise_for_status()
    except requests.RequestException as e:
        reply("Could not get game info: {}".format(e))
        raise

    soup = parse_soup(request.text, from_encoding="utf-8")
    result = soup.find("a", {"class": "search_result_row"})
    if not result:
        return "No game found."

    app_id = result["data-ds-appid"]
    return format_game(app_id)

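# Hypothetical call; format_game() (defined elsewhere in the plugin) turns
# the scraped appid of the first search_result_row into the reply text.
print(steam("portal 2", print))
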
async def refresh_fml_cache(loop):
    """ gets a page of random FMLs and puts them into the cache """
    url = 'http://www.fmylife.com/random/'
    _func = functools.partial(requests.get, url, timeout=6)
    request = await loop.run_in_executor(None, _func)

    soup = parse_soup(request.text)

    for e in soup.find_all('p', {'class': 'block'}):
        # the /today bit is there to exclude fml news etc.
        a = e.find('a', {'href': re.compile('/article/today')})
        if not a:
            continue

        # the .html in the url must be removed before extracting the id
        fml_id = int(a['href'][:-5].split('_')[-1])
        text = a.text.strip()

        # exclude lengthy submissions and FML photos
        if len(text) > 375 or text[-3:].lower() != "fml":
            continue

        fml_cache.append((fml_id, text))

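# A sketch of driving the refresh from a bare asyncio loop; inside the bot
# this coroutine would be awaited on the bot's own event loop instead.
import asyncio

loop = asyncio.new_event_loop()
loop.run_until_complete(refresh_fml_cache(loop))
print(len(fml_cache), "FMLs cached")
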
async def refresh_fml_cache(loop):
    """ gets a page of random FMLs and puts them into the cache """
    url = "http://www.fmylife.com/random"
    _func = functools.partial(requests.get, url, timeout=6)
    request = await loop.run_in_executor(None, _func)

    soup = parse_soup(request.text)

    # the /today bit is there to exclude fml news etc.
    articles = soup.find_all("a", {
        "class": "article-link",
        "href": re.compile("/article/today")
    })
    for a in articles:
        # the .html in the url must be removed before extracting the id
        fml_id = int(a["href"][:-5].split("_")[-1])
        text = a.text.strip()

        # exclude lengthy submissions and FML photos
        if len(text) > 375 or text[-3:].lower() != "fml":
            continue

        fml_cache.append((fml_id, text))

def xkcd_search(term):
    params = {
        's': term,
        'Search': 'Search',
        'comic': 56,
        'e': 0,
        'n': 0,
        'b': 0,
        'm': 0,
        'd': 0,
        't': 0,
    }
    request = requests.get(str(ONR_URL), params=params)
    request.raise_for_status()
    soup = parse_soup(request.text)
    result = soup.find('li')
    if result:
        url = result.find('div', {'class': 'tinylink'}).text
        xkcd_id = url[:-1].split("/")[-1]
        return xkcd_info(xkcd_id, url=True)

    return "No results found!"

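# Hypothetical search; the term is an example and the reply depends on what
# the transcript index behind ONR_URL returns for it.
print(xkcd_search("sudo make me a sandwich"))
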
def etymology(text, reply):
    """<word> - retrieves the etymology of <word>

    :type text: str
    """
    url = 'http://www.etymonline.com/index.php'

    response = requests.get(url, params={"term": text})

    try:
        response.raise_for_status()
    except HTTPError as e:
        if e.response.status_code == 404:
            return "No etymology found for {} :(".format(text)

        reply("Error reaching etymonline.com: {}".format(e.response.status_code))
        raise

    if response.status_code != requests.codes.ok:
        return "Error reaching etymonline.com: {}".format(response.status_code)

    soup = parse_soup(response.text)

    block = soup.find('div', class_=re.compile("word--.+"))

    etym = ' '.join(e.text for e in block.div)
    etym = ' '.join(etym.splitlines())
    etym = ' '.join(etym.split())

    etym = formatting.truncate(etym, 200)
    etym += " Source: " + web.try_shorten(response.url)

    return etym

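# Hypothetical call with print as the reply callback; the result is the
# truncated etymology plus a shortened source link.
print(etymology("etymology", print))
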
def egg_calculator(text):
    """<time> - Parses dragonvalebreedingguide.com for a list of possible
    dragons based on the incubation time. Enter the time as 5 hours, 30
    minutes. For upgraded incubation times, put 'upgrade' at the front of
    the time length.
    """
    time = ""
    time2 = ""
    if text.lower().startswith("upgrade"):
        timer = text.replace("upgrade", "")
        time2 = time_parse(timer.strip())
        if not time2:
            return "invalid time format"
    else:
        timer = text
        time = time_parse(timer.strip())
        if not time:
            return "invalid time format"

    params = {'time': time, 'time2': time2, 'avail': 1}
    r = requests.get(egg_calc_url, params=params, timeout=5)
    soup = parse_soup(r.text)
    dragons = []
    for line in soup.find_all('td', {'class': 'views-field views-field-title'}):
        dragons.append(line.text.replace("\n", "").strip())

    return ", ".join(dragons)

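# Hypothetical invocations showing both time formats the docstring
# describes; the returned dragon list comes from the live site.
print(egg_calculator("5 hours, 30 minutes"))
print(egg_calculator("upgrade 4 hours, 24 minutes"))
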
def amazon(text, reply, _parsed=False):
    """<query> - Searches Amazon for query"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Referer': 'http://www.amazon.com/'
    }
    params = {'url': 'search-alias', 'field-keywords': text.strip()}
    if _parsed:
        # input is from a link parser, we need a specific URL
        request = requests.get(SEARCH_URL.format(_parsed), params=params, headers=headers)
    else:
        request = requests.get(SEARCH_URL.format(REGION), params=params, headers=headers)

    try:
        request.raise_for_status()
    except HTTPError:
        reply("Amazon API error occurred.")
        raise

    soup = parse_soup(request.text)

    # check if there are any results on the amazon page
    results = soup.find('div', {'id': 'atfResults'})
    if not results:
        if not _parsed:
            return "No results found."

        return None

    # get the first item from the results on the amazon page
    results = results.find('ul', {'id': 's-results-list-atf'}).find_all(
        'li', {'class': 's-result-item'})
    item = results[0]
    asin = item['data-asin']

    # here we use dirty html scraping to get everything we need
    title = formatting.truncate(item.find('h2', {'class': 's-access-title'}).text, 60)

    tags = []

    # tags!
    if item.find('i', {'class': 'a-icon-prime'}):
        tags.append("$(b)Prime$(b)")

    if item.find('i', {'class': 'sx-bestseller-badge-primary'}):
        tags.append("$(b)Bestseller$(b)")

    # we use regex because we need to recognise text for this part
    # the other parts detect based on html tags, not text
    if re.search(r"(Kostenlose Lieferung|Livraison gratuite|FREE Shipping|Envío GRATIS"
                 r"|Spedizione gratuita)", item.text, re.I):
        tags.append("$(b)Free Shipping$(b)")

    try:
        price = item.find('span', {'class': ['s-price', 'a-color-price']}).text
    except AttributeError:
        for i in item.find_all('sup', class_='sx-price-fractional'):
            i.string.replace_with('.' + i.string)

        price = item.find('span', {'class': 'sx-price'}).text

    # use a whole lot of BS4 and regex to get the ratings
    try:
        # get the rating
        rating = item.find('i', {'class': 'a-icon-star'}).find(
            'span', {'class': 'a-icon-alt'}).text
        rating = re.search(r"([0-9]+(?:[.,][0-9])?).*5", rating).group(1).replace(",", ".")
        # get the rating count
        pattern = re.compile(r'(product-reviews|#customerReviews)')
        num_ratings = item.find('a', {'href': pattern}).text.replace(".", ",")
        # format the rating and count into a nice string
        rating_str = "{}/5 stars ({} ratings)".format(rating, num_ratings)
    except AttributeError:
        rating_str = "No Ratings"

    # generate a short url
    if AFFILIATE_TAG:
        url = "http://www.amazon.com/dp/" + asin + "/?tag=" + AFFILIATE_TAG
    else:
        url = "http://www.amazon.com/dp/" + asin + "/"

    url = web.try_shorten(url)

    # join all the tags into a string
    tag_str = " - " + ", ".join(tags) if tags else ""

    # finally, assemble everything into the final string, and return it!
    if not _parsed:
        return colors.parse("".join("$(b){}$(b) ({}) - {}{} - {}".format(
            title, price, rating_str, tag_str, url).splitlines()))

    return colors.parse("".join("$(b){}$(b) ({}) - {}{}".format(
        title, price, rating_str, tag_str).splitlines()))

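# Hypothetical call; with _parsed left as False this searches the configured
# REGION storefront and returns a colour-formatted summary line.
print(amazon("mechanical keyboard", print))
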
def get_data(user, currency="us"): """ Takes a steam user ID and returns a dict containing info about the games the user owns :type user: str :type currency: str :return: dict """ data = {} # form the request params = {'player': user, 'currency': currency} # get the page try: if cfscrape: scraper = cfscrape.create_scraper() request = scraper.get(CALC_URL, params=params) else: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, ' 'like Gecko) Chrome/41.0.2228.0 Safari/537.36', 'Referer': 'https://steamdb.info/' } request = requests.get(CALC_URL, params=params, headers=headers) request.raise_for_status() except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e: if cfscrape: raise SteamError("Could not get user info: {}".format(e)) else: raise SteamError( "Could not get user info: {} (You may have been blocked by CloudFlare, try installing the " "cfscrape module)".format(e)) # parse that page! soup = parse_soup(request.content) # get all the data we need try: data["name"] = soup.find("h1", { "class": "header-title" }).find("a").text data["url"] = request.url data["status"] = soup.find('td', text='Status').find_next('td').text data["value"] = soup.find("h1", {"class": "calculator-price"}).text data["value_sales"] = soup.find("h1", { "class": "calculator-price-lowest" }).text data["count"] = int( soup.find("div", { "class": "pull-right price-container" }).find("p").find("span", { "class": "number" }).text.replace(',', '')) played = soup.find('td', text='Games not played').find_next('td').text played = PLAYED_RE.search(played).groups() data["count_unplayed"] = int(played[0]) data["count_played"] = data["count"] - data["count_unplayed"] data["percent_unplayed"] = round( percentage(data["count_unplayed"], data["count"]), 1) data["percent_played"] = round( percentage(data["count_played"], data["count"]), 1) except AttributeError: raise SteamError("Could not read info, does this user exist?") return data