def refresh_cache():
    global cache
    print '[+] refreshing distrowatch cache'
    output = '[DistroWatch]'

    def parse_table(data):
        global data_limit
        distro_names = []
        for distro in iterable.limit(data_limit, data):
            distro_names.append(distro.text.strip())
        return ', '.join(distro_names)

    # most popular distros in the last 12, 6 and 1 months
    html = request.get('https://distrowatch.com/dwres.php?resource=popularity')
    soup = BeautifulSoup(html, 'lxml')
    tables = soup.select('td.NewsText tr td table')
    for table in tables:
        header = table.find('th', attrs={'class': 'Invert'})
        data = table.find_all('td', attrs={'class': 'phr2'})

        # skip table if it doesn't have distro info
        if header is None or data is None:
            continue

        # skip this table if it's not wanted
        header = header.text.strip()
        if header not in allowed_datasets:
            continue

        output = output + ' \x02Popular\x02 (' + header.replace('Last ', '') + '): '
        output = output + parse_table(data) + '.'

    # trending distros in the past 12, 6 and 1 months
    html = request.get('https://distrowatch.com/dwres.php?resource=trending')
    soup = BeautifulSoup(html, 'lxml')
    tables = soup.select('table table table table.News')
    for table in tables:
        header = table.find('th', attrs={'class': 'Invert'})
        data = table.parent.find_all('td', attrs={'class': 'phr2'})

        if header is None or data is None:
            continue

        # skip this table if it's not wanted
        header = header.text.strip()
        if header not in allowed_datasets:
            continue

        output = output + ' \x02Trending\x02 (' + header.replace('Trending ', '') + '): '
        output = output + parse_table(data) + '.'

    cache = output
def fetch(start, dest):
    start = request.urlencode(start)
    dest = request.urlencode(dest)
    url = "http://www.travelmath.com/flying-distance/from/{}/to/{}".format(
        start, dest)
    html = request.get(url)
    return html
def kernel(inp, reply=None):
    data = request.get("https://www.kernel.org/finger_banner")
    lines = data.split('\n')

    versions = []
    old_versions = []
    for line in lines:
        info = re.match(
            r'^The latest ([a-z0-9 .\-]+) version of the Linux kernel is:\s*(.*)$',
            line)
        if info is None:
            continue

        name = info.group(1)
        version = info.group(2)
        if 'longterm' in name:
            old_versions.append(version)
        else:
            versions.append(name + ': ' + version)

    output = 'Linux kernel versions: ' + '; '.join(versions)
    if len(old_versions) > 0:
        output = output + '. Old longterm versions: ' + ', '.join(old_versions)
    return output
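# For reference, the finger_banner parsed above is a small plain-text document
# that looks roughly like the sample below. The exact spacing and version
# numbers change over time, so treat this as an illustrative assumption rather
# than a guaranteed format:
#
#   The latest mainline version of the Linux kernel is:      6.7-rc7
#   The latest stable version of the Linux kernel is:        6.6.8
#   The latest longterm version of the Linux kernel is:      6.1.69
#   The latest longterm version of the Linux kernel is:      5.15.145
#
# Lines whose name contains "longterm" are collected in old_versions; every
# other matching line ("mainline", "stable", ...) is reported with its name.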
def debt(inp):
    """debt -- returns the us national debt"""
    url = "https://commodity.com/debt-clock/us/"
    html = request.get(url)
    debt = parse(html)
    return "Current US Debt: \x02{}\x02".format(debt)
def refresh_cache():
    print "[+] refreshing fmylife cache"
    html = request.get('https://www.fmylife.com/random')
    soup = BeautifulSoup(html, 'lxml')
    posts = soup.find_all('a', attrs={'class': 'article-link'})
    for post in posts:
        id = post['href'].split('_')[1].split('.')[0]
        text = post.text.strip()
        cache.append((id, text))
def amazon_url(match):
    id = match.group(2).strip()
    url = 'https://www.amazon.com/dp/' + id + '/'
    html = request.get(url)

    title, price = parse_product(html)
    if len(title) > 80:
        title = title[:80] + '...'

    return u'[Amazon] {} \x0303{}\x03 {}'.format(title, price, url)
def get_more_detail(api_path, gid):
    api_gid = 'gid_{}'.format(gid.replace('/', '_').replace('-', '_'))
    detail_linescore = '{}/{}/linescore.json'.format(api_path, api_gid)
    detail_eventlog = '{}/{}/eventLog.xml'.format(api_path, api_gid)

    try:
        linescore = get_json(detail_linescore)
    except Exception as e:
        return e
    if not isinstance(linescore, dict):
        return Exception('linescore is not an object')
    try:
        linescore = linescore['data']['game']
    except KeyError:
        return Exception('linescore structure is unexpected')

    # count
    balls = linescore.get('balls', 'unkn')
    strikes = linescore.get('strikes', 'unkn')
    outs = linescore.get('outs', 'unkn')
    runners_onbase = linescore.get('runner_on_base_status', 'unkn')
    pitcher = linescore.get('current_pitcher', dict()).get('last_name', 'unkn')
    batter = linescore.get('current_batter', dict()).get('last_name', 'unkn')

    # bonus
    latest_event = ''
    try:
        events_xml = BeautifulSoup(get(detail_eventlog))
        events = events_xml.find_all('event')
        maxval = -999
        for event in events:
            if int(event['number']) > maxval and event['description'] != '':
                maxval = int(event['number'])
                latest_event = event['description']
    except Exception as e:
        latest_event = e

    return {'balls': balls,
            'strikes': strikes,
            'outs': outs,
            'onbase': runners_onbase,
            'pitcher': pitcher,
            'batter': batter,
            'latest': latest_event}
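# A hedged usage sketch: format_detail is not part of the original module, but
# it shows how a caller might render the dict returned by get_more_detail(),
# keeping in mind that the function returns an Exception instance on failure.
def format_detail(detail):
    if isinstance(detail, Exception):
        return 'no live detail available'
    return ('count {balls}-{strikes}, {outs} out, on base: {onbase}, '
            'P: {pitcher}, AB: {batter}. {latest}').format(**detail)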
def get_title(url):
    html = request.get(url)
    soup = BeautifulSoup(html, 'lxml')

    if '#' in url:
        # a direct link to a reply: look up that post by its id
        postid = url.split('#')[1]
        post = soup.find('div', {'id': postid})
    else:
        # otherwise use the opening post of the thread
        post = soup.find('div', {'class': 'opContainer'})

    comment = process_text(
        post.find('blockquote', {'class': 'postMessage'}).renderContents().strip())
    return u"{} - {}".format(url, comment)
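# process_text() is defined elsewhere in this plugin; the sketch below is an
# assumption about its shape, not the original implementation: take the raw
# blockquote HTML from renderContents(), turn <br> tags into spaces, strip the
# remaining markup and collapse whitespace.
def process_text(raw_html):
    raw_html = raw_html.replace('<br>', ' ').replace('<br/>', ' ')
    text = BeautifulSoup(raw_html, 'lxml').text
    return ' '.join(text.split())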
def refresh_cache():
    "gets a page of random bash.org quotes and puts them into a list"
    print "[+] refreshing bash cache"
    html = request.get('http://bash.org/?random')
    soup = BeautifulSoup(html, 'lxml')

    quote_infos = soup.find_all('p', {'class': 'quote'})
    quotes = soup.find_all('p', {'class': 'qt'})

    num = 0
    while num < len(quotes):
        quote = quotes[num].text.replace('\n', ' ').replace('\r', ' |')
        id = quote_infos[num].contents[0].text
        votes = quote_infos[num].find('font').text
        cache.append((id, votes, quote))
        num += 1
def get_bash_quote(inp):
    try:
        inp = request.urlencode(inp)
        html = request.get('http://bash.org/?' + inp)
        soup = BeautifulSoup(html, 'lxml')

        quote_info = soup.find('p', {'class': 'quote'})
        quote = soup.find('p', {'class': 'qt'}).text \
            .replace('\n', ' ').replace('\r', ' |')
        id = quote_info.contents[0].text
        votes = quote_info.find('font').text
        return u'\x02{}\x02 ({} votes): {}'.format(id, votes, quote)
    except:
        return "No quote found."
def anus_real(inp, nick=None):
    if not inp:
        inp = nick
    inp = request.urlencode(inp)
    html = request.get('http://en.inkei.net/anus/' + inp)
    soup = BeautifulSoup(html, 'lxml')

    details = soup.find(id='elmDescCmmn')
    if details is None:
        return 'Anus: http://en.inkei.net/anus/' + inp

    details = formatting.compress_whitespace(details.text)
    details = re.sub('Anus of [a-zA-Z0-9]+ ', 'Anus: ', details)
    return u'{} - http://en.inkei.net/anus/{}'.format(details, inp)
def query(query, useragent='python-duckduckgo ' + str(__version__),
          safesearch=False, html=False, meanings=True, **kwargs):
    """
    Query DuckDuckGo, returning a Results object.

    Here's a query that's unlikely to change:

    >>> result = query('1 + 1')
    >>> result.type
    'nothing'
    >>> result.answer.text
    '1 + 1 = 2'
    >>> result.answer.type
    'calc'

    Keyword arguments:
    useragent: User-Agent to use while querying. Default: "python-duckduckgo <version>" (str)
    safesearch: True for on, False for off. Default: False (bool)
    html: True to allow HTML in output. Default: False (bool)
    meanings: True to include disambiguations in results (bool)
    Any other keyword arguments are passed directly to DuckDuckGo as URL params.
    """
    safesearch = '1' if safesearch else '-1'
    html = '0' if html else '1'
    meanings = '0' if meanings else '1'
    params = {
        'q': query.encode('utf-8'),
        'o': 'json',
        'kp': safesearch,
        'no_redirect': '1',
        'no_html': html,
        'd': meanings,
    }
    params.update(kwargs)

    response = request.get('https://api.duckduckgo.com/', params=params,
                           headers={'User-Agent': useragent})
    json = json_loads(response)
    return Results(json)
def amazon(inp):
    """amazon [query] -- Searches amazon for query"""
    if not inp:
        return "usage: amazon <search>"

    inp = request.urlencode(inp)
    html = request.get('https://www.amazon.com/s?k=' + inp)
    results = parse(html)
    if len(results) == 0:
        return 'No results found'

    title, price, url = results[0]
    if len(title) > 80:
        title = title[:80] + '...'

    # \x03 = color, 03 = green
    return u'[Amazon] {} \x0303{}\x03 {}'.format(title, price, url)
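# parse() lives elsewhere in this plugin; the sketch below only illustrates the
# contract the code above relies on -- a list of (title, price, url) tuples.
# The CSS selectors here are assumptions and Amazon's markup changes often.
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    results = []
    for item in soup.select('div[data-component-type="s-search-result"]'):
        title = item.find('h2')
        link = item.find('a', attrs={'class': 'a-link-normal'})
        price = item.find('span', attrs={'class': 'a-offscreen'})
        if title is None or link is None:
            continue
        url = 'https://www.amazon.com' + link['href'].split('?')[0]
        results.append((title.text.strip(),
                        price.text.strip() if price else 'price unknown',
                        url))
    return results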
def validate(inp):
    """validate <url> -- Runs url through the w3c markup validator."""
    if not inp.startswith('http'):
        inp = 'https://' + inp

    url = 'https://validator.w3.org/nu/?doc=' + request.urlencode(inp)
    html = request.get(url)
    soup = BeautifulSoup(html, 'lxml')

    results = soup.find('div', attrs={'id': 'results'})
    errors = len(results.find_all('li', attrs={'class': 'error'}))
    warns = len(results.find_all('li', attrs={'class': 'warning'}))
    info = len(results.find_all('li', attrs={'class': 'info'}))

    if errors == 0 and warns == 0 and info == 0:
        return "[w3c] Successfully validated with no errors"
    return "[w3c] Found {} errors, {} warnings and {} notices.".format(
        errors, warns, info)
def scrape_text(url):
    html = request.get(url)
    soup = BeautifulSoup(html, 'lxml')

    title = soup.find('h1', attrs={'id': 'firstHeading'})
    body = soup.find('div', attrs={'id': 'mw-content-text'})
    if title:
        title = title.text.strip()
    if body is None:
        return "Error reading the article"

    output = []
    for paragraph in body.find_all('p'):
        text = paragraph.text.strip()
        if len(text) > 4:  # skip empty paragraphs
            output.append(text)

    output = ' '.join(output)
    return output, title
def wordoftheday(inp):
    html = request.get('https://www.merriam-webster.com/word-of-the-day')
    soup = BeautifulSoup(html, 'lxml')

    word = soup.find('div', attrs={
        'class': 'word-and-pronunciation'
    }).find('h1').text
    paragraphs = soup.find('div', attrs={
        'class': 'wod-definition-container'
    }).find_all('p')

    definitions = []
    for paragraph in iterable.limit(4, paragraphs):
        definitions.append(paragraph.text.strip())

    output = u"The word of the day is \x02{}\x02: {}".format(
        word, '; '.join(definitions))
    if len(output) > 320:
        output = output[:320] + '... More at https://www.merriam-webster.com/word-of-the-day'
    return output
def koran(inp):
    "koran <chapter.verse> -- gets <chapter.verse> from the Koran. It can also search for any text."
    url = 'https://quod.lib.umich.edu/cgi/k/koran/koran-idx?type=simple&q1=' + request.urlencode(inp)
    html = request.get(url)
    soup = BeautifulSoup(html, 'lxml')

    query = soup.find_all('li')
    if not query or len(query) == 0:
        return 'No results for ' + inp

    output = '[Koran] '
    lines = []
    for li in iterable.limit(4, query):
        lines.append(compress_whitespace(li.text))
    output = output + ' '.join(lines)

    if len(output) > 320:
        output = output[:320] + '...'
    return output
def define(inp):
    "define <word> -- Fetches definition of <word>."
    html = request.get(dict_url + request.urlencode(inp))
    soup = BeautifulSoup(html, 'lxml')

    definitions = soup.find_all('dd')
    if len(definitions) == 0:
        return "Definition not found"

    output = 'Definition of "' + inp + '":'

    # used to number the many definitions
    i = 1
    for definition in definitions:
        if 'article' in definition['class']:
            # new section: bold it and restart the numbering
            text = formatting.compress_whitespace(definition.text.strip())
            output = output + ' \x02' + text + '\x02'
            i = 1
        elif 'entry' in definition['class']:
            definition = definition.find('div', attrs={'class': 'definition'})
            text = formatting.compress_whitespace(definition.text.strip())
            output = output + text.replace(u'\xb0', ' \x02{}.\x02 '.format(i))
            i = i + 1
        # there are also 'synonyms' and 'examples' classes, which are skipped

    # arbitrary length limit
    if len(output) > 360:
        output = output[:360] + '\x0f... More at https://en.wiktionary.org/wiki/' + inp
    return output
def etymology(inp):
    "etymology <word> -- Retrieves the etymology of <word>."
    html = request.get(eth_url + request.urlencode(inp))
    soup = BeautifulSoup(html, 'lxml')

    # the page uses weird class names like "section.word__definatieon--81fc4ae"
    # if it breaks, change the selector to [class~="word_"]
    results = soup.select('div[class^="word"] section[class^="word__def"] > p')
    if len(results) == 0:
        return 'No etymology found for ' + inp

    output = u'Etymology of "' + inp + '":'
    i = 1
    for result in results:
        text = formatting.compress_whitespace(result.text.strip())
        output = output + u' \x02{}.\x02 {}'.format(i, text)
        i = i + 1

    if len(output) > 400:
        output = output[:400] + '\x0f... More at https://www.etymonline.com/word/' + inp
    return output