def find_title(url, verify=True):
    """Return the title for the given URL.

    The body is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been received or more than ``max_bytes``
    bytes have been read, so the page is normally not downloaded in full.

    :param url: address of the page to fetch
    :param verify: forwarded to ``requests.get`` as its ``verify`` argument
    :return: the title with surrounding whitespace removed, cut to 200
             characters, inner whitespace collapsed and ``re_dcc`` matches
             removed; ``None`` if the host could not be reached, no
             ``<title>``/``</title>`` pair was found, or the title is empty
    """
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=default_headers)
        try:
            raw = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw += chunk
                if b'</title>' in raw or len(raw) > max_bytes:
                    break
        finally:
            # The body is usually only partially read, so the connection has
            # to be closed explicitly -- also when streaming fails midway.
            response.close()
    except requests.exceptions.ConnectionError:
        return None

    content = raw.decode('utf-8', errors='ignore')

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)

    # Use the last opening and closing tag found in what was downloaded.
    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>')
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]
    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = re_dcc.sub('', title)

    return title or None
def tr(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    # The trigger's groups are: source language, target language, phrase.
    source_lang, target_lang, phrase = trigger.groups()

    # Only admins may submit phrases longer than 350 characters.
    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')

    if not phrase.strip():
        return bot.reply('You need to specify a string for me to translate!')

    source_lang = source_lang or 'auto'
    target_lang = target_lang or 'en'

    if source_lang == target_lang:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    # translate() hands back the text plus the source language it used.
    translated, source_lang = translate(phrase, source_lang, target_lang)
    if not source_lang:
        return bot.say(
            "Translation failed, probably because of a rate-limit.")

    # On Python 2 a byte string has to be turned into text first.
    if sys.version_info.major < 3 and isinstance(translated, str):
        translated = translated.decode('utf-8')

    if not translated:
        answer = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (
            source_lang, target_lang)
    else:
        answer = '"%s" (%s to %s, translate.google.com)' % (
            web.decode(translated), source_lang, target_lang)

    bot.reply(answer)
def text(html):
    """Turn an HTML definition fragment into a compact single-line string.

    ``r_sup`` and ``r_tag`` matches are removed, line breaks are flattened,
    the transitivity labels are abbreviated and the result is passed through
    ``web.decode`` before a final strip.
    """
    # Remove superscripts that are references from definition
    without_refs = r_sup.sub('', html)
    cleaned = r_tag.sub('', without_refs).strip()

    # Applied in this order: flatten line breaks, then shorten the labels.
    substitutions = (
        ('\n', ' '),
        ('\r', ''),
        ('(intransitive', '(intr.'),
        ('(transitive', '(trans.'),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return web.decode(cleaned).strip()
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument text may start with up to two ``:code`` prefixes: the first
    sets the source language (default ``auto``), the second the target
    language (default ``en``). Everything after them is the phrase.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether *p* looks like a ``:code`` language prefix."""
        # Hyphenated codes such as ":zh-CN" are valid, so a plain isalpha()
        # on the whole code is too strict; require every hyphen-separated
        # part to be alphabetic instead.
        return (
            p.startswith(':')
            and (2 < len(p) < 10)
            and all(part.isalpha() for part in p[1:].split('-'))
        )

    args = ['auto', 'en']

    # Peel off at most two leading language prefixes.
    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd
    phrase = command

    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args
    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    # translate() hands back the text plus the source language it used.
    msg, src = translate(phrase, src, dest)
    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply('The %s to %s translation failed; '
                  'are you sure you specified valid language abbreviations?'
                  % (src, dest))
        return

    # On Python 2 a byte string has to be turned into text first.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')

    msg = web.decode(msg)
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)
def gettld(bot, trigger):
    """Show information about the given Top Level Domain.

    The TLD is taken from the trigger's second group (a leading dot is
    optional). The page at the module-level ``uri`` is downloaded and
    searched with up to three table-row patterns; the first one that matches
    determines the reply. Nothing is returned; the answer is sent with
    ``bot.reply``.
    """
    tld = (trigger.group(2) or '').strip()
    if tld.startswith('.'):
        tld = tld[1:]
    if not tld:
        # Validate before fetching so a missing argument costs no request.
        bot.reply("You must provide a top-level domain to search.")
        return

    page = requests.get(uri).text

    # The TLD comes from the user: escape it so regex metacharacters in the
    # input are matched literally instead of altering the patterns below.
    tld_pattern = re.escape(tld)

    search = (
        r'(?i)<td><a href="\S+" title="\S+">\.{0}</a></td>\n'
        r'(<td><a href=".*</a></td>\n)?'
        r'<td>([A-Za-z0-9].*?)</td>\n'
        r'<td>(.*)</td>\n'
        r'<td[^>]*>(.*?)</td>\n'
        r'<td[^>]*>(.*?)</td>\n'
    )
    matches = re.compile(search.format(tld_pattern)).findall(page)

    if not matches:
        search = (
            r'(?i)<td><a href="\S+" title="(\S+)">\.{0}</a></td>\n'
            r'<td><a href=".*">(.*)</a></td>\n'
            r'<td>([A-Za-z0-9].*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
        )
        matches = re.compile(search.format(tld_pattern)).findall(page)

    if matches:
        # Both patterns above capture five groups; strip markup from each.
        fields = [r_tag.sub("", field) for field in matches[0]]
        desc = fields[2]
        if len(desc) > 400:
            desc = desc[:400] + "..."
        reply = "%s -- %s. IDN: %s, DNSSEC: %s" % (
            fields[1], desc, fields[3], fields[4])
    else:
        # NOTE(review): unlike the two patterns above, the dot in front of
        # the TLD is not escaped here, so it matches any character; left
        # as-is because the intent cannot be confirmed from this function.
        search = (
            r'<td><a href="\S+" title="\S+">.{0}</a></td>\n'
            r'<td><span class="flagicon"><img.*?\">(.*?)</a></td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
        )
        matches = re.compile(search.format(tld_pattern)).findall(page)
        if matches:
            keys = ("country", "expl", "notes", "idn", "dnssec", "sld")
            dict_val = {}
            for key, value in zip(keys, matches[0]):
                # Blank cells (empty or whitespace only, including a
                # non-breaking space) are reported as "N/A".
                if not value.strip():
                    value = "N/A"
                dict_val[key] = r_tag.sub('', value)
            if len(dict_val["notes"]) > 400:
                dict_val["notes"] = dict_val["notes"][:400] + "..."
            reply = "%s (%s, %s). IDN: %s, DNSSEC: %s, SLD: %s" % (
                dict_val["country"], dict_val["expl"], dict_val["notes"],
                dict_val["idn"], dict_val["dnssec"], dict_val["sld"])
        else:
            reply = "No matches found for TLD: {0}".format(unicode(tld))

    # Final touches + output
    reply = web.decode(reply)
    bot.reply(reply)
def duck_search(query):
    """Query DuckDuckGo's HTML endpoint and return the first ``r_duck`` hit.

    Exclamation marks are removed from *query* before it is sent. Returns
    the decoded first group of the ``r_duck`` match, or ``None`` when the
    pattern does not match the page.
    """
    params = {
        'kl': 'us-en',
        'q': query.replace('!', ''),
    }
    page = requests.get(
        'https://duckduckgo.com/html/', params, headers=header_spoof).text

    # filter out the ads on top of the page
    marker = 'web-result'
    if marker in page:
        page = page.split(marker)[1]

    found = r_duck.search(page)
    if not found:
        return None
    return web.decode(found.group(1))
def tr(bot, trigger):
    """Translates a phrase, with an optional language hint."""
    # The trigger's groups are: source language, target language, phrase.
    src, dest, phrase = trigger.groups()

    # Only admins may submit phrases longer than 350 characters.
    if len(phrase) > 350 and not trigger.admin:
        bot.reply('Phrase must be under 350 characters.')
        return

    if not phrase.strip():
        bot.reply('You need to specify a string for me to translate!')
        return

    src = src or 'auto'
    dest = dest or 'en'

    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    try:
        # translate() hands back the text plus the source language it used.
        translated, src = translate(phrase, src, dest)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            src, dest, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            src, dest, phrase, http_error)
        return

    if not src:
        bot.reply("Translation failed, probably because of a rate-limit.")
        return

    if not translated:
        failure = (
            'The %s to %s translation failed; are you sure you specified '
            'valid language abbreviations?'
        )
        bot.reply(failure % (src, dest))
        return

    decoded = web.decode(translated)
    bot.say('"%s" (%s to %s, translate.google.com)' % (decoded, src, dest))
def duck_search(query):
    """Query DuckDuckGo's HTML endpoint and return the first ``r_duck`` hit.

    Exclamation marks are removed from *query* before it is sent, and the
    request carries a browser-like User-Agent header. Returns the decoded
    first group of the ``r_duck`` match, or ``None`` when the pattern does
    not match the page.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    params = {
        'kl': 'us-en',
        'q': query.replace('!', ''),
    }
    page = requests.get(
        'https://duckduckgo.com/html/',
        params,
        headers={'User-Agent': user_agent},
    ).text

    # filter out the ads on top of the page
    marker = 'web-result'
    if marker in page:
        page = page.split(marker)[1]

    found = r_duck.search(page)
    if not found:
        return None
    return web.decode(found.group(1))
def find_title(url: str, verify: bool = True) -> Optional[str]:
    """Return the title for the given URL.

    The body is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been received or more than ``MAX_BYTES``
    bytes have been read, so the page is normally not downloaded in full.

    :param url: address of the page to fetch
    :param verify: Whether to require a valid certificate when using https
    :return: the title with surrounding whitespace removed, cut to 200
             characters and with inner whitespace collapsed; ``None`` if the
             URL is invalid or unreachable, no ``<title>``/``</title>`` pair
             was found, or the title is empty
    """
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=DEFAULT_HEADERS)
        try:
            raw_content = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw_content += chunk
                if b'</title>' in raw_content or len(raw_content) > MAX_BYTES:
                    break
        finally:
            # The body is usually only partially read, so the connection has
            # to be closed explicitly -- also when streaming fails midway.
            response.close()
    except requests.exceptions.ConnectionError as e:
        LOGGER.debug("Unable to reach URL: %r: %s", url, e)
        return None
    except (
        requests.exceptions.InvalidURL,  # e.g. http:///
        UnicodeError,  # e.g. http://.example.com (urllib3<1.26)
        LocationValueError,  # e.g. http://.example.com (urllib3>=1.26)
    ):
        LOGGER.debug('Invalid URL: %s', url)
        return None

    content = raw_content.decode('utf-8', errors='ignore')

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = TITLE_TAG_DATA.sub(r'<\1title>', content)
    content = QUOTED_TITLE.sub('', content)

    # Use the last opening and closing tag found in what was downloaded.
    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>')
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]
    title = ' '.join(title.split())  # cleanly remove multiple spaces

    return title or None
def find_title(url, verify=True):
    """Return the title for the given URL.

    The body is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been received or more than ``MAX_BYTES``
    bytes have been read, so the page is normally not downloaded in full.

    :param url: address of the page to fetch
    :param verify: forwarded to ``requests.get`` as its ``verify`` argument
    :return: the title with surrounding whitespace removed, cut to 200
             characters, inner whitespace collapsed and ``RE_DCC`` matches
             removed; ``None`` if the URL is invalid or unreachable, no
             ``<title>``/``</title>`` pair was found, or the title is empty
    """
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=DEFAULT_HEADERS)
        try:
            raw = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw += chunk
                if b'</title>' in raw or len(raw) > MAX_BYTES:
                    break
        finally:
            # The body is usually only partially read, so the connection has
            # to be closed explicitly -- also when streaming fails midway.
            response.close()
    except requests.exceptions.ConnectionError:
        LOGGER.exception('Unable to reach URL: %s', url)
        return None
    except (
        requests.exceptions.InvalidURL,  # e.g. http:///
        UnicodeError,  # e.g. http://.example.com (urllib3<1.26)
        LocationValueError,  # e.g. http://.example.com (urllib3>=1.26)
    ):
        LOGGER.debug('Invalid URL: %s', url)
        return None

    content = raw.decode('utf-8', errors='ignore')

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = TITLE_TAG_DATA.sub(r'<\1title>', content)
    content = QUOTED_TITLE.sub('', content)

    # Use the last opening and closing tag found in what was downloaded.
    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>')
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]
    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = RE_DCC.sub('', title)

    return title or None
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument text may start with up to two ``:code`` prefixes: the first
    sets the source language (default ``auto``), the second the target
    language (default ``en``). Everything after them is the phrase.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether *p* looks like a ``:code`` language prefix."""
        # Hyphenated codes such as ":zh-CN" are valid, so a plain isalpha()
        # on the whole code is too strict; require every hyphen-separated
        # part to be alphabetic instead.
        return (
            p.startswith(':')
            and (2 < len(p) < 10)
            and all(part.isalpha() for part in p[1:].split('-'))
        )

    args = ['auto', 'en']

    # Peel off at most two leading language prefixes.
    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd
    phrase = command

    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args
    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    try:
        # translate() hands back the text plus the source language it used.
        msg, src = translate(phrase, src, dest)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            src, dest, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            src, dest, phrase, http_error)
        return

    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply(
            'The %s to %s translation failed; '
            'are you sure you specified valid language abbreviations?'
            % (src, dest))
        return

    msg = web.decode(msg)
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)
def clean_html(input):
    """Return *input* cleaned by ``bleach`` and decoded by ``web.decode``.

    ``bleach.clean`` is called with an empty list of allowed tags and
    ``strip=True``.
    """
    return web.decode(bleach.clean(input, tags=[], strip=True))
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument text may start with up to two ``:code`` prefixes: the first
    sets the source language (default ``auto``), the second the target
    language (default ``en``). Everything after them is the phrase.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether *p* looks like a ``:code`` language prefix."""
        # TODO: We'd be much better off just using the langcodes PyPI package
        # also TODO: it'd be nice not to require the : prefix, which using
        # langcodes would make easier in lieu of adding an API key to fetch
        # the list of supported languages
        prefixed = p.startswith(':')
        # the longest codes Google uses (ca. Jan 2022) are zh-CN and zh-TW
        short_enough = (2 < len(p) < 8)
        # pesky - in Chinese codes forced a switch from .isalpha(); see #2241
        # The whole prefix has to match: an un-anchored re.match() would
        # accept anything that merely *starts* like a code (":en1", ":en!").
        fits_pattern = re.fullmatch(r':[a-z]+(?:-[a-z]+)*', p.lower())
        return all((prefixed, short_enough, fits_pattern))

    args = ['auto', 'en']

    # Peel off at most two leading language prefixes.
    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd
    phrase = command

    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args
    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    try:
        # translate() hands back the text plus the source language it used.
        msg, src = translate(phrase, src, dest)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            src, dest, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            src, dest, phrase, http_error)
        return

    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply(
            'The %s to %s translation failed; '
            'are you sure you specified valid language abbreviations?'
            % (src, dest))
        return

    msg = web.decode(msg)
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)