Example #1
0
def find_title(url, verify=True):
    """Return the title for the given URL.

    The page is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been seen or more than ``max_bytes`` bytes
    have been received, so the whole document is normally not downloaded.

    :param url: address of the page to inspect
    :param verify: forwarded unchanged to ``requests.get`` as ``verify``
    :return: the cleaned-up title, or ``None`` when the connection fails,
             no ``<title>``/``</title>`` pair is found, or the title ends
             up empty
    """
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=default_headers)
        try:
            raw = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw += chunk
                if b'</title>' in raw or len(raw) > max_bytes:
                    break
        finally:
            # The body is usually only partially read, and the stream can
            # also fail half-way through; close in both cases so the
            # connection is never left dangling.
            response.close()
    except requests.exceptions.ConnectionError:
        return None

    content = bytes(raw).decode('utf-8', errors='ignore')

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)

    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>'); the title is capped at 200 characters before
    # whitespace is normalised.
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = re_dcc.sub('', title)

    return title or None
Example #2
0
def tr(bot, trigger):
    """Translate a phrase, optionally with source/target language hints.

    ``trigger.groups()`` must yield ``(in_lang, out_lang, phrase)``. A
    missing source language falls back to ``'auto'`` and a missing target
    language to ``'en'``. The outcome is reported through ``bot``.
    """
    source, target, phrase = trigger.groups()

    # Only admins may submit phrases longer than 350 characters.
    if len(phrase) > 350 and not trigger.admin:
        return bot.reply('Phrase must be under 350 characters.')

    if not phrase.strip():
        return bot.reply('You need to specify a string for me to translate!')

    if not source:
        source = 'auto'
    if not target:
        target = 'en'

    if source == target:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    translated, source = translate(phrase, source, target)
    if not source:
        return bot.say(
            "Translation failed, probably because of a rate-limit.")

    # Python 2 only: byte strings have to be turned into text first.
    if sys.version_info.major < 3 and isinstance(translated, str):
        translated = translated.decode('utf-8')

    if not translated:
        answer = 'The %s to %s translation failed, are you sure you specified valid language abbreviations?' % (
            source, target)
    else:
        answer = '"%s" (%s to %s, translate.google.com)' % (
            web.decode(translated), source, target)

    bot.reply(answer)
Example #3
0
def text(html):
    """Reduce an HTML fragment to a single line of plain text.

    Everything matched by ``r_sup`` (superscripts that are references from
    the definition) and by ``r_tag`` is removed, line breaks are flattened,
    the grammar labels are abbreviated, and the result is passed through
    ``web.decode`` before a final strip.
    """
    cleaned = r_tag.sub('', r_sup.sub('', html)).strip()

    # Applied strictly in this order, exactly one pass each.
    replacements = (
        ('\n', ' '),
        ('\r', ''),
        ('(intransitive', '(intr.'),
        ('(transitive', '(trans.'),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)

    return web.decode(cleaned).strip()
Example #4
0
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument (``trigger.group(2)``) may start with up to two
    ``:code`` tokens: the first recognised token selects the source
    language and the second the target language. Defaults are ``'auto'``
    and ``'en'``. The outcome is reported through ``bot``.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether ``p`` looks like a ``:code`` language hint."""
        if not p.startswith(':') or not (2 < len(p) < 10):
            return False
        # Codes may be hyphenated (e.g. zh-CN, zh-TW), so every
        # hyphen-separated part has to be alphabetic rather than the code
        # as a whole. Empty parts (":-", ":a-") are rejected by isalpha().
        return all(part.isalpha() for part in p[1:].split('-'))

    args = ['auto', 'en']

    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd

    phrase = command
    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args

    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    msg, src = translate(phrase, src, dest)
    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply('The %s to %s translation failed; '
                  'are you sure you specified valid language abbreviations?' %
                  (src, dest))
        return

    # Python 2 only: byte strings have to be turned into text first.
    if sys.version_info.major < 3 and isinstance(msg, str):
        msg = msg.decode('utf-8')

    msg = web.decode(msg)  # resolves entities such as &#39;
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)
Example #5
0
def gettld(bot, trigger):
    """Show information about the given Top Level Domain.

    The TLD is taken from ``trigger.group(2)`` (one leading dot is
    ignored) and looked up in the HTML page fetched from the module-level
    ``uri``. Three row layouts are tried in turn; the first one that
    matches determines the reply sent through ``bot.reply``.
    """
    tld = trigger.group(2)
    if tld and tld[0] == '.':
        tld = tld[1:]
    if not tld:
        bot.reply("You must provide a top-level domain to search.")
        return  # Stop if no tld argument is provided

    # Only hit the network once the argument is known to be usable.
    page = requests.get(uri).text

    # The TLD is user input: escape it so regex metacharacters are matched
    # literally instead of breaking (or altering) the patterns below.
    tld_pattern = re.escape(tld)

    generic_templates = (
        r'(?i)<td><a href="\S+" title="\S+">\.{0}</a></td>\n'
        r'(<td><a href=".*</a></td>\n)?'
        r'<td>([A-Za-z0-9].*?)</td>\n'
        r'<td>(.*)</td>\n'
        r'<td[^>]*>(.*?)</td>\n'
        r'<td[^>]*>(.*?)</td>\n',

        r'(?i)<td><a href="\S+" title="(\S+)">\.{0}</a></td>\n'
        r'<td><a href=".*">(.*)</a></td>\n'
        r'<td>([A-Za-z0-9].*?)</td>\n'
        r'<td[^>]*>(.*?)</td>\n'
        r'<td[^>]*>(.*?)</td>\n',
    )

    matches = []
    for template in generic_templates:
        matches = re.compile(template.format(tld_pattern)).findall(page)
        if matches:
            break

    if matches:
        # Each match is a tuple of five captured cells; drop inline markup.
        fields = [r_tag.sub("", field) for field in matches[0]]
        desc = fields[2]
        if len(desc) > 400:
            desc = desc[:400] + "..."
        reply = "%s -- %s. IDN: %s, DNSSEC: %s" % (fields[1], desc,
                                                   fields[3], fields[4])
    else:
        # Fallback layout for rows carrying a flag icon. The dot in front of
        # the TLD is escaped here as in the two patterns above.
        country_template = (
            r'<td><a href="\S+" title="\S+">\.{0}</a></td>\n'
            r'<td><span class="flagicon"><img.*?\">(.*?)</a></td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
            r'<td[^>]*>(.*?)</td>\n'
        )
        country_matches = re.compile(
            country_template.format(tld_pattern)).findall(page)
        if country_matches:
            keys = ("country", "expl", "notes", "idn", "dnssec", "sld")
            dict_val = {}
            for key, value in zip(keys, country_matches[0]):
                # A cell holding only a non-breaking space is shown as N/A.
                if value == "&#160;":
                    value = "N/A"
                dict_val[key] = r_tag.sub('', value)
            if len(dict_val["notes"]) > 400:
                dict_val["notes"] = dict_val["notes"][:400] + "..."
            reply = "%s (%s, %s). IDN: %s, DNSSEC: %s, SLD: %s" % (
                dict_val["country"], dict_val["expl"], dict_val["notes"],
                dict_val["idn"], dict_val["dnssec"], dict_val["sld"])
        else:
            reply = "No matches found for TLD: {0}".format(tld)

    # Final touches + output
    reply = web.decode(reply)
    bot.reply(reply)
Example #6
0
def duck_search(query):
    """Search DuckDuckGo's HTML endpoint for ``query``.

    Every ``!`` is removed from the query before it is sent. The response
    text is searched with ``r_duck``; on a match, group 1 is returned after
    ``web.decode``, otherwise ``None``.
    """
    cleaned_query = query.replace('!', '')
    response = requests.get(
        'https://duckduckgo.com/html/',
        {'kl': 'us-en', 'q': cleaned_query},
        headers=header_spoof,
    )
    page = response.text

    # Filter out the ads on top of the page: keep only the text between the
    # first and second 'web-result' markers.
    if 'web-result' in page:
        page = page.split('web-result')[1]

    found = r_duck.search(page)
    if not found:
        return None
    return web.decode(found.group(1))
Example #7
0
def tr(bot, trigger):
    """Translate a phrase, optionally with source/target language hints.

    ``trigger.groups()`` must yield ``(in_lang, out_lang, phrase)``. A
    missing source language falls back to ``'auto'`` and a missing target
    language to ``'en'``. Request failures are logged and reported to the
    user; a successful translation is sent with ``bot.say``.
    """
    requested, target, phrase = trigger.groups()

    # Validate the input first; the earliest failing check wins.
    if len(phrase) > 350 and not trigger.admin:
        problem = 'Phrase must be under 350 characters.'
    elif not phrase.strip():
        problem = 'You need to specify a string for me to translate!'
    else:
        requested = requested or 'auto'
        target = target or 'en'
        if requested == target:
            problem = 'Language guessing failed, so try suggesting one!'
        else:
            problem = None

    if problem is not None:
        bot.reply(problem)
        return

    try:
        translated, detected = translate(phrase, requested, target)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            requested, target, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            requested, target, phrase, http_error)
        return

    if not detected:
        bot.reply("Translation failed, probably because of a rate-limit.")
        return

    if not translated:
        bot.reply(
            'The %s to %s translation failed; are you sure you specified '
            'valid language abbreviations?' % (detected, target)
        )
        return

    bot.say('"%s" (%s to %s, translate.google.com)' % (
        web.decode(translated), detected, target))
Example #8
0
def duck_search(query):
    """Search DuckDuckGo's HTML endpoint for ``query``.

    Every ``!`` is removed from the query before it is sent, and the
    request carries a desktop-browser ``User-Agent`` header. The response
    text is searched with ``r_duck``; on a match, group 1 is returned after
    ``web.decode``, otherwise ``None``.
    """
    cleaned_query = query.replace('!', '')
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    response = requests.get(
        'https://duckduckgo.com/html/',
        {'kl': 'us-en', 'q': cleaned_query},
        headers={'User-Agent': user_agent},
    )
    page = response.text

    # Filter out the ads on top of the page: keep only the text between the
    # first and second 'web-result' markers.
    if 'web-result' in page:
        page = page.split('web-result')[1]

    found = r_duck.search(page)
    if not found:
        return None
    return web.decode(found.group(1))
Example #9
0
def find_title(url: str, verify: bool = True) -> Optional[str]:
    """Return the title for the given URL.

    The page is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been seen or more than ``MAX_BYTES`` bytes
    have been received, so the whole document is normally not downloaded.

    :param url: address of the page to inspect
    :param verify: Whether to require a valid certificate when using https
    :return: the cleaned-up title, or ``None`` when the URL is invalid or
             unreachable, no ``<title>``/``</title>`` pair is found, or the
             title ends up empty
    """
    try:
        response = requests.get(url,
                                stream=True,
                                verify=verify,
                                headers=DEFAULT_HEADERS)
        try:
            raw_content = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw_content += chunk
                if b'</title>' in raw_content or len(raw_content) > MAX_BYTES:
                    break
        finally:
            # The body is usually only partially read, and the stream can
            # also fail half-way through; close in both cases so the
            # connection is never left dangling.
            response.close()
        content = bytes(raw_content).decode('utf-8', errors='ignore')
    except requests.exceptions.ConnectionError as e:
        LOGGER.debug("Unable to reach URL: %r: %s", url, e)
        return None
    except (
            requests.exceptions.InvalidURL,  # e.g. http:///
            UnicodeError,  # e.g. http://.example.com (urllib3<1.26)
            LocationValueError,  # e.g. http://.example.com (urllib3>=1.26)
    ):
        LOGGER.debug('Invalid URL: %s', url)
        return None

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = TITLE_TAG_DATA.sub(r'<\1title>', content)
    content = QUOTED_TITLE.sub('', content)

    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>'); the title is capped at 200 characters before
    # whitespace is normalised.
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    return title or None
Example #10
0
File: url.py Project: njsmith/sopel
def find_title(url, verify=True):
    """Return the title for the given URL.

    The page is streamed in 512-byte chunks and reading stops as soon as a
    closing ``</title>`` tag has been seen or more than ``MAX_BYTES`` bytes
    have been received, so the whole document is normally not downloaded.

    :param url: address of the page to inspect
    :param verify: forwarded unchanged to ``requests.get`` as ``verify``
    :return: the cleaned-up title, or ``None`` when the URL is invalid or
             unreachable, no ``<title>``/``</title>`` pair is found, or the
             title ends up empty
    """
    try:
        response = requests.get(url,
                                stream=True,
                                verify=verify,
                                headers=DEFAULT_HEADERS)
        try:
            raw = bytearray()
            for chunk in response.iter_content(chunk_size=512):
                raw += chunk
                if b'</title>' in raw or len(raw) > MAX_BYTES:
                    break
        finally:
            # The body is usually only partially read, and the stream can
            # also fail half-way through; close in both cases so the
            # connection is never left dangling.
            response.close()
        content = bytes(raw).decode('utf-8', errors='ignore')
    except requests.exceptions.ConnectionError:
        LOGGER.exception('Unable to reach URL: %s', url)
        return None
    except (
            requests.exceptions.InvalidURL,  # e.g. http:///
            UnicodeError,  # e.g. http://.example.com (urllib3<1.26)
            LocationValueError,  # e.g. http://.example.com (urllib3>=1.26)
    ):
        LOGGER.debug('Invalid URL: %s', url)
        return None

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = TITLE_TAG_DATA.sub(r'<\1title>', content)
    content = QUOTED_TITLE.sub('', content)

    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>'); the title is capped at 200 characters before
    # whitespace is normalised.
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # More cryptic regex substitutions. This one looks to be myano's invention.
    title = RE_DCC.sub('', title)

    return title or None
Example #11
0
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument (``trigger.group(2)``) may start with up to two
    ``:code`` tokens: the first recognised token selects the source
    language and the second the target language. Defaults are ``'auto'``
    and ``'en'``. Request failures are logged and reported to the user.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether ``p`` looks like a ``:code`` language hint."""
        if not p.startswith(':') or not (2 < len(p) < 10):
            return False
        # Codes may be hyphenated (e.g. zh-CN, zh-TW), so every
        # hyphen-separated part has to be alphabetic rather than the code
        # as a whole. Empty parts (":-", ":a-") are rejected by isalpha().
        return all(part.isalpha() for part in p[1:].split('-'))

    args = ['auto', 'en']

    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd

    phrase = command
    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args

    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    try:
        msg, src = translate(phrase, src, dest)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            src, dest, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            src, dest, phrase, http_error)
        return

    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply(
            'The %s to %s translation failed; '
            'are you sure you specified valid language abbreviations?'
            % (src, dest))
        return

    msg = web.decode(msg)  # resolves entities such as &#39;
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)
Example #12
0
def clean_html(input):
    """Return ``input`` cleaned by bleach and passed through ``web.decode``.

    ``bleach.clean`` is called with an empty tag list and ``strip=True``.
    """
    return web.decode(bleach.clean(input, tags=[], strip=True))
Example #13
0
def tr2(bot, trigger):
    """Translates a phrase, with an optional language hint.

    The argument (``trigger.group(2)``) may start with up to two
    ``:code`` tokens: the first recognised token selects the source
    language and the second the target language. Defaults are ``'auto'``
    and ``'en'``. Request failures are logged and reported to the user.
    """
    command = trigger.group(2)

    if not command:
        bot.reply('You did not give me anything to translate.')
        return

    def langcode(p):
        """Tell whether ``p`` looks like a ``:code`` language hint."""
        # TODO: We'd be much better off just using the langcodes PyPI package
        # also TODO: it'd be nice not to require the : prefix, which using
        # langcodes would make easier in lieu of adding an API key to fetch
        # the list of supported languages
        # the longest codes Google uses (ca. Jan 2022) are zh-CN and zh-TW
        if not (2 < len(p) < 8):
            return False
        # pesky - in Chinese codes forced a switch from .isalpha(); see #2241
        # The whole token has to match (fullmatch, not match), otherwise
        # anything that merely starts like a code -- ":en1", ":-)" -- would
        # be swallowed as a language hint. The pattern also covers the
        # required ':' prefix.
        return re.fullmatch(r':[a-z]+(?:-[a-z]+)*', p.lower()) is not None

    args = ['auto', 'en']

    for i in range(2):
        if ' ' not in command:
            break
        prefix, cmd = command.split(' ', 1)
        if langcode(prefix):
            args[i] = prefix[1:]
            command = cmd

    phrase = command
    # Only admins may submit phrases longer than 350 characters.
    if (len(phrase) > 350) and (not trigger.admin):
        bot.reply('Phrase must be under 350 characters.')
        return

    if phrase.strip() == '':
        bot.reply('You need to specify a string for me to translate!')
        return

    src, dest = args

    if src == dest:
        bot.reply('Language guessing failed, so try suggesting one!')
        return

    try:
        msg, src = translate(phrase, src, dest)
    except requests.Timeout:
        bot.reply("Translation service unavailable (timeout).")
        LOGGER.error(
            'Translate API error (%s to %s: "%s"): timeout.',
            src, dest, phrase)
        return
    except requests.RequestException as http_error:
        bot.reply("Translation request failed.")
        LOGGER.exception(
            'Translate API error (%s to %s: "%s"): %s.',
            src, dest, phrase, http_error)
        return

    if not src:
        return bot.say("Translation failed, probably because of a rate-limit.")

    if not msg:
        bot.reply(
            'The %s to %s translation failed; '
            'are you sure you specified valid language abbreviations?'
            % (src, dest))
        return

    msg = web.decode(msg)  # resolves entities such as &#39;
    msg = '"%s" (%s to %s, translate.google.com)' % (msg, src, dest)

    bot.say(msg)