def wikipedia(phenny, input, origterm, lang, to_user=None):
    """Search the ``lang`` Wikipedia for ``origterm`` and say what was found.

    A ``#section`` suffix on the term selects a subsection; only the text
    between the first and second ``#`` is used.

    Args:
        phenny: Bot object; only its ``say`` method is used here.
        input: Triggering event, passed through to ``check_posted``.
        origterm: Search term, optionally followed by ``#section``.
        lang: Language code substituted into the ``wikiapi``, ``wikiuri``
            and ``wikisearch`` %-templates.
        to_user: Optional nick; when given, the answer is prefixed with
            ``"<nick>, "``.

    Returns:
        The result of ``phenny.say`` for the usage hint and the
        connection-error message; ``None`` on every other path.
    """
    origterm = origterm.strip()
    lang = lang.strip()

    if not origterm:
        return phenny.say('Perhaps you meant ".wik Zen"?')

    section = None
    if "#" in origterm:
        origterm, section = origterm.split("#")[:2]
        section = format_subsection(section)
    term = format_term(origterm)

    # Resolve the language once so the error message below reports the same
    # URL that the lookup actually uses.
    page_uri = wikiuri % lang
    w = wiki.Wiki(wikiapi % lang, page_uri, wikisearch % lang)

    try:
        result = w.search(term)
    except web.ConnectionError:
        # Name the wiki that was really contacted: the host used to be
        # hard-coded to "en" and the URL kept its unexpanded language slot.
        return phenny.say("Can't connect to {0}.wikipedia.org ({1})".format(
            lang, page_uri.format(term)))

    if result is None:
        phenny.say(
            'Can\'t find anything in Wikipedia for "{0}".'.format(origterm))
        return

    # The result is "|"-separated; per the original note the leading part is
    # a snippet, so only the trailing URL is kept.
    url = result.split("|")[-1]
    check_posted(phenny, input, url)

    answer = parse_wiki_page(url, term, section)
    if to_user:
        answer = to_user + ', ' + answer
    phenny.say(answer)
# Example #2
# 0
def apertium_wiki(phenny, input, origterm, to_nick=None):
    """Say the first sentence of a wiki page (or of one of its sections).

    The page for ``origterm`` is fetched directly; if that fails, the search
    API is queried by title and then by text, and the first hit is fetched
    instead.  A ``#section`` suffix on ``origterm`` selects the element that
    follows the matching section heading.

    Args:
        phenny: Bot object; ``say`` and ``reply`` are used.
        input: Triggering event; forwarded to ``check_posted`` only when it
            has a ``sender`` attribute.
        origterm: Page name, optionally followed by ``#section``.
        to_nick: Optional nick; when given, the answer is prefixed with
            ``"<nick>, "``.

    Returns:
        ``None``; all output goes through ``phenny``.
    """
    term = format_term(origterm)

    try:
        html = str(web.get(wikiuri.format(term)))
    except Exception:
        # Direct fetch failed: fall back to search, title matches first and
        # full-text matches second.  ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt/SystemExit are not swallowed.
        for scope in ('title', 'text'):
            response = json.loads(
                str(web.get(wikisearchuri.format(term, scope))))
            hits = response['query']['search']
            if hits:
                term = hits[0]['title']
                html = str(web.get(wikiuri.format(term)))
                break
        else:
            phenny.reply("No wiki results for that term.")
            return

    page = lxml.html.fromstring(html)

    if "#" in origterm:
        section = format_subsection(origterm.split("#")[1])
        heading = page.find(".//span[@id='%s']" % section)
        # The quoted text is the sibling that follows the heading's parent.
        # A heading with nothing after it yields None, which previously
        # crashed on .text_content(); treat it like a missing section.
        text = None if heading is None else heading.getparent().getnext()
        if text is None:
            phenny.reply("That subsection does not exist.")
            return
    else:
        paragraphs = page.findall('.//p')
        if len(paragraphs) > 2:
            text = paragraphs[1]
        else:
            text = page.findall(".//*[@id='mw-content-text']")[0]

    sentence = '"' + text.text_content().split(". ")[0] + '"'

    url = wikiuri.format(format_term_display(term))

    # Byte budget for the quoted sentence once " - <url>" is appended.
    # NOTE(review): 430 presumably keeps the message within an IRC line
    # limit - confirm.
    maxlength = 430 - len((' - ' + url).encode('utf-8'))
    if len(sentence.encode('utf-8')) > maxlength:
        # Cut on bytes (dropping any split multi-byte character), then back
        # off to the previous whole word before adding the ellipsis.
        sentence = sentence.encode('utf-8')[:maxlength].decode(
            'utf-8', 'ignore')
        words = sentence[:-5].split(' ')
        words.pop()
        sentence = ' '.join(words) + ' [...]'

    if hasattr(input, 'sender'):
        check_posted(phenny, input, url)

    message = sentence + ' - ' + url
    if to_nick:
        message = to_nick + ', ' + message
    phenny.say(message)
# Example #3
# 0
def gettitle(phenny, input, uri):
    """Return a formatted ``<title>`` for ``uri``, or ``None`` if unavailable.

    Apertium-wiki, English Wiktionary and Wikipedia article links are not
    fetched here; they are handed to ``awik``, ``w`` and ``wikipedia`` and
    that call's result is returned instead.

    Args:
        phenny: Bot object; ``config``, ``bot`` and ``variables`` are read.
        input: Triggering event, passed through to ``check_posted``.
        uri: Address to inspect; ``http://`` is prepended when it contains
            no ``:``.

    Returns:
        ``"[ title ]"``, with a ``" (posted: ...)"`` suffix when
        ``check_posted`` returns something truthy; the delegated handler's
        result for wiki links; ``None`` when the address is local or
        blacklisted, cannot be fetched, is not (X)HTML, or has no title.
    """
    if ':' not in uri:
        uri = 'http://' + uri
    uri = uri.replace('#!', '?_escaped_fragment_=')

    if uri.startswith('http://wiki.apertium.org/wiki/'):
        item = uri[len('http://wiki.apertium.org/wiki/'):]
        return awik(phenny, re.match(r'(blahblah)?(.*)', item))

    wiktionary = re.match(r'https?://en.wiktionary.org/wiki/(.*)', uri)
    if wiktionary:
        item = wiktionary.group(1)
        return w(phenny, re.match(r'(blahblah)?(.*)', web.unquote(item)))

    article = re.match(r'https?://([a-z]{2,3}).wikipedia.org/wiki/(.*)', uri)
    if article:
        lang, page = article.group(1), article.group(2)
        # NOTE(review): the signature of `wikipedia` is not visible from
        # here - confirm it really takes (phenny, page, lang).
        return wikipedia(phenny, page, lang)

    # Quote everything after the first dot and leave the part before it
    # untouched.  partition() keeps a dot-less address unchanged; the old
    # split/re-join appended a stray "." in that case.
    head, dot, rest = uri.partition('.')
    uri = head + dot + web.quote(rest)

    localhost = (
        'http://localhost/', 'http://localhost:80/',
        'http://localhost:8080/', 'http://127.0.0.1/',
        'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
        'https://localhost/', 'https://localhost:80/',
        'https://localhost:8080/', 'https://127.0.0.1/',
        'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
        'http://localhost:', 'https://localhost:',
    )
    if uri.startswith(localhost):
        # Local addresses are refused silently; no reply is sent.
        return

    # Compile the configured blacklist once and cache it on the bot object.
    if not hasattr(phenny.config, 'blacklisted_urls'):
        phenny.config.blacklisted_urls = []
    if not hasattr(phenny.bot, 'blacklisted_urls'):
        phenny.bot.blacklisted_urls = [
            re.compile(s) for s in phenny.config.blacklisted_urls]
    if any(regex.match(uri) for regex in phenny.bot.blacklisted_urls):
        return

    try:
        redirects = 0
        while True:
            try:
                info = web.head(uri)
                if isinstance(info, list):
                    # [headers, status]
                    status = str(info[1])
                    info = info[0]
                else:
                    # A non-list result carries no status; treat it as 200.
                    status = '200'
            except web.HTTPError:
                # HEAD was rejected: retry with a full GET through requests.
                response = requests.get(
                    uri, headers=web.default_headers, verify=True)
                status = str(response.status_code)
                info = response.headers

            if not status.startswith('3'):
                break
            uri = urllib.parse.urljoin(uri, info['Location'])

            redirects += 1
            if redirects >= 25:
                return None

        mtype = info['content-type']
        if not mtype or not (('/html' in mtype) or ('/xhtml' in mtype)):
            return None

        body = web.get(uri)
    except Exception:
        # Best effort: any failure while fetching simply means "no title".
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return None

    m = r_title.search(body)
    if not m:
        return None

    title = m.group(1).strip()
    for whitespace in ('\t', '\r', '\n'):
        title = title.replace(whitespace, ' ')
    while '  ' in title:
        title = title.replace('  ', ' ')
    if len(title) > 200:
        title = title[:200] + '[...]'

    def decode_entity(match):
        """Turn one matched entity into its character, if it is decodable."""
        entity = match.group(0)
        try:
            if entity.lower().startswith('&#x'):
                return chr(int(entity[3:-1], 16))
            if entity.startswith('&#'):
                return chr(int(entity[2:-1]))
            return chr(name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # A name absent from name2codepoint, or a malformed or
            # out-of-range number: keep the text instead of crashing.
            return entity

    title = r_entity.sub(decode_entity, title)
    if not title:
        return None

    # Numeric entities such as &#10; can reintroduce line breaks after the
    # whitespace clean-up above, so strip them again.
    title = title.replace('\n', '').replace('\r', '')
    title = "[ {0} ]".format(title)

    if "posted" in phenny.variables:
        from modules.posted import check_posted

        posted = check_posted(phenny, input, uri)
        if posted:
            title = "{0} (posted: {1})".format(title, posted)

    return title
# Example #4
# 0
def gettitle(phenny, input, uri):
    """Return a formatted ``<title>`` for ``uri``, or ``None`` if unavailable.

    Args:
        phenny: Bot object; ``reply`` and ``variables`` are used.
        input: Triggering event, passed through to ``check_posted``.
        uri: Address to inspect; ``http://`` is prepended when it contains
            no ``:``.

    Returns:
        ``"[ title ]"``, with a ``" (posted: ...)"`` suffix when
        ``check_posted`` returns something truthy; the result of
        ``phenny.reply`` when the address points at the local machine;
        ``None`` when the page cannot be fetched, is not (X)HTML, or has
        no title.
    """
    if ':' not in uri:
        uri = 'http://' + uri
    uri = uri.replace('#!', '?_escaped_fragment_=')

    # Refuse loopback targets by parsed host name.  The previous exact-prefix
    # list only covered ports 80/8080 with a trailing slash, so e.g.
    # "http://localhost:3000/" was still fetched.
    try:
        host = urllib.parse.urlsplit(uri).hostname
    except ValueError:
        host = None
    if host in ('localhost', '127.0.0.1', '::1'):
        return phenny.reply('Sorry, access forbidden.')

    try:
        redirects = 0
        while True:
            info = web.head(uri)

            if isinstance(info, list):
                # [headers, status]
                status = str(info[1])
                info = info[0]
            else:
                # A non-list result carries no status; treat it as 200.
                status = '200'

            if not status.startswith('3'):
                break
            uri = urllib.parse.urljoin(uri, info['Location'])

            redirects += 1
            if redirects >= 25:
                return None

        mtype = info['content-type']
        if not (('/html' in mtype) or ('/xhtml' in mtype)):
            return None

        body = web.get(uri)
    except Exception:
        # Best effort: any failure while fetching simply means "no title".
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return None

    m = r_title.search(body)
    if not m:
        return None

    title = m.group(1).strip()
    for whitespace in ('\t', '\r', '\n'):
        title = title.replace(whitespace, ' ')
    while '  ' in title:
        title = title.replace('  ', ' ')
    if len(title) > 200:
        title = title[:200] + '[...]'

    def decode_entity(match):
        """Turn one matched entity into its character, if it is decodable."""
        entity = match.group(0)
        try:
            if entity.lower().startswith('&#x'):
                return chr(int(entity[3:-1], 16))
            if entity.startswith('&#'):
                return chr(int(entity[2:-1]))
            return chr(name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # A name absent from name2codepoint, or a malformed or
            # out-of-range number: keep the text instead of crashing.
            return entity

    title = r_entity.sub(decode_entity, title)
    if not title:
        return None

    # Numeric entities such as &#10; can reintroduce line breaks after the
    # whitespace clean-up above, so strip them again.
    title = title.replace('\n', '').replace('\r', '')
    title = "[ {0} ]".format(title)

    if "posted" in phenny.variables:
        from modules.posted import check_posted

        posted = check_posted(phenny, input, uri)
        if posted:
            title = "{0} (posted: {1})".format(title, posted)

    return title
# Example #5
# 0
def gettitle(phenny, input, uri):
    """Fetch ``uri`` and return its page title wrapped as ``"[ title ]"``.

    Args:
        phenny: Bot object; ``reply`` and ``variables`` are used.
        input: Triggering event, passed through to ``check_posted``.
        uri: Address to inspect; ``http://`` is prepended when it contains
            no ``:``.

    Returns:
        The bracketed title, with ``" (posted: ...)"`` appended when
        ``check_posted`` returns something truthy.  The result of
        ``phenny.reply`` when the address is a loopback one.  ``None`` when
        fetching fails, redirects never settle, the content type is not
        (X)HTML, or no title is found.
    """
    if ':' not in uri:
        uri = 'http://' + uri
    uri = uri.replace('#!', '?_escaped_fragment_=')

    # Match on the parsed host rather than on literal URL prefixes: the
    # prefix list missed any port other than 80/8080 and any address
    # without a trailing slash.
    try:
        host = urllib.parse.urlsplit(uri).hostname
    except ValueError:
        host = None
    if host in ('localhost', '127.0.0.1', '::1'):
        return phenny.reply('Sorry, access forbidden.')

    try:
        # Follow at most 25 redirects; the else branch runs only when every
        # one of the 25 responses was itself a redirect.
        for _ in range(25):
            info = web.head(uri)

            if isinstance(info, list):
                # [headers, status]
                status = str(info[1])
                info = info[0]
            else:
                # A non-list result carries no status; treat it as 200.
                status = '200'

            if not status.startswith('3'):
                break
            uri = urllib.parse.urljoin(uri, info['Location'])
        else:
            return None

        mtype = info['content-type']
        if '/html' not in mtype and '/xhtml' not in mtype:
            return None

        page = web.get(uri)
    except Exception:
        # Best effort: a failed fetch just means there is no title to show.
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return None

    found = r_title.search(page)
    if not found:
        return None

    title = found.group(1).strip()
    title = title.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
    while '  ' in title:
        title = title.replace('  ', ' ')
    if len(title) > 200:
        title = title[:200] + '[...]'

    def decode_entity(match):
        """Replace one matched entity, leaving undecodable ones untouched."""
        entity = match.group(0)
        try:
            if entity.lower().startswith('&#x'):
                codepoint = int(entity[3:-1], 16)
            elif entity.startswith('&#'):
                codepoint = int(entity[2:-1])
            else:
                codepoint = name2codepoint[entity[1:-1]]
            return chr(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name, or a malformed or out-of-range number.
            # This used to raise out of gettitle.
            return entity

    title = r_entity.sub(decode_entity, title)
    if not title:
        return None

    # Decoding can bring back line breaks (e.g. from &#10;), so remove them
    # before the title is sent as a single line.
    title = title.replace('\n', '').replace('\r', '')
    title = "[ {0} ]".format(title)

    if "posted" in phenny.variables:
        from modules.posted import check_posted

        posted = check_posted(phenny, input, uri)
        if posted:
            title = "{0} (posted: {1})".format(title, posted)

    return title