def urlparser(match, say=None, input=None, bot=None):
    inpo = input.params.replace(input.chan + " :", "")
    url = match.group().encode('utf-8')
    # strip a trailing quote/bracket that often wraps a pasted URL
    regexs = re.compile(r"(.+?)(\'|\"|\(|\)|\{|\}|\]|\[|\<|\>)")
    matchs = regexs.search(url)
    if matchs:
        url = matchs.group(1)
    url = urlnorm.normalize(url)
    url2 = urltest(url, match)
    if (input.conn.conf['autotitle'] != False
            and not (perm.isignored(input) or perm.isbot(input))
            and not (inpo.startswith(",t") or inpo.startswith(",title")
                     or inpo.startswith(",shor"))
            and "@" not in url):
        if url.startswith("www."):
            url = "http://" + url
        for x in ignored_urls:
            if x in url:
                return
        title = parse(url)
        title = multiwordReplace(title, wordDic)
        try:
            realurl = http.get_url(url)
        except Exception as msg:
            return "(Link) %s" % msg
        api_user = bot.config.get("api_keys", {}).get("bitly_user", None)
        api_key = bot.config.get("api_keys", {}).get("bitly_api", None)
        if api_key is None:
            return "error: no api key set"
        realurl = isgd(realurl)
        if realurl == url:
            return "(Link) %s" % title
        else:
            return "(Link) %s <=> %s" % (realurl, title)

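# isgd(), multiwordReplace() and wordDic are referenced above but not included
# in these snippets. A minimal, hypothetical isgd() shortener, modelled on the
# is.gd call made in the show_title() snippet further down; the real helper may
# differ.
def isgd(url):
    try:
        short = http.get('http://is.gd/create.php',
                         query_params={'format': 'simple', 'url': url})
    except Exception:
        return url
    # is.gd returns an error message instead of a URL on failure
    if short.lower().startswith('error'):
        return url
    return short
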
def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            parts = urlparse.urlsplit(url)
            conn = httplib.HTTPConnection(parts.hostname, timeout=10)
            path = parts.path
            if parts.query:
                path += "?" + parts.query
            conn.request('HEAD', path)
            resp = conn.getresponse()
            if not (200 <= resp.status < 400):
                return "Error: HEAD %s %s" % (resp.status, resp.reason)
            errors = check_response(dict(resp.getheaders()))
            if errors:
                return errors
        except Exception as e:
            return "Error: " + str(e)
        try:
            req = urllib2.urlopen(url)
        except Exception as e:
            return "Error: GET %s" % e
        errors = check_response(req.headers)
        if errors:
            return errors
        text = req.read(maxlen).decode('utf8', 'ignore')
        match = titler.search(text)
        if not match:
            return "Error: no title"
        rawtitle = match.group(1)
        title = repaste.decode_html(rawtitle)
        title = " ".join(title.split())
        return title

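# parse() above depends on a few module-level names that are not shown in these
# snippets: titler, maxlen and check_response(). Hypothetical definitions,
# consistent only with how they are used above; the original values and checks
# may differ.
titler = re.compile(r'(?si)<title[^>]*>(.*?)</title>')
maxlen = 100000  # read at most ~100 kB of the body when looking for a title


def check_response(headers):
    # return an error string for responses we don't want to fetch a title from;
    # returning None means the response looks fine
    content_type = headers.get('content-type', '').lower()
    if 'html' not in content_type:
        return "Error: not HTML (%s)" % (content_type or "no content type")
    if int(headers.get('content-length', 0) or 0) > 10 * maxlen:
        return "Error: response too large"
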
def show_title(match, nick='', chan='', say=None):
    matched = match.group().encode('utf-8')
    url = urlnorm.normalize(matched)
    host = Request(url).get_host()
    if nick not in ignore:
        page, response = http.get_html_and_response(url)
        message = ''
        if host not in ignore_hosts:
            parser = BeautifulSoup(response)
            title = parser.title.string.strip()
            if title:
                message = 'URL title: %s' % title
        # Shorten URLs that are over 80 characters.
        if len(url) >= 80:
            short_url = http.get(
                'http://is.gd/create.php',
                query_params={'format': 'simple', 'url': matched}
            )
            # Cheap error checking
            if 'error: please' not in short_url.lower():
                if message:
                    message += ' | Short URL: %s'
                else:
                    message = 'Short URL: %s'
                message = message % short_url
        if message:
            say(message)

def isup(inp):
    """isup -- uses isup.me to see if a site is up or not

    :type inp: str
    """
    # slightly overcomplicated, esoteric URL parsing
    scheme, auth, path, query, fragment = urllib.parse.urlsplit(inp.strip())
    domain = auth or path
    url = urlnorm.normalize(domain, assume_scheme="http")
    try:
        soup = http.get_soup('http://isup.me/' + domain)
    except http.HTTPError:
        return "Failed to get status."
    content = soup.find('div').text.strip()
    if "not just you" in content:
        return "It's not just you. {} looks \x02\x034down\x02\x0f from here!".format(url)
    elif "is up" in content:
        return "It's just you. {} is \x02\x033up\x02\x0f.".format(url)
    else:
        return "Huh? That doesn't look like a site on the interweb."

def urlinput(match, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        if nick not in dict(history):
            return format_reply(history)

def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            soup = BeautifulSoup.BeautifulSoup(http.get(url))
            return soup.title.string
        except Exception:
            return "fail"

def isup(inp):
    """isup -- uses isup.me to see if a site is up or not"""
    # slightly overcomplicated, esoteric URL parsing
    scheme, auth, path, query, fragment = urlparse.urlsplit(inp.strip())
    domain = auth.encode('utf-8') or path.encode('utf-8')
    url = urlnorm.normalize(domain, assume_scheme="http")
    try:
        soup = http.get_soup('http://isup.me/' + domain)
    except (http.HTTPError, http.URLError):
        return "Could not get status."

def show_title(match, nick='', chan='', say=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if url not in ignore and nick not in ignore:
        page = http.get_html(url)
        title = page.xpath('//title')
        if title and 'youtube' not in url and 'twitter' not in url:
            titleList = []
            for i in title:
                if i.text_content():
                    titleList.append(i.text_content().strip())
            if titleList:
                string = "URL title: %s" % ''.join(titleList)
                say(string)

def urlinput(match, nick='', chan='', db=None, bot=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    should_ignore = False
    for domain in ignored_domains:
        temp_url = url.replace('https://', '').replace('http://', '').replace('www.', '')
        if domain in temp_url:
            should_ignore = True
            break
    print url
    if not should_ignore:
        url = url.decode('utf-8')
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
        title = soup.title.find(text=True).strip()
        if title != "" and title is not None:
            return u"\x02Title:\x02 {}".format(title)

def urlinput(match, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(match.group())
    if url not in ignored_urls:
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        inp = match.string.lower()
        for name in dict(history):
            if name.lower() in inp:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def url(inp, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(inp.group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        line = inp.string.lower()
        for name in dict(history):
            if name.lower() in line:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def title(inp):
    "title <url> -- gets the title of a web page"
    url = urlnorm.normalize(inp.encode('utf-8'), assume_scheme="http")
    try:
        page = http.open(url)
        real_url = page.geturl()
        soup = BeautifulSoup(page.read())
    except (http.HTTPError, http.URLError):
        return "Could not fetch page."
    title = soup.find('title').contents[0]
    if not title:
        return "Could not find title."
    return u"{} [{}]".format(title, real_url)

def poop_url(match, nick='', chan='', bot=None):
    if '#' not in chan:
        return
    chan = chan.strip('#')
    if chan == "thesite":
        return
    url = urlnorm.normalize(match.group().encode('utf-8'))
    url = url.decode('utf-8')
    plink = parse_link(url)
    plink['nick'] = nick
    plink['chan'] = chan
    data = json.dumps(plink)
    jug.publish('links', data)

def url(inp, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(inp.group(0).lower().encode('utf-8'))
    print url
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        print history
        #insert_history(db, chan, url, nick)
        # inp = url.lower()
        for name in dict(history):
            if name.lower() in url:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def urlinput(message_data, bot):
    # Verify the database exists ("db" is assumed to be provided elsewhere,
    # e.g. at module scope or injected by the bot framework)
    db_init(db)
    # normalise the url
    url = urlnorm.normalize(message_data['re'].group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        # Load our primitives from message_data
        chan = message_data['channel']
        nick = message_data['nick']
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        if nick not in dict(history):
            return format_reply(history)

def urlparser(match, say=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if not (url.startswith("http://") or url.startswith("https://")):
        url = "http://" + url
    for x in ignored_urls:
        if x in url:
            return
    title = parse(url)
    if title == "fail":
        return
    title = http.unescape(title)
    realurl = http.get_url(url)
    if realurl == url:
        say("(Link) %s" % title)
        return
    else:
        say("(Link) %s [%s]" % (title, realurl))
        return

def urltranslatesa(message_data, bot):
    # get url and normalize
    url = urlnorm.normalize(message_data['re'].group().encode('utf-8'))
    if config.sa_user is None or config.sa_password is None:
        return
    login(config.sa_user, config.sa_password)
    thread = http.get_html(showthread, threadid=message_data['re'].group(1),
                           perpage='1', cookies=True)
    breadcrumbs = thread.xpath('//div[@class="breadcrumbs"]//a/text()')
    if not breadcrumbs:
        return
    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]
    poster = thread.xpath('//dt[contains(@class, "author")]//text()')[0]
    # 1 post per page => n_pages = n_posts
    num_posts = thread.xpath('//a[@title="Last page"]/@href')
    if not num_posts:
        num_posts = 1
    else:
        num_posts = int(num_posts[0].rsplit('=', 1)[1])
    return '\x02%s\x02 > \x02%s\x02 by \x02%s\x02, %s post%s' % (
        forum_title, thread_title, poster, num_posts,
        's' if num_posts > 1 else '')

def get_title(url):
    url = urlnorm.normalize(url.encode('utf-8'))
    url = url.decode('utf-8')
    # add http if it's missing
    if not url.startswith("http"):
        url = "http://" + url
    try:
        # get the title
        request = http.open(url)
        real_url = request.geturl()
        text = request.read()
        text = text.decode('utf8')
        match = titler.search(text)
        title = match.group(1)
    except Exception:
        return "Could not parse URL! Are you sure it's valid?"
    title = http.unescape(title)
    # if the url has been redirected, show us
    if real_url == url:
        return title
    else:
        return u"%s [%s]" % (title, real_url)

from __future__ import division, unicode_literals
from past.utils import old_div

import math
import time

from util import hook, urlnorm, timesince

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [urlnorm.normalize("http://google.com")]


def db_init(db):
    db.execute("create table if not exists urlhistory"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    db.execute("insert into urlhistory(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, time.time()))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urlhistory where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urlhistory where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

import urllib2
import re
import socket
import subprocess
import time

from util import hook, http, urlnorm, timesince
from bs4 import BeautifulSoup

socket.setdefaulttimeout(10)  # global setting

ignored_domains = [
    urlnorm.normalize("youtube.com"),
    urlnorm.normalize("google.com"),
    urlnorm.normalize("twitter.com"),
    urlnorm.normalize("forums.somethingawful.com"),
    urlnorm.normalize("youtu.be"),
]


def get_version():
    # p = subprocess.Popen(['git', 'log', '--oneline'], stdout=subprocess.PIPE)
    # stdout, _ = p.communicate()
    # p.wait()
    # revnumber = len(stdout.splitlines())
    # shorthash = stdout.split(None, 1)[0]
    # http.ua_skybot = 'Skybot/r%d %s (http://github.com/rmmh/skybot)' \
    #     % (revnumber, shorthash)
    pass

import math
import re
import time

from util import hook, urlnorm, timesince

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [
    urlnorm.normalize("http://google.com"),
]


def db_init(db):
    db.execute("create table if not exists urls"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    now = time.time()
    db.execute("insert into urls(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, now))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urls where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urls where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

def urltest(url, match):
    if not isinstance(url, str):
        return urlnorm.normalize(match.group("id").encode('utf-8'))
    else:
        return ""

import math
import re
import time

from util import hook, urlnorm, timesince

url_re = r'([a-zA-Z]+://|www\.)[^ ]+'

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [urlnorm.normalize("http://google.com")]


def db_init(db):
    db.execute("create table if not exists urlhistory"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    now = time.time()
    db.execute("insert into urlhistory(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, now))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urlhistory where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urlhistory where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

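# format_reply() is called by the url-history handlers earlier in this
# collection but is not part of these snippets. A minimal, hypothetical sketch
# built only from the helpers imported in this module (math, time, timesince);
# the real implementation's wording and details may differ.
def format_reply(history):
    if not history:
        return
    last_nick, last_time = history[0]
    ago = timesince.timesince(last_time)
    if len(history) == 1:
        return "%s linked that %s ago." % (last_nick, ago)
    hours = math.ceil((time.time() - history[-1][1]) / 3600)
    span = "%d hours" % hours if hours > 1 else "hour"
    return ("that url has been posted %d times in the past %s "
            "(last linked by %s %s ago)." % (len(history), span, last_nick, ago))
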