def urlparser(match, say=None, input=None, bot=None):
    inpo = input.params.replace(input.chan + " :", "")
    url = match.group().encode('utf-8')
    # strip a trailing quote/bracket that often wraps a pasted URL
    regexs = re.compile(r"(.+?)(\'|\"|\(|\)|\{|\}|\]|\[|\<|\>)")
    matchs = regexs.search(url)
    if matchs:
        url = matchs.group(1)
    url = urlnorm.normalize(url)
    url2 = urltest(url, match)
    if (input.conn.conf['autotitle'] != False
            and not (perm.isignored(input) or perm.isbot(input))
            and not (inpo.startswith(",t") or inpo.startswith(",title")
                     or inpo.startswith(",shor"))
            and "@" not in url):
        if url.startswith("www."):
            url = "http://" + url
        for x in ignored_urls:
            if x in url:
                return
        title = parse(url)
        title = multiwordReplace(title, wordDic)
        try:
            realurl = http.get_url(url)
        except Exception as msg:
            return "(Link) %s" % msg
        api_user = bot.config.get("api_keys", {}).get("bitly_user", None)
        api_key = bot.config.get("api_keys", {}).get("bitly_api", None)
        if api_key is None:
            return "error: no api key set"
        realurl = isgd(realurl)
        if realurl == url:
            return "(Link) %s" % title
        else:
            return "(Link) %s <=> %s" % (realurl, title)

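# isgd(), multiwordReplace() and wordDic are referenced above but not included
# in these snippets. A minimal, hypothetical isgd() shortener, modelled on the
# is.gd call made in the show_title() snippet further down; the real helper may
# differ.
def isgd(url):
    try:
        short = http.get('http://is.gd/create.php',
                         query_params={'format': 'simple', 'url': url})
    except Exception:
        return url
    # is.gd returns an error message instead of a URL on failure
    if short.lower().startswith('error'):
        return url
    return short
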
def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            parts = urlparse.urlsplit(url)
            conn = httplib.HTTPConnection(parts.hostname, timeout=10)
            path = parts.path
            if parts.query:
                path += "?" + parts.query
            conn.request('HEAD', path)
            resp = conn.getresponse()
            if not (200 <= resp.status < 400):
                return "Error: HEAD %s %s" % (resp.status, resp.reason)
            errors = check_response(dict(resp.getheaders()))
            if errors:
                return errors
        except Exception as e:
            return "Error: " + str(e)
        try:
            req = urllib2.urlopen(url)
        except Exception as e:
            return "Error: GET %s" % e
        errors = check_response(req.headers)
        if errors:
            return errors
        text = req.read(maxlen).decode('utf8', 'ignore')
        match = titler.search(text)
        if not match:
            return "Error: no title"
        rawtitle = match.group(1)
        title = repaste.decode_html(rawtitle)
        title = " ".join(title.split())
        return title

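# parse() above depends on a few module-level names that are not shown in these
# snippets: titler, maxlen and check_response(). Hypothetical definitions,
# consistent only with how they are used above; the original values and checks
# may differ.
titler = re.compile(r'(?si)<title[^>]*>(.*?)</title>')
maxlen = 100000  # read at most ~100 kB of the body when looking for a title


def check_response(headers):
    # return an error string for responses we don't want to fetch a title from;
    # returning None means the response looks fine
    content_type = headers.get('content-type', '').lower()
    if 'html' not in content_type:
        return "Error: not HTML (%s)" % (content_type or "no content type")
    if int(headers.get('content-length', 0) or 0) > 10 * maxlen:
        return "Error: response too large"
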
def show_title(match, nick='', chan='', say=None):
    matched = match.group().encode('utf-8')
    url = urlnorm.normalize(matched)
    host = Request(url).get_host()
    if nick not in ignore:
        page, response = http.get_html_and_response(url)
        message = ''
        if host not in ignore_hosts:
            parser = BeautifulSoup(response)
            title = parser.title.string.strip()
            if title:
                message = 'URL title: %s' % title
        # Shorten URLs that are over 80 characters.
        if len(url) >= 80:
            short_url = http.get(
                'http://is.gd/create.php',
                query_params={'format': 'simple', 'url': matched}
            )
            # Cheap error checking
            if 'error: please' not in short_url.lower():
                if message:
                    message += ' | Short URL: %s'
                else:
                    message = 'Short URL: %s'
                message = message % short_url
        if message:
            say(message)

def isup(inp):
    """isup -- uses isup.me to see if a site is up or not

    :type inp: str
    """
    # slightly overcomplicated, esoteric URL parsing
    scheme, auth, path, query, fragment = urllib.parse.urlsplit(inp.strip())
    domain = auth or path
    url = urlnorm.normalize(domain, assume_scheme="http")
    try:
        soup = http.get_soup('http://isup.me/' + domain)
    except http.HTTPError:
        return "Failed to get status."
    content = soup.find('div').text.strip()
    if "not just you" in content:
        return "It's not just you. {} looks \x02\x034down\x02\x0f from here!".format(url)
    elif "is up" in content:
        return "It's just you. {} is \x02\x033up\x02\x0f.".format(url)
    else:
        return "Huh? That doesn't look like a site on the interweb."

def urlinput(match, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        if nick not in dict(history):
            return format_reply(history)

def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            soup = BeautifulSoup.BeautifulSoup(http.get(url))
            return soup.title.string
        except Exception:
            return "fail"

def isup(inp):
    """isup -- uses isup.me to see if a site is up or not"""
    # slightly overcomplicated, esoteric URL parsing
    scheme, auth, path, query, fragment = urlparse.urlsplit(inp.strip())
    domain = auth.encode('utf-8') or path.encode('utf-8')
    url = urlnorm.normalize(domain, assume_scheme="http")
    try:
        soup = http.get_soup('http://isup.me/' + domain)
    except (http.HTTPError, http.URLError):
        return "Could not get status."

def show_title(match, nick='', chan='', say=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if url not in ignore and nick not in ignore:
        page = http.get_html(url)
        title = page.xpath('//title')
        if title and 'youtube' not in url and 'twitter' not in url:
            titleList = []
            for i in title:
                if i.text_content():
                    titleList.append(i.text_content().strip())
            if titleList:
                string = "URL title: %s" % ''.join(titleList)
                say(string)

def urlinput(match, nick='', chan='', db=None, bot=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    should_ignore = False
    for domain in ignored_domains:
        temp_url = url.replace('https://', '').replace('http://', '').replace('www.', '')
        if domain in temp_url:
            should_ignore = True
            break
    print url
    if not should_ignore:
        url = url.decode('utf-8')
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page)
        title = soup.title.find(text=True).strip()
        if title != "" and title is not None:
            return u"\x02Title:\x02 {}".format(title)

def urlinput(match, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(match.group())
    if url not in ignored_urls:
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        inp = match.string.lower()
        for name in dict(history):
            if name.lower() in inp:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def url(inp, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(inp.group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        line = inp.string.lower()
        for name in dict(history):
            if name.lower() in line:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def title(inp):
    "title <url> -- gets the title of a web page"
    url = urlnorm.normalize(inp.encode('utf-8'), assume_scheme="http")
    try:
        page = http.open(url)
        real_url = page.geturl()
        soup = BeautifulSoup(page.read())
    except (http.HTTPError, http.URLError):
        return "Could not fetch page."
    title = soup.find('title').contents[0]
    if not title:
        return "Could not find title."
    return u"{} [{}]".format(title, real_url)

def poop_url(match, nick='', chan='', bot=None):
    if '#' not in chan:
        return
    chan = chan.strip('#')
    if chan == "thesite":
        return
    url = urlnorm.normalize(match.group().encode('utf-8'))
    url = url.decode('utf-8')
    plink = parse_link(url)
    plink['nick'] = nick
    plink['chan'] = chan
    data = json.dumps(plink)
    jug.publish('links', data)

def url(inp, nick='', chan='', db=None, bot=None):
    db_init(db)
    url = urlnorm.normalize(inp.group(0).lower().encode('utf-8'))
    print url
    if url not in ignored_urls:
        url = url.decode('utf-8')
        history = get_history(db, chan, url)
        print history
        #insert_history(db, chan, url, nick)
        # inp = url.lower()
        for name in dict(history):
            if name.lower() in url:
                # person was probably quoting a line that had a link.
                # don't remind them.
                return
        if nick not in dict(history):
            return format_reply(history)

def urlinput(message_data, bot):
    # Verify the database exists ("db" is assumed to be provided elsewhere,
    # e.g. at module scope or injected by the bot framework)
    db_init(db)
    # normalise the url
    url = urlnorm.normalize(message_data['re'].group().encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        # Load our primitives from message_data
        chan = message_data['channel']
        nick = message_data['nick']
        history = get_history(db, chan, url)
        insert_history(db, chan, url, nick)
        if nick not in dict(history):
            return format_reply(history)

def urlparser(match, say=None):
    url = urlnorm.normalize(match.group().encode('utf-8'))
    if not (url.startswith("http://") or url.startswith("https://")):
        url = "http://" + url
    for x in ignored_urls:
        if x in url:
            return
    title = parse(url)
    if title == "fail":
        return
    title = http.unescape(title)
    realurl = http.get_url(url)
    if realurl == url:
        say("(Link) %s" % title)
        return
    else:
        say("(Link) %s [%s]" % (title, realurl))
        return

def urltranslatesa(message_data, bot):
    # get url and normalize
    url = urlnorm.normalize(message_data['re'].group().encode('utf-8'))
    if config.sa_user is None or config.sa_password is None:
        return
    login(config.sa_user, config.sa_password)
    thread = http.get_html(showthread, threadid=message_data['re'].group(1),
                           perpage='1', cookies=True)
    breadcrumbs = thread.xpath('//div[@class="breadcrumbs"]//a/text()')
    if not breadcrumbs:
        return
    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]
    poster = thread.xpath('//dt[contains(@class, "author")]//text()')[0]
    # 1 post per page => n_pages = n_posts
    num_posts = thread.xpath('//a[@title="Last page"]/@href')
    if not num_posts:
        num_posts = 1
    else:
        num_posts = int(num_posts[0].rsplit('=', 1)[1])
    return '\x02%s\x02 > \x02%s\x02 by \x02%s\x02, %s post%s' % (
        forum_title, thread_title, poster, num_posts,
        's' if num_posts > 1 else '')

def get_title(url):
    url = urlnorm.normalize(url.encode('utf-8'))
    url = url.decode('utf-8')
    # add http if it's missing
    if not url.startswith("http"):
        url = "http://" + url
    try:
        # get the title
        request = http.open(url)
        real_url = request.geturl()
        text = request.read()
        text = text.decode('utf8')
        match = titler.search(text)
        title = match.group(1)
    except Exception:
        return "Could not parse URL! Are you sure it's valid?"
    title = http.unescape(title)
    # if the url has been redirected, show us
    if real_url == url:
        return title
    else:
        return u"%s [%s]" % (title, real_url)

from __future__ import division, unicode_literals
from past.utils import old_div

import math
import time

from util import hook, urlnorm, timesince

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [urlnorm.normalize("http://google.com")]


def db_init(db):
    db.execute("create table if not exists urlhistory"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    db.execute("insert into urlhistory(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, time.time()))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urlhistory where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urlhistory where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

import urllib2
import re
import socket
import subprocess
import time

from util import hook, http, urlnorm, timesince
from bs4 import BeautifulSoup

socket.setdefaulttimeout(10)  # global setting

ignored_domains = [
    urlnorm.normalize("youtube.com"),
    urlnorm.normalize("google.com"),
    urlnorm.normalize("twitter.com"),
    urlnorm.normalize("forums.somethingawful.com"),
    urlnorm.normalize("youtu.be"),
]


def get_version():
    # p = subprocess.Popen(['git', 'log', '--oneline'], stdout=subprocess.PIPE)
    # stdout, _ = p.communicate()
    # p.wait()
    # revnumber = len(stdout.splitlines())
    # shorthash = stdout.split(None, 1)[0]
    # http.ua_skybot = 'Skybot/r%d %s (http://github.com/rmmh/skybot)' \
    #     % (revnumber, shorthash)
    pass

import math
import re
import time

from util import hook, urlnorm, timesince

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [
    urlnorm.normalize("http://google.com"),
]


def db_init(db):
    db.execute("create table if not exists urls"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    now = time.time()
    db.execute("insert into urls(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, now))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urls where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urls where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

def urltest(url, match):
    if not isinstance(url, str):
        return urlnorm.normalize(match.group("id").encode('utf-8'))
    else:
        return ""

import math
import re
import time

from util import hook, urlnorm, timesince

url_re = r'([a-zA-Z]+://|www\.)[^ ]+'

expiration_period = 60 * 60 * 24  # 1 day

ignored_urls = [urlnorm.normalize("http://google.com")]


def db_init(db):
    db.execute("create table if not exists urlhistory"
               "(chan, url, nick, time)")
    db.commit()


def insert_history(db, chan, url, nick):
    now = time.time()
    db.execute("insert into urlhistory(chan, url, nick, time) "
               "values(?,?,?,?)", (chan, url, nick, now))
    db.commit()


def get_history(db, chan, url):
    db.execute("delete from urlhistory where time < ?",
               (time.time() - expiration_period,))
    return db.execute("select nick, time from urlhistory where "
                      "chan=? and url=? order by time desc",
                      (chan, url)).fetchall()

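# format_reply() is called by the url-history handlers earlier in this
# collection but is not part of these snippets. A minimal, hypothetical sketch
# built only from the helpers imported in this module (math, time, timesince);
# the real implementation's wording and details may differ.
def format_reply(history):
    if not history:
        return
    last_nick, last_time = history[0]
    ago = timesince.timesince(last_time)
    if len(history) == 1:
        return "%s linked that %s ago." % (last_nick, ago)
    hours = math.ceil((time.time() - history[-1][1]) / 3600)
    span = "%d hours" % hours if hours > 1 else "hour"
    return ("that url has been posted %d times in the past %s "
            "(last linked by %s %s ago)." % (len(history), span, last_nick, ago))
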