def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    data = {}
    stat = ''
    for line in html.splitlines():
        line = line.strip()
        if not line: continue
        elif '<td headers="category">' in line: stat = 'category'
        elif '<td headers="status">' in line: stat = 'status'
        elif '<td headers="assignedto">' in line: stat = 'assigned to'
        elif '<td headers="os">' in line: data['os'] = striphtml(line).strip()
        elif '<td headers="severity">' in line: data['severity'] = striphtml(line).strip()
        elif '<td headers="priority">' in line: data['priority'] = striphtml(line).strip()
        elif '<td headers="reportedver">' in line: data['version'] = striphtml(line).strip()
        elif '<h2 class="summary' in line: stat = 'summary'
        elif '<a href="#comments">Comments (' in line: data['comments'] = line.split('(', 1)[1].split(')')[0]
        # stats
        elif stat:
            if stat in ['category', 'status', 'assigned to', 'summary']:
                data[stat] = striphtml(line).strip()
            stat = ''
    return data
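# striphtml() is used throughout these snippets but never defined in this
# section. A minimal stand-in, assuming it only removes tags (the real helper
# may also handle comments or entities), could look like this:
import re

def striphtml(txt):
    # drop anything that looks like an HTML/XML tag, keep the text in between
    return re.sub(r'<[^>]*>', '', txt)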
def handle(self, event, url, depth, speed=5):
    if depth < 0: return
    if not self.url.base in url:
        logging.warn("skipping %s (%s)" % (url, self.url.base))
        return
    if url in self.errors:
        logging.warn("skipping %s" % url)
        return
    try:
        if url not in self.urls: self.urls.append(url)
        page = Url(url)
        time.sleep(10 - speed)
        content = page.fetch()
        event.reply("fetched %s - %s - %s" % (url, len(content), content.status))
        try:
            urldata = UrlData(url, striphtml(content))
            if urldata.data.txt: urldata.save()
        except Exception, ex:
            handle_exception()
        for p in page.geturls():
            if not p in self.errors: self.put(6, event, p, depth - 1, speed - 1)
        if not self.queue.qsize(): self.stop()
    except Exception, e:
        logging.warn("ERROR: Can't process url '%s' (%s)" % (url, e))
        self.errors.append(url)
        handle_exception()
    if len(self.errors) > 10: self.stop()
def handle_imdb(bot, event):
    """ arguments: <query> - query the imdb database at http://www.deanclatworthy.com/imdb/ """
    if not event.rest:
        event.missing("<query>")
        return
    query = event.rest.strip()
    urlquery = query.replace(" ", "+")
    result = {}
    rawresult = getjson().loads(geturl2(URL % urlquery))
    # the API is limited to 30 queries per hour, so avoid querying it just for testing purposes
    # rawresult = {u'ukscreens': 0, u'rating': u'7.7', u'genres': u'Animation, Drama,Family,Fantasy,Music', u'title': u'Pinocchio', u'series': 0, u'country': u'USA', u'votes': u'23209', u'languages': u'English', u'stv': 0, u'year': None, u'usascreens': 0, u'imdburl': u'http://www.imdb.com/title/tt0032910/'}
    if not rawresult:
        event.reply("couldn't look up %s" % query)
        return
    if 'error' in rawresult:
        event.reply("%s" % rawresult['error'])
        return
    for key in rawresult.keys():
        if not rawresult[key]: result[key] = u"n/a"
        else: result[key] = rawresult[key]
    for key in result.keys():
        try: result[key] = striphtml(decode_html_entities(rawresult[key]))
        except AttributeError: pass  # non-string values (ints, None) can't be stripped
    if "year" in rawresult.keys():
        event.reply("%(title)s (%(country)s, %(year)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
    else:
        event.reply("%(title)s (%(country)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
def geturls(txt):
    result = []
    if "http://" in txt or "https://" in txt:
        for item in re_url_match.findall(txt):
            logging.debug("web - raw - found url - %s" % item)
            try: txt = txt.replace(item, '')
            except ValueError: logging.error("web - invalid url - %s" % item)
            i = item
            if i.endswith('"'): i = i[:-1]
            if i.endswith('")'): i = i[:-2]
            result.append(i)
    return (result, striphtml(txt))
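# re_url_match, referenced by geturls() above, is not defined in this section.
# A plausible minimal definition (an assumption; the real pattern is likely
# stricter about what may follow the scheme) would be:
import re

re_url_match = re.compile(r'https?://[^\s]+')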
def input(self, html):
    self.scantime = time.time()
    words = striphtml(html)
    words = words.replace("\n", "").split()
    stats = StatDict()
    for w in words: stats.upitem(w)
    self.data.url = self.url.url
    self.data.words = stats
    self.save()
    logging.warn("%s words found for %s" % (len(stats), self.url.url))
    return stats
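# StatDict is not defined in this section. Judging from the calls above
# (upitem() bumps a counter, len() yields the number of distinct words), a
# minimal sketch -- the real class may offer more -- might be:
class StatDict(dict):

    def upitem(self, item):
        # increment the count for item, starting at zero for unseen items
        self[item] = self.get(item, 0) + 1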
def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    if 'APPLICATION ERROR #1100' in html: raise BugTrackerNotFound('issue not found')
    data = {'notes': 0}
    stat = ''
    skip = 0
    for line in html.splitlines():
        line = line.strip().replace('\t', '')
        if skip > 0:
            skip -= 1
            continue
        elif not line: continue
        elif '<!-- Category -->' in line:
            skip = 1
            stat = 'category'
        elif '<!-- Severity -->' in line:
            skip = 1
            stat = 'severity'
        elif '<!-- Reproducibility -->' in line:
            skip = 1
            stat = 'reproducibility'
        elif '<!-- Reporter -->' in line:
            skip = 3
            stat = 'reporter'
        elif '<!-- Priority -->' in line:
            skip = 1
            stat = 'priority'
        elif '<!-- Resolution -->' in line:
            skip = 1
            stat = 'resolution'
        elif '<!-- Status -->' in line:
            skip = 3
            stat = 'status'
        elif '<!-- Summary -->' in line:
            skip = 4
            stat = 'summary'
        elif '<td class="bugnote-public">' in line: data['notes'] += 1
        # stats
        elif stat:
            if stat in ['category', 'severity', 'reproducibility', 'reporter', 'priority', 'resolution', 'status', 'summary']:
                data[stat] = striphtml(line)
            stat = ''
    return data
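# BugTrackerNotFound is raised above but not defined in this section; the
# minimal definition it implies is a plain Exception subclass:
class BugTrackerNotFound(Exception):
    pass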
def formatevent(bot, ievent, channels, forwarded=False):
    m = {
        'datetime': datetime.now(),
        'separator': format_opt('separator'),
        'event_prefix': format_opt('event_prefix'),
        'network': bot.cfg.networkname,
        'nick': ievent.nick,
        'target': stripname(ievent.channel),
        'botname': bot.cfg.name,
        'txt': ievent.txt,
        'type': ievent.cbtype
    }
    m = LazyDict(m)
    if ievent.cmnd == 'PRIVMSG':
        if ievent.txt.startswith('\001ACTION'):
            m.txt = '* %s %s' % (m.nick, ievent.txt[7:-1].strip())
        else:
            if bot.type == "irc": m.txt = '<%s> %s' % (m.nick, striphtml(ievent.txt))
            elif not forwarded: m.txt = '<%s> %s' % (m.nick, bot.normalize(ievent.txt))
            else: m.txt = bot.normalize(ievent.txt)
    elif ievent.cmnd == 'NOTICE':
        m.target = ievent.arguments[0]
        m.txt = "-%s- %s" % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'TOPIC':
        m.txt = '%s changes topic to "%s"' % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'MODE':
        margs = ' '.join(ievent.arguments[1:])
        m.txt = '%s sets mode: %s' % (ievent.nick, margs)
    elif ievent.cmnd == 'JOIN':
        m.txt = '%s (%s) has joined %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd == 'KICK':
        m.txt = '%s was kicked by %s (%s)' % (ievent.arguments[1], ievent.nick, ievent.txt)
    elif ievent.cmnd == 'PART':
        m.txt = '%s (%s) has left %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd in ('QUIT', 'NICK'):
        if not ievent.user or not ievent.user.data.channels:
            logging.debug("chatlog - can't find joined channels for %s" % ievent.userhost)
            return m
        cmd = ievent.cmnd
        for c in ievent.user.data.channels:
            if [bot.cfg.name, c] in channels:
                if cmd == 'NICK': m.txt = '%s (%s) is now known as %s' % (ievent.nick, ievent.userhost, ievent.txt)
                else: m.txt = '%s (%s) has quit: %s' % (ievent.nick, ievent.userhost, ievent.txt)
                m.type = ievent.cmnd.lower()
                m.target = c
    elif ievent.cbtype == 'PRESENCE':
        if ievent.type == 'unavailable': m.txt = "%s left" % ievent.nick
        else: m.txt = "%s joined" % ievent.nick
    elif ievent.cbtype == "MESSAGE": m.txt = "<%s> %s" % (m.nick, ievent.txt)
    elif ievent.cbtype == "OUTPUT": m.txt = "<%s> %s" % (bot.cfg.nick, ievent.txt)
    return m
def markovlearnspider(target):
    logging.warn("starting spider learn on %s" % target)
    coll = PersistCollection(getdatadir() + os.sep + 'spider' + os.sep + "data")
    if target.startswith("spider://"): target = target[9:]
    objs = coll.search('url', target)
    for obj in objs:
        if not (obj.data and obj.data.url):
            print "skip - no url"
            continue
        time.sleep(0.001)
        if target not in obj.data.url: continue
        logging.warn("url is %s" % obj.data.url)
        try:
            if obj.data and obj.data.txt:
                for line in obj.data.txt.split("\n"):
                    if line.count(";") > 1: continue
                    markovtalk_learn(striphtml(line))
        except: handle_exception()
def markovlearnurl(url):
    """ learn a url """
    lines = 0
    logging.warn('learning %s' % url)
    try:
        f = geturl(url)
        for line in f.split('\n'):
            line = striphtml(line)
            if lines % 10 == 0: time.sleep(0.01)
            line = line.strip()
            if not line: continue
            markovtalk_learn(line)
            lines += 1
    except Exception, e: logging.error(str(e))
    logging.warn('learning %s done' % url)
    return lines
def handle_imdb(bot, event):
    """ arguments: <query> - query the imdb database at http://www.deanclatworthy.com/imdb/ """
    if not event.rest:
        event.missing("<query>")
        return
    query = event.rest.strip()
    urlquery = query.replace(" ", "+")
    result = {}
    rawresult = getjson().loads(geturl2(URL % urlquery))
    # the API is limited to 30 queries per hour, so avoid querying it just for testing purposes
    # rawresult = {u'ukscreens': 0, u'rating': u'7.7', u'genres': u'Animation, Drama,Family,Fantasy,Music', u'title': u'Pinocchio', u'series': 0, u'country': u'USA', u'votes': u'23209', u'languages': u'English', u'stv': 0, u'year': None, u'usascreens': 0, u'imdburl': u'http://www.imdb.com/title/tt0032910/'}
    if not rawresult:
        event.reply("couldn't look up %s" % query)
        return
    if 'error' in rawresult:
        event.reply("%s" % rawresult['error'])
        return
    for key in rawresult.keys():
        if not rawresult[key]: result[key] = u"n/a"
        else: result[key] = rawresult[key]
    for key in result.keys():
        try: result[key] = striphtml(decode_html_entities(rawresult[key]))
        except AttributeError: pass  # non-string values (ints, None) can't be stripped
    event.reply("%(title)s (%(country)s, %(year)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
def comments(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    bugrss = geturl(self.comments_url(bugId))
    bugdom = xml.dom.minidom.parseString(bugrss)
    bugall = bugdom.getElementsByTagName('item')
    comments = []
    if bugall:
        for item in bugall:
            title = item.getElementsByTagName('title')[0].firstChild.nodeValue
            if 'comment added' in title:
                try: author = item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
                except IndexError: author = 'anonymous'
                comment = item.getElementsByTagName('description')[0].firstChild.nodeValue
                comment = striphtml(comment.replace('\n', ' ')).strip()
                while '  ' in comment: comment = comment.replace('  ', ' ')
                comments.append('%s: %s' % (author, comment))
    return comments
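# A self-contained illustration of the minidom lookups used in comments(),
# run against a hand-written (hypothetical) feed fragment. minidom is used
# without namespace processing, so 'dc:creator' matches as a plain tag name
# and entities in the description decode to the raw HTML that striphtml()
# later removes:
import xml.dom.minidom

sample = """<rss><channel><item>
<title>comment added</title>
<dc:creator>alice</dc:creator>
<description>&lt;p&gt;looks fixed&lt;/p&gt;</description>
</item></channel></rss>"""

dom = xml.dom.minidom.parseString(sample)
item = dom.getElementsByTagName('item')[0]
author = item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
comment = item.getElementsByTagName('description')[0].firstChild.nodeValue
print '%s: %s' % (author, comment)  # alice: <p>looks fixed</p>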
def handle_imdb(bot, event):
    if not event.rest:
        event.missing("<query>")
        return
    query = build_query(scan_query(event.rest.strip()))
    result = {}
    rawresult = do_imdb_api_query(query)
    # the API is limited to 30 queries per hour, so avoid querying it just for
    # testing purposes
    # rawresult = {u'ukscreens': 0, u'rating': u'7.7', u'genres': u'Animation, Drama,Family,Fantasy,Music', u'title': u'Pinocchio', u'series': 0, u'country': u'USA', u'votes': u'23209', u'languages': u'English', u'stv': 0, u'year': None, u'usascreens': 0, u'imdburl': u'http://www.imdb.com/title/tt0032910/'}
    if not rawresult:
        event.reply("Couldn't look up %s" % query)
        return
    if 'error' in rawresult:
        event.reply("%s" % rawresult['error'])
        return
    for key in rawresult.keys():
        if not rawresult[key]: result[key] = u"n/a"
        else: result[key] = rawresult[key]
    logging.warn("%s %s" % (rawresult, result))
    for key in result.keys():
        try: result[key] = striphtml(decode_html_entities(rawresult[key]))
        except AttributeError:
            # if the value is not a string, ignore the error and keep going
            pass
    event.reply("%(title)s (%(country)s, %(year)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
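# decode_html_entities is not defined in this section. On Python 2 a minimal
# stand-in (an assumption; the bot's own helper may cover more cases) could be:
from HTMLParser import HTMLParser

def decode_html_entities(txt):
    # unescape() resolves both named and numeric character references
    return HTMLParser().unescape(txt)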
def get_tinyurl(url):
    # header reconstructed: the original snippet begins mid-list; the function
    # name is taken from the get_tinyurl() call in handle_tinyurl() below.
    postarray = [
        ('url', url),
    ]
    postdata = urllib.urlencode(postarray)
    req = urllib2.Request(url=plugcfg.url, data=postdata)
    req.add_header('User-agent', useragent())
    try: res = urllib2.urlopen(req).readlines()
    except urllib2.HTTPError, e:
        # HTTPError is a subclass of URLError, so it has to be caught first
        logging.warn('tinyurl - %s - HTTP error: %s' % (url, str(e)))
        return
    except urllib2.URLError, e:
        logging.warn('tinyurl - %s - URLError: %s' % (url, str(e)))
        return
    except Exception, ex:
        if "DownloadError" in str(ex): logging.warn('tinyurl - %s - DownloadError: %s' % (url, str(ex)))
        else: handle_exception()
        return
    urls = []
    for line in res:
        if line.startswith('<blockquote><b>'): urls.append(striphtml(line.strip()).split('[Open')[0])
    if len(urls) == 3: urls.pop(0)
    set(url, json.dumps(urls), namespace='tinyurl')
    return urls

def handle_tinyurl(bot, ievent):
    """ get tinyurl from provided url. """
    if not ievent.rest and (not urlcache.has_key(bot.name) or not urlcache[bot.name].has_key(ievent.target)):
        ievent.missing('<url>')
        return
    elif not ievent.rest: url = urlcache[bot.name][ievent.target]
    else: url = ievent.rest
    url = valid_url(url)
    if not url:
        ievent.reply('invalid or bad URL')
        return
    tinyurl = get_tinyurl(url)
    if tinyurl: ievent.reply(' .. '.join(tinyurl))
def handle_overflowanswers(bot, event):
    result = []
    for aa in getanswers(event.rest):
        a = LazyDict(aa)
        result.append("%s - %s" % (a.owner['display_name'], striphtml(a.body)))
    event.reply("answers for %s: " % event.rest, result)