def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    data = {}
    stat = ''
    for line in html.splitlines():
        line = line.strip()
        if not line: continue
        elif '<td headers="category">' in line: stat = 'category'
        elif '<td headers="status">' in line: stat = 'status'
        elif '<td headers="assignedto">' in line: stat = 'assigned to'
        elif '<td headers="os">' in line: data['os'] = striphtml(line).strip()
        elif '<td headers="severity">' in line: data['severity'] = striphtml(line).strip()
        elif '<td headers="priority">' in line: data['priority'] = striphtml(line).strip()
        elif '<td headers="reportedver">' in line: data['version'] = striphtml(line).strip()
        elif '<h2 class="summary' in line: stat = 'summary'
        elif '<a href="#comments">Comments (' in line: data['comments'] = line.split('(', 1)[1].split(')')[0]
        # stats
        elif stat:
            if stat in ['category', 'status', 'assigned to', 'summary']: data[stat] = striphtml(line).strip()
            stat = ''
    return data
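# A minimal, self-contained sketch (illustration only, not part of the module) of
# the marker/value pattern used above: a marker line sets 'stat' and the value is
# taken from the next non-empty line. The HTML snippet and the regex stand-in for
# striphtml are assumptions.
import re
_demo = '<td headers="status">\nOpen\n<td headers="os">Linux'
_data = {} ; _stat = ''
for _line in _demo.splitlines():
    _line = _line.strip()
    if not _line: continue
    elif '<td headers="status">' in _line: _stat = 'status'
    elif '<td headers="os">' in _line: _data['os'] = re.sub('<[^>]+>', '', _line).strip()
    elif _stat: _data[_stat] = re.sub('<[^>]+>', '', _line).strip() ; _stat = ''
assert _data == {'status': 'Open', 'os': 'Linux'}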
def handle(self, job):
    speed, event, url, depth, spiderspeed = job.args
    if not url: logging.error("no url provided") ; return
    if depth < 0: return
    if self.url.base not in url: logging.warn("skipping %s (%s)" % (url, self.url.base)) ; return
    if url in self.errors: logging.warn("skipping %s" % url) ; return
    try:
        if url not in self.urls: self.urls.append(url)
        page = Url(url)
        time.sleep(10 - spiderspeed)
        content = page.fetch()
        event.reply("fetched %s - %s - %s" % (url, len(content), content.status))
        try:
            urldata = UrlData(url, striphtml(content))
            if urldata.data.txt: urldata.save()
        except Exception as ex: handle_exception()
        for p in page.geturls():
            if p not in self.errors: self.put(6, event, p, depth-1, spiderspeed-1)
        if not self.queue.qsize(): self.stop()
    except urllib.error.URLError as ex: logging.warn("error fetching %s url: %s" % (url, str(ex)))
    except Exception as e:
        logging.warn("ERROR: Can't process url '%s' (%s)" % (url, e))
        self.errors.append(url)
        handle_exception()
        if len(self.errors) > 10: self.stop()
def get_tinyurl(url):
    """ grab a tinyurl. """
    res = get(url, namespace='tinyurl')
    logging.debug('tinyurl - cache - %s' % str(res))
    if res and res[0] == '[': return json.loads(res)
    postarray = [('submit', 'submit'), ('url', url)]
    postdata = urllib.parse.urlencode(postarray)
    req = urllib.request.Request(url=plugcfg.url, data=bytes(postdata, "utf-8"))
    req.add_header('User-agent', useragent())
    # HTTPError is a subclass of URLError, so it has to be caught first
    try: res = urllib.request.urlopen(req).readlines()
    except urllib.error.HTTPError as e: logging.warn('tinyurl - %s - HTTP error: %s' % (url, str(e))) ; return
    except urllib.error.URLError as e: logging.warn('tinyurl - %s - URLError: %s' % (url, str(e))) ; return
    except Exception as ex:
        if "DownloadError" in str(ex): logging.warn('tinyurl - %s - DownloadError: %s' % (url, str(ex)))
        else: handle_exception()
        return
    urls = []
    for line in res:
        l = str(line, "utf-8")
        if l.startswith('<blockquote><b>'): urls.append(striphtml(l.strip()).split('[Open')[0])
    if len(urls) == 3: urls.pop(0)
    set(url, json.dumps(urls), namespace='tinyurl')
    return urls
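# Sketch (illustration only): the cache stores the url list as a JSON array,
# which is why the "res and res[0] == '['" guard above is enough to detect a
# cache hit.
import json
_cached = json.dumps(["http://tinyurl.com/abc123"])
assert _cached[0] == '[' and json.loads(_cached) == ["http://tinyurl.com/abc123"]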
def geturls(txt):
    result = []
    if "http://" in txt or "https://" in txt:
        for item in re_url_match.findall(txt):
            logging.debug("web - raw - found url - %s" % item)
            try: txt = txt.replace(item, '')
            except ValueError: logging.error("web - invalid url - %s" % item)
            i = item
            if i.endswith('"'): i = i[:-1]
            if i.endswith('")'): i = i[:-2]
            result.append(i)
    return (result, striphtml(txt))
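# re_url_match is defined elsewhere in the module; a plausible stand-in (an
# assumption, not the original pattern) plus a quick demo of geturls():
import re
re_url_match = re.compile(r'https?://\S+')
# geturls('see http://example.org/page") here') would then return
# (['http://example.org/page'], 'see  here') - the trailing '")' is stripped.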
def input(self, html):
    self.scantime = time.time()
    words = striphtml(html)
    words = words.replace("\n", "").split()
    stats = StatDict()
    for w in words: stats.upitem(w)
    self.data.url = self.url.url
    self.data.words = stats
    self.save()
    logging.warn("%s words found for %s" % (len(stats), self.url.url))
    return stats
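# StatDict comes from the framework; a minimal stand-in (an assumption, for
# illustration) showing the upitem() contract used above:
from collections import Counter
class _StatDictSketch(Counter):
    def upitem(self, item): self[item] += 1
_stats = _StatDictSketch()
for _w in "a b a".split(): _stats.upitem(_w)
assert _stats == {'a': 2, 'b': 1}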
def formatevent(bot, ievent, channels, forwarded=False):
    m = {
        'datetime': datetime.now(),
        'separator': format_opt('separator'),
        'event_prefix': format_opt('event_prefix'),
        'network': bot.cfg.networkname,
        'nick': ievent.nick,
        'target': stripname(ievent.channel),
        'botname': bot.cfg.name,
        'txt': ievent.txt,
        'type': ievent.cbtype
    }
    m = LazyDict(m)
    if ievent.cmnd == 'PRIVMSG':
        if ievent.txt.startswith('\001ACTION'): m.txt = '* %s %s' % (m.nick, ievent.txt[7:-1].strip())
        else:
            if bot.type == "irc": m.txt = '<%s> %s' % (m.nick, striphtml(ievent.txt))
            elif not forwarded: m.txt = '<%s> %s' % (m.nick, bot.normalize(ievent.txt))
            else: m.txt = bot.normalize(ievent.txt)
    elif ievent.cmnd == 'NOTICE':
        m.target = ievent.arguments[0]
        m.txt = "-%s- %s" % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'TOPIC': m.txt = '%s changes topic to "%s"' % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'MODE':
        margs = ' '.join(ievent.arguments[1:])
        m.txt = '%s sets mode: %s' % (ievent.nick, margs)
    elif ievent.cmnd == 'JOIN': m.txt = '%s (%s) has joined %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd == 'KICK': m.txt = '%s was kicked by %s (%s)' % (ievent.arguments[1], ievent.nick, ievent.txt)
    elif ievent.cmnd == 'PART': m.txt = '%s (%s) has left %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd in ('QUIT', 'NICK'):
        if not ievent.user or not ievent.user.data.channels:
            logging.debug("chatlog - can't find joined channels for %s" % ievent.userhost)
            return m
        cmd = ievent.cmnd
        for c in ievent.user.data.channels:
            if [bot.cfg.name, c] in channels:
                if cmd == 'NICK': m.txt = '%s (%s) is now known as %s' % (ievent.nick, ievent.userhost, ievent.txt)
                else: m.txt = '%s (%s) has quit: %s' % (ievent.nick, ievent.userhost, ievent.txt)
                m.type = ievent.cmnd.lower()
                m.target = c
    elif ievent.cbtype == 'PRESENCE':
        if ievent.type == 'unavailable': m.txt = "%s left" % ievent.nick
        else: m.txt = "%s joined" % ievent.nick
    elif ievent.cbtype == "MESSAGE": m.txt = "<%s> %s" % (m.nick, ievent.txt)
    elif ievent.cbtype == "OUTPUT": m.txt = "<%s> %s" % (bot.cfg.nick, ievent.txt)
    return m
def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    if 'APPLICATION ERROR #1100' in html: raise BugTrackerNotFound('issue not found')
    data = {'notes': 0}
    stat = ''
    skip = 0
    for line in html.splitlines():
        line = line.strip().replace('\t', '')
        if skip > 0:
            skip -= 1
            continue
        elif not line: continue
        elif '<!-- Category -->' in line: skip = 1 ; stat = 'category'
        elif '<!-- Severity -->' in line: skip = 1 ; stat = 'severity'
        elif '<!-- Reproducibility -->' in line: skip = 1 ; stat = 'reproducibility'
        elif '<!-- Reporter -->' in line: skip = 3 ; stat = 'reporter'
        elif '<!-- Priority -->' in line: skip = 1 ; stat = 'priority'
        elif '<!-- Resolution -->' in line: skip = 1 ; stat = 'resolution'
        elif '<!-- Status -->' in line: skip = 3 ; stat = 'status'
        elif '<!-- Summary -->' in line: skip = 4 ; stat = 'summary'
        elif '<td class="bugnote-public">' in line: data['notes'] += 1
        # stats
        elif stat:
            if stat in ['category', 'severity', 'reproducibility', 'reporter', 'priority', 'resolution', 'status', 'summary']: data[stat] = striphtml(line)
            stat = ''
    return data
def comments(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    bugrss = geturl(self.comments_url(bugId))
    bugdom = xml.dom.minidom.parseString(bugrss)
    bugall = bugdom.getElementsByTagName('item')
    comments = []
    if bugall:
        for item in bugall:
            title = item.getElementsByTagName('title')[0].firstChild.nodeValue
            if 'comment added' in title:
                try: author = item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
                except IndexError: author = 'anonymous'
                comment = item.getElementsByTagName('description')[0].firstChild.nodeValue
                comment = striphtml(comment.replace('\n', ' ')).strip()
                # collapse runs of whitespace left over from the stripped html
                while '  ' in comment: comment = comment.replace('  ', ' ')
                comments.append('%s: %s' % (author, comment))
    return comments
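# Self-contained sketch (illustration only; the feed snippet is an assumption):
# minidom matches 'dc:creator' by its literal qualified name, and a missing
# creator element falls back to 'anonymous' via the IndexError handler above.
import xml.dom.minidom
_feed = '<rss xmlns:dc="http://purl.org/dc/elements/1.1/"><channel><item><title>comment added</title><dc:creator>alice</dc:creator></item></channel></rss>'
_item = xml.dom.minidom.parseString(_feed).getElementsByTagName('item')[0]
try: _author = _item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
except IndexError: _author = 'anonymous'
assert _author == 'alice'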
def markovlearnurl(url):
    """ learn a url. """
    lines = 0
    logging.warn("learning %s" % url)
    try: f = geturl2(url)
    except urllib.error.URLError as ex:
        logging.warn("error learning from url: %s" % url)
        return []
    for line in f.split("\n"):
        line = striphtml(line)
        # yield briefly every 10 lines so other threads get a chance to run
        if lines % 10 == 0: time.sleep(0.01)
        line = line.strip()
        if not line: continue
        markovtalk_learn(line)
        lines += 1
    logging.warn("learning %s done" % url)
    return lines
def markovlearnspider(target):
    logging.warn("starting spider learn on %s" % target)
    coll = PersistCollection(getdatadir() + os.sep + "spider" + os.sep + "data")
    if target.startswith("spider://"): target = target[9:]
    objs = coll.search("url", target)
    for obj in objs:
        if not obj.data or not obj.data.url: continue
        time.sleep(0.001)
        if target not in obj.data.url: continue
        logging.warn("url is %s" % obj.data.url)
        try:
            if obj.data and obj.data.txt:
                for line in obj.data.txt.split("\n"):
                    if line.count(";") > 1: continue
                    markovtalk_learn(striphtml(line))
        except: handle_exception()
def handle_imdb(bot, event):
    """ arguments: <query> - query the imdb database at http://www.deanclatworthy.com/imdb/ """
    if not event.rest: event.missing("<query>") ; return
    query = event.rest.strip()
    urlquery = query.replace(" ", "+")
    result = {}
    res = geturl2(URL % urlquery)
    if not res: event.reply("%s didn't return a result" % (URL % urlquery)) ; return
    try: rawresult = getjson().loads(res)
    except ValueError: event.reply("sorry, can't parse the data returned from the server: %s" % res) ; return
    # the API is limited to 30 queries per hour, so avoid querying it just for testing purposes
    # rawresult = {u'ukscreens': 0, u'rating': u'7.7', u'genres': u'Animation, Drama,Family,Fantasy,Music', u'title': u'Pinocchio', u'series': 0, u'country': u'USA', u'votes': u'23209', u'languages': u'English', u'stv': 0, u'year': None, u'usascreens': 0, u'imdburl': u'http://www.imdb.com/title/tt0032910/'}
    if not rawresult: event.reply("couldn't look up %s" % query) ; return
    if 'error' in rawresult: event.reply("%s" % rawresult['error']) ; return
    for key in list(rawresult.keys()):
        if not rawresult[key]: result[key] = "n/a"
        else: result[key] = rawresult[key]
    for key in list(result.keys()):
        try: result[key] = striphtml(decode_html_entities(str(rawresult[key])))
        except AttributeError: pass
    if "year" in list(rawresult.keys()): event.reply("%(title)s (%(country)s, %(year)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
    else: event.reply("%(title)s (%(country)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
def get_tinyurl(url):
    """ grab a tinyurl. """
    from tl.utils.url import enabled
    if not enabled: raise URLNotEnabled
    res = get(url, namespace="tinyurl")
    logging.debug("tinyurl - cache - %s" % str(res))
    if res and res[0] == "[": return json.loads(res)
    postarray = [("submit", "submit"), ("url", url)]
    postdata = urllib.parse.urlencode(postarray)
    postbytes = bytes(postdata, "utf-8")
    req = urllib.request.Request(url=posturl, data=postbytes)
    req.add_header("User-agent", useragent())
    # HTTPError is a subclass of URLError, so it has to be caught first
    try: res = urllib.request.urlopen(req).readlines()
    except urllib.error.HTTPError as e:
        logging.warn("tinyurl - %s - HTTP error: %s" % (url, str(e)))
        return
    except urllib.error.URLError as e:
        logging.warn("tinyurl - %s - URLError: %s" % (url, str(e)))
        return
    except Exception as ex:
        if "DownloadError" in str(ex): logging.warn("tinyurl - %s - DownloadError: %s" % (url, str(ex)))
        else: handle_exception()
        return
    urls = []
    for line in res:
        bline = str(line, "utf-8")
        if bline.startswith("<blockquote><b>"): urls.append(striphtml(bline.strip()).split("[Open")[0])
    if len(urls) == 3: urls.pop(0)
    set(url, json.dumps(urls), namespace="tinyurl")
    return urls
def handle_overflowanswers(bot, event):
    result = []
    for aa in getanswers(event.rest):
        a = LazyDict(aa)
        result.append("%s - %s" % (a.owner['display_name'], striphtml(a.body)))
    event.reply("answers for %s: " % event.rest, result)