def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    data = {}
    stat = ''
    for line in html.splitlines():
        line = line.strip()
        if not line: continue
        elif '<td headers="category">' in line: stat = 'category'
        elif '<td headers="status">' in line: stat = 'status'
        elif '<td headers="assignedto">' in line: stat = 'assigned to'
        elif '<td headers="os">' in line: data['os'] = striphtml(line).strip()
        elif '<td headers="severity">' in line: data['severity'] = striphtml(line).strip()
        elif '<td headers="priority">' in line: data['priority'] = striphtml(line).strip()
        elif '<td headers="reportedver">' in line: data['version'] = striphtml(line).strip()
        elif '<h2 class="summary' in line: stat = 'summary'
        elif '<a href="#comments">Comments (' in line: data['comments'] = line.split('(', 1)[1].split(')')[0]
        # stats
        elif stat:
            if stat in ['category', 'status', 'assigned to', 'summary']: data[stat] = striphtml(line).strip()
            stat = ''
    return data
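# A minimal, self-contained sketch (illustration only, not part of the module) of
# the marker/value pattern used above: a marker line sets 'stat' and the value is
# taken from the next non-empty line. The HTML snippet and the regex stand-in for
# striphtml are assumptions.
import re
_demo = '<td headers="status">\nOpen\n<td headers="os">Linux'
_data = {} ; _stat = ''
for _line in _demo.splitlines():
    _line = _line.strip()
    if not _line: continue
    elif '<td headers="status">' in _line: _stat = 'status'
    elif '<td headers="os">' in _line: _data['os'] = re.sub('<[^>]+>', '', _line).strip()
    elif _stat: _data[_stat] = re.sub('<[^>]+>', '', _line).strip() ; _stat = ''
assert _data == {'status': 'Open', 'os': 'Linux'}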
def handle(self, job):
    speed, event, url, depth, spiderspeed = job.args
    if not url: logging.error("no url provided") ; return
    if depth < 0: return
    if self.url.base not in url: logging.warn("skipping %s (%s)" % (url, self.url.base)) ; return
    if url in self.errors: logging.warn("skipping %s" % url) ; return
    try:
        if url not in self.urls: self.urls.append(url)
        page = Url(url)
        time.sleep(10 - spiderspeed)
        content = page.fetch()
        event.reply("fetched %s - %s - %s" % (url, len(content), content.status))
        try:
            urldata = UrlData(url, striphtml(content))
            if urldata.data.txt: urldata.save()
        except Exception as ex: handle_exception()
        for p in page.geturls():
            if p not in self.errors: self.put(6, event, p, depth-1, spiderspeed-1)
        if not self.queue.qsize(): self.stop()
    except urllib.error.URLError as ex: logging.warn("error fetching %s url: %s" % (url, str(ex)))
    except Exception as e:
        logging.warn("ERROR: Can't process url '%s' (%s)" % (url, e))
        self.errors.append(url)
        handle_exception()
        if len(self.errors) > 10: self.stop()
def get_tinyurl(url):
    """ grab a tinyurl. """
    res = get(url, namespace='tinyurl')
    logging.debug('tinyurl - cache - %s' % str(res))
    if res and res[0] == '[': return json.loads(res)
    postarray = [('submit', 'submit'), ('url', url)]
    postdata = urllib.parse.urlencode(postarray)
    req = urllib.request.Request(url=plugcfg.url, data=bytes(postdata, "utf-8"))
    req.add_header('User-agent', useragent())
    # HTTPError is a subclass of URLError, so it has to be caught first
    try: res = urllib.request.urlopen(req).readlines()
    except urllib.error.HTTPError as e: logging.warn('tinyurl - %s - HTTP error: %s' % (url, str(e))) ; return
    except urllib.error.URLError as e: logging.warn('tinyurl - %s - URLError: %s' % (url, str(e))) ; return
    except Exception as ex:
        if "DownloadError" in str(ex): logging.warn('tinyurl - %s - DownloadError: %s' % (url, str(ex)))
        else: handle_exception()
        return
    urls = []
    for line in res:
        l = str(line, "utf-8")
        if l.startswith('<blockquote><b>'): urls.append(striphtml(l.strip()).split('[Open')[0])
    if len(urls) == 3: urls.pop(0)
    set(url, json.dumps(urls), namespace='tinyurl')
    return urls
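# Sketch (illustration only): the cache stores the url list as a JSON array,
# which is why the "res and res[0] == '['" guard above is enough to detect a
# cache hit.
import json
_cached = json.dumps(["http://tinyurl.com/abc123"])
assert _cached[0] == '[' and json.loads(_cached) == ["http://tinyurl.com/abc123"]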
def geturls(txt):
    result = []
    if "http://" in txt or "https://" in txt:
        for item in re_url_match.findall(txt):
            logging.debug("web - raw - found url - %s" % item)
            try: txt = txt.replace(item, '')
            except ValueError: logging.error("web - invalid url - %s" % item)
            i = item
            if i.endswith('"'): i = i[:-1]
            if i.endswith('")'): i = i[:-2]
            result.append(i)
    return (result, striphtml(txt))
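# re_url_match is defined elsewhere in the module; a plausible stand-in (an
# assumption, not the original pattern) plus a quick demo of geturls():
import re
re_url_match = re.compile(r'https?://\S+')
# geturls('see http://example.org/page") here') would then return
# (['http://example.org/page'], 'see  here') - the trailing '")' is stripped.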
def input(self, html):
    self.scantime = time.time()
    words = striphtml(html)
    words = words.replace("\n", "").split()
    stats = StatDict()
    for w in words: stats.upitem(w)
    self.data.url = self.url.url
    self.data.words = stats
    self.save()
    logging.warn("%s words found for %s" % (len(stats), self.url.url))
    return stats
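# StatDict comes from the framework; a minimal stand-in (an assumption, for
# illustration) showing the upitem() contract used above:
from collections import Counter
class _StatDictSketch(Counter):
    def upitem(self, item): self[item] += 1
_stats = _StatDictSketch()
for _w in "a b a".split(): _stats.upitem(_w)
assert _stats == {'a': 2, 'b': 1}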
def formatevent(bot, ievent, channels, forwarded=False):
    m = {
        'datetime': datetime.now(),
        'separator': format_opt('separator'),
        'event_prefix': format_opt('event_prefix'),
        'network': bot.cfg.networkname,
        'nick': ievent.nick,
        'target': stripname(ievent.channel),
        'botname': bot.cfg.name,
        'txt': ievent.txt,
        'type': ievent.cbtype
    }
    m = LazyDict(m)
    if ievent.cmnd == 'PRIVMSG':
        if ievent.txt.startswith('\001ACTION'): m.txt = '* %s %s' % (m.nick, ievent.txt[7:-1].strip())
        else:
            if bot.type == "irc": m.txt = '<%s> %s' % (m.nick, striphtml(ievent.txt))
            elif not forwarded: m.txt = '<%s> %s' % (m.nick, bot.normalize(ievent.txt))
            else: m.txt = bot.normalize(ievent.txt)
    elif ievent.cmnd == 'NOTICE':
        m.target = ievent.arguments[0]
        m.txt = "-%s- %s" % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'TOPIC': m.txt = '%s changes topic to "%s"' % (ievent.nick, ievent.txt)
    elif ievent.cmnd == 'MODE':
        margs = ' '.join(ievent.arguments[1:])
        m.txt = '%s sets mode: %s' % (ievent.nick, margs)
    elif ievent.cmnd == 'JOIN': m.txt = '%s (%s) has joined %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd == 'KICK': m.txt = '%s was kicked by %s (%s)' % (ievent.arguments[1], ievent.nick, ievent.txt)
    elif ievent.cmnd == 'PART': m.txt = '%s (%s) has left %s' % (ievent.nick, ievent.userhost, ievent.channel)
    elif ievent.cmnd in ('QUIT', 'NICK'):
        if not ievent.user or not ievent.user.data.channels:
            logging.debug("chatlog - can't find joined channels for %s" % ievent.userhost)
            return m
        cmd = ievent.cmnd
        for c in ievent.user.data.channels:
            if [bot.cfg.name, c] in channels:
                if cmd == 'NICK': m.txt = '%s (%s) is now known as %s' % (ievent.nick, ievent.userhost, ievent.txt)
                else: m.txt = '%s (%s) has quit: %s' % (ievent.nick, ievent.userhost, ievent.txt)
                m.type = ievent.cmnd.lower()
                m.target = c
    elif ievent.cbtype == 'PRESENCE':
        if ievent.type == 'unavailable': m.txt = "%s left" % ievent.nick
        else: m.txt = "%s joined" % ievent.nick
    elif ievent.cbtype == "MESSAGE": m.txt = "<%s> %s" % (m.nick, ievent.txt)
    elif ievent.cbtype == "OUTPUT": m.txt = "<%s> %s" % (bot.cfg.nick, ievent.txt)
    return m
def show(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    html = geturl2(self.show_url(bugId))
    if 'APPLICATION ERROR #1100' in html: raise BugTrackerNotFound('issue not found')
    data = {'notes': 0}
    stat = ''
    skip = 0
    for line in html.splitlines():
        line = line.strip().replace('\t', '')
        if skip > 0:
            skip -= 1
            continue
        elif not line: continue
        elif '<!-- Category -->' in line: skip = 1 ; stat = 'category'
        elif '<!-- Severity -->' in line: skip = 1 ; stat = 'severity'
        elif '<!-- Reproducibility -->' in line: skip = 1 ; stat = 'reproducibility'
        elif '<!-- Reporter -->' in line: skip = 3 ; stat = 'reporter'
        elif '<!-- Priority -->' in line: skip = 1 ; stat = 'priority'
        elif '<!-- Resolution -->' in line: skip = 1 ; stat = 'resolution'
        elif '<!-- Status -->' in line: skip = 3 ; stat = 'status'
        elif '<!-- Summary -->' in line: skip = 4 ; stat = 'summary'
        elif '<td class="bugnote-public">' in line: data['notes'] += 1
        # stats
        elif stat:
            if stat in ['category', 'severity', 'reproducibility', 'reporter', 'priority', 'resolution', 'status', 'summary']: data[stat] = striphtml(line)
            stat = ''
    return data
def comments(self, bugId):
    assert bugId.isdigit(), "bug id has to be a number"
    bugrss = geturl(self.comments_url(bugId))
    bugdom = xml.dom.minidom.parseString(bugrss)
    bugall = bugdom.getElementsByTagName('item')
    comments = []
    if bugall:
        for item in bugall:
            title = item.getElementsByTagName('title')[0].firstChild.nodeValue
            if 'comment added' in title:
                try: author = item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
                except IndexError: author = 'anonymous'
                comment = item.getElementsByTagName('description')[0].firstChild.nodeValue
                comment = striphtml(comment.replace('\n', ' ')).strip()
                # collapse runs of whitespace left over from the stripped html
                while '  ' in comment: comment = comment.replace('  ', ' ')
                comments.append('%s: %s' % (author, comment))
    return comments
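# Self-contained sketch (illustration only; the feed snippet is an assumption):
# minidom matches 'dc:creator' by its literal qualified name, and a missing
# creator element falls back to 'anonymous' via the IndexError handler above.
import xml.dom.minidom
_feed = '<rss xmlns:dc="http://purl.org/dc/elements/1.1/"><channel><item><title>comment added</title><dc:creator>alice</dc:creator></item></channel></rss>'
_item = xml.dom.minidom.parseString(_feed).getElementsByTagName('item')[0]
try: _author = _item.getElementsByTagName('dc:creator')[0].firstChild.nodeValue
except IndexError: _author = 'anonymous'
assert _author == 'alice'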
def markovlearnurl(url):
    """ learn a url. """
    lines = 0
    logging.warn("learning %s" % url)
    try: f = geturl2(url)
    except urllib.error.URLError as ex:
        logging.warn("error learning from url: %s" % url)
        return []
    for line in f.split("\n"):
        line = striphtml(line)
        # yield briefly every 10 lines so other threads get a chance to run
        if lines % 10 == 0: time.sleep(0.01)
        line = line.strip()
        if not line: continue
        markovtalk_learn(line)
        lines += 1
    logging.warn("learning %s done" % url)
    return lines
def markovlearnspider(target):
    logging.warn("starting spider learn on %s" % target)
    coll = PersistCollection(getdatadir() + os.sep + "spider" + os.sep + "data")
    if target.startswith("spider://"): target = target[9:]
    objs = coll.search("url", target)
    for obj in objs:
        if not obj.data or not obj.data.url: continue
        time.sleep(0.001)
        if target not in obj.data.url: continue
        logging.warn("url is %s" % obj.data.url)
        try:
            if obj.data and obj.data.txt:
                for line in obj.data.txt.split("\n"):
                    if line.count(";") > 1: continue
                    markovtalk_learn(striphtml(line))
        except: handle_exception()
def handle_imdb(bot, event):
    """ arguments: <query> - query the imdb database at http://www.deanclatworthy.com/imdb/ """
    if not event.rest: event.missing("<query>") ; return
    query = event.rest.strip()
    urlquery = query.replace(" ", "+")
    result = {}
    res = geturl2(URL % urlquery)
    if not res: event.reply("%s didn't return a result" % (URL % urlquery)) ; return
    try: rawresult = getjson().loads(res)
    except ValueError: event.reply("sorry, can't parse the data returned from the server: %s" % res) ; return
    # the API is limited to 30 queries per hour, so avoid querying it just for testing purposes
    # rawresult = {u'ukscreens': 0, u'rating': u'7.7', u'genres': u'Animation, Drama,Family,Fantasy,Music', u'title': u'Pinocchio', u'series': 0, u'country': u'USA', u'votes': u'23209', u'languages': u'English', u'stv': 0, u'year': None, u'usascreens': 0, u'imdburl': u'http://www.imdb.com/title/tt0032910/'}
    if not rawresult: event.reply("couldn't look up %s" % query) ; return
    if 'error' in rawresult: event.reply("%s" % rawresult['error']) ; return
    for key in list(rawresult.keys()):
        if not rawresult[key]: result[key] = "n/a"
        else: result[key] = rawresult[key]
    for key in list(result.keys()):
        try: result[key] = striphtml(decode_html_entities(str(rawresult[key])))
        except AttributeError: pass
    if "year" in list(rawresult.keys()): event.reply("%(title)s (%(country)s, %(year)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
    else: event.reply("%(title)s (%(country)s): %(imdburl)s | rating: %(rating)s (out of %(votes)s votes) | Genres %(genres)s | Language: %(languages)s" % result)
def get_tinyurl(url):
    """ grab a tinyurl. """
    from tl.utils.url import enabled
    if not enabled: raise URLNotEnabled
    res = get(url, namespace="tinyurl")
    logging.debug("tinyurl - cache - %s" % str(res))
    if res and res[0] == "[": return json.loads(res)
    postarray = [("submit", "submit"), ("url", url)]
    postdata = urllib.parse.urlencode(postarray)
    postbytes = bytes(postdata, "utf-8")
    req = urllib.request.Request(url=posturl, data=postbytes)
    req.add_header("User-agent", useragent())
    # HTTPError is a subclass of URLError, so it has to be caught first
    try: res = urllib.request.urlopen(req).readlines()
    except urllib.error.HTTPError as e:
        logging.warn("tinyurl - %s - HTTP error: %s" % (url, str(e)))
        return
    except urllib.error.URLError as e:
        logging.warn("tinyurl - %s - URLError: %s" % (url, str(e)))
        return
    except Exception as ex:
        if "DownloadError" in str(ex): logging.warn("tinyurl - %s - DownloadError: %s" % (url, str(ex)))
        else: handle_exception()
        return
    urls = []
    for line in res:
        bline = str(line, "utf-8")
        if bline.startswith("<blockquote><b>"): urls.append(striphtml(bline.strip()).split("[Open")[0])
    if len(urls) == 3: urls.pop(0)
    set(url, json.dumps(urls), namespace="tinyurl")
    return urls
def handle_overflowanswers(bot, event):
    result = []
    for aa in getanswers(event.rest):
        a = LazyDict(aa)
        result.append("%s - %s" % (a.owner['display_name'], striphtml(a.body)))
    event.reply("answers for %s: " % event.rest, result)