def google_news(self, e):
    """Fetch the newest Google News item and report it on IRC.

    With an empty e.input the US top-headlines RSS feed is used;
    otherwise e.input becomes a news search query.  Sets e.output to
    "<title> - <description> [ <short link> ]" and returns e.
    """
    term = urllib.quote(e.input)
    if not term:
        feed_url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        feed_url = "http://news.google.com/news?q=%s&output=rss" % term
    dom = xml.dom.minidom.parse(urllib2.urlopen(feed_url))
    item = dom.getElementsByTagName('item')[0]
    headline = item.getElementsByTagName('title')[0].childNodes[0].data
    desc_soup = BeautifulSoup(item.getElementsByTagName('description')[0].childNodes[0].data)
    # Strip anchors and the grey (#6f6f6f) source/date spans from the snippet.
    for node in desc_soup.findAll('a'):
        node.extract()
    for node in desc_soup.findAll(color='#6f6f6f'):
        node.extract()
    desc = str(desc_soup).strip().decode("utf-8", 'ignore')
    desc = tools.remove_html_tags(desc)
    desc = tools.decode_htmlentities(desc)
    # Trailing 9 characters are dropped -- presumably feed boilerplate; TODO confirm.
    desc = desc[0:len(desc) - 9]
    if desc.rfind(".") != -1:
        desc = desc[0:desc.rfind(".") + 1]
    short_link = tools.shorten_url(item.getElementsByTagName('link')[0].childNodes[0].data)
    e.output = "%s - %s [ %s ]" % (headline.encode("utf-8", 'ignore'),
                                   desc.encode("utf-8", 'ignore'),
                                   short_link.encode("utf-8", 'ignore'))
    return e
def google_sun(self, location, sun, nick):
    """Scrape a Google result page for a sunrise/sunset time.

    location -- place to look up; when empty, falls back to the stored
                location for *nick* from the global ``user`` object (if set).
    sun      -- search keyword, presumably "sunrise" or "sunset" -- TODO confirm.
    nick     -- IRC nick used for the location fallback.

    Returns a string like "<sun> in <city>: <time> <day> (<descr>)" with
    HTML stripped, or None when the expected markup is not in the page.
    """
    if location == "" and user:
        location = user.get_location(nick)
    location = urllib.parse.quote(location)
    url = ("http://www.google.com/search?hl=en&client=opera&hs=6At&rls=en"
           "&q=%s+%s&aq=f&aqi=g1&aql=&oq=&gs_rfai=") % (sun, location)
    request = urllib.request.Request(url, None, {})
    # Spoof a desktop Opera UA; Range header caps the download at 40 KiB.
    request.add_header('User-Agent', "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.2.15 Version/10.10")
    request.add_header('Range', "bytes=0-40960")
    response = urllib.request.urlopen(request).read().decode('utf-8')
    # Capture groups: 2 = time, 4 = day, 6 = qualifier (unused),
    # 8 = city, 10 = trailing description cell.
    m = re.search(r'(-40.gif.*?\<b\>)(.*?)(\<\/b\> )(.*?)( -\s*\<b\>)(.*?)'
                  r'(\<\/b\> in\s*)(.*?)(\s*?\<tr\>.*?top\"\>)(.*?)(\<\/table\>)',
                  response)
    if m is None:
        # Page layout changed or Google returned no result box.
        return None
    settime = m.group(2)
    setday = re.sub(r"\s+", " ", m.group(4))
    setcity = m.group(8)
    settimeword = m.group(10)
    result = "%s in %s: %s %s (%s)" % (sun, setcity, settime, setday, settimeword)
    return tools.remove_html_tags(result)
def google_sun(term, sun, nick):
    """Scrape a Google result page for a sunrise/sunset time (Python-2 variant).

    term -- location to search; when empty, falls back to the stored
            location for *nick* from the global ``user`` object (if set).
    sun  -- search keyword, presumably "sunrise" or "sunset" -- TODO confirm.
    nick -- IRC nick used for the location fallback.

    Returns the formatted time string with HTML stripped, or None when
    the expected markup is not found in the response.
    """
    if term == "" and user:
        term = user.get_location(nick)
    term = urllib.quote(term)
    url = "http://www.google.com/search?hl=en&client=opera&hs=6At&rls=en&q=%s+%s&aq=f&aqi=g1&aql=&oq=&gs_rfai=" % (sun, term)
    request = urllib2.Request(url, None, {})
    # Spoof a desktop Opera UA; Range header caps the download at 40 KiB.
    request.add_header('User-Agent', "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.2.15 Version/10.10")
    request.add_header('Range', "bytes=0-40960")
    response = urllib2.urlopen(request).read()
    # Capture groups: 2 = time, 4 = day, 6 = qualifier (unused),
    # 8 = city, 10 = trailing description cell.
    m = re.search('(-40.gif.*?\<b\>)(.*?)(\<\/b\> )(.*?)( -\s*\<b\>)(.*?)(\<\/b\> in\s*)(.*?)(\s*?\<tr\>.*?top\"\>)(.*?)(\<\/table\>)', response)
    try:
        settime = m.group(2)
        setday = m.group(4)
        setday = re.sub("\s+"," ",setday)
        setword = m.group(6)
        setcity = m.group(8)
        settimeword = m.group(10)
        result = "%s in %s: %s %s (%s)" % (sun, setcity,settime,setday,settimeword)
        #print result
    except:
        # m is None when the regex did not match; bail out returning None.
        pass
        return
    return tools.remove_html_tags(result)
def google_news(self, e):
    """Return the most recent Google News story.

    Sets e.output to "<title> - <description> [ <short link> ]" and
    returns e.  With an empty e.input the US top-headlines feed is used;
    otherwise the input is submitted as a news search query.
    """
    q = urllib.quote(e.input)
    if q:
        feed = "http://news.google.com/news?q=%s&output=rss" % q
    else:
        feed = "http://news.google.com/news?ned=us&topic=h&output=rss"
    rss = xml.dom.minidom.parse(urllib2.urlopen(feed))
    first_item = rss.getElementsByTagName('item')[0]

    def _text(tag):
        # First text node of the first <tag> element under the item.
        return first_item.getElementsByTagName(tag)[0].childNodes[0].data

    headline = _text('title')
    soup = BeautifulSoup(_text('description'))
    # Drop anchors and the grey (#6f6f6f) source/date spans.
    for anchor in soup.findAll('a'):
        anchor.extract()
    for grey in soup.findAll(color='#6f6f6f'):
        grey.extract()
    blurb = str(soup).strip().decode("utf-8", 'ignore')
    blurb = tools.remove_html_tags(blurb)
    blurb = tools.decode_htmlentities(blurb)
    # Trailing 9 characters are dropped -- presumably feed boilerplate; TODO confirm.
    blurb = blurb[0:len(blurb) - 9]
    if blurb.rfind(".") != -1:
        blurb = blurb[0:blurb.rfind(".") + 1]
    short = tools.shorten_url(_text('link'))
    e.output = "%s - %s [ %s ]" % (headline.encode("utf-8", 'ignore'),
                                   blurb.encode("utf-8", 'ignore'),
                                   short.encode("utf-8", 'ignore'))
    return e
def gwiki(bot, e):
    """Search Google's AJAX web-search API restricted to wikipedia.org.

    When the top hit is a wikipedia article URL, puts
    "<snippet> [ <shortened url> ]" into e.output.  Always returns e.
    """
    query_url = (
        'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=site:wikipedia.org+'
        + urllib.parse.quote(e.input))
    req = urllib.request.Request(query_url, None, {'Referer': 'http://irc.00id.net'})
    payload = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
    hits = payload['responseData']['results']
    top = hits[0]
    if re.search("wikipedia.org/wiki/", top['url']):
        # Un-double-encode, then shorten the article link.
        short = tools.shorten_url(top['url'].replace('%25', '%'))
        snippet = tools.decode_htmlentities(tools.remove_html_tags(top['content']))
        snippet = re.sub('\s+', ' ', snippet)
        snippet = snippet.replace("...", "")
        e.output = "%s [ %s ]" % (snippet, short)
    return e
def get_title(url):
    """Fetch *url* and return its <title> tag as "Title: <text>"
    (truncated to 180 characters), or "" if anything goes wrong."""
    title = ""
    try:
        opener = urllib2.build_opener()
        readlength = 10240
        if url.find("amazon.") != -1:
            readlength = 100096  # because amazon is coded like shit
        # Spoofed UA; Range header asks the server to cap the transfer.
        opener.addheaders = [("User-Agent", "Opera/9.10 (YourMom 8.0)"), ("Range", "bytes=0-" + str(readlength))]
        pagetmp = opener.open(url)
        page = pagetmp.read(readlength)
        opener.close()
        titletmp = tools.remove_html_tags(re.search("(?is)\<title\>.*?<\/title\>", page).group(0))
        title = "Title: " + titletmp.strip()[0:180]
    except:
        # Best-effort: any fetch/parse failure yields the empty string.
        pass
    return title
def get_title(url):
    """Fetch *url* and return its <title> as "Title: <text>" (first 180
    chars), or "" on any failure."""
    result = ""
    try:
        # Amazon pages bury <title> deep, so read far more of them.
        fetch_limit = 100096 if url.find("amazon.") != -1 else 10240
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', "Opera/9.10 (YourMom 8.0)"),
                             ('Range', "bytes=0-" + str(fetch_limit))]
        handle = opener.open(url)
        body = handle.read(fetch_limit)
        opener.close()
        match = re.search('(?is)\<title\>.*?<\/title\>', body)
        result = "Title: " + tools.remove_html_tags(match.group(0)).strip()[0:180]
    except:
        # Best-effort scraper: swallow every failure and return "".
        pass
    return result
def gwiki(bot, e):
    """Look up e.input on Google (AJAX API, site:wikipedia.org) and, when
    the top hit is an article URL, set e.output to
    "<snippet> [ <short url> ]".  Always returns e."""
    api = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=site:wikipedia.org+'
    request = urllib.request.Request(api + urllib.parse.quote(e.input), None,
                                     {'Referer': 'http://irc.00id.net'})
    raw = urllib.request.urlopen(request).read().decode('utf-8')
    first = json.loads(raw)['responseData']['results'][0]
    match = re.search("wikipedia.org/wiki/", first['url'])
    if match:
        # Un-double-encode percent signs, then shorten the link.
        page_url = tools.shorten_url(first['url'].replace('%25', '%'))
        text = tools.decode_htmlentities(tools.remove_html_tags(first['content']))
        text = re.sub('\s+', ' ', text)
        text = text.replace("...", "")
        e.output = "%s [ %s ]" % (text, page_url)
    return e
def get_imdb(self, e, urlposted=False):
    """Look up a movie on IMDb and report title, rating and synopsis.

    e.input holds either a search term or, when urlposted=True, an IMDb
    URL pasted directly.  On success sets e.output to
    "Title: ... - Rating: ... - <summary>[ <url> ]" and returns e;
    returns None when no title page was found or parsing failed.
    """
    searchterm = e.input
    if urlposted:
        url = searchterm
    else:
        # Google for the IMDb title page matching the search term.
        url = tools.google_url("site:imdb.com/title " + searchterm, "imdb.com/title/tt\\d{7}/")
    title = ""
    if not url:
        pass
    elif url.find("imdb.com/title/tt") != -1:
        try:
            movietitle = ""
            rating = ""
            summary = ""
            # Normalize to the canonical title URL built from the tt<digits> id.
            imdbid = re.search("tt\\d{7}", url)
            imdburl = ('http://www.imdb.com/title/' + imdbid.group(0) + '/')
            opener = urllib2.build_opener()
            # Spoofed UA; Range header caps the download at 40 KiB.
            opener.addheaders = [('User-Agent', "Opera/9.10 (YourMom 8.0)"), ('Range', "bytes=0-40960")]
            pagetmp = opener.open(imdburl)
            page = BeautifulSoup(pagetmp.read(40960))
            opener.close()
            movietitle = tools.decode_htmlentities(tools.remove_html_tags(str(page.find('title'))).replace(" - IMDb", ""))
            movietitle = movietitle.replace("IMDb - ", "")
            movietitle = "Title: " + movietitle
            # The overview box holds the star rating and the short synopsis.
            if page.find(id="overview-top") != None:
                page = page.find(id="overview-top").extract()
            if page.find("div", "star-box-giga-star") != None:
                rating = tools.remove_html_tags(str(page.find("div", "star-box-giga-star").text))
                rating = " - Rating: " + rating
            # Exactly two <p> tags appears to mean the second is the synopsis
            # -- layout assumption from the 2010s IMDb page; TODO confirm.
            if len(page.findAll('p')) == 2:
                summary = str(page.findAll('p')[1])
                removelink = re.compile(r'\<a.*\/a\>')
                summary = removelink.sub('', summary)
                summary = tools.remove_html_tags(summary)
                summary = summary.replace('»', "")
                summary = tools.decode_htmlentities(summary.decode("utf-8", 'ignore'))
                summary = re.sub("\&.*?\;", " ", summary)
                summary = summary.replace("\n", " ")
                summary = " - " + summary
            title = movietitle + rating + summary
            if not urlposted:
                title = title + " [ %s ]" % url
            e.output = title.encode('utf-8', 'ignore')
            return e
        except Exception as inst:
            # Log the failure to stdout (Python-2 print statement).
            print "!imdb " + searchterm + ": " + str(inst)
    return None
def get_wiki(self, e, urlposted=False):
    """Read the first paragraph of a Wikipedia article into e.output.

    e.input is a search term, a pasted article URL (urlposted=True), or
    empty (a random article is fetched).  File: pages are delegated to
    get_wiki_file_description().  Returns e; e.output is "" when no
    article URL could be resolved.
    """
    searchterm = e.input
    if urlposted:
        url = searchterm
    else:
        if searchterm == "":
            url = "http://en.wikipedia.org/wiki/Special:Random"
        else:
            url = tools.google_url("site:wikipedia.org " + searchterm,"wikipedia.org/wiki")
    title = ""
    if url and url.find("wikipedia.org/wiki/File:") != -1:
        file_title=get_wiki_file_description(url)
        if file_title:
            e.output = file_title
            return e
    if url and url.find("wikipedia.org/wiki/") != -1:
        try:
            opener = urllib2.build_opener()
            opener.addheaders = [('User-Agent',"Opera/9.10 (YourMom 8.0)")]
            pagetmp = opener.open(url)
            page = pagetmp.read()
            url = pagetmp.geturl()
            opener.close()
            # For in-page anchors, skip everything before the anchor target.
            if url.find('#') != -1:
                anchor = url.split('#')[1]
                page = page[page.find('id="' + anchor):]
            page = BeautifulSoup(page)
            # Infoboxes/navboxes live in tables; drop them before picking <p>.
            tables = page.findAll('table')
            for table in tables:
                table.extract()
            page = page.findAll('p')
            # A leading <p><span ...> is page furniture -- use the next <p>.
            if str(page[0])[0:9] == '<p><span ':
                page = unicode(page[1].extract())
            else:
                page = unicode(page[0].extract())
            title = tools.remove_html_tags(re.search('(?s)\<p\>(.*?)\<\/p\>',page).group(1))
            title = title.encode("utf-8", 'ignore')
            title = title.replace("<","");
            # Strip footnote markers like [1] and [citation needed].
            rembracket = re.compile(r'\[.*?\]')
            title = rembracket.sub('',title)
            #title = re.sub("\&.*?\;", " ", title)
            title = title.replace("\n", " ")
            title = tools.decode_htmlentities(title.decode("utf-8", 'ignore')).encode("utf-8", 'ignore')
            # Cap at 420 chars, then cut back to the last full sentence.
            title = title[0:420]
            if title.rfind(".")!=-1:
                title = title[0:title.rfind(".")+1]
            if not urlposted:
                url = tools.shorten_url(url)
                title = (title.decode('utf-8', 'ignore') + " [ %s ]" % url).encode('utf-8', 'ignore')
        except Exception as inst:
            # Log (Python-2 print) and fall back to the first sentence found
            # in whatever `page` currently holds.
            print "!wiki " + searchterm + " : " + str(inst)
            title = tools.remove_html_tags(re.search('\<p\>(.*?\.) ',str(page)).group(1))
    e.output = title
    return e
def get_wiki(self, e, urlposted=False):
    # read the first paragraph of a wikipedia article
    """Read the first paragraph of a Wikipedia article into e.output.

    e.input is a search term, a pasted article URL (urlposted=True), or
    empty (a random article is fetched).  File: pages are delegated to
    get_wiki_file_description().  Returns e; e.output is "" when no
    article URL could be resolved.
    """
    searchterm = e.input
    if urlposted:
        url = searchterm
    else:
        if searchterm == "":
            url = "http://en.wikipedia.org/wiki/Special:Random"
        else:
            url = tools.google_url("site:wikipedia.org " + searchterm, "wikipedia.org/wiki")
    title = ""
    if url and url.find("wikipedia.org/wiki/File:") != -1:
        file_title = get_wiki_file_description(url)
        if file_title:
            e.output = file_title
            return e
    if url and url.find("wikipedia.org/wiki/") != -1:
        try:
            opener = urllib2.build_opener()
            opener.addheaders = [("User-Agent", "Opera/9.10 (YourMom 8.0)")]
            pagetmp = opener.open(url)
            page = pagetmp.read()
            url = pagetmp.geturl()
            opener.close()
            # For in-page anchors, skip everything before the anchor target.
            if url.find("#") != -1:
                anchor = url.split("#")[1]
                page = page[page.find('id="' + anchor) :]
            page = BeautifulSoup(page)
            # Infoboxes/navboxes live in tables; drop them before picking <p>.
            tables = page.findAll("table")
            for table in tables:
                table.extract()
            page = page.findAll("p")
            # A leading <p><span ...> is page furniture -- use the next <p>.
            if str(page[0])[0:9] == "<p><span ":
                page = unicode(page[1].extract())
            else:
                page = unicode(page[0].extract())
            title = tools.remove_html_tags(re.search("(?s)\<p\>(.*?)\<\/p\>", page).group(1))
            title = title.encode("utf-8", "ignore")
            title = title.replace("<", "")
            # Strip footnote markers like [1] and [citation needed].
            rembracket = re.compile(r"\[.*?\]")
            title = rembracket.sub("", title)
            # title = re.sub("\&.*?\;", " ", title)
            title = title.replace("\n", " ")
            title = tools.decode_htmlentities(title.decode("utf-8", "ignore")).encode("utf-8", "ignore")
            # Cap at 420 chars, then cut back to the last full sentence.
            title = title[0:420]
            if title.rfind(".") != -1:
                title = title[0 : title.rfind(".") + 1]
            if not urlposted:
                url = tools.shorten_url(url)
                title = (title.decode("utf-8", "ignore") + " [ %s ]" % url).encode("utf-8", "ignore")
        except Exception as inst:
            # Log (Python-2 print) and fall back to the first sentence found
            # in whatever `page` currently holds.
            print "!wiki " + searchterm + " : " + str(inst)
            title = tools.remove_html_tags(re.search("\<p\>(.*?\.) ", str(page)).group(1))
    e.output = title
    return e