Code Example #1
File: urban_dictionary.py Project: cactauz/genmaybot
def get_urbandictionary(self, e):
    searchterm = e.input
    url = "http://www.urbandictionary.com/define.php?term=%s" % urllib2.quote(searchterm)
    if searchterm == "wotd":
      e.output = get_urbandictionary_wotd()
      return e
    
    if searchterm == "":
      url = "http://www.urbandictionary.com/random.php"
    
    try:
      opener = urllib2.build_opener()
      opener.addheaders = [('User-Agent',"Opera/9.10 (YourMom 8.0)")]
      pagetmp = opener.open(url)
      page = pagetmp.read()
      url = pagetmp.geturl()
      opener.close()

      page = BeautifulSoup(page)
      first_definition= ""
      
      if page.find(id='not_defined_yet') != None:
          return None
      
      ## depending on the search results the first word may be contained directly under the <td class='word'> tag
      ## or it may be the text contents of a <a href> tag
      ## we first try to get it from inside a <td><a href>[word]</a></td> type structure
      ## if that fails, get the word under the initial <td> tag

      try:
        first_word = page.findAll('td',attrs={"class" : "word"})[0].contents[1].string
      except:
        first_word = page.findAll('td',attrs={"class" : "word"})[0].contents[0].string     
      
      first_word = first_word.replace("\n","")
      #first_word = first_word.encode("utf-8", 'ignore')

      for content in page.findAll('div',attrs={"class" : "definition"})[0].contents:
        if content.string != None:
          first_definition += content.string

      #first_definition = first_definition.encode("utf-8", 'ignore')
      first_definition = tools.decode_htmlentities(first_definition).encode("utf-8", 'ignore')
      first_word = tools.decode_htmlentities(first_word).encode("utf-8", 'ignore')

      first_definition = first_definition.replace("\n", " ")
      first_definition = first_definition.replace("\r", " ")
      first_definition = first_definition[0:392]

      first_definition = ((first_word + ": " + first_definition).decode("utf-8", 'ignore') + " [ %s ]" % tools.shorten_url(url)).encode('utf-8', 'ignore')
      #print first_definition
      e.output = first_definition
      return e
      
    except:
      print "!ud %s went wrong" % searchterm
      return
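All of the handlers in this listing follow the same calling convention: they receive the bot/plugin instance and an event object, read the raw argument text from e.input, write their reply into e.output, and return the event on success (returning None, or falling off the end, keeps the bot silent). The event class itself is not part of these excerpts, so the stand-in below is only a minimal sketch of that contract; Event and echo are hypothetical names.

class Event(object):
    # Hypothetical stand-in for the bot's event object; only the two
    # attributes the handlers above actually touch are modeled here.
    def __init__(self, text=""):
        self.input = text     # raw text following the !command
        self.output = ""      # reply the bot will send back to the channel

def echo(bot, e):
    # Trivial handler following the same contract as get_urbandictionary:
    # fill e.output and return e on success, return None to stay quiet.
    if e.input == "":
        return None
    e.output = "you said: " + e.input
    return e

print(echo(None, Event("hello")).output)   # -> you said: hello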
Code Example #2
File: fml.py Project: cactauz/genmaybot
def get_fml(self, e):
  # queries a random fmylife.com passage
  try:
    fmlxml = urllib2.urlopen("http://api.betacie.com/view/random?key=%s&language=en" % tools.config.fmlAPIkey).read()
    start = fmlxml.find("<text>") + 6
    end = fmlxml.find("</text>")
    
    fml = fmlxml[start:end]
    
    start = fmlxml.find("<agree>") + 7
    end = fmlxml.find("</agree>")
    
    fml = fml + " [FYL: " + str(fmlxml[start:end])
    
    start = fmlxml.find("<deserved>") + 10
    end = fmlxml.find("</deserved>")   
    
    fml = fml + " Deserved it: " + str(fmlxml[start:end]) + "]"
    
    
    fml = fml.replace('&quot;', '"')
    fml = fml.replace('&amp;quot;', '"')
    fml = fml.replace('&amp;', "&")
    e.output = tools.decode_htmlentities(fml)
    
    return e
  except Exception as inst:
    print "!fml " + str(inst)
    return None
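The <text>/<agree>/<deserved> extraction above is plain substring slicing rather than XML parsing. A minimal illustration on a made-up response body (the real API payload is not reproduced here):

fmlxml = "<root><text>Today, my code deployed itself. FML</text><agree>123</agree><deserved>4</deserved></root>"
start = fmlxml.find("<text>") + 6          # 6 == len("<text>")
end = fmlxml.find("</text>")
print(fmlxml[start:end])                   # -> Today, my code deployed itself. FML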
Code Example #3
File: gwiki.py Project: iamsix/genmaybot
def gwiki(bot, e):
    url = (
        'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=site:wikipedia.org+'
        + urllib.parse.quote(e.input))
    request = urllib.request.Request(url, None,
                                     {'Referer': 'http://irc.00id.net'})
    response = urllib.request.urlopen(request)

    results_json = json.loads(response.read().decode('utf-8'))
    results = results_json['responseData']['results']
    regexstring = "wikipedia.org/wiki/"
    result = results[0]
    m = re.search(regexstring, result['url'])
    if (m):
        url = result['url']
        url = tools.shorten_url(url.replace('%25', '%'))
        #content = result['content'].encode('utf-8')

        content = tools.decode_htmlentities(
            tools.remove_html_tags(result['content']))
        content = re.sub(r'\s+', ' ', content)
        content = content.replace("...", "")
        #print content
        #content = content.decode('unicode-escape')
        #e.output = content
        e.output = "%s [ %s ]" % (content, url)
    return e
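gwiki queries the old Google AJAX Search API (long since deprecated) and only reads two fields from the first result. A hypothetical payload of roughly the expected shape, for illustration only:

sample = {
    "responseData": {
        "results": [
            {"url": "http://en.wikipedia.org/wiki/Python_(programming_language)",
             "content": "<b>Python</b> is a widely used programming language ..."}
        ]
    }
}
result = sample["responseData"]["results"][0]
print(result["url"])       # the article URL that gets shortened for output
print(result["content"])   # the snippet that is stripped of HTML for e.output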
Code Example #4
File: urban_dictionary.py Project: cactauz/genmaybot
def get_urbandictionary_wotd():

  url = "http://www.urbandictionary.com"
  try:
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent',"Opera/9.10 (YourMom 8.0)")]
    pagetmp = opener.open(url)
    page = pagetmp.read()
    opener.close()

    page = BeautifulSoup(page)
    
    first_definition = ""
    
    first_word = page.findAll('div',attrs={"class" : "word"})[0].contents[1].contents[0].string
    first_word = first_word.encode("utf-8", 'ignore')
    
    
    for content in page.findAll('div',attrs={"class" : "definition"})[0].contents:
      if content.string != None:
        first_definition += content.string

    first_definition = first_definition.encode("utf-8", 'ignore')
    first_definition = tools.decode_htmlentities(first_definition.decode("utf-8", 'ignore')).encode("utf-8", 'ignore')
    first_definition = first_definition.replace("\n", " ")

    wotd = (first_word.decode('utf-8') + ": " + first_definition.decode('utf-8') + " [ %s ]" % tools.shorten_url(url)).encode('utf-8', 'ignore')

    return wotd
  except:
    print "!ud wotd went wrong"
    return
Code Example #5
File: news.py Project: cactauz/genmaybot
def google_news(self, e):
    query = urllib.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query

    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(
        newest_news.getElementsByTagName('description')[0].childNodes[0].data)

    links = description.findAll('a')
    for link in links:
        link.extract()
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()

    description = str(description).strip().decode("utf-8", 'ignore')
    description = tools.remove_html_tags(description)
    description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    if description.rfind(".") != -1:
        description = description[0:description.rfind(".") + 1]
    link = tools.shorten_url(
        newest_news.getElementsByTagName('link')[0].childNodes[0].data)

    e.output = "%s - %s [ %s ]" % (title.encode(
        "utf-8", 'ignore'), description.encode(
            "utf-8", 'ignore'), link.encode("utf-8", 'ignore'))

    return e
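google_news pulls a Google News RSS feed and walks the first <item> with minidom. A minimal, made-up feed of the same shape (real feeds embed source links and extra HTML inside <description>, which is why the handler scrubs it with BeautifulSoup):

import xml.dom.minidom

sample = """<rss><channel><item>
  <title>Example headline</title>
  <link>http://news.example.com/story</link>
  <description>Example summary.</description>
</item></channel></rss>"""

item = xml.dom.minidom.parseString(sample).getElementsByTagName('item')[0]
print(item.getElementsByTagName('title')[0].childNodes[0].data)   # -> Example headline
print(item.getElementsByTagName('link')[0].childNodes[0].data)    # -> http://news.example.com/story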
Code Example #6
File: news.py Project: cactauz/genmaybot
def google_news (self, e):
    query = urllib.quote(e.input)
    url = ""
    if not query:
        url = "http://news.google.com/news?ned=us&topic=h&output=rss"
    else:
        url = "http://news.google.com/news?q=%s&output=rss" % query
    
           
    dom = xml.dom.minidom.parse(urllib2.urlopen(url))
    newest_news = dom.getElementsByTagName('item')[0]
    title = newest_news.getElementsByTagName('title')[0].childNodes[0].data
    description = BeautifulSoup(newest_news.getElementsByTagName('description')[0].childNodes[0].data)
    
    links = description.findAll('a')
    for link in links:
        link.extract()          
    links = description.findAll(color='#6f6f6f')
    for link in links:
        link.extract()
    
    description = str(description).strip().decode("utf-8", 'ignore')
    description = tools.remove_html_tags(description)
    description = tools.decode_htmlentities(description)
    description = description[0:len(description) - 9]
    if description.rfind(".")!=-1:
        description = description[0:description.rfind(".")+1]
    link = tools.shorten_url(newest_news.getElementsByTagName('link')[0].childNodes[0].data)
    
    e.output = "%s - %s [ %s ]" % (title.encode("utf-8", 'ignore'), description.encode("utf-8", 'ignore'), link.encode("utf-8", 'ignore'))
    
    return e
Code Example #7
def get_fml(self, e):
    #queries a random fmylife.com passage
    try:
        fmlxml = urllib2.urlopen(
            "http://api.betacie.com/view/random?key=%s&language=en" %
            tools.config.fmlAPIkey).read()
        start = fmlxml.find("<text>") + 6
        end = fmlxml.find("</text>")

        fml = fmlxml[start:end]

        start = fmlxml.find("<agree>") + 7
        end = fmlxml.find("</agree>")

        fml = fml + " [FYL: " + str(fmlxml[start:end])

        start = fmlxml.find("<deserved>") + 10
        end = fmlxml.find("</deserved>")

        fml = fml + " Deserved it: " + str(fmlxml[start:end]) + "]"

        fml = fml.replace('&quot;', '"')
        fml = fml.replace('&amp;quot;', '"')
        fml = fml.replace('&amp;', "&")
        e.output = tools.decode_htmlentities(fml)

        return e
    except Exception as inst:
        print "!fml " + str(inst)
        return None
Code Example #8
File: wiki.py Project: cactauz/genmaybot
def get_wiki_file_description(url):
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [("User-Agent", "Opera/9.10 (YourMom 8.0)")]
        pagetmp = opener.open(url)
        page = pagetmp.read()
        opener.close()

        page = BeautifulSoup(page)

        try:
            desc = page.findAll("div", attrs={"class": "description en"})[0].getText(separator=" ")
            # print "hit 1st case"
        except:
            try:
                desc = (
                    page.find("th", attrs={"id": "fileinfotpl_desc"})
                    .findNextSibling("td")
                    .find("p")
                    .getText(separator=" ")
                )
                # print "hit 2nd case"
            except:
                try:
                    desc = (
                        page.find("th", attrs={"id": "fileinfotpl_desc"})
                        .findNextSibling("td")
                        .find("div")
                        .getText(separator=" ")
                    )
                    # print "hit 3rd case"
                except:
                    try:
                        desc = page.find("div", attrs={"id": "shared-image-desc"}).next.getText(separator=" ")
                        # print "hit 4th case"
                    except:
                        print "Couldn't find description for file %s" % url
                        return False

        desc = desc.encode("utf-8", "ignore")
        desc = desc.replace("English:", "")
        desc = tools.decode_htmlentities(desc.decode("utf-8", "ignore")).encode("utf-8", "ignore")
        desc = desc[0:420]
        if desc.rfind(".") != -1:
            desc = desc[0 : desc.rfind(".") + 1]

        # print desc
        return desc.strip()

    except:
        print "Finding a file description failed miserably. The URL probably didn't even load."
        return
Code Example #9
def get_woot(self, e):
    # display the current woot.com sale
    try:
        url = "http://www.woot.com/salerss.aspx"
        dom = xml.dom.minidom.parse(urllib2.urlopen(url))

        product = dom.getElementsByTagName("woot:product")[0].childNodes[0].data
        product = tools.decode_htmlentities(product)

        price = dom.getElementsByTagName("woot:price")[0].childNodes[0].data

        e.output = product + " [" + price + "]"
        return e
    except:
        pass
Code Example #10
def get_wiki_file_description(url):
  try:
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent',"Opera/9.10 (YourMom 8.0)")]
    pagetmp = opener.open(url)
    page = pagetmp.read()
    opener.close()

    page = BeautifulSoup(page)
    
    try:
      desc = page.findAll("div",attrs={"class":"description en"})[0].getText(separator=" ")
      #print "hit 1st case"
    except:
      try:
        desc = page.find("th",attrs={"id" : "fileinfotpl_desc"}).findNextSibling("td").find("p").getText(separator=" ")
        #print "hit 2nd case"
      except:
        try:
          desc = page.find("th",attrs={"id" : "fileinfotpl_desc"}).findNextSibling("td").find("div").getText(separator=" ")   
          #print "hit 3rd case"
        except:
          try:
            desc = page.find("div",attrs={"id":"shared-image-desc"}).next.getText(separator=" ")
            #print "hit 4th case"
          except:
            print "Couldn't find description for file %s" % url
            return False
      
    
    desc = desc.encode("utf-8", 'ignore')
    desc = desc.replace("English:","")
    desc = tools.decode_htmlentities(desc.decode("utf-8", 'ignore')).encode("utf-8", 'ignore')
    desc = desc[0:420]
    if desc.rfind(".")!=-1:
      desc = desc[0:desc.rfind(".")+1]
    
    #print desc  
    return desc.strip()
      
  except:
    print "Finding a file description failed miserably. The URL probably didn't even load."  
    return
Code Example #11
File: urban_dictionary.py Project: cactauz/genmaybot
def get_urbandictionary_wotd():

    url = "http://www.urbandictionary.com"
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', "Opera/9.10 (YourMom 8.0)")]
        pagetmp = opener.open(url)
        page = pagetmp.read()
        opener.close()

        page = BeautifulSoup(page)

        first_definition = ""

        first_word = page.findAll('div',
                                  attrs={"class": "word"
                                         })[0].contents[1].contents[0].string
        first_word = first_word.encode("utf-8", 'ignore')

        for content in page.findAll('div', attrs={"class":
                                                  "definition"})[0].contents:
            if content.string != None:
                first_definition += content.string

        first_definition = first_definition.encode("utf-8", 'ignore')
        first_definition = tools.decode_htmlentities(
            first_definition.decode("utf-8",
                                    'ignore')).encode("utf-8", 'ignore')
        first_definition = first_definition.replace("\n", " ")

        wotd = (first_word.decode('utf-8') + ": " +
                first_definition.decode('utf-8') +
                " [ %s ]" % tools.shorten_url(url)).encode('utf-8', 'ignore')

        return wotd
    except:
        print "!ud wotd went wrong"
        return
Code Example #12
File: gwiki.py Project: KpaBap/genmaybot
def gwiki(bot, e):
      url = ('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=site:wikipedia.org+' + urllib.parse.quote(e.input))
      request = urllib.request.Request(url, None, {'Referer': 'http://irc.00id.net'})
      response = urllib.request.urlopen(request)

      results_json = json.loads(response.read().decode('utf-8'))
      results = results_json['responseData']['results']
      regexstring = "wikipedia.org/wiki/"
      result = results[0]
      m = re.search(regexstring,result['url'])   
      if (m):
         url = result['url']
         url = tools.shorten_url(url.replace('%25','%'))
         #content = result['content'].encode('utf-8')
         
         content = tools.decode_htmlentities(tools.remove_html_tags(result['content']))
         content = re.sub(r'\s+', ' ', content)
         content = content.replace("...", "")
         #print content
         #content = content.decode('unicode-escape')
         #e.output = content
         e.output = "%s [ %s ]" % (content, url)
      return e
Code Example #13
def get_wiki(self, e, urlposted=False):
  # read the first paragraph of a wikipedia article
  searchterm = e.input
  
  if urlposted:
      url = searchterm
  else:
      if searchterm == "":
          url = "http://en.wikipedia.org/wiki/Special:Random"
      else:
          url = tools.google_url("site:wikipedia.org " + searchterm,"wikipedia.org/wiki")
  
  title = "" 
  
  if url and url.find("wikipedia.org/wiki/File:") != -1:
    
    file_title=get_wiki_file_description(url)
    
    if file_title:
        e.output = file_title
        return e 
    
  if url and url.find("wikipedia.org/wiki/") != -1:

    try:
      opener = urllib2.build_opener()
      opener.addheaders = [('User-Agent',"Opera/9.10 (YourMom 8.0)")]
      pagetmp = opener.open(url)
      page = pagetmp.read()
      url = pagetmp.geturl()
      opener.close()

      if url.find('#') != -1:
        anchor = url.split('#')[1]
        page = page[page.find('id="' + anchor):]

      page = BeautifulSoup(page)
      tables = page.findAll('table')
      for table in tables:
        table.extract()
        
      page = page.findAll('p')
      if str(page[0])[0:9] == '<p><span ':
          page = unicode(page[1].extract())
      else:
          page = unicode(page[0].extract())

      title = tools.remove_html_tags(re.search('(?s)\<p\>(.*?)\<\/p\>',page).group(1))
      title = title.encode("utf-8", 'ignore')
      title = title.replace("<", "")
      rembracket = re.compile(r'\[.*?\]')
      title = rembracket.sub('',title)
      #title = re.sub("\&.*?\;", " ", title)
      title = title.replace("\n", " ")
      
      title = tools.decode_htmlentities(title.decode("utf-8", 'ignore')).encode("utf-8", 'ignore')

      title = title[0:420]
      if title.rfind(".")!=-1:
        title = title[0:title.rfind(".")+1]
      
      if not urlposted:
        url = tools.shorten_url(url)
        title = (title.decode('utf-8', 'ignore') + " [ %s ]" % url).encode('utf-8', 'ignore')
    except Exception as inst: 
      print "!wiki " + searchterm + " : " + str(inst)
      title = tools.remove_html_tags(re.search('\<p\>(.*?\.) ',str(page)).group(1))

  e.output = title
  return e
Code Example #14
def url_posted(self, e):
    url = e.input
    #checks if the URL is a dupe (if mysql is enabled)
    #detects if a wikipedia or imdb url is posted and does the appropriate command for it

    try:

        repost = ""
        days = ""

        if tools.config.sqlmode > 0:
            urlhash = hashlib.sha224(url).hexdigest()

            conn = MySQLdb.connect(host="localhost",
                                   user=tools.config.sqlusername,
                                   passwd=tools.config.sqlpassword,
                                   db="irc_links")
            cursor = conn.cursor()
            query = "SELECT reposted, timestamp FROM links WHERE hash='%s'" % urlhash
            result = cursor.execute(query)

            if result != 0:
                result = cursor.fetchone()

                repost = "LOL REPOST %s " % (result[0] + 1)

                orig = result[1]
                now = datetime.datetime.now()
                delta = now - orig

                plural = ""
                if delta.days > 0:
                    if delta.days > 1:
                        plural = "s"
                    days = " (posted %s day%s ago)" % (str(delta.days), plural)
                else:
                    hrs = int(round(delta.seconds / 3600.0, 0))
                    if hrs == 0:
                        mins = delta.seconds / 60
                        if mins > 1:
                            plural = "s"
                        days = " (posted %s minute%s ago)" % (str(mins),
                                                              plural)
                        if mins == 0:
                            repost = ""
                            days = ""
                    else:
                        if hrs > 1:
                            plural = "s"
                        days = " (posted %s hour%s ago)" % (str(hrs), plural)

        title = ""
        # initialize so these names exist even if the !wiki / !imdb lookups below raise
        wiki = None
        imdb = None

        try:
            wiki = self.bangcommands["!wiki"](self, e, True)
        except:
            pass
        try:
            imdb = self.bangcommands["!imdb"](self, e, True)
        except:
            pass
        if wiki and wiki.output:
            title = wiki.output
        elif imdb and imdb.output:
            title = imdb.output
        else:
            if url.find("imgur.com") != -1:
                imgurid = url[url.rfind('/') + 1:url.rfind('/') + 6]
                url = "http://imgur.com/" + imgurid
            title = get_title(url)
            if title.find("imgur: the simple") != -1:
                title = ""

        title = title.replace("\n", " ")
        title = re.sub('\s+', ' ', title)
        pattern = re.compile('whatsisname', re.IGNORECASE)
        title = pattern.sub('', title)
        title = tools.decode_htmlentities(title.decode("utf-8",
                                                       'ignore')).encode(
                                                           "utf-8", 'ignore')

        titler = "%s%s%s" % (repost, title, days)

        if tools.config.sqlmode == 2:
            title = MySQLdb.escape_string(title)
            url = MySQLdb.escape_string(url)
            query = "INSERT INTO links (url, title, hash) VALUES ('%s','%s','%s') ON DUPLICATE KEY UPDATE reposted=reposted+1,title='%s'" % (
                url, title, urlhash, title)
            cursor.execute(query)
        if tools.config.sqlmode > 0:
            conn.close()

        e.output = titler
        return e

    except Exception as inst:
        print url + ": " + str(inst)
        pass
    return
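The dedup key url_posted stores in the links table is simply a SHA-224 hex digest of the raw URL, so a reposted link collides with the existing row:

import hashlib

print(hashlib.sha224(b"http://example.com/").hexdigest())   # 56 hex characters; same URL, same hash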
Code Example #15
File: wiki.py Project: cactauz/genmaybot
def get_wiki(self, e, urlposted=False):
    # read the first paragraph of a wikipedia article
    searchterm = e.input

    if urlposted:
        url = searchterm
    else:
        if searchterm == "":
            url = "http://en.wikipedia.org/wiki/Special:Random"
        else:
            url = tools.google_url("site:wikipedia.org " + searchterm, "wikipedia.org/wiki")

    title = ""

    if url and url.find("wikipedia.org/wiki/File:") != -1:

        file_title = get_wiki_file_description(url)

        if file_title:
            e.output = file_title
            return e

    if url and url.find("wikipedia.org/wiki/") != -1:

        try:
            opener = urllib2.build_opener()
            opener.addheaders = [("User-Agent", "Opera/9.10 (YourMom 8.0)")]
            pagetmp = opener.open(url)
            page = pagetmp.read()
            url = pagetmp.geturl()
            opener.close()

            if url.find("#") != -1:
                anchor = url.split("#")[1]
                page = page[page.find('id="' + anchor) :]

            page = BeautifulSoup(page)
            tables = page.findAll("table")
            for table in tables:
                table.extract()

            page = page.findAll("p")
            if str(page[0])[0:9] == "<p><span ":
                page = unicode(page[1].extract())
            else:
                page = unicode(page[0].extract())

            title = tools.remove_html_tags(re.search("(?s)\<p\>(.*?)\<\/p\>", page).group(1))
            title = title.encode("utf-8", "ignore")
            title = title.replace("<", "")
            rembracket = re.compile(r"\[.*?\]")
            title = rembracket.sub("", title)
            # title = re.sub("\&.*?\;", " ", title)
            title = title.replace("\n", " ")

            title = tools.decode_htmlentities(title.decode("utf-8", "ignore")).encode("utf-8", "ignore")

            title = title[0:420]
            if title.rfind(".") != -1:
                title = title[0 : title.rfind(".") + 1]

            if not urlposted:
                url = tools.shorten_url(url)
                title = (title.decode("utf-8", "ignore") + " [ %s ]" % url).encode("utf-8", "ignore")
        except Exception as inst:
            print "!wiki " + searchterm + " : " + str(inst)
            title = tools.remove_html_tags(re.search("\<p\>(.*?\.) ", str(page)).group(1))

    e.output = title
    return e
Code Example #16
File: urban_dictionary.py Project: cactauz/genmaybot
def get_urbandictionary(self, e):
    searchterm = e.input
    url = "http://www.urbandictionary.com/define.php?term=%s" % urllib2.quote(
        searchterm)
    if searchterm == "wotd":
        e.output = get_urbandictionary_wotd()
        return e

    if searchterm == "":
        url = "http://www.urbandictionary.com/random.php"

    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', "Opera/9.10 (YourMom 8.0)")]
        pagetmp = opener.open(url)
        page = pagetmp.read()
        url = pagetmp.geturl()
        opener.close()

        page = BeautifulSoup(page)
        first_definition = ""

        if page.find(id='not_defined_yet') != None:
            return None

        ## depending on the search results the first word may be contained directly under the <td class='word'> tag
        ## or it may be the text contents of a <a href> tag
        ## we first try to get it from inside a <td><a href>[word]</a></td> type structure
        ## if that fails, get the word under the initial <td> tag

        try:
            first_word = page.findAll('td',
                                      attrs={"class":
                                             "word"})[0].contents[1].string
        except:
            first_word = page.findAll('td',
                                      attrs={"class":
                                             "word"})[0].contents[0].string

        first_word = first_word.replace("\n", "")
        #first_word = first_word.encode("utf-8", 'ignore')

        for content in page.findAll('div', attrs={"class":
                                                  "definition"})[0].contents:
            if content.string != None:
                first_definition += content.string

        #first_definition = first_definition.encode("utf-8", 'ignore')
        first_definition = tools.decode_htmlentities(first_definition).encode(
            "utf-8", 'ignore')
        first_word = tools.decode_htmlentities(first_word).encode(
            "utf-8", 'ignore')

        first_definition = first_definition.replace("\n", " ")
        first_definition = first_definition.replace("\r", " ")
        first_definition = first_definition[0:392]

        first_definition = (
            (first_word + ": " + first_definition).decode("utf-8", 'ignore') +
            " [ %s ]" % tools.shorten_url(url)).encode('utf-8', 'ignore')
        #print first_definition
        e.output = first_definition
        return e

    except:
        print "!ud %s went wrong" % searchterm
        return
Code Example #17
def get_imdb(self, e, urlposted=False):
    #reads title, rating, and movie description of movie titles
    searchterm = e.input
    if urlposted:
        url = searchterm
    else:
        url = tools.google_url("site:imdb.com/title " + searchterm,
                               "imdb.com/title/tt\\d{7}/")

    title = ""
    if not url:
        pass
    elif url.find("imdb.com/title/tt") != -1:
        try:
            movietitle = ""
            rating = ""
            summary = ""
            imdbid = re.search("tt\\d{7}", url)
            imdburl = ('http://www.imdb.com/title/' + imdbid.group(0) + '/')
            opener = urllib2.build_opener()
            opener.addheaders = [('User-Agent', "Opera/9.10 (YourMom 8.0)"),
                                 ('Range', "bytes=0-40960")]
            pagetmp = opener.open(imdburl)
            page = BeautifulSoup(pagetmp.read(40960))
            opener.close()

            movietitle = tools.decode_htmlentities(
                tools.remove_html_tags(str(page.find('title'))).replace(
                    " - IMDb", ""))
            movietitle = movietitle.replace("IMDb - ", "")
            movietitle = "Title: " + movietitle

            if page.find(id="overview-top") != None:
                page = page.find(id="overview-top").extract()

                if page.find("div", "star-box-giga-star") != None:
                    rating = tools.remove_html_tags(
                        str(page.find("div", "star-box-giga-star").text))
                    rating = " - Rating: " + rating

                if len(page.findAll('p')) == 2:

                    summary = str(page.findAll('p')[1])

                    removelink = re.compile(r'\<a.*\/a\>')
                    summary = removelink.sub('', summary)
                    summary = tools.remove_html_tags(summary)
                    summary = summary.replace('&raquo;', "")
                    summary = tools.decode_htmlentities(
                        summary.decode("utf-8", 'ignore'))
                    summary = re.sub("\&.*?\;", " ", summary)
                    summary = summary.replace("\n", " ")
                    summary = " - " + summary

            title = movietitle + rating + summary
            if not urlposted:
                title = title + " [ %s ]" % url

            e.output = title.encode('utf-8', 'ignore')

            return e
        except Exception as inst:
            print "!imdb " + searchterm + ": " + str(inst)
            return None
Code Example #18
File: url.py Project: cactauz/genmaybot
def url_posted(self, e):
    url = e.input
    # checks if the URL is a dupe (if mysql is enabled)
    # detects if a wikipedia or imdb url is posted and does the appropriate command for it

    try:

        repost = ""
        days = ""

        if tools.config.sqlmode > 0:
            urlhash = hashlib.sha224(url).hexdigest()

            conn = MySQLdb.connect(
                host="localhost", user=tools.config.sqlusername, passwd=tools.config.sqlpassword, db="irc_links"
            )
            cursor = conn.cursor()
            query = "SELECT reposted, timestamp FROM links WHERE hash='%s'" % urlhash
            result = cursor.execute(query)

            if result != 0:
                result = cursor.fetchone()

                repost = "LOL REPOST %s " % (result[0] + 1)

                orig = result[1]
                now = datetime.datetime.now()
                delta = now - orig

                plural = ""
                if delta.days > 0:
                    if delta.days > 1:
                        plural = "s"
                    days = " (posted %s day%s ago)" % (str(delta.days), plural)
                else:
                    hrs = int(round(delta.seconds / 3600.0, 0))
                    if hrs == 0:
                        mins = delta.seconds / 60
                        if mins > 1:
                            plural = "s"
                        days = " (posted %s minute%s ago)" % (str(mins), plural)
                        if mins == 0:
                            repost = ""
                            days = ""
                    else:
                        if hrs > 1:
                            plural = "s"
                        days = " (posted %s hour%s ago)" % (str(hrs), plural)

        title = ""
        # initialize so these names exist even if the !wiki / !imdb lookups below raise
        wiki = None
        imdb = None

        try:
            wiki = self.bangcommands["!wiki"](self, e, True)
        except:
            pass
        try:
            imdb = self.bangcommands["!imdb"](self, e, True)
        except:
            pass
        if wiki and wiki.output:
            title = wiki.output
        elif imdb and imdb.output:
            title = imdb.output
        else:
            if url.find("imgur.com") != -1:
                imgurid = url[url.rfind("/") + 1 : url.rfind("/") + 6]
                url = "http://imgur.com/" + imgurid
            title = get_title(url)
            if title.find("imgur: the simple") != -1:
                title = ""

        title = title.replace("\n", " ")
        title = re.sub("\s+", " ", title)
        pattern = re.compile("whatsisname", re.IGNORECASE)
        title = pattern.sub("", title)
        title = tools.decode_htmlentities(title.decode("utf-8", "ignore")).encode("utf-8", "ignore")

        titler = "%s%s%s" % (repost, title, days)

        if tools.config.sqlmode == 2:
            title = MySQLdb.escape_string(title)
            url = MySQLdb.escape_string(url)
            query = (
                "INSERT INTO links (url, title, hash) VALUES ('%s','%s','%s') ON DUPLICATE KEY UPDATE reposted=reposted+1,title='%s'"
                % (url, title, urlhash, title)
            )
            cursor.execute(query)
        if tools.config.sqlmode > 0:
            conn.close()

        e.output = titler
        return e

    except Exception as inst:
        print url + ": " + str(inst)
        pass
    return