def get_lyrics_from_url(self, url):
    page = geturl(url, referer=self.baseurl)
    soup = BeautifulSoup(page)
    content = soup.find('div', attrs={'id': 'content'})
    # strip navigation divs, links and scripts embedded in the lyrics block
    for tag in content.findAll(['div', 'a', 'script']):
        tag.extract()
    data = ''.join(str(line) for line in content.contents)
    data = self._newline.sub('', data)
    data = self._leadbreak.sub('', data)
    data = self._endbreak.sub('', data)
    lines = self._break.split(data)
    # group lines into verses, using blank lines as separators
    verses = []
    while True:
        try:
            i = lines.index('')
            verse, lines = lines[:i], lines[i + 1:]
            verses.append(verse)
        except ValueError:
            verses.append(lines)
            break
    for i, verse in enumerate(verses):
        verse = ' / '.join(verse)
        verse = whitespace.sub(' ', verse)
        verses[i] = verse
    if self._spam in verses:
        verses.remove(self._spam)
    return verses
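# A minimal, self-contained sketch of the tag-stripping technique used above,
# against the BeautifulSoup 3 API these modules target. The sample HTML and
# the strip_chrome name are illustrative, not part of the original module.
from BeautifulSoup import BeautifulSoup

def strip_chrome(html):
    soup = BeautifulSoup(html)
    content = soup.find('div', attrs={'id': 'content'})
    # extract() detaches each matched node from the tree in place
    for tag in content.findAll(['div', 'a', 'script']):
        tag.extract()
    return ''.join(str(node) for node in content.contents)

# >>> strip_chrome('<div id="content">la la<script>x()</script></div>')
# 'la la'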
def clock(self, query):
    """Use google to look up time in a given location"""
    try:
        doc = self.ua.open(self.search, {'q': 'time in %s' % query})
        soup = BeautifulSoup(doc)
        time = soup.find('img', src=self.clock_re).findNext('td')
        try:
            time.find('table').extract()
        except AttributeError:
            pass
        return stripHTML(time.renderContents().decode('utf-8')).strip()
    except:
        pass
def bodycount(self):
    try:
        doc = geturl(IraqWar._bodycount_url)
        soup = BeautifulSoup(doc)
        data = soup.find('td', attrs={'class': 'main-num'})
        data = data.find('a')
        data = str(data.contents[0])
        data = stripHTML(data)
        data = IraqWar._re_whitespace.sub(' ', data)
        data = data.strip()
        return data
    except Exception, e:
        log.warn('error in %s: %s' % (self.__module__, e))
        log.exception(e)
        return 'UNKNOWN'
def response(self, nick, args, kwargs):
    try:
        doc = geturl(self.random)
        soup = BeautifulSoup(doc)
        confs = soup.findAll('div', attrs={'class': 'content'})[3:]
        conf = random.choice(confs)
        conf = ' '.join(str(p) for p in conf.findAll('p'))
        conf = stripHTML(conf)
        conf = conf.strip()
        return conf
    except Exception, e:
        log.warn('error in %s: %s' % (self.__module__, e))
        log.exception(e)
        return '%s: I had some issues with that..' % nick
def get_comment(self):
    page = geturl(self.url)

    # remove high ascii since this is going to IRC
    page = self.utf8.sub('', page)

    # create BeautifulSoup document tree
    soup = BeautifulSoup(page)
    table = soup.find('table')
    rows = table.findAll('tr')
    row = rows[1]
    cells = row.findAll('td')
    source = cells[1].string
    comment = cells[2].string
    author = cells[3].string
    return '<%s@%s> %s' % (author, source, comment)
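# A quick interpreter-style sketch of the table navigation above, using
# illustrative sample markup (BeautifulSoup 3 returns NavigableStrings
# from .string):
# >>> soup = BeautifulSoup('<table><tr><td>hdr</td></tr>'
# ...                      '<tr><td>1</td><td>src</td><td>hi</td><td>me</td></tr></table>')
# >>> cells = soup.find('table').findAll('tr')[1].findAll('td')
# >>> cells[1].string, cells[2].string, cells[3].string
# (u'src', u'hi', u'me')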
def response(self, nick, args, kwargs):
    try:
        doc = geturl(self.random, add_headers={'Accept': '*/*'})
        soup = BeautifulSoup(doc)
        main = soup.find(u'div', attrs={u'id': u'main'})
        confs = main.findAll(u'div', attrs={u'class': u'content'})
        conf = random.choice(confs)
        conf = u' '.join(unicode(p) for p in conf.findAll(u'p'))
        conf = stripHTML(conf)
        conf = conf.strip()
        return conf
    except Exception, error:
        log.warn(u'error in module %s' % self.__module__)
        log.exception(error)
        return u'%s: I had some issues with that..' % nick
def response(self, nick, args, kwargs):
    try:
        fail = BeautifulSoup(geturl(self.url)).h1
        return self.spaces_re.sub(" ", stripHTML(
            u"%s: %s: %s %s: %s" % (
                nick,
                self.col("red", text="FAIL"),
                self.fail_re.search(fail.renderContents()).group(1),
                self.col("green", text="FIX"),
                self.fail_re.search(
                    fail.findNext("h1").renderContents()).group(1))))
    except Exception, error:
        log.warn("error in module %s" % self.__module__)
        log.exception(error)
        return u"%s: Too much fail for technobabble" % nick
def rate_rt(self, name):
    """Rating from rotten tomatoes"""
    page = geturl(self.rt_search, {'search': name}, referer=self.rt_url)
    soup = BeautifulSoup(page)
    for table in soup.body('table'):
        if table.caption.renderContents() == 'Movies':
            break
    else:
        raise ValueError('no movies found in search results')
    name = self.normalize(name)
    for row in table.tbody('tr'):
        link = row.a
        if self.normalize(link.renderContents()) == name:
            url = urljoin(self.rt_url, link['href'])
            break
    else:
        raise ValueError('no exact matches')
    soup = BeautifulSoup(geturl(url, referer=self.rt_search))
    info = soup.body.find('div', 'movie_info_area')
    return stripHTML(info.h1.renderContents()), info.a['title']
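# rate_rt leans on Python's for/else idiom twice: the else clause runs only
# when the loop finishes without hitting break. A tiny self-contained
# illustration (find_exact and the sample data are hypothetical):
def find_exact(rows, wanted):
    for row in rows:
        if row == wanted:
            break
    else:
        raise ValueError('no exact matches')
    return row

# >>> find_exact(['alien', 'aliens'], 'aliens')
# 'aliens'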
def get_soup(self, query):
    if isinstance(query, (list, tuple)):
        query = u' '.join(query)

    # load page
    if query == u'random':
        opts = {}
        url = urljoin(self.base_url, self.random_path)
    else:
        opts = {u'search': query, u'go': u'Go'}
        url = urljoin(self.base_url, self.search_path)
    page = geturl(url, referer=self.base_url, opts=opts,
                  size=self.sample_size)

    # create BeautifulSoup document tree
    soup = BeautifulSoup(page)

    # extract title minus WP advert
    title = soup.title.string.replace(self.advert, u'')

    # remove all tabular data/sidebars
    for table in soup.findAll(u'table'):
        table.extract()

    # remove disambiguation links
    for dablink in soup.findAll(u'div', attrs={u'class': u'dablink'}):
        dablink.extract()

    # remove latitude/longitude metadata for places
    for coord in soup.findAll(u'span', attrs={u'id': u'coordinates'}):
        coord.extract()

    # strip non-english content wrappers
    for span in soup.findAll(u'span', attrs={u'lang': True}):
        span.extract()

    # remove IPA pronunciation guidelines
    for span in soup.findAll(u'span', attrs={u'class': u'IPA'}):
        span.extract()
    for link in soup.findAll(u'a', text=u'IPA'):
        link.extract()
    for span in soup.findAll(u'span', attrs={u'class': Wiki._audio}):
        span.extract()

    return soup, title
def _getpage(self, url, opts=None):
    page = geturl(url, referer=self.baseurl, opts=opts)

    # HTMLParser doesn't handle this very well.. see:
    # http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
    page = self.scripts_re.sub('', page)
    soup = BeautifulSoup(page)

    # get page title
    title = soup.title.string
    if self.advert and self.advert in title:
        title = title.replace(self.advert, '')

    # remove all tabular data/sidebars
    for table in soup.findAll('table'):
        table.extract()

    # remove disambiguation links
    for div in soup.findAll('div', 'dablink'):
        div.extract()

    # remove latitude/longitude metadata for places
    for span in soup.findAll('span', id='coordinates'):
        span.extract()

    # strip non-english content wrappers
    for span in soup.findAll('span', lang=True):
        span.extract()

    # remove IPA pronunciation guidelines
    for span in soup.findAll('span', 'IPA'):
        span.extract()
    for a in soup.findAll('a', text='IPA'):
        a.extract()
    for span in soup.findAll('span', 'audiolink'):
        span.extract()

    return soup, title
def forecast(self, location):
    page = geturl(url=self.search, opts={'query': location},
                  referer=self.baseurl)
    soup = BeautifulSoup(page)

    # disambiguation page: look for an exact city match
    if 'Search Results' in str(soup):
        table = soup.find('table', attrs={'class': 'boxB full'})
        rows = table.findAll('tr')
        results = []
        match = None
        for row in rows:
            cells = row.findAll('td', attrs={'class': 'sortC'})
            for cell in cells:
                link = cell.find('a')
                if link is None or 'addfav' in str(link['href']):
                    continue
                city = str(link.contents[0])
                href = urljoin(self.baseurl, str(link['href']))
                results.append(city)
                if city.lower() == location.lower():
                    match = href
                    break
            if match:
                break
        if match:
            page = geturl(url=match)
            soup = BeautifulSoup(page)
        else:
            return 'Multiple results found: %s' % ', '.join(results)

    rss_url = soup.find('link', attrs=self._rss_link)['href']
    rss = rssparser.parse(rss_url)
    title = str(soup.find('h1').string).strip()

    # conditions come as pipe-delimited "key: value" fields
    conditions = stripHTML(rss['items'][0]['description'])
    fields = self._bar.split(conditions)
    data = {}
    for field in fields:
        try:
            key, val = self._keyval.search(field).groups()
            data[key] = val
        except:
            pass

    # colorize the temperature with mIRC color codes
    try:
        temp = float(self._tempF.search(data['Temperature']).group(1))
        blink = False
        if temp < 0:
            color = 6
        elif temp < 40:
            color = 2
        elif temp < 60:
            color = 10
        elif temp < 80:
            color = 3
        elif temp < 90:
            color = 7
        elif temp < 100:
            color = 5
        else:
            color = 5
            blink = True
        data['Temperature'] = '\x03%s\x16\x16%s\x0F' % (
                color, data['Temperature'])
        if blink:
            data['Temperature'] = '\x1b[5m' + data['Temperature'] + '\x1b[0m'
    except:
        pass

    output = ['%s: %s' % (key, val) for key, val in data.items()]
    return '%s: %s' % (title, ' | '.join(output))
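# The \x03 escapes above are mIRC color codes: \x03 starts a color, the
# following number selects the foreground, and \x0F resets formatting. A
# small standalone sketch of the temperature bucketing (colorize_temp is a
# hypothetical name; color numbers follow the module's own mapping):
def colorize_temp(text, temp):
    if temp < 0:
        color = 6    # purple
    elif temp < 40:
        color = 2    # blue
    elif temp < 60:
        color = 10   # teal
    elif temp < 80:
        color = 3    # green
    elif temp < 90:
        color = 7    # orange
    else:
        color = 5    # red/maroon
    # \x16\x16 (reverse video toggled twice) keeps a leading digit in text
    # from being read as part of the color number
    return '\x03%d\x16\x16%s\x0f' % (color, text)

# >>> colorize_temp('72F', 72.0)
# '\x033\x16\x1672F\x0f'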
import sys
from BeautifulSoup import BeautifulSoup


class PageProcessor(object):

    def __init__(self, html):
        self.html = html
        self.soup = BeautifulSoup(self.html)

    def isFirstPage(self):
        return self.soup.find('div', {'class': 'userMsg',
                                      'id': 'firstPostText'}) is not None

    def getTitle(self):
        if self.isFirstPage():
            title = self.soup.find('div', {'class': 'post_title'}).findAll(
                    lambda tag: tag.name == 'a', text=True)
            title = ''.join(title)
            title = title.replace('google_ad_region_start=title', '')
            title = title.replace('google_ad_region_end=title', '')
            title = title.replace('Archived From: Hot Deals', '')
            title = title.replace('&', '')
            title = title.replace('\n', '')
            return title.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getRating(self):
        pass

    def getReplyNum(self):
        pass

    def getViewNum(self):
        pass

    def getPostTime(self):
        if self.isFirstPage():
            time = self.soup.find('div', {'class': 'post_date'}).findAll(
                    lambda tag: tag.name != 'b', text=True)
            time = ''.join(time)
            time = time.replace('posted:', '')
            time = time.replace('updated:', '')
            time = time.replace('\n', '')
            return time.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getDescription(self):
        if self.isFirstPage():
            content = self.soup.find('div', {'class': 'userMsg',
                                             'id': 'firstPostText'}).findAll(
                    lambda tag: tag.name == 'table', text=True)
            return ''.join(content[1:-1]).strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getCategory(self):
        pass

    def getFeedback(self):
        pass

    def getUser(self):
        if self.isFirstPage():
            username = self.soup.find('li', {'class': 'user_name'}).findAll(
                    lambda tag: tag.name != 'span', text=True)
            return ''.join(username).strip()
        else:
            print >> sys.stderr, 'it is not the first page'
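# Hypothetical usage sketch for PageProcessor, with stand-in HTML shaped like
# the forum pages it expects (div.post_title, div#firstPostText, etc.):
if __name__ == '__main__':
    html = '''<div class="post_title"><a>Sample deal title</a></div>
              <div class="userMsg" id="firstPostText">...</div>'''
    proc = PageProcessor(html)
    if proc.isFirstPage():
        print proc.getTitle()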
def forecast(self, location):
    page = geturl(url=self.search, opts={u'query': location},
                  referer=self.baseurl)
    soup = BeautifulSoup(page)

    # disambiguation page: pick an exact city match, else the first hit
    if u'Search Results' in unicode(soup):
        table = soup.find(u'table', attrs={u'class': u'dataTable'})
        tbody = soup.find(u'tbody')
        results = [row.findAll(u'td')[0].find(u'a')
                   for row in tbody.findAll(u'tr')]
        results = [(normalize(unicode(result.contents[0])),
                    urljoin(Weather.baseurl, unicode(result[u'href'])))
                   for result in results]
        match = None
        for result in results:
            if result[0] == normalize(location):
                match = result[1]
                break
        if match is None:
            match = results[0][1]
        page = geturl(url=match, referer=self.search)
        soup = BeautifulSoup(page)

    title = soup.find(u'h1').string.strip()
    rss_url = soup.find(u'link', attrs=self._rss_link)[u'href']
    rss = feedparser.parse(rss_url)
    conditions = rss.entries[0].description

    # XXX ok, here's the deal. this page has raw utf-8 bytes encoded
    # as html entities, and in some cases latin1. this demonstrates a
    # total misunderstanding of how unicode works on the part of the
    # authors, so we need to jump through some hoops to make it work
    conditions = conditions.encode(u'raw-unicode-escape')
    conditions = stripHTML(conditions)
    conditions = encoding.convert(conditions)

    fields = self._bar.split(conditions)
    data = {}
    for field in fields:
        try:
            key, val = self._keyval.search(field).groups()
            data[key] = val
        except:
            pass

    try:
        temp = float(self._tempF.search(data[u'Temperature']).group(1))
        blink = False
        if temp < 0:
            color = u'magenta'
        elif temp < 40:
            color = u'blue'
        elif temp < 60:
            color = u'cyan'
        elif temp < 80:
            color = u'green'
        elif temp < 90:
            color = u'yellow'
        elif temp < 100:
            color = u'red'
        else:
            color = u'red'
            blink = True
        data[u'Temperature'] = self.colorlib.get_color(
                color, text=data[u'Temperature'])
        # XXX this seems ill-conceived
        if blink:
            data[u'Temperature'] = (u'\x1b[5m' + data[u'Temperature'] +
                                    u'\x1b[0m')
    except Exception, error:
        log.exception(error)

    # build the final "key: value" output line
    output = [u'%s: %s' % (key, val) for key, val in data.items()]
    return u'%s: %s' % (title, u' | '.join(output))
def get_quote(self, symbol):
    url = Yahoo._quote_url.replace('SYMBOL', symbol)
    page = geturl(url)
    soup = BeautifulSoup(page)
    company = ' '.join(str(item) for item in soup.find('h1').contents)
    company = stripHTML(company)
    table = soup.findAll('table')[0]
    rows = table.findAll('tr')
    data = {}
    current_value = 0.0
    open_value = 0.0
    for row in rows:
        key, val = row.findAll('td')
        key = str(key.contents[0])
        if key == 'Change:':
            try:
                img = val.find('img')
                alt = str(img['alt'])
                val = alt + stripHTML(str(val.contents[0]))
            except:
                val = '0.00%'
        elif key == 'Ask:':
            continue
        else:
            val = stripHTML(str(val.contents[0]))
        val = val.replace(',', '')
        if Yahoo._isfloat.search(val):
            val = float(val)
        data[key] = val
        if key in ('Last Trade:', 'Index Value:'):
            current_value = val
        elif key == 'Prev Close:':
            open_value = val

    # see if we can calculate percentage
    try:
        change = 100 * (current_value - open_value) / open_value
        data['Change:'] += ' (%.2f%%)' % change
    except:
        pass

    # try and colorize the change field
    try:
        if 'Up' in data['Change:']:
            data['Change:'] = self._green + data['Change:'] + self._reset
        elif 'Down' in data['Change:']:
            data['Change:'] = self._red + data['Change:'] + self._reset
    except:
        pass

    # build friendly output
    output = []
    for key, val in data.items():
        if isinstance(val, float):
            val = '%.2f' % val
        output.append('%s %s' % (key, val))
    return '%s - %s' % (company, ' | '.join(output))
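# The percentage above is plain (current - open) / open * 100; a quick worked
# check with illustrative numbers:
# >>> current_value, open_value = 102.5, 100.0
# >>> '%.2f%%' % (100 * (current_value - open_value) / open_value)
# '2.50%'
# The bare except around it covers the open_value == 0.0 case (division by
# zero when 'Prev Close:' was never seen), in which case no percentage is
# appended.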