def test_title_is_only_shown_once(self):
    """The page title must occur exactly once in the rendered plain text."""
    self._grant_permission('anonymous', 'TRAC_ADMIN')
    page = create_tagged_page(self.env, self.req(), 'Foo',
                              '= SomeTitle =\ncontent', ('blog',))
    page.save(None, None, '127.0.0.1')
    soup = BeautifulSoup(self._expand_macro())
    # Count in markup-stripped text: Trac also embeds the title in the dom
    # node id and in an anchor pointing at the heading, so counting raw
    # HTML would over-count.
    stripped_text = ''.join(soup.div(text=True))
    occurrences = re.findall('SomeTitle', stripped_text)
    assert_length(1, occurrences)
def parse(self):
    """Scrape the Wired front page and feed every article to parse_article."""
    page_content = urllib2.urlopen(self.base_url).read()

    def _section(pattern):
        # Carve one region out of the front page and parse it in isolation.
        return BeautifulSoup(re.search(pattern, page_content, re.DOTALL).group(0))

    this_month = _section('<div id="this_month">.*<div class="matchbook_rain_light">')
    magazine = _section('<div id="mag_package">.*<div id="mag_archive">')

    # The cover story is linked via an image map, not a plain anchor.
    main_url = re.search('<area.*?href="(?P<url>.*?)"', page_content).group('url')
    url = 'http://www.wired.com/print%s' % main_url
    main_article_content = urllib2.urlopen(url).read()
    main_title = re.search('<h1 id="articlehed">(?P<title>.*?)</h1>',
                           main_article_content).group('title')
    self.parse_article(main_title, url)

    # Collect the anchors from each section/class pair, in page order.
    anchors = []
    for section_soup, css_class in ((this_month, 'story'),
                                    (magazine, 'headline_image'),
                                    (magazine, 'headline')):
        for story_div in section_soup.div(attrs={'class': css_class}):
            anchors.append(story_div.find('a'))

    for anchor in anchors:
        title = self.text(anchor)
        self.parse_article(title, 'http://www.wired.com/print%s' % anchor['href'])
def getMeta(self, url):
    """Fetch a gtnpdatabase.org/boreholes/view/#### page and collect its metadata.

    Populates self.cur_siteMeta with the page's "Key:Value" table rows plus
    URL/index/Name entries, then writes them to
    <out_dir>/<buildNameStringBH()>_metadata.csv.

    On any fetch/parse failure the URL is appended (once) to self.failedURL
    and the method returns without raising — best-effort scraping.
    """
    try:
        html = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html)
    except Exception:
        # Fixed: the original did `print("... %s ...") % url`, applying `%`
        # outside the print — format inside the call instead. Also avoid a
        # bare `except:` that would swallow KeyboardInterrupt/SystemExit.
        print("Page not found. Error 404. %s added to list of failed sites" % url)
        if url not in self.failedURL:
            self.failedURL.append(url)
        return

    # Page name shown in the form header.
    # TODO: assert this is the same as the page name somewhere else?
    pageName = soup.div(id="formHeader")[0].h1.text

    # Each <tr> holds one "Key:Value" entry; normalize to plain ASCII.
    meta = [x.text for x in soup.findAll("tr")]
    meta = [unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            for x in meta]

    metaDict = dict()
    for info in meta:
        if ":" in info:
            info = re.sub("°", "deg", info)  # degree sign is not CSV/ASCII safe
            info = re.sub(" ", "", info)
            # Split on the FIRST colon only so values containing ':' (e.g.
            # times or URLs) are kept whole; the original split dropped them.
            key, value = info.split(":", 1)
            metaDict[key] = value

    index = re.search(r"view/(\d+)", url).group(1)
    metaDict["URL"] = url
    metaDict["index"] = index
    metaDict["Name"] = pageName
    self.cur_siteMeta = metaDict
    outstr = self.buildNameStringBH()

    # Write one key,value row per metadata entry; `with` closes the file
    # (the original leaked the handle opened inside csv.writer(open(...))).
    metafile = self.out_dir + "/" + outstr + "_metadata.csv"
    with open(metafile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for key, value in self.cur_siteMeta.items():
            writer.writerow([key, value])
def getMeta(self, url):
    """Fetch a gtnpdatabase.org/boreholes/view/#### page and collect its metadata.

    Populates self.cur_siteMeta with the page's "Key:Value" table rows plus
    URL/index/Name entries, then writes them to
    <out_dir>/<buildNameStringBH()>_metadata.csv.

    On any fetch/parse failure the URL is appended (once) to self.failedURL
    and the method returns without raising — best-effort scraping.
    """
    try:
        html = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html)
    except Exception:
        # Fixed: the original did `print("... %s ...") % url`, applying `%`
        # outside the print — format inside the call instead. Also avoid a
        # bare `except:` that would swallow KeyboardInterrupt/SystemExit.
        print("Page not found. Error 404. %s added to list of failed sites" % url)
        if url not in self.failedURL:
            self.failedURL.append(url)
        return

    # Page name shown in the form header.
    # TODO: assert this is the same as the page name somewhere else?
    pageName = soup.div(id="formHeader")[0].h1.text

    # Each <tr> holds one "Key:Value" entry; normalize to plain ASCII.
    meta = [x.text for x in soup.findAll("tr")]
    meta = [unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            for x in meta]

    metaDict = dict()
    for info in meta:
        if ":" in info:
            info = re.sub("°", "deg", info)  # degree sign is not CSV/ASCII safe
            info = re.sub(" ", "", info)
            # Split on the FIRST colon only so values containing ':' (e.g.
            # times or URLs) are kept whole; the original split dropped them.
            key, value = info.split(":", 1)
            metaDict[key] = value

    index = re.search(r"view/(\d+)", url).group(1)
    metaDict["URL"] = url
    metaDict["index"] = index
    metaDict["Name"] = pageName
    self.cur_siteMeta = metaDict
    outstr = self.buildNameStringBH()

    # Write one key,value row per metadata entry; `with` closes the file
    # (the original leaked the handle opened inside csv.writer(open(...))).
    metafile = self.out_dir + "/" + outstr + "_metadata.csv"
    with open(metafile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for key, value in self.cur_siteMeta.items():
            writer.writerow([key, value])
#!/usr/bin/env python from BeautifulSoup import BeautifulSoup import urllib2 import requests import posixpath url = "http://4sq.com/18aGENW" headers = {"User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0"} request = requests.get(url, headers=headers) final_url = request.url parsed = urllib2.urlparse.urlparse(final_url) query = parsed.query signature = urllib2.urlparse.parse_qs(query)["s"][0] checkin_id = posixpath.basename(parsed.path) user = posixpath.dirname(parsed.path).split('/')[1] soup = BeautifulSoup(request.text) venue_push = soup.div(attrs={"class": "venue push"})[0] screen_name = venue_push.h1.strong.text venue = venue_push.a["href"] print "Checkin %s is for User \"%s\" with Name \"%s\" checking in at %s"\ % (checkin_id, user, screen_name, posixpath.basename(venue))