Example #1
    def crawl_url(self, url, level=0):
        # Assumes a module-level "from urlparse import urljoin"
        # (urllib.parse.urljoin on Python 3).
        # If the user provides 'http://www.google.com', append a '/' to it.
        if url.count("/") == 2:
            url += "/"

        code = self.__simpleGetRequest(url)
        domain = self.getDomain(url, True)

        if code is None:
            return

        try:
            soup = BeautifulSoup(code)
        except Exception:
            return

        for tag in soup.findAll('a'):
            # tag.get() returns None for anchors without an href attribute.
            new_url = tag.get('href')
            if new_url is None or new_url.startswith(("#", "javascript:")):
                continue

            isCool = False
            if new_url.startswith(("http://", "https://")):
                # Absolute link: follow it only if it stays on our domain.
                if new_url.lower().startswith(domain.lower()):
                    isCool = True
            else:
                # Relative link: resolve it against the current page.
                # (os.path.join is wrong for URLs; urljoin handles both
                # root-relative and page-relative paths portably.)
                new_url = urljoin(url, new_url)
                isCool = True

            # Skip links that are already queued.
            if isCool and self.isURLinPool(new_url):
                isCool = False

            if isCool:
                # Match suffixes against the path only, not the query string.
                tmpUrl = new_url.split("?", 1)[0]
                for suffix in self.goodTypes:
                    if tmpUrl.endswith(suffix):
                        if level + 1 <= self.config["p_depth"]:
                            self.urlpool.append((new_url, level + 1))
                        break
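Nothing in this snippet drains self.urlpool; it only enqueues (url, level) pairs. A minimal driver-loop sketch for the same hypothetical crawler class follows; the crawl method name, the visited set, and the starting depth are illustrative, not from the source:

    def crawl(self, start_url):
        # Breadth-first drain of the pool that crawl_url() fills.
        self.urlpool = [(start_url, 0)]
        visited = set()
        while self.urlpool:
            url, level = self.urlpool.pop(0)
            if url in visited:
                continue
            visited.add(url)
            self.crawl_url(url, level)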
Example #2
import re
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup


def google(search, tokenizer):
    # tokenizer is accepted by the original signature but never used here.
    # urllib.quote_plus escapes spaces and other unsafe query characters
    # (the original's replace(" ", "%20") handled only spaces).
    url = 'http://www.google.com/search?q=' + urllib.quote_plus(search)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html)
    # Result links appear as redirect URLs of the form "/url?q=<target>&sa=...";
    # prettify() escapes "&" to "&amp;". Matching the pattern over the whole
    # document is more robust than the original's fixed slice at line 167.
    links = re.findall(r"url\?q=(.+?)&amp;s", soup.prettify())
    return links
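A hedged usage sketch: since tokenizer is never used by the function, None is fine, and the query string is purely illustrative.

links = google("beautifulsoup html parsing", None)
for link in links:
    print(link)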
Example #3
File: kq.py Project: caozhzh/rss
import os

os.chdir('/home/caozhzh/work/rss')

import re
import urllib
import time
import datetime
import uuid

import feedparser
from feedformatter import Feed
from xgoogle.BeautifulSoup import BeautifulSoup
from xgoogle.browser import Browser, BrowserError
from xgoogle.GeneralFetch import GeneralFetch

url = "http://blog.sina.com.cn/u/1696709200"
b = Browser()
page = b.get_page(url)
# Strip IE conditional comments; they confuse the bundled BeautifulSoup.
page = page.replace('<!--[if lte IE 6]>', '')
page = page.replace('<![endif]-->', '')
# print page

be = BeautifulSoup(page)
div = be.find('div', {'class': 'diywidget'})
txt = ''.join(div.findAll(text=True))
# print type(txt)

origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml')

# Create the feed
feed = Feed()
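The script stops right after constructing the Feed, so the step that copies origin_feed's entries into it is missing. A hedged sketch of what that continuation might look like, based on feedformatter's Feed.feed dict and Feed.items list; the field choices, the use of txt as the description, and the RSS 2.0 rendering are assumptions, not from the source file.

# Assumed continuation, not from the source file.
feed.feed["title"] = origin_feed.feed.get("title", "kq feed")
feed.feed["link"] = url
feed.feed["pubDate"] = time.localtime()

for entry in origin_feed.entries:
    item = {}
    item["title"] = entry.get("title", "")
    item["link"] = entry.get("link", "")
    item["description"] = txt  # widget text scraped above (assumption)
    item["guid"] = str(uuid.uuid4())
    item["pubDate"] = time.localtime()
    feed.items.append(item)

print(feed.format_rss2_string())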