# Python 2-era imports assumed by this snippet (urllib.urlopen, cStringIO,
# BeautifulSoup 3); FileStorage is assumed to be werkzeug's.
import urllib
import cStringIO
from urlparse import urlparse, urlunsplit

from BeautifulSoup import BeautifulSoup
from PIL import Image
from werkzeug.datastructures import FileStorage


def get_ico_host(resp, name=None):
    """Fetch the favicon of the page behind ``resp`` and return it wrapped
    in a FileStorage, or None if no icon could be retrieved.

    ``resp`` is expected to expose ``.text``, ``.url`` and a ``.peer``
    (address, port) tuple.
    """
    # Look for an explicit icon <link>; sites vary in how they spell rel.
    data = False
    for rel in ('shortcut icon', 'icon', 'SHORTCUT ICON'):
        try:
            data = BeautifulSoup(resp.text).findAll('link', rel=rel)[0]['href']
            break
        except Exception:
            pass
    if not data:
        # No <link> found: fall back to the conventional /favicon.ico path.
        delimiter = '' if resp.url.endswith('/') else '/'
        url_ico = '%s%s%s' % (resp.url, delimiter, 'favicon.ico')
    elif data.startswith('//'):
        # Protocol-relative url: reuse the scheme of the original request.
        url_ico = '%s:%s' % (urlparse(resp.url).scheme, data)
    elif data.startswith('/') or '/' not in data:
        # Path relative to the site root.
        url_ico = urlunsplit((urlparse(resp.url).scheme,
                              urlparse(resp.url).hostname, data, None, None))
    else:
        url_ico = data
    try:
        response_data = urllib.urlopen(url_ico).read()
    except Exception:
        return None
    try:
        stream = cStringIO.StringIO(response_data)
    except Exception:
        return None
    try:
        img = Image.open(stream)  # only used to sniff the image format
    except Exception:
        return None
    if name is None:
        name = '%s%s' % (resp.url.replace('/', '').replace(':', '-'),
                         resp.peer[1])
    stream.seek(0)  # Image.open consumed the stream; rewind before wrapping it
    return FileStorage(stream=stream,
                       filename='%s.%s' % (name, img.format.lower()))
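
# Usage sketch (not from the original source). get_ico_host also reads
# resp.peer, which a plain requests Response does not carry, so a dummy
# (host, port) tuple is attached here purely for illustration.
def _demo_get_ico_host():
    import requests
    resp = requests.get('http://example.com/')
    resp.peer = ('0.0.0.0', 80)  # hypothetical peer tuple, not a requests field
    icon = get_ico_host(resp)
    if icon is not None:
        print('fetched icon: %s' % icon.filename)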
import datetime

import requests
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3: attrs is a list of tuples
from lxml import etree  # assumption: the original only references etree


def scrape():
    '''Retrieve the headline and link url of the current top story on
    Google News.'''
    # Using RSS so we're thriftier and don't need to parse html.
    content_url = 'http://news.google.com/news/url?output=rss'
    try:
        content = requests.get(content_url)
    except requests.ConnectionError:
        print("Error loading :: {0}".format(content_url))
        return None
    except requests.Timeout:
        print("Timed out :: {0}".format(content_url))
        return None
    except requests.HTTPError:
        print("Invalid HTTP response :: {0}".format(content_url))
        return None
    except requests.TooManyRedirects:
        print("Too many redirects :: {0}".format(content_url))
        return None
    # Expecting an xml response; json and html scraping options would be a todo.
    if 'application/xml' not in content.headers['content-type']:
        return None
    # Parse the raw bytes so an encoding declaration in the feed does not
    # trip up the parser.
    content_xml = etree.fromstring(content.content)
    # Structure of the response:
    #
    # <rss>
    #   <channel>           <---- content_xml[0]
    #     <item>            <---- top story
    #       <title>         <---- [0]
    #       <link>          <---- [1]
    #       <guid>
    #       <category>
    #       <pubDate>
    #       <description>   <---- [-1], carries the thumbnail <img>
    top_story_item = list(content_xml[0].find('item'))
    top_story_image = BeautifulSoup(top_story_item[-1].text).find('img')
    # Grab the first attribute of the img tag, then its value, e.g.
    # (u'src', u'//t2.gstatic.com/images?q=tbn:ANd9GcRbKgR....)
    top_story_image = top_story_image.attrs[0][1]
    # If the image url is protocol-relative (starts with //), prefix it.
    if top_story_image.startswith('//'):
        top_story_image = 'http:' + top_story_image
    # This could be an object, but there is no need until different methods
    # are required to scrape a myriad of sources.
    return {
        'title': top_story_item[0].text,
        'repeat_count': None,
        'image': top_story_image,
        'link': top_story_item[1].text,
        'time_scraped': datetime.datetime.now(),
    }
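
# Usage sketch (not from the original source): scrape() returns a dict for
# the current top story, or None when the fetch fails or the response is
# not xml.
def _demo_scrape():
    story = scrape()
    if story is None:
        print('no story scraped')
    else:
        print('%s -> %s' % (story['title'], story['link']))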
# Legacy Scrapy selector API (HtmlXPathSelector / .select); BeautifulSoup 3
# is used to strip markup from extracted fragments. KoubeiStoreItem is the
# project's scrapy Item; a field sketch follows the method below.
from scrapy.selector import HtmlXPathSelector
from BeautifulSoup import BeautifulSoup


def parse_store_detail(self, response):
    """Spider method: extract one store's details from its page.
    ``self.city_pattern`` is a compiled regex defined on the spider."""
    hxs = HtmlXPathSelector(response)
    item = KoubeiStoreItem()
    # Url
    item['link_url'] = response.url
    match = self.city_pattern.match(response.url)
    if match:
        item['city'] = match.group(1)
    # Bread crumb, joined with a right guillemet (»)
    crumb_elems = hxs.select(
        "//div[@class='crumb k2-fix-float']/*").extract()
    if crumb_elems:
        item['bread_crumb'] = u'\xbb'.join(
            [BeautifulSoup(c).text for c in crumb_elems])
    # Name
    name_elem = hxs.select(
        "//input[@id='store-full-name']/@value").extract()
    if name_elem:
        item['name'] = name_elem[0]
    # Address
    address_elem = hxs.select(
        "//input[@id='store-address']/@value").extract()
    if address_elem:
        item['address'] = address_elem[0]
    # Telephone
    tel_elem = hxs.select("//input[@id='store-tel']/@value").extract()
    if tel_elem:
        item['tel'] = tel_elem[0]
    # Average cost: the entry starting with 人均 ("per person"), split on
    # the full-width colon (U+FF1A).
    avg_elem = hxs.select(
        "//div[@class='store-info-card']//li/text()").extract()
    for text in avg_elem:
        if text.startswith(u'人均'):
            item['avg_cost'] = text.split(u'\uff1a')[1]
            break
    # Rating: first <b> holds the score, second the number of ratings.
    rating_elem = hxs.select(
        "//div[@class='store-free-title k2-fix-float']/p/b/text()").extract()
    if rating_elem:
        item['rating'] = rating_elem[0]
        if len(rating_elem) > 1:
            item['n_rating'] = int(rating_elem[1])
    # Detail: website url (网站地址) and store tags (店铺标签)
    detail_elem = hxs.select("//div[@class='detail-main']/ul/li").extract()
    for elem in detail_elem:
        text = BeautifulSoup(elem).find('label').text
        if text.startswith(u'网站地址'):
            item['url'] = text.split(u'\uff1a')[1].strip()
        if text.startswith(u'店铺标签'):
            item['tag_list'] = [
                a.text for a in BeautifulSoup(elem).findAll('a')]
    # Description
    desc_elem = hxs.select(
        "//div[@class='detail-intro']/div/text()").extract()
    if desc_elem:
        item['description'] = desc_elem[0].strip()
    # Promotions: (name, count) pairs; the count is rendered as "(N)".
    promote_elems = hxs.select("//div[@id='promote-more']//p").extract()
    promotes = []
    for elem in promote_elems:
        name = BeautifulSoup(elem).find('a').text.strip()
        count = int(BeautifulSoup(elem).find('span').text[1:-1])
        promotes.append((name, count))
    if promotes:
        item['promote_list'] = promotes
    # Impressions
    impress_elems = hxs.select(
        "//div[@id='impress-more']//span/text()").extract()
    if impress_elems:
        item['impress_list'] = [imp.strip() for imp in impress_elems]
    return item
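
# Sketch of the item class parse_store_detail populates; the original source
# does not include its definition, so the class below is an assumption, with
# field names taken from the keys assigned above (old scrapy.item API).
from scrapy.item import Item, Field


class KoubeiStoreItem(Item):
    link_url = Field()
    city = Field()
    bread_crumb = Field()
    name = Field()
    address = Field()
    tel = Field()
    avg_cost = Field()
    rating = Field()
    n_rating = Field()
    url = Field()
    tag_list = Field()
    description = Field()
    promote_list = Field()
    impress_list = Field()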