Example #1
0
def get_ico_host(resp, name=None):
    """Locate a site's favicon from an HTTP response and wrap it as a FileStorage.

    Probes the page's ``<link rel="...">`` icon declarations first, then falls
    back to the conventional ``/favicon.ico`` relative to the response URL.

    Args:
        resp: HTTP response object exposing ``.text``, ``.url`` and ``.peer``
            (presumably a requests-like response — TODO confirm against caller).
        name: optional base filename; derived from the URL and peer port when None.

    Returns:
        werkzeug ``FileStorage`` wrapping the icon image, or None when the icon
        cannot be fetched or decoded (best-effort, all failures are silent).
    """
    data = False

    # Parse the document once and probe the common rel spellings in order.
    soup = BeautifulSoup(resp.text)
    for rel in ('shortcut icon', 'icon', 'SHORTCUT ICON'):
        try:
            data = soup.findAll('link', rel=rel)[0]['href']
        except Exception:  # no matching <link>, or it lacks an href
            data = False
        if data:
            break

    if not data:
        # No declared icon: fall back to /favicon.ico next to the page URL.
        delimiter = '' if resp.url.endswith('/') else '/'
        url_ico = '%s%s%s' % (resp.url, delimiter, 'favicon.ico')
    else:
        if data.startswith('//'):
            # Protocol-relative URL: prepend the response's scheme.
            url_ico = '%s:%s' % (urlparse(resp.url).scheme, data)
        elif data.startswith('/') or '/' not in data:
            # Host-relative path (or bare filename): rebuild an absolute URL.
            url_ico = urlunsplit((urlparse(resp.url).scheme,
                                  urlparse(resp.url).hostname, data, None, None))
        else:
            url_ico = data

    try:
        response_data = urllib.urlopen(url_ico).read()
    except Exception:  # network failure: best-effort, give up silently
        return

    try:
        file = cStringIO.StringIO(response_data)
    except Exception:
        return

    try:
        img = Image.open(file)  # validates that the payload is a real image
    except Exception:
        return

    if name is None:
        name = '%s%s' % (resp.url.replace('/', '').replace(':', '-'), resp.peer[1])

    f = FileStorage(stream=file,
                    filename='%s.%s' % (name, img.format.lower()))

    return f
Example #2
0
def scrape():
    """Retrieve the headline, link and image of the current Google News top story.

    Returns:
        dict with keys 'title', 'repeat_count', 'image', 'link' and
        'time_scraped', or None when the feed cannot be fetched or is not XML.
    """
    # Using RSS so we're thriftier and don't need to parse HTML.
    content_url = 'http://news.google.com/news/url?output=rss'

    # Bail out after logging: the original fell through with `content`
    # unbound, which raised NameError on the header check below.
    try:
        content = requests.get(content_url)
    except requests.ConnectionError:
        print("Error loading :: {0}".format(content_url))
        return None
    except requests.Timeout:
        print("Timed out :: {0}".format(content_url))
        return None
    except requests.HTTPError:
        print("Invalid HTTP response :: {0}".format(content_url))
        return None
    except requests.TooManyRedirects:
        print("Too many redirects :: {0}".format(content_url))
        return None

    # Expecting an XML response. json and html scraping options would be a todo.
    if 'application/xml' not in content.headers['content-type']:
        return None

    content_xml = etree.fromstring(content.text)

    # Structure of the response:
    # <rss>
    #   <channel>        <---- element[0]
    #     <item>         <---- top story
    #       <title>      <---- [0]
    #       <link>       <---- [1]
    #       <guid> <category> <pubDate> <description>
    top_story_item = content_xml[0].find('item')
    top_story_item = top_story_item.getchildren()
    top_story_image = BeautifulSoup(top_story_item[-1].text).find('img')
    # Grab the first attribute pair of the <img> tag, then its value, e.g.
    # (u'src', u'//t2.gstatic.com/images?q=tbn:ANd9GcRbKgR....)
    top_story_image = top_story_image.attrs[0][1]

    # Protocol-relative image URLs need an explicit scheme.
    if top_story_image.startswith('//'):
        top_story_image = 'http:' + top_story_image

    # This could be an object, but I don't see the need until different
    # methods are needed to scrape a myriad of sources.
    top_story_dict = {
        'title': top_story_item[0].text,
        'repeat_count': None,
        'image': top_story_image,
        'link': top_story_item[1].text,
        'time_scraped': datetime.datetime.now(),
    }

    return top_story_dict
Example #3
0
    def parse_store_detail(self, response):
        """Parse a Koubei store detail page into a KoubeiStoreItem.

        Args:
            response: scrapy Response for a store detail URL.

        Returns:
            A populated KoubeiStoreItem; fields are set only when the
            corresponding element is found on the page.
        """
        hxs = HtmlXPathSelector(response)

        item = KoubeiStoreItem()
        # URL and city (the city slug is embedded in the URL path).
        item['link_url'] = response.url
        match = self.city_pattern.match(response.url)
        if match:
            item['city'] = match.group(1)

        # Bread crumb, joined with a right guillemet (u'\xbb').
        crumb_elems = hxs.select("//div[@class='crumb k2-fix-float']/*").extract()
        if crumb_elems:
            item['bread_crumb'] = u'\xbb'.join([BeautifulSoup(c).text for c in crumb_elems])

        # Name
        name_elem = hxs.select("//input[@id='store-full-name']/@value").extract()
        if name_elem:
            item['name'] = name_elem[0]

        # Address
        address_elem = hxs.select("//input[@id='store-address']/@value").extract()
        if address_elem:
            item['address'] = address_elem[0]

        # Telephone
        tel_elem = hxs.select("//input[@id='store-tel']/@value").extract()
        if tel_elem:
            item['tel'] = tel_elem[0]

        # Average cost: the list entry starting with u'人均' ("per person");
        # the value follows the full-width colon (u'\uff1a').
        avg_elem = hxs.select("//div[@class='store-info-card']//li/text()").extract()
        for text in avg_elem:
            if text.startswith(u'人均'):
                item['avg_cost'] = text.split(u'\uff1a')[1]
                break

        # Rating: element [0] is the score, [1] the rating count.  Guard the
        # count lookup -- the original indexed [1] unconditionally, which
        # raises IndexError when only a single <b> matches.
        rating_elem = hxs.select("//div[@class='store-free-title k2-fix-float']/p/b/text()").extract()
        if rating_elem:
            item['rating'] = rating_elem[0]
            if len(rating_elem) > 1:
                item['n_rating'] = int(rating_elem[1])

        # Detail rows: website address (u'网站地址') and shop tags (u'店铺标签').
        detail_elem = hxs.select("//div[@class='detail-main']/ul/li").extract()
        for elem in detail_elem:
            soup = BeautifulSoup(elem)  # parse once, reuse for label and links
            text = soup.find('label').text
            if text.startswith(u'网站地址'):
                item['url'] = text.split(u'\uff1a')[1].strip()
            if text.startswith(u'店铺标签'):
                item['tag_list'] = [a.text for a in soup.findAll('a')]

        # Description
        desc_elem = hxs.select("//div[@class='detail-intro']/div/text()").extract()
        if desc_elem:
            item['description'] = desc_elem[0].strip()

        # Promotes: (name, count) pairs; the count text is wrapped in one
        # leading and one trailing character, stripped by the [1:-1] slice.
        promote_elems = hxs.select("//div[@id='promote-more']//p").extract()
        promotes = []
        for elem in promote_elems:
            soup = BeautifulSoup(elem)  # parse once per <p>
            promotes.append((soup.find('a').text.strip(),
                             int(soup.find('span').text[1:-1])))
        if promotes:
            item['promote_list'] = promotes

        # Impressions
        impress_elems = hxs.select("//div[@id='impress-more']//span/text()").extract()
        if impress_elems:
            item['impress_list'] = [imp.strip() for imp in impress_elems]

        return item
Example #4
0
    def parse_store_detail(self, response):
        """Parse a Koubei store detail page into a KoubeiStoreItem.

        Args:
            response: scrapy Response for a store detail URL.

        Returns:
            A populated KoubeiStoreItem; each field is set only when its
            source element exists on the page.
        """
        hxs = HtmlXPathSelector(response)

        item = KoubeiStoreItem()
        # URL and city (the city slug is embedded in the URL path).
        item['link_url'] = response.url
        match = self.city_pattern.match(response.url)
        if match:
            item['city'] = match.group(1)

        # Bread crumb, joined with a right guillemet (u'\xbb').
        crumb_elems = hxs.select(
            "//div[@class='crumb k2-fix-float']/*").extract()
        if crumb_elems:
            item['bread_crumb'] = u'\xbb'.join(
                [BeautifulSoup(c).text for c in crumb_elems])

        # Name
        name_elem = hxs.select(
            "//input[@id='store-full-name']/@value").extract()
        if name_elem:
            item['name'] = name_elem[0]

        # Address
        address_elem = hxs.select(
            "//input[@id='store-address']/@value").extract()
        if address_elem:
            item['address'] = address_elem[0]

        # Telephone
        tel_elem = hxs.select("//input[@id='store-tel']/@value").extract()
        if tel_elem:
            item['tel'] = tel_elem[0]

        # Average cost: the list entry starting with u'人均' ("per person");
        # the value follows the full-width colon (u'\uff1a').
        avg_elem = hxs.select(
            "//div[@class='store-info-card']//li/text()").extract()
        for text in avg_elem:
            if text.startswith(u'人均'):
                item['avg_cost'] = text.split(u'\uff1a')[1]
                break

        # Rating: element [0] is the score, [1] the rating count.  Only read
        # the count when it is present -- unconditional [1] indexing raised
        # IndexError whenever a single <b> matched.
        rating_elem = hxs.select(
            "//div[@class='store-free-title k2-fix-float']/p/b/text()"
        ).extract()
        if rating_elem:
            item['rating'] = rating_elem[0]
            if len(rating_elem) > 1:
                item['n_rating'] = int(rating_elem[1])

        # Detail rows: website address (u'网站地址') and shop tags (u'店铺标签').
        detail_elem = hxs.select("//div[@class='detail-main']/ul/li").extract()
        for elem in detail_elem:
            soup = BeautifulSoup(elem)  # parse once, reuse for label and links
            text = soup.find('label').text
            if text.startswith(u'网站地址'):
                item['url'] = text.split(u'\uff1a')[1].strip()
            if text.startswith(u'店铺标签'):
                item['tag_list'] = [a.text for a in soup.findAll('a')]

        # Description
        desc_elem = hxs.select(
            "//div[@class='detail-intro']/div/text()").extract()
        if desc_elem:
            item['description'] = desc_elem[0].strip()

        # Promotes: (name, count) pairs; the count text is wrapped in one
        # leading and one trailing character, stripped by the [1:-1] slice.
        promote_elems = hxs.select("//div[@id='promote-more']//p").extract()
        promotes = []
        for elem in promote_elems:
            soup = BeautifulSoup(elem)  # parse once per <p>
            promotes.append((soup.find('a').text.strip(),
                             int(soup.find('span').text[1:-1])))
        if promotes:
            item['promote_list'] = promotes

        # Impressions
        impress_elems = hxs.select(
            "//div[@id='impress-more']//span/text()").extract()
        if impress_elems:
            item['impress_list'] = [imp.strip() for imp in impress_elems]

        return item