# Python 2-era imports assumed by this snippet (urllib.urlopen, cStringIO,
# BeautifulSoup 3); FileStorage is assumed to be werkzeug's.
import urllib
import cStringIO
from urlparse import urlparse, urlunsplit

from BeautifulSoup import BeautifulSoup
from PIL import Image
from werkzeug.datastructures import FileStorage


def get_ico_host(resp, name=None):
    """Fetch the favicon of the page behind ``resp`` and return it wrapped
    in a FileStorage, or None if no icon could be retrieved.

    ``resp`` is expected to expose ``.text``, ``.url`` and a ``.peer``
    (address, port) tuple.
    """
    # Look for an explicit icon <link>; sites vary in how they spell rel.
    data = False
    for rel in ('shortcut icon', 'icon', 'SHORTCUT ICON'):
        try:
            data = BeautifulSoup(resp.text).findAll('link', rel=rel)[0]['href']
            break
        except Exception:
            pass
    if not data:
        # No <link> found: fall back to the conventional /favicon.ico path.
        delimiter = '' if resp.url.endswith('/') else '/'
        url_ico = '%s%s%s' % (resp.url, delimiter, 'favicon.ico')
    elif data.startswith('//'):
        # Protocol-relative url: reuse the scheme of the original request.
        url_ico = '%s:%s' % (urlparse(resp.url).scheme, data)
    elif data.startswith('/') or '/' not in data:
        # Path relative to the site root.
        url_ico = urlunsplit((urlparse(resp.url).scheme,
                              urlparse(resp.url).hostname, data, None, None))
    else:
        url_ico = data
    try:
        response_data = urllib.urlopen(url_ico).read()
    except Exception:
        return None
    try:
        stream = cStringIO.StringIO(response_data)
    except Exception:
        return None
    try:
        img = Image.open(stream)  # only used to sniff the image format
    except Exception:
        return None
    if name is None:
        name = '%s%s' % (resp.url.replace('/', '').replace(':', '-'),
                         resp.peer[1])
    stream.seek(0)  # Image.open consumed the stream; rewind before wrapping it
    return FileStorage(stream=stream,
                       filename='%s.%s' % (name, img.format.lower()))
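
# Usage sketch (not from the original source). get_ico_host also reads
# resp.peer, which a plain requests Response does not carry, so a dummy
# (host, port) tuple is attached here purely for illustration.
def _demo_get_ico_host():
    import requests
    resp = requests.get('http://example.com/')
    resp.peer = ('0.0.0.0', 80)  # hypothetical peer tuple, not a requests field
    icon = get_ico_host(resp)
    if icon is not None:
        print('fetched icon: %s' % icon.filename)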
import datetime

import requests
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3: attrs is a list of tuples
from lxml import etree  # assumption: the original only references etree


def scrape():
    '''Retrieve the headline and link url of the current top story on
    Google News.'''
    # Using RSS so we're thriftier and don't need to parse html.
    content_url = 'http://news.google.com/news/url?output=rss'
    try:
        content = requests.get(content_url)
    except requests.ConnectionError:
        print("Error loading :: {0}".format(content_url))
        return None
    except requests.Timeout:
        print("Timed out :: {0}".format(content_url))
        return None
    except requests.HTTPError:
        print("Invalid HTTP response :: {0}".format(content_url))
        return None
    except requests.TooManyRedirects:
        print("Too many redirects :: {0}".format(content_url))
        return None
    # Expecting an xml response; json and html scraping options would be a todo.
    if 'application/xml' not in content.headers['content-type']:
        return None
    # Parse the raw bytes so an encoding declaration in the feed does not
    # trip up the parser.
    content_xml = etree.fromstring(content.content)
    # Structure of the response:
    #
    # <rss>
    #   <channel>           <---- content_xml[0]
    #     <item>            <---- top story
    #       <title>         <---- [0]
    #       <link>          <---- [1]
    #       <guid>
    #       <category>
    #       <pubDate>
    #       <description>   <---- [-1], carries the thumbnail <img>
    top_story_item = list(content_xml[0].find('item'))
    top_story_image = BeautifulSoup(top_story_item[-1].text).find('img')
    # Grab the first attribute of the img tag, then its value, e.g.
    # (u'src', u'//t2.gstatic.com/images?q=tbn:ANd9GcRbKgR....)
    top_story_image = top_story_image.attrs[0][1]
    # If the image url is protocol-relative (starts with //), prefix it.
    if top_story_image.startswith('//'):
        top_story_image = 'http:' + top_story_image
    # This could be an object, but there is no need until different methods
    # are required to scrape a myriad of sources.
    return {
        'title': top_story_item[0].text,
        'repeat_count': None,
        'image': top_story_image,
        'link': top_story_item[1].text,
        'time_scraped': datetime.datetime.now(),
    }
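
# Usage sketch (not from the original source): scrape() returns a dict for
# the current top story, or None when the fetch fails or the response is
# not xml.
def _demo_scrape():
    story = scrape()
    if story is None:
        print('no story scraped')
    else:
        print('%s -> %s' % (story['title'], story['link']))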
# Legacy Scrapy selector API (HtmlXPathSelector / .select); BeautifulSoup 3
# is used to strip markup from extracted fragments. KoubeiStoreItem is the
# project's scrapy Item; a field sketch follows the method below.
from scrapy.selector import HtmlXPathSelector
from BeautifulSoup import BeautifulSoup


def parse_store_detail(self, response):
    """Spider method: extract one store's details from its page.
    ``self.city_pattern`` is a compiled regex defined on the spider."""
    hxs = HtmlXPathSelector(response)
    item = KoubeiStoreItem()
    # Url
    item['link_url'] = response.url
    match = self.city_pattern.match(response.url)
    if match:
        item['city'] = match.group(1)
    # Bread crumb, joined with a right guillemet (»)
    crumb_elems = hxs.select(
        "//div[@class='crumb k2-fix-float']/*").extract()
    if crumb_elems:
        item['bread_crumb'] = u'\xbb'.join(
            [BeautifulSoup(c).text for c in crumb_elems])
    # Name
    name_elem = hxs.select(
        "//input[@id='store-full-name']/@value").extract()
    if name_elem:
        item['name'] = name_elem[0]
    # Address
    address_elem = hxs.select(
        "//input[@id='store-address']/@value").extract()
    if address_elem:
        item['address'] = address_elem[0]
    # Telephone
    tel_elem = hxs.select("//input[@id='store-tel']/@value").extract()
    if tel_elem:
        item['tel'] = tel_elem[0]
    # Average cost: the entry starting with 人均 ("per person"), split on
    # the full-width colon (U+FF1A).
    avg_elem = hxs.select(
        "//div[@class='store-info-card']//li/text()").extract()
    for text in avg_elem:
        if text.startswith(u'人均'):
            item['avg_cost'] = text.split(u'\uff1a')[1]
            break
    # Rating: first <b> holds the score, second the number of ratings.
    rating_elem = hxs.select(
        "//div[@class='store-free-title k2-fix-float']/p/b/text()").extract()
    if rating_elem:
        item['rating'] = rating_elem[0]
        if len(rating_elem) > 1:
            item['n_rating'] = int(rating_elem[1])
    # Detail: website url (网站地址) and store tags (店铺标签)
    detail_elem = hxs.select("//div[@class='detail-main']/ul/li").extract()
    for elem in detail_elem:
        text = BeautifulSoup(elem).find('label').text
        if text.startswith(u'网站地址'):
            item['url'] = text.split(u'\uff1a')[1].strip()
        if text.startswith(u'店铺标签'):
            item['tag_list'] = [
                a.text for a in BeautifulSoup(elem).findAll('a')]
    # Description
    desc_elem = hxs.select(
        "//div[@class='detail-intro']/div/text()").extract()
    if desc_elem:
        item['description'] = desc_elem[0].strip()
    # Promotions: (name, count) pairs; the count is rendered as "(N)".
    promote_elems = hxs.select("//div[@id='promote-more']//p").extract()
    promotes = []
    for elem in promote_elems:
        name = BeautifulSoup(elem).find('a').text.strip()
        count = int(BeautifulSoup(elem).find('span').text[1:-1])
        promotes.append((name, count))
    if promotes:
        item['promote_list'] = promotes
    # Impressions
    impress_elems = hxs.select(
        "//div[@id='impress-more']//span/text()").extract()
    if impress_elems:
        item['impress_list'] = [imp.strip() for imp in impress_elems]
    return item
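
# Sketch of the item class parse_store_detail populates; the original source
# does not include its definition, so the class below is an assumption, with
# field names taken from the keys assigned above (old scrapy.item API).
from scrapy.item import Item, Field


class KoubeiStoreItem(Item):
    link_url = Field()
    city = Field()
    bread_crumb = Field()
    name = Field()
    address = Field()
    tel = Field()
    avg_cost = Field()
    rating = Field()
    n_rating = Field()
    url = Field()
    tag_list = Field()
    description = Field()
    promote_list = Field()
    impress_list = Field()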