Python strip_accents Examples (utils.funcs.strip_accents)

Example #1

0

Show file

File: dailymotion_crawler.py Project: rauljuarez/vxvCrawler

 def _get_video_details(self,html_data):
     """Scrape a Dailymotion video page into a DailyMotionVideo.

     html_data is joined into a single markup string before parsing.
     Elements absent from the page are tolerated: a missing title or tag
     container yields an empty string, a missing description or category
     yields None.

     Returns the populated DailyMotionVideo.
     """
     soup = BeautifulSoup(''.join(html_data), convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

     def node_text(node):
         # Read .string only when the element was actually found.
         return node.string if node is not None else None

     # Fall back to '' both when the <h1> is missing and when it has no plain string.
     title = node_text(soup.find('h1', {'class': 'dmco_title'})) or ''
     description = node_text(soup.find('div', id='video_description'))
     category = node_text(soup.find('a', {'class': re.compile('fromchannel_link')}))

     tags_container = soup.find('div', {'class': re.compile('tags_cont')})
     tag_links = tags_container.findAll('a') if tags_container is not None else []
     # Skip links without a plain string; a None entry would make join() raise TypeError.
     tags = ','.join([link.string for link in tag_links if link.string is not None])

     video = DailyMotionVideo()
     video.title = strip_accents(title)
     # NOTE(review): strip_accents is assumed not to accept None (the description
     # was already guarded this way), so category gets the same guard -- confirm.
     video.description = strip_accents(description) if description is not None else None
     video.category = strip_accents(category) if category is not None else None
     video.tags = strip_accents(tags)

     return video

Example #2

0

Show file

File: metacafe_crawler.py Project: rauljuarez/vxvCrawler

 def _get_metacafe_videos_from_content(self,xml_data,count):
     """Parse up to `count` <item> elements of an XML feed into MetacafeVideo objects.

     Characters with a code point of 128 or above are dropped from xml_data
     before it is parsed. An item that fails to parse is logged and skipped,
     but still counts towards `count`.

     Returns the list of successfully parsed videos.
     """
     # Keep only ASCII characters before handing the text to the XML parser.
     sanitized_xml_data=''.join([c for c in xml_data if ord(c)<128])

     tree=ElementTree.fromstring(sanitized_xml_data)
     videos = []
     # NOTE(review): getiterator() is deprecated in newer ElementTree releases
     # in favour of iter(); kept as-is to match the ElementTree this file imports.
     for index, elem in enumerate(tree.getiterator('item')):
         if index >= count:
             break
         try:
             video=MetacafeVideo()
             video.title=strip_accents(elem.find('title').text)
             video.description=strip_accents(elem.find('description').text)
             video.url=strip_accents(elem.find('link').text)
             # When the helper reports another source, use the url and source id it returns.
             other_source,source_id,new_url=self._verify_source_of_video(video.url)
             if other_source:
                 video.url=new_url
                 video.source=source_id

             video.category=strip_accents(elem.find('category').text)
             video.tags=strip_accents(elem.find('{http://search.yahoo.com/mrss/}keywords').text)
             videos.append(video)
             self._logger.info('Parsed metacafe video at url: %s',video.url)
         except Exception:
             # Best effort: one malformed item must not abort the whole feed, but
             # KeyboardInterrupt/SystemExit are allowed to propagate.
             self._logger.exception('An error occurred while parsing a video ... Moving on to the next video...')
     return videos

Example #3

0

Show file

File: vimeo_crawler.py Project: rauljuarez/vxvCrawler

 def _get_vimeo_videos_from_content(self,xml_data,count):
     """Parse up to `count` <video> elements of an XML document into VimeoVideo objects.

     Characters with a code point of 128 or above are dropped from xml_data
     before it is parsed. A video that fails to parse is logged and skipped,
     but still counts towards `count`.

     Returns the list of successfully parsed videos.
     """
     # Keep only ASCII characters before handing the text to the XML parser.
     sanitized_xml_data=''.join([c for c in xml_data if ord(c)<128])

     tree=ElementTree.fromstring(sanitized_xml_data)
     videos = []
     # NOTE(review): getiterator() is deprecated in newer ElementTree releases
     # in favour of iter(); kept as-is to match the ElementTree this file imports.
     for index, elem in enumerate(tree.getiterator('video')):
         if index >= count:
             break
         try:
             video=VimeoVideo()
             video.title=strip_accents(elem.find('title').text)
             video.description=strip_accents(elem.find('caption').text)

             # NOTE(review): assumes VimeoVideo() starts with list-like
             # `urls` and `tags` attributes -- confirm against the class.
             urls=elem.find('urls')
             if urls is not None:
                 for url in urls.findall('url'):
                     video.urls.append(strip_accents(url.text))

             tags=elem.find('tags')
             if tags is not None:
                 for tag in tags.findall('tag'):
                     video.tags.append(strip_accents(tag.text))

             videos.append(video)
             self._logger.info('Parsed vimeo video at url: %s',video.urls)
         except Exception:
             # Best effort: one malformed entry must not abort the whole document, but
             # KeyboardInterrupt/SystemExit are allowed to propagate.
             self._logger.exception('An error occurred while parsing a video ... Moving on to the next video...')
     return videos

Example #4

0

Show file

File: megavideo_crawler.py Project: rauljuarez/vxvCrawler

 def _get_video_details(self,html_data):
     """Build a MegaVideoVideo from the ``flashvars`` script of a page.

     html_data is joined into one markup string and parsed. Title,
     description, tags and category are read from the ``flashvars.*``
     assignments found in the matching script text; for each, the first
     match has ``+`` replaced by a space, is URL-unquoted and is passed
     through strip_accents.

     Raises AttributeError when no ``flashvars`` script is found and
     IndexError when one of the four assignments has no match.
     """
     markup = ''.join(html_data)
     page = BeautifulSoup(markup, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
     flashvars_script = page.find('script', text=re.compile('flashvars'))

     def first_clean(matches):
         # Only the first match is used; '+' becomes a space before unquoting.
         raw = matches[0].replace('+', ' ')
         return strip_accents(urllib.unquote(raw))

     # Run all four searches before building the video object.
     title_hits = re.findall('flashvars.title = "(.+?)";', flashvars_script.string)
     description_hits = re.findall('flashvars.description = "(.+?)";', flashvars_script.string)
     tag_hits = re.findall('flashvars.tags = "(.+?)";', flashvars_script.string)
     category_hits = re.findall('flashvars.category = "(.+?)";', flashvars_script.string)

     result = MegaVideoVideo()
     result.title = first_clean(title_hits)
     result.description = first_clean(description_hits)
     result.category = first_clean(category_hits)
     result.tags = first_clean(tag_hits)
     return result

Example #5

0

Show file

File: youtube_crawler.py Project: rauljuarez/vxvCrawler

 def _parse_entry(self,entry):
     """Build the metadata dict for a single search result entry.

     The entry's player url is handed to self._parse_page, which supplies the
     lq/hq/hd urls. Title, description, first category and keywords are
     passed through strip_accents. The result also carries the page url,
     self.search_id under 'search-id' and the constant source '1'.

     Returns the metadata dict.
     """
     media = entry.media
     page_url = media.player.url
     lq_url, hq_url, hd_url = self._parse_page(page_url)

     # Clean the text fields in the same order the dict lists them.
     title = strip_accents(media.title.text)
     description = strip_accents(media.description.text)
     category = strip_accents(media.category[0].text)
     tags = strip_accents(media.keywords.text)

     item_meta = {
         'title': title,
         'description': description,
         'category': category,
         'tags': tags,
         'page_url': page_url,
         'lq_url': lq_url,
         'hq_url': hq_url,
         'hd_url': hd_url,
         'search-id': self.search_id,
         'source': '1',
     }
     self._logger.info('Parsed youtube video at url: %s', page_url)
     self._logger.debug('Video Metadata: %s', item_meta)
     return item_meta