def request(self, url, indirect=False):
    """Fetch one page of search results and extend ``self.results``.

    :param url: page URL to fetch (Python 2 ``urllib2`` is used).
    :param indirect: when True, the fetched page is first scanned for an
        embedded ``thumbnailViewUrl`` and that secondary URL (suffixed
        with the current page number via ``QUERY_STRING``) is fetched
        instead of using the page directly.

    Side effects: resets paging state (``npage``/``stop``/``results``)
    unless ``self.more`` is set (presumably set elsewhere when a prior
    request expects further pages — TODO confirm), and sets
    ``self.stop`` when a short page signals the end of results.
    Network failures abort the program via ``die``.
    """
    if not self.more:
        # Fresh query: restart paging and allocate a new result set.
        self.npage = 1
        self.stop = False
        self.results = Results(self.video_per_page)
    try:
        soup = BeautifulSoup(urllib2.urlopen(url).read(),
                             convertEntities=BeautifulSoup.ALL_ENTITIES)
        if indirect:
            try:
                # Scan the raw markup for the embedded thumbnail-view URL.
                # 19 == len('thumbnailViewUrl: "'), so `start` lands just
                # past the opening quote; the closing quote ends the URL.
                soup = unicode(soup)
                start = soup.index('thumbnailViewUrl: "')+19
                url = DOMAIN + soup[start:soup.index('"', start)] + QUERY_STRING % (self.npage,)
                # Re-fetch using the resolved secondary URL.
                soup = BeautifulSoup(urllib2.urlopen(url).read(),
                                     convertEntities=BeautifulSoup.ALL_ENTITIES)
            except ValueError:
                # str.index raised: marker not found, the site layout
                # probably changed. Give up on this page quietly.
                print >> sys.stderr, 'Error: when parsing the page'
                return
        videos = extract_videos(soup, self.options)
        if len(videos) < VIDEO_PER_PAGE:
            # A short page means this was the last page of results.
            self.stop = True
        self.results.extend(videos)
    except urllib2.URLError:
        # Network/DNS/HTTP failure: die() terminates the program.
        die("Can't complete request")
def _addReviews(self):
    """Parse shoe reviews out of ``self.soup`` and append them to ``self.pages``.

    Each review cell (a ``<td>`` containing a verdana-font ``<font>`` tag)
    is split on its ``<i>YYYY-MM-DD HH:MM:SS</i>`` timestamp; the text
    between timestamps is re-parsed per review to extract author name,
    location, star ratings (overall / comfort / look), feature fields and
    the review body.  Only reviews that ``updateSessionInfo`` reports as
    new/updated are appended.  Returns True on success; re-raises any
    unexpected exception after logging it.

    NOTE(review): nearly every field is wrapped in a bare ``except`` so a
    single malformed review degrades to empty fields rather than aborting
    the whole page — intentional best-effort scraping, it seems.
    """
    try:
        # Timestamp marker that separates consecutive reviews in the markup.
        pat=re.compile('<i>\d+-\d+-\d+ \d+:\d+:\d+</i>')
        review_soup=[each for each in self.soup.findAll('td') if each.find('font', {'face':'verdana, geneva, arial'})]
        date=re.findall(pat, str(review_soup))
        # Dummy 0th entry so `date[i]` lines up with `review_info[i]`
        # below (review_info[0] is the pre-first-timestamp text and is
        # never processed).
        date.insert(0,0)
        review_info=re.split(pat, str(review_soup))
        # Only process the page if the session store has not seen this
        # URI yet (or an update pass is requested).
        if not checkSessionInfo(self.genre, self.session_info_out, self.currenturi, self.task.instance_data.get('update'), parent_list=[self.parent_url]):
            for i in xrange(1, len(review_info)):
                page={}
                # Re-parse just this review's HTML fragment.
                review=BeautifulSoup(review_info[i])
                try:
                    if review.find(text=re.compile('Reviewer:' , re.DOTALL)):
                        try:
                            # Author name is the tag following 'Reviewer:'.
                            page['et_author_name']=review.find(text=re.compile('Reviewer:' , re.DOTALL)).findNext().renderContents().strip()
                        except:
                            page['et_author_name']=''
                            log.exception(self.log_msg("exception in fetching the author name"))
                except:
                    log.exception(self.log_msg("exception in fetching the author name"))
                try:
                    # Text right after the author tag, e.g. ' from <place>'.
                    loc= review.find(text=re.compile('Reviewer:')).findNext().next.next
                    if loc.__contains__('from'):
                        # NOTE(review): strip('from') removes the chars
                        # f/r/o/m from both ends, not the word 'from' —
                        # works only because locations rarely start/end
                        # with those letters. TODO confirm intent.
                        page['et_author_location']=review.find(text=re.compile('Reviewer:')).\
                            findNext().next.next.strip().strip('from')
                    else:
                        page['et_author_location']=''
                except:
                    page['et_author_location']=''
                    log.exception(self.log_msg("exception in fetching the author Location"))
                try:
                    # Star ratings live in the 'alt' attribute of the image
                    # following each label, e.g. alt="4 stars" (presumably —
                    # strip('stars') trims those chars from both ends).
                    page['ef_overall_rating']=float(review.find(text=re.compile('Overall:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the overall_ratings"))
                try:
                    page['ef_comfort_rating']= float(review.find(text=re.compile('Comfort:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the Comfort_ratings"))
                try:
                    page['ef_look_rating']= float(review.find(text=re.compile('Look:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the Look_ratings"))
                try:
                    # Feature name/value pairs (shoe arch, width, size, ...)
                    # are rendered as colored <font> labels; the last one is
                    # skipped (presumably not a feature — TODO confirm).
                    feature=review.findAll('font', color="#333399" )
                    for each in feature[:-1]:
                        page['et_feature_'+each.renderContents()]=each.next.next
                except:
                    log.exception(self.log_msg("exception in fetching the et_features like"\
                        "shoe arch, shoe width etc.. "))
                try:
                    # The review body follows the last feature label present;
                    # probe the labels in their usual document order.
                    if review.find(text=re.compile('Shoe Arch:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Arch:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Shoe Width:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Width:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Shoe Size:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Size:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Look:')):
                        page['data']=stripHtml(review.find(text=re.compile('Look:')).findNext().next.next.next).strip()
                except:
                    page['data']=''
                    log.exception(self.log_msg("exception in fetching Data"))
                try:
                    # Title is everything before the first line break.
                    # NOTE: rebinds `review` to its string form from here on.
                    review=str(review)
                    page['title']= review[0:review.index('<br />')]
                except:
                    page['title']=''
                    log.exception(self.log_msg("exception in fetching Title"))
                try:
                    review_hash = get_hash(page)
                except:
                    # NOTE(review): if get_hash fails on the FIRST review,
                    # review_hash is unbound and updateSessionInfo below
                    # raises NameError (caught by the outer handler);
                    # on later reviews the previous hash leaks through.
                    log.exception(self.log_msg('could not generate review_hash for '+ self.currenturi))
                try:
                    # Re-format the captured '<i>...</i>' timestamp to ISO-8601.
                    # strip('</?i>') trims the surrounding tag characters.
                    page['posted_date']= datetime.strftime(datetime.strptime(date[i].\
                        strip('</?i>'),'%Y-%m-%d %H:%M:%S'),'%Y-%m-%dT%H:%M:%SZ')
                except:
                    log.exception(self.log_msg("exception in fetching Posted_date"))
                    # Fall back to "now" so the record still carries a date.
                    page['posted_date']=datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                result=updateSessionInfo(self.genre, self.session_info_out, review_hash, \
                    review_hash,'Review', self.task.instance_data.get('update'),\
                    parent_list=[self.parent_url])
                if result['updated']:
                    # New/changed review: attach task bookkeeping metadata
                    # and queue the record.
                    page['uri'] = self.currenturi
                    page['id'] = result['id']
                    page['priority']=self.task.priority
                    page['level']=self.task.level
                    page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                    page['connector_instance_log_id'] = self.task.connector_instance_log_id
                    page['connector_instance_id'] = self.task.connector_instance_id
                    page['workspace_id'] = self.task.workspace_id
                    page['client_id'] = self.task.client_id
                    page['client_name'] = self.task.client_name
                    page['last_updated_time'] = page['pickup_date']
                    page['versioned'] = False
                    page['first_version_id']=result['first_version_id']
                    page['entity'] = 'Review'
                    page['category'] = self.task.instance_data.get('category','')
                    # Parent id = session id minus its last '-'-separated part.
                    page['parent_id']= '-'.join(result['id'].split('-')[:-1])
                    page['task_log_id']=self.task.id
                    page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                    self.pages.append(page)
                    self.count=self.count+1
                    log.info(self.log_msg('Adding %dth review of %s ' % (self.count, self.currenturi)))
        return True
    except Exception, e:
        # Unexpected failure: log with context, then propagate to caller.
        log.exception(self.log_msg('Exception in addReviews'))
        raise e