Example #1
 def request(self, url, indirect=False):
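     # Not a continuation of a previous query: reset the page counter, stop flag and result buffer.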
     if not self.more:
         self.npage = 1
         self.stop = False
         self.results = Results(self.video_per_page)
     try:
         soup = BeautifulSoup(urllib2.urlopen(url).read(), convertEntities=BeautifulSoup.ALL_ENTITIES)
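         # An indirect page embeds the real listing URL in a 'thumbnailViewUrl' value; extract it and fetch that page instead.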
         if indirect:
             try:
                 soup = unicode(soup)
                 start = soup.index('thumbnailViewUrl: "') + len('thumbnailViewUrl: "')
                 url = DOMAIN + soup[start:soup.index('"', start)] + QUERY_STRING % (self.npage,)
                 soup = BeautifulSoup(urllib2.urlopen(url).read(), convertEntities=BeautifulSoup.ALL_ENTITIES)
             except ValueError:
                 print >> sys.stderr, 'Error: failed to parse the page'
                 return
         videos = extract_videos(soup, self.options)
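         # A page with fewer videos than a full page signals the end of the results.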
         if len(videos) < VIDEO_PER_PAGE:
             self.stop = True
         self.results.extend(videos)
     except urllib2.URLError:
         die("Can't complete request")
Example #2
 def _addReviews(self):
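     # Parse the individual reviews out of self.soup and append one 'page' dict per review to self.pages.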
     try:
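         # Review timestamps are rendered as <i>YYYY-MM-DD HH:MM:SS</i>; they double as delimiters between reviews.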
         pat=re.compile(r'<i>\d+-\d+-\d+ \d+:\d+:\d+</i>')
         review_soup=[each for each in self.soup.findAll('td') if each.find('font', {'face':'verdana, geneva, arial'})]
         date=re.findall(pat, str(review_soup))
         date.insert(0, 0)  # dummy entry so date[i] lines up with review_info[i]; review_info[0] is never used
         review_info=re.split(pat, str(review_soup))
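         # Only scrape if this page was not already handled in an earlier session.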
         if not checkSessionInfo(self.genre, self.session_info_out,
                                 self.currenturi,  self.task.instance_data.get('update'),
                                 parent_list=[self.parent_url]):
             for i in xrange(1, len(review_info)):
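                 # Collect the fields of each review into a single dict.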
                 page={}
                 review=BeautifulSoup(review_info[i])
                 try:
                     if review.find(text=re.compile('Reviewer:' , re.DOTALL)):
                         try:
                             page['et_author_name']=review.find(text=re.compile('Reviewer:' , re.DOTALL)).findNext().renderContents().strip()
                         except:
                             page['et_author_name']=''
                             log.exception(self.log_msg("exception in fetching the author name"))
                 except:
                     log.exception(self.log_msg("exception in fetching the author name"))
                 try:
                     loc = review.find(text=re.compile('Reviewer:')).findNext().next.next
                     if 'from' in loc:
                         # remove the leading word 'from'; str.strip('from') would strip
                         # individual characters rather than the word itself
                         page['et_author_location'] = loc.strip().replace('from', '', 1).strip()
                     else:
                         page['et_author_location']=''
                 except:
                     page['et_author_location']=''
                     log.exception(self.log_msg("exception in fetching the author Location"))
                 try:
                     page['ef_overall_rating']=float(review.find(text=re.compile('Overall:')).\
                                                              findNext()['alt'].strip('stars'))
                 except:
                     log.exception(self.log_msg("exception in fetching the overall_ratings"))
                 try:
                     page['ef_comfort_rating']= float(review.find(text=re.compile('Comfort:')).\
                                                          findNext()['alt'].strip('stars'))
                 except:
                     log.exception(self.log_msg("exception in fetching the Comfort_ratings"))
                 try:
                     page['ef_look_rating']= float(review.find(text=re.compile('Look:')).\
                                                       findNext()['alt'].strip('stars'))
                 except:
                     log.exception(self.log_msg("exception in fetching the Look_ratings"))
                 try:
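                     # Feature labels (shoe arch, shoe width, ...) appear in blue (#333399); the value follows each label.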
                     feature=review.findAll('font', color="#333399")
                     for each in feature[:-1]:
                         page['et_feature_'+each.renderContents()]=each.next.next
                 except:
                     log.exception(self.log_msg("exception in fetching the et_features like"\
                                                    "shoe arch, shoe width etc.. "))
                 try:
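                     # The free-text review body follows the last structured field that is present.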
                     if review.find(text=re.compile('Shoe Arch:')):
                         page['data']=stripHtml(review.find(text=re.compile('Shoe Arch:')).\
                                                    findNext().next.next).strip()
                     elif review.find(text=re.compile('Shoe Width:')):
                         page['data']=stripHtml(review.find(text=re.compile('Shoe Width:')).\
                                                     findNext().next.next).strip()
                     elif review.find(text=re.compile('Shoe Size:')):
                         page['data']=stripHtml(review.find(text=re.compile('Shoe Size:')).\
                                                    findNext().next.next).strip()
                     elif review.find(text=re.compile('Look:')):
                         page['data']=stripHtml(review.find(text=re.compile('Look:')).findNext().next.next.next).strip()
                 except:
                     page['data']=''
                     log.exception(self.log_msg("exception in fetching Data"))
                 try:
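                     # Everything before the first <br /> is the review title.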
                     review=str(review)
                     page['title']= review[0:review.index('<br />')]
                 except:
                     page['title']=''
                     log.exception(self.log_msg("exception in fetching Title"))
                 try:
                     review_hash = get_hash(page)
                 except:
                     log.exception(self.log_msg('could not generate review_hash for ' +
                                                self.currenturi))
                     continue  # without a hash this review cannot be checked against the session store
                 try:
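                     # Normalise the scraped timestamp to ISO-8601 (UTC); fall back to the current time below.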
                     page['posted_date']= datetime.strftime(datetime.strptime(date[i].\
                            strip('</?i>'),'%Y-%m-%d %H:%M:%S'),'%Y-%m-%dT%H:%M:%SZ')
                 except:
                     log.exception(self.log_msg("exception in fetching Posted_date"))
                     page['posted_date']=datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
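                 # Record the review hash in the session store; result['updated'] reports whether the review is new or changed.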
                 result=updateSessionInfo(self.genre, self.session_info_out, review_hash, \
                                              review_hash,'Review', self.task.instance_data.get('update'),\
                                               parent_list=[self.parent_url])
                 if result['updated']:
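                     # Attach the task and workspace metadata expected downstream.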
                     page['uri'] = self.currenturi
                     page['id'] = result['id']
                     page['priority']=self.task.priority
                     page['level']=self.task.level
                     page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                     page['connector_instance_log_id'] = self.task.connector_instance_log_id
                     page['connector_instance_id'] = self.task.connector_instance_id
                     page['workspace_id'] = self.task.workspace_id
                     page['client_id'] = self.task.client_id
                     page['client_name'] = self.task.client_name
                     page['last_updated_time'] = page['pickup_date']
                     page['versioned'] = False
                     page['first_version_id']=result['first_version_id']
                     page['entity'] = 'Review'
                     page['category'] = self.task.instance_data.get('category','')
                     page['parent_id']= '-'.join(result['id'].split('-')[:-1])
                     page['task_log_id']=self.task.id
                     page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                     self.pages.append(page)
                     self.count += 1
                     log.info(self.log_msg('Adding %dth review of %s' % (self.count, self.currenturi)))
         return True
     except Exception:
         log.exception(self.log_msg('Exception in addReviews'))
         raise  # re-raise with the original traceback intact