def request(self, url, indirect=False):
    """Fetch one page of search results and extend ``self.results``.

    :param url: page URL to fetch (Python 2 ``urllib2`` is used).
    :param indirect: when True, the fetched page is first scanned for an
        embedded ``thumbnailViewUrl`` and that secondary URL (suffixed
        with the current page number via ``QUERY_STRING``) is fetched
        instead of using the page directly.

    Side effects: resets paging state (``npage``/``stop``/``results``)
    unless ``self.more`` is set (presumably set elsewhere when a prior
    request expects further pages — TODO confirm), and sets
    ``self.stop`` when a short page signals the end of results.
    Network failures abort the program via ``die``.
    """
    if not self.more:
        # Fresh query: restart paging and allocate a new result set.
        self.npage = 1
        self.stop = False
        self.results = Results(self.video_per_page)
    try:
        soup = BeautifulSoup(urllib2.urlopen(url).read(),
                             convertEntities=BeautifulSoup.ALL_ENTITIES)
        if indirect:
            try:
                # Scan the raw markup for the embedded thumbnail-view URL.
                # 19 == len('thumbnailViewUrl: "'), so `start` lands just
                # past the opening quote; the closing quote ends the URL.
                soup = unicode(soup)
                start = soup.index('thumbnailViewUrl: "')+19
                url = DOMAIN + soup[start:soup.index('"', start)] + QUERY_STRING % (self.npage,)
                # Re-fetch using the resolved secondary URL.
                soup = BeautifulSoup(urllib2.urlopen(url).read(),
                                     convertEntities=BeautifulSoup.ALL_ENTITIES)
            except ValueError:
                # str.index raised: marker not found, the site layout
                # probably changed. Give up on this page quietly.
                print >> sys.stderr, 'Error: when parsing the page'
                return
        videos = extract_videos(soup, self.options)
        if len(videos) < VIDEO_PER_PAGE:
            # A short page means this was the last page of results.
            self.stop = True
        self.results.extend(videos)
    except urllib2.URLError:
        # Network/DNS/HTTP failure: die() terminates the program.
        die("Can't complete request")
def _addReviews(self):
    """Parse shoe reviews out of ``self.soup`` and append them to ``self.pages``.

    Each review cell (a ``<td>`` containing a verdana-font ``<font>`` tag)
    is split on its ``<i>YYYY-MM-DD HH:MM:SS</i>`` timestamp; the text
    between timestamps is re-parsed per review to extract author name,
    location, star ratings (overall / comfort / look), feature fields and
    the review body.  Only reviews that ``updateSessionInfo`` reports as
    new/updated are appended.  Returns True on success; re-raises any
    unexpected exception after logging it.

    NOTE(review): nearly every field is wrapped in a bare ``except`` so a
    single malformed review degrades to empty fields rather than aborting
    the whole page — intentional best-effort scraping, it seems.
    """
    try:
        # Timestamp marker that separates consecutive reviews in the markup.
        pat=re.compile('<i>\d+-\d+-\d+ \d+:\d+:\d+</i>')
        review_soup=[each for each in self.soup.findAll('td') if each.find('font', {'face':'verdana, geneva, arial'})]
        date=re.findall(pat, str(review_soup))
        # Dummy 0th entry so `date[i]` lines up with `review_info[i]`
        # below (review_info[0] is the pre-first-timestamp text and is
        # never processed).
        date.insert(0,0)
        review_info=re.split(pat, str(review_soup))
        # Only process the page if the session store has not seen this
        # URI yet (or an update pass is requested).
        if not checkSessionInfo(self.genre, self.session_info_out, self.currenturi, self.task.instance_data.get('update'), parent_list=[self.parent_url]):
            for i in xrange(1, len(review_info)):
                page={}
                # Re-parse just this review's HTML fragment.
                review=BeautifulSoup(review_info[i])
                try:
                    if review.find(text=re.compile('Reviewer:' , re.DOTALL)):
                        try:
                            # Author name is the tag following 'Reviewer:'.
                            page['et_author_name']=review.find(text=re.compile('Reviewer:' , re.DOTALL)).findNext().renderContents().strip()
                        except:
                            page['et_author_name']=''
                            log.exception(self.log_msg("exception in fetching the author name"))
                except:
                    log.exception(self.log_msg("exception in fetching the author name"))
                try:
                    # Text right after the author tag, e.g. ' from <place>'.
                    loc= review.find(text=re.compile('Reviewer:')).findNext().next.next
                    if loc.__contains__('from'):
                        # NOTE(review): strip('from') removes the chars
                        # f/r/o/m from both ends, not the word 'from' —
                        # works only because locations rarely start/end
                        # with those letters. TODO confirm intent.
                        page['et_author_location']=review.find(text=re.compile('Reviewer:')).\
                            findNext().next.next.strip().strip('from')
                    else:
                        page['et_author_location']=''
                except:
                    page['et_author_location']=''
                    log.exception(self.log_msg("exception in fetching the author Location"))
                try:
                    # Star ratings live in the 'alt' attribute of the image
                    # following each label, e.g. alt="4 stars" (presumably —
                    # strip('stars') trims those chars from both ends).
                    page['ef_overall_rating']=float(review.find(text=re.compile('Overall:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the overall_ratings"))
                try:
                    page['ef_comfort_rating']= float(review.find(text=re.compile('Comfort:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the Comfort_ratings"))
                try:
                    page['ef_look_rating']= float(review.find(text=re.compile('Look:')).\
                        findNext()['alt'].strip('stars'))
                except:
                    log.exception(self.log_msg("exception in fetching the Look_ratings"))
                try:
                    # Feature name/value pairs (shoe arch, width, size, ...)
                    # are rendered as colored <font> labels; the last one is
                    # skipped (presumably not a feature — TODO confirm).
                    feature=review.findAll('font', color="#333399" )
                    for each in feature[:-1]:
                        page['et_feature_'+each.renderContents()]=each.next.next
                except:
                    log.exception(self.log_msg("exception in fetching the et_features like"\
                        "shoe arch, shoe width etc.. "))
                try:
                    # The review body follows the last feature label present;
                    # probe the labels in their usual document order.
                    if review.find(text=re.compile('Shoe Arch:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Arch:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Shoe Width:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Width:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Shoe Size:')):
                        page['data']=stripHtml(review.find(text=re.compile('Shoe Size:')).\
                            findNext().next.next).strip()
                    elif review.find(text=re.compile('Look:')):
                        page['data']=stripHtml(review.find(text=re.compile('Look:')).findNext().next.next.next).strip()
                except:
                    page['data']=''
                    log.exception(self.log_msg("exception in fetching Data"))
                try:
                    # Title is everything before the first line break.
                    # NOTE: rebinds `review` to its string form from here on.
                    review=str(review)
                    page['title']= review[0:review.index('<br />')]
                except:
                    page['title']=''
                    log.exception(self.log_msg("exception in fetching Title"))
                try:
                    review_hash = get_hash(page)
                except:
                    # NOTE(review): if get_hash fails on the FIRST review,
                    # review_hash is unbound and updateSessionInfo below
                    # raises NameError (caught by the outer handler);
                    # on later reviews the previous hash leaks through.
                    log.exception(self.log_msg('could not generate review_hash for '+ self.currenturi))
                try:
                    # Re-format the captured '<i>...</i>' timestamp to ISO-8601.
                    # strip('</?i>') trims the surrounding tag characters.
                    page['posted_date']= datetime.strftime(datetime.strptime(date[i].\
                        strip('</?i>'),'%Y-%m-%d %H:%M:%S'),'%Y-%m-%dT%H:%M:%SZ')
                except:
                    log.exception(self.log_msg("exception in fetching Posted_date"))
                    # Fall back to "now" so the record still carries a date.
                    page['posted_date']=datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                result=updateSessionInfo(self.genre, self.session_info_out, review_hash, \
                    review_hash,'Review', self.task.instance_data.get('update'),\
                    parent_list=[self.parent_url])
                if result['updated']:
                    # New/changed review: attach task bookkeeping metadata
                    # and queue the record.
                    page['uri'] = self.currenturi
                    page['id'] = result['id']
                    page['priority']=self.task.priority
                    page['level']=self.task.level
                    page['pickup_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
                    page['connector_instance_log_id'] = self.task.connector_instance_log_id
                    page['connector_instance_id'] = self.task.connector_instance_id
                    page['workspace_id'] = self.task.workspace_id
                    page['client_id'] = self.task.client_id
                    page['client_name'] = self.task.client_name
                    page['last_updated_time'] = page['pickup_date']
                    page['versioned'] = False
                    page['first_version_id']=result['first_version_id']
                    page['entity'] = 'Review'
                    page['category'] = self.task.instance_data.get('category','')
                    # Parent id = session id minus its last '-'-separated part.
                    page['parent_id']= '-'.join(result['id'].split('-')[:-1])
                    page['task_log_id']=self.task.id
                    page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
                    self.pages.append(page)
                    self.count=self.count+1
                    log.info(self.log_msg('Adding %dth review of %s ' % (self.count, self.currenturi)))
        return True
    except Exception, e:
        # Unexpected failure: log with context, then propagate to caller.
        log.exception(self.log_msg('Exception in addReviews'))
        raise e