def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, 'xmlUrl') and hasattr(item, 'text'):
            folder = item
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, 'xmlUrl'):
            feed = item
            if not hasattr(feed, 'htmlUrl'):
                setattr(feed, 'htmlUrl', None)
            # If feed title matches what's in the DB, don't override it on subscription.
            feed_title = getattr(feed, 'title', None) or getattr(feed, 'text', None)
            if not feed_title:
                setattr(feed, 'title', feed.htmlUrl or feed.xmlUrl)
                user_feed_title = None
            else:
                setattr(feed, 'title', feed_title)
                user_feed_title = feed.title
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                continue
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                        feed_link=feed_link,
                                                        defaults=dict(**feed_data))
            if user_feed_title == feed_db.feed_title:
                user_feed_title = None
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium,
                    'user_title': user_feed_title
                }
            )
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            folders.append(feed_db.pk)
    return folders
def _process_item(self, item):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '', 1)
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        feed = {
            'title': feed_title,
            'url': feed_address,
            'link': feed_link,
            'category': category,
        }
        return feed
    except Exception, e:
        print '---->Exception: %s: %s' % (e, item)
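# --- Usage sketch (illustrative, not part of the importer) ---
# A minimal example of the XML shape the xpaths in _process_item above
# expect: a Google Reader-style subscription rendered as <object>/<string>
# nodes. The sample document, the lxml parsing, and the `importer` name
# are assumptions for illustration only.
from lxml import etree

item = etree.fromstring(
    '<object>'
    '<string name="id">feed/http://example.com/rss</string>'
    '<string name="title">Example Feed</string>'
    '<string name="htmlUrl">http://example.com/</string>'
    '<list name="categories">'
    '<object><string name="label">Tech</string></object>'
    '</list>'
    '</object>')
# importer._process_item(item) would return:
# {'title': 'Example Feed', 'url': 'http://example.com/rss',
#  'link': 'http://example.com/', 'category': 'Tech'}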
def _addLinksToCrawler(self):
    """Clone the current task for each qualifying link on the page and
    queue the clones for crawling."""
    try:
        log.info(self.log_msg('levels : %s , %s:%s:%s' % (self.currenturi, self.task.level, self.level, self.max_recursion_level)))
        if self.task.level > self.max_recursion_level and not self.task.instance_data.get('metapage'):
            log.debug('TaskID:%s::Client:%s::recursion level greater than MAX, returning for %s' % (self.task.id, self.task.client_name, self.currenturi))
            return
        # increment = 1
        # if self.task.instance_data['metapage']:
        #     increment = 0
        for anchor in self.soup.findAll('a', href=True):
            try:
                url = normalize(unicode(anchor['href']), self.currenturi, self.base)
                # apply regex patterns to urls
                if self.task.instance_data.get('url_filter'):
                    url_pattern = re.compile(self.task.instance_data['url_filter'], re.IGNORECASE | re.DOTALL)
                    if not url_pattern.search(url):
                        continue
                log.info(self.log_msg("clone uri :: %s" % url))
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = url
                # temp_task.level = int(self.task.level) + increment
                temp_task.pagedata['title'] = getTitleFromLink(anchor)
                temp_task.priority = self.task.priority
                self.linksOut.append(temp_task)
            except:
                log.exception('TaskID:%s::Client:%s::failed to create one of the clone tasks' % (self.task.id, self.task.client_name))
                continue
        return True
    except:
        log.exception('TaskID:%s::Client:%s::addLinksToCrawler failed' % (self.task.id, self.task.client_name))
def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, 'xmlUrl'):
            folder = item
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, 'xmlUrl'):
            feed = item
            if not hasattr(feed, 'htmlUrl'):
                setattr(feed, 'htmlUrl', None)
            if not hasattr(feed, 'title') or not feed.title:
                setattr(feed, 'title', feed.htmlUrl or feed.xmlUrl)
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                continue
            if len(feed.title) > Feed._meta.get_field('feed_title').max_length:
                feed.title = feed.title[:255]
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                        defaults=dict(**feed_data))
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium,
                })
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            folders.append(feed_db.pk)
    return folders
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
            return folders

        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_title=feed_title)
            feed_data['active_subscribers'] = 1
            feed_data['num_subscribers'] = 1
            feed_db, _ = Feed.find_or_create(feed_address=feed_address,
                                             feed_link=feed_link,
                                             defaults=dict(**feed_data))
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                'needs_unread_recalc': True,
                'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                'active': self.user.profile.is_premium or self.auto_active,
            })
        if not us.needs_unread_recalc:
            us.needs_unread_recalc = True
            us.save()
        if not category:
            category = ""
        if category:
            obj = {category: []}
            folders = add_object_to_folder(obj, '', folders)
        folders = add_object_to_folder(feed_db.pk, category, folders)
        # if feed_db.pk not in folders[category]:
        #     folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(' *** -> Exception: %s: %s' % (e, item))
def get_or_create(cls, address, title='', link=''):
    address = urlnorm.normalize(address)
    link = link and urlnorm.normalize(link)
    feed = cls.get_by_url(address)
    if feed:
        return feed, True
    feed = Feed(address, title=title, link=link)
    feed.save()
    return feed.update(), False
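# --- Usage sketch (illustrative) ---
# How the classmethod above might be called; the URL, title, and link are
# made up. Note the flag in the returned tuple is True when an *existing*
# feed was found, the reverse of Django's get_or_create convention.
feed, found = Feed.get_or_create('HTTP://Example.COM/rss',
                                 title='Example Feed',
                                 link='http://example.com/')
if found:
    print 'reusing existing feed:', feed
else:
    print 'created new feed (fetched once via update()):', feed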
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
        "feed/", ""
    )
    feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
    category = (
        item.xpath('./list[@name="categories"]/object/string[@name="label"]')
        and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    )
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
            return folders

        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_title=feed_title)
            feed_data["active_subscribers"] = 1
            feed_data["num_subscribers"] = 1
            feed_db, _ = Feed.find_or_create(
                feed_address=feed_address, feed_link=feed_link, defaults=dict(**feed_data)
            )
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                "needs_unread_recalc": True,
                "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                "active": self.user.profile.is_premium or self.auto_active,
            },
        )
        if not us.needs_unread_recalc:
            us.needs_unread_recalc = True
            us.save()
        if not category:
            category = ""
        if category:
            obj = {category: []}
            folders = add_object_to_folder(obj, "", folders)
        folders = add_object_to_folder(feed_db.pk, category, folders)
        # if feed_db.pk not in folders[category]:
        #     folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(" *** -> Exception: %s: %s" % (e, item))
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
            return folders

        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
            feed_data['active_subscribers'] = 1
            feed_data['num_subscribers'] = 1
            feeds = Feed.objects.filter(feed_address=feed_address,
                                        branch_from_feed__isnull=True).order_by('-num_subscribers')
            if feeds:
                feed_db = feeds[0]
            else:
                feed_db = Feed.objects.create(**feed_data)
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                'needs_unread_recalc': True,
                'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                'active': self.user.profile.is_premium,
            })
        if not category:
            category = "Root"
        folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(' *** -> Exception: %s' % e)
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and item.xpath('./string[@name="id"]')[0].text.replace(
        "feed/", ""
    )
    feed_link = item.xpath('./string[@name="htmlUrl"]') and item.xpath('./string[@name="htmlUrl"]')[0].text
    category = (
        item.xpath('./list[@name="categories"]/object/string[@name="label"]')
        and item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    )
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
            return folders

        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed_title)
            feed_data["active_subscribers"] = 1
            feed_data["num_subscribers"] = 1
            feeds = Feed.objects.filter(feed_address=feed_address, branch_from_feed__isnull=True).order_by(
                "-num_subscribers"
            )
            if feeds:
                feed_db = feeds[0]
            else:
                feed_db = Feed.objects.create(**feed_data)
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                "needs_unread_recalc": True,
                "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                "active": self.user.profile.is_premium,
            },
        )
        if not category:
            category = "Root"
        folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(" *** -> Exception: %s" % e)
def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, "xmlUrl"):
            folder = item
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, "xmlUrl"):
            feed = item
            if not hasattr(feed, "htmlUrl"):
                setattr(feed, "htmlUrl", None)
            if not hasattr(feed, "title") or not feed.title:
                setattr(feed, "title", feed.htmlUrl or feed.xmlUrl)
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field("feed_address").max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field("feed_link").max_length:
                continue
            if len(feed.title) > Feed._meta.get_field("feed_title").max_length:
                feed.title = feed.title[:255]
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data["active_subscribers"] = 1
                feed_data["num_subscribers"] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address, defaults=dict(**feed_data))
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    "needs_unread_recalc": True,
                    "mark_read_date": datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    "active": self.user.profile.is_premium,
                },
            )
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            folders.append(feed_db.pk)
    return folders
def process_item(self, item, folders):
    feed_title = item.xpath('./string[@name="title"]') and \
                 item.xpath('./string[@name="title"]')[0].text
    feed_address = item.xpath('./string[@name="id"]') and \
                   item.xpath('./string[@name="id"]')[0].text.replace('feed/', '')
    feed_link = item.xpath('./string[@name="htmlUrl"]') and \
                item.xpath('./string[@name="htmlUrl"]')[0].text
    category = item.xpath('./list[@name="categories"]/object/string[@name="label"]') and \
               item.xpath('./list[@name="categories"]/object/string[@name="label"]')[0].text
    if not feed_address:
        feed_address = feed_link
    try:
        feed_link = urlnorm.normalize(feed_link)
        feed_address = urlnorm.normalize(feed_address)
        if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
            return folders

        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
        if duplicate_feed:
            feed_db = duplicate_feed[0].feed
        else:
            feed_data = dict(feed_title=feed_title)
            feed_data['active_subscribers'] = 1
            feed_data['num_subscribers'] = 1
            feed_db, _ = Feed.find_or_create(feed_address=feed_address,
                                             feed_link=feed_link,
                                             defaults=dict(**feed_data))
        us, _ = UserSubscription.objects.get_or_create(
            feed=feed_db,
            user=self.user,
            defaults={
                'needs_unread_recalc': True,
                'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                'active': self.user.profile.is_premium or self.auto_active,
            }
        )
        if not us.needs_unread_recalc:
            us.needs_unread_recalc = True
            us.save()
        if not category:
            category = "Root"
        if feed_db.pk not in folders[category]:
            folders[category].append(feed_db.pk)
    except Exception, e:
        logging.info(' *** -> Exception: %s: %s' % (e, item))
def fetch(self):
    """Crawl the Laptopical review listing, queue each review link as a
    cloned task, and parse individual review pages."""
    try:
        self.parenturi = self.currenturi
        self.genre = "Review"
        if self.currenturi == 'http://www.laptopical.com/laptop-reviews.html':
            if not self._setSoup():
                return False
            hrefs = ['http://www.laptopical.com' + div.find('a')['href']
                     for div in self.soup.find('div', {'id': 'review-listing'}).find('ul').findAll('li')
                     if div.find('a') is not None]
            for href in hrefs:
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = normalize(href)
                self.linksOut.append(temp_task)
            log.info('Total uris are %d' % (len(hrefs)))
            return True
        if re.compile('http://www.laptopical.com/.+?\.html').match(self.currenturi):
            if not self._setSoup():
                return False
            self._getParentPage()
            self._addReview()
            return True
    except:
        log.exception('error in fetch')
        return False
def __processRSSFeeds(self):
    '''This will process the RSS feeds of Facebook.'''
    log.debug(self.log_msg("Entry Webpage: " + str(self.currenturi)))
    parser = feedparser.parse(self.currenturi)
    # Check the parser itself before touching parser.version
    if not parser or len(parser.version) == 0:
        log.info(self.log_msg('parser version not found, returning'))
        return False
    log.info('number of entries %s' % (len(parser.entries)))
    for entity in parser.entries:
        try:
            if checkSessionInfo('Review', self.session_info_out, entity['link'],
                                self.task.instance_data.get('update')):
                log.info(self.log_msg('Session info returns True for uri %s' % entity['link']))
                continue
            result = updateSessionInfo('Review', self.session_info_out, entity['link'], '',
                                       'Post', self.task.instance_data.get('update'))
            if not result['updated']:
                log.info(self.log_msg('Result not updated for uri %s' % entity['link']))
                continue
            temp_task = self.task.clone()
            temp_task.instance_data['uri'] = normalize(entity['link'])
            temp_task.pagedata['title'] = entity['title']
            temp_task.pagedata['source'] = 'facebook.com'
            temp_task.instance_data['connector_name'] = 'HTMLConnector'
            temp_task.pagedata['source_type'] = 'rss'
            self.linksOut.append(temp_task)
        except:
            log.exception(self.log_msg("exception in adding temptask to linksout"))
    return True
def fetch(self):
    self.genre = "Review"
    try:
        self.__base_uri = 'http://answers.yahoo.com/'
        code = None
        parent_uri = self.currenturi
        res = self._getHTML()
        self.rawpage = res['result']
        self._setCurrentPage()
        self.POSTS_ITERATIONS = tg.config.get(path='Connector', key='yahooanswers_numposts')
        self.__max_date_submission_date = tg.config.get(path='Connector', key='yahooanswers_max_date_submission')
        self.curiter = 0
        if '/question/index' not in self.currenturi:
            self.__createSiteUrl()
            next_page = self.soup.find('li', {'class': 'next'})
            while self.addQuestionUrls(parent_uri) and next_page:
                try:
                    self.currenturi = normalize(self.__base_uri + next_page.a['href'])
                    log.debug(self.log_msg("Fetching url %s" % (self.currenturi)))
                    res = self._getHTML()
                    self.rawpage = res['result']
                    self._setCurrentPage()
                    next_page = self.soup.find('li', {'class': 'next'})
                except Exception, e:
                    log.exception(self.log_msg('exception in iterating pages in fetch'))
                    break
        else:
def get_feed_from_url(cls, url):
    feed = None

    def by_url(address):
        feed = cls.objects.filter(feed_address=address)
        if not feed:
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=address).order_by('pk')
            if duplicate_feed:
                feed = [duplicate_feed[0].feed]
        return feed

    url = urlnorm.normalize(url)
    feed = by_url(url)
    if feed:
        feed = feed[0]
    else:
        if feedfinder.isFeed(url):
            feed = cls.objects.create(feed_address=url)
            feed = feed.update()
        else:
            feed_finder_url = feedfinder.feed(url)
            if feed_finder_url:
                feed = by_url(feed_finder_url)
                if not feed:
                    feed = cls.objects.create(feed_address=feed_finder_url)
                    feed = feed.update()
                else:
                    feed = feed[0]
    return feed
def __getParentPage(self):
    """This will get the parent info."""
    page = {}
    try:
        self.hierarchy = page['et_thread_hierarchy'] = \
            [stripHtml(x.renderContents()) for x in
             self.soup.find('div', 'CommonBreadCrumbArea').findAll('a')][1:]
        page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title'] = ''
    try:
        self.thread_id = page['et_thread_id'] = unicode(self.currenturi.split('/')[-1].replace('.aspx', ''))
    except:
        log.info(self.log_msg('Thread id not found'))
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    for each in ['et_thread_last_post_author', 'ei_thread_replies_count', 'edate_last_post_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted for %s' % each))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                                   post_hash, 'Forum', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def api_save_new_subscription(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    url = urlnorm.normalize(fields['url'])
    folder = fields['folder']

    if folder == "Top Level":
        folder = " "

    code, message, us = UserSubscription.add_subscription(
        user=user,
        feed_address=url,
        folder=folder,
        bookmarklet=True
    )
    logging.user(request, "~FRAdding URL from ~FC~SBIFTTT~SN~FR: ~SB%s (in %s)" % (url, folder))

    if us and us.feed:
        url = us.feed.feed_address

    return {"data": [{
        "id": us and us.feed_id,
        "url": url,
    }]}
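# --- Payload sketch (illustrative) ---
# The IFTTT action body api_save_new_subscription reads; the two field
# names come from the lookups above, the values are placeholders.
body_json = {
    "actionFields": {
        "url": "http://example.com/rss",  # normalized before subscribing
        "folder": "Top Level",            # mapped to the root folder " "
    }
}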
def __getParentPage(self):
    '''Populate and store the parent (thread) page info.'''
    if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    page = {}
    try:
        page['et_thread_hierarchy'] = \
            [each.replace('>', '').strip() for each in
             stripHtml(self.soup.find('span', 'navbar').findParent('table').renderContents()).split('\n')
             if not each.strip() == '']
        page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
        page['title'] = ''
    for each in ['title', 'et_last_post_author_name', 'ei_thread_replies_count',
                 'ei_thread_views_count', 'edate_last_post_date',
                 'ei_thread_votes_count', 'ef_thread_rating']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('page data cannot be extracted'))
    try:
        page['et_thread_id'] = self.currenturi.split('&')[-1].split('=')[-1]
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.currenturi]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        # page['first_version_id'] = result['first_version_id']
        page['data'] = ''
        # page['id'] = result['id']
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def get_by_url(cls, url):
    url = urlnorm.normalize(url)
    feeds = cls.filter({'feed_address': url})
    if feeds and len(feeds) > 0:
        return feeds[0]
    feeds = cls.filter({'feed_link': url})
    if feeds and len(feeds) > 0:
        return feeds[0]
    return None
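# --- Lookup sketch (illustrative) ---
# get_by_url tries feed_address first, then falls back to feed_link, so
# both calls below can resolve to the same Feed row; None is returned when
# neither field matches. URLs are placeholders.
feed = Feed.get_by_url('http://example.com/rss')  # hits feed_address
same = Feed.get_by_url('http://example.com/')     # falls back to feed_link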
def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    content = fields.get('story_content', "")
    story_title = fields.get('story_title', "[Untitled]")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)

    feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)

    content = lxml.html.fromstring(content)
    content.make_links_absolute(story_url)
    content = lxml.html.tostring(content)

    shared_story = MSharedStory.objects.filter(user_id=user.pk,
                                               story_feed_id=feed and feed.pk or 0,
                                               story_guid=story_url).limit(1).first()
    if not shared_story:
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title,
            "story_feed_id": feed and feed.pk or 0,
            "story_content": content,
            "story_author": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        shared_story = MSharedStory.objects.create(**story_db)
        socialsubs = MSocialSubscription.objects.filter(subscription_user_id=user.pk)
        for socialsub in socialsubs:
            socialsub.needs_unread_recalc = True
            socialsub.save()
        logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" % (story_url, comments))

    try:
        socialsub = MSocialSubscription.objects.get(user_id=user.pk, subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None

    if socialsub:
        socialsub.mark_story_ids_as_read([shared_story.story_hash],
                                         shared_story.story_feed_id,
                                         request=request)
    else:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)

    shared_story.publish_update_to_subscribers()

    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}
def fetch(self):
    """Fetch the post of a tea review and add all info to the base class."""
    try:
        self.genre = "Review"
        self.parent_uri = self.currenturi
        if not self._setSoup():
            log.info(self.log_msg('Task uri not set, cannot proceed'))
            return False
        if self.currenturi == 'http://www.teadiscussion.com/categories/index.php':
            category_uris = ['http://www.teadiscussion.com/categories/' + each['href'] for each in
                             self.soup.find('p', text='Reviews of tea by types of tea:').parent.findNext('ul').findAll('a')]
            for each in category_uris:
                self.currenturi = each
                if self._setSoup():
                    for href in [ahref['href'] for ahref in self.soup.findAll('a', 'categoryTitle')]:
                        temp_task = self.task.clone()
                        temp_task.instance_data['uri'] = normalize(href)
                        self.linksOut.append(temp_task)
            return True
        if not self.__getParentPage():
            log.info(self.log_msg('Parent page not posted'))
        self.__addReview()
        return True
    except:
        log.exception(self.log_msg('Error in Fetch'))
        return False
def get_feed_from_url(cls, url):
    feed = None

    def by_url(address):
        feed = cls.objects.filter(feed_address=address)
        if not feed:
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=address).order_by('pk')
            if duplicate_feed:
                feed = [duplicate_feed[0].feed]
        return feed

    url = urlnorm.normalize(url)
    feed = by_url(url)
    if feed:
        feed = feed[0]
    else:
        if feedfinder.isFeed(url):
            feed = cls.objects.create(feed_address=url)
            feed = feed.update()
        else:
            feed_finder_url = feedfinder.feed(url)
            if feed_finder_url:
                feed = by_url(feed_finder_url)
                if not feed:
                    feed = cls.objects.create(feed_address=feed_finder_url)
                    feed = feed.update()
                else:
                    feed = feed[0]
    return feed
def __addPosts(self, links, parent_list):
    """Given a list of links to the discussion post, fetch the post
    contents and the author info."""
    h = HTTPConnection()
    for link in links:
        try:
            page = {}
            object_id = re.search('objectID=(\d+)', link).group(1)
            # Using the redirected url instead of the url given by the search page
            link = "http://communities.vmware.com/message/%s#%s" % (object_id, object_id)
            self.currenturi = link
            page['uri'] = normalize(link)
            log.debug(self.log_msg("Fetching the post url %s" % (self.currenturi)))
            if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                self.task.instance_data.get('update'),
                                parent_list=parent_list):
                # No need to pick this page
                continue
            res = self._getHTML()
            self.rawpage = res['result']
            self._setCurrentPage()
            # First try extracting from the post body
            if not self.__extractPostBody(page, object_id):
                # if that fails, extract from the replies
                self.__extractReplyBody(page, object_id)
        except:
            log.exception(self.log_msg("exception in extracting page"))
            continue
        page['posted_date'] = datetime.datetime.strftime(page['posted_date'], "%Y-%m-%dT%H:%M:%SZ")
        # Calculate the hash and get the session info thingy
        checksum = md5.md5(''.join(sorted(page.values())).encode('utf-8', 'ignore')).hexdigest()
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi, checksum,
                                   'Post', self.task.instance_data.get('update'),
                                   parent_list=parent_list, Id=id)
        if result['updated']:
            page['path'] = page['parent_path'] = parent_list
            page['path'].append(self.currenturi)
            page['priority'] = self.task.priority
            page['level'] = self.task.level
            page['pickup_date'] = datetime.datetime.strftime(datetime.datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id  # TODO: Get the client from the project
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category', '')
            page['task_log_id'] = self.task.id
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            self.pages.append(page)
    return True
def process_outline(self, outline):
    folders = []
    for item in outline:
        if not hasattr(item, 'xmlUrl'):
            folder = item
            if hasattr(folder, 'text'):
                logging.info(' ---> [%s] New Folder: %s' % (self.user, folder.text))
            folders.append({folder.text: self.process_outline(folder)})
        elif hasattr(item, 'xmlUrl'):
            feed = item
            if not hasattr(feed, 'htmlUrl'):
                setattr(feed, 'htmlUrl', None)
            if not hasattr(feed, 'title'):
                setattr(feed, 'title', feed.htmlUrl)
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                continue
            if feed.title and len(feed.title) > Feed._meta.get_field('feed_title').max_length:
                feed.title = feed.title[:255]
            logging.info(' ---> \t%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.objects.get_or_create(feed_address=feed_address,
                                                        defaults=dict(**feed_data))
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1)
                }
            )
            folders.append(feed_db.pk)
    return folders
def __getParentPage(self):
    '''This will get the Parent Page info.'''
    page = {}
    try:
        self.hierarchy = page['et_thread_hierarchy'] = \
            [stripHtml(x.renderContents()) for x in
             self.soup.find('div', {'class': 'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}).findAll('li')]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
    try:
        self.forum_title = page['title'] = stripHtml(self.soup.find('h2').renderContents())
    except:
        log.info(self.log_msg('Title Not Found'))
        page['title'] = ''
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True'))
        return False
    for each in ['et_author_name', 'ei_thread_replies_count', 'ei_thread_view_count',
                 'ei_author_count', 'et_last_post_author', 'edate_last_post_date', 'posted_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('Page data cannot be extracted for %s' % each))
    try:
        page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4].split('&')[0].split('ThreadId=')[1])
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), '%Y-%m-%dT%H:%M:%SZ')
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def __getThreadPage(self):
    """It will fetch each thread and its associated information.

    Legacy parsing kept for reference:
        threads = [each.parent for each in soup.findAll('td', 'AFTopicRow1')]
        for thread in threads:
            thread_title = stripHtml(thread.find('td', 'afsubject').renderContents())
            thread_title = re.sub('^New article:', '', thread_title).strip()
            thread_info = thread.findAll('td', 'AFTopicRow1')
            if len(thread_info) == 5:
                thread_owner = stripHtml(thread_info[1].renderContents())
                thread_no_replies = int(stripHtml(thread_info[2].renderContents()))
                thread_no_of_views = int(stripHtml(thread_info[3].renderContents()))
                date_str = stripHtml(thread_info[4].renderContents())
                posted_date = datetime.strftime(datetime.strptime(date_str, '%m/%d/%Y %I:%M %p'), "%Y-%m-%dT%H:%M:%SZ")
    """
    threads = [each.parent for each in self.soup.findAll('td', 'afcol1')]
    for thread in threads:
        page = {}
        thread_info = thread.findAll('td', 'AFTopicRow1')
        if not len(thread_info) == 5:
            continue
        try:
            page['title'] = stripHtml(thread_info[0].renderContents())
        except:
            log.info(self.log_msg('Title not found'))
        try:
            page['et_author_name'] = stripHtml(thread_info[1].renderContents())
        except:
            log.info(self.log_msg('Thread author name not found'))
        try:
            page['ei_thread_num_replies'] = int(stripHtml(thread_info[2].renderContents()))
        except:
            log.info(self.log_msg('No of Replies not found'))
        try:
            page['ei_thread_num_views'] = int(stripHtml(thread_info[3].renderContents()))
        except:
            log.info(self.log_msg('no of views not found'))
        try:
            date_str = stripHtml(thread_info[4].renderContents()).split('\n')[-1].strip()
            thread_time = datetime.strptime(date_str, '%m/%d/%Y %I:%M %p')
            page['edate_last_post_date'] = datetime.strftime(thread_time, "%Y-%m-%dT%H:%M:%SZ")
            self.last_timestamp = max(thread_time, self.last_timestamp)
        except:
            log.exception(self.log_msg('Posted date not found'))
            continue
        self.total_posts_count = self.total_posts_count + 1
        try:
            if not checkSessionInfo('Search', self.session_info_out, thread_time,
                                    self.task.instance_data.get('update')) \
                    and self.max_posts_count >= self.total_posts_count:
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = normalize(thread_info[0].find('a')['href'])
                temp_task.pagedata['title'] = page['title']
                temp_task.pagedata['et_author_name'] = page['et_author_name']
                temp_task.pagedata['ei_thread_num_replies'] = page['ei_thread_num_replies']
                temp_task.pagedata['ei_thread_num_views'] = page['ei_thread_num_views']
                temp_task.pagedata['edate_last_post_date'] = page['edate_last_post_date']
                self.linksOut.append(temp_task)
        except:
            log.exception(self.log_msg('Task cannot be created'))
def fetch_feeds(urls, n, output_path, options):
    if not options['quite']:  # 'quite' is the (misspelled) quiet flag used throughout
        print 'fetch feeds'
    for url in urls:
        url = urlnorm.normalize(url)
        if True or feedfinder.isFeed(url):  # isFeed() check is short-circuited: every URL is fetched
            fetch(url, n, output_path, options)
        else:
            print '%s is not a feed url' % url
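# --- Invocation sketch (illustrative) ---
# A minimal call to fetch_feeds; note the options dict must use the
# misspelled 'quite' key the function actually reads. URLs and the output
# path are placeholders.
fetch_feeds(['http://example.com/rss', 'http://example.org/atom.xml'],
            10, '/tmp/feeds', {'quite': False})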
def __getParentPage(self):
    '''Get all parent page information.'''
    # log.info('URL::' + self.currenturi)
    page = {}
    try:
        for field in ['et_professor_name', 'et_is_hot', 'et_department',
                      'et_total_ratings', 'ef_rating_overall', 'ef_avg_rating_easiness']:
            default_value = ''
            if field.startswith('ef'):
                default_value = 0
            page[field] = self.task.pagedata.get(field, default_value)
        if checkSessionInfo(self.genre, self.session_info_out, self.parenturi,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session Info Returns True for Parent Page'))
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parenturi,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            log.debug(self.log_msg("Parent page not stored"))
            return False
        page['path'] = [self.parenturi]
        page['parent_path'] = []
        page['task_log_id'] = self.task.id
        page['versioned'] = self.task.instance_data.get('versioned', False)
        page['category'] = self.task.instance_data.get('category', 'generic')
        page['last_updated_time'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['client_name'] = self.task.client_name
        page['entity'] = 'post'
        page['data'] = ''
        page['title'] = page['et_professor_name']
        page['uri'] = normalize(self.parenturi)
        page['uri_domain'] = urlparse(page['uri'])[1]
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        self.pages.append(page)
        log.debug(self.log_msg("Parent page added"))
        return True
    except:
        log.exception(self.log_msg('Exception in GetParentPage'))
        return False
def _addArticlePage(self):
    """Get the article info: title, posted_date, uri, source."""
    page = {}
    self.genre = 'Review'
    head_str = ''
    title = self.soup.find('h1', 'a4')
    if title:
        page = self.__getArticlePage(page)
    elif self.soup.find('span', 'homeSplashTitle'):
        page = self.__getMagazinePage(page)
    else:
        log.info(self.log_msg('It is an article page from another site'))
        return False
    try:
        article_hash = get_hash(page)
        if checkSessionInfo(self.genre, self.session_info_out, self.currenturi,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('check Session Info return True'))
            return False
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.currenturi,
                                   article_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            log.info(self.log_msg('result of update is false'))
            return False
        page['id'] = result['id']
        page['first_version_id'] = result['first_version_id']
        page['task_log_id'] = self.task.id
        page['versioned'] = self.task.instance_data.get('versioned', False)
        page['category'] = self.task.instance_data.get('category', 'generic')
        page['last_updated_time'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['client_name'] = self.task.client_name
        page['entity'] = 'Review'
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = urlparse(page['uri'])[1]
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        self.pages.append(page)
        log.info(self.log_msg("Review added"))
        return True
    except:
        log.exception(self.log_msg('Error with session info'))
        return False
def __getParentPage(self):
    """This will fetch the parent info.
    This has nothing but title."""
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return False
    page = {}
    try:
        page['title'] = stripHtml(self.soup.find('h1').renderContents())
        # page['title'] = stripHtml(self.soup.find('span', id=re.compile('.*ProductTitle$')).renderContents())
    except:
        log.info(self.log_msg('Title cannot be found'))
    try:
        page['et_product_rating'] = float(stripHtml(self.soup.find('span', 'mark').renderContents()).split('/')[0].strip())
    except:
        log.info(self.log_msg('Product rating not found'))
    try:
        log.info(page)
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            log.info(self.log_msg('result [update] return false'))
            return False
        # page['first_version_id'] = result['first_version_id']
        # page['id'] = result['id']
        page['parent_path'] = []
        page['path'] = [self.parent_uri]
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def __getParentPage(self):
    try:
        # To sort the results in date descending order
        if not self.category_id:
            sort_url = "http://www.linkedin.com/searchAnswers?runSearch=&keywords=%s&searchScope=&questionStatus=all&sortType=dat" % (quote_plus(self.keyword_term))
        else:
            sort_url = "http://www.linkedin.com/searchAnswers?runSearch=&keywords=%s&searchScope=&questionStatus=all&sortType=dat&categoryID=%s" % (quote_plus(self.keyword_term), self.category_id)
        res = self._getHTML(sort_url)
        if res:
            self.rawpage = res['result']
            self._setCurrentPage()
        page = {}
        page['title'] = self.task.instance_data['uri']
        page['data'] = ''
        try:
            post_hash = get_hash(page)
        except:
            log.info(self.log_msg("Error occurred while making parent post hash, not fetching the parent page data"))
            return False
        if not checkSessionInfo(self.genre, self.session_info_out,
                                self.keyword_term + str(self.category_id),
                                self.task.instance_data.get('update')):
            id = None
            if self.session_info_out == {}:
                id = self.task.id
            log.debug(id)
            result = updateSessionInfo(self.genre, self.session_info_out,
                                       self.keyword_term + str(self.category_id), post_hash,
                                       'Post', self.task.instance_data.get('update'), Id=id)
            if result['updated']:
                page['path'] = [self.keyword_term + str(self.category_id)]
                page['parent_path'] = []
                page['task_log_id'] = self.task.id
                page['versioned'] = self.task.instance_data.get('versioned', False)
                page['category'] = self.task.instance_data.get('category', 'generic')
                page['last_updated_time'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
                page['client_name'] = self.task.client_name
                page['entity'] = 'post'
                page['uri'] = normalize(self.currenturi)
                page['uri_domain'] = unicode(urlparse(self.currenturi)[1])
                page['priority'] = self.task.priority
                page['level'] = self.task.level
                page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
                page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
                page['connector_instance_log_id'] = self.task.connector_instance_log_id
                page['connector_instance_id'] = self.task.connector_instance_id
                page['workspace_id'] = self.task.workspace_id
                page['client_id'] = self.task.client_id
                self.pages.append(page)
                log.debug(self.log_msg("Main page details stored"))
            else:
                log.debug(self.log_msg("Main page details NOT stored"))
        else:
            log.debug(self.log_msg("Main page details NOT stored"))
        return True
    except:
        log.exception(self.log_msg("Exception occurred in __getParentPage()"))
        return False
def fetch(self):
    """Crawl notebookreview.com price pages: from the index, walk brand
    and model listings to collect 'User Opinions' pages and queue them;
    on an opinions page, parse the parent page and all reviews."""
    try:
        # self.currenturi = 'http://www.notebookreview.com/price/default.asp?productID=0&productFamilyID=592&display=opinionDetail'
        # self.currenturi = 'http://www.notebookreview.com/price/default.asp?productID=0&productFamilyID=448&display=opinionDetail'
        self.parenturi = self.currenturi
        self.genre = "Review"
        self.calling_page_no = 1
        if self.currenturi == 'http://www.notebookreview.com/price/':
            if not self._setSoup():
                return False
            task_uris = []
            try:
                brand_list = ['http://www.notebookreview.com/price/' + li_tag.find('a')['href']
                              for li_tag in self.soup.findAll('li')]
                for brand_uri in brand_list:
                    self.currenturi = brand_uri
                    if not self._setSoup():
                        continue
                    model_list = ['http://www.notebookreview.com/price/' + ahref.find('a')['href']
                                  for ahref in self.soup.findAll('li')]
                    for model_uri in model_list:
                        self.currenturi = model_uri
                        if not self._setSoup():
                            continue
                        ahref_tag = self.soup.find(text='User Opinions')
                        if ahref_tag:
                            task_uris.append('http://www.notebookreview.com/price/' + ahref_tag.parent['href'])
            except:
                log.exception(self.log_msg('exception in getting task uris'))
            log.info(self.log_msg('Total no of tasks found is %d' % len(task_uris)))
            for href in task_uris:
                temp_task = self.task.clone()
                temp_task.instance_data['uri'] = normalize(href)
                self.linksOut.append(temp_task)
            return True
        else:
            if not self._setSoup():
                return False
            self.parenturi = self.currenturi
            self.__getParentPage()
            while True:
                self.__addReviews()
                if not self._setNextPage():
                    break
            return True
    except:
        log.exception(self.log_msg('error in fetch'))
        return False
def __getParentPage(self):
    """It appends the information about the parent page, executed once at
    the beginning of the fetch method."""
    if checkSessionInfo(self.genre, self.session_info_out, self.parenturi,
                        self.task.instance_data.get('update')):
        return False
    page = {'title': ''}
    try:
        if not self.product_review:
            page['title'] = stripHtml(self.soup.find('h3').find('i').renderContents())
        else:
            title_str = stripHtml(self.soup.find('h2').renderContents()).strip()
            title_str = re.sub('Reviews$', '', title_str).strip()
            if title_str.endswith('-'):
                title_str = title_str[:-1]
            page['title'] = title_str
    except:
        log.exception(self.log_msg('could not parse page title'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parenturi,
                                   post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parenturi]
        page['parent_path'] = []
        # page['id'] = result['id']
        # page['first_version_id'] = result['first_version_id']
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info('Parent page added')
    except:
        log.exception(self.log_msg("Parent page could not be posted"))
        return False
def __getThreads(self):
    try:
        threads = [each.findParent('tr') for each in
                   self.soup.find('table', id='threadslist').findAll('td', id=re.compile('^td_threadtitle_.*$'))]
    except:
        log.exception(self.log_msg('No thread found, cannot proceed'))
        return False
    for thread in threads:
        if self.max_posts_count <= self.total_posts_count:
            log.info(self.log_msg('Reaching maximum post, Return false'))
            return False
        self.total_posts_count = self.total_posts_count + 1
        try:
            thread_title = stripHtml(thread.find('a', id=re.compile('^thread_title_.*$')).renderContents())
            thread_replies_soup = thread.find('td', title=re.compile('^Replies.*'))
            if not thread_replies_soup:
                log.debug(self.log_msg("Thread with title:%s has been Moved. Ignoring this thread" % thread_title))
                continue
            last_post_and_author = [each.strip() for each in
                                    stripHtml(thread_replies_soup.find('div').renderContents()).split('\n')
                                    if not each == '']
            post_date = last_post_and_author[0]
            if post_date.startswith('Today'):
                post_date = post_date.replace('Today', datetime.strftime(datetime.utcnow(), '%m-%d-%Y'))
            if post_date.startswith('Yesterday'):
                post_date = post_date.replace('Yesterday', datetime.strftime(datetime.utcnow() - timedelta(days=1), '%m-%d-%Y'))
            log.info(post_date)
            thread_time = datetime.strptime(post_date, '%m-%d-%Y %I:%M %p')
            if checkSessionInfo('Search', self.session_info_out, thread_time,
                                self.task.instance_data.get('update')) \
                    and self.max_posts_count >= self.total_posts_count:
                log.info(self.log_msg('Session info return True or Reaches max count'))
                continue
            self.last_timestamp = max(thread_time, self.last_timestamp)
            temp_task = self.task.clone()
            temp_task.instance_data['uri'] = normalize(thread.find('a', id=re.compile('^thread_title_.*$'))['href'])
            temp_task.pagedata['et_author_name'] = stripHtml(thread.find('a', id=re.compile('^thread_title_.*$')).findNext('div', 'smallfont').renderContents())
            temp_task.pagedata['title'] = thread_title
            thread_reply_and_views = thread.find('td', title=re.compile('^Replies.*'))['title'].split('Views:')
            log.info(self.log_msg('the thread title is:'))
            log.info(temp_task.pagedata['title'])
            try:
                temp_task.pagedata['ei_thread_replies_count'] = int(re.sub('[^\d]', '', thread_reply_and_views[0]).strip())
                temp_task.pagedata['ei_thread_views_count'] = int(re.sub('[^\d]', '', thread_reply_and_views[1]).strip())
                temp_task.pagedata['et_last_post_author_name'] = re.sub('by', '', last_post_and_author[1]).strip()
                temp_task.pagedata['edate_last_post_date'] = datetime.strftime(thread_time, "%Y-%m-%dT%H:%M:%SZ")
            except:
                log.info(self.log_msg('Post details may not be found'))
            try:
                rating_and_votes = thread.find('img', alt=re.compile('^Thread Rating:.*'))['alt']
                rating_match = re.search('(\d+)\s*votes,\s*(.*)\s*average', rating_and_votes)
                temp_task.pagedata['ei_thread_votes_count'] = int(rating_match.group(1).strip())
                temp_task.pagedata['ef_thread_rating'] = float(rating_match.group(2).strip())
            except:
                log.info(self.log_msg('thread votes and ratings may not be found'))
            self.linksOut.append(temp_task)
        except:
            log.exception(self.log_msg('Posted date not found'))
            continue
    return True
def api_save_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    story_content = fields.get('story_content', "")
    story_title = fields.get('story_title', "")
    story_author = fields.get('story_author', "")
    user_tags = fields.get('user_tags', "")
    story = None

    logging.user(request.user, "~FBFinding feed (api_save_new_story): %s" % story_url)
    original_feed = Feed.get_feed_from_url(story_url)
    if not story_content or not story_title:
        ti = TextImporter(feed=original_feed, story_url=story_url, request=request)
        original_story = ti.fetch(return_document=True)
        if original_story:
            story_url = original_story['url']
            if not story_content:
                story_content = original_story['content']
            if not story_title:
                story_title = original_story['title']

    try:
        story_db = {
            "user_id": user.pk,
            "starred_date": datetime.datetime.now(),
            "story_date": datetime.datetime.now(),
            "story_title": story_title or '[Untitled]',
            "story_permalink": story_url,
            "story_guid": story_url,
            "story_content": story_content,
            "story_author_name": story_author,
            "story_feed_id": original_feed and original_feed.pk or 0,
            "user_tags": [tag for tag in user_tags.split(',')]
        }
        story = MStarredStory.objects.create(**story_db)
        logging.user(request, "~FCStarring by ~SBIFTTT~SN: ~SB%s~SN in ~SB%s" %
                     (story_db['story_title'][:50], original_feed and original_feed))
        MStarredStoryCounts.count_for_user(user.pk)
    except OperationError:
        logging.user(request, "~FCAlready starred by ~SBIFTTT~SN: ~SB%s" %
                     (story_db['story_title'][:50]))

    return {"data": [{
        "id": story and story.id,
        "url": story and story.story_permalink
    }]}
def process_outline(self, outline, folders, in_folder=''):
    for item in outline:
        if (not hasattr(item, 'xmlUrl') and
                (hasattr(item, 'text') or hasattr(item, 'title'))):
            folder = item
            title = getattr(item, 'text', None) or getattr(item, 'title', None)
            # if hasattr(folder, 'text'):
            #     logging.info(' ---> [%s] ~FRNew Folder: %s' % (self.user, folder.text))
            obj = {title: []}
            folders = add_object_to_folder(obj, in_folder, folders)
            folders = self.process_outline(folder, folders, title)
        elif hasattr(item, 'xmlUrl'):
            feed = item
            if not hasattr(feed, 'htmlUrl'):
                setattr(feed, 'htmlUrl', None)
            # If feed title matches what's in the DB, don't override it on subscription.
            feed_title = getattr(feed, 'title', None) or getattr(feed, 'text', None)
            if not feed_title:
                setattr(feed, 'title', feed.htmlUrl or feed.xmlUrl)
                user_feed_title = None
            else:
                setattr(feed, 'title', feed_title)
                user_feed_title = feed.title
            feed_address = urlnorm.normalize(feed.xmlUrl)
            feed_link = urlnorm.normalize(feed.htmlUrl)
            if len(feed_address) > Feed._meta.get_field('feed_address').max_length:
                continue
            if feed_link and len(feed_link) > Feed._meta.get_field('feed_link').max_length:
                continue
            # logging.info(' ---> \t~FR%s - %s - %s' % (feed.title, feed_link, feed_address,))
            feed_data = dict(feed_address=feed_address, feed_link=feed_link, feed_title=feed.title)
            # feeds.append(feed_data)

            # See if it exists as a duplicate first
            duplicate_feed = DuplicateFeed.objects.filter(duplicate_address=feed_address)
            if duplicate_feed:
                feed_db = duplicate_feed[0].feed
            else:
                feed_data['active_subscribers'] = 1
                feed_data['num_subscribers'] = 1
                feed_db, _ = Feed.find_or_create(feed_address=feed_address,
                                                 feed_link=feed_link,
                                                 defaults=dict(**feed_data))
            if user_feed_title == feed_db.feed_title:
                user_feed_title = None
            us, _ = UserSubscription.objects.get_or_create(
                feed=feed_db,
                user=self.user,
                defaults={
                    'needs_unread_recalc': True,
                    'mark_read_date': datetime.datetime.utcnow() - datetime.timedelta(days=1),
                    'active': self.user.profile.is_premium,
                    'user_title': user_feed_title
                }
            )
            if self.user.profile.is_premium and not us.active:
                us.active = True
                us.save()
            if not us.needs_unread_recalc:
                us.needs_unread_recalc = True
                us.save()
            folders = add_object_to_folder(feed_db.pk, in_folder, folders)
    return folders
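# --- Input sketch (illustrative) ---
# An OPML fragment that exercises process_outline above. Parsing with the
# `opml` package (whose outline nodes expose text/title/xmlUrl/htmlUrl as
# attributes) is an assumption about the surrounding importer; the feed
# data and the `importer` name are made up.
import opml

outline = opml.from_string('''<?xml version="1.0"?>
<opml version="1.1">
  <body>
    <outline text="Tech">
      <outline text="Example" title="Example" type="rss"
               xmlUrl="http://example.com/rss" htmlUrl="http://example.com/"/>
    </outline>
  </body>
</opml>''')
# folders = importer.process_outline(outline, [], in_folder='')
# -> e.g. [{'Tech': [<feed pk>]}] once the feed is created and subscribed.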
def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    story_content = fields.get('story_content', "")
    story_title = fields.get('story_title', "")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)

    logging.user(request.user, "~FBFinding feed (api_share_new_story): %s" % story_url)
    original_feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)
    story_hash = MStory.guid_hash_unsaved(story_url)
    feed_id = (original_feed and original_feed.pk or 0)
    if not user.profile.is_premium and MSharedStory.feed_quota(user.pk, story_hash, feed_id=feed_id):
        return {"errors": [{
            'message': 'Only premium users can share multiple stories per day from the same site.'
        }]}

    quota = 3
    if MSharedStory.feed_quota(user.pk, story_hash, quota=quota):
        logging.user(request, "~BM~FRNOT ~FYSharing story from ~SB~FCIFTTT~FY, over quota: ~SB%s: %s" %
                     (story_url, comments))
        return {"errors": [{
            'message': 'You can only share %s stories per day.' % quota
        }]}

    if not story_content or not story_title:
        ti = TextImporter(feed=original_feed, story_url=story_url, request=request)
        original_story = ti.fetch(return_document=True)
        if original_story:
            story_url = original_story['url']
            if not story_content:
                story_content = original_story['content']
            if not story_title:
                story_title = original_story['title']

    if story_content:
        story_content = lxml.html.fromstring(story_content)
        story_content.make_links_absolute(story_url)
        story_content = lxml.html.tostring(story_content)

    shared_story = MSharedStory.objects.filter(
        user_id=user.pk,
        story_feed_id=original_feed and original_feed.pk or 0,
        story_guid=story_url).limit(1).first()
    if not shared_story:
        title_max = MSharedStory._fields['story_title'].max_length
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title and story_title[:title_max] or "[Untitled]",
            "story_feed_id": original_feed and original_feed.pk or 0,
            "story_content": story_content,
            "story_author_name": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        try:
            shared_story = MSharedStory.objects.create(**story_db)
            socialsubs = MSocialSubscription.objects.filter(subscription_user_id=user.pk)
            for socialsub in socialsubs:
                socialsub.needs_unread_recalc = True
                socialsub.save()
            logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" %
                         (story_url, comments))
        except NotUniqueError:
            logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" %
                         (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" %
                     (story_url, comments))

    try:
        socialsub = MSocialSubscription.objects.get(user_id=user.pk,
                                                    subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None

    if socialsub and shared_story:
        socialsub.mark_story_ids_as_read([shared_story.story_hash],
                                         shared_story.story_feed_id,
                                         request=request)
    elif shared_story:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)

    if shared_story:
        shared_story.publish_update_to_subscribers()

    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}
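# Standalone illustration of the lxml step above, which rewrites relative
# hrefs in the shared content against the story's URL. All three calls
# (fromstring, make_links_absolute, tostring) are the same ones the view uses;
# the HTML and URL are made up.
import lxml.html
doc = lxml.html.fromstring('<p><a href="/about">about</a></p>')
doc.make_links_absolute('http://example.com/posts/123')
print lxml.html.tostring(doc)
# -> <p><a href="http://example.com/about">about</a></p>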
def api_share_new_story(request):
    user = request.user
    body = request.body_json
    fields = body.get('actionFields')
    story_url = urlnorm.normalize(fields['story_url'])
    story_content = fields.get('story_content', "")
    story_title = fields.get('story_title', "")
    story_author = fields.get('story_author', "")
    comments = fields.get('comments', None)

    original_feed = Feed.get_feed_from_url(story_url, create=True, fetch=True)
    if not story_content or not story_title:
        ti = TextImporter(feed=original_feed, story_url=story_url, request=request)
        original_story = ti.fetch(return_document=True)
        if original_story:
            story_url = original_story['url']
            if not story_content:
                story_content = original_story['content']
            if not story_title:
                story_title = original_story['title']

    # Guard against empty content: lxml.html.fromstring() raises on "".
    if story_content:
        story_content = lxml.html.fromstring(story_content)
        story_content.make_links_absolute(story_url)
        story_content = lxml.html.tostring(story_content)

    shared_story = MSharedStory.objects.filter(
        user_id=user.pk,
        story_feed_id=original_feed and original_feed.pk or 0,
        story_guid=story_url).limit(1).first()
    if not shared_story:
        story_db = {
            "story_guid": story_url,
            "story_permalink": story_url,
            "story_title": story_title or "[Untitled]",
            "story_feed_id": original_feed and original_feed.pk or 0,
            "story_content": story_content,
            "story_author_name": story_author,
            "story_date": datetime.datetime.now(),
            "user_id": user.pk,
            "comments": comments,
            "has_comments": bool(comments),
        }
        shared_story = MSharedStory.objects.create(**story_db)
        socialsubs = MSocialSubscription.objects.filter(
            subscription_user_id=user.pk)
        for socialsub in socialsubs:
            socialsub.needs_unread_recalc = True
            socialsub.save()
        logging.user(request, "~BM~FYSharing story from ~SB~FCIFTTT~FY: ~SB%s: %s" %
                     (story_url, comments))
    else:
        logging.user(request, "~BM~FY~SBAlready~SN shared story from ~SB~FCIFTTT~FY: ~SB%s: %s" %
                     (story_url, comments))

    try:
        socialsub = MSocialSubscription.objects.get(
            user_id=user.pk,
            subscription_user_id=user.pk)
    except MSocialSubscription.DoesNotExist:
        socialsub = None

    if socialsub:
        socialsub.mark_story_ids_as_read([shared_story.story_hash],
                                         shared_story.story_feed_id,
                                         request=request)
    else:
        RUserStory.mark_read(user.pk, shared_story.story_feed_id, shared_story.story_hash)

    shared_story.publish_update_to_subscribers()

    return {"data": [{
        "id": shared_story and shared_story.story_guid,
        "url": shared_story and shared_story.blurblog_permalink()
    }]}
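# For reference, the response shapes the share endpoints above produce, taken
# from their return statements (values are illustrative; the "url" comes from
# shared_story.blurblog_permalink()):
#   success: {"data": [{"id": "http://example.com/posts/123",
#                       "url": "<blurblog permalink>"}]}
#   failure: {"errors": [{"message": "You can only share 3 stories per day."}]}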
@classmethod
def add_subscription(cls, user, feed_address, folder=None, bookmarklet=False):
    feed = None
    us = None

    logging.info(" ---> [%s] ~FRAdding URL: ~SB%s (in %s)" % (user, feed_address, folder))

    if feed_address:
        feed_address = urlnorm.normalize(feed_address)
        # See if it exists as a duplicate first
        duplicate_feed = DuplicateFeed.objects.filter(
            duplicate_address=feed_address).order_by('pk')
        if duplicate_feed:
            feed = [duplicate_feed[0].feed]
        else:
            feed = Feed.objects.filter(feed_address=feed_address).order_by('pk')

        if feed:
            feed = feed[0]
        else:
            try:
                feed = fetch_address_from_page(feed_address)
            except:
                code = -2
                message = "This feed has been added, but something went wrong"\
                          " when downloading it. Maybe the server's busy."
                # Return early so this error isn't masked by the
                # "no feed found" branch below.
                return code, message, us

    if not feed:
        code = -1
        if bookmarklet:
            message = "This site does not have an RSS feed. Nothing is linked to from this page."
        else:
            message = "This site does not point to an RSS feed or a website with an RSS feed."
    else:
        us, subscription_created = cls.objects.get_or_create(
            feed=feed,
            user=user,
            defaults={
                'needs_unread_recalc': True,
                'active': True,
            })
        code = 1
        message = ""

    if us and not subscription_created:
        code = -3
        message = "You are already subscribed to this site."
    elif us:
        user_sub_folders_object, created = UserSubscriptionFolders.objects.get_or_create(
            user=user,
            defaults={'folders': '[]'})
        if created:
            user_sub_folders = []
        else:
            user_sub_folders = json.decode(user_sub_folders_object.folders)
        user_sub_folders = add_object_to_folder(feed.pk, folder, user_sub_folders)
        user_sub_folders_object.folders = json.encode(user_sub_folders)
        user_sub_folders_object.save()

        feed.setup_feed_for_premium_subscribers()
        if feed.last_update < datetime.datetime.utcnow() - datetime.timedelta(days=1):
            feed.update()

    print code, message, us
    return code, message, us
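# Example call, assuming this classmethod lives on UserSubscription (as the
# cls.objects.get_or_create(feed=..., user=...) usage suggests) and `user` is
# a logged-in Django User. The URL and folder name are made up; the return
# triple is (code, message, user_subscription) as built above.
code, message, us = UserSubscription.add_subscription(
    user, 'http://example.com/feed.xml', folder='Tech')
if code < 0:
    print message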