Example #1
0
 def get_new(self):
     """Walk reddit's "new reddits" listing and store every entry found.

     Pages are fetched through ``self._get_json``, starting at the first
     page of the listing and following each page's ``after`` cursor until
     it is empty.  Every listed subreddit is looked up by id; unknown ones
     are logged, counted as new and created.  All records (new or already
     known) have their fields refreshed, and the session is committed
     after each record.

     Always returns None.  When a page comes back falsy, an error with the
     running totals is logged and the crawl stops early.
     """
     base_uri = 'http://www.reddit.com/reddits/new/.json'
     uri = base_uri
     scanned = 0
     found = 0
     while True:
         listing = self._get_json(uri)
         if not listing:
             self.logger.error(
                 'ERROR retrieving page %s. Spidering aborted.\n'
                 '%s reddits scanned.\n%s new reddits found.' %
                 (uri, scanned, found))
             return

         payload = listing['data']
         children = payload['children']
         for child in children:
             info = child['data']
             record = session.query(Subreddit).filter_by(
                 id=info['id']).first()
             if not record:
                 self.logger.info('new subreddit: %s' % info['url'])
                 found += 1
                 # NOTE(review): the fresh record is never explicitly added
                 # to the session here; presumably the model attaches
                 # itself on construction -- confirm.
                 record = Subreddit()
             record.name = info['name']
             record.created = unix_string(int(info['created']))
             # Drop the first three characters and the last one
             # (presumably turns '/r/name/' into 'name').
             record.url = info['url'][3:-1]
             record.title = info['title']
             record.over18 = info['over18']
             record.subscribers = info['subscribers']
             record.id = info['id']
             record.description = info['description']
             session.commit()

         scanned += len(children)
         cursor = payload['after']
         if not cursor:
             # No cursor means this was the last page of the listing.
             self.logger.info('Finished spidering.\n'
                              '%s reddits scanned.\n%s new reddits found.' %
                              (scanned, found))
             return
         uri = '%s?count=%s&after=%s' % (base_uri, scanned, cursor)
Example #2
0
 def get_new(self):
     """Crawl the "new reddits" listing page by page and persist its entries.

     Starting from the listing's first page, each page is retrieved with
     ``self._get_json`` and the ``after`` cursor of the page is used to
     request the next one; an empty cursor ends the crawl.  For each entry
     the matching ``Subreddit`` is queried by id and created when missing
     (which is logged and counted); its fields are then overwritten from
     the entry and the session is committed.

     Always returns None.  A falsy page aborts the crawl after logging an
     error that includes the totals gathered so far.
     """
     start_uri = 'http://www.reddit.com/reddits/new/.json'
     next_uri = start_uri
     total_scanned = 0
     total_new = 0
     while True:
         response = self._get_json(next_uri)
         if not response:
             self.logger.error(
                 'ERROR retrieving page %s. Spidering aborted.\n'
                 '%s reddits scanned.\n%s new reddits found.'
                 % (next_uri, total_scanned, total_new))
             return

         body = response['data']
         entries = body['children']
         for entry in entries:
             fields = entry['data']
             subreddit = session.query(Subreddit).filter_by(
                 id=fields['id']).first()
             if not subreddit:
                 self.logger.info('new subreddit: %s' % fields['url'])
                 total_new += 1
                 # NOTE(review): nothing here adds the new object to the
                 # session explicitly; presumably construction registers
                 # it -- confirm.
                 subreddit = Subreddit()
             subreddit.name = fields['name']
             subreddit.created = unix_string(int(fields['created']))
             # Strip three leading characters and one trailing character
             # (presumably '/r/name/' -> 'name').
             subreddit.url = fields['url'][3:-1]
             subreddit.title = fields['title']
             subreddit.over18 = fields['over18']
             subreddit.subscribers = fields['subscribers']
             subreddit.id = fields['id']
             subreddit.description = fields['description']
             session.commit()

         total_scanned += len(entries)
         after = body['after']
         if not after:
             # An empty cursor marks the final page.
             self.logger.info('Finished spidering.\n'
                              '%s reddits scanned.\n%s new reddits found.'
                              % (total_scanned, total_new))
             return
         next_uri = '%s?count=%s&after=%s' % (start_uri, total_scanned, after)
Example #3
0
 def _scan_posts(self, url, newest):
     """Scan one listing page of posts and record keyword mentions.

     The page is fetched with ``self.spider._get_json(url)``.  Posts are
     handled in listing order until one whose id compares ``<= newest`` is
     reached.  For each keyword that ``self._mentioned_keywords`` reports
     for a post's lower-cased title and selftext, a ``Mention`` is filled
     in.  The session is committed once, after the loop.

     Args:
         url: Address of the listing page to fetch.
         newest: Id of the most recent post already processed; scanning
             stops at the first post whose id is not greater than this.

     Returns:
         A ``(seen, after, next_newest)`` tuple.  ``seen`` is the number of
         posts from the stopping post (inclusive) to the end of the page,
         or 0 when no post triggered the stop.  ``after`` is the page's
         ``after`` value.  ``next_newest`` is the id of the first post on
         the page, or ``newest`` itself when the page has no posts.
     """
     listing = self.spider._get_json(url)['data']
     posts = listing['children']
     after = listing['after']
     # An empty page used to leave next_newest unassigned and blow up with
     # UnboundLocalError at the return; keep the caller's marker instead.
     next_newest = posts[0]['data']['id'] if posts else newest
     seen = 0
     for i, c in enumerate(posts):
         post = c['data']
         # NOTE(review): ids are compared with a plain <=; if they are
         # strings of differing length this is lexicographic -- confirm.
         if post['id'] <= newest:
             # Counts this post and everything after it on the page.
             seen = len(posts) - i
             break
         title = post['title'].lower()
         selftext = post['selftext'].lower()
         for k in self._mentioned_keywords(title, text2=selftext):
             # NOTE(review): the Mention is never explicitly added to the
             # session; presumably construction registers it -- confirm.
             mention = Mention()
             mention.keyword_uid = k.uid
             mention.thread_id = post['id']
             mention.author = post['author']
             mention.subreddit = post['subreddit']
             mention.created = unix_string(int(post['created_utc']))
     session.commit()
     return (seen, after, next_newest)
Example #4
0
 def _scan_comments(self, url, newest):
     """Scan one listing page of comments and record keyword mentions.

     The page is fetched with ``self.spider._get_json(url)``.  Comments are
     handled in listing order until one whose id compares ``<= newest`` is
     reached.  For each keyword that ``self._mentioned_keywords`` reports
     for a comment's lower-cased body, a ``Mention`` is filled in.  The
     session is committed once, after the loop.

     Args:
         url: Address of the listing page to fetch.
         newest: Id of the most recent comment already processed; scanning
             stops at the first comment whose id is not greater than this.

     Returns:
         A ``(seen, after, next_newest)`` tuple.  ``seen`` is the number of
         comments that follow the stopping comment on the page, or 0 when
         no comment triggered the stop.  ``after`` is the page's ``after``
         value.  ``next_newest`` is the id of the first comment on the
         page, or ``newest`` itself when the page has no comments.
     """
     listing = self.spider._get_json(url)['data']
     comments = listing['children']
     after = listing['after']
     # An empty page used to leave next_newest unassigned and blow up with
     # UnboundLocalError at the return; keep the caller's marker instead.
     next_newest = comments[0]['data']['id'] if comments else newest
     seen = 0
     for i, c in enumerate(comments):
         comment = c['data']
         # NOTE(review): ids are compared with a plain <=; if they are
         # strings of differing length this is lexicographic -- confirm.
         if comment['id'] <= newest:
             # Counts only the comments after this one on the page.
             seen = len(comments) - i - 1
             break
         body = comment['body'].lower()
         for k in self._mentioned_keywords(body):
             # NOTE(review): the Mention is never explicitly added to the
             # session; presumably construction registers it -- confirm.
             mention = Mention()
             mention.keyword_uid = k.uid
             # Drop the first three characters of link_id (presumably a
             # type prefix such as 't3_').
             mention.thread_id = comment['link_id'][3:]
             mention.comment_id = comment['id']
             mention.author = comment['author']
             mention.subreddit = comment['subreddit']
             mention.created = unix_string(int(comment['created_utc']))
     session.commit()
     return (seen, after, next_newest)
Example #5
0
 def _scan_posts(self, url, newest):
     """Process a single page of posts, creating a Mention per keyword hit.

     ``self.spider._get_json(url)`` supplies the page.  Posts are walked in
     order and the walk ends at the first post whose id compares
     ``<= newest``.  The lower-cased title and selftext of every earlier
     post are passed to ``self._mentioned_keywords``; each keyword it
     yields produces one populated ``Mention``.  A single
     ``session.commit()`` runs after the walk.

     Args:
         url: Address of the listing page to fetch.
         newest: Id of the most recent post already processed.

     Returns:
         ``(seen, after, next_newest)`` where ``seen`` counts the posts
         from the stopping post (inclusive) to the end of the page (0 if
         the walk never stopped early), ``after`` is the page's ``after``
         value, and ``next_newest`` is the first post's id, falling back
         to ``newest`` when the page contains no posts.
     """
     listing = self.spider._get_json(url)['data']
     posts = listing['children']
     after = listing['after']
     # Previously next_newest was only bound inside the loop, so a page
     # without posts raised UnboundLocalError at the return statement.
     next_newest = posts[0]['data']['id'] if posts else newest
     seen = 0
     for i, c in enumerate(posts):
         post = c['data']
         # NOTE(review): plain <= on ids; lexicographic if these are
         # strings of differing length -- confirm.
         if post['id'] <= newest:
             # This post and all later ones on the page were already seen.
             seen = len(posts) - i
             break
         title = post['title'].lower()
         selftext = post['selftext'].lower()
         for k in self._mentioned_keywords(title, text2=selftext):
             # NOTE(review): no explicit session add for the Mention;
             # presumably construction registers it -- confirm.
             mention = Mention()
             mention.keyword_uid = k.uid
             mention.thread_id = post['id']
             mention.author = post['author']
             mention.subreddit = post['subreddit']
             mention.created = unix_string(int(post['created_utc']))
     session.commit()
     return (seen, after, next_newest)
Example #6
0
 def _scan_comments(self, url, newest):
     """Process a single page of comments, creating a Mention per keyword hit.

     ``self.spider._get_json(url)`` supplies the page.  Comments are walked
     in order and the walk ends at the first comment whose id compares
     ``<= newest``.  The lower-cased body of every earlier comment is
     passed to ``self._mentioned_keywords``; each keyword it yields
     produces one populated ``Mention``.  A single ``session.commit()``
     runs after the walk.

     Args:
         url: Address of the listing page to fetch.
         newest: Id of the most recent comment already processed.

     Returns:
         ``(seen, after, next_newest)`` where ``seen`` counts the comments
         after the stopping comment on the page (0 if the walk never
         stopped early), ``after`` is the page's ``after`` value, and
         ``next_newest`` is the first comment's id, falling back to
         ``newest`` when the page contains no comments.
     """
     listing = self.spider._get_json(url)['data']
     comments = listing['children']
     after = listing['after']
     # Previously next_newest was only bound inside the loop, so a page
     # without comments raised UnboundLocalError at the return statement.
     next_newest = comments[0]['data']['id'] if comments else newest
     seen = 0
     for i, c in enumerate(comments):
         comment = c['data']
         # NOTE(review): plain <= on ids; lexicographic if these are
         # strings of differing length -- confirm.
         if comment['id'] <= newest:
             # Only the comments after this one are counted.
             seen = len(comments) - i - 1
             break
         body = comment['body'].lower()
         for k in self._mentioned_keywords(body):
             # NOTE(review): no explicit session add for the Mention;
             # presumably construction registers it -- confirm.
             mention = Mention()
             mention.keyword_uid = k.uid
             # link_id minus its first three characters (presumably a
             # type prefix such as 't3_').
             mention.thread_id = comment['link_id'][3:]
             mention.comment_id = comment['id']
             mention.author = comment['author']
             mention.subreddit = comment['subreddit']
             mention.created = unix_string(int(comment['created_utc']))
     session.commit()
     return (seen, after, next_newest)