Example #1
0
	def update_submission_content(self, topics=None, sub_per_topic=10):
		if topics is None:
			print "No topics specified. Updating all subreddits."
		else:
			try:
				topicfile = "../data/" + topics
				topics = open(topicfile).read().splitlines()
			except IOError:
				print "Cannot find specified topics file."
				return

		while True:
			# Go through each subreddit in db or specified by topic list 
			if topics:
				subreddits = topics
			else:
				subreddits = self.db.get_subreddits()
			for subreddit in subreddits:
				if topics:
					subreddit = self.db.subreddit_exists(subreddit)
					if not subreddit:
						continue

				topic = subreddit.get("subreddit_name")
				print "========", topic, "========"

				num_content = 0
				# Only get the submissions whose (non-empty) urls haven't been scraped
				for submission in self.db.empty_submissions(topic):
					link = submission.get("url")
					
					try:
						link = str(link)
					except UnicodeEncodeError:
						continue

					html = links.scrape_link(str(link), topic)
					if html:
						print "Adding content from:", link
						html = str(html)
						num_content += 1
					
					self.db.add_link_content(submission.get("_id"), html)

					if num_content >= sub_per_topic:
						break

				print "Crawled", num_content, "links for", topic, datetime.datetime.today()
Example #2
0
    def add_submission(self, submission, subreddit_name, follow_link=False):
        subreddit = self.subreddit_exists(subreddit_name)
        if subreddit:
            subreddit_id = subreddit.get("_id")
            print "found subreddit:", subreddit_id

            # If the comment has an author (could have been deleted),
            # take care of updating or adding user information
            if submission.author:
                print submission.author
                # If the submission's author already exists, update
                # the author's information.
                auth = self.user_exists(submission.author.id)
                if auth:
                    auth_id = auth.get("_id")
                    print self.update_user(auth_id, submission.subreddit, "submissions")
                # Otherwise create a new user object
                else:
                    self.add_user(submission.author)
                author = submission.author.fullname
            else:
                print "No author"
                author = None

            # If the submission already exists, update comments
            sub = self.submission_exists(submission)
            if sub:
                print "Submission exists-updating"
                submission_id = sub.get("_id")
                self.update_comments(submission, submission_id, subreddit_id)

            # Otherwise create submission object and add all the current comments
            else:
                print "Adding new submission"
                document = {"subreddit_id": subreddit_id,
                            "submission_title": submission.title,
                            "submission_text": submission.selftext,
                            "karma": submission.ups,
                            "downvotes": submission.downs,
                            "num_comments": submission.num_comments,
                            "flair": submission.link_flair_text,
                            "url": submission.url,
                            "praw_id": submission.id,
                            "praw_fullid": submission.fullname,
                            "created": submission.created,
                            "author": author
                }

                # First add the submission to the db to make sure we have it
                submission_id = self.submission_collection.insert(document)
                self.subreddit_collection.update({"_id": subreddit_id},
                                                 {"$set": {"last_update": int(datetime.today().strftime("%s"))}})

                # Next add the comments, if they're available
                try:
                    comments = layer_comments(submission.comments)
                    for layer, comment_list in comments.iteritems():
                        for comment in comment_list:
                            self.add_comment(comment, layer, submission_id, subreddit_id)
                # Sometimes submissions don't have comment attribute
                except AttributeError:
                    pass

                # Now follow link if this was specified and if
                # submission contains a link to follow
                if follow_link and submission.url:
                    content = links.scrape_link(submission.url, subreddit_name)
                    self.add_link_content(submission_id, str(content))

            return submission_id

        else:
            raise errors.MissingError("Subreddit %s does not exist in the database." % subreddit_name)