def run(self):
    """Schedule an endless archive loop for every configured board.

    Reads ``self.board_info`` (an iterable of ``(alias, name, refresh)``
    tuples) and ``self.base_url``, creates ``self.pool``, makes sure a
    ``Board`` row exists for each configured board, and queues one polling
    task per board.  ``refresh`` is multiplied by 60 before being handed to
    ``throttle_thread``.

    This method never returns: once every board has been queued, the
    calling thread idles forever so the daemon keeps running.
    """
    # Initialize threadpool
    # we need at least one thread for each board (each board task below
    # loops forever)
    num_workers = len(self.board_info) * WORKERS_PER_BOARD + 1
    self.pool = ThreadPool(num_workers)

    def archive_board_task(board, refresh):
        # Archive the board, wait, repeat -- forever.
        while True:
            self.archive_board(board)
            throttle_thread(refresh * 60)

    for alias, name, refresh in self.board_info:
        board_url = "%s/%s" % (self.base_url, alias)

        # create board if not in db; name/alias are only applied when a new
        # row is created, an existing row is returned untouched
        board, _created = Board.objects.get_or_create(
            url=board_url, defaults={"name": name, "alias": alias})

        self.pool.add_task(archive_board_task, board, refresh)
        # we want boards to start at different times to spread out load
        throttle_thread(BOARD_STAGGER_TIME)

    # hang thread, so daemon keeps running
    while True:
        throttle_thread(10)
class Archiver(Daemon):
    """A daemon that scrapes and saves Boards."""

    def __init__(self, board_info=settings.GFAQS_BOARDS,
                 base=settings.GFAQS_BOARD_URL,
                 pidfile=settings.GFAQS_ARCHIVER_PID_FILE,
                 gfaqs_client=None):
        """Store the archiver configuration.

        board_info: iterable of (alias, name, refresh) tuples, one per board
        base: base url; a board's url is "<base>/<alias>"
        pidfile: passed through to the Daemon base class
        gfaqs_client: client object handed to the scrapers (may be None)
        """
        super(Archiver, self).__init__(pidfile)
        self.board_info = board_info
        self.base_url = base
        self.gfaqs_client = gfaqs_client

    def run(self):
        """Schedule an endless archive loop for every configured board.

        Creates ``self.pool``, makes sure a ``Board`` row exists for each
        entry of ``self.board_info`` and queues one polling task per board.
        Never returns: afterwards the calling thread idles forever so the
        daemon keeps running.
        """
        # NOTE: self.gfaqs_client is used exactly as it was passed to
        # __init__; nothing is constructed here.  (A disabled snippet that
        # built AuthenticatedGFAQSClient / GFAQSClient from
        # settings.GFAQS_LOGIN_AS_USER used to live in a second, shadowed
        # definition of run(); it was dead code and has been dropped.)

        # Initialize threadpool
        # we need at least one thread for each board (each board task below
        # loops forever)
        num_workers = len(self.board_info) * WORKERS_PER_BOARD + 1
        self.pool = ThreadPool(num_workers)

        def archive_board_task(board, refresh):
            # Archive the board, wait refresh*60, repeat -- forever.
            while True:
                self.archive_board(board)
                throttle_thread(refresh * 60)

        for alias, name, refresh in self.board_info:
            board_url = "%s/%s" % (self.base_url, alias)

            # create board if not in db; name/alias are only applied when a
            # new row is created, an existing row is returned untouched
            board, _created = Board.objects.get_or_create(
                url=board_url, defaults={"name": name, "alias": alias})

            self.pool.add_task(archive_board_task, board, refresh)
            # we want boards to start at different times to spread out load
            throttle_thread(BOARD_STAGGER_TIME)

        # hang thread, so daemon keeps running
        while True:
            throttle_thread(10)

    @log_on_error
    def archive_board(self, b, recursive=True):
        """Scrape the topics of a board and save them to the db.

        b: the models.Board to archive
        recursive: also queue archive_topic for every topic that is saved
                   (uses self.pool, which is created by run())
        """
        bs = BoardScraper(b, self.gfaqs_client)
        logger.info("Archiving Board (%s) started" % b.alias)

        topics_examined, topics_saved = 0, 0
        for t in bs.retrieve():
            topics_examined += 1
            if t.status in Topic.ARCHIVED_STATUSES:
                # we reached archived topics; don't continue
                break

            try:
                t_db = Topic.objects.get(gfaqs_id=t.gfaqs_id)
            except ObjectDoesNotExist:
                # unseen topic: clear the pk so save() inserts a new row
                t.pk = None
            else:
                # known topic: reuse its pk so save() updates the same row
                t.pk = t_db.pk
                if t_db.number_of_posts == t.number_of_posts:
                    if t.status in Topic.STICKY_STATUSES:
                        # unchanged sticky: skip it but keep scanning
                        # (presumably stickies are listed regardless of
                        # activity -- confirm against BoardScraper)
                        continue
                    # this is the first topic that hasn't been updated
                    # since last archive run, so we stop
                    break

            with transaction.atomic():
                t.creator = self.add_user(t.creator)
                t.save()
            topics_saved += 1
            logger.debug("Saved topic %s" % t)

            if recursive:
                self.pool.add_task(self.archive_topic, t)

        # NOTE(review): throttling once per board pass, after the loop --
        # confirm this is the intended pacing (vs. once per topic).
        throttle_thread()
        logger.info(
            "Archiving Board (%s) finished; %s topics examined, %s new"
            % (b.alias, topics_examined, topics_saved))

    @log_on_error
    def archive_topic(self, t):
        """Scrape the given topic and save the posts not yet in the db.

        t: the models.Topic whose posts should be archived
        """
        ts = TopicScraper(t, self.gfaqs_client)
        logger.info("Archiving Topic (%s) started" % t.gfaqs_id)

        posts_examined, posts_saved = 0, 0
        posts = list(ts.retrieve())
        # Walk the scraped posts in reverse order and stop at the first one
        # that is already stored.
        for p in reversed(posts):
            posts_examined += 1
            # Check if post exists already in db to determine update or add
            with transaction.atomic():
                try:
                    Post.objects.filter(topic=t).get(post_num=p.post_num)
                except ObjectDoesNotExist:
                    p.creator = self.add_user(p.creator)
                    p.save()
                    posts_saved += 1
                    # log the post that was stored (this used to log the
                    # topic, repeating the same value for every post)
                    logger.debug("Added Post %s" % p)
                else:
                    # we already have the rest of the posts in the db
                    break

        # NOTE(review): throttling once per topic, after the loop -- confirm
        # this is the intended pacing.
        throttle_thread()

        # update poll results if applicable: the stored copy of the first
        # scraped post gets its contents overwritten with the fresh scrape
        if posts and t.status in Topic.POLL_STATUSES:
            p = posts[0]
            p_db = Post.objects.filter(topic=t).get(post_num=p.post_num)
            p_db.contents = p.contents
            p_db.save()
            logger.debug("Updated Post [%s] for poll" % p.topic)

        logger.debug(
            "Archiving Topic (%s) finished; %s posts examined, %s new"
            % (t.gfaqs_id, posts_examined, posts_saved))

    def add_user(self, user):
        """Check if user exists already in db, if not add it.

        Returns ``user`` unchanged when it already has an id.  Otherwise,
        while holding ``user_mutex``, returns the stored User with the same
        username, or saves ``user`` and returns it when there is none.
        """
        if user.id:
            return user

        # Lookup and insert happen under one lock so that two threads seeing
        # the same new username do not both save it.
        user_mutex.acquire()
        try:
            return User.objects.get(username=user.username)
        except ObjectDoesNotExist:
            user.save()
            logger.debug("User added (%s)" % user.username)
            return user
        finally:
            user_mutex.release()