def run(self):
    """Crawl every page of this topic, saving each item, until a short page
    signals there is no more data.

    Pages are fetched via ``self._run(offset)`` in steps of ``PAGE_SIZE``;
    iteration stops as soon as a page comes back with fewer than
    ``PAGE_SIZE`` items.
    """
    PAGE_SIZE = 20  # items per page returned by self._run
    for pos in forever(0):
        offset = pos * PAGE_SIZE
        # lazy %-args: the message is only formatted if the record is emitted
        logger.info('now %s-%s', self.topic_id, offset)
        data = self._run(offset)
        for item in data:
            self.save(item)
        # a short page means we just fetched the last one
        if len(data) != PAGE_SIZE:
            break
def main():
    """Consume question ids from the MongoDB todo collection forever.

    On each pass: take one todo document, crawl its question, then remove
    the document.  The delete happens only AFTER a successful crawl, so a
    crash mid-crawl does not lose the task.  When the queue is empty the
    loop sleeps ``SLEEP_TIME`` seconds and retries.
    """
    question_crawler = QuestionCrawler()
    for times in forever(1):
        # lazy %-args throughout: formatting deferred until emission
        logger.info('now times : %s', times)
        todo = MONGO[DB][QUESTION_TODO_COLL].find_one()
        if not todo:
            logger.info('no more task, sleeping')
            sleep(SLEEP_TIME)
            continue
        logger.info('crawling question %s', todo['_id'])
        question_crawler.run(todo['_id'])
        # delete only after run() returned, so the task survives failures
        MONGO[DB][QUESTION_TODO_COLL].delete_one({'_id': todo['_id']})
def run(self, question_id):
    """Crawl one question page and all of its answer pages.

    Parses the question itself, then pages through answers 20 at a time
    via ``self._run``, extending ``question['answers']`` and persisting
    the document after EVERY batch so partial progress survives an
    interruption.  Stops when the answer endpoint reports no more pages.
    """
    html = self.get(QUESTION_URL.format(id=question_id))
    # NOTE(review): no explicit parser is passed, so bs4 picks the "best"
    # installed one, which can differ between environments -- consider
    # pinning e.g. 'html.parser' or 'lxml' for reproducible parsing.
    soup = BeautifulSoup(html)
    question = QuestionParser(self, soup).parse()
    question['_id'] = question_id
    for index in forever():
        # each page of answers is 20 items wide
        ids, has_more = self._run(question_id, index * 20)
        question['answers'].extend(ids)
        # incremental save: don't lose collected ids if the crawl dies
        QuestionParser.save(question)
        # lazy %-args: formatting deferred until the record is emitted
        logger.info('update question %s-%s', index, question_id)
        if not has_more:
            break