Example #1
    def run(self):
        # Page through results 20 items at a time until a short page signals the end.
        for pos in forever(0):
            offset = pos * 20
            logger.info('now %s-%s' % (self.topic_id, offset))
            data = self._run(offset)
            for item in data:
                self.save(item)
            if len(data) != 20:
                # A partial page means there is nothing left to fetch.
                break
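All of these snippets loop with a forever() helper that is not shown on this page. Judging from the calls forever(0), forever(1) and forever(), it appears to act as an unbounded counter, much like itertools.count. A minimal sketch under that assumption (the project's actual helper may add sleeping, logging or retry behaviour):

from itertools import count

def forever(start=0):
    # Assumed reconstruction: yield start, start + 1, ... without end,
    # leaving it to the caller to break out of the loop.
    yield from count(start)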
Example #3
def main():
    question_crawler = QuestionCrawler()
    # question_crawler.run('39833760')
    for times in forever(1):
        logger.info('now times : %s' % times)
        # Pull the next pending question from the todo collection.
        todo = MONGO[DB][QUESTION_TODO_COLL].find_one()
        if not todo:
            logger.info('no more task, sleeping')
            sleep(SLEEP_TIME)
            continue
        logger.info('crawling question %s' % todo['_id'])
        question_crawler.run(todo['_id'])
        # Only remove the task once the crawl has finished.
        MONGO[DB][QUESTION_TODO_COLL].delete_one({'_id': todo['_id']})
Example #4
    def run(self, question_id):
        html = self.get(QUESTION_URL.format(id=question_id))
        # Pass an explicit parser so BeautifulSoup does not fall back with a warning.
        soup = BeautifulSoup(html, 'html.parser')
        question = QuestionParser(self, soup).parse()
        question['_id'] = question_id

        # Page through the answers 20 at a time until has_more comes back False.
        for index in forever():
            ids, has_more = self._run(question_id, index * 20)
            question['answers'].extend(ids)
            QuestionParser.save(question)
            logger.info('update question %s-%s' % (index, question_id))
            if not has_more:
                break