def _crawl_by_question(self, question, depth): # Stopping crawling when depth exceeds maxdepth if depth > self.maxdepth: return # Not crawling the question that was already crawled if question in self.crawled_questions or question in self.bad_questions: return print 'crawling question: %s' % question question_stats = Quora.get_question_stats(question) # If something went awry crawling particular question if question_stats == {}: self.bad_questions.add(question) return latest_answers = Quora.get_latest_answers(question) question_stats['latest_answers'] = latest_answers print 'question_stats:\n', question_stats print 'latest_answers:\n', latest_answers print 'related_questions: \n', question_stats['related_questions'] print '---------------------------------------------------' self.crawled_questions[question] = question_stats # Inserting into database as we go... self.db.questions.insert({question: question_stats}) for related_question in question_stats['related_questions']: # Only considering complete questions (i.e. not ending in ...) if not related_question.endswith('...'): self._crawl_by_question(_sanitize_question(related_question), depth + 1)
def crawl_questions_and_answers(self): ## This is for downloading - uncomment if you want to download ## # questions_data = list(self.db.questions.find()) # for document in questions_data: # question = _get_question(document) # print question # question_author, answers_authors = Quora.get_authors_of_questions_and_answers(question) # question_author = _sanitize_username(question_author) # answers_authors = [_sanitize_username(author) for author in answers_authors] # stats = {'question_author' : question_author, 'answers_authors': answers_authors} # print 'question_author:', question_author # print 'answers_authors:', answers_authors # # Inserting into database: # self.db.answers.insert({question: stats}) ## This is purely for updating ## answers_data = list(self.db.answers.find()) for document in answers_data: question = _get_question(document) if document[question]['question_author'] == '': print question print document['_id'] question_author, answers_authors = Quora.get_authors_of_questions_and_answers( question) question_author = _sanitize_username(question_author) answers_authors = [ _sanitize_username(author) for author in answers_authors ] print 'question_author:', question_author print 'answers_authors:', answers_authors stats = { 'question_author': question_author, 'answers_authors': answers_authors } self.db.answers.update({'_id': document['_id']}, {"$set": { question: stats }}, upsert=False) else: question_author = document[question]['question_author'] answers_authors = document[question]['answers_authors'] question_author = _sanitize_username(question_author) answers_authors = [ _sanitize_username(author) for author in answers_authors ] stats = { 'question_author': question_author, 'answers_authors': answers_authors } self.db.answers.update({'_id': document['_id']}, {"$set": { question: stats }}, upsert=False)