def process_question(question):
    """Store one Discourse topic as a ``Questions`` row.

    ``question`` is indexed like a dict and must provide ``posters``,
    ``reply_count``, ``id``, ``views``, ``last_posted_at``, ``created_at``,
    ``title``, ``slug`` and ``like_count``; ``excerpt`` is optional.

    ``self`` and ``category`` are free names here, not parameters
    (presumably this is a closure inside a method -- TODO confirm).
    """
    dbquestion = Questions()
    dbquestion.author_identifier = question['posters'][0]['user_id']

    # Fields copied without conversion: (Questions attribute, Discourse key)
    straight_copies = (
        ('answer_count', 'reply_count'),
        ('question_identifier', 'id'),
        ('view_count', 'views'),
        ('title', 'title'),
        ('url', 'slug'),
        ('added_at', 'created_at'),
        ('score', 'like_count'),
    )
    for attribute, key in straight_copies:
        setattr(dbquestion, attribute, question[key])

    # A topic whose last_posted_at is None falls back to its creation time.
    last_posted = question['last_posted_at']
    dbquestion.last_activity_at = (
        question['created_at'] if last_posted is None else last_posted)

    # last_activity_by is left unset; mapping it from 'last_poster_username'
    # is currently disabled.
    dbquestion.body = question['excerpt'] if 'excerpt' in question else None

    # Additional fields in Discourse: liked, pinned_globally, visible,
    # highest_post_number, unseen, posts_count, bumped_at, bookmarked,
    # archived, archetype, has_summary, pinned, image_url, closed, unpinned,
    # bumped, fancy_title

    if self.question_new_or_changed(dbquestion):
        # Question is new or changed: persist it, then fetch what hangs off it
        self.session.add(dbquestion)
        self.session.commit()
        self.process_answers(question['slug'])
        self.process_dbquestiontags(dbquestion.question_identifier, category)
        # Binds a local name only; nothing in this function reads it back.
        update_users = False
    # NOTE(review): counted for every topic, changed or not -- confirm that
    # this increment is meant to sit outside the branch above.
    self.total_questions += 1
def process_questions(self, tag):
    """Download the questions tagged ``tag`` and store new or changed ones.

    Pages through ``/2.2/questions`` (sorted by activity, descending) until
    the response reports ``has_more`` as false. Every item is turned into a
    ``Questions`` row; rows accepted by ``self.question_new_or_changed`` are
    committed and tagged through ``self.process_dbquestiontags``. After each
    page, the ids of the stored questions are handed to
    ``self.process_answers`` and ``self.process_comments``.

    When ``self.debug`` is set, ``StackSampleData.questions`` is parsed
    instead of a downloaded page and exactly one page is processed.

    :param tag: tag name, appended verbatim to the ``tagged=`` query
        parameter.
    """
    def to_db_time(epoch):
        # The API values are passed through int() and treated as epoch
        # seconds; rows store them as 'YYYY-MM-DD HH:MM:SS' strings.
        moment = datetime.datetime.fromtimestamp(int(epoch))
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    logging.debug("Processing questions for %s", tag)

    base_url = (self.url + '/2.2/questions?'
                + 'order=desc&sort=activity&site=stackoverflow&key='
                + self.api_key + '&'
                + 'tagged=' + tag)

    # get total number of questions
    # Hack: the total is decoded with plain json instead of JSONParser.
    total_reply = self._get_api_data(base_url + '&' + 'pagesize=1&filter=total')
    total = json.loads(total_reply)['total']
    logging.info('Total number of questions to download: %s', total)

    page = 1
    done = 0
    has_more = True
    while has_more:
        questions_ids = []  # used to get answers and comments

        if self.debug:
            raw = StackSampleData.questions
        else:
            raw = self._get_api_data(
                base_url + '&' + 'pagesize=' + str(self.pagesize)
                + '&' + 'page=' + str(page))
        page += 1

        parser = JSONParser(unicode(raw))
        parser.parse()
        # Envelope keys: has_more, items, quota_max, quota_remaining
        items = parser.data['items']
        has_more = parser.data['has_more']
        if self.debug:
            # The sample data is a single page; never loop on it.
            has_more = False

        for question in items:
            # Item keys used or known: is_answered, view_count, tags,
            # last_activity_date, answer_count, creation_date, score, link,
            # accepted_answer_id, owner, title, question_id
            dbquestion = Questions()

            # The Stack Exchange API documents ``owner`` as a field that may
            # be absent; treat a missing owner like one without a user_id
            # instead of aborting the whole download with a KeyError.
            owner = question.get('owner') or {}
            if 'user_id' in owner:
                dbquestion.author_identifier = owner['user_id']

            dbquestion.answer_count = question['answer_count']
            dbquestion.question_identifier = question['question_id']
            dbquestion.view_count = question['view_count']

            # Fall back to the creation date when there is no activity date.
            last_activity = question['last_activity_date']
            if last_activity is None:
                last_activity = question['creation_date']
            dbquestion.last_activity_at = to_db_time(last_activity)

            dbquestion.title = question['title']
            dbquestion.url = question['link']
            dbquestion.added_at = to_db_time(question['creation_date'])
            dbquestion.score = question['score']
            # Missing fields in Stack
            dbquestion.last_activity_by = None
            dbquestion.body = None  # TODO: we need to get it
            # Additional fields in Stack: is_answered, accepted_answer_id

            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_dbquestiontags(dbquestion.question_identifier,
                                            question['tags'])
                questions_ids.append(question['question_id'])
                author = dbquestion.author_identifier
                if author and author not in self.user_ids_questions:
                    self.user_ids_questions.append(author)

            # NOTE(review): counted for every item, changed or not -- confirm
            # that this increment is meant to sit outside the branch above.
            self.total_questions += 1
            done += 1
            if self.total_questions % 10 == 0:
                logging.info("Done: %s/%s", done, total)

        logging.info("Done: %s/%s", done, total)

        ids = ";".join(str(question_id) for question_id in questions_ids)
        if ids:
            # Get all answers for the pagesize questions updated
            self.process_answers(ids)
            # Get all comments for the pagesize questions updated
            self.process_comments(ids)
def process_questions(self, tag):
    """Download the questions tagged ``tag`` and store new or changed ones.

    Pages through ``/2.2/questions`` (sorted by activity, descending) until
    the response reports ``has_more`` as false. Every item is turned into a
    ``Questions`` row; rows accepted by ``self.question_new_or_changed`` are
    committed and tagged through ``self.process_dbquestiontags``. After each
    page, the ids of the stored questions are handed to
    ``self.process_answers`` and ``self.process_comments``.

    When ``self.debug`` is set, ``StackSampleData.questions`` is parsed
    instead of a downloaded page and exactly one page is processed.

    :param tag: tag name, appended verbatim to the ``tagged=`` query
        parameter.
    """
    def to_db_time(epoch):
        # The API values are passed through int() and treated as epoch
        # seconds; rows store them as 'YYYY-MM-DD HH:MM:SS' strings.
        moment = datetime.datetime.fromtimestamp(int(epoch))
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    logging.debug("Processing questions for %s", tag)

    base_url = (self.url + '/2.2/questions?'
                + 'order=desc&sort=activity&site=stackoverflow&key='
                + self.api_key + '&'
                + 'tagged=' + tag)

    # get total number of questions
    # Hack: the total is decoded with plain json instead of JSONParser.
    total_reply = self._get_api_data(base_url + '&' + 'pagesize=1&filter=total')
    total = json.loads(total_reply)['total']
    logging.info('Total number of questions to download: %s', total)

    page = 1
    done = 0
    has_more = True
    while has_more:
        questions_ids = []  # used to get answers and comments

        if self.debug:
            raw = StackSampleData.questions
        else:
            raw = self._get_api_data(
                base_url + '&' + 'pagesize=' + str(self.pagesize)
                + '&' + 'page=' + str(page))
        page += 1

        parser = JSONParser(unicode(raw))
        parser.parse()
        # Envelope keys: has_more, items, quota_max, quota_remaining
        items = parser.data['items']
        has_more = parser.data['has_more']
        if self.debug:
            # The sample data is a single page; never loop on it.
            has_more = False

        for question in items:
            # Item keys used or known: is_answered, view_count, tags,
            # last_activity_date, answer_count, creation_date, score, link,
            # accepted_answer_id, owner, title, question_id
            dbquestion = Questions()

            # The Stack Exchange API documents ``owner`` as a field that may
            # be absent; treat a missing owner like one without a user_id
            # instead of aborting the whole download with a KeyError.
            owner = question.get('owner') or {}
            if 'user_id' in owner:
                dbquestion.author_identifier = owner['user_id']

            dbquestion.answer_count = question['answer_count']
            dbquestion.question_identifier = question['question_id']
            dbquestion.view_count = question['view_count']

            # Fall back to the creation date when there is no activity date.
            last_activity = question['last_activity_date']
            if last_activity is None:
                last_activity = question['creation_date']
            dbquestion.last_activity_at = to_db_time(last_activity)

            dbquestion.title = question['title']
            dbquestion.url = question['link']
            dbquestion.added_at = to_db_time(question['creation_date'])
            dbquestion.score = question['score']
            # Missing fields in Stack
            dbquestion.last_activity_by = None
            dbquestion.body = None  # TODO: we need to get it
            # Additional fields in Stack: is_answered, accepted_answer_id

            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_dbquestiontags(dbquestion.question_identifier,
                                            question['tags'])
                questions_ids.append(question['question_id'])
                author = dbquestion.author_identifier
                if author and author not in self.user_ids_questions:
                    self.user_ids_questions.append(author)

            # NOTE(review): counted for every item, changed or not -- confirm
            # that this increment is meant to sit outside the branch above.
            self.total_questions += 1
            done += 1
            if self.total_questions % 10 == 0:
                logging.info("Done: %s/%s", done, total)

        logging.info("Done: %s/%s", done, total)

        ids = ";".join(str(question_id) for question_id in questions_ids)
        if ids:
            # Get all answers for the pagesize questions updated
            self.process_answers(ids)
            # Get all comments for the pagesize questions updated
            self.process_comments(ids)