Exemple #1
0
        def process_question(question):
            """Copy one question dict into a ``Questions`` row and store it if needed.

            The row is added and committed only when
            ``self.question_new_or_changed`` accepts it; in that case the
            answers (looked up by the question's slug) and the tag link to
            ``category`` are processed as well.  ``self.total_questions`` is
            incremented for every question, stored or not.

            ``self`` and ``category`` are taken from the enclosing scope.
            """
            record = Questions()
            record.author_identifier = question['posters'][0]['user_id']
            record.answer_count = question['reply_count']
            record.question_identifier = question['id']
            record.view_count = question['views']

            # Fall back to the creation date when nothing was posted afterwards.
            last_posted = question['last_posted_at']
            record.last_activity_at = (
                question['created_at'] if last_posted is None else last_posted)

            record.title = question['title']
            record.url = question['slug']
            record.added_at = question['created_at']
            record.score = question['like_count']
            # last_activity_by is left unset here; the candidate source field
            # would be question['last_poster_username'].
            record.body = question['excerpt'] if 'excerpt' in question else None
            # Additional fields in Discourse: liked, pinned_globally, visible,
            # highest_post_number, unseen, posts_count, bumped_at, bookmarked,
            # archived, archetype, has_summary, pinned, image_url, closed,
            # unpinned, bumped, fancy_title

            if self.question_new_or_changed(record):
                # Question is new or changed: persist it, then its dependants.
                self.session.add(record)
                self.session.commit()
                self.process_answers(question['slug'])
                self.process_dbquestiontags(record.question_identifier, category)
                # NOTE(review): this binds a name local to process_question
                # only; it does not change any ``update_users`` variable of
                # the enclosing scope.  Confirm whether that was intended.
                update_users = False
            self.total_questions += 1
Exemple #2
0
    def process_questions(self, tag):
        """Fetch all questions tagged ``tag`` and persist the new or changed ones.

        The question total is requested first (it is only used for progress
        logging), then the listing is walked page by page, ``self.pagesize``
        items at a time, until the response reports ``has_more`` as false.
        When ``self.debug`` is set the page content comes from
        ``StackSampleData.questions`` and only one page is handled; the
        request for the total is still issued.

        For every question that ``self.question_new_or_changed`` accepts, the
        row is committed, its tags are handed to
        ``self.process_dbquestiontags`` and its author id is remembered in
        ``self.user_ids_questions``.  At the end of each page the answers and
        comments of those questions are processed in one batch.
        ``self.total_questions`` is incremented for every question seen.

        :param tag: tag name appended as-is to the ``tagged=`` query argument.
        """
        logging.debug("Processing questions for " + tag)

        date_format = '%Y-%m-%d %H:%M:%S'

        def as_timestamp(epoch):
            # Epoch seconds -> formatted string, via datetime.fromtimestamp.
            moment = datetime.datetime.fromtimestamp(int(epoch))
            return moment.strftime(date_format)

        # NOTE(review): api_key and tag are concatenated without URL-quoting;
        # confirm callers only pass URL-safe values (e.g. a tag such as "c++").
        base_url = (self.url + '/2.2/questions?'
                    + 'order=desc&sort=activity&site=stackoverflow&key='
                    + self.api_key + '&'
                    + 'tagged=' + tag)

        # Get total number of questions.
        # Hack: total not provided in API as a JSON object
        total_response = self._get_api_data(
            base_url + '&' + 'pagesize=1&filter=total')
        total = json.loads(total_response)['total']
        logging.info('Total number of questions to download: ' + str(total))

        page = 1
        done = 0
        has_more = True
        while has_more:
            page_url = (base_url + '&' + 'pagesize=' + str(self.pagesize)
                        + '&' + 'page=' + str(page))
            if self.debug:
                raw_page = StackSampleData.questions
            else:
                raw_page = self._get_api_data(page_url)

            parser = JSONParser(unicode(raw_page))
            parser.parse()
            # parser.data keys: has_more, items, quota_max, quota_remaining
            items = parser.data['items']
            has_more = parser.data['has_more']
            if self.debug:
                has_more = False
            page += 1

            # Ids of the questions stored from this page; used to get their
            # answers and comments once the page is done.
            updated_ids = []
            for question in items:
                # Keys per question: is_answered, view_count, tags,
                # last_activity_date, answer_count, creation_date, score,
                # link, accepted_answer_id, owner, title, question_id
                row = Questions()
                owner = question['owner']
                if 'user_id' in owner:
                    row.author_identifier = owner['user_id']
                row.answer_count = question['answer_count']
                row.question_identifier = question['question_id']
                row.view_count = question['view_count']

                # Fall back to the creation date when there is no activity date.
                last_activity = question['last_activity_date']
                if last_activity is None:
                    last_activity = question['creation_date']
                row.last_activity_at = as_timestamp(last_activity)

                row.title = question['title']
                row.url = question['link']
                row.added_at = as_timestamp(question['creation_date'])
                row.score = question['score']
                # Missing fields in Stack
                row.last_activity_by = None
                row.body = None  # TODO: we need to get it
                # Additional fields in Stack: is_answered, accepted_answer_id

                if self.question_new_or_changed(row):
                    # Question is new or changed
                    self.session.add(row)
                    self.session.commit()
                    self.process_dbquestiontags(row.question_identifier,
                                                question['tags'])
                    updated_ids.append(question['question_id'])
                    author = row.author_identifier
                    if author and author not in self.user_ids_questions:
                        self.user_ids_questions.append(author)

                self.total_questions += 1
                done += 1

                if self.total_questions % 10 == 0:
                    logging.info("Done: " + str(done) + "/" + str(total))

            logging.info("Done: " + str(done) + "/" + str(total))

            ids = ";".join(str(question_id) for question_id in updated_ids)
            if ids:
                # Get all answers for the pagesize questions updated
                self.process_answers(ids)
                # Get all comments for the pagesize questions updated
                self.process_comments(ids)
Exemple #3
0
        def process_question(question):
            """Copy one question dict into a ``Questions`` row and store it if needed.

            The row is added and committed only when
            ``self.question_new_or_changed`` accepts it; in that case the
            answers (looked up by the question's slug) and the tag link to
            ``category`` are processed as well.  ``self.total_questions`` is
            incremented for every question, stored or not.

            ``self`` and ``category`` are taken from the enclosing scope.
            """
            record = Questions()
            record.author_identifier = question['posters'][0]['user_id']
            record.answer_count = question['reply_count']
            record.question_identifier = question['id']
            record.view_count = question['views']

            # Fall back to the creation date when nothing was posted afterwards.
            last_posted = question['last_posted_at']
            record.last_activity_at = (
                question['created_at'] if last_posted is None else last_posted)

            record.title = question['title']
            record.url = question['slug']
            record.added_at = question['created_at']
            record.score = question['like_count']
            # last_activity_by is left unset here; the candidate source field
            # would be question['last_poster_username'].
            record.body = question['excerpt'] if 'excerpt' in question else None
            # Additional fields in Discourse: liked, pinned_globally, visible,
            # highest_post_number, unseen, posts_count, bumped_at, bookmarked,
            # archived, archetype, has_summary, pinned, image_url, closed,
            # unpinned, bumped, fancy_title

            if self.question_new_or_changed(record):
                # Question is new or changed: persist it, then its dependants.
                self.session.add(record)
                self.session.commit()
                self.process_answers(question['slug'])
                self.process_dbquestiontags(record.question_identifier,
                                            category)
                # NOTE(review): this binds a name local to process_question
                # only; it does not change any ``update_users`` variable of
                # the enclosing scope.  Confirm whether that was intended.
                update_users = False
            self.total_questions += 1
    def process_questions(self, tag):
        """Fetch all questions tagged ``tag`` and persist the new or changed ones.

        The question total is requested first (it is only used for progress
        logging), then the listing is walked page by page, ``self.pagesize``
        items at a time, until the response reports ``has_more`` as false.
        When ``self.debug`` is set the page content comes from
        ``StackSampleData.questions`` and only one page is handled; the
        request for the total is still issued.

        For every question that ``self.question_new_or_changed`` accepts, the
        row is committed, its tags are handed to
        ``self.process_dbquestiontags`` and its author id is remembered in
        ``self.user_ids_questions``.  At the end of each page the answers and
        comments of those questions are processed in one batch.
        ``self.total_questions`` is incremented for every question seen.

        :param tag: tag name appended as-is to the ``tagged=`` query argument.
        """
        logging.debug("Processing questions for " + tag)

        date_format = '%Y-%m-%d %H:%M:%S'

        def as_timestamp(epoch):
            # Epoch seconds -> formatted string, via datetime.fromtimestamp.
            moment = datetime.datetime.fromtimestamp(int(epoch))
            return moment.strftime(date_format)

        # NOTE(review): api_key and tag are concatenated without URL-quoting;
        # confirm callers only pass URL-safe values (e.g. a tag such as "c++").
        base_url = (self.url + '/2.2/questions?'
                    + 'order=desc&sort=activity&site=stackoverflow&key='
                    + self.api_key + '&'
                    + 'tagged=' + tag)

        # Get total number of questions.
        # Hack: total not provided in API as a JSON object
        total_response = self._get_api_data(
            base_url + '&' + 'pagesize=1&filter=total')
        total = json.loads(total_response)['total']
        logging.info('Total number of questions to download: ' + str(total))

        page = 1
        done = 0
        has_more = True
        while has_more:
            page_url = (base_url + '&' + 'pagesize=' + str(self.pagesize)
                        + '&' + 'page=' + str(page))
            if self.debug:
                raw_page = StackSampleData.questions
            else:
                raw_page = self._get_api_data(page_url)

            parser = JSONParser(unicode(raw_page))
            parser.parse()
            # parser.data keys: has_more, items, quota_max, quota_remaining
            items = parser.data['items']
            has_more = parser.data['has_more']
            if self.debug:
                has_more = False
            page += 1

            # Ids of the questions stored from this page; used to get their
            # answers and comments once the page is done.
            updated_ids = []
            for question in items:
                # Keys per question: is_answered, view_count, tags,
                # last_activity_date, answer_count, creation_date, score,
                # link, accepted_answer_id, owner, title, question_id
                row = Questions()
                owner = question['owner']
                if 'user_id' in owner:
                    row.author_identifier = owner['user_id']
                row.answer_count = question['answer_count']
                row.question_identifier = question['question_id']
                row.view_count = question['view_count']

                # Fall back to the creation date when there is no activity date.
                last_activity = question['last_activity_date']
                if last_activity is None:
                    last_activity = question['creation_date']
                row.last_activity_at = as_timestamp(last_activity)

                row.title = question['title']
                row.url = question['link']
                row.added_at = as_timestamp(question['creation_date'])
                row.score = question['score']
                # Missing fields in Stack
                row.last_activity_by = None
                row.body = None  # TODO: we need to get it
                # Additional fields in Stack: is_answered, accepted_answer_id

                if self.question_new_or_changed(row):
                    # Question is new or changed
                    self.session.add(row)
                    self.session.commit()
                    self.process_dbquestiontags(row.question_identifier,
                                                question['tags'])
                    updated_ids.append(question['question_id'])
                    author = row.author_identifier
                    if author and author not in self.user_ids_questions:
                        self.user_ids_questions.append(author)

                self.total_questions += 1
                done += 1

                if self.total_questions % 10 == 0:
                    logging.info("Done: " + str(done) + "/" + str(total))

            logging.info("Done: " + str(done) + "/" + str(total))

            ids = ";".join(str(question_id) for question_id in updated_ids)
            if ids:
                # Get all answers for the pagesize questions updated
                self.process_answers(ids)
                # Get all comments for the pagesize questions updated
                self.process_comments(ids)