Example #1
0
    def process_users(self, users_ids):
        """Fetch each unknown Discourse user and store it as a People row.

        users_ids: iterable of usernames, or None to do nothing. Users
        already in the database or in the blacklist are skipped; users
        whose JSON cannot be fetched/parsed are blacklisted so they are
        not retried in later calls.
        """
        if users_ids is None:
            return

        for user_id in users_ids:
            if user_id in self.users_blacklist:
                continue
            # Skip users already stored in the database.
            user = self.session.query(People).filter(
                People.username == user_id).first()
            if user is not None:
                continue

            url = self.url + "/users/" + user_id + ".json"
            logging.info("Getting user %s", user_id)
            logging.info(url)
            stream = requests.get(url, verify=False)
            try:
                parser = JSONParser(unicode(stream.text))
                parser.parse()
            except Exception:
                # Narrowed from a bare except: still best-effort, but no
                # longer swallows KeyboardInterrupt/SystemExit.
                logging.error("Can't get %s data", user_id)
                self.users_blacklist.append(user_id)
                continue

            user = parser.data['user']

            dbuser = People()
            dbuser.username = user['username']
            dbuser.reputation = user['trust_level']
            dbuser.avatar = user['uploaded_avatar_id']
            dbuser.last_seen_at = user['last_posted_at']
            dbuser.joined_at = user['created_at']
            dbuser.identifier = user['id']
            self.session.add(dbuser)
            self.total_users += 1
        self.session.commit()

        return
Example #2
0
 def categories(self):
     """Return the list of category dicts from the Discourse API."""
     response = requests.get(self.url + "/categories.json", verify=False)
     logging.info(response.url)
     json_parser = JSONParser(unicode(response.text))
     json_parser.parse()
     return json_parser.data['category_list']['categories']
Example #3
0
 def categories(self):
     """Fetch /categories.json and return its categories list."""
     reply = requests.get(self.url + "/categories.json", verify=False)
     logging.info(reply.url)
     parsed = JSONParser(unicode(reply.text))
     parsed.parse()
     category_list = parsed.data['category_list']
     return category_list['categories']
Example #4
0
    def process_comments(self, dbpost_ids, kind='question'):
        """Download and store all comments for the given posts.

        dbpost_ids: ';'-joined Stack Exchange post ids (question or
        answer ids, depending on ``kind``). Pages through the API until
        the response reports no more data.
        """
        if kind == 'question':
            base_url = self.url + '/2.2/questions/'
        elif kind == 'answer':
            base_url = self.url + '/2.2/answers/'
        else:
            # Previously an unknown kind crashed later with NameError on
            # base_url; fail fast with a clear message instead.
            raise ValueError("Unknown kind: " + str(kind))
        base_url += str(dbpost_ids) + '/comments?'
        base_url += 'order=desc&sort=creation&site=stackoverflow&key=' + self.api_key
        base_url += '&' + 'pagesize=' + str(self.pagesize)
        logging.debug("Getting comments for %s", str(dbpost_ids))
        has_more = True
        page = 1

        while has_more:
            url = base_url + "&page=" + str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                data = StackSampleData.comments
                has_more = False

            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            if 'has_more' not in parser.data:
                logging.error("No has_more in JSON response. Exiting.")
                logging.error(parser.data)
                # A bare ``raise`` here had no active exception to re-raise.
                raise Exception("No has_more in JSON response")
            has_more = parser.data['has_more']
            page += 1
            if 'items' in parser.data:
                data = parser.data['items']
            else:
                logging.error("No items in comments")
                logging.error(parser.data)
                return

            for comment in data:
                dbcomment = Comments()

                # Link the comment to its question or answer.
                if kind == "question":
                    dbcomment.question_identifier = comment['post_id']
                if kind == "answer":
                    dbcomment.answer_identifier = comment['post_id']
                if 'body' in comment:
                    dbcomment.body = comment['body']
                if 'user_id' in comment['owner']:
                    dbcomment.user_identifier = comment['owner']['user_id']
                    if dbcomment.user_identifier not in self.user_ids_comments:
                        self.user_ids_comments.append(
                            dbcomment.user_identifier)

                cdate = datetime.datetime.fromtimestamp(
                    int(comment['creation_date']))
                dbcomment.submitted_on = cdate.strftime('%Y-%m-%d %H:%M:%S')

                self.session.add(dbcomment)
                self.total_comments += 1
            self.session.commit()
Example #5
0
    def process_answers(self, dbquestion_ids):
        """Get all answers for the ';'-joined list of question ids, store
        them, then fetch the comments for every stored answer."""
        has_more = True
        page = 1
        base_url = self.url + '/2.2/questions/' + str(
            dbquestion_ids) + '/answers?'
        base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key
        base_url += '&' + 'pagesize=' + str(self.pagesize)
        logging.debug("Getting answers for dbquestion ids" +
                      str(dbquestion_ids))
        dbanswers_ids = []

        while has_more:
            url = base_url + "&page=" + str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                has_more = False
                data = StackSampleData.answers

            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            has_more = parser.data['has_more']
            page += 1
            data = parser.data['items']

            for answer in data:
                dbanswer = Answers()
                dbanswer.identifier = answer['answer_id']
                dbanswers_ids.append(dbanswer.identifier)
                if 'user_id' in answer['owner']:
                    dbanswer.user_identifier = answer['owner']['user_id']
                    if dbanswer.user_identifier not in self.user_ids_answers:
                        self.user_ids_answers.append(dbanswer.user_identifier)
                dbanswer.question_identifier = answer['question_id']
                create_date = datetime.datetime.fromtimestamp(
                    int(answer['creation_date']))
                dbanswer.submitted_on = create_date.strftime(
                    '%Y-%m-%d %H:%M:%S')
                dbanswer.votes = answer['score']

                self.session.add(dbanswer)
                self.total_answers += 1
                # Fix: a second unconditional append of user_identifier here
                # defeated the dedup check above and could append an unset
                # value when the owner had no user_id.
            self.session.commit()
        # Time to get comments for all answers, in pagesize-sized batches.
        while len(dbanswers_ids) > 0:
            ids = []
            for i in range(self.pagesize):
                if len(dbanswers_ids) > 0:
                    val = dbanswers_ids.pop()
                    if val is not None:
                        ids.append(val)
                    else:
                        logging.info("Found None Answer")
            ids = ";".join([str(x) for x in ids])
            self.process_comments(ids, 'answer')
Example #6
0
    def process_comments(self, dbpost_ids, kind = 'question'):
        """Download and store all comments for the given posts.

        dbpost_ids: ';'-joined Stack Exchange post ids (question or
        answer ids, depending on ``kind``). Pages through the API until
        the response reports no more data.
        """
        if kind == 'question':
            base_url = self.url + '/2.2/questions/'
        elif kind == 'answer':
            base_url = self.url + '/2.2/answers/'
        else:
            # Previously an unknown kind crashed later with NameError on
            # base_url; fail fast with a clear message instead.
            raise ValueError("Unknown kind: " + str(kind))
        base_url += str(dbpost_ids) +'/comments?'
        base_url += 'order=desc&sort=creation&site=stackoverflow&key='+self.api_key
        base_url += '&' + 'pagesize='+str(self.pagesize)
        logging.debug("Getting comments for %s", str(dbpost_ids))
        has_more = True
        page = 1

        while has_more:
            url = base_url + "&page="+str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                data = StackSampleData.comments
                has_more = False

            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            if 'has_more' not in parser.data:
                logging.error("No has_more in JSON response. Exiting.")
                logging.error(parser.data)
                # A bare ``raise`` here had no active exception to re-raise.
                raise Exception("No has_more in JSON response")
            has_more = parser.data['has_more']
            page += 1
            if 'items' in parser.data:
                data = parser.data['items']
            else:
                logging.error("No items in comments")
                logging.error(parser.data)
                return

            for comment in data:
                dbcomment = Comments()

                # Link the comment to its question or answer.
                if kind == "question":
                    dbcomment.question_identifier = comment['post_id']
                if kind == "answer":
                    dbcomment.answer_identifier = comment['post_id']
                if 'body' in comment:
                    dbcomment.body = comment['body']
                if 'user_id' in comment['owner']:
                    dbcomment.user_identifier = comment['owner']['user_id']
                    if dbcomment.user_identifier not in self.user_ids_comments:
                        self.user_ids_comments.append(dbcomment.user_identifier)

                cdate = datetime.datetime.fromtimestamp(int(comment['creation_date']))
                dbcomment.submitted_on = cdate.strftime('%Y-%m-%d %H:%M:%S')

                self.session.add(dbcomment)
                self.total_comments += 1
            self.session.commit()
Example #7
0
    def process_answers(self, dbquestion_ids):
        """Get all answers for the ';'-joined list of question ids, store
        them, then fetch the comments for every stored answer."""
        has_more = True
        page = 1
        base_url = self.url + '/2.2/questions/'+str(dbquestion_ids)+'/answers?'
        base_url += 'order=desc&sort=activity&site=stackoverflow&key='+self.api_key
        base_url += '&' + 'pagesize='+str(self.pagesize)
        logging.debug("Getting answers for dbquestion ids" + str(dbquestion_ids))
        dbanswers_ids = []

        while has_more:
            url = base_url + "&page="+str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                has_more = False
                data = StackSampleData.answers

            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            has_more = parser.data['has_more']
            page += 1
            data = parser.data['items']

            for answer in data:
                dbanswer = Answers()
                dbanswer.identifier = answer['answer_id']
                dbanswers_ids.append(dbanswer.identifier)
                if 'user_id' in answer['owner']:
                    dbanswer.user_identifier = answer['owner']['user_id']
                    if dbanswer.user_identifier not in self.user_ids_answers:
                        self.user_ids_answers.append(dbanswer.user_identifier)
                dbanswer.question_identifier = answer['question_id']
                create_date = datetime.datetime.fromtimestamp(int(answer['creation_date']))
                dbanswer.submitted_on = create_date.strftime('%Y-%m-%d %H:%M:%S')
                dbanswer.votes = answer['score']

                self.session.add(dbanswer)
                self.total_answers += 1
                # Fix: a second unconditional append of user_identifier here
                # defeated the dedup check above and could append an unset
                # value when the owner had no user_id.
            self.session.commit()
        # Time to get comments for all answers, in pagesize-sized batches.
        while len(dbanswers_ids)>0:
            ids = []
            for i in range(self.pagesize):
                if len(dbanswers_ids)>0:
                    val = dbanswers_ids.pop()
                    if val is not None:
                        ids.append(val)
                    else:
                        logging.info("Found None Answer")
            ids = ";".join([str(x) for x in ids])
            self.process_comments(ids,'answer')
Example #8
0
    def process_users(self, users_ids):
        """Download profiles for a ';'-joined list of Stack Exchange user
        ids and store each one as a People row.

        Raises Exception when more ids than one API page allows are passed.
        """
        if users_ids is None:
            return
        if len(users_ids.split(";")) > self.pagesize:
            logging.error("Max ids overcome in process_users " + users_ids)
            raise Exception("Max ids overcome in process_users")
        base_url = self.url + '/2.2/users/' + str(users_ids) + '?'
        base_url += 'order=desc&sort=reputation&site=stackoverflow&key=' + self.api_key
        base_url += '&' + 'pagesize=' + str(self.pagesize)
        has_more = True
        page = 1

        while has_more:
            url = base_url + "&page=" + str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                data = StackSampleData.users
                has_more = False
            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            if 'has_more' not in parser.data:
                logging.error("No has_more in JSON response")
                logging.error(parser.data)
                # A bare ``raise`` here had no active exception to re-raise.
                raise Exception("No has_more in JSON response")
            has_more = parser.data['has_more']
            # Fix: page was never incremented, so a multi-page result
            # re-fetched page 1 forever.
            page += 1
            data = parser.data['items']

            for user in data:
                dbuser = People()
                dbuser.username = user['display_name']
                dbuser.reputation = user['reputation']
                if 'profile_image' in user:
                    dbuser.avatar = user['profile_image']
                dbuser.last_seen_at = datetime.datetime.fromtimestamp(
                    int(user['last_access_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
                dbuser.joined_at = datetime.datetime.fromtimestamp(
                    int(user['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
                dbuser.identifier = user['user_id']
                self.session.add(dbuser)
            self.session.commit()

        return
Example #9
0
    def get_search_tags(self):
        """Return the names of popular tags matching self.tags."""
        logging.info("Getting all tags based on: " + self.tags)
        query = self.url + "/2.2/tags?key=" + self.api_key + "&"
        query += "order=desc&sort=popular&site=stackoverflow"
        query += "&inname=" + str(self.tags)
        # In debug mode use the canned sample instead of hitting the API.
        raw = StackSampleData.tags if self.debug else self._get_api_data(query)

        tag_parser = JSONParser(unicode(raw))
        tag_parser.parse()
        found_tags = [tag['name'] for tag in tag_parser.data['items']]
        logging.info(found_tags)
        return found_tags
Example #10
0
    def get_search_tags(self):
        """Look up popular Stack Exchange tags whose name contains self.tags."""
        logging.info("Getting all tags based on: " + self.tags)
        endpoint = (self.url + "/2.2/tags?key=" + self.api_key + "&" +
                    "order=desc&sort=popular&site=stackoverflow" +
                    "&inname=" + str(self.tags))
        if self.debug:
            # Canned sample data for offline/debug runs.
            payload = StackSampleData.tags
        else:
            payload = self._get_api_data(endpoint)

        parser = JSONParser(unicode(payload))
        parser.parse()
        names = []
        for entry in parser.data['items']:
            names.append(entry['name'])
        logging.info(names)
        return names
Example #11
0
    def process_users(self, users_ids):
        """Download profiles for a ';'-joined list of Stack Exchange user
        ids and store each one as a People row.

        Raises Exception when more ids than one API page allows are passed.
        """
        if users_ids is None:
            return
        if len(users_ids.split(";"))>self.pagesize:
            logging.error("Max ids overcome in process_users " + users_ids)
            raise Exception("Max ids overcome in process_users")
        base_url = self.url + '/2.2/users/'+str(users_ids)+'?'
        base_url += 'order=desc&sort=reputation&site=stackoverflow&key='+self.api_key
        base_url += '&' + 'pagesize='+str(self.pagesize)
        has_more = True
        page = 1

        while has_more:
            url = base_url + "&page="+str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs: single page.
                data = StackSampleData.users
                has_more = False
            parser = JSONParser(unicode(data))
            parser.parse()
            # Response keys: has_more, items, quota_max, quota_remaining
            if 'has_more' not in parser.data:
                logging.error("No has_more in JSON response")
                logging.error(parser.data)
                # A bare ``raise`` here had no active exception to re-raise.
                raise Exception("No has_more in JSON response")
            has_more = parser.data['has_more']
            # Fix: page was never incremented, so a multi-page result
            # re-fetched page 1 forever.
            page += 1
            data = parser.data['items']

            for user in data:
                dbuser = People()
                dbuser.username = user['display_name']
                dbuser.reputation = user['reputation']
                if 'profile_image' in user:
                    dbuser.avatar = user['profile_image']
                dbuser.last_seen_at = datetime.datetime.fromtimestamp(int(user['last_access_date'])).strftime('%Y-%m-%d %H:%M:%S')
                dbuser.joined_at = datetime.datetime.fromtimestamp(int(user['creation_date'])).strftime('%Y-%m-%d %H:%M:%S')
                dbuser.identifier = user['user_id']
                self.session.add(dbuser)
            self.session.commit()

        return
Example #12
0
    def process_users(self, users_ids):
        """Fetch each unknown Discourse user and store it as a People row.

        users_ids: iterable of usernames, or None to do nothing. Users
        already in the database or in the blacklist are skipped; users
        whose JSON cannot be fetched/parsed are blacklisted so they are
        not retried in later calls.
        """
        if users_ids is None:
            return

        for user_id in users_ids:
            if user_id in self.users_blacklist:
                continue
            # Skip users already stored in the database.
            user = self.session.query(People).filter(
                People.username == user_id).first()
            if user is not None:
                continue

            url = self.url + "/users/" + user_id + ".json"
            logging.info("Getting user %s", user_id)
            logging.info(url)
            stream = requests.get(url, verify=False)
            try:
                parser = JSONParser(unicode(stream.text))
                parser.parse()
            except Exception:
                # Narrowed from a bare except: still best-effort, but no
                # longer swallows KeyboardInterrupt/SystemExit.
                logging.error("Can't get %s data", user_id)
                self.users_blacklist.append(user_id)
                continue

            user = parser.data['user']

            dbuser = People()
            dbuser.username = user['username']
            dbuser.reputation = user['trust_level']
            dbuser.avatar = user['uploaded_avatar_id']
            dbuser.last_seen_at = user['last_posted_at']
            dbuser.joined_at = user['created_at']
            dbuser.identifier = user['id']
            self.session.add(dbuser)
            self.total_users += 1
        self.session.commit()

        return
Example #13
0
    def process_answers(self, question_slug):
        """ Get all answers for the question with slug question_slug. """

        def process_answer(answer):
            # Map one Discourse post dict to an Answers row.
            dbanswer = Answers()
            dbanswer.identifier = answer['id']
            dbanswer.user_identifier = answer['user_id']
            if answer['username'] not in self.user_ids_answers:
                self.user_ids_answers.append(answer['username'])
            dbanswer.question_identifier = question_id
            dbanswer.submitted_on = answer['updated_at']
            dbanswer.votes = answer['score']
            dbanswer.body = answer['cooked']

            self.session.add(dbanswer)
            self.total_answers += 1

        url = self.url + "/t/" + question_slug + ".json"
        logging.info("Getting answers for " + question_slug)
        logging.info(url)
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        try:
            parser.parse()
        except Exception:
            # Narrowed from a bare except; log the payload instead of
            # printing it to stdout.
            logging.error("Cant parse answers for question " + question_slug)
            logging.error(unicode(stream.text))
            return

        question_id = parser.data['id']
        data = parser.data['post_stream']['posts']

        for answer in data:
            process_answer(answer)
        self.session.commit()
        self.process_users(self.user_ids_answers)

        # Discourse returns at most this many posts in the first request;
        # any remaining posts must be fetched explicitly by id.
        discourse_max_answers_query = 20

        # Fix: the literal 20 was used here instead of the constant
        # declared above, leaving it dead and the two values free to drift.
        if len(parser.data['post_stream']['stream']) > discourse_max_answers_query:
            pending = parser.data['post_stream']['stream'][discourse_max_answers_query:]
            url = self.url + "/t/" + str(question_id) + "/posts.json?"
            for answer_id in pending:
                url += "post_ids%5B%5D=" + str(answer_id) + "&"
            stream = requests.get(url, verify=False)
            parser = JSONParser(unicode(stream.text))
            try:
                parser.parse()
            except Exception:
                logging.error("Cant parse additional answers for question " + question_slug)
                logging.error(url)
                logging.error(unicode(stream.text))
                return

            data = parser.data['post_stream']['posts']

            for answer in data:
                process_answer(answer)
            self.session.commit()
            self.process_users(self.user_ids_answers)
Example #14
0
    def process_questions(self, category):
        """Download all topics for a Discourse category, store new/changed
        ones (plus their answers, tags and users), and follow the
        'more_topics_url' pagination links until exhausted."""
        logging.debug("Processing questions for " + category)

        def update_users(users):
            # Collect usernames not yet queued for user processing.
            for user in users:
                if user['username'] not in self.user_ids_questions:
                    self.user_ids_questions.append(user['username'])

        def process_question(question):
            # Map one Discourse topic dict to a Questions row.
            dbquestion = Questions()
            dbquestion.author_identifier = question['posters'][0]['user_id']
            dbquestion.answer_count = question['reply_count']
            dbquestion.question_identifier = question['id']
            dbquestion.view_count = question['views']
            if question['last_posted_at'] is not None:
                dbquestion.last_activity_at = question['last_posted_at']
            else:
                dbquestion.last_activity_at = question['created_at']
            dbquestion.title = question['title']
            dbquestion.url = question['slug']
            dbquestion.added_at = question['created_at']
            dbquestion.score = question['like_count']
            dbquestion.body = None
            if 'excerpt' in question:
                dbquestion.body = question['excerpt']
            # Additional fields in Discourse: liked,pinned_globally, visible, highest_post_number, unseen,posts_count
            # bumped_at, bookmarked, archived,archetype,has_summary,pinned,image_url,closed,unpinned,bumped, fancy_title

            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_answers(question['slug'])
                self.process_dbquestiontags(dbquestion.question_identifier,
                                            category)
                # Fix: a dead ``update_users = False`` assignment here
                # shadowed the nested helper and was never read; removed.
            self.total_questions += 1

        url = self.url + "/c/" + category + ".json"
        stream = requests.get(url, verify=False)
        logging.info(url)
        parser = JSONParser(unicode(stream.text))
        parser.parse()
        data = parser.data['topic_list']['topics']

        for question in data:
            process_question(question)
        update_users(parser.data['users'])
        self.process_users(self.user_ids_questions)

        while 'more_topics_url' in parser.data['topic_list']:
            url = self.url + parser.data['topic_list']['more_topics_url']
            logging.info(url)
            stream = requests.get(url, verify=False)
            parser = JSONParser(unicode(stream.text))
            parser.parse()
            data = parser.data['topic_list']['topics']

            for question in data:
                process_question(question)
            if 'users' in parser.data:
                update_users(parser.data['users'])
                self.process_users(self.user_ids_questions)
            else:
                logging.info("Questions without users")
                logging.info(parser.data)
        return
Example #15
0
    def process_answers(self, question_slug):
        """ Get all answers for the question with slug question_slug. """
        def process_answer(answer):
            # Map one Discourse post dict to an Answers row.
            dbanswer = Answers()
            dbanswer.identifier = answer['id']
            dbanswer.user_identifier = answer['user_id']
            if answer['username'] not in self.user_ids_answers:
                self.user_ids_answers.append(answer['username'])
            dbanswer.question_identifier = question_id
            dbanswer.submitted_on = answer['updated_at']
            dbanswer.votes = answer['score']
            dbanswer.body = answer['cooked']

            self.session.add(dbanswer)
            self.total_answers += 1

        url = self.url + "/t/" + question_slug + ".json"
        logging.info("Getting answers for " + question_slug)
        logging.info(url)
        stream = requests.get(url, verify=False)
        parser = JSONParser(unicode(stream.text))
        try:
            parser.parse()
        except Exception:
            # Narrowed from a bare except; log the payload instead of
            # printing it to stdout.
            logging.error("Cant parse answers for question " + question_slug)
            logging.error(unicode(stream.text))
            return

        question_id = parser.data['id']
        data = parser.data['post_stream']['posts']

        for answer in data:
            process_answer(answer)
        self.session.commit()
        self.process_users(self.user_ids_answers)

        # Discourse returns at most this many posts in the first request;
        # any remaining posts must be fetched explicitly by id.
        discourse_max_answers_query = 20

        # Fix: the literal 20 was used here instead of the constant
        # declared above, leaving it dead and the two values free to drift.
        if len(parser.data['post_stream']['stream']) > discourse_max_answers_query:
            pending = parser.data['post_stream']['stream'][discourse_max_answers_query:]
            url = self.url + "/t/" + str(question_id) + "/posts.json?"
            for answer_id in pending:
                url += "post_ids%5B%5D=" + str(answer_id) + "&"
            stream = requests.get(url, verify=False)
            parser = JSONParser(unicode(stream.text))
            try:
                parser.parse()
            except Exception:
                logging.error("Cant parse additional answers for question " +
                              question_slug)
                logging.error(url)
                logging.error(unicode(stream.text))
                return

            data = parser.data['post_stream']['posts']

            for answer in data:
                process_answer(answer)
            self.session.commit()
            self.process_users(self.user_ids_answers)
Example #16
0
    def process_questions(self, tag):
        """Download all Stack Exchange questions tagged with ``tag``.

        First queries the total count (filter=total), then pages through
        /2.2/questions, storing new/changed questions and triggering the
        download of their answers, comments and tags per page.
        """
        logging.debug("Processing questions for " + tag)

        has_more = True
        base_url = self.url + '/2.2/questions?'
        base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key + '&'
        base_url += 'tagged=' + tag

        # get total number of questions
        url_total = base_url + '&' + 'pagesize=1&filter=total'
        data = self._get_api_data(url_total)
        # Hack: total not provided in API as a JSON object
        data = json.loads(data)
        total = data['total']
        logging.info('Total number of questions to download: ' + str(total))

        page = 1
        done = 0
        while has_more:
            questions_ids = []  # used to get answers and comments
            url = base_url + '&' + 'pagesize=' + str(
                self.pagesize) + '&' + 'page=' + str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Canned sample data for offline/debug runs.
                data = StackSampleData.questions

            parser = JSONParser(unicode(data))
            parser.parse()
            # [u'has_more', u'items', u'quota_max', u'quota_remaining']
            data = parser.data['items']
            has_more = parser.data['has_more']
            # In debug mode the sample is a single page; force the loop to end.
            if self.debug: has_more = False
            page += 1

            for question in data:
                # Each of the question is initialized here
                # [u'is_answered', u'view_count', u'tags', u'last_activity_date', u'answer_count', u'creation_date',
                # u'score', u'link', u'accepted_answer_id', u'owner', u'title', u'question_id']
                dbquestion = Questions()
                # 'owner' may have no user_id (e.g. deleted accounts).
                if 'user_id' in question['owner']:
                    dbquestion.author_identifier = question['owner']['user_id']
                dbquestion.answer_count = question['answer_count']
                dbquestion.question_identifier = question['question_id']
                dbquestion.view_count = question['view_count']
                if question['last_activity_date'] is not None:
                    dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                        int(question['last_activity_date'])).strftime(
                            '%Y-%m-%d %H:%M:%S')
                else:
                    # Fall back to creation time when no activity recorded.
                    dbquestion.last_activity_at = datetime.datetime.fromtimestamp(
                        int(question['creation_date'])).strftime(
                            '%Y-%m-%d %H:%M:%S')
                dbquestion.title = question['title']
                dbquestion.url = question['link']
                dbquestion.added_at = datetime.datetime.fromtimestamp(
                    int(question['creation_date'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
                dbquestion.score = question['score']
                # Missing fields in Stack
                dbquestion.last_activity_by = None
                dbquestion.body = None  # TODO: we need to get it
                # Additional fields in Stack: is_answered, accepted_answer_id

                if self.question_new_or_changed(dbquestion):
                    # Question is new or changed
                    self.session.add(dbquestion)
                    self.session.commit()
                    self.process_dbquestiontags(dbquestion.question_identifier,
                                                question['tags'])
                    # Remember the id so answers/comments get fetched below.
                    questions_ids.append(question['question_id'])
                    if dbquestion.author_identifier:
                        if dbquestion.author_identifier not in self.user_ids_questions:
                            self.user_ids_questions.append(
                                dbquestion.author_identifier)

                self.total_questions += 1
                done += 1

                if self.total_questions % 10 == 0:
                    logging.info("Done: " + str(done) + "/" + str(total))

            logging.info("Done: " + str(done) + "/" + str(total))

            ids = ";".join([str(x) for x in questions_ids])
            if len(ids) > 0:
                # Get all answers for the pagesize questions updated
                self.process_answers(ids)
                # Get all comments for the pagesize questions updated
                self.process_comments(ids)
        return
Example #17
0
    def process_questions(self, category):
        """Download all topics for a Discourse category, store new/changed
        ones (plus their answers, tags and users), and follow the
        'more_topics_url' pagination links until exhausted."""
        logging.debug("Processing questions for " + category)

        def update_users(users):
            # Collect usernames not yet queued for user processing.
            for user in users:
                if user['username'] not in self.user_ids_questions:
                    self.user_ids_questions.append(user['username'])

        def process_question(question):
            # Map one Discourse topic dict to a Questions row.
            dbquestion = Questions()
            dbquestion.author_identifier = question['posters'][0]['user_id']
            dbquestion.answer_count = question['reply_count']
            dbquestion.question_identifier = question['id']
            dbquestion.view_count = question['views']
            if question['last_posted_at'] is not None:
                dbquestion.last_activity_at = question['last_posted_at']
            else:
                dbquestion.last_activity_at = question['created_at']
            dbquestion.title = question['title']
            dbquestion.url = question['slug']
            dbquestion.added_at = question['created_at']
            dbquestion.score = question['like_count']
            dbquestion.body = None
            if 'excerpt' in question:
                dbquestion.body = question['excerpt']
            # Additional fields in Discourse: liked,pinned_globally, visible, highest_post_number, unseen,posts_count
            # bumped_at, bookmarked, archived,archetype,has_summary,pinned,image_url,closed,unpinned,bumped, fancy_title

            if self.question_new_or_changed(dbquestion):
                # Question is new or changed
                self.session.add(dbquestion)
                self.session.commit()
                self.process_answers(question['slug'])
                self.process_dbquestiontags(dbquestion.question_identifier, category)
                # Fix: a dead ``update_users = False`` assignment here
                # shadowed the nested helper and was never read; removed.
            self.total_questions += 1

        url = self.url + "/c/" + category + ".json"
        stream = requests.get(url, verify=False)
        logging.info(url)
        parser = JSONParser(unicode(stream.text))
        parser.parse()
        data = parser.data['topic_list']['topics']

        for question in data:
            process_question(question)
        update_users(parser.data['users'])
        self.process_users(self.user_ids_questions)

        while 'more_topics_url' in parser.data['topic_list']:
            url = self.url + parser.data['topic_list']['more_topics_url']
            logging.info(url)
            stream = requests.get(url, verify=False)
            parser = JSONParser(unicode(stream.text))
            parser.parse()
            data = parser.data['topic_list']['topics']

            for question in data:
                process_question(question)
            if 'users' in parser.data:
                update_users(parser.data['users'])
                self.process_users(self.user_ids_questions)
            else:
                logging.info("Questions without users")
                logging.info(parser.data)
        return
Example #18
0
    def process_questions(self, tag):
        """Download and store all Stack Exchange questions tagged *tag*.

        Pages through the /2.2/questions API (pagesize at a time), storing
        each new/changed question as a Questions row with its tags, then
        fetching answers and comments for the updated questions and queueing
        question authors for later download.

        :param tag: Stack Exchange tag to process
        """
        logging.debug("Processing questions for " + tag)

        def epoch_to_str(epoch):
            # API dates are Unix timestamps; the DB stores
            # 'YYYY-MM-DD HH:MM:SS' strings.
            return datetime.datetime.fromtimestamp(
                int(epoch)).strftime('%Y-%m-%d %H:%M:%S')

        has_more = True
        base_url = self.url + '/2.2/questions?'
        base_url += 'order=desc&sort=activity&site=stackoverflow&key=' + self.api_key + '&'
        base_url += 'tagged=' + tag

        # get total number of questions
        url_total = base_url + '&' + 'pagesize=1&filter=total'
        data = self._get_api_data(url_total)
        # Hack: total not provided in API as a JSON object
        data = json.loads(data)
        total = data['total']
        logging.info('Total number of questions to download: ' + str(total))

        page = 1
        done = 0
        while has_more:
            questions_ids = []  # used to get answers and comments
            url = base_url + '&' + 'pagesize=' + str(self.pagesize) + '&' + 'page=' + str(page)
            if not self.debug:
                data = self._get_api_data(url)
            else:
                # Debug mode works on canned sample data, no network calls.
                data = StackSampleData.questions

            parser = JSONParser(unicode(data))
            parser.parse()
            # [u'has_more', u'items', u'quota_max', u'quota_remaining']
            data = parser.data['items']
            has_more = parser.data['has_more']
            if self.debug:
                has_more = False
            page += 1

            for question in data:
                # Each of the question is initialized here
                # [u'is_answered', u'view_count', u'tags', u'last_activity_date', u'answer_count', u'creation_date',
                # u'score', u'link', u'accepted_answer_id', u'owner', u'title', u'question_id']
                dbquestion = Questions()
                if 'user_id' in question['owner']:
                    dbquestion.author_identifier = question['owner']['user_id']
                dbquestion.answer_count = question['answer_count']
                dbquestion.question_identifier = question['question_id']
                dbquestion.view_count = question['view_count']
                if question['last_activity_date'] is not None:
                    dbquestion.last_activity_at = epoch_to_str(question['last_activity_date'])
                else:
                    # No activity recorded: fall back to creation time.
                    dbquestion.last_activity_at = epoch_to_str(question['creation_date'])
                dbquestion.title = question['title']
                dbquestion.url = question['link']
                dbquestion.added_at = epoch_to_str(question['creation_date'])
                dbquestion.score = question['score']
                # Missing fields in Stack
                dbquestion.last_activity_by = None
                dbquestion.body = None  # TODO: we need to get it
                # Additional fields in Stack: is_answered, accepted_answer_id

                if self.question_new_or_changed(dbquestion):
                    # Question is new or changed
                    self.session.add(dbquestion)
                    self.session.commit()
                    self.process_dbquestiontags(dbquestion.question_identifier, question['tags'])
                    questions_ids.append(question['question_id'])
                    if dbquestion.author_identifier:
                        if dbquestion.author_identifier not in self.user_ids_questions:
                            self.user_ids_questions.append(dbquestion.author_identifier)

                self.total_questions += 1
                done += 1

                if self.total_questions % 10 == 0:
                    logging.info("Done: " + str(done) + "/" + str(total))

            logging.info("Done: " + str(done) + "/" + str(total))

            # The API accepts a semicolon-separated id list for batch lookups.
            ids = ";".join([str(x) for x in questions_ids])
            if ids:
                # Get all answers for the pagesize questions updated
                self.process_answers(ids)
                # Get all comments for the pagesize questions updated
                self.process_comments(ids)
        return