Beispiel #1
0
    def _create_tf_idf_stack_vectors(self, only_questions=False):
        """ Create the tfidf vectors for the Stackexchange data.

        Fills self.question_vectors (and, unless only_questions is True,
        self.answer_vectors) with one dense tf-idf vector per document,
        keyed by document id.

        :param only_questions: when True, skip the answer vectors.
        :return: dict mapping unicode token -> integer word id used to
            index the vectors.
        """

        # Load question and answer corpus
        logging.info("Loading stack corpus and dictionary ...")
        question_corpus = self.stack_importer.get_question_corpus()
        answer_corpus = self.stack_importer.get_answer_corpus()

        corpus = question_corpus + answer_corpus
        dictionary = self.stack_importer.get_dictionary_from_corpora(
            [question_corpus, answer_corpus])
        dict_size = len(dictionary)

        # Assign sequential ids to the tokens in token2id's iteration
        # order; these ids (not gensim's own) index the vectors below.
        stack_dict = {}
        for word_id, word in enumerate(dictionary.token2id):
            stack_dict[unicode(word)] = word_id

        self.idf_values = zeros(dict_size)

        def vectorize(body):
            # One dense tf-idf vector for a single document body; words
            # absent from stack_dict are ignored.
            vector = zeros(dict_size)
            for word in body:
                word_id = stack_dict.get(unicode(word), -1)
                if word_id != -1:
                    vector[word_id] = self.tf_idf(word, word_id, body, corpus)
            return vector

        logging.info("Determining question vectors ...")
        questions = StackCorpus(self.stack_importer.connection, "question")
        for question in questions:
            self.question_vectors[question.id] = vectorize(question.body)

        logging.info("\n\tDone.")

        if only_questions:  # Skip the answers
            return stack_dict

        logging.info("Determining answer vectors ...")
        answers = StackCorpus(self.stack_importer.connection, "answer")
        for answer in answers:
            self.answer_vectors[answer.id] = vectorize(answer.body)

        logging.info("\n\tDone.")

        return stack_dict
Beispiel #2
0
    def calculate_similarities(self):
        """ Applies the ESA algorithm to the global stack data.

        Computes an ESA vector per question and per answer and persists
        the pairwise question/answer similarities via esa_importer.
        """

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_concept_doc_relation()
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        logging.info("Calculating stack tfidf vectors ...")
        stack_dictionary = self._create_tf_idf_stack_vectors()

        # For each question calculate similarity with each answer
        logging.info("\nCalculating questions-answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:
            q_vector = self.get_esa_vector(question.id, question.body,
                                           self.question_vectors[question.id],
                                           stack_dictionary, 1)
            # Norm is computed once and reused for every answer comparison.
            q_vector_norm = norm(q_vector)
            similarities = []

            # NOTE(review): a fresh answer cursor is built per question --
            # presumably a StackCorpus cannot be re-iterated; confirm.
            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")

            for answer in answer_corpus:
                a_vector = self.get_esa_vector(answer.id, answer.body,
                                               self.answer_vectors[answer.id],
                                               stack_dictionary, 2)
                sim = self.similarity(q_vector, q_vector_norm, a_vector)
                similarities.append((question.id, answer.id, sim))

            # Save similarities to database after each question
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")
Beispiel #3
0
    def calculate_similarities(self):
        """ Calculates LDA topic-based similarities between every question
        and every answer, and persists them via lda_importer. """

        # Open database connections
        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        # Clean similarity table
        self.lda_importer.create_clean_similarities_table()

        self._learn_model()

        logging.info("Loading dictionary ...")
        self._load_dictionary()

        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            # Use logging instead of print, consistently with the rest
            # of the module.
            logging.info("Question %s", str(question.id))
            similarities = []
            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")

            # Get topics in the question
            bow = self.dictionary.doc2bow(question.body)
            question_topics = self.model[bow]

            for answer in answer_corpus:

                # Get topics in the answer
                bow = self.dictionary.doc2bow(answer.body)
                answer_topics = self.model[bow]

                # Similarities
                similarities.append(
                    (question.id, answer.id,
                     self._compare_documents(question_topics, answer_topics)))

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.lda_importer.save_similarities(similarities)

        # Close database connections
        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()
Beispiel #4
0
    def run_experiment_2_avg(self,
                             experiment_type='2_avg',
                             algorithm='lda_local_2'):
        """ Runs experiment 2 (average precision/recall over all answer
        limits) on the LDA similarities and writes the resulting
        precision-recall curve to the database and to a .dat file.

        :param experiment_type: label used to store/plot the results.
        :param algorithm: suffix used for the output file name.
        """

        self.experiments.open_experiment_db()

        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        total_answers = self.stack_importer.get_number_of_answers()

        # Get number of answers for each question
        number_of_answers = self.stack_importer.get_number_of_original_answers(
        )

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_answers = {}
        original_answers = {}

        for question in question_corpus:
            original_answers[question.id] = (
                self.stack_importer.get_question_original_answers(question.id))
            similar_answers[question.id] = (
                self.lda_importer.load_similarities_for_question(
                    question.id, -1, False))

        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, total_answers + 1):
            # Use logging instead of print, consistently with the rest
            # of the module.
            logging.info("Calculating with limit %s", str(limit))

            avg_precision, avg_recall = self.experiments.run_experiment_1_avg(
                number_of_answers, original_answers, similar_answers,
                experiment_type, limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them in a file
        folder = self.setting[
            "experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
        self.experiments.write_pr_curve(experiment_type, folder)

        self.experiments.close_experiment_db()

        logging.info("\nDone")
Beispiel #5
0
    def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'):
        """ Similar to experiment_1, but checking users instead of answers.

        :param algorithm: suffix used for the output file name.
        :param experiment_type: label used to store/plot the results.
        """

        self.experiments.open_experiment_db()
        self.esa_importer.open_esa_db()
        self.stack_importer.open_stack_db()

        # Get the number of active users
        active_users = len(self.stack_importer.get_active_users())

        # Get the users that gave an answer to each question
        asked_users = self.stack_importer.get_original_users()

        # Load similarities for each question
        logging.info("Loading similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")
        similar_users = {}
        original_users = {}

        for question in question_corpus:

            # Only questions that actually have answering users are kept.
            aux = asked_users.get(question.id, None)
            if aux is not None:
                original_users[question.id] = aux
                similar_users[question.id] = (
                    self.esa_importer.load_similarities_for_question(
                        question.id, -1, False))

        self.stack_importer.close_stack_db()
        self.esa_importer.close_esa_db()

        # Calculate avg precision and recall for each case
        precision = {}
        recall = {}
        for limit in xrange(1, active_users + 1):
            logging.info("Calculating with limit %s", str(limit))

            avg_precision, avg_recall = self.experiments.run_experiment_3_avg(
                asked_users, original_users, similar_users, experiment_type,
                limit)
            precision[limit] = avg_precision
            recall[limit] = avg_recall

        # Save into the database
        self.experiments.save_experiment_results(experiment_type, precision,
                                                 recall)

        # Write them in a file
        folder = self.setting[
            "experiments_folder"] + experiment_type + '_' + algorithm + '.dat'
        self.experiments.write_pr_curve(experiment_type, folder)

        self.experiments.close_experiment_db()

        logging.info("\nDone")
Beispiel #6
0
    def calculate_tf_idf_similarities(self):
        """Applies the TF-IDF algorithm to the global stack data.

        Compares every question vector against every answer vector and
        persists the similarities via esa_importer.
        """

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_similarities_table()

        logging.info("Calculating stack tfidf vectors ...")
        # Called for its side effect of filling self.question_vectors and
        # self.answer_vectors; the returned stack dictionary is not needed
        # for the plain tf-idf comparison.
        self._create_tf_idf_stack_vectors()

        # For each question calculate similarity with each answer
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        logging.info("\nCalculating questions-answers similarities ...")
        for question in question_corpus:
            q_vector = self.question_vectors[question.id]
            # Norm is computed once and reused for every answer comparison.
            q_vector_norm = norm(q_vector)
            similarities = []

            answer_corpus = StackCorpus(self.stack_importer.connection,
                                        "answer")
            for answer in answer_corpus:
                a_vector = self.answer_vectors[answer.id]
                sim = self.similarity(q_vector, q_vector_norm, a_vector)
                similarities.append((question.id, answer.id, sim))

            # Save similarities to database after each question
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")
Beispiel #7
0
    def calculate_esa_similarities_to_users(self):
        """ Calculates ESA similarities between each question and each
        active user (represented by the user's aggregated tf-idf body),
        persisting them via esa_importer. """

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        logging.info("Calculating questions tfidf vectors ...")
        # Answers are skipped: users are vectorized individually below.
        stack_dictionary = self._create_tf_idf_stack_vectors(
            only_questions=True)

        # For each question determine which other users would have been asked
        logging.info("Calculating questions/users similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        users = self.stack_importer.get_active_users()

        for question in question_corpus:
            # Use logging instead of print, consistently with the rest
            # of the module.
            logging.info("Question %s", str(question.id))
            q_vector = self.get_esa_vector(question.id, question.body,
                                           self.question_vectors[question.id],
                                           stack_dictionary, 1)
            q_vector_norm = norm(q_vector)
            similarities = []

            for user_id in users:
                user_body = self._create_user_tf_idf_stack_vector(
                    user_id, stack_dictionary)
                u_vector = self.get_esa_vector(user_id, user_body,
                                               self.user_vectors[user_id],
                                               stack_dictionary, 2)
                sim = self.similarity(q_vector, q_vector_norm, u_vector)
                similarities.append((question.id, user_id, sim))

            # Save similarities to database after each question
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")
Beispiel #8
0
    def calculate_local_esa_similarities(self):
        """ Applies the ESA algorithm to the local stack data.
        This local data is measured per user. Returns the list
        of filtered users (those with 5 or fewer answers). """

        # Keep filtered users
        filtered_users = []

        # Open database connections
        self.stack_importer.open_stack_db()
        self.esa_importer.open_esa_db()

        # Clean tables
        logging.info("Cleaning similarity tables ...")
        self.esa_importer.create_clean_similarities_table()

        logging.info("Loading the inverted index ...")
        self.esa_importer.get_pruned_inverted_index(self.inverted_index)

        # For each question calculate its similarity with all the answers given
        # by the users who answered the given question
        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            # Use logging instead of print, consistently with the rest
            # of the module.
            logging.info("Question %s", str(question.id))
            similarities = []

            # Get the users that gave an answer to the question
            users = self.stack_importer.get_users_from_question(question.id)
            logging.info("Users that replied: %s", str(len(users)))

            # Calculate the similarities of question with all
            # answers from the given users (related or not to question)
            for user_id in users:
                user_answers = self.stack_importer.get_user_answers_to_questions(
                    user_id)

                # Only consider users with more than 5 answers
                if len(user_answers) > 5:
                    logging.info("User %s", str(user_id))

                    # Calculate tf_idf vectors for the given user; the
                    # per-user vectors replace the previous user's ones.
                    self.question_vectors.clear()
                    self.answer_vectors.clear()
                    stack_dictionary = self._create_local_tf_idf_stack_vectors(
                        user_id)

                    q_vector = self.get_esa_vector(
                        question.id, question.body,
                        self.question_vectors[question.id], stack_dictionary,
                        1)
                    q_vector_norm = norm(q_vector)

                    for answer in user_answers:
                        a_vector = self.get_esa_vector(
                            answer.id, answer.body,
                            self.answer_vectors[answer.id], stack_dictionary,
                            2)
                        sim = self.similarity(q_vector, q_vector_norm,
                                              a_vector)
                        similarities.append((question.id, answer.id, sim))

                else:
                    filtered_users.append(user_id)

            # Save similarities to database after each question
            logging.info("\nSaving similarities to database ...")
            self.esa_importer.save_similarities(similarities)

        self.esa_importer.close_esa_db()
        self.stack_importer.close_stack_db()

        logging.info("\nDone")

        return filtered_users
Beispiel #9
0
    def calculate_local_similarities(self):
        """ Calculates similarities between local questions/answers.
        A per-user LDA model is learned for each answering user.
        Returns the list of filtered users (5 or fewer answers). """

        # Keep filtered users
        filtered_users = []

        # Open database connections
        self.lda_importer.open_lda_db()
        self.stack_importer.open_stack_db()

        # Clean similarity table
        self.lda_importer.create_clean_similarities_table()

        # For each question calculate its similarity with all the answers given
        # by the users who answered the given question
        logging.info("Calculating questions/answers similarities ...")
        question_corpus = StackCorpus(self.stack_importer.connection,
                                      "question")

        for question in question_corpus:

            # Use logging instead of print, consistently with the rest
            # of the module.
            logging.info("Question %s", str(question.id))

            similarities = []

            # Get the users that gave an answer to the question
            users = self.stack_importer.get_users_from_question(question.id)
            logging.info("Users that replied: %s", str(len(users)))

            # Calculate the similarities of question with all
            # answers from the given users (related or not to question)
            for user_id in users:
                user_answers = self.stack_importer.get_user_answers_to_questions(
                    user_id)

                # Only consider users with more than 5 answers
                if len(user_answers) > 5:
                    logging.info("User %s", str(user_id))

                    self._learn_local_model(user_id)

                    # Get topics in the question
                    bow = self.dictionary.doc2bow(question.body)
                    question_topics = self.model[bow]

                    # Get topics in the answers and calculate similarities with current question
                    for answer in user_answers:
                        bow = self.dictionary.doc2bow(answer.body)
                        answer_topics = self.model[bow]

                        # Similarities
                        similarities.append(
                            (question.id, answer.id,
                             self._compare_documents(question_topics,
                                                     answer_topics)))
                else:
                    filtered_users.append(user_id)

            # Save similarities to database
            logging.info("\nSaving similarities to database ...")
            self.lda_importer.save_similarities(similarities)

        # Close database connections
        self.stack_importer.close_stack_db()
        self.lda_importer.close_lda_db()

        return filtered_users