def main():
    # ----- settings:
    experiment_type = 1
    split_in_cross_validation_again = False
    find_ranks_in_PSA_again = False
    portion_of_test_in_dataset = 0.3
    number_of_folds = 10
    portion_of_sampled_dataset_vector = [0.02, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    classifiers_for_experiments = ['SVM', 'LDA', 'QDA', 'Random Forest',
                                   'Logistic Regression', 'Gaussian Naive Bayes']
    path_to_save = './PSA_outputs/'

    # ----- path of dataset:
    path_dataset = './dataset/Breast_cancer_dataset/wdbc_data.txt'

    # ----- read the dataset:
    print('############################## Reading dataset and splitting it to K-fold train and test sets')
    # read the text file into a pandas DataFrame:
    # https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
    data = pd.read_csv(path_dataset, sep=",", header=None)
    labels_of_classes = ['M', 'B']
    X, y = read_dataset(data=data, labels_of_classes=labels_of_classes)
    experiments = Experiments()

    # # --- saving/loading split dataset in/from folder:
    # if split_in_cross_validation_again:
    #     train_indices_in_folds, test_indices_in_folds, X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = experiments.cross_validation(X=X, y=y, n_splits=number_of_folds, test_size=portion_of_test_in_dataset)
    #     save_variable(train_indices_in_folds, 'train_indices_in_folds', path_to_save=path_to_save)
    #     save_variable(test_indices_in_folds, 'test_indices_in_folds', path_to_save=path_to_save)
    #     save_variable(X_train_in_folds, 'X_train_in_folds', path_to_save=path_to_save)
    #     save_variable(X_test_in_folds, 'X_test_in_folds', path_to_save=path_to_save)
    #     save_variable(y_train_in_folds, 'y_train_in_folds', path_to_save=path_to_save)
    #     save_variable(y_test_in_folds, 'y_test_in_folds', path_to_save=path_to_save)
    # else:
    #     file = open(path_to_save + 'train_indices_in_folds.pckl', 'rb')
    #     train_indices_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save + 'test_indices_in_folds.pckl', 'rb')
    #     test_indices_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save + 'X_train_in_folds.pckl', 'rb')
    #     X_train_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save + 'X_test_in_folds.pckl', 'rb')
    #     X_test_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save + 'y_train_in_folds.pckl', 'rb')
    #     y_train_in_folds = pickle.load(file); file.close()
    #     file = open(path_to_save + 'y_test_in_folds.pckl', 'rb')
    #     y_test_in_folds = pickle.load(file); file.close()

    # ----- experiments:
    if experiment_type == 1:
        experiments.multi_class_demo()
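# The helper read_dataset() used above is not shown. A minimal sketch of what it might look
# like, assuming the standard WDBC layout (column 0 = sample ID, column 1 = 'M'/'B' diagnosis,
# remaining columns = features) and labels encoded by their index in labels_of_classes; the
# real project's helper may differ:
def read_dataset(data, labels_of_classes):
    # drop the ID column, take the diagnosis column as labels and the remaining columns as features
    y = data.iloc[:, 1].map({label: idx for idx, label in enumerate(labels_of_classes)}).to_numpy()
    X = data.iloc[:, 2:].to_numpy()
    return X, y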
def shs_test_set_evals(size, method="msd_title", with_duplicates=True):
    """
    :param size: required prune size of the results
    :param method: (string) {default: "msd_title"} the experiment to run; available modes are
                   ["msd_title", "pre-msd_title", "mxm_lyrics", "title_mxm_lyrics", "pre-title_mxm_lyrics"]
    :param with_duplicates: (boolean) {default: True} include or exclude MSD official duplicate
                            tracks from the experiments
    :return: None; the Mean Average Precision (MAP) of the chosen experiment is logged
    """
    es = SearchModule(presets.uri_config)
    if with_duplicates:
        exp = Experiments(es, './data/test_shs.csv', presets.shs_msd)
    else:
        exp = Experiments(es, './data/test_shs.csv', presets.shs_msd_no_dup)

    LOGGER.info("\n%s with size %s and duplicates=%s" % (method, size, with_duplicates))
    if method == "msd_title":
        results = exp.run_song_title_match_task(size=size)
    elif method == "pre-msd_title":
        results = exp.run_cleaned_song_title_task(size=size)
    elif method == "mxm_lyrics":
        results = exp.run_mxm_lyrics_search_task(presets.more_like_this, size=size)
    elif method == "title_mxm_lyrics":
        results = exp.run_rerank_title_with_mxm_lyrics_task(size=size, with_cleaned=False)
    elif method == "pre-title_mxm_lyrics":
        results = exp.run_rerank_title_with_mxm_lyrics_task(size=size, with_cleaned=True)
    else:
        raise ValueError("Invalid 'method' parameter for the experiment: %s" % method)

    mean_avg_precision = exp.mean_average_precision(results)
    LOGGER.info("\nMean Average Precision (MAP) = %s" % mean_avg_precision)
    return
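# Example invocation (a sketch): run the title-match evaluation on the SHS test set, pruning
# results to the top 100 hits and excluding the MSD official duplicates.
if __name__ == '__main__':
    shs_test_set_evals(size=100, method="msd_title", with_duplicates=False)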
def main():
    settings = Settings()
    settings.Initalize_Global_Settings()

    preprocess = Preprocess(settings)
    preprocess.Load_Into_Dataframes()

    analysis = Analysis(preprocess)
    experiments = Experiments(analysis)

    data = analysis.Core(experiments)
    data_experimentals = experiments.Run_Experiments()
    models, best_fit, gals_df = analysis.Mocks_And_Models(experiments)

    plotting = Plotting(preprocess)
    plotting.Plot_Core(data, models, best_fit)
    plotting.Plot_Experiments(data, data_experimentals, models, best_fit)
def __init__(self, setting):
    self.setting = setting
    self.mallet_path = setting['malletpath']
    self.number_of_topics = setting['nooftopics']
    self.number_of_iter = setting['noofiterations']
    self.stack_importer = StackImporter(setting)
    self.lda_importer = LDAImporter(setting)
    self.experiments = Experiments(setting)
    self.model = None
    self.corpus = None
    self.dictionary = None
    self.answer_corpus = None
    directory = self.setting['lda_folder']
    file_name = 'local_lda_model' + self.setting['theme'] + '.gs'
    self.path = ''.join([directory, file_name])
def main():
    config = get_config_from_json('config.json')

    # create an instance of the model
    model = VAE(config)

    # create experiments instance
    experiments = Experiments(config, model)

    # create trainer instance
    trainer = Trainer(config, model, experiments)

    # train the model
    trainer.train()
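# get_config_from_json() is not shown above. A minimal sketch, assuming it simply parses the
# JSON file and exposes the settings with attribute-style access; the real helper may return a
# plain dict or a custom config object instead:
import json
from types import SimpleNamespace

def get_config_from_json(json_file):
    with open(json_file, 'r') as f:
        # object_hook turns every JSON object into a namespace, so fields read as config.some_field
        return json.load(f, object_hook=lambda d: SimpleNamespace(**d))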
class ESA(object): """ ESA - Explicit Semantic Analysis """ def __init__(self, setting): self.setting = setting self.idf_values = None self.wiki_corpus = None self.wiki_dictionary = None self.wiki_vectors = [] self.wiki_processor = WikiPreprocessor(setting) self.wiki_importer = WikiImporter(setting, self.wiki_processor) self.stack_corpus = None self.answer_vectors = {} self.question_vectors = {} self.user_vectors = {} self.user_content = {} self.stack_importer = StackImporter(setting) self.esa_importer = ESAImporter(setting) self.inverted_index = defaultdict(list) self.number_of_concepts = 0 self.experiments = Experiments(setting) ############################################################################### # Clean and load data ############################################################################### def clean_and_load_data(self): """ Cleans the data and saves it in a database """ self.wiki_importer.import_wiki_data() ############################################################################### # Create and manage data used by ESA algorithm ############################################################################### def build_esa_db(self): """ Initializes the ESA database """ logging.info("\nCreating ESA database ...") self.esa_importer.open_esa_db() # Initialize database self.esa_importer.create_esa_db() # Save the dictionary and corpus of the Wikipedia data self.wiki_dictionary = self.wiki_importer.build_wiki_kb() # Save the inverse document frequencies in the ESA database number_of_documents = self.wiki_dictionary.num_docs #self.wiki_importer.get_number_of_concepts() self.esa_importer.save_wiki_inverse_document_frequencies( number_of_documents) self.esa_importer.close_esa_db() def load_esa_index(self): """ Gets the inverted index from the database """ self.esa_importer.open_esa_db() self.esa_importer.get_pruned_inverted_index(self.inverted_index) logging.info("\nDone") self.esa_importer.close_esa_db() ############################################################################### # Build TF-IDF Vectors ############################################################################### def create_tf_idf_vectors(self): """ Creates them if not already in database """ self.esa_importer.open_esa_db() # Calculate tfidf vectors for the Wikipedia articles self.create_tf_idf_wiki_vectors() # Save terms and vectors to ESA db #self.esa_importer.save_inverted_index(self.wiki_vectors) logging.info("\nDone") self.esa_importer.close_esa_db() def create_tf_idf_wiki_vectors(self): """ Keeping only non-zero entries of the vectors """ wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary( ) logging.info("Retrieving idf values ...") inv_doc_freq = {} self.esa_importer.get_wiki_inverse_document_frequencies(inv_doc_freq) logging.info("Building the tfidf vectors and the inverse index ...") tfidf_model = TfidfModel(self.wiki_dictionary, inv_doc_freq) inverted_index = defaultdict(list) for document in wiki_corpus: vector = tfidf_model[document] for term_id, value in vector: inverted_index[term_id].append((document.document_id, value)) #print "Added " + str(document.document_id) logging.info("\n\tDone.") self.esa_importer.save_inverted_index(inverted_index) self.save_index_to_file(inverted_index) def _create_tf_idf_stack_vectors(self, only_questions=False): """ Create the tfidf vectors for the Stackexchange data. 
""" # Load question and answer corpus logging.info("Loading stack corpus and dictionary ...") question_corpus = self.stack_importer.get_question_corpus() answer_corpus = self.stack_importer.get_answer_corpus() corpus = question_corpus + answer_corpus dictionary = self.stack_importer.get_dictionary_from_corpora( [question_corpus, answer_corpus]) dict_size = len(dictionary) # Save stack dictionary stack_dict = {} for word_id, word in enumerate(dictionary.token2id): stack_dict[unicode(word)] = word_id self.idf_values = zeros(dict_size) logging.info("Determining question vectors ...") questions = StackCorpus(self.stack_importer.connection, "question") for question in questions: question_vector = zeros(dict_size) for word in question.body: word_id = stack_dict.get(unicode(word), -1) if word_id != -1: question_vector[word_id] = self.tf_idf( word, word_id, question.body, corpus) self.question_vectors[question.id] = question_vector logging.info("\n\tDone.") if only_questions: # Skip the answers return stack_dict logging.info("Determining answer vectors ...") answers = StackCorpus(self.stack_importer.connection, "answer") for answer in answers: answer_vector = zeros(dict_size) for word in answer.body: word_id = stack_dict.get(unicode(word), -1) if word_id != -1: tf_idf = self.tf_idf(word, word_id, answer.body, corpus) answer_vector[word_id] = tf_idf self.answer_vectors[answer.id] = answer_vector logging.info("\n\tDone.") return stack_dict def _create_local_tf_idf_stack_vectors(self, user_id): """ Create the tfidf vectors for the local Stackexchange data of the given user """ # Load question and answer corpus #logging.info("Loading stack corpus and dictionary ...") question_corpus = self.stack_importer.get_user_question_corpus(user_id) answer_corpus = self.stack_importer.get_user_answer_corpus(user_id) corpus = question_corpus + answer_corpus dictionary = self.stack_importer.get_dictionary_from_corpora( [question_corpus, answer_corpus]) dict_size = len(dictionary) # Save stack dictionary stack_dict = {} for word_id, word in enumerate(dictionary.token2id): stack_dict[unicode(word)] = word_id self.idf_values = zeros(dict_size) #logging.info("Determining question vectors ...") questions = self.stack_importer.get_user_local_questions(user_id) for question in questions: question_vector = zeros(dict_size) for word in question.body: word_id = stack_dict.get(unicode(word), -1) if word_id != -1: question_vector[word_id] = self.tf_idf( word, word_id, question.body, corpus) self.question_vectors[question.id] = question_vector #logging.info("\n\tDone.") #logging.info("Determining answer vectors ...") answers = self.stack_importer.get_user_local_answers(user_id) for answer in answers: answer_vector = zeros(dict_size) for word in answer.body: word_id = stack_dict.get(unicode(word), -1) if word_id != -1: tf_idf = self.tf_idf(word, word_id, answer.body, corpus) answer_vector[word_id] = tf_idf self.answer_vectors[answer.id] = answer_vector #logging.info("\n\tDone.") return stack_dict def _create_user_tf_idf_stack_vector(self, user_id, stack_dict): """ Create the tfidf vector representation of a user, based on her answers""" aux = self.user_content.get(user_id, None) if aux is not None: return aux user_corpus = [] user_words = [] answers = self.stack_importer.get_user_answers_to_questions(user_id) for answer in answers: user_corpus.append(answer.body) for word in answer.body: user_words.append(word) self.user_content[user_id] = user_words dict_size = len(stack_dict) user_vector = zeros(dict_size) for word in 
set(user_words): word_id = stack_dict.get(unicode(word), -1) if word_id != -1: tf_idf = self.tf_idf(word, word_id, user_words, user_corpus) user_vector[word_id] = tf_idf self.user_vectors[user_id] = user_vector return user_words @staticmethod def tf(word, document): """ Returns the normalized frequency of the word in the given document """ word_count = document.count(unicode(word)) return float(word_count) / len(document) @staticmethod def df(word, corpus): """ Returns the number of documents in the collection that contain the given word """ return sum(1 for document in corpus if unicode(word) in document) #@staticmethod def idf(self, word, corpus): """ Returns the inverse document frequency of the word in the documents collection """ return math.log(len(corpus)) / self.df(word, corpus) def tf_idf(self, word, word_index, document, corpus): """ Returns the TF-IDF value for the given word in the document of the corpus """ # Calculate the term frequency value (tf) tf = self.tf(word, document) if tf == 0.0: return 0.0 # Calculate the inverse document frequency value (idf) if self.idf_values[word_index] == 0.0: self.idf_values[word_index] = self.idf(word, corpus) return float(tf * self.idf_values[word_index]) ############################################################################### # Associations and Similarities of Stackexchange questions/answers using # Wikipedia's articles as concepts. ############################################################################### def calculate_similarities(self): """ Applies the ESA algorithm to the global stack data """ # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") self.esa_importer.create_clean_concept_doc_relation() self.esa_importer.create_clean_similarities_table() logging.info("Loading the inverted index ...") self.esa_importer.get_pruned_inverted_index(self.inverted_index) #print "Has beer " + str(self.inverted_index.get(unicode("beer"), None)) logging.info("Calculating stack tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors() # For each question calculate similarity with each answer logging.info("\nCalculating questions-answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: q_vector = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1) q_vector_norm = norm(q_vector) similarities = [] answer_corpus = StackCorpus(self.stack_importer.connection, "answer") for answer in answer_corpus: a_vector = self.get_esa_vector(answer.id, answer.body, self.answer_vectors[answer.id], stack_dictionary, 2) sim = self.similarity(q_vector, q_vector_norm, a_vector) similarities.append((question.id, answer.id, sim)) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") def calculate_tf_idf_similarities(self): """Applies the TF-IDF algorithm to the global stack data""" # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") self.esa_importer.create_clean_similarities_table() logging.info("Calculating stack tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors() # For each question calculate similarity with each 
answer question_corpus = StackCorpus(self.stack_importer.connection, "question") logging.info("\nCalculating questions-answers similarities ...") for question in question_corpus: q_vector = self.question_vectors[question.id] q_vector_norm = norm(q_vector) similarities = [] answer_corpus = StackCorpus(self.stack_importer.connection, "answer") for answer in answer_corpus: a_vector = self.answer_vectors[answer.id] sim = self.similarity(q_vector, q_vector_norm, a_vector) similarities.append((question.id, answer.id, sim)) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") def calculate_local_tfidf_similarities(self): """ Applies TF-IDF to the local stack data, in order to calculate questions/answers similarities. The local data is measured per user. Returns the list of users that were filtered. """ # Keep filtered users filtered_users = [] # Open database connections self.esa_importer.open_esa_db() self.stack_importer.open_stack_db() # Clean similarity table self.esa_importer.create_clean_similarities_table() # For each question calculate its similarity with the all the answers given # by the users who answered the given question logging.info("Calculating questions/answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: print "Question " + str(question.id) similarities = [] # Get the users that gave an answer to the question users = self.stack_importer.get_users_from_question(question.id) print "Users that replied: " + str(len(users)) # Calculate the similarities of question with all # answers from the given users (related or not to question) for user_id in users: user_answers = self.stack_importer.get_user_answers_to_questions( user_id) # Only consider users with more than 1 answer if len(user_answers) > 5: print "User " + str(user_id) a = [] for answer in user_answers: a.append(answer.id) print a # Calculate tf_idf vectors for the given user self.question_vectors.clear() self.answer_vectors.clear() stack_dictionary = self._create_local_tf_idf_stack_vectors( user_id) q_vector = self.question_vectors[question.id] q_vector_norm = norm(q_vector) for answer in user_answers: a_vector = self.answer_vectors[answer.id] sim = self.similarity(q_vector, q_vector_norm, a_vector) similarities.append((question.id, answer.id, sim)) else: filtered_users.append(user_id) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) # Close database connections self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") return filtered_users def calculate_local_esa_similarities(self): """ Applies the ESA algorithm to the local stack data. This local data is measured per user. 
Returns the list of filtered users """ # Keep filtered users filtered_users = [] # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") #self.esa_importer.create_clean_concept_doc_relation() self.esa_importer.create_clean_similarities_table() logging.info("Loading the inverted index ...") self.esa_importer.get_pruned_inverted_index(self.inverted_index) # For each question calculate its similarity with all the answers given # by the users who answered the given question logging.info("Calculating questions/answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: print "Question " + str(question.id) similarities = [] # Get the users that gave an answer to the question users = self.stack_importer.get_users_from_question(question.id) print "Users that replied: " + str(len(users)) # Calculate the similarities of question with all # answers from the given users (related or not to question) for user_id in users: user_answers = self.stack_importer.get_user_answers_to_questions( user_id) # Only consider users with more than 5 answers if len(user_answers) > 5: print "User " + str(user_id) # Calculate tf_idf vectors for the given user self.question_vectors.clear() self.answer_vectors.clear() stack_dictionary = self._create_local_tf_idf_stack_vectors( user_id) q_vector = self.get_esa_vector( question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1) q_vector_norm = norm(q_vector) for answer in user_answers: a_vector = self.get_esa_vector( answer.id, answer.body, self.answer_vectors[answer.id], stack_dictionary, 2) sim = self.similarity(q_vector, q_vector_norm, a_vector) similarities.append((question.id, answer.id, sim)) else: filtered_users.append(user_id) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") return filtered_users def get_esa_vector(self, id, document, tfidf_vector, dictionary, type): """ Creates the interpretation vector of the given document. 
- The document should be a set of tokens, already preprocessed - The vector represents the relatedness of the document with all the Wikipedia articles - Type indicates the type of document: question (1) or answer (2) """ # Interpretation vector with dimensions = Wikipedia articles interpretation = zeros(2080905) for token in set(document): documents = self.inverted_index.get(unicode(token), None) word_id = dictionary.get(unicode(token), -1) if documents is not None and word_id != -1: #print str(len(documents)) for document_id, value in documents: interpretation[document_id] += (value * tfidf_vector[word_id]) return interpretation def similarity(self, vector1, norm_vector1, vector2): """ Calculates the cosine similarity between the given vectors """ # Cosine similartity sim = float(dot(vector1, vector2) / (norm_vector1 * norm(vector2))) return sim def save_relatedness_to_file(self, file_name): self.esa_importer.open_esa_db() self.esa_importer.write_relatedness_to_file(file_name) self.esa_importer.close_esa_db() ### EXTRA ### def save_index_to_file(self, index=None, file_name='../data/ESA/index.txt'): index = defaultdict(list) # Extract it from DB self.esa_importer.open_esa_db() self.esa_importer.get_pruned_inverted_index(index) self.esa_importer.close_esa_db() # Copy to file logging.info("Saving them in a file ...") with open(file_name, 'a') as f: for word, doc_list in index.iteritems(): #print word f.write(word + '\n') f.write(' '.join([str(x) for x in doc_list])) f.write('\n') def testing_beer_concept(self): tfidf_norm_values = [] tfidf_values = [] append_values = tfidf_values.append append_norm_values = tfidf_norm_values.append self.esa_importer.open_esa_db() wiki_corpus, self.wiki_dictionary = self.esa_importer.get_wiki_corpus_dictionary( ) # IDF is fixed idf = 4.8225774331876625 df = 0 for document in wiki_corpus: content = document.content.split(' ') if unicode("beer") in content: doc_tf = defaultdict(float) size = 0 # length of the document df += 1 # Faster than Counter for word in content: doc_tf[word] += 1.0 size += 1 # Calculate tfidf value for word "beer" in Wiki data norm_value = (doc_tf[unicode("beer")] / size) * idf value = doc_tf[unicode("beer")] * idf append_values((document.document_id, value)) append_norm_values((document.document_id, norm_value)) print "DF : " + str(df) # Sort each list sorted_norm_values = sorted(tfidf_norm_values, key=itemgetter(1)) sorted_norm_values = sorted_norm_values[::-1] sorted_values = sorted(tfidf_values, key=itemgetter(1)) sorted_values = sorted_values[::-1] # Print top 10 in each list print "Normalized : " print ' , '.join( [str(id) + " " + str(value) for id, value in sorted_norm_values]) print "\nNot normalized" print ' , '.join( [str(id) + " " + str(value) for id, value in sorted_values]) self.esa_importer.close_esa_db() def prun_inverted_index(self): """ Prun the inverted index """ self.esa_importer.open_esa_db() index = EsaIndex(self.esa_importer.connection) result = [] append = result.append for term, vector in index: append((term, vector)) self.esa_importer.save_pruned_index(result) self.esa_importer.close_esa_db() ############################################################################### # Find the right person # Then, following a naive strong tie strategy, we could check for each question # which other users would have been asked following two strategies: (a) based # on the social network ties (the ones with strongest ties) and (b) based on # the content similarity (which answer is most similar to the question using # TF-IDF or 
ESA, whatever you like best). Finally, we can compare both results # with the ground truth (which users got actually asked in the dataset). ############################################################################### def calculate_esa_similarities_to_users(self): # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") self.esa_importer.create_clean_similarities_table() logging.info("Loading the inverted index ...") self.esa_importer.get_pruned_inverted_index(self.inverted_index) logging.info("Calculating questions tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors( only_questions=True) # For each question determine which other users would have been asked logging.info("Calculating questions/users similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") users = self.stack_importer.get_active_users() for question in question_corpus: print "Question " + str(question.id) q_vector = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1) q_vector_norm = norm(q_vector) similarities = [] for user_id in users: user_body = self._create_user_tf_idf_stack_vector( user_id, stack_dictionary) u_vector = self.get_esa_vector(user_id, user_body, self.user_vectors[user_id], stack_dictionary, 2) sim = self.similarity(q_vector, q_vector_norm, u_vector) similarities.append((question.id, user_id, sim)) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") def calculate_tfidf_similarities_to_users(self): # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") #self.esa_importer.create_clean_concept_doc_relation() self.esa_importer.create_clean_similarities_table() logging.info("Calculating questions tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors( only_questions=True) # For each question determine which other users would have been asked logging.info("Calculating questions/users similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") users = self.stack_importer.get_active_users() for question in question_corpus: print "Question " + str(question.id) q_vector = self.question_vectors[question.id] q_vector_norm = norm(q_vector) similarities = [] for user_id in users: user_body = self._create_user_tf_idf_stack_vector( user_id, stack_dictionary) u_vector = self.user_vectors[user_id] sim = self.similarity(q_vector, q_vector_norm, u_vector) similarities.append((question.id, user_id, sim)) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") ############################################################################### # Experiments - Calculate statistics on the data ############################################################################### def initialize_experiments(self): self.experiments.open_experiment_db() self.experiments.create_experiments_db() self.experiments.close_experiment_db() def run_experiment_1(self): self.experiments.open_experiment_db() self.experiments.run_experiment_1(True) 
self.experiments.close_experiment_db() def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'): self.experiments.open_experiment_db() self.esa_importer.open_esa_db() self.stack_importer.open_stack_db() total_answers = self.stack_importer.get_number_of_answers() # Get number of answers for each question number_of_answers = self.stack_importer.get_number_of_original_answers( ) # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_answers = {} original_answers = {} for question in question_corpus: original_answers[ question. id] = self.stack_importer.get_question_original_answers( question.id) similar_answers[ question. id] = self.esa_importer.load_similarities_for_question( question.id, -1, False) self.stack_importer.close_stack_db() self.esa_importer.close_esa_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1, total_answers + 1): logging.info("Calculating with limit %s", str(limit)) avg_precision, avg_recall = self.experiments.run_experiment_1_avg( number_of_answers, original_answers, similar_answers, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting[ "experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone") def run_experiment_2_avg(self, algorithm='esa'): """ Same as run_experiment_1_avg but similarities were calculated with local data per user """ self.run_experiment_1_avg('2_avg', algorithm) def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'): """ Similar to experiment_1, but checking users instead of answers """ self.experiments.open_experiment_db() self.esa_importer.open_esa_db() self.stack_importer.open_stack_db() # Get the number of active users active_users = len(self.stack_importer.get_active_users()) # Get the users that gave an answer to each question asked_users = self.stack_importer.get_original_users() # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_users = {} original_users = {} for question in question_corpus: aux = asked_users.get(question.id, None) if aux is not None: original_users[question.id] = aux similar_users[ question. id] = self.esa_importer.load_similarities_for_question( question.id, -1, False) self.stack_importer.close_stack_db() self.esa_importer.close_esa_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1, active_users + 1): #print "Calculating with limit " + str(limit) logging.info("Calculating with limit %s", str(limit)) avg_precision, avg_recall = self.experiments.run_experiment_3_avg( asked_users, original_users, similar_users, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting[ "experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone")
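# Note on the idf() helper in the ESA class above: it computes math.log(len(corpus)) / df,
# i.e. log(N) divided by the document frequency. The conventional inverse document frequency
# is the logarithm of the ratio, log(N / df). If the standard definition is intended, a
# corrected sketch would be:
#
#     def idf(self, word, corpus):
#         """ Inverse document frequency: log of (collection size / documents containing the word). """
#         return math.log(len(corpus) / self.df(word, corpus))
#
# This is an observation rather than a confirmed bug; the original ratio may be a deliberate variant.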
import pandas as pd

from model_training import ModelTraining
from preprocessing import Preprocessing
from metrics import Metrics
from data_source import DataSource
from experiments import Experiments
from model_inference import ModelInference

model = Experiments().run_experiment()
ModelTraining().model_training()
ModelInference().predict()
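# Design note: the three calls above run the whole pipeline at import time. If this module is
# ever imported elsewhere (e.g. to reuse DataSource or Metrics), the conventional entry-point
# guard keeps the run explicit; a sketch of the same calls behind a guard:
if __name__ == '__main__':
    model = Experiments().run_experiment()
    ModelTraining().model_training()
    ModelInference().predict()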
class SocialExperimentsNode(): def __init__(self): # parameters self.global_planner = rospy.get_param( 'social_experiments/global_planner', '') self.local_planner = rospy.get_param( 'social_experiments/local_planner', '') self.world_model_name = rospy.get_param( 'social_experiments/world_model_name', '') self.robot_model_name = rospy.get_param( 'social_experiments/robot_model_name', '') self.max_experiments = rospy.get_param( 'social_experiments/max_experiments', 100) self.path_storage = rospy.get_param('social_experiments/path_storage', '') self.robot_vel = rospy.get_param('social_experiments/robot_vel', 0.3) self.space_factor_tolerance = rospy.get_param( 'social_experiments/space_factor_tolerance', 5) self.time_factor_tolerance = rospy.get_param( 'social_experiments/time_factor_tolerance', 5) # self.start_service = rospy.get_param('social_experiments/start_service', '/regions/start') # self.goal_service = rospy.get_param('social_experiments/goal_service', '/regions/goal') self.checkpoint_services = rospy.get_param( 'social_experiments/checkpoint_services', '') if (self.checkpoint_services is ''): self.checkpoint_services = [] else: self.checkpoint_services = list( self.checkpoint_services.split(" ")) # log rospy.loginfo('global_planner: ' + self.global_planner) rospy.loginfo('local_planner: ' + self.local_planner) rospy.loginfo('world_model_name: ' + self.world_model_name) rospy.loginfo('robot: ' + self.robot_model_name) rospy.loginfo('robot vel: ' + str(self.robot_vel)) rospy.loginfo('space factor tolerance: ' + str(self.space_factor_tolerance)) rospy.loginfo('time factor tolerance: ' + str(self.time_factor_tolerance)) rospy.loginfo('max experiments: ' + str(self.max_experiments)) # rospy.loginfo('start service: ' + str(self.start_service)) # rospy.loginfo('goal service: ' + str(self.goal_service)) # rospy.loginfo('checkpoint services: ' + str(self.checkpoint_services)) print('') # data self.data = [] # init experiments self.ex = Experiments(self.global_planner, self.local_planner, self.world_model_name, self.robot_model_name) def start_experiments(self): # experiments loop for i in range(0, self.max_experiments): rospy.loginfo('Preparing experiment %i/%i' % (i + 1, self.max_experiments)) self.data.append(Data()) rospy.loginfo('Fiding checkpoints...') self.data[-1].checkpoints = self.ex.get_checkpoints_random( "/regions/path") self.data[-1].path_executed.append( self.data[-1].checkpoints[0].pose.position) for n, cp in enumerate(self.data[-1].checkpoints): rospy.loginfo('checkpoint ' + str(n) + ': ' + '(x=' + str(cp.pose.position.x) + ',y=' + str(cp.pose.position.x) + ',ang=' + str(cp.pose.orientation.z) + ')') rospy.loginfo('Finding a path plan...') for n in range(1, len(self.data[-1].checkpoints)): plan = self.ex.find_new_path( self.data[-1].checkpoints[n - 1], self.data[-1].checkpoints[n]).poses rospy.loginfo('Path plan from checkpoint ' + str(n - 1) + ' to ' + str(n) + ': ' + str(len(plan))) self.data[-1].path_plan += plan rospy.loginfo('Total path plan size: ' + str(len(self.data[-1].path_plan))) self.ex.reset_world() rospy.loginfo('Resetting world model') self.ex.reset_model(self.world_model_name) rospy.loginfo('Resetting robot model') self.ex.reset_model(self.robot_model_name, self.data[-1].checkpoints[0].pose) rospy.loginfo("setting min dist and time to reach destination") (self.data[-1].space_min, self.data[-1].time_min) = self.ex.get_min_dist_time( self.data[-1].path_plan, self.robot_vel) rospy.loginfo('Space min: ' + str(self.data[-1].space_min) + ' meters') 
rospy.loginfo('Time min: ' + str(self.data[-1].time_min) + ' seconds') rospy.loginfo("setting max dist and time to reach destination") self.data[-1].space_max = self.data[ -1].space_min * self.space_factor_tolerance self.data[-1].time_max = self.data[ -1].time_min * self.time_factor_tolerance rospy.loginfo('Space max: ' + str(self.data[-1].space_max) + ' meters') rospy.loginfo('Time max: ' + str(self.data[-1].time_max) + ' seconds') self.ex.robot_update(self.data[-1].checkpoints[0]) rospy.loginfo('Start experiment %i/%i' % (i + 1, self.max_experiments)) self.ex.send_move_base_command(self.data[-1].checkpoints[1]) rospy.loginfo('Experiment in progress...') self.data[-1].delta_space.append(0) self.data[-1].delta_time.append(rospy.Time.now()) self.data[-1].total_space = 0 self.data[-1].total_time = 0 self.ex.start(self.data[-1]) self.ex.cancel_all_goals() rospy.loginfo('Space elapsed: ' + str(self.data[-1].total_space) + ' meters') rospy.loginfo('Time elapsed: ' + str(self.data[-1].total_time) + ' seconds') rospy.loginfo('Status: ' + self.data[-1].status) rospy.loginfo('Finish experiment ' + str(i + 1) + '/' + str(self.max_experiments)) print('') def generate_csv(self): # print params file_params = open(self.path_storage + "/params.yaml", "w+") file_params.write("environment: " + str(self.world_model_name) + "\n") file_params.write("robot_name: " + str(self.robot_model_name) + "\n") file_params.write("robot_vel: " + str(self.robot_vel) + "\n") file_params.write("space_factor_tolerance: " + str(self.space_factor_tolerance) + "\n") file_params.write("time_factor_tolerance: " + str(self.time_factor_tolerance) + "\n") file_params.write("max_experiments: " + str(self.max_experiments) + "\n") file_params.close() # print real time factor file_factor = open(self.path_storage + "/real_time_factor.json", "w+") i = 0 list_f = [] for e1 in self.data: list_f.append('"' + str(i) + '":[' + ','.join([str(x) for x in e1.factor_array]) + ']') i += 1 file_factor.write('{' + ',\n'.join([str(x) for x in list_f]) + '}') file_factor.close() # print localization error file_loc_err = open(self.path_storage + "/localization_error.json", "w+") i = 0 list_e = [] for e1 in self.data: list_e.append( '"' + str(i) + '":[' + ','.join([str(x) for x in e1.localization_error_array]) + ']') i += 1 file_loc_err.write('{' + ',\n'.join([str(x) for x in list_e]) + '}') file_loc_err.close() # print path plan file_path_min_x = open(self.path_storage + "/path_plan_x.json", "w+") file_path_min_y = open(self.path_storage + "/path_plan_y.json", "w+") i = 0 list_ex = [] list_ey = [] for e1 in self.data: list_x = [] list_y = [] for e2 in e1.path_plan: list_x.append(e2.pose.position.x) list_y.append(e2.pose.position.y) list_ex.append('"' + str(i) + '":[' + ','.join([str(x) for x in list_x]) + ']') list_ey.append('"' + str(i) + '":[' + ','.join([str(y) for y in list_y]) + ']') i += 1 file_path_min_x.write('{' + ',\n'.join([str(x) for x in list_ex]) + '}') file_path_min_y.write('{' + ',\n'.join([str(y) for y in list_ey]) + '}') file_path_min_x.close() file_path_min_y.close() # print path executed file_path_elapsed_x = open(self.path_storage + "/path_executed_x.json", "w+") file_path_elapsed_y = open(self.path_storage + "/path_executed_y.json", "w+") i = 0 list_ex = [] list_ey = [] for e1 in self.data: list_x = [] list_y = [] for e2 in e1.path_executed: list_x.append(e2.x) list_y.append(e2.y) list_ex.append('"' + str(i) + '":[' + ','.join([str(x) for x in list_x]) + ']') list_ey.append('"' + str(i) + '":[' + ','.join([str(y) for y in list_y]) + 
']') i += 1 file_path_elapsed_x.write('{' + ',\n'.join([str(x) for x in list_ex]) + '}') file_path_elapsed_y.write('{' + ',\n'.join([str(y) for y in list_ey]) + '}') file_path_elapsed_x.close() file_path_elapsed_y.close() # print people file_people = open(self.path_storage + "/people.json", "w+") i = 0 list_1 = [] for e1 in self.data: list_2 = [] for e2 in e1.people_array: list_3 = [] for e3 in e2: list_3.append('[' + str(e3.position.x) + ',' + str(e3.position.y) + ']') list_2.append('[' + ','.join([str(x) for x in list_3]) + ']') list_1.append('"' + str(i) + '":[' + ','.join([str(x) for x in list_2]) + ']') i += 1 file_people.write('{' + ',\n'.join([str(x) for x in list_1]) + '}') file_people.close() # print result file_result = open(self.path_storage + "/result.csv", "w+") file_result.write( "i,start_x,start_y,start_ang,goal_x,goal_y,goal_ang," + "space_min,time_min,space_elapsed,time_elapsed,status\n") i = 0 for e1 in self.data: (_, _, start_yaw) = tf.transformations.euler_from_quaternion([ e1.checkpoints[0].pose.orientation.x, e1.checkpoints[0].pose.orientation.y, e1.checkpoints[0].pose.orientation.z, e1.checkpoints[0].pose.orientation.w ]) (_, _, goal_yaw) = tf.transformations.euler_from_quaternion([ e1.checkpoints[-1].pose.orientation.x, e1.checkpoints[-1].pose.orientation.y, e1.checkpoints[-1].pose.orientation.z, e1.checkpoints[-1].pose.orientation.w ]) file_result.write(str(i) + ",") file_result.write(str(e1.checkpoints[0].pose.position.x) + ",") file_result.write(str(e1.checkpoints[0].pose.position.y) + ",") file_result.write(str(start_yaw) + ",") file_result.write(str(e1.checkpoints[-1].pose.position.x) + ",") file_result.write(str(e1.checkpoints[-1].pose.position.y) + ",") file_result.write(str(goal_yaw) + ",") file_result.write(str(e1.space_min) + ",") file_result.write(str(e1.time_min) + ",") file_result.write(str(e1.total_space) + ",") file_result.write(str(e1.total_time) + ",") file_result.write(str(e1.status) + "\n") i += 1 file_result.close()
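# Design note: generate_csv() above assembles the *.json files by manual string concatenation.
# A sketch of the same output for real_time_factor.json using the standard json module
# (assuming the factor_array entries are plain numbers), which avoids quoting and comma mistakes:
import json

def write_real_time_factor(path_storage, data):
    # one JSON object keyed by experiment index, each value the list of recorded factors
    payload = {str(i): list(e.factor_array) for i, e in enumerate(data)}
    with open(path_storage + "/real_time_factor.json", "w") as f:
        json.dump(payload, f)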
ESA, whatever you like best). Finally, we can compare both results # with the ground truth (which users got actually asked in the dataset). ############################################################################### def calculate_esa_similarities_to_users(self): # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") self.esa_importer.create_clean_similarities_table() logging.info("Loading the inverted index ...") self.esa_importer.get_pruned_inverted_index(self.inverted_index) logging.info("Calculating questions tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors(only_questions=True) # For each question determine which other users would have been asked logging.info("Calculating questions/users similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") users = self.stack_importer.get_active_users() for question in question_corpus: print "Question " + str(question.id) q_vector = self.get_esa_vector(question.id, question.body, self.question_vectors[question.id], stack_dictionary, 1) q_vector_norm = norm(q_vector) similarities = [] for user_id in users: user_body = self._create_user_tf_idf_stack_vector(user_id, stack_dictionary) u_vector = self.get_esa_vector(user_id, user_body, self.user_vectors[user_id], stack_dictionary, 2) sim = self.similarity(q_vector, q_vector_norm, u_vector) similarities.append( (question.id, user_id, sim) ) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") def calculate_tfidf_similarities_to_users(self): # Open database connections self.stack_importer.open_stack_db() self.esa_importer.open_esa_db() # Clean tables logging.info("Cleaning similarity tables ...") #self.esa_importer.create_clean_concept_doc_relation() self.esa_importer.create_clean_similarities_table() logging.info("Calculating questions tfidf vectors ...") stack_dictionary = self._create_tf_idf_stack_vectors(only_questions=True) # For each question determine which other users would have been asked logging.info("Calculating questions/users similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") users = self.stack_importer.get_active_users() for question in question_corpus: print "Question " + str(question.id) q_vector = self.question_vectors[question.id] q_vector_norm = norm(q_vector) similarities = [] for user_id in users: user_body = self._create_user_tf_idf_stack_vector(user_id, stack_dictionary) u_vector = self.user_vectors[user_id] sim = self.similarity(q_vector, q_vector_norm, u_vector) similarities.append( (question.id, user_id, sim) ) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.esa_importer.save_similarities(similarities) self.esa_importer.close_esa_db() self.stack_importer.close_stack_db() logging.info("\nDone") ############################################################################### # Experiments - Calculate statistics on the data ############################################################################### def initialize_experiments(self): self.experiments.open_experiment_db() self.experiments.create_experiments_db() self.experiments.close_experiment_db() def run_experiment_1(self): self.experiments.open_experiment_db() self.experiments.run_experiment_1(True) 
self.experiments.close_experiment_db() def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'): self.experiments.open_experiment_db() self.esa_importer.open_esa_db() self.stack_importer.open_stack_db() total_answers = self.stack_importer.get_number_of_answers() # Get number of answers for each question number_of_answers = self.stack_importer.get_number_of_original_answers() # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_answers = {} original_answers = {} for question in question_corpus: original_answers[question.id] = self.stack_importer.get_question_original_answers(question.id) similar_answers[question.id] = self.esa_importer.load_similarities_for_question(question.id, -1, False) self.stack_importer.close_stack_db() self.esa_importer.close_esa_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1,total_answers+1): logging.info("Calculating with limit %s", str(limit)) avg_precision, avg_recall = self.experiments.run_experiment_1_avg(number_of_answers, original_answers, similar_answers, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone") def run_experiment_2_avg(self, algorithm='esa'): """ Same as run_experiment_1_avg but similarities were calculated with local data per user """ self.run_experiment_1_avg('2_avg', algorithm) def run_experiment_3_avg(self, algorithm='esa', experiment_type='3_avg'): """ Similar to experiment_1, but checking users instead of answers """ self.experiments.open_experiment_db() self.esa_importer.open_esa_db() self.stack_importer.open_stack_db() # Get the number of active users active_users = len(self.stack_importer.get_active_users()) # Get the users that gave an answer to each question asked_users = self.stack_importer.get_original_users() # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_users = {} original_users = {} for question in question_corpus: aux = asked_users.get(question.id, None) if aux is not None: original_users[question.id] = aux similar_users[question.id] = self.esa_importer.load_similarities_for_question(question.id, -1, False) self.stack_importer.close_stack_db() self.esa_importer.close_esa_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1,active_users+1): #print "Calculating with limit " + str(limit) logging.info("Calculating with limit %s", str(limit)) avg_precision, avg_recall = self.experiments.run_experiment_3_avg(asked_users, original_users, similar_users, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting["experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone")
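# --- Illustrative sketch (not from the original module): the tf/df/idf helpers above follow
# the usual TF-IDF scheme; the conventional inverse document frequency keeps the ratio inside
# the logarithm, idf = log(N / df). Toy-sized, self-contained version:
import math


def sketch_tf(word, document):
    # normalized term frequency: occurrences of `word` over document length
    return document.count(word) / float(len(document))


def sketch_df(word, corpus):
    # number of documents in the collection that contain the word
    return sum(1 for document in corpus if word in document)


def sketch_idf(word, corpus):
    # conventional inverse document frequency: log(N / df)
    document_frequency = sketch_df(word, corpus)
    return math.log(len(corpus) / float(document_frequency)) if document_frequency else 0.0


def sketch_tf_idf(word, document, corpus):
    return sketch_tf(word, document) * sketch_idf(word, corpus)


# toy corpus standing in for the preprocessed stack posts
toy_corpus = [["beer", "ale", "hops"], ["wine", "grape", "beer"], ["tea", "leaf"]]
print(sketch_tf_idf("beer", toy_corpus[0], toy_corpus))  # (1/3) * log(3/2)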
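# --- Illustrative sketch of how get_esa_vector() and similarity() fit together: each token's
# Wikipedia postings (concept_id, weight) are scaled by the token's TF-IDF weight and summed
# into one interpretation vector per document, and documents are then compared with cosine
# similarity. Toy-sized stand-ins below; the real concept space has 2,080,905 dimensions,
# one per Wikipedia article.
import numpy as np


def sketch_esa_vector(tokens, tfidf_vector, dictionary, inverted_index, num_concepts):
    interpretation = np.zeros(num_concepts)
    for token in set(tokens):
        postings = inverted_index.get(token)      # [(concept_id, weight), ...]
        word_id = dictionary.get(token, -1)
        if postings is not None and word_id != -1:
            for concept_id, weight in postings:
                interpretation[concept_id] += weight * tfidf_vector[word_id]
    return interpretation


def sketch_cosine_similarity(vector1, vector2):
    # guard against zero vectors, which would otherwise divide by zero
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return float(np.dot(vector1, vector2) / denominator) if denominator else 0.0


toy_index = {"beer": [(0, 0.9), (2, 0.4)], "hops": [(2, 0.7)]}
toy_dictionary = {"beer": 0, "hops": 1}
q_vec = sketch_esa_vector(["beer", "hops"], np.array([0.5, 0.3]), toy_dictionary, toy_index, 4)
a_vec = sketch_esa_vector(["beer"], np.array([0.2, 0.0]), toy_dictionary, toy_index, 4)
print(sketch_cosine_similarity(q_vec, a_vec))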
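# --- Illustrative sketch of the metric behind run_experiment_1_avg / run_experiment_3_avg:
# for each cut-off `limit`, the top-`limit` most similar items are compared against the ground
# truth (the answers, or users, the question actually received) and precision/recall are
# averaged over all questions to trace a PR curve. Hypothetical per-question helper:
def sketch_precision_recall_at_k(ranked_ids, relevant_ids, k):
    retrieved = set(ranked_ids[:k])
    hits = len(retrieved & set(relevant_ids))
    precision = hits / float(k)
    recall = hits / float(len(relevant_ids)) if relevant_ids else 0.0
    return precision, recall


ranked_answers = [17, 4, 9, 23, 8]   # answer ids sorted by similarity, most similar first
true_answers = [4, 8]                # answers actually given to the question
for limit in range(1, len(ranked_answers) + 1):
    print((limit,) + sketch_precision_recall_at_k(ranked_answers, true_answers, limit))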
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""myapp runs PlanOut as a service"""
from flask import Flask, jsonify, request
from experiments import Experiments

experiments = Experiments()

# Create the application; Elastic Beanstalk expects the name "application"
app = Flask(__name__)


@app.route("/")
def get_experiments_for_team():
    """Return JSON for a team's experiments.

    get_experiments_for_team returns the JSON of all experiments
    associated with a team.

    Args:
        team_name: name of the team (group_id)
        unit: unique identifier for the user
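# --- Hedged, self-contained sketch of the request/response shape such an endpoint might have.
# The query parameter names come from the docstring above; the handler body and the response
# payload are assumptions, not something the (truncated) snippet defines.
from flask import Flask, jsonify, request

sketch_app = Flask(__name__)


@sketch_app.route("/")
def sketch_experiments_for_team():
    team_name = request.args.get("team_name")
    unit = request.args.get("unit")
    if not team_name or not unit:
        return jsonify({"error": "team_name and unit are required"}), 400
    # a real handler would look the team's experiments up, e.g. through the Experiments object
    return jsonify({"team": team_name, "unit": unit, "experiments": []})


if __name__ == "__main__":
    with sketch_app.test_client() as client:
        reply = client.get("/", query_string={"team_name": "growth", "unit": "user-123"})
        print(reply.status_code)
        print(reply.get_json())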
if args.plot: plot_times_by_batch(args.database) else: if args.load_database: exps = pkl.load(open(args.database)) else: ## Determine the type of sparsity layer to use if args.layer_class == 'HiddenRandomBlockLayer': layer_class = HiddenRandomBlockLayer else: layer_class = HiddenBlockLayer ## Create experiments exps = Experiments( input_dim=784, # data.train_set_x.shape[-1].eval(), num_classes=10) # Add descriptions of models exps.add_layers_description( 0, { 'n_hids': (25, ), 'n_units_per': args.units_per_block, 'k_pers': (1, 1), 'activations': (T.tanh, None), 'layer_classes': [ HiddenBlockLayer, HiddenBlockLayer, ], }) exps.add_layers_description(
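# --- Hedged sketch of the command-line interface the snippet above appears to assume; the flag
# names are inferred from the attributes it reads, while defaults and help text are hypothetical.
import argparse

parser = argparse.ArgumentParser(description="Block-sparse layer experiments")
parser.add_argument("--plot", action="store_true",
                    help="plot batch timings from an existing database instead of training")
parser.add_argument("--load_database", action="store_true",
                    help="load a pickled Experiments object instead of building a new one")
parser.add_argument("--database", default="experiments.pkl",
                    help="path of the pickled experiments database")
parser.add_argument("--layer_class", default="HiddenBlockLayer",
                    choices=["HiddenBlockLayer", "HiddenRandomBlockLayer"])
parser.add_argument("--units_per_block", type=int, default=32)
args = parser.parse_args()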
class LDA(object): """ LDA - Latent Dirichlet Porcesses """ def __init__(self, setting): self.setting = setting self.mallet_path = setting['malletpath'] self.number_of_topics = setting['nooftopics'] self.number_of_iter = setting['noofiterations'] self.stack_importer = StackImporter(setting) self.lda_importer = LDAImporter(setting) self.experiments = Experiments(setting) self.model = None self.corpus = None self.dictionary = None self.answer_corpus = None directory = self.setting['lda_folder'] file_name = 'local_lda_model' + self.setting['theme'] + '.gs' self.path = ''.join([directory, file_name]) def __iter__(self): for document in self.corpus: yield self.dictionary.doc2bow(document) def calculate_similarities(self): # Open database connections self.lda_importer.open_lda_db() self.stack_importer.open_stack_db() # Clean similarity table self.lda_importer.create_clean_similarities_table() self._learn_model() logging.info("Loading dictionary ...") self._load_dictionary() logging.info("Calculating questions/answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: print "Question " + str(question.id) similarities = [] answer_corpus = StackCorpus(self.stack_importer.connection, "answer") # Get topics in the question bow = self.dictionary.doc2bow(question.body) question_topics = self.model[bow] for answer in answer_corpus: # Get topics in the answer bow = self.dictionary.doc2bow(answer.body) answer_topics = self.model[bow] # Similarities similarities.append( (question.id, answer.id, self._compare_documents(question_topics, answer_topics))) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.lda_importer.save_similarities(similarities) # Close database connections self.stack_importer.close_stack_db() self.lda_importer.close_lda_db() def _learn_model(self): self.model = models.wrappers.LdaMallet( self.mallet_path, corpus=self, num_topics=self.number_of_topics, id2word=self.dictionary, iterations=self.number_of_iter) def _load_dictionary(self): self.stack_importer.open_stack_db() # Load dictionary question_corpus = self.stack_importer.get_question_corpus() answer_corpus = self.stack_importer.get_answer_corpus() corpus = question_corpus + answer_corpus self.dictionary = self.stack_importer.get_dictionary_from_corpora( [question_corpus, answer_corpus]) self.stack_importer.close_stack_db() def run_experiment_1_avg(self, experiment_type='1_avg', algorithm='esa'): self.experiments.open_experiment_db() self.lda_importer.open_lda_db() self.stack_importer.open_stack_db() total_answers = self.stack_importer.get_number_of_answers() # Get number of answers for each question number_of_answers = self.stack_importer.get_number_of_original_answers( ) # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_answers = {} original_answers = {} for question in question_corpus: original_answers[ question. id] = self.stack_importer.get_question_original_answers( question.id) similar_answers[ question. 
id] = self.esa_importer.load_similarities_for_question( question.id, -1, False) self.stack_importer.close_stack_db() self.lda_importer.close_lda_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1, total_answers + 1): logging.info("Calculating with limit %s", str(limit)) avg_precision, avg_recall = self.experiments.run_experiment_1_avg( number_of_answers, original_answers, similar_answers, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting[ "experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone") ############################################################################### # Create the local model ############################################################################### def calculate_local_similarities(self): """ Calculates similarities between local questions/answers. Returns the list of filtered users """ # Keep filtered users filtered_users = [] # Open database connections self.lda_importer.open_lda_db() self.stack_importer.open_stack_db() # Clean similarity table self.lda_importer.create_clean_similarities_table() # For each question calculate its similarity with the all the answers given # by the users who answered the given question logging.info("Calculating questions/answers similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") for question in question_corpus: print "Question " + str(question.id) similarities = [] # Get the users that gave an answer to the question users = self.stack_importer.get_users_from_question(question.id) print "Users that replied: " + str(len(users)) # Calculate the similarities of question with all # answers from the given users (related or not to question) for user_id in users: user_answers = self.stack_importer.get_user_answers_to_questions( user_id) # Only consider users with more than 1 answer if len(user_answers) > 5: print "User " + str(user_id) self._learn_local_model(user_id) # Get topics in the question bow = self.dictionary.doc2bow(question.body) question_topics = self.model[bow] # Get topics in the answers and calculate similarities with current question for answer in user_answers: bow = self.dictionary.doc2bow(answer.body) answer_topics = self.model[bow] # Similarities similarities.append( (question.id, answer.id, self._compare_documents(question_topics, answer_topics))) else: filtered_users.append(user_id) # Save similarities to databse logging.info("\nSaving similarities to database ...") self.lda_importer.save_similarities(similarities) # Close database connections self.stack_importer.close_stack_db() self.lda_importer.close_lda_db() return filtered_users def _learn_local_model(self, user_id): """ Learns the LDA model with local knowledge """ # Load question and answer corpus question_corpus = self.stack_importer.get_user_question_corpus(user_id) self.answer_corpus = self.stack_importer.get_user_answer_corpus( user_id) self.corpus = question_corpus + self.answer_corpus self.dictionary = self.stack_importer.get_dictionary_from_corpora( [question_corpus, self.answer_corpus]) # Create model self.model = models.wrappers.LdaMallet( self.mallet_path, corpus=self, num_topics=self.number_of_topics, id2word=self.dictionary, 
iterations=self.number_of_iter) @staticmethod def _compare_documents(document1, document2): """ Calculates the distance between the given documents """ doc1_topic_description = [] doc2_topic_description = [] for (topic, weight) in document1: doc1_topic_description.append(weight) for (topic, weight) in document2: doc2_topic_description.append(weight) return Metric.js_distance(doc1_topic_description, doc2_topic_description) def run_experiment_2_avg(self, experiment_type='2_avg', algorithm='lda_local_2'): self.experiments.open_experiment_db() self.lda_importer.open_lda_db() self.stack_importer.open_stack_db() total_answers = self.stack_importer.get_number_of_answers() # Get number of answers for each question number_of_answers = self.stack_importer.get_number_of_original_answers( ) # Load similarities for each question logging.info("Loading similarities ...") question_corpus = StackCorpus(self.stack_importer.connection, "question") similar_answers = {} original_answers = {} for question in question_corpus: original_answers[ question. id] = self.stack_importer.get_question_original_answers( question.id) similar_answers[ question. id] = self.lda_importer.load_similarities_for_question( question.id, -1, False) self.stack_importer.close_stack_db() self.lda_importer.close_lda_db() # Calculate avg precision and recall for each case precision = {} recall = {} for limit in xrange(1, total_answers + 1): print "Calculating with limit " + str(limit) avg_precision, avg_recall = self.experiments.run_experiment_1_avg( number_of_answers, original_answers, similar_answers, experiment_type, limit) precision[limit] = avg_precision recall[limit] = avg_recall # Save into the database self.experiments.save_experiment_results(experiment_type, precision, recall) # Write them in a file folder = self.setting[ "experiments_folder"] + experiment_type + '_' + algorithm + '.dat' self.experiments.write_pr_curve(experiment_type, folder) self.experiments.close_experiment_db() logging.info("\nDone")
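# --- Hedged sketch of the gensim/Mallet flow behind _learn_model() and calculate_similarities():
# build a dictionary, wrap documents as bag-of-words, train an LdaMallet model, then read a
# document's topic mixture back through model[bow]. Requires a local Mallet install (the path
# below is hypothetical) and a gensim version that still ships models.wrappers (< 4.0).
from gensim import corpora
from gensim.models.wrappers import LdaMallet

toy_documents = [["beer", "ale", "hops"], ["wine", "grape"], ["beer", "wine", "ale"]]
toy_dictionary = corpora.Dictionary(toy_documents)
toy_bow_corpus = [toy_dictionary.doc2bow(document) for document in toy_documents]

toy_model = LdaMallet("/opt/mallet/bin/mallet", corpus=toy_bow_corpus, num_topics=2,
                      id2word=toy_dictionary, iterations=50)
print(toy_model[toy_dictionary.doc2bow(["beer", "hops"])])   # [(topic_id, weight), ...]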
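# --- Illustrative sketch of the comparison done by _compare_documents(): the two topic-weight
# lists are treated as distributions and compared with the Jensen-Shannon distance (assuming
# Metric.js_distance implements the standard JS distance; scipy's version stands in here).
# Note this yields a distance, so smaller values mean closer topic mixtures.
import numpy as np
from scipy.spatial import distance


def sketch_js_distance(weights1, weights2):
    p = np.asarray(weights1, dtype=float)
    q = np.asarray(weights2, dtype=float)
    return float(distance.jensenshannon(p, q))


question_topics = [0.1, 0.7, 0.2]   # weights from model[dictionary.doc2bow(question.body)]
answer_topics = [0.2, 0.5, 0.3]
print(sketch_js_distance(question_topics, answer_topics))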
classification_col=2) # test test_data = np.loadtxt("data/test.txt", delimiter=",") x_test, y_test = timestamped_to_vector(test_data, timestamp_col=0, time_start=1, classification_col=2) # all data x = np.concatenate((x_train, x_test)) y = np.concatenate((y_train, y_test)) # random search of hyperparameters expt = Experiments.Experiment(Configs.get_all(), folds=10, search_algorithm="random", data=(x_train, y_train), folder_name="random_search", thresholding=True, threshold=0.5) expt.run_experiments(num_experiments=400) # Config A with separate test set params_A = Configs.get_A() params_A["sequence_length"] = list(range(1, 31)) # total real time length expt = Experiments.Experiment(params_A, search_algorithm="grid", x_test=x_test, y_test=y_test, x_train=x_train, y_train=y_train,
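# --- Hedged sketch of the random hyperparameter search driven by Experiments.Experiment above:
# sample one configuration from the full grid, score it (in the original, via k-fold cross-
# validation with thresholding), repeat for a fixed budget and keep the best. `evaluate` is a
# hypothetical stand-in for the scoring done inside Experiment.run_experiments().
import random


def sample_configuration(grid):
    return {name: random.choice(values) for name, values in grid.items()}


def random_search(grid, num_experiments, evaluate):
    best_score, best_config = float("-inf"), None
    for _ in range(num_experiments):
        config = sample_configuration(grid)
        score = evaluate(config)
        if score > best_score:
            best_score, best_config = score, config
    return best_score, best_config


toy_grid = {"sequence_length": list(range(1, 31)), "threshold": [0.4, 0.5, 0.6]}
print(random_search(toy_grid, 10, evaluate=lambda config: -abs(config["sequence_length"] - 15)))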
x_test, y_test = timestamped_to_vector(test, vector_col=v, time_start=0,
                                        classification_col=c)
x_train, y_train = timestamped_to_vector(train, vector_col=v, time_start=0,
                                         classification_col=c)

# Random search with thresholding
rand_params = Configs.get_all()
expt = Experiments.Experiment(rand_params,
                              search_algorithm="random",
                              data=(x_train, y_train),
                              folds=10,
                              folder_name="random_search_results",
                              thresholding=True,
                              threshold=0.5)

# Parameter configurations
A_B_C = Configs.get_A_B_C()

# Ensemble model
ensemble_config = Experiments.Ensemble_configurations(
    list(A_B_C.values()),
    x_test=x_test,
    y_test=y_test,
    x_train=x_train,
    y_train=y_train,
    folder_name="test_train_results",
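# --- Hedged sketch of one plausible reading of Experiments.Ensemble_configurations: each
# configuration (A, B, C) produces per-sample probabilities, the probabilities are averaged and
# the mean is thresholded (0.5, matching the threshold used above). The real class may combine
# the models differently.
import numpy as np


def sketch_ensemble_predict(probabilities_per_model, threshold=0.5):
    # probabilities_per_model: one array of predicted probabilities per configuration
    mean_probability = np.mean(np.vstack(probabilities_per_model), axis=0)
    return (mean_probability >= threshold).astype(int)


print(sketch_ensemble_predict([np.array([0.2, 0.8, 0.6]),
                               np.array([0.4, 0.9, 0.4]),
                               np.array([0.1, 0.7, 0.6])]))   # -> [0 1 1]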
def test_init(self):
    experiments = Experiments()
    self.assertEqual(experiments.getNumOfExperiments(), 0)
    self.assertEqual(experiments.getExperiments(), {})

    # Running with no models registered must raise
    try:
        experiments.runAllExperiments()
        self.fail()
    except ValueError as ve:
        self.assertEqual(str(ve), 'Experiments object has no models to run!')

    # Only Experiment instances may be added
    try:
        experiments.addExperiment('random forest')
        self.fail()
    except ValueError as ve:
        self.assertEqual(str(ve), 'Object must be Experiment object: random forest')

    # Experiment names must be strings
    try:
        experiments.addExperiment(Experiment(1))
        self.fail()
    except ValueError as ve:
        self.assertEqual(str(ve), "Experiment name attribute must be string, not <class 'int'>")

    self.assertEqual(experiments.getNumOfExperiments(), 0)

    experiments.addExperiment(Experiment('1'))
    experiments.addExperiment(Experiment('2'))
    experiments.addExperiment(Experiment('3'))
    experiments.addExperiment(Experiment('4'))
    self.assertEqual(experiments.getNumOfExperiments(), 4)
    self.assertEqual(experiments.getExperimentNames(), ['1', '2', '3', '4'])