def compare_real_data_to_reconstructed_data():
    """
    Visualise a sample of test-set documents next to their DBN reconstructions.

    Picks the first document seen for each distinct class index (at most 10),
    pushes each through the network weights and plots original vs. output.
    """
    with open(env_paths.get_dbn_weight_path(), "rb") as f:
        weights = s.load(f)
    with open(env_paths.get_batches_path(train=False), "rb") as f:
        batches = s.load(f)
    # NOTE(review): the ".0" strip looks like a workaround for a float batch
    # number leaking into the path -- confirm against env_paths.
    indices_path = env_paths.get_class_indices_path(False, batches[0]).replace(".0", "")
    with open(indices_path, "rb") as f:
        class_indices = s.load(f)

    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch, training=False)

    # One representative document per class index, capped at 10 classes.
    # (Renamed from `dict`, which shadowed the builtin.)
    class_to_doc = {}
    for row, idx in enumerate(class_indices):
        if idx in class_to_doc:
            continue
        class_to_doc[idx] = data[row]
        if len(class_to_doc) >= 10:
            break
    print(class_to_doc.keys())

    data_points = class_to_doc.values()
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # append the bias unit before the forward pass
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
def get_class_indices(batch, training=True):
    """
    Get all class indices of the documents in a batch.

    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    # Bug fix: the original passed the path string (and "rb") directly to
    # s.load() without opening the file; siblings open() the path first.
    with open(env_paths.get_class_indices_path(training, batch), "rb") as f:
        return s.load(f)
def get_document_class(row, batch, training=True):
    """
    The class of a document corresponding to a row in a batch.

    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    # Use context managers so the pickle files are closed deterministically
    # (the original leaked both handles).
    with open(env_paths.get_class_indices_path(training, batch), "rb") as f:
        class_indices_for_batch = s.load(f)
    with open(env_paths.get_class_names_path(training), "rb") as f:
        class_names_for_batch = s.load(f)
    return class_names_for_batch[class_indices_for_batch[row]]
def get_all_class_indices(training=True):
    """
    Get all class indices for all batches in one list.

    @param training: is this the training set or the test set.
    """
    indices_collected = []
    for batch in get_batch_list(training):
        # Close each batch file as soon as it is read (the original leaked
        # one handle per batch).
        with open(env_paths.get_class_indices_path(training, int(batch)), "rb") as f:
            indices_collected += list(s.load(f))
    return indices_collected
def __save_batch_loading_docs(self, batch_number, docs_list, docs_names, class_indices):
    """
    Save batches for the document loading process in the initialization phase.
    This is done due to vast sizes of data - lack of memory.

    @param batch_number: the number of this batch (used to name the files on disk).
    @param docs_list: List containing a string for each document in the batch.
    @param docs_names: List containing the names of each document in the same order as the docs_list.
    @param class_indices: List containing which class/folder each document belongs to.
    """
    # Serialize all relevant variables, closing each file handle explicitly
    # (the original left all three handles open).
    with open(env_paths.get_doc_list_path(self.training, batch_number), "wb") as f:
        s.dump(docs_list, f)
    with open(env_paths.get_doc_names_path(self.training, batch_number), "wb") as f:
        s.dump(docs_names, f)
    with open(env_paths.get_class_indices_path(self.training, batch_number), "wb") as f:
        s.dump(class_indices, f)
def save_batch(batch, batch_lbl, batchno, training):
    """
    Persist one batch and its class labels to disk.

    @param batch: bag-of-words matrix for the batch.
    @param batch_lbl: class index of each document in the batch.
    @param batchno: the number of the batch.
    @param training: is this the training set or the test set.
    """
    # Context managers ensure both pickle files are flushed and closed
    # (the original leaked both handles).
    with open(env_paths.get_class_indices_path(training, batchno), "wb") as f:
        pickle.dump(batch_lbl, f)
    with open(env_paths.get_bow_matrix_path(training, batchno), "wb") as f:
        pickle.dump(batch, f)