def get_all_document_names(training=True):
    """
    Get the document names of every batch, concatenated into one list.

    The batches are visited in the order returned by get_batch_list and the
    names of each batch are appended in the order they were stored.

    @param training: is this the training set or the test set.
    @return: a list with the document names of all batches.
    """
    doc_names_collected = []
    for batch in get_batch_list(training):
        # The batch identifier is converted to int before the path is built.
        names_path = env_paths.get_doc_names_path(training, int(batch))
        # Close each file as soon as it has been read, so that a handle is
        # not left open for every batch.
        with open(names_path, "rb") as names_file:
            doc_names_collected.extend(s.load(names_file))
    return doc_names_collected
def get_document_names(batch, training=True):
    """
    Get document names.

    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    @return: the object stored in the document names file of the batch.
    """
    # Use a context manager so the file handle is closed once loading is done,
    # also when loading raises.
    with open(env_paths.get_doc_names_path(training, batch), "rb") as names_file:
        return s.load(names_file)
def get_document_name(row, batch, training=True):
    """
    The name of the document corresponding to a row in a batch.

    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    @return: the entry at index row of the document names stored for the batch.
    """
    # Load inside a context manager so the file handle is closed before the
    # lookup, also when loading raises.
    with open(env_paths.get_doc_names_path(training, batch), "rb") as names_file:
        names = s.load(names_file)
    return names[row]
def __save_batch_loading_docs(self, batch_number, docs_list, docs_names, class_indices):
    """
    Save batches for the document loading process in the initialization phase.
    This is done due to vast sizes of data - lack of memory.

    Three files are written, in this order: the document list, the document
    names and the class indices. Their paths are built by env_paths from
    self.training and batch_number.

    @param batch_number: the number of the batch; used to build the output paths.
    @param docs_list: List containing a string for each document in the batch.
    @param docs_names: List containing the names of each document in the same order as the docs_list.
    @param class_indices: List containing which class/folder each document belongs to.
    """
    # Serialize all relevant variables. Each file is written inside a context
    # manager so it is flushed and closed before the next one is opened,
    # instead of relying on garbage collection to close the handle.
    with open(env_paths.get_doc_list_path(self.training, batch_number), "wb") as docs_file:
        s.dump(docs_list, docs_file)
    with open(env_paths.get_doc_names_path(self.training, batch_number), "wb") as names_file:
        s.dump(docs_names, names_file)
    with open(env_paths.get_class_indices_path(self.training, batch_number), "wb") as indices_file:
        s.dump(class_indices, indices_file)