def __set_attributes(self):
    """
    Set the attributes consisting of a list of words of all attributes in the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as f:
        batches = s.load(f)
    length = len(batches)
    attributes = []
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        # Collect every distinct word of this batch's documents.
        attributes += list(set(chain(*docs_list)))
        # Deduplicate across batches so that no word occurs twice.
        attributes = list(set(attributes))
        if self.acceptance_lst is not None:
            # Only consider words present in the acceptance list.
            attributes = list(set(attributes).intersection(self.acceptance_lst))
        print("Processed attribute " + str(processed) + " of " + str(length) + " batches")
        processed += 1

    # Find attributes of the most common words: count occurrences of each
    # attribute across all batches (counts start at 0, not None, so the
    # final sort never compares mixed types).
    d = dict.fromkeys(attributes, 0)
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        for w in chain(*docs_list):
            if w in d:  # Ignore words filtered out of the attribute set.
                d[w] += 1
        print("Processed summing " + str(processed) + " of " + str(length) + " batches")
        processed += 1
    # Keep the self.max_words_matrix most frequent words.
    sorted_att = sorted(d.items(), key=lambda x: x[1])
    sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix:]
    attributes = [elem[0] for elem in sorted_att]

    # Serialize attributes
    with open(env_paths.get_attributes_path(self.training), "wb") as f:
        s.dump(attributes, f)
    return attributes
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows, columns corresponding to documents, words respectively.

    @param index_lookup: A dictionary with keys for the attributes. In order to know which column should be incremented in word_matrix.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as f:
        batches = s.load(f)
    length = len(batches)
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        # One row per document, one column per attribute word.
        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        for row, doc in enumerate(docs_list):
            for token in doc:
                try:  # Skip words that are not attributes of the matrix.
                    col = index_lookup[token]
                except KeyError:
                    continue
                bag_of_words_matrix[row, col] += 1
        # Serialize bag of words
        with open(env_paths.get_bow_matrix_path(self.training, batch), "wb") as f:
            s.dump(bag_of_words_matrix.tolist(), f)
        print("Processed " + str(processed) + " of " + str(length) + " batches")
        processed += 1
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows, columns corresponding to documents, words respectively.

    @param index_lookup: A dictionary with keys for the attributes. In order to know which column should be incremented in word_matrix.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    total = len(batches)
    for batch_no, batch in enumerate(batches, start=1):
        docs = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # One row per document, one column per attribute word.
        matrix = zeros([len(docs), len(index_lookup)])
        for doc_idx, doc in enumerate(docs):
            for word in doc:
                # Words outside the attribute vocabulary are ignored.
                if word in index_lookup:
                    matrix[doc_idx, index_lookup[word]] += 1
        # Serialize bag of words
        s.dump(matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
        print('Processed ' + str(batch_no) + ' of ' + str(total) + ' batches')
def __set_attributes(self):
    """
    Set the attributes consisting of a list of words of all attributes in the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    total = len(batches)
    attributes = []
    for batch_no, batch in enumerate(batches, start=1):
        doc_batch = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # Merge this batch's vocabulary into the running attribute set
        # so that no word occurs twice.
        attributes = list(set(attributes).union(chain(*doc_batch)))
        if self.acceptance_lst is not None:
            # Only consider words present in the acceptance list.
            attributes = list(set(attributes).intersection(self.acceptance_lst))
        print('Processed attribute ' + str(batch_no) + ' of ' + str(total) + ' batches')

    # Find attributes of the most common words.
    counts = dict.fromkeys(attributes)
    for batch_no, batch in enumerate(batches, start=1):
        doc_batch = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        for word in chain(*doc_batch):
            if word not in counts:  # Ignore words filtered out of the attribute set.
                continue
            counts[word] = 1 if counts[word] is None else counts[word] + 1
        print('Processed summing ' + str(batch_no) + ' of ' + str(total) + ' batches')

    # Keep the self.max_words_matrix most frequent words.
    by_frequency = sorted(counts.items(), key=lambda item: item[1])
    top = by_frequency[len(by_frequency) - self.max_words_matrix:]
    attributes = [word for word, _ in top]

    # Serialize attributes
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
def __save_batch_loading_docs(self, batch_number, docs_list, docs_names, class_indices):
    """
    Save batches for the document loading process in the initialization phase. This is done due to vast sizes of data - lack of memory.

    @param batch_number: Representing the number of documents in the batch.
    @param docs_list: List containing a string for each document in the batch.
    @param docs_names: List containing the names of each document in the same order as the docs_list.
    @param class_indices: List containing which class/folder each document belongs to.
    """
    # Serialize all relevant variables
    payloads = [
        (docs_list, env_paths.get_doc_list_path(self.training, batch_number)),
        (docs_names, env_paths.get_doc_names_path(self.training, batch_number)),
        (class_indices, env_paths.get_class_indices_path(self.training, batch_number)),
    ]
    for data, path in payloads:
        s.dump(data, open(path, "wb"))