def get_attributes(training=True):
    """
    Get the attributes.

    @param training: whether to load the attributes of the training set or of the test set.
    """
    return s.load(open(env_paths.get_attributes_path(training), "rb"))
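# Usage sketch (not part of the original module): get_attributes simply unpickles the
# attribute list written by generate_bows()/__set_attributes. It assumes `s` is the pickle
# module used throughout this file and that the attributes have already been serialized
# for the requested set.
#
#     train_attributes = get_attributes(training=True)    # columns of the training BOW matrix
#     test_attributes = get_attributes(training=False)    # columns of the test BOW matrix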
def __set_attributes(self):
    """
    Build the attributes list: the words that act as columns (attributes) of the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    attributes = []
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        tmp_attributes = list(set(chain(*docs_list)))  # Collect the distinct words of this batch's documents.
        attributes += tmp_attributes
        attributes = list(set(attributes))  # Deduplicate so the same word does not occur twice.
        if self.acceptance_lst is not None:
            attributes = list(set(attributes).intersection(self.acceptance_lst))  # Only consider words in the acceptance list.
        print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
        processed += 1

    # Count occurrences of each attribute word to find the most common ones.
    d = dict.fromkeys(attributes)
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        words = list(chain(*docs_list))
        for w in words:
            try:
                if d[w] is None:
                    d[w] = 1
                else:
                    d[w] += 1
            except KeyError:
                continue
        print "Processed summing " + str(processed) + " of " + str(length) + " batches"
        processed += 1

    # Keep only the max_words_matrix most frequent words.
    sorted_att = sorted(d.items(), key=lambda x: x[1])
    sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix:]
    attributes = [elem[0] for elem in sorted_att]

    # Serialize attributes
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
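# The two passes above boil down to "count every word across all batches, then keep the
# self.max_words_matrix most frequent ones as attributes". A minimal in-memory sketch of
# that selection using collections.Counter (toy data, not the pickled batch files):
from collections import Counter

def _top_words_example(docs_list, max_words):
    counts = Counter(chain(*docs_list))  # word -> frequency over all documents
    return [word for word, _ in counts.most_common(max_words)]

# _top_words_example([["cat", "dog", "cat"], ["cat", "fish"]], 2) -> ["cat", "dog"]
# ("dog" vs. "fish" is an arbitrary tie-break; both occur once)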
def generate_bows(self):
    """
    Run through all steps of the data processing to generate the BOWs for a training set and/or a test set.

    1. Take all serialized stemmed documents and assign them into batches. Each batch should represent
       an equal number of docs from a category, except the last batch.
    2. Extract an attributes list corresponding to the X (word_count) most used words.
    3. Generate the BOWs for all batches. The BOWs will be saved in an output folder of the project root.
    """
    print "Data Processing Started"
    timer = time()
    completed = self.__read_docs_from_filesystem()
    if not completed:
        print "Data processing ended with an error."
        return
    print "Time ", time() - timer

    print "Filtering Words"
    timer = time()
    # Add all text of docs as a tokenized list
    if self.trainingset_attributes is None:
        attributes = self.__set_attributes()
    else:
        attributes = self.trainingset_attributes
        s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    print "Time ", time() - timer

    print "Generate bag of words matrix"
    timer = time()
    # Generate a dictionary for lookup of the words
    index_lookup = dict(zip(attributes, range(len(attributes))))
    # Generate word matrix
    self.__generate_word_matrix(index_lookup)
    print "Time ", time() - timer
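# Sketch of what the index_lookup built in generate_bows enables: mapping one tokenized
# document onto a row of word counts. __generate_word_matrix is not shown here, so this
# illustrates the idea rather than its actual implementation:
def _bow_row_example(doc_tokens, index_lookup):
    row = [0] * len(index_lookup)
    for token in doc_tokens:
        if token in index_lookup:  # words outside the attribute set are ignored
            row[index_lookup[token]] += 1
    return row

# index_lookup = {"cat": 0, "dog": 1}
# _bow_row_example(["cat", "cat", "bird"], index_lookup) -> [2, 0]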