def get_attributes(training=True):
    """
    Load the serialized attribute (word) list from disk.

    @param training: True to load the training set attributes, False for the test set.
    """
    with open(env_paths.get_attributes_path(training), "rb") as f:
        return s.load(f)
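# Usage sketch (not part of the original module): one way a consumer might turn the
# persisted attribute list into a word -> column-index lookup, mirroring what
# generate_bows() does further down. The helper name below is illustrative only and
# assumes the same module-level `s` and `env_paths` imports as the surrounding code.
def build_index_lookup(training=True):
    attributes = get_attributes(training)
    return dict(zip(attributes, range(len(attributes))))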
    def __set_attributes(self):
        """
        Build the attribute list: the vocabulary of words that make up the
        columns of the bag-of-words matrix.

        @return: The generated list of words acting as attributes for the BOWs.
        """
        batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
        length = len(batches)
        attributes = []
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            tmp_attributes = list(set(chain(*docs_list)))  # Collect the distinct words of this batch.
            attributes += tmp_attributes
            attributes = list(set(attributes))  # Deduplicate so no word occurs twice in the attribute list.
            if self.acceptance_lst is not None:
                attributes = list(
                    set(attributes).intersection(self.acceptance_lst)
                )  # Only consider words in the acceptance list.
            print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
            processed += 1

        # Count word occurrences so the most common words can be kept as attributes.
        d = dict.fromkeys(attributes)
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            words = list(chain(*docs_list))
            for w in words:
                try:
                    if d[w] is None:
                        d[w] = 1
                    else:
                        d[w] += 1
                except KeyError:
                    continue
            print "Processed summing " + str(processed) + " of " + str(length) + " batches"
            processed += 1
        # Keep only the max_words_matrix most frequent words.
        sorted_att = sorted(d.items(), key=lambda x: x[1])
        sorted_att = sorted_att[-self.max_words_matrix:]
        attributes = [elem[0] for elem in sorted_att]

        # Serialize attributes
        s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
        return attributes
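    # Illustrative alternative (not part of the original class): the two passes in
    # __set_attributes() amount to picking the self.max_words_matrix most frequent words.
    # The hypothetical helper below sketches the same selection with collections.Counter,
    # still reading one batch at a time; it assumes the same module-level `s`, `env_paths`
    # and `chain` imports as the surrounding code.
    def __top_words_sketch(self, batches, candidate_words):
        from collections import Counter
        candidate = set(candidate_words)
        counts = Counter()
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            counts.update(w for w in chain(*docs_list) if w in candidate)
        # most_common() returns (word, count) pairs in descending order of frequency.
        return [w for w, _ in counts.most_common(self.max_words_matrix)]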
    def generate_bows(self):
        """
        Run through all steps of the data processing to generate the BOWs for a training set and/or a test set.

        1. Take all serialized stemmed documents and assign them to batches. Each batch should contain an
        equal number of docs from each category, except the last batch.
        2. Count word occurrences and extract an attribute list of the X (word_count) most used words.
        3. Generate the BOWs for all batches.

        The BOWs will be saved in an output folder of the project root.
        """
        print "Data Processing Started"
        timer = time()
        completed = self.__read_docs_from_filesystem()
        if not completed:
            print "Data processing ended with an error."
            return
        print "Time ", time() - timer

        print "Filtering Words"
        timer = time()
        # Build the attribute list, or reuse the training set attributes when processing a test set.
        if self.trainingset_attributes is None:
            attributes = self.__set_attributes()
        else:
            attributes = self.trainingset_attributes
            s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
        print "Time ", time() - timer

        print "Generate bag of words matrix"
        timer = time()
        # Generate a dictionary for lookup of the words
        index_lookup = dict(zip(attributes, range(len(attributes))))
        # Generate word matrix
        self.__generate_word_matrix(index_lookup)
        print "Time ", time() - timer