# Assumed imports: `s` matches pickle's load/dump interface (cPickle under
# Python 2); array/zeros come from NumPy; env_paths is the project's own
# path-helper module.
import pickle
import pickle as s
from numpy import array, zeros
import env_paths


def get_bag_of_words_matrix(batch, training=True):
    """
    Retrieve the bag-of-words matrix for a batch.

    @param batch: the number of the batch.
    @param training: True for the training set, False for the test set.
    """
    with open(env_paths.get_bow_matrix_path(training, int(batch)), "rb") as f:
        return array(s.load(f))
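# Usage sketch (hypothetical batch number; assumes batch 0 of the training set
# was serialized earlier in the pipeline):
#
#     bow = get_bag_of_words_matrix(0, training=True)
#     n_docs, vocab_size = bow.shape  # rows are documents, columns are words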
    def __generate_word_matrix(self, index_lookup):
        """
        Generate a BOW matrix whose rows and columns correspond to documents
        and words, respectively.

        @param index_lookup: a dictionary mapping each word (attribute) to the
        column of word_matrix that should be incremented when the word occurs.
        """
        with open(env_paths.get_batches_path(self.training), "rb") as f:
            batches = s.load(f)
        length = len(batches)
        for processed, batch in enumerate(batches, 1):
            with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
                docs_list = s.load(f)
            bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
            for row, doc in enumerate(docs_list):
                for token in doc:
                    try:
                        col = index_lookup[token]
                        bag_of_words_matrix[row, col] += 1
                    except KeyError:  # Word is not in the dictionary; skip it.
                        continue
            # Serialize the bag-of-words matrix for this batch.
            with open(env_paths.get_bow_matrix_path(self.training, batch), "wb") as f:
                s.dump(bag_of_words_matrix.tolist(), f)
            print("Processed %d of %d batches" % (processed, length))
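    # Sketch of the index_lookup argument the method above expects (hypothetical
    # three-word vocabulary; the real mapping is built from the corpus elsewhere
    # in the project): each word is pinned to one column of the BOW matrix.
    #
    #     index_lookup = dict((w, i) for i, w in enumerate(["cat", "dog", "fish"]))
    #     # A "dog" token then increments column index_lookup["dog"] == 1.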
def save_batch(batch, batch_lbl, batchno, training):
    """Serialize a batch's class labels and its BOW matrix to their respective paths."""
    with open(env_paths.get_class_indices_path(training, batchno), "wb") as f:
        pickle.dump(batch_lbl, f)
    with open(env_paths.get_bow_matrix_path(training, batchno), "wb") as f:
        pickle.dump(batch, f)
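# Minimal round-trip sketch (hypothetical toy data; assumes the directories
# behind env_paths exist and are writable):
if __name__ == "__main__":
    toy_batch = [[1, 0, 2], [0, 3, 0]]  # two documents, three-word vocabulary
    toy_labels = [0, 1]
    save_batch(toy_batch, toy_labels, 0, True)
    print(get_bag_of_words_matrix(0, training=True))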