def get_document_class(row, batch, training=True):
    """
    The class of a document corresponding to a row
    in a batch.
    
    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    class_indices_for_batch = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    class_names_for_batch = s.load(open(env_paths.get_class_names_path(training), "rb"))
    return class_names_for_batch[class_indices_for_batch[row]]
def get_document_class(row, batch, training=True):
    """
    The class of a document corresponding to a row
    in a batch.
    
    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    class_indices_for_batch = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    class_names_for_batch = s.load(open(env_paths.get_class_names_path(training), "rb"))
    return class_names_for_batch[class_indices_for_batch[row]]
    def __read_docs_from_filesystem(self):
        """
        Read all docs and assign them to batches, so that each doc category is represented equally across batches.

        """
        docs_names = []
        docs_names_split = []
        class_indices = []
        class_indices_split = []
        class_names = []
        batches = []

        print "Generating class indices and docs names list."
        doc_count = 0
        for folder in self.paths:
            docs_names_split.append([])
            class_indices_split.append([])
            class_names.append(folder.split("/")[len(folder.split("/")) - 1])
            if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
                docs = os.listdir(folder)
            elif (
                not self.trainingset_size == None and self.trainingset_attributes == None
            ):  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
                docs = os.listdir(folder)[: int(len(os.listdir(folder)) * self.trainingset_size)]
            else:  # If data processing should be done on a test set.
                docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size) :]
            for doc in docs:
                if doc.endswith(".p"):
                    # Append the name of the document to the list containing document names.
                    docs_names_split[-1].append(folder + "/" + doc)
                    class_indices_split[-1].append(len(class_names) - 1)
                    doc_count += 1

        if len(docs_names_split) == 0:  # Check if docs have been stemmed.
            print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
            return 0

        # Ensure that batches contain an even amount of docs from each category.
        print "Arranging the documents."
        if doc_count < self.batchsize:
            print "Number of documents must be greater than batchsize. Please revise the batchsize."
            return 0
        number_of_batches = doc_count / self.batchsize
        number_of_classes = len(self.paths)
        batches_collected_class_indices = []
        batches_collected_docs_names = []

        # Calculate fraction of category in each batch.
        d = {}
        for i in range(len(class_indices_split)):
            d[i] = float(len(class_indices_split[i])) / number_of_batches

        count = 0
        for i in range(number_of_batches):
            batch_class_indices = []
            batch_docs_names = []
            d_tmp = array([int(v) for v in d.values()])
            while True:
                if (
                    (len(batch_class_indices) == self.batchsize)
                    and (not doc_count - count < self.batchsize)
                    or (count == doc_count)
                ):
                    break
                if len(d_tmp[d_tmp > 0]) == 0:
                    break
                for j in range(number_of_classes):
                    if (
                        (len(batch_class_indices) == self.batchsize)
                        and (not doc_count - count < self.batchsize)
                        or (count == doc_count)
                    ):
                        break
                    if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
                        d_tmp[j] -= 1
                        count += 1
            batches_collected_class_indices.append(batch_class_indices)
            batches_collected_docs_names.append(batch_docs_names)

        for i in range(number_of_batches):
            bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
            batch_class_indices = batches_collected_class_indices[i]
            batch_docs_names = batches_collected_docs_names[i]
            if len(batch_class_indices) < bsize:
                while True:
                    if len(batch_class_indices) == bsize:
                        break
                    for j in range(number_of_classes):
                        if len(batch_class_indices) == bsize:
                            break
                        if len(class_indices_split[j]) > 0:
                            batch_class_indices.append(class_indices_split[j].pop(0))
                            batch_docs_names.append(docs_names_split[j].pop(0))

            # Shuffle the batch
            batch_class_indices_shuf = []
            batch_docs_names_shuf = []
            index_shuf = range(len(batch_class_indices))
            shuffle(index_shuf)
            for k in index_shuf:
                batch_class_indices_shuf.append(batch_class_indices[k])
                batch_docs_names_shuf.append(batch_docs_names[k])

            # Append batch to full lists
            class_indices += batch_class_indices_shuf
            docs_names += batch_docs_names_shuf

        print "Reading and saving docs from file system"
        count = 0
        class_indices_batch = []
        docs_names_batch = []
        docs_list = []
        for i in xrange(len(class_indices)):
            if (
                not count == 0 and (count % self.batchsize) == 0
            ):  # Save the batch if batchsize is reached or if the last document has been read.
                if not (len(class_indices) - count) < self.batchsize:
                    print "Read ", str(count), " of ", len(class_indices)
                    self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                    batches.append(count)
                    # Reset the lists
                    docs_list = []
                    docs_names_batch = []
                    class_indices_batch = []

            d = s.load(open(docs_names[i], "rb"))
            docs_list.append(d)
            docs_names_batch.append(docs_names[i])
            class_indices_batch.append(class_indices[i])
            count += 1

        # Save the remaining docs
        if len(docs_list) > 0:
            print "Read ", str(count), " of ", len(class_indices)
            self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
            batches.append(count)

        s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
        s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
        return 1
def get_all_class_names():
    """
    Get all class names for training set.
    """

    return s.load(open(env_paths.get_class_names_path(train=True), "rb"))
    def __read_docs_from_filesystem(self):
        """
        Read all docs and assign them to batches, so that each doc category is represented equally across batches.

        """
        docs_names = []
        docs_names_split = []
        class_indices = []
        class_indices_split = []
        class_names = []
        batches = []

        print 'Generating class indices and docs names list.'
        doc_count = 0
        for folder in self.paths:
            docs_names_split.append([])
            class_indices_split.append([])
            class_names.append(folder.split('/')[len(folder.split('/')) - 1])
            if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
                docs = os.listdir(folder)
            elif not self.trainingset_size == None and self.trainingset_attributes == None:  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
                docs = os.listdir(folder)[:int(len(os.listdir(folder)) * self.trainingset_size)]
            else:  # If data processing should be done on a test set.
                docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size):]
            for doc in docs:
                if doc.endswith('.p'):
                    # Append the name of the document to the list containing document names.
                    docs_names_split[-1].append(folder + '/' + doc)
                    class_indices_split[-1].append(len(class_names) - 1)
                    doc_count += 1

        if len(docs_names_split) == 0:  # Check if docs have been stemmed.
            print 'Documents have not been stemmed. Please stem documents in order to create bag of words matrices.'
            return 0

        # Ensure that batches contain an even amount of docs from each category.
        print 'Arranging the documents.'
        if doc_count < self.batchsize:
            print 'Number of documents must be greater than batchsize. Please revise the batchsize.'
            return 0
        number_of_batches = doc_count / self.batchsize
        number_of_classes = len(self.paths)
        batches_collected_class_indices = []
        batches_collected_docs_names = []

        # Calculate fraction of category in each batch.
        d = {}
        for i in range(len(class_indices_split)):
            d[i] = float(len(class_indices_split[i])) / number_of_batches

        count = 0
        for i in range(number_of_batches):
            batch_class_indices = []
            batch_docs_names = []
            d_tmp = array([int(v) for v in d.values()])
            while True:
                if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (
                            count == doc_count):
                    break
                if len(d_tmp[d_tmp > 0]) == 0:
                    break
                for j in range(number_of_classes):
                    if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (
                                count == doc_count):
                        break
                    if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
                        d_tmp[j] -= 1
                        count += 1
            batches_collected_class_indices.append(batch_class_indices)
            batches_collected_docs_names.append(batch_docs_names)

        for i in range(number_of_batches):
            bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
            batch_class_indices = batches_collected_class_indices[i]
            batch_docs_names = batches_collected_docs_names[i]
            if len(batch_class_indices) < bsize:
                while True:
                    if len(batch_class_indices) == bsize: break
                    for j in range(number_of_classes):
                        if len(batch_class_indices) == bsize: break
                        if len(class_indices_split[j]) > 0:
                            batch_class_indices.append(class_indices_split[j].pop(0))
                            batch_docs_names.append(docs_names_split[j].pop(0))


            # Shuffle the batch
            batch_class_indices_shuf = []
            batch_docs_names_shuf = []
            index_shuf = range(len(batch_class_indices))
            shuffle(index_shuf)
            for k in index_shuf:
                batch_class_indices_shuf.append(batch_class_indices[k])
                batch_docs_names_shuf.append(batch_docs_names[k])

            # Append batch to full lists
            class_indices += batch_class_indices_shuf
            docs_names += batch_docs_names_shuf

        print 'Reading and saving docs from file system'
        count = 0
        class_indices_batch = []
        docs_names_batch = []
        docs_list = []
        for i in xrange(len(class_indices)):
            if not count == 0 and (
                        count % self.batchsize) == 0:  # Save the batch if batchsize is reached or if the last document has been read.
                if not (len(class_indices) - count) < self.batchsize:
                    print 'Read ', str(count), ' of ', len(class_indices)
                    self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                    batches.append(count)
                    # Reset the lists
                    docs_list = []
                    docs_names_batch = []
                    class_indices_batch = []

            d = s.load(open(docs_names[i], 'rb'))
            docs_list.append(d)
            docs_names_batch.append(docs_names[i])
            class_indices_batch.append(class_indices[i])
            count += 1

        # Save the remaining docs
        if len(docs_list) > 0:
            print 'Read ', str(count), ' of ', len(class_indices)
            self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
            batches.append(count)

        s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
        s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
        return 1
def get_all_class_names():
    """
    Get all class names for training set.
    """

    return s.load(open(env_paths.get_class_names_path(train=True), 'rb'))