def compare_real_data_to_reconstructed_data():
    weights = s.load(open(env_paths.get_dbn_weight_path(),"rb"))
    batches = s.load(open(env_paths.get_batches_path(False), "rb"))  # test-set batch list
    class_indices = s.load(open(env_paths.get_class_indices_path(False,batches[0]).replace(".0",""),"rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch,training = False)


    # Pick the first document encountered for each of up to 10 distinct classes.
    class_to_doc = {}
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in class_to_doc:
            continue
        class_to_doc[idx] = data[i]
        if len(class_to_doc) >= 10:
            break

    print class_to_doc.keys()

    data_points = class_to_doc.values()

    output_data_points = []
    for d in data_points:
        d = append(d,1.)
        out = generate_output_data(d,weights)
        output_data_points.append(out)

    visualise_data_points(data_points,output_data_points)
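
# NOTE: generate_output_data() is not defined in this excerpt. The sketch below is only
# an illustration of what a reconstruction pass through the saved weight list might look
# like; it assumes logistic units and a bias row baked into each weight matrix (which is
# why a constant 1.0 is appended to every data point above). It is not confirmed by this
# codebase.
def _forward_pass_sketch(data_point, weights):
    from numpy import append, dot, exp
    activation = data_point
    for w in weights:
        activation = 1.0 / (1.0 + exp(-dot(activation, w)))  # hypothetical logistic layer
        activation = append(activation, 1.0)                  # re-append the bias unit
    return activation[:-1]                                    # drop the trailing bias
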
    def __generate_word_matrix(self, index_lookup):
        """
        Generate a bag-of-words (BOW) matrix whose rows correspond to documents
        and whose columns correspond to words.

        @param index_lookup: A dictionary mapping each attribute word to its column index,
        i.e. which column of the word matrix should be incremented when that word occurs.
        """
        batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
        length = len(batches)
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
            row = 0
            for doc in docs_list:
                for token in doc:
                    try:
                        col = index_lookup[token]
                        bag_of_words_matrix[row, col] += 1
                    except KeyError:  # Word is not among the attributes; skip it.
                        continue
                row += 1
            # Serialize bag of words
            s.dump(bag_of_words_matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
            print "Processed " + str(processed) + " of " + str(length) + " batches"
            processed += 1
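
# Illustration only: the counting scheme used by __generate_word_matrix above, run on
# toy data with a hypothetical index_lookup mapping each attribute word to a column.
def _bow_counting_example():
    from numpy import zeros
    docs_list = [["rbm", "layer", "rbm"], ["layer", "unknown_word"]]
    index_lookup = {"rbm": 0, "layer": 1, "units": 2}  # word -> column index
    bow = zeros([len(docs_list), len(index_lookup)])
    for row, doc in enumerate(docs_list):
        for token in doc:
            if token in index_lookup:  # words outside the vocabulary are ignored
                bow[row, index_lookup[token]] += 1
    return bow  # [[2., 1., 0.],
                #  [0., 1., 0.]]
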
def get_batch_list(training=True):
    """
    Retrieve the list containing the batch numbers.

    @param training: True for the training-set batches, False for the test-set batches.
    """
    return s.load(open(env_paths.get_batches_path(training), "rb"))
    def __set_attributes(self):
        """
        Build the attribute list: the words that will serve as the columns
        of the bag-of-words matrix.

        @return: The generated list of words acting as attributes for the BOWs.
        """
        batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
        length = len(batches)
        attributes = []
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            tmp_attributes = list(
                set(chain(*docs_list))
            )  # Collect the unique words occurring in this batch's documents.
            attributes += tmp_attributes
            attributes = list(
                set(attributes)
            )  # Deduplicate so that no word occurs twice.
            if self.acceptance_lst is not None:
                attributes = list(
                    set(attributes).intersection(self.acceptance_lst)
                )  # Only consider words in the acceptance list.
            print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
            processed += 1

        # Keep only the most frequent words as attributes.
        d = dict.fromkeys(attributes)
        processed = 1
        for batch in batches:
            docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
            words = list(chain(*docs_list))
            for w in words:
                try:
                    if d[w] is None:
                        d[w] = 1
                    else:
                        d[w] += 1
                except KeyError:  # Word is not an attribute candidate; skip it.
                    continue
            print "Processed summing " + str(processed) + " of " + str(length) + " batches"
            processed += 1
        # Sort words by frequency (ascending) and keep the max_words_matrix most frequent ones.
        sorted_att = sorted(d.items(), key=lambda x: x[1])
        sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix:]
        attributes = [elem[0] for elem in sorted_att]

        # Serialize attributes
        s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
        return attributes
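
# Illustration only: the frequency cut-off performed in the second half of
# __set_attributes, on toy data with a hypothetical max_words_matrix of 2.
def _top_words_example():
    from itertools import chain
    docs_list = [["topic", "model", "topic"], ["model", "words", "model"]]
    max_words_matrix = 2
    counts = {}
    for w in chain(*docs_list):
        counts[w] = counts.get(w, 0) + 1
    # Sort ascending by count and keep the max_words_matrix most frequent words.
    sorted_att = sorted(counts.items(), key=lambda x: x[1])
    return [word for word, _ in sorted_att[len(sorted_att) - max_words_matrix:]]  # -> ['topic', 'model']
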
def compare_real_data_to_reconstructed_data_random():
    weights = s.load(open(env_paths.get_dbn_weight_path(),"rb"))
    batches = s.load(open(env_paths.get_batches_path(False), "rb"))  # test-set batch list
    batch = choice(batches) # make sure to pick batch at random
    data = data_processing.get_bag_of_words_matrix(batch,training = False)
    # choose 10 data points at random
    data_points = []
    indices = random.randint(0, len(data), 10)  # 10 random row indices (numpy randint; duplicates possible)
    for idx in indices:
        data_points.append(data[idx])

    output_data_points = []
    for d in data_points:
        d = append(d,1.)
        out = generate_output_data(d,weights)
        output_data_points.append(out)

    visualise_data_points(data_points,output_data_points)
def save_batches(batches, training):
    """Serialize the batch list for the training or test set."""
    pickle.dump(batches, open(env_paths.get_batches_path(training), "wb"))
    def __read_docs_from_filesystem(self):
        """
        Read all docs and assign them to batches, so that each doc category is represented equally across batches.

        """
        docs_names = []
        docs_names_split = []
        class_indices = []
        class_indices_split = []
        class_names = []
        batches = []

        print "Generating class indices and docs names list."
        doc_count = 0
        for folder in self.paths:
            docs_names_split.append([])
            class_indices_split.append([])
            class_names.append(folder.split("/")[len(folder.split("/")) - 1])
            if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
                docs = os.listdir(folder)
            elif (
                not self.trainingset_size == None and self.trainingset_attributes == None
            ):  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
                docs = os.listdir(folder)[: int(len(os.listdir(folder)) * self.trainingset_size)]
            else:  # If data processing should be done on a test set.
                docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size) :]
            for doc in docs:
                if doc.endswith(".p"):
                    # Append the name of the document to the list containing document names.
                    docs_names_split[-1].append(folder + "/" + doc)
                    class_indices_split[-1].append(len(class_names) - 1)
                    doc_count += 1

        if doc_count == 0:  # No stemmed (.p) documents were found.
            print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
            return 0

        # Ensure that batches contain an even amount of docs from each category.
        print "Arranging the documents."
        if doc_count < self.batchsize:
            print "Number of documents must be greater than batchsize. Please revise the batchsize."
            return 0
        number_of_batches = doc_count / self.batchsize  # Integer division (Python 2): number of full batches.
        number_of_classes = len(self.paths)
        batches_collected_class_indices = []
        batches_collected_docs_names = []

        # Calculate fraction of category in each batch.
        d = {}
        for i in range(len(class_indices_split)):
            d[i] = float(len(class_indices_split[i])) / number_of_batches

        count = 0
        for i in range(number_of_batches):
            batch_class_indices = []
            batch_docs_names = []
            d_tmp = array([int(v) for v in d.values()])
            while True:
                if (
                    (len(batch_class_indices) == self.batchsize)
                    and (not doc_count - count < self.batchsize)
                    or (count == doc_count)
                ):
                    break
                if len(d_tmp[d_tmp > 0]) == 0:
                    break
                for j in range(number_of_classes):
                    if (
                        (len(batch_class_indices) == self.batchsize)
                        and (not doc_count - count < self.batchsize)
                        or (count == doc_count)
                    ):
                        break
                    if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
                        d_tmp[j] -= 1
                        count += 1
            batches_collected_class_indices.append(batch_class_indices)
            batches_collected_docs_names.append(batch_docs_names)

        for i in range(number_of_batches):
            bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
            batch_class_indices = batches_collected_class_indices[i]
            batch_docs_names = batches_collected_docs_names[i]
            if len(batch_class_indices) < bsize:
                while True:
                    if len(batch_class_indices) == bsize:
                        break
                    for j in range(number_of_classes):
                        if len(batch_class_indices) == bsize:
                            break
                        if len(class_indices_split[j]) > 0:
                            batch_class_indices.append(class_indices_split[j].pop(0))
                            batch_docs_names.append(docs_names_split[j].pop(0))

            # Shuffle the batch
            batch_class_indices_shuf = []
            batch_docs_names_shuf = []
            index_shuf = range(len(batch_class_indices))
            shuffle(index_shuf)
            for k in index_shuf:
                batch_class_indices_shuf.append(batch_class_indices[k])
                batch_docs_names_shuf.append(batch_docs_names[k])

            # Append batch to full lists
            class_indices += batch_class_indices_shuf
            docs_names += batch_docs_names_shuf

        print "Reading and saving docs from file system"
        count = 0
        class_indices_batch = []
        docs_names_batch = []
        docs_list = []
        for i in xrange(len(class_indices)):
            if (
                not count == 0 and (count % self.batchsize) == 0
            ):  # Save the batch if batchsize is reached or if the last document has been read.
                if not (len(class_indices) - count) < self.batchsize:
                    print "Read ", str(count), " of ", len(class_indices)
                    self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                    batches.append(count)
                    # Reset the lists
                    docs_list = []
                    docs_names_batch = []
                    class_indices_batch = []

            d = s.load(open(docs_names[i], "rb"))
            docs_list.append(d)
            docs_names_batch.append(docs_names[i])
            class_indices_batch.append(class_indices[i])
            count += 1

        # Save the remaining docs
        if len(docs_list) > 0:
            print "Read ", str(count), " of ", len(class_indices)
            self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
            batches.append(count)

        s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
        s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
        return 1
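
# Illustration only: the balancing idea in __read_docs_from_filesystem boils down to
# drawing documents from every class in round-robin fashion until each batch holds
# batchsize documents. Class lists and file names below are hypothetical.
def _round_robin_batching_example():
    class_docs = {
        "sports":   ["s1.p", "s2.p", "s3.p", "s4.p"],
        "politics": ["p1.p", "p2.p"],
        "science":  ["c1.p", "c2.p"],
    }
    batchsize = 4
    batches, current = [], []
    while any(class_docs.values()):
        for label in class_docs:          # visit every class in turn
            if class_docs[label]:
                current.append((label, class_docs[label].pop(0)))
            if len(current) == batchsize:
                batches.append(current)
                current = []
    if current:                           # any remainder forms a final, smaller batch
        batches.append(current)
    return batches                        # here: two batches of 4 documents each
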