def __set_attributes(self):
    """
    Set the attributes consisting of a list of words of all attributes in the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as f:
        batches = s.load(f)
    length = len(batches)
    attributes = []
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        # Collect every distinct word of this batch's documents.
        attributes += list(set(chain(*docs_list)))
        # Deduplicate across batches so that no word occurs twice.
        attributes = list(set(attributes))
        if self.acceptance_lst is not None:
            # Only consider words present in the acceptance list.
            attributes = list(set(attributes).intersection(self.acceptance_lst))
        print("Processed attribute " + str(processed) + " of " + str(length) + " batches")
        processed += 1

    # Find attributes of the most common words: count occurrences of each
    # attribute across all batches (counts start at 0, not None, so the
    # final sort never compares mixed types).
    d = dict.fromkeys(attributes, 0)
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        for w in chain(*docs_list):
            if w in d:  # Ignore words filtered out of the attribute set.
                d[w] += 1
        print("Processed summing " + str(processed) + " of " + str(length) + " batches")
        processed += 1
    # Keep the self.max_words_matrix most frequent words.
    sorted_att = sorted(d.items(), key=lambda x: x[1])
    sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix:]
    attributes = [elem[0] for elem in sorted_att]

    # Serialize attributes
    with open(env_paths.get_attributes_path(self.training), "wb") as f:
        s.dump(attributes, f)
    return attributes
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows, columns corresponding to documents, words respectively.

    @param index_lookup: A dictionary with keys for the attributes. In order to know which column should be incremented in word_matrix.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as f:
        batches = s.load(f)
    length = len(batches)
    processed = 1
    for batch in batches:
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as f:
            docs_list = s.load(f)
        # One row per document, one column per attribute word.
        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        for row, doc in enumerate(docs_list):
            for token in doc:
                try:  # Skip words that are not attributes of the matrix.
                    col = index_lookup[token]
                except KeyError:
                    continue
                bag_of_words_matrix[row, col] += 1
        # Serialize bag of words
        with open(env_paths.get_bow_matrix_path(self.training, batch), "wb") as f:
            s.dump(bag_of_words_matrix.tolist(), f)
        print("Processed " + str(processed) + " of " + str(length) + " batches")
        processed += 1
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows, columns corresponding to documents, words respectively.

    @param index_lookup: A dictionary with keys for the attributes. In order to know which column should be incremented in word_matrix.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    total = len(batches)
    for batch_no, batch in enumerate(batches, start=1):
        docs = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # One row per document, one column per attribute word.
        matrix = zeros([len(docs), len(index_lookup)])
        for doc_idx, doc in enumerate(docs):
            for word in doc:
                # Words outside the attribute vocabulary are ignored.
                if word in index_lookup:
                    matrix[doc_idx, index_lookup[word]] += 1
        # Serialize bag of words
        s.dump(matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
        print('Processed ' + str(batch_no) + ' of ' + str(total) + ' batches')
def __set_attributes(self):
    """
    Set the attributes consisting of a list of words of all attributes in the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    total = len(batches)
    attributes = []
    for batch_no, batch in enumerate(batches, start=1):
        doc_batch = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # Merge this batch's vocabulary into the running attribute set
        # so that no word occurs twice.
        attributes = list(set(attributes).union(chain(*doc_batch)))
        if self.acceptance_lst is not None:
            # Only consider words present in the acceptance list.
            attributes = list(set(attributes).intersection(self.acceptance_lst))
        print('Processed attribute ' + str(batch_no) + ' of ' + str(total) + ' batches')

    # Find attributes of the most common words.
    counts = dict.fromkeys(attributes)
    for batch_no, batch in enumerate(batches, start=1):
        doc_batch = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        for word in chain(*doc_batch):
            if word not in counts:  # Ignore words filtered out of the attribute set.
                continue
            counts[word] = 1 if counts[word] is None else counts[word] + 1
        print('Processed summing ' + str(batch_no) + ' of ' + str(total) + ' batches')

    # Keep the self.max_words_matrix most frequent words.
    by_frequency = sorted(counts.items(), key=lambda item: item[1])
    top = by_frequency[len(by_frequency) - self.max_words_matrix:]
    attributes = [word for word, _ in top]

    # Serialize attributes
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
def __save_batch_loading_docs(self, batch_number, docs_list, docs_names, class_indices):
    """
    Save batches for the document loading process in the initialization phase. This is done due to vast sizes of data - lack of memory.

    @param batch_number: Representing the number of documents in the batch.
    @param docs_list: List containing a string for each document in the batch.
    @param docs_names: List containing the names of each document in the same order as the docs_list.
    @param class_indices: List containing which class/folder each document belongs to.
    """
    # Serialize all relevant variables
    payloads = [
        (docs_list, env_paths.get_doc_list_path(self.training, batch_number)),
        (docs_names, env_paths.get_doc_names_path(self.training, batch_number)),
        (class_indices, env_paths.get_class_indices_path(self.training, batch_number)),
    ]
    for data, path in payloads:
        s.dump(data, open(path, "wb"))