def _read_document(self):
    """Must set the self._content and self._metadata variables."""
    # Read the raw document text, silently dropping undecodable bytes.
    with io.open(self._document_path, 'r', encoding='utf-8', errors='ignore') as doc_file:
        raw_text = doc_file.read()
    # Split the metadata header off the body and parse it into a dict.
    header, body = basic_tools.seperate_metadata_and_content(raw_text)
    self._metadata = basic_tools.metadata_to_dict(header)
    # Run the body through every registered filter, in order.
    for apply_filter in self._filters:
        body = apply_filter(body)
    self._content = body
# --- Example #2 ---
    def __init__(self, dataset_directory, is_recursive=True):
        """
        dataset_directory -- a relative or absolute file path to the \
        directory that contains the documents directory and the dataset_metadata.txt file.
        is_recursive -- find documents recursively in the documents directory, \
                        by default documents will be found recursively.
        Document metadata types will be infered from the document metadata.
        """
        # create commonly used directory and file paths
        self._dataset_directory = dataset_directory
        self._abs_dataset_directory = os.path.abspath(dataset_directory)
        self._metadata_file = os.path.join(self._abs_dataset_directory,
                                           'dataset_metadata.txt')
        self._documents_directory = os.path.join(self._abs_dataset_directory,
                                                 'documents')
        self.is_recursive = is_recursive
        self._filters = []

        # load the dataset metadata, tolerating undecodable bytes
        self.metadata = {}
        with io.open(self._metadata_file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as meta_file:
            content = meta_file.read()
            metadata, _ = basic_tools.seperate_metadata_and_content(content)
            self.metadata = basic_tools.metadata_to_dict(metadata)
        self.metadata_types = {}
        basic_tools.collect_types(self.metadata_types, self.metadata)

        # derive a human-readable dataset name, preferring explicit metadata
        # over a name synthesized from the directory path
        if 'readable_name' in self.metadata:
            identifier = self.metadata['readable_name']
        else:
            identifier = self._dataset_directory.replace('_', ' ').replace(
                '/', ' ').title()
        self.name = basic_tools.remove_punctuation(identifier)

        # find and sort all file paths
        self._list_of_documents = basic_tools.get_all_files_from_directory(
            self._documents_directory, self.is_recursive)
        self._list_of_documents.sort()

        # collect document metadata types; a document is "bad" if its
        # metadata cannot be type-collected, and it is dropped from the list
        self.document_metadata_types = {}
        bad_doc_indices = []
        for doc_index, doc in enumerate(self):
            try:
                basic_tools.collect_types(self.document_metadata_types,
                                          doc.metadata)
            except Exception:
                print("Bad document: ", self._list_of_documents[doc_index])
                bad_doc_indices.append(doc_index)
        # delete from highest index to lowest so earlier indices stay valid
        for remove_index in reversed(bad_doc_indices):
            del self._list_of_documents[remove_index]
# --- Example #3 ---
 def _read_document(self):
     """Must set the self._content and self._metadata variables."""
     # Load the raw text of the document; undecodable bytes are ignored.
     with io.open(self._document_path,
                  'r',
                  encoding='utf-8',
                  errors='ignore') as handle:
         raw = handle.read()
     # Separate the metadata header from the document body.
     meta_block, body_text = basic_tools.seperate_metadata_and_content(raw)
     self._metadata = basic_tools.metadata_to_dict(meta_block)
     # Apply every registered content filter in registration order.
     for content_filter in self._filters:
         body_text = content_filter(body_text)
     self._content = body_text
 def __init__(self, dataset_directory, is_recursive=True):
     """
     dataset_directory -- a relative or absolute file path to the \
     directory that contains the documents directory and the dataset_metadata.txt file.
     is_recursive -- find documents recursively in the documents directory, \
                     by default documents will be found recursively.
     Document metadata types will be infered from the document metadata.
     """
     # create commonly used directory and file paths
     self._dataset_directory = dataset_directory
     self._abs_dataset_directory = os.path.abspath(dataset_directory)
     self._metadata_file = os.path.join(self._abs_dataset_directory,
                                        'dataset_metadata.txt')
     self._documents_directory = os.path.join(self._abs_dataset_directory,
                                              'documents')
     self.is_recursive = is_recursive
     self._filters = []

     # load the dataset metadata, tolerating undecodable bytes
     self.metadata = {}
     with io.open(self._metadata_file, 'r', encoding='utf-8', errors='ignore') as meta_file:
         content = meta_file.read()
         metadata, _ = basic_tools.seperate_metadata_and_content(content)
         self.metadata = basic_tools.metadata_to_dict(metadata)
     self.metadata_types = {}
     basic_tools.collect_types(self.metadata_types, self.metadata)

     # derive a human-readable dataset name, preferring explicit metadata
     # over a name synthesized from the directory path
     if 'readable_name' in self.metadata:
         identifier = self.metadata['readable_name']
     else:
         identifier = self._dataset_directory.replace('_', ' ').replace('/', ' ').title()
     self.name = basic_tools.remove_punctuation(identifier)

     # find and sort all file paths
     self._list_of_documents = basic_tools.get_all_files_from_directory(
         self._documents_directory, self.is_recursive)
     self._list_of_documents.sort()

     # collect document metadata types; a document is "bad" if its
     # metadata cannot be type-collected, and it is dropped from the list
     self.document_metadata_types = {}
     bad_doc_indices = []
     for doc_index, doc in enumerate(self):
         try:
             basic_tools.collect_types(self.document_metadata_types, doc.metadata)
         except Exception:
             print("Bad document: ", self._list_of_documents[doc_index])
             bad_doc_indices.append(doc_index)
     # delete from highest index to lowest so earlier indices stay valid
     for remove_index in reversed(bad_doc_indices):
         del self._list_of_documents[remove_index]