Example #1
0
def get_database_configurations(file_path):
    """
    Read database configurations from file_path and return them as a dict.

    An error is thrown if the file cannot be read or is not found; if the
    contents don't make sense an empty dictionary is returned.
    A relative 'NAME' path is resolved relative to the working directory.

    file_path -- file specifying the database configurations in key: value pairs.
    """
    with open(file_path, 'r') as config_file:
        # parse the raw text into a key/value dictionary
        database_config = basic_tools.metadata_to_dict(config_file.read())

    # normalize the recognized keys to upper case
    for upper_key in ('ENGINE', 'NAME', 'HOST', 'OPTIONS', 'PASSWORD', 'PORT',
                      'USER'):
        lower_key = upper_key.lower()
        if lower_key in database_config:
            database_config[upper_key] = database_config.pop(lower_key)

    # turn a relative NAME into an absolute path under this module's
    # working/ directory (skipped when NAME already names a directory)
    name = database_config.get('NAME')
    if name is not None and not isdir(name) and not isabs(name):
        base_dir = os.path.abspath(os.path.dirname(__file__))
        database_config['NAME'] = os.path.join(base_dir, 'working', name)

    return database_config
Example #2
0
def get_database_configurations(file_path):
    """
    Return the database configurations parsed from file_path.

    An error is thrown if the file cannot be read or is not found; if the
    contents don't make sense an empty dictionary is returned.
    Relative filenames are taken relative to the working directory.

    file_path -- file specifying the database configurations in key: value pairs.
    """
    known_keys = ['ENGINE', 'NAME', 'HOST', 'OPTIONS', 'PASSWORD', 'PORT', 'USER']

    # read in and parse the configuration text
    with open(file_path, 'r') as f:
        config = basic_tools.metadata_to_dict(f.read())

    # promote any lower-case variants of the known keys to upper case
    for key in known_keys:
        lowered = key.lower()
        if lowered not in config:
            continue
        config[key] = config[lowered]
        del config[lowered]

    # resolve a relative NAME against this module's working/ directory
    if 'NAME' in config and not isdir(config['NAME']):
        if not isabs(config['NAME']):
            module_dir = os.path.abspath(os.path.dirname(__file__))
            config['NAME'] = os.path.join(module_dir, 'working', config['NAME'])

    return config
 def _read_document(self):
     """Must set the self._content and self._metadata variables."""
     # decode errors are ignored so one bad byte doesn't kill the document
     with io.open(self._document_path, 'r', encoding='utf-8',
                  errors='ignore') as document_file:
         raw_text = document_file.read()
     metadata_text, body = basic_tools.seperate_metadata_and_content(raw_text)
     self._metadata = basic_tools.metadata_to_dict(metadata_text)
     # run the body through each registered filter, in order
     for content_filter in self._filters:
         body = content_filter(body)
     self._content = body
Example #4
0
    def __init__(self, dataset_directory, is_recursive=True):
        """
        Load dataset metadata and index the dataset's document files.

        dataset_directory -- a relative or absolute file path to the \
        directory that contains the documents directory and the dataset_metadata.txt file.
        is_recursive -- find documents recursively in the documents directory, \
                        by default documents will be found recursively.
        Document metadata types will be infered from the document metadata.
        """
        # create commonly used directory and file paths
        self._dataset_directory = dataset_directory
        self._abs_dataset_directory = os.path.abspath(dataset_directory)
        self._metadata_file = os.path.join(self._abs_dataset_directory,
                                           'dataset_metadata.txt')
        self._documents_directory = os.path.join(self._abs_dataset_directory,
                                                 'documents')
        self.is_recursive = is_recursive
        self._filters = []

        self.metadata = {}
        # load the dataset metadata (decode errors ignored on purpose)
        with io.open(self._metadata_file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as meta_file:
            content = meta_file.read()
            metadata, __ = basic_tools.seperate_metadata_and_content(content)
            self.metadata = basic_tools.metadata_to_dict(metadata)
        self.metadata_types = {}
        basic_tools.collect_types(self.metadata_types, self.metadata)

        # derive a human-readable name, falling back to a title-cased
        # version of the directory path when none is declared
        if 'readable_name' not in self.metadata:
            identifier = self._dataset_directory.replace('_', ' ').replace(
                '/', ' ').title()
        else:
            identifier = self.metadata['readable_name']
        self.name = basic_tools.remove_punctuation(identifier)

        # find and sort all file paths
        self._list_of_documents = basic_tools.get_all_files_from_directory(
            self._documents_directory, self.is_recursive)
        self._list_of_documents.sort()

        # find any bad documents and find document metadata
        self.document_metadata_types = {}
        bad_doc_indices = []
        for doc_index, doc in enumerate(self):
            try:
                basic_tools.collect_types(self.document_metadata_types,
                                          doc.metadata)
            except Exception:
                print("Bad document: ", self._list_of_documents[doc_index])
                bad_doc_indices.append(doc_index)
        # delete from the highest index down so earlier indices stay valid
        for remove_index in reversed(bad_doc_indices):
            del self._list_of_documents[remove_index]
Example #5
0
 def _read_document(self):
     """Must set the self._content and self._metadata variables."""
     with io.open(self._document_path,
                  'r',
                  encoding='utf-8',
                  errors='ignore') as source:
         full_text = source.read()
     # split the header metadata off from the document body
     header, text = basic_tools.seperate_metadata_and_content(full_text)
     self._metadata = basic_tools.metadata_to_dict(header)
     # apply every filter to the body before storing it
     for apply_filter in self._filters:
         text = apply_filter(text)
     self._content = text
 def __init__(self, dataset_directory, is_recursive=True):
     """
     Set up the dataset's paths, metadata, and document list.

     dataset_directory -- a relative or absolute file path to the \
     directory that contains the documents directory and the dataset_metadata.txt file.
     is_recursive -- find documents recursively in the documents directory, \
                     by default documents will be found recursively.
     Document metadata types will be infered from the document metadata.
     """
     # commonly used directory and file paths
     self._dataset_directory = dataset_directory
     self._abs_dataset_directory = os.path.abspath(dataset_directory)
     self._metadata_file = os.path.join(self._abs_dataset_directory,
                                        'dataset_metadata.txt')
     self._documents_directory = os.path.join(self._abs_dataset_directory,
                                              'documents')
     self.is_recursive = is_recursive
     self._filters = []

     # load the dataset metadata and collect its value types
     self.metadata = {}
     with io.open(self._metadata_file, 'r', encoding='utf-8',
                  errors='ignore') as meta_file:
         raw = meta_file.read()
         header, __ = basic_tools.seperate_metadata_and_content(raw)
         self.metadata = basic_tools.metadata_to_dict(header)
     self.metadata_types = {}
     basic_tools.collect_types(self.metadata_types, self.metadata)

     # prefer the declared readable_name; otherwise build one from the path
     if 'readable_name' in self.metadata:
         identifier = self.metadata['readable_name']
     else:
         identifier = (self._dataset_directory
                       .replace('_', ' ')
                       .replace('/', ' ')
                       .title())
     self.name = basic_tools.remove_punctuation(identifier)

     # gather and sort every document file path
     self._list_of_documents = basic_tools.get_all_files_from_directory(
         self._documents_directory, self.is_recursive)
     self._list_of_documents.sort()

     # collect document metadata types, recording any unreadable documents
     self.document_metadata_types = {}
     bad_doc_indices = []
     for doc_index, doc in enumerate(self):
         try:
             basic_tools.collect_types(self.document_metadata_types,
                                       doc.metadata)
         except Exception as e:
             print("Bad document: ", self._list_of_documents[doc_index])
             bad_doc_indices.append(doc_index)
     # drop bad documents from the end so remaining indices stay valid
     while len(bad_doc_indices) != 0:
         del self._list_of_documents[bad_doc_indices.pop()]
Example #7
0
def test_metadata_to_dict():
    """Check that metadata_to_dict lowercases keys, replaces spaces with
    underscores, and splits on the first colon only."""
    meta = """KEY WITH Spaces: Value with: Semi-colon
YEAR: 1983
MONTH: Nov.
SPEAKER: Marvin J. Ashton
CALLING: Of the Quorum of the Twelve Apostles
TOPIC: commitment"""
    meta_dict = basic_tools.metadata_to_dict(meta)

    # expected key form follows the demonstrated pattern:
    # 'KEY WITH Spaces' -> 'key_with_spaces'
    actual_meta_dict = {'key_with_spaces': 'Value with: Semi-colon',
                        'year': '1983',
                        'month': 'Nov.',
                        'speaker': 'Marvin J. Ashton',
                        # previously missing: CALLING was never verified
                        'calling': 'Of the Quorum of the Twelve Apostles',
                        'topic': 'commitment'}

    for key in actual_meta_dict:
        assert key in meta_dict
        assert meta_dict[key] == actual_meta_dict[key]