Ejemplo n.º 1
0
    def load(self,
             profile_name='default',
             metadata_files=None,
             dotenv_path=None):
        """
        Load a metadata profile, given a list of yml files and a .env filename.

        All profiles inherit from the 'default' profile; requesting a profile
        that is not declared in the metadata raises via self.raiseException.

        :param profile_name: the profile to load (default: 'default')
        :param metadata_files: a list of metadata files to read; when None,
               the package default schema plus all project metadata files
               discovered under the project rootdir are used
        :param dotenv_path: the path of a dotenv file to read; when None,
               the dotenv file is discovered from the project rootdir
        :return: None; the loaded profile is stored in self._profile and
                 self._info['active'] is set to profile_name
        """

        # get metadata by scanning rootdir, if no list is provided
        if metadata_files is None:
            metadata_files = []

            # package-level default metadata (schemas/default.yml next to this file)
            dir_path = os.path.dirname(os.path.realpath(__file__))
            metadata_files += abspath(['schemas/default.yml'], dir_path)

            # project metadata files discovered under the project root directory
            metadata_files += abspath(
                files.get_metadata_files(paths.rootdir()), paths.rootdir())

        # get dotenv_path by scanning rootdir, if no dotenv file is provided
        if dotenv_path is None:
            dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()),
                                  paths.rootdir())

        # export the variables from the .env file into the environment
        if dotenv_path and os.path.isfile(dotenv_path):
            load_dotenv(dotenv_path)

        profiles = self.read(metadata_files)

        # fail early if the requested profile is not declared anywhere
        if profile_name not in self._info['profiles']:
            self.raiseException(f'Profile "{profile_name}" not found.')

        # apply profile inheritance, then select the requested profile
        profiles = self.inherit(profiles)
        metadata = profiles[profile_name]

        # render any jinja templates in the profile
        md = self.render(metadata)

        # validate
        self.validate(md)

        # format
        md = self.formatted(md)

        self._profile = YamlDict(md)
        self._info['active'] = profile_name
Ejemplo n.º 2
0
    def __init__(self, logger, extra):
        """
        Wrap *logger* and attach default contextual fields: username,
        current file relative to the project rootdir, repository name/hash,
        CI pipeline ids and gitlab user info from the environment.

        Entries in the caller-supplied *extra* dict override the defaults,
        which allows easy stacking of LoggerAdapters, e.g.:
        adapter = LoggerAdapter(someLogger, dict(p1=v1, p2="v2"))
        """
        self.logger = logger

        # contextual defaults gathered from the local environment;
        # CI pipeline ids fall back to a short random token when unset
        defaults = {
            'dlf_username': getpass.getuser(),
            'dlf_filename': os.path.relpath(files.get_current_filename(), paths.rootdir()),
            'dlf_repo_name': repo_data()['name'],
            'dlf_repo_hash': repo_data()['hash'],
            'ci_pipeline_iid': os.getenv('CI_PIPELINE_IID', default=str(uuid.uuid4())[:7]),
            'ci_pipeline_id': os.getenv('CI_PIPELINE_ID', default=str(uuid.uuid4())[:7]),
            'gitlab_user_email': os.environ.get('GITLAB_USER_EMAIL'),
            'gitlab_user_login': os.environ.get('GITLAB_USER_LOGIN'),
        }

        self.extra = defaults
        # caller-provided context wins over the defaults
        self.extra.update(extra)
Ejemplo n.º 3
0
    def config(self):
        """
        Return the current project configuration as a YamlDict.
        Delegates to check_if_loaded() first, which guards against use
        before project.load() has run.
        """
        self.check_if_loaded()

        cfg = {
            'version': __version__,
            'username': self._username,
            'session_id': self._session_id,
            'profile': self._profile,
            'rootdir': paths.rootdir(),
            'script_path': self._script_path,
            'dotenv_path': self._dotenv_path,
            'notebooks_files': self._notebook_files,
            'python_files': self._python_files,
            'metadata_files': self._metadata_files,
            'repository': self._repo,
        }
        return YamlDict(cfg)
Ejemplo n.º 4
0
    def load(self, profile='default', rootdir_path=None, search_parent_dirs=True, dotenv=True, factory_defaults=True):
        """
        Load the project: set the rootdir, read the metadata profile,
        register the notebook import hook, (re)start the engine and
        initialize logging.

        :param profile: the metadata profile to load (default: 'default')
        :param rootdir_path: project root directory; when None the rootdir
               is detected (see paths.set_rootdir)
        :param search_parent_dirs: search parent dirs when detecting rootdir
        :param dotenv: when True, use <rootdir>/.env as the dotenv file
        :param factory_defaults: forwarded to reader.load
        :return: self
        :raises ValueError: when no valid metadata could be loaded
        """

        # init workdir and rootdir paths
        paths.set_rootdir(rootdir_path, search_parent_dirs)

        # set dotenv default file
        if dotenv is True:
            self._dotenv_path = os.path.join(paths.rootdir(), '.env')
        else:
            self._dotenv_path = None

        # metadata files
        metadata_files = files.get_metadata_files(paths.rootdir())

        # load metadata; on failure, print the error and fall through
        # to the empty-metadata check below
        try:
            md_files = [ os.path.join(paths.rootdir(), x) for x in metadata_files]
            self._metadata = reader.load(profile,md_files,self._dotenv_path, factory_defaults)
        except ValueError as e:
            print(e)
            self._metadata = {}

        # bail if no metadata
        if not self._metadata:
            raise ValueError('No valid metadata to load.')

        # set profile from metadata
        self._profile = self._metadata['profile']

        # add rootpath to the list of python sys paths
        if paths.rootdir() not in sys.path:
            sys.path.append(paths.rootdir())

        # register hook for loading ipynb files
        if 'NotebookFinder' not in str(sys.meta_path):
            sys.meta_path.append(NotebookFinder())

        # craft the engine job name from profile and repo name
        repo_name = repo_data()['name']
        jobname = '{}-{}'.format(self._profile, repo_name) if repo_name else self._profile

        # stop existing engine
        if self._engine:
            self._engine.stop()

        self._engine = engine.get(jobname, self._metadata, paths.rootdir())

        # get all project info
        self._info = self.get_info()

        # initialize logging
        logging.init(self._metadata, self._info['session_id'])

        return self
Ejemplo n.º 5
0
 def __init__(self, logger, extra):
     """
     Wrap *logger* and attach default contextual fields: username,
     current file relative to the project rootdir, and the repository
     name and hash.

     Caller-supplied *extra* entries take precedence over the defaults,
     allowing easy stacking of LoggerAdapters, e.g.:
     adapter = LoggerAdapter(someLogger, dict(p1=v1, p2="v2"))
     """
     self.logger = logger

     # contextual defaults gathered from the local environment
     context = {
         'dlf_username': getpass.getuser(),
         'dlf_filename': os.path.relpath(files.get_current_filename(), paths.rootdir()),
         'dlf_repo_name': repo_data()['name'],
         'dlf_repo_hash': repo_data()['hash'],
     }

     # caller context overrides the defaults
     context.update(extra)
     self.extra = context
Ejemplo n.º 6
0
    def info(self):
        """
        Return a YamlDict describing the loaded project (version, session,
        profile, paths, discovered files, repository).
        Logs an error and returns None when no profile has been loaded yet.
        """
        if not self.loaded:
            logging.error("No project profile loaded. "
                          "Execute datalabframework.project.load(...) first.")
            return None

        details = {
            'version': __version__,
            'username': self._username,
            'session_name': self._session_name,
            'session_id': self._session_id,
            'profile': self._profile,
            'rootdir': paths.rootdir(),
            'script_path': self._script_path,
            'dotenv_path': self._dotenv_path,
            'notebooks_files': self._notebook_files,
            'python_files': self._python_files,
            'metadata_files': self._metadata_files,
            'repository': self._repo,
        }
        return YamlDict(details)
Ejemplo n.º 7
0
 def get_info(self):
     """
     Collect and return the runtime session information: library and
     python versions, a fresh session id, the active profile, paths,
     username, repository data, discovered project files and the
     engine configuration.
     """
     root = paths.rootdir()

     # dotenv path is reported relative to the rootdir, or None when unset
     dotenv_rel = os.path.relpath(self._dotenv_path, root) if self._dotenv_path else None

     return {
         'dlf_version': __version__,
         'python_version': '.'.join(str(v) for v in sys.version_info[0:3]),
         'session_id': hex(uuid.uuid1().int >> 64),
         'profile': self._profile,
         'filename': os.path.relpath(files.get_current_filename(), root),
         'rootdir': root,
         'workdir': paths.workdir(),
         'username': getpass.getuser(),
         'repository': repo_data(),
         'files': {
             'notebooks': files.get_jupyter_notebook_files(root),
             'python': files.get_python_files(root),
             'metadata': files.get_metadata_files(root),
             'dotenv': dotenv_rel,
         },
         'engine': self._engine.config().to_dict(),
     }
Ejemplo n.º 8
0
 def resource(self, path=None, provider=None, md=None):
     """
     Resolve and return the metadata for a resource.

     :param path: resource path (optional)
     :param provider: provider name (optional)
     :param md: extra metadata overrides (default: empty dict)
     :return: the resolved resource metadata
     """
     # fix: the original used `md=dict()`, a shared mutable default that
     # persists across calls; use a None sentinel and build a fresh dict
     if md is None:
         md = {}

     self.check_if_loaded()
     md = get_resource_metadata(paths.rootdir(), self._metadata, path,
                                provider, md)
     return md
Ejemplo n.º 9
0
    def load(self, profile='default', rootpath=None):
        """
        Performs the following steps:
            - set rootdir for the given project
            - import variables from  <rootdir>/.env (if present),
            - load the `profile` from the metadata files
            - setup and start the data engine

        :param profile: load the given metadata profile (default: 'default')

        :param rootpath: root directory for loaded project
               default behaviour: search parent dirs to detect rootdir by
               looking for a '__main__.py' or 'main.ipynb' file.
               When such a file is found, the corresponding directory is the
               root path for the project. If nothing is found, the current
               working directory, will be the rootpath

        :return: None

        Notes about metadata configuration:

        1)  Metadata files are merged up, so you can split the information in
            multiple files as long as they end with `metadata.yml`.

            For example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml`
            are all valid metadata file names.

        2)  All metadata files in all subdirectories from the project root directory
            are loaded, unless the directory contains a file `metadata.ignore.yml`

        3)  Metadata files can provide multiple profile configurations,
            by separating each profile configuration with a Document Marker
            ( a line with `---`) (see https://yaml.org/spec/1.2/spec.html#YAML)

        4)  Each metadata profile, can be broken down in multiple yaml files,
            When loading the files all configuration belonging to the same profile
            will be merged.

        5)  All metadata profiles inherit the settings from profile 'default'

        Metadata files are composed of 6 sections:
            - profile
            - variables
            - providers
            - resources
            - engine
            - loggers

        For more information about metadata configuration,
        type `help(datalabframework.project.metadata)`
        """

        # skip reloading when already loaded and reload is disabled
        if self._loaded and self._no_reload:
            logging.notice(f"Profile {self._profile} already loaded. "
                           "Skipping project.load()")
            return self

        # set rootpath
        paths.set_rootdir(rootpath)

        # set loaded to false
        self._loaded = False

        # set session id (top 64 bits of a time-based uuid)
        self._session_id = hex(uuid.uuid1().int >> 64)

        # set username
        self._username = getpass.getuser()

        # get repo data
        self._repo = repo_data()

        # get currently running script path
        self._script_path = files.get_script_path(paths.rootdir())

        # set dotenv default file, check the file exists
        self._dotenv_path = files.get_dotenv_path(paths.rootdir())

        # get files
        self._metadata_files = files.get_metadata_files(paths.rootdir())
        self._notebook_files = files.get_jupyter_notebook_files(
            paths.rootdir())
        self._python_files = files.get_python_files(paths.rootdir())

        # metadata defaults (package schema first, then project files)
        dir_path = os.path.dirname(os.path.realpath(__file__))
        default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
        project_md_files = abspath(self._metadata_files, paths.rootdir())

        # load metadata; on failure, print the error and fall through to
        # the empty-metadata check below
        try:
            md_paths = default_md_files + project_md_files
            dotenv_path = abspath(self._dotenv_path, paths.rootdir())

            self._metadata = get_project_metadata(profile, md_paths,
                                                  dotenv_path)
        except ValueError as e:
            print(e)
            self._metadata = {}

        # bail if no metadata
        if not self._metadata:
            raise ValueError('No valid metadata to load.')

        # set profile from metadata
        self._profile = self._metadata['profile']

        # add rootpath to the list of python sys paths
        if paths.rootdir() not in sys.path:
            sys.path.append(paths.rootdir())

        # stop existing engine
        if self._engine:
            self._engine.stop()

        # craft the engine name: "<profile>-<repo name>" (repo name optional)
        L = [self._profile, self._repo.get('name')]
        name = '-'.join([x for x in L if x])

        # initialize the engine
        self._engine = dlf_engine.get(name, self._metadata, paths.rootdir())

        # initialize logging
        logging.init(self._metadata.get('loggers'), self._session_id,
                     self._username, self._script_path, self._repo['name'],
                     self._repo['hash'])

        # set loaded to True
        self._loaded = True

        # return object
        return self
Ejemplo n.º 10
0
    def resource(self, path=None, provider=None, md=None):
        """
        Resolve and return the metadata for a resource.

        :param path: resource path (optional)
        :param provider: provider name (optional)
        :param md: extra metadata overrides (default: empty dict)
        :return: the resolved resource metadata
        :raises ValueError: if no project profile has been loaded
        """
        # fix: the original used `md=dict()`, a shared mutable default that
        # persists across calls; use a None sentinel and build a fresh dict
        if md is None:
            md = {}

        if not self.profile:
            raise ValueError("No project profile loaded. Try first: datalabframework.project.load(...)")

        md = resource.get_metadata(paths.rootdir(), self._metadata, path, provider, md)
        return md
Ejemplo n.º 11
0
def process_metadata(md):
    """
    Normalize a resource metadata dict in place and return it.

    Fills in defaults (service, host, port, driver, schema), normalizes
    paths and service names, builds the connection url, and computes a
    stable hash identifying the resource.

    :param md: resource metadata dict; mutated in place
    :return: the same, normalized metadata dict
    """

    # derive the data format from the metadata (helper defined elsewhere)
    md['format'] = get_format(md)

    # if no service, at this point use file
    md['service'] = md['service'] or 'file'

    # standardize some service names
    services = {'minio': 's3a', 'local': 'file'}
    md['service'] = services.get(md['service'], md['service'])

    # if no host, use localhost
    md['host'] = md['host'] or '127.0.0.1'

    # if local file system and rel path, prepend rootdir
    if md['service'] in ['file', 'sqlite'] and not os.path.isabs(md['path']):
        md['path'] = os.path.join(rootdir(), md['path'])

    # if service is s3a, remove leading '/'
    if md['service'] == 's3a' and md['path']:
        md['path'] = md['path'].lstrip('/')

    # generate database, table from path
    if md['format'] == 'jdbc':
        md['database'], md['table'], md['path'] = path_to_jdbc(md)

        # set driver
        md['driver'] = md['driver'] or get_driver(md['service'])

        # if not table, provide a query that returns no rows
        md['table'] = md['table'] or 'SELECT 0 as result where 1 = 0'

        # if schema is not yet defined,
        # take the default for each service
        default_schemas = {
            'mysql': md['database'],
            'mssql': 'dbo',
            'postgres': 'public',
            'oracle': md['user']
        }

        md['schema'] = md['schema'] or default_schemas.get(md['service'])

        # wrap a raw SQL query as an aliased subquery for jdbc reads
        query = get_sql_query(md['table'])
        if query and not query.endswith('as _query'):
            md['table'] = '( {} ) as _query'.format(query)

    # default port per service; keep None when no port applies
    md['port'] = md['port'] or get_port(md['service'])
    md['port'] = int(md['port']) if md['port'] else None
    md['url'] = get_url(md)

    # options must be a dict
    if not isinstance(md['options'], dict):
        md['options'] = {}

    # infer compression from the path (not meaningful for jdbc resources)
    compression = get_compression(md['path'])
    if md['format'] != 'jdbc' and compression:
        md['options']['compression'] = compression

    # xor of crc32 checksums of the identifying fields -> stable resource hash
    h_list = []
    for k in ['url', 'format', 'table', 'database']:
        v = zlib.crc32(md[k].encode()) if md[k] else 0
        h_list.append(v)

    md['hash'] = functools.reduce(lambda a, b: a ^ b, h_list)
    # render as an unsigned, platform-sized hex string
    md['hash'] = hex(ctypes.c_size_t(md['hash']).value)

    return md