def load(self, profile_name='default', metadata_files=None, dotenv_path=None): """ Load the profile, given a list of yml files and a .env filename profiles inherit from the defaul profile, a profile not found will contain the same elements as the default profile :param profile_name: the profile to load (default: 'default') :param metadata_files: a list of metadata files to read :param dotenv_path: the path of a dotenv file to read :return: the loaded metadata profile dict """ # get metadata by scanning rootdir, if no list is provided if metadata_files is None: metadata_files = [] # defaults metadata dir_path = os.path.dirname(os.path.realpath(__file__)) metadata_files += abspath(['schemas/default.yml'], dir_path) # project metadata metadata_files += abspath( files.get_metadata_files(paths.rootdir()), paths.rootdir()) # get dotenv_path by scanning rootdir, if no dotenv file is provided if dotenv_path is None: dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()), paths.rootdir()) # get env variables from .env file if dotenv_path and os.path.isfile(dotenv_path): load_dotenv(dotenv_path) profiles = self.read(metadata_files) # empty profile if profile not found if profile_name not in self._info['profiles']: self.raiseException(f'Profile "{profile_name}" not found.') # read metadata, get the profile, if not found get an empty profile profiles = self.inherit(profiles) metadata = profiles[profile_name] # render any jinja templates in the profile md = self.render(metadata) # validate self.validate(md) # format md = self.formatted(md) self._profile = YamlDict(md) self._info['active'] = profile_name
def __init__(self, logger, extra):
    """
    Wrap *logger* with contextual fields for every record.

    The context combines auto-detected values (user, current file,
    repository name/hash, CI pipeline ids, gitlab user) with the
    caller-supplied *extra* mapping; *extra* entries win on conflict.
    This mirrors the stdlib LoggerAdapter pattern:

        adapter = LoggerAdapter(someLogger, dict(p1=v1, p2="v2"))
    """
    self.logger = logger

    # auto-detected context; the CI pipeline ids fall back to a short
    # random token when the CI environment variables are absent
    context = {
        'dlf_username': getpass.getuser(),
        'dlf_filename': os.path.relpath(files.get_current_filename(),
                                        paths.rootdir()),
        'dlf_repo_name': repo_data()['name'],
        'dlf_repo_hash': repo_data()['hash'],
        'ci_pipeline_iid': os.environ.get('CI_PIPELINE_IID',
                                          str(uuid.uuid4())[:7]),
        'ci_pipeline_id': os.environ.get('CI_PIPELINE_ID',
                                         str(uuid.uuid4())[:7]),
        'gitlab_user_email': os.getenv('GITLAB_USER_EMAIL'),
        'gitlab_user_login': os.getenv('GITLAB_USER_LOGIN'),
    }
    context.update(extra)
    self.extra = context
def config(self):
    """
    Return the current project configuration as a YamlDict.

    Raises (via self.check_if_loaded) when no project has been loaded.
    """
    self.check_if_loaded()

    # assemble the snapshot in a plain dict, then wrap it once
    cfg = {}
    cfg['version'] = __version__
    cfg['username'] = self._username
    cfg['session_id'] = self._session_id
    cfg['profile'] = self._profile
    cfg['rootdir'] = paths.rootdir()
    cfg['script_path'] = self._script_path
    cfg['dotenv_path'] = self._dotenv_path
    cfg['notebooks_files'] = self._notebook_files
    cfg['python_files'] = self._python_files
    cfg['metadata_files'] = self._metadata_files
    cfg['repository'] = self._repo
    return YamlDict(cfg)
def load(self, profile='default', rootdir_path=None, search_parent_dirs=True, dotenv=True, factory_defaults=True):
    """
    Load the project: set rootdir, read the metadata profile, register
    the notebook import hook, and (re)start the data engine.

    :param profile: metadata profile to load (default: 'default')
    :param rootdir_path: project root directory; when None, presumably
        auto-detected by paths.set_rootdir — confirm against that helper
    :param search_parent_dirs: forwarded to paths.set_rootdir
    :param dotenv: when exactly True, use <rootdir>/.env as the dotenv
        file; any other value disables dotenv loading
    :param factory_defaults: forwarded to reader.load
    :return: self (allows call chaining)
    :raises ValueError: when no valid metadata could be loaded
    """
    # init workdir and rootdir paths
    paths.set_rootdir(rootdir_path, search_parent_dirs)

    # set dotenv default file (identity check: only literal True enables it)
    if dotenv is True:
        self._dotenv_path = os.path.join(paths.rootdir(), '.env')
    else:
        self._dotenv_path = None

    # metadata files discovered under the project root (relative paths)
    metadata_files = files.get_metadata_files(paths.rootdir())

    # load metadata; a ValueError from the reader is reported and
    # converted below into a single 'No valid metadata' failure
    try:
        md_files = [
            os.path.join(paths.rootdir(), x) for x in metadata_files]
        self._metadata = reader.load(profile, md_files, self._dotenv_path,
                                     factory_defaults)
    except ValueError as e:
        print(e)
        self._metadata = {}

    # bail if no metadata
    if not self._metadata:
        raise ValueError('No valid metadata to load.')

    # set profile from metadata
    self._profile = self._metadata['profile']

    # add rootpath to the list of python sys paths
    if paths.rootdir() not in sys.path:
        sys.path.append(paths.rootdir())

    # register hook for loading ipynb files (only once per interpreter)
    if 'NotebookFinder' not in str(sys.meta_path):
        sys.meta_path.append(NotebookFinder())

    # engine job name: "<profile>-<repo>" when a repo name is available
    repo_name = repo_data()['name']
    jobname = '{}-{}'.format(self._profile, repo_name) if repo_name else self._profile

    # stop existing engine before starting a new one
    if self._engine:
        self._engine.stop()
    self._engine = engine.get(jobname, self._metadata, paths.rootdir())

    # get all project info
    self._info = self.get_info()

    # initialize logging
    logging.init(self._metadata, self._info['session_id'])

    return self
def __init__(self, logger, extra):
    """
    Wrap *logger* with contextual fields for every record.

    The context combines auto-detected values (user, current file,
    repository name/hash) with the caller-supplied *extra* mapping;
    *extra* entries win on conflict. This mirrors the stdlib
    LoggerAdapter pattern:

        adapter = LoggerAdapter(someLogger, dict(p1=v1, p2="v2"))
    """
    self.logger = logger

    # auto-detected context, overridden by caller-supplied entries
    context = {
        'dlf_username': getpass.getuser(),
        'dlf_filename': os.path.relpath(files.get_current_filename(),
                                        paths.rootdir()),
        'dlf_repo_name': repo_data()['name'],
        'dlf_repo_hash': repo_data()['hash'],
    }
    context.update(extra)
    self.extra = context
def info(self):
    """
    Return a YamlDict snapshot of the loaded project.

    When no profile has been loaded yet, log an error and return None
    instead of raising.
    """
    if not self.loaded:
        logging.error("No project profile loaded. "
                      "Execute datalabframework.project.load(...) first.")
        return None

    # snapshot of session, profile, paths and discovered project files
    snapshot = dict(
        version=__version__,
        username=self._username,
        session_name=self._session_name,
        session_id=self._session_id,
        profile=self._profile,
        rootdir=paths.rootdir(),
        script_path=self._script_path,
        dotenv_path=self._dotenv_path,
        notebooks_files=self._notebook_files,
        python_files=self._python_files,
        metadata_files=self._metadata_files,
        repository=self._repo,
    )
    return YamlDict(snapshot)
def get_info(self):
    """
    Build a fresh info dict for the current session: framework and
    python versions, a new session id, profile, paths, user, repository
    data, discovered project files, and the engine configuration.
    """
    # hoist the repeated rootdir lookup; used by most entries below
    rootdir = paths.rootdir()

    py_version = '.'.join(map(str, sys.version_info[:3]))

    # dotenv path is reported relative to the project root, when set
    if self._dotenv_path:
        dotenv_rel = os.path.relpath(self._dotenv_path, rootdir)
    else:
        dotenv_rel = None

    return {
        'dlf_version': __version__,
        'python_version': py_version,
        'session_id': hex(uuid.uuid1().int >> 64),
        'profile': self._profile,
        'filename': os.path.relpath(files.get_current_filename(), rootdir),
        'rootdir': rootdir,
        'workdir': paths.workdir(),
        'username': getpass.getuser(),
        'repository': repo_data(),
        'files': {
            'notebooks': files.get_jupyter_notebook_files(rootdir),
            'python': files.get_python_files(rootdir),
            'metadata': files.get_metadata_files(rootdir),
            'dotenv': dotenv_rel,
        },
        'engine': self._engine.config().to_dict()
    }
def resource(self, path=None, provider=None, md=None):
    """
    Resolve the metadata for a resource.

    :param path: resource path (optional)
    :param provider: provider name (optional)
    :param md: extra metadata overrides; when omitted a fresh empty
        dict is used on each call
    :return: the resolved resource metadata
    """
    self.check_if_loaded()
    # fix: the previous default `md=dict()` was a mutable default
    # argument, shared across all calls — any downstream mutation
    # would leak between invocations
    if md is None:
        md = {}
    md = get_resource_metadata(paths.rootdir(), self._metadata, path,
                               provider, md)
    return md
def load(self, profile='default', rootpath=None):
    """
    Performs the following steps:

    - set rootdir for the given project
    - import variables from <rootdir>/.env (if present),
    - load the `profile` from the metadata files
    - setup and start the data engine

    :param profile: load the given metadata profile (default: 'default')
    :param rootpath: root directory for loaded project
        default behaviour: search parent dirs to detect rootdir by
        looking for a '__main__.py' or 'main.ipynb' file. When such a
        file is found, the corresponding directory is the root path for
        the project. If nothing is found, the current working directory
        will be the rootpath
    :return: self

    Notes about metadata configuration:

    1) Metadata files are merged up, so you can split the information
       in multiple files as long as they end with `metadata.yml`. For
       example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml`
       are all valid metadata file names.

    2) All metadata files in all subdirectories from the project root
       directory are loaded, unless the directory contains a file
       `metadata.ignore.yml`

    3) Metadata files can provide multiple profile configurations, by
       separating each profile configuration with a Document Marker
       (a line with `---`) (see https://yaml.org/spec/1.2/spec.html#YAML)

    4) Each metadata profile can be broken down in multiple yaml files.
       When loading the files, all configuration belonging to the same
       profile will be merged.

    5) All metadata profiles inherit the settings from profile 'default'

    Metadata files are composed of 6 sections:

    - profile
    - variables
    - providers
    - resources
    - engine
    - loggers

    For more information about metadata configuration, type
    `help(datalabframework.project.metadata)`
    """
    # no-op when already loaded and reload is disabled
    if self._loaded and self._no_reload:
        logging.notice(f"Profile {self._profile} already loaded. "
                       "Skipping project.load()")
        return self

    # set rootpath
    paths.set_rootdir(rootpath)

    # mark not-loaded until the whole sequence succeeds
    self._loaded = False

    # set session id (upper 64 bits of a time-based uuid)
    self._session_id = hex(uuid.uuid1().int >> 64)

    # set username
    self._username = getpass.getuser()

    # get repo data
    self._repo = repo_data()

    # get currently running script path
    self._script_path = files.get_script_path(paths.rootdir())

    # set dotenv default file, check the file exists
    self._dotenv_path = files.get_dotenv_path(paths.rootdir())

    # discover project files
    self._metadata_files = files.get_metadata_files(paths.rootdir())
    self._notebook_files = files.get_jupyter_notebook_files(
        paths.rootdir())
    self._python_files = files.get_python_files(paths.rootdir())

    # metadata defaults ship with the package, next to this module
    dir_path = os.path.dirname(os.path.realpath(__file__))
    default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
    project_md_files = abspath(self._metadata_files, paths.rootdir())

    # load metadata; reader errors are reported and converted below
    # into a single 'No valid metadata' failure
    try:
        md_paths = default_md_files + project_md_files
        dotenv_path = abspath(self._dotenv_path, paths.rootdir())
        self._metadata = get_project_metadata(profile, md_paths,
                                              dotenv_path)
    except ValueError as e:
        print(e)
        self._metadata = {}

    # bail if no metadata
    if not self._metadata:
        raise ValueError('No valid metadata to load.')

    # set profile from metadata
    self._profile = self._metadata['profile']

    # add rootpath to the list of python sys paths
    if paths.rootdir() not in sys.path:
        sys.path.append(paths.rootdir())

    # stop existing engine
    if self._engine:
        self._engine.stop()

    # craft the engine name: "<profile>-<repo>" (skipping empty parts)
    L = [self._profile, self._repo.get('name')]
    name = '-'.join([x for x in L if x])

    # initialize the engine
    self._engine = dlf_engine.get(name, self._metadata, paths.rootdir())

    # initialize logging
    logging.init(self._metadata.get('loggers'), self._session_id,
                 self._username, self._script_path, self._repo['name'],
                 self._repo['hash'])

    # set loaded to True
    self._loaded = True

    # return object
    return self
def resource(self, path=None, provider=None, md=None):
    """
    Resolve the metadata for a resource.

    :param path: resource path (optional)
    :param provider: provider name (optional)
    :param md: extra metadata overrides; when omitted a fresh empty
        dict is used on each call
    :return: the resolved resource metadata
    :raises ValueError: when no project profile has been loaded
    """
    if not self.profile:
        raise ValueError("No project profile loaded. Try first: datalabframework.project.load(...)")
    # fix: the previous default `md=dict()` was a mutable default
    # argument, shared across all calls — any downstream mutation
    # would leak between invocations
    if md is None:
        md = {}
    md = resource.get_metadata(paths.rootdir(), self._metadata, path,
                               provider, md)
    return md
def process_metadata(md):
    """
    Normalize a resource-metadata dict in place and return it.

    Fills in defaults (service, host, driver, schema, port), rewrites
    paths, builds the connection url, and computes a stable hash over
    (url, format, table, database).

    NOTE(review): `md` is mutated in place and is expected to already
    contain the keys read below ('service', 'host', 'path', 'format',
    'driver', 'table', 'schema', 'user', 'database', 'port',
    'options') — confirm against the caller that builds it.
    """
    # derive the storage format (e.g. from path/extension/service)
    md['format'] = get_format(md)

    # if no service, at this point use file
    md['service'] = md['service'] or 'file'

    # standardize some service names
    services = {'minio': 's3a', 'local': 'file'}
    md['service'] = services.get(md['service'], md['service'])

    # if no host, use localhost
    md['host'] = md['host'] or '127.0.0.1'

    # if local file system and rel path, prepend rootdir
    if md['service'] in ['file', 'sqlite'] and not os.path.isabs(md['path']):
        md['path'] = os.path.join(rootdir(), md['path'])

    # if service is s3a, remove leading '/' (s3 keys are not rooted)
    if md['service'] == 's3a' and md['path']:
        md['path'] = md['path'].lstrip('/')

    # generate database, table from path
    if md['format'] == 'jdbc':
        md['database'], md['table'], md['path'] = path_to_jdbc(md)

    # set driver
    md['driver'] = md['driver'] or get_driver(md['service'])

    # if no table, provide a query that returns no rows
    md['table'] = md['table'] or 'SELECT 0 as result where 1 = 0'

    # if schema is not yet defined,
    # take the default for each service
    default_schemas = {
        'mysql': md['database'],
        'mssql': 'dbo',
        'postgres': 'public',
        'oracle': md['user']
    }
    md['schema'] = md['schema'] or default_schemas.get(md['service'])

    # wrap a raw SQL query as a jdbc-compatible derived table
    query = get_sql_query(md['table'])
    if query and not query.endswith('as _query'):
        md['table'] = '( {} ) as _query'.format(query)

    # default port per service, normalized to int (or None)
    md['port'] = md['port'] or get_port(md['service'])
    md['port'] = int(md['port']) if md['port'] else None

    md['url'] = get_url(md)

    # make sure 'options' is a dict before adding entries to it
    if not isinstance(md['options'], dict):
        md['options'] = {}

    # record compression inferred from the path (non-jdbc only)
    compression = get_compression(md['path'])
    if md['format'] != 'jdbc' and compression:
        md['options']['compression'] = compression

    # xor of crc32 checksums over the identifying fields (missing
    # fields contribute 0), rendered as an unsigned hex string
    h_list = []
    for k in ['url', 'format', 'table', 'database']:
        v = zlib.crc32(md[k].encode()) if md[k] else 0
        h_list.append(v)

    md['hash'] = functools.reduce(lambda a, b: a ^ b, h_list)
    md['hash'] = hex(ctypes.c_size_t(md['hash']).value)

    return md