def load(self, profile_name='default', metadata_files=None, dotenv_path=None): """ Load the profile, given a list of yml files and a .env filename profiles inherit from the defaul profile, a profile not found will contain the same elements as the default profile :param profile_name: the profile to load (default: 'default') :param metadata_files: a list of metadata files to read :param dotenv_path: the path of a dotenv file to read :return: the loaded metadata profile dict """ # get metadata by scanning rootdir, if no list is provided if metadata_files is None: metadata_files = [] # defaults metadata dir_path = os.path.dirname(os.path.realpath(__file__)) metadata_files += abspath(['schemas/default.yml'], dir_path) # project metadata metadata_files += abspath( files.get_metadata_files(paths.rootdir()), paths.rootdir()) # get dotenv_path by scanning rootdir, if no dotenv file is provided if dotenv_path is None: dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()), paths.rootdir()) # get env variables from .env file if dotenv_path and os.path.isfile(dotenv_path): load_dotenv(dotenv_path) profiles = self.read(metadata_files) # empty profile if profile not found if profile_name not in self._info['profiles']: self.raiseException(f'Profile "{profile_name}" not found.') # read metadata, get the profile, if not found get an empty profile profiles = self.inherit(profiles) metadata = profiles[profile_name] # render any jinja templates in the profile md = self.render(metadata) # validate self.validate(md) # format md = self.formatted(md) self._profile = YamlDict(md) self._info['active'] = profile_name
def info(self):
    """Return a YamlDict snapshot of the current project session.

    Logs an error and returns None when no profile has been loaded
    yet via project.load(...).
    """
    if not self.loaded:
        logging.error(
            "No project profile loaded. "
            "Execute datafaucet.project.load(...) first.")
        return None

    # assemble the session/project state into a single mapping
    session_info = {
        'version': __version__,
        'username': self._username,
        'session_name': self._session_name,
        'session_id': self._session_id,
        'profile': self._profile,
        'rootdir': paths.rootdir(),
        'script_path': self._script_path,
        'dotenv_path': self._dotenv_path,
        'notebooks_files': self._notebook_files,
        'python_files': self._python_files,
        'metadata_files': self._metadata_files,
        'repository': self._repo,
    }
    return YamlDict(session_info)
def init_adapter(logger=None, sid=None):
    """Wrap *logger* in a LoggerAdapter carrying datafaucet context.

    :param logger: the logger to wrap
    :param sid: session id; when falsy, a new id is derived from uuid1
    :return: a LoggerAdapter whose extra mapping holds the dfc_* fields
    """
    if not sid:
        sid = hex(uuid.uuid1().int >> 64)

    repo = git.repo_data()

    # context fields injected into every log record
    context = {
        'dfc_sid': sid,
        'dfc_repohash': repo['hash'],
        'dfc_reponame': repo['name'],
        'dfc_username': getpass.getuser(),
        'dfc_filepath': files.get_script_path(paths.rootdir()),
    }

    return LoggerAdapter(logger, context)
def process_metadata(md):
    """Normalize a resource metadata dict in place and return it.

    Fills in defaults (service, host, driver, schema, version, port),
    canonicalizes service names and paths, builds the connection url,
    and computes a stable hash over the identifying fields.

    :param md: resource metadata dict (mutated in place)
    :return: the same dict, normalized
    """
    # derive the format from the rest of the metadata
    md['format'] = get_format(md)

    # with no service declared, fall back to the local filesystem
    md['service'] = md['service'] or 'file'

    # canonical service names
    aliases = {'minio': 's3a', 'local': 'file'}
    md['service'] = aliases.get(md['service'], md['service'])

    # default host is localhost
    md['host'] = md['host'] or '127.0.0.1'

    # relative paths on local services are anchored at the project rootdir
    if md['service'] in ('file', 'sqlite') and not os.path.isabs(md['path']):
        md['path'] = os.path.join(rootdir(), md['path'])

    # s3a object keys must not start with '/'
    if md['service'] == 's3a' and md['path']:
        md['path'] = md['path'].lstrip('/')

    # for jdbc, split the path into database and table
    if md['format'] == 'jdbc':
        md['database'], md['table'], md['path'] = path_to_jdbc(md)

    # pick the jdbc driver for the service, unless given
    md['driver'] = md['driver'] or get_driver(md['service'])

    # per-service default schema, used when none is configured
    schema_defaults = {
        'mysql': md['database'],
        'mssql': 'dbo',
        'postgres': 'public',
        'clickhouse': 'default',
        'oracle': md['user'],
    }
    md['schema'] = md['schema'] or schema_defaults.get(md['service'])

    # wrap a raw sql query as an aliased subquery, once
    query = get_sql_query(md['table'])
    if query and not query.endswith('as _query'):
        md['table'] = f'( {query} ) as _query'

    md['version'] = md['version'] or get_version(md['service'])

    md['port'] = md['port'] or get_port(md['service'])
    md['port'] = int(md['port']) if md['port'] else None

    md['url'] = get_url(md)

    if not isinstance(md['options'], dict):
        md['options'] = {}

    # non-jdbc sources may declare compression via the path suffix
    compression = get_compression(md['path'])
    if md['format'] != 'jdbc' and compression:
        md['options']['compression'] = compression

    # hash: xor-fold of crc32 over the identifying fields,
    # rendered as an unsigned platform-word hex string
    digest = 0
    for field in ('url', 'format', 'table', 'database'):
        value = md[field]
        digest ^= zlib.crc32(value.encode()) if value else 0
    md['hash'] = hex(ctypes.c_size_t(digest).value)

    return md
def load(self, profile='default', rootpath=None):
    """
    Performs the following steps:
    - set rootdir for the given project
    - import variables from <rootdir>/.env (if present),
    - load the `profile` from the metadata files
    - setup and start the data engine

    :param profile: load the given metadata profile (default: 'default')
    :param rootpath: root directory for loaded project
           default behaviour: search parent dirs to detect rootdir by
           looking for a '__main__.py' or 'main.ipynb' file. When such
           a file is found, the corresponding directory is the root
           path for the project. If nothing is found, the current
           working directory will be the rootpath
    :return: None

    Notes about metadata configuration:

    1) Metadata files are merged up, so you can split the information
       in multiple files as long as they end with `metadata.yml`.
       For example: `metadata.yml`, `abc.metadata.yaml`,
       `abc_metadata.yml` are all valid metadata file names.

    2) All metadata files in all subdirectories from the project root
       directory are loaded, unless the directory contains a file
       `metadata.ignore.yml`

    3) Metadata files can provide multiple profile configurations, by
       separating each profile configuration with a Document Marker
       (a line with `---`)
       (see https://yaml.org/spec/1.2/spec.html#YAML)

    4) Each metadata profile can be broken down in multiple yaml files.
       When loading the files, all configuration belonging to the same
       profile will be merged.

    5) All metadata profiles inherit the settings from profile 'default'

    Metadata files are composed of 6 sections:
        - profile
        - variables
        - providers
        - resources
        - engine
        - loggers

    For more information about metadata configuration,
    type `help(datafaucet.project.metadata)`
    """
    # skip reload when already loaded and reload is disabled
    # NOTE(review): the message interpolates self._profile, which looks
    # like the loaded profile object rather than its name — confirm
    # whether self._profile_name was intended here
    if self.loaded and self._no_reload:
        logging.notice(f"Profile {self._profile} already loaded. "
                       "Skipping project.load()")
        return self

    # set rootpath
    paths.set_rootdir(rootpath)

    # set loaded to false
    self.loaded = False

    # set username
    self._username = getpass.getuser()

    # get repo data
    self._repo = repo_data()

    # set session name from profile and repo name, skipping empty parts
    # NOTE(review): self._profile is read here before metadata.load()
    # runs below — verify it holds the intended value on first load
    L = [self._profile, self._repo.get('name')]
    self._session_name = '-'.join([x for x in L if x])

    # set session id (top 64 bits of a uuid1)
    self._session_id = hex(uuid.uuid1().int >> 64)

    # get currently running script path
    self._script_path = files.get_script_path(paths.rootdir())

    # set dotenv default file, check the file exists
    self._dotenv_path = files.get_dotenv_path(paths.rootdir())

    # discover project files under the root directory
    self._metadata_files = files.get_metadata_files(paths.rootdir())
    self._notebook_files = files.get_jupyter_notebook_files(
        paths.rootdir())
    self._python_files = files.get_python_files(paths.rootdir())

    # metadata defaults shipped with the package
    dir_path = os.path.dirname(os.path.realpath(__file__))
    default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
    project_md_files = abspath(self._metadata_files, paths.rootdir())

    # load metadata
    # NOTE(review): a ValueError here is only printed, not logged or
    # re-raised; the None check below is the actual failure gate
    try:
        md_paths = default_md_files + project_md_files
        dotenv_path = abspath(self._dotenv_path, paths.rootdir())
        metadata.load(profile, md_paths, dotenv_path)
    except ValueError as e:
        print(e)

    # bail if no metadata
    # NOTE(review): metadata.profile is compared without calling it,
    # while later code calls metadata.profile() — confirm the attribute
    # vs. call semantics in the metadata module
    if metadata.profile is None:
        raise ValueError('No valid metadata to load.')

    # set profile from metadata
    self._profile_name = metadata.info()['active']

    # add rootpath to the list of python sys paths
    if paths.rootdir() not in sys.path:
        sys.path.append(paths.rootdir())

    # stop existing engine
    if self._engine:
        self._engine.stop()

    # services
    services = dict()
    all_aliases = list(metadata.profile()['providers'].keys())

    # get services from aliases; later aliases with the same service
    # overwrite earlier ones, keeping one resource per service
    for alias in all_aliases:
        r = Resource(alias)
        services[r['service']] = r

    # get one service from each type to
    # load drivers, jars etc via the engine init
    services = list(services.values())

    # initialize the engine from the profile's engine section
    md = metadata.profile()['engine']
    engines.Engine(
        md['type'],
        session_name=self._session_name,
        session_id=self._session_id,
        master=md['master'],
        timezone=md['timezone'],
        jars=md['submit']['jars'],
        packages=md['submit']['packages'],
        pyfiles=md['submit']['py-files'],
        files=md['submit']['files'],
        repositories=md['submit']['repositories'],
        conf=md['submit']['conf'],
        services=services)

    # initialize logging with the session/repo context
    logging.init(
        metadata.profile()['loggers'],
        self._session_id,
        self._username,
        self._script_path,
        self._repo['name'],
        self._repo['hash'])

    # set loaded to True
    self.loaded = True

    # return object (allows call chaining)
    return self