Exemple #1
0
    def test_multiple_files(self, tempdir):
        yml_1 = '''\
            ---
            a:
                b: 'ohoh'
            ---
            profile: second
            c:
                d: 'lalala'
           '''
        yml_2 = '''\
            ---
            resources:
                hello:
                    a:1
            ---
            profile: second
            resources:
                world:
                    b: 2
           '''

        tempdir.write('__main__.py', b'')
        tempdir.write('metadata.yml', dedent(yml_1).encode())
        tempdir.write('abc/metadata.yml', dedent(yml_2).encode())
        res = [
            os.path.join('metadata.yml'),
            os.path.join('abc', 'metadata.yml')
        ]
        assert (files.get_metadata_files(tempdir.path) == res)
Exemple #2
0
    def load(self,
             profile_name='default',
             metadata_files=None,
             dotenv_path=None):
        """
        Load the profile, given a list of yml files and a .env filename
        profiles inherit from the defaul profile, a profile not found will contain the same elements as the default profile

        :param profile_name: the profile to load (default: 'default')
        :param metadata_files: a list of metadata files to read
        :param dotenv_path: the path of a dotenv file to read
        :return: the loaded metadata profile dict
        """

        # get metadata by scanning rootdir, if no list is provided
        if metadata_files is None:
            metadata_files = []

            # defaults metadata
            dir_path = os.path.dirname(os.path.realpath(__file__))
            metadata_files += abspath(['schemas/default.yml'], dir_path)

            # project metadata
            metadata_files += abspath(
                files.get_metadata_files(paths.rootdir()), paths.rootdir())

        # get dotenv_path by scanning rootdir, if no dotenv file is provided
        if dotenv_path is None:
            dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()),
                                  paths.rootdir())

        # get env variables from .env file
        if dotenv_path and os.path.isfile(dotenv_path):
            load_dotenv(dotenv_path)

        profiles = self.read(metadata_files)

        # empty profile if profile not found
        if profile_name not in self._info['profiles']:
            self.raiseException(f'Profile "{profile_name}" not found.')

        # read metadata, get the profile, if not found get an empty profile
        profiles = self.inherit(profiles)
        metadata = profiles[profile_name]

        # render any jinja templates in the profile
        md = self.render(metadata)

        # validate
        self.validate(md)

        # format
        md = self.formatted(md)

        self._profile = YamlDict(md)
        self._info['active'] = profile_name
Exemple #3
0
    def load(self, profile='default', rootpath=None):
        """
        Performs the following steps:
            - set rootdir for the given project
            - import variables from  <rootdir>/.env (if present),
            - load the `profile` from the metadata files
            - setup and start the data engine

        :param profile: load the given metadata profile (default: 'default')
        
        :param rootpath: root directory for loaded project 
               default behaviour: search parent dirs to detect rootdir by 
               looking for a '__main__.py' or 'main.ipynb' file. 
               When such a file is found, the corresponding directory is the 
               root path for the project. If nothing is found, the current 
               working directory, will be the rootpath

        :return: None

        Notes abount metadata configuration:

        1)  Metadata files are merged up, so you can split the information in 
            multiple files as long as they end with `metadata.yml`. 

            For example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml` 
            are all valid metadata file names.

        2)  All metadata files in all subdirectories from the project root directory 
            are loaded, unless the directory contains a file `metadata.ignore.yml`

        3)  Metadata files can provide multiple profile configurations,
            by separating each profile configuration with a Document Marker 
            ( a line with `---`) (see https://yaml.org/spec/1.2/spec.html#YAML)

        4)  Each metadata profile, can be broken down in multiple yaml files,
            When loading the files all configuration belonging to the same profile 
            with be merged.

        5)  All metadata profiles inherit the settings from profile 'default'

        Metadata files are composed of 6 sections:
            - profile
            - variables
            - providers
            - resources
            - engine
            - loggers

        For more information about metadata configuration,
        type `help(datafaucet.project.metadata)`    
        """

        if self.loaded and self._no_reload:
            logging.notice(f"Profile {self._profile} already loaded. "
                           "Skipping project.load()")
            return self

        # set rootpath
        paths.set_rootdir(rootpath)

        # set loaded to false
        self.loaded = False

        # set username
        self._username = getpass.getuser()

        # get repo data
        self._repo = repo_data()

        # set session name
        L = [self._profile, self._repo.get('name')]
        self._session_name = '-'.join([x for x in L if x])

        # set session id
        self._session_id = hex(uuid.uuid1().int >> 64)

        # get currently running script path
        self._script_path = files.get_script_path(paths.rootdir())

        # set dotenv default file, check the file exists
        self._dotenv_path = files.get_dotenv_path(paths.rootdir())

        # get files
        self._metadata_files = files.get_metadata_files(paths.rootdir())
        self._notebook_files = files.get_jupyter_notebook_files(
            paths.rootdir())
        self._python_files = files.get_python_files(paths.rootdir())

        # metadata defaults
        dir_path = os.path.dirname(os.path.realpath(__file__))
        default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
        project_md_files = abspath(self._metadata_files, paths.rootdir())

        # load metadata
        try:
            md_paths = default_md_files + project_md_files
            dotenv_path = abspath(self._dotenv_path, paths.rootdir())

            metadata.load(profile, md_paths, dotenv_path)
        except ValueError as e:
            print(e)

        # bail if no metadata
        if metadata.profile is None:
            raise ValueError('No valid metadata to load.')

        # set profile from metadata
        self._profile_name = metadata.info()['active']

        # add roothpath to the list of python sys paths
        if paths.rootdir() not in sys.path:
            sys.path.append(paths.rootdir())

        # stop existing engine
        if self._engine:
            self._engine.stop()

        #services
        services = dict()

        all_aliases = list(metadata.profile()['providers'].keys())

        # get services from aliases
        for alias in all_aliases:
            r = Resource(alias)
            services[r['service']] = r

        # get one service from each type to
        # load drivers, jars etc via the engine init
        services = list(services.values())

        #initialize the engine
        md = metadata.profile()['engine']
        engines.Engine(md['type'],
                       session_name=self._session_name,
                       session_id=self._session_id,
                       master=md['master'],
                       timezone=md['timezone'],
                       jars=md['submit']['jars'],
                       packages=md['submit']['packages'],
                       pyfiles=md['submit']['py-files'],
                       files=md['submit']['files'],
                       repositories=md['submit']['repositories'],
                       conf=md['submit']['conf'],
                       services=services)

        # initialize logging
        logging.init(metadata.profile()['loggers'], self._session_id,
                     self._username, self._script_path, self._repo['name'],
                     self._repo['hash'])

        # set loaded to True
        self.loaded = True

        # return object
        return self