Example #1
def assemble_metadata(md):
    keys = [
        'hash',
        'url',
        'service',
        'version',
        'format',
        'host'
    ]

    if md['service'] != 'file':
        keys.append('port')

    if md['service'] == 's3a' or md['format'] == 'jdbc':
        keys.extend([
            'user',
            'password'])

    if md['format'] == 'jdbc':
        keys.extend([
            'driver',
            'database',
            'schema',
            'table'])

    keys.append('options')
    return YamlDict(to_ordered_dict(md, keys))
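A minimal usage sketch (all values below are hypothetical, chosen only to exercise the jdbc branch; to_ordered_dict and YamlDict are assumed to be the project's own helpers):

# hypothetical jdbc resource metadata (illustrative values only)
md = {
    'hash': '0x01',
    'url': 'jdbc:postgresql://db:5432/sales',
    'service': 'postgres',
    'version': '11',
    'format': 'jdbc',
    'host': 'db',
    'port': 5432,
    'user': 'etl',
    'password': 'secret',
    'driver': 'org.postgresql.Driver',
    'database': 'sales',
    'schema': 'public',
    'table': 'orders',
    'options': {},
}

# 'port' is included because service != 'file'; credentials and the
# jdbc-specific keys are included because format == 'jdbc'
print(assemble_metadata(md))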
Example #2
    def load(self,
             profile_name='default',
             metadata_files=None,
             dotenv_path=None):
        """
        Load the profile, given a list of yml files and a .env filename
        profiles inherit from the defaul profile, a profile not found will contain the same elements as the default profile

        :param profile_name: the profile to load (default: 'default')
        :param metadata_files: a list of metadata files to read
        :param dotenv_path: the path of a dotenv file to read
        :return: the loaded metadata profile dict
        """

        # get metadata by scanning rootdir, if no list is provided
        if metadata_files is None:
            metadata_files = []

            # default metadata
            dir_path = os.path.dirname(os.path.realpath(__file__))
            metadata_files += abspath(['schemas/default.yml'], dir_path)

            # project metadata
            metadata_files += abspath(
                files.get_metadata_files(paths.rootdir()), paths.rootdir())

        # get dotenv_path by scanning rootdir, if no dotenv file is provided
        if dotenv_path is None:
            dotenv_path = abspath(files.get_dotenv_path(paths.rootdir()),
                                  paths.rootdir())

        # get env variables from .env file
        if dotenv_path and os.path.isfile(dotenv_path):
            load_dotenv(dotenv_path)

        profiles = self.read(metadata_files)

        # raise an exception if the profile is not found
        if profile_name not in self._info['profiles']:
            self.raiseException(f'Profile "{profile_name}" not found.')

        # resolve profile inheritance, then select the requested profile
        profiles = self.inherit(profiles)
        metadata = profiles[profile_name]

        # render any jinja templates in the profile
        md = self.render(metadata)

        # validate
        self.validate(md)

        # format
        md = self.formatted(md)

        self._profile = YamlDict(md)
        self._info['active'] = profile_name
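A usage sketch; the owning class is not shown in the snippet, so Metadata is a hypothetical name for it:

# sketch: load a named profile; metadata files and the .env file are
# discovered by scanning rootdir when the arguments are omitted
m = Metadata()                    # hypothetical owning class
m.load(profile_name='default')    # stores the profile in m._profile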
Example #3
    def get_environment(self):
        vars = [
            'SPARK_HOME',
            'HADOOP_HOME',
            'JAVA_HOME',
            'PYSPARK_PYTHON',
            'PYSPARK_DRIVER_PYTHON',
            'PYTHONPATH',
            'PYSPARK_SUBMIT_ARGS',
            'SPARK_DIST_CLASSPATH',
        ]

        return YamlDict({v: os.environ.get(v) for v in vars})
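A short sketch of reading the result; unset variables simply map to None, since os.environ.get is used:

# sketch: inspect the spark-related environment ('engine' is an
# assumed instance of the class that defines get_environment)
env = engine.get_environment()
print(env['SPARK_HOME'])    # None if the variable is not set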
Example #4
    def start_session(self, conf):
        try:
            # init the spark session
            session = pyspark.sql.SparkSession.builder.config(
                conf=conf).getOrCreate()

            # store the spark session
            self.session = session

            # fix SQLContext for backward compatibility
            initialize_spark_sql_context(session, session.sparkContext)

            # pyspark set log level method
            # (this will not suppress WARN before starting the context)
            session.sparkContext.setLogLevel("ERROR")

            # bootstrap datafaucet.zip in the cluster
            if not self.is_spark_local():
                dir_path = os.path.dirname(os.path.realpath(__file__))
                filename = os.path.abspath(
                    os.path.join(dir_path, 'dist/datafaucet.zip'))
                session.sparkContext.addPyFile(filename)

            # collect configuration
            self.conf = dict(session.sparkContext.getConf().getAll())

            # set the engine version
            self.version = session.version

            # set environment
            self.env = self.get_environment()

            # set info
            self.info['spark_classpath'] = self.info['spark_classpath'][
                0].split(' ')
            self.info = YamlDict(self.info)

            # log that the engine context started
            logging.notice(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # session is running
            self.stopped = False

        except Exception as e:
            print(e)
            logging.error('Could not start the engine context')
            return None
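A sketch of building the conf argument, assuming 'engine' is an instance of the class that defines start_session:

import pyspark

# sketch: build a SparkConf and hand it to start_session
conf = pyspark.SparkConf() \
    .setAppName('demo') \
    .setMaster('local[*]')
engine.start_session(conf)    # 'engine' is an assumed instance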
Example #5
    def info(self):
        if not self.loaded:
            logging.error("No project profile loaded. " +
                          "Execute datafaucet.project.load(...) first.")
            return None

        return YamlDict({
            'version': __version__,
            'username': self._username,
            'session_name': self._session_name,
            'session_id': self._session_id,
            'profile': self._profile,
            'rootdir': paths.rootdir(),
            'script_path': self._script_path,
            'dotenv_path': self._dotenv_path,
            'notebooks_files': self._notebook_files,
            'python_files': self._python_files,
            'metadata_files': self._metadata_files,
            'repository': self._repo
        })
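A usage sketch that follows the error message above: load a project profile first, then read the info dict:

import datafaucet

# sketch: info() is only meaningful after a profile has been loaded
datafaucet.project.load()
print(datafaucet.project.info())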
Example #6
    def __init__(self,
                 session_name=None,
                 session_id=0,
                 master=None,
                 timezone=None,
                 jars=None,
                 packages=None,
                 pyfiles=None,
                 files=None,
                 repositories=None,
                 services=None,
                 conf=None):

        # call the base class:
        # stop the previous instance,
        # register self as the new instance
        super().__init__('dask', session_name, session_id)

        # bundle all submit arguments in a dictionary
        self.submit = {
            'jars': [jars] if isinstance(jars, str) else jars or [],
            'packages': [packages] if isinstance(packages, str)
                        else packages or [],
            'py-files': [pyfiles] if isinstance(pyfiles, str)
                        else pyfiles or [],
            'files': [files] if isinstance(files, str) else files or [],
            'repositories': [repositories] if isinstance(repositories, str)
                            else repositories or [],
            'conf': [conf] if isinstance(conf, tuple) else conf or [],
        }

        # collect info
        self.set_info()

        # detect packages and configuration from services
        detected = self.detect_submit_params(services)

        # merge up with those passed with the init
        for k in self.submit.keys():
            self.submit[k] = list(sorted(set(self.submit[k] + detected[k])))

        # set submit args via env variable
        self.set_submit_args()

        # set other environment variables
        self.set_env_variables()

        # announce the dask context
        print("Setting context to dask.")

        # config options passed through the api call go via the conf object
        for c in self.submit['conf']:
            k, v, *_ = list(c) + ['']
            if isinstance(v, (bool, int, float, str)):
                #todo:
                #conf.set(k, v)
                pass

        # stop the current session if running
        self._stop()

        # start the dask context
        session = self.start_context(conf)

        # record the data in the engine object for debug and future references
        self.conf = YamlDict(nested_to_record(get_options(pd)))

        if session:
            # set the engine version
            self.version = dask.__version__

            # set environment
            self.env = self.get_environment()

            # record the data in the engine object for debug and future references
            self.conf = YamlDict(nested_to_record(get_options(pd)))

            # log that the engine context started
            print(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # store the dask session
            self.context = session

            # session is running
            self.stopped = False
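A construction sketch; the class name is not visible in the snippet, so DaskEngine is an assumption:

# sketch: start a dask engine context (class name assumed)
engine = DaskEngine(session_name='demo', session_id=1)
print(engine.conf)    # pandas/dask options recorded at startup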
Example #7
    def get_environment(self):
        vars = ['SPARK_HOME', 'JAVA_HOME', 'PYTHONPATH']

        return YamlDict({v: os.environ.get(v) for v in vars})
Example #8
    def __init__(self,
                 session_name=None,
                 session_id=0,
                 master='local[*]',
                 timezone=None,
                 jars=None,
                 packages=None,
                 pyfiles=None,
                 files=None,
                 repositories=None,
                 services=None,
                 conf=None):

        # call the base class:
        # stop the previous instance,
        # register self as the new instance
        super().__init__('spark', session_name, session_id)

        # bundle all submit arguments in a dictionary
        self.submit = {
            'jars': [jars] if isinstance(jars, str) else jars or [],
            'packages': [packages] if isinstance(packages, str)
                        else packages or [],
            'py-files': [pyfiles] if isinstance(pyfiles, str)
                        else pyfiles or [],
            'files': [files] if isinstance(files, str) else files or [],
            'repositories': [repositories] if isinstance(repositories, str)
                            else repositories or [],
            'conf': conf or {},
        }

        # suppress INFO logging for java_gateway
        python_logging.getLogger('py4j.java_gateway').setLevel(
            python_logging.ERROR)

        # collect info
        self.set_info()

        # detect packages and configuration from services
        detected = self.detect_submit_params(services)

        # merge up with those passed with the init
        for k in self.submit.keys() - {'conf'}:
            self.submit[k] = list(sorted(set(self.submit[k] + detected[k])))
        self.submit['conf'] = merge(detected['conf'], self.submit['conf'])

        # set submit args via env variable
        self.set_submit_args()

        # set other spark-related environment variables
        self.set_env_variables()

        # set spark conf object
        print(f"Connecting to spark master: {master}")

        conf = pyspark.SparkConf()
        self.set_conf_timezone(conf, timezone)

        # set session name
        conf.setAppName(session_name)

        # set master
        conf.setMaster(master)

        # config options passed through the api call go via the conf object
        for c in self.submit['conf']:
            k, v, *_ = list(c) + ['']
            if isinstance(v, (bool, int, float, str)):
                conf.set(k, v)

        # stop the current session if running
        self._stop()

        # start spark
        spark_session = self.start_context(conf)

        # record the data in the engine object for debug and future references
        self.conf = YamlDict(dict(conf.getAll()))

        if spark_session:
            self.conf = dict(spark_session.sparkContext.getConf().getAll())

            # set version if spark is loaded
            self._version = spark_session.version
            print(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # store the spark session
            self.context = spark_session

            # session is running
            self.stopped = False
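A construction sketch; as above, the class name SparkEngine is an assumption, while the parameters mirror the signature:

# sketch: a local spark engine with one extra package (class name assumed)
engine = SparkEngine(
    session_name='demo',
    master='local[2]',
    packages=['org.postgresql:postgresql:42.2.5'])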
Example #9
    def info(self):
        return YamlDict(self._info)