def inherit(self, profiles):
    """
    Let every profile inherit from the 'default' profile.

    For each top-level key of the 'default' profile, the matching entry of
    every other profile is replaced by ``merge(default_value, own_value)``;
    a key that a profile does not define is handed to ``merge`` as ``None``.
    The dictionary is updated in place.

    :param profiles: dict of profiles
    :return: dict of profiles (the same object that was passed in)
    """
    base = profiles.get('default', {})

    for key in base.keys():
        # every profile except 'default' itself inherits this key
        heirs = set(profiles.keys()) - {'default'}
        for name in heirs:
            inherited = merge(base[key], profiles[name].get(key))
            profiles[name][key] = inherited

    return profiles
def test_minimal(self, tempdir):
    """
    Build resource metadata from provider metadata only and compare it with
    the empty template merged with the expected provider fields.
    """
    provider_md = {'service': 'local', 'format': 'csv', 'path': tempdir.path}

    # noinspection PyProtectedMember
    actual = resource._build_resource_metadata(tempdir.path, pmd=provider_md)

    # expected result: the empty template, overridden by the provider values
    baseline = self.empty(tempdir).copy()
    expected = merge(
        baseline,
        {
            'provider_path': tempdir.path,
            'service': 'local',
            'format': 'csv',
            'url': f'{tempdir.path}'
        })

    assert actual == expected
def to_resource(url_alias=None, *args, **kwargs):
    """
    Resolve a dictionary, a metadata alias or a urn/path into a metadata dict.

    Resolution order:
      1. a dict is converted with ``resource_from_dict``;
      2. if a metadata profile is loaded, the value is looked up among the
         profile's 'resources' and then its 'providers';
      3. any other non-empty value is parsed as a urn/path;
      4. otherwise the default metadata is used.

    A non-empty 'path' in the result is parsed as a urn in turn, and the
    keyword arguments are finally applied through ``metadata_overrides``.

    :param url_alias: dict, alias name, url or path (or None)
    :param args: accepted but not used
    :param kwargs: overrides forwarded to ``metadata_overrides``
    :return: metadata dict, without 'hostname' / 'username' keys
    """
    # a dictionary is converted directly
    md = resource_from_dict(url_alias) if isinstance(url_alias, dict) else None

    # with a loaded profile, try the aliases: resources first, then providers
    if metadata.profile():
        for section in ('resources', 'providers'):
            if not md and url_alias in metadata.profile().get(section, {}).keys():
                md = metadata.profile()[section][url_alias]

    # nothing found yet: interpret the value as a urn/path
    if not md and url_alias:
        md = resource_from_urn(urnparse(url_alias))

    # still nothing: start from the empty default
    if not md:
        md = get_default_md()

    # sanitize the path when it is itself a url or a query
    if md['path']:
        url_md = resource_from_urn(urnparse(md['path']))
        md = merge(url_md, md)
        md['path'] = url_md['path']

    # explicit keyword arguments are applied last
    md = metadata_overrides(md, **kwargs)

    # the alternative spellings are not part of the returned metadata
    for alternative_key in ('hostname', 'username'):
        if alternative_key in md:
            del md[alternative_key]

    return md
def metadata_overrides(md,
                       host=None,
                       service=None,
                       port=None,
                       user=None,
                       password=None,
                       driver=None,
                       database=None,
                       schema=None,
                       table=None,
                       format=None,
                       version=None,
                       hostname=None,
                       username=None,
                       **options):
    """
    Build a new metadata dict from ``md``, letting explicit arguments win.

    Each explicit argument is used when truthy; otherwise the value stored in
    ``md`` is taken. ``hostname`` / ``username`` are accepted as alternative
    spellings of ``host`` / ``user``. 'path' is read from the 'url' entry of
    ``md`` when set, else from its 'path' entry, and is reset to ``None``
    when a database or a table is passed explicitly.

    :param md: source metadata; entries read with ``md[...]`` must be present
    :param options: extra keyword arguments, combined with ``md['options']``
        through ``merge``
    :return: a new dict; ``md`` itself is not assigned to
    """
    overridden = {
        'path': md.get('url') or md.get('path'),
        'provider': md.get('provider'),
        'host': host or hostname or md['host'] or md.get('hostname'),
        'port': port or md['port'],
        'service': service or md['service'],
        'format': format or md['format'],
        'version': version or md['version'],
        'user': user or username or md['user'] or md.get('username'),
        'password': password or md['password'],
        'database': database or md['database'],
        'schema': schema or md['schema'],
        'table': table or md['table'],
        'driver': driver or md['driver'],
        'options': merge(md['options'], options),
    }

    # an explicit database or table supersedes any path
    if database or table:
        overridden['path'] = None

    return overridden
def read(self, file_paths=None):
    """
    Return all profiles, stored in a nested dictionary.

    Profiles are merged over the list of provided metadata files. The order
    of the files determines how profile properties are overridden: each
    document is merged on top of what was collected so far for its profile.
    A document without a 'profile' key belongs to the 'default' profile.

    Paths that are not existing files are ignored. A file that cannot be
    parsed is logged and skipped as a whole; empty yaml documents are
    skipped. ``self._info['files']`` receives the files that were loaded and
    ``self._info['profiles']`` the sorted profile names.

    :param file_paths: list of yaml files paths
    :return: dict of profiles
    """
    # empty profiles, before start reading
    profiles = {}
    self._info['files'] = []

    for filename in file_paths or []:
        if not os.path.isfile(filename):
            continue

        with open(filename, 'r') as f:
            try:
                # NOTE(review): load_all without an explicit Loader can build
                # arbitrary objects from untrusted yaml, and newer PyYAML
                # releases require the Loader argument; consider
                # yaml.safe_load_all if the metadata files need no custom tags.
                docs = list(yaml.load_all(f))
            except yaml.YAMLError as e:
                # skip the whole file, so that nothing half-parsed (or left
                # over from a previous file) gets merged
                mark = getattr(e, 'problem_mark', None)
                if mark is not None:
                    logging.error(
                        "Error loading yml file %s at position: (%s:%s): skipping file",
                        filename, mark.line + 1, mark.column + 1)
                else:
                    logging.error(
                        "Error loading yml file %s: skipping file", filename)
                continue

        self._info['files'].append(filename)

        for doc in docs:
            # an empty yaml document is parsed as None
            if doc is None:
                continue
            doc['profile'] = doc.get('profile', 'default')
            profiles[doc['profile']] = merge(
                profiles.get(doc['profile'], {}), doc)

    self._info['profiles'] = sorted(profiles)
    return profiles
def process(self, msg, kwargs):
    """
    Attach contextual information to a logging call.

    The adapter's ``extra`` mapping is updated in place with the value of
    ``func_name(5)`` under 'dfc_funcname' and with the log payload under
    'dfc_data', and is then passed on as ``kwargs['extra']``.

    - a mapping message is merged with the caller's 'extra' and logged with
      the fixed text 'data';
    - a string message is kept as is, and the caller's 'extra' becomes the
      payload.

    :param msg: log message, a str or a mutable mapping
    :param kwargs: keyword arguments of the logging call
    :return: tuple (msg, kwargs)
    :raises ValueError: if msg is neither a str nor a mutable mapping
    """
    context = self.extra
    # NOTE(review): func_name takes a depth argument, presumably a stack
    # depth - keep this call directly in this method body.
    context.update({'dfc_funcname': func_name(5)})

    if not isinstance(msg, (MutableMapping, str)):
        raise ValueError('log message must be a str or a dict')

    if isinstance(msg, MutableMapping):
        payload = merge(msg, kwargs.get('extra', {}))
        msg = 'data'
    else:
        payload = kwargs.get('extra', {})

    context.update({'dfc_data': payload})
    kwargs["extra"] = context
    return msg, kwargs
def test_resource_provider(self, tempdir):
    """
    Build resource metadata from a provider rooted at the temp directory and
    a resource with a relative path, and compare it with the empty template
    merged with the expected fields.
    """
    provider_md = {
        'alias': 'p',
        'service': 'local',
        'format': 'csv',
        'path': tempdir.path
    }
    resource_md = {'alias': 'r', 'path': 'abc/def'}

    # noinspection PyProtectedMember
    actual = resource._build_resource_metadata(
        tempdir.path, pmd=provider_md, rmd=resource_md)

    # expected result: the resource path is appended to the provider path
    baseline = self.empty(tempdir).copy()
    expected = merge(
        baseline,
        {
            'provider_path': tempdir.path,
            'provider_alias': 'p',
            'resource_alias': 'r',
            'resource_path': 'abc/def',
            'service': 'local',
            'format': 'csv',
            'url': f'{tempdir.path}/abc/def'
        })

    assert actual == expected
def Resource(path_or_alias_or_url=None,
             provider_path_or_alias_or_url=None,
             host=None,
             service=None,
             port=None,
             user=None,
             password=None,
             driver=None,
             database=None,
             schema=None,
             table=None,
             format=None,
             version=None,
             hostname=None,
             username=None,
             **options):
    """
    Assemble the metadata of a resource from a resource and a provider.

    Both the resource and the provider may be given as a path, a metadata
    alias or a url and are resolved with ``to_resource``; the explicit
    keyword arguments and ``options`` are applied to the resource only.
    When no provider is passed, the 'provider' entry of the resolved
    resource is used.

    :param path_or_alias_or_url: the resource
    :param provider_path_or_alias_or_url: the provider (optional)
    :param options: extra keyword arguments forwarded to ``to_resource``
    :return: the metadata returned by ``assemble_metadata``
    """
    overrides = dict(
        host=host,
        service=service,
        port=port,
        user=user,
        password=password,
        driver=driver,
        database=database,
        schema=schema,
        table=table,
        format=format,
        version=version,
        hostname=hostname,
        username=username)

    # get the resource, by alias metadata or by url
    resource_md = to_resource(path_or_alias_or_url, **overrides, **options)

    # the provider is either passed explicitly or referenced by the resource
    provider_ref = provider_path_or_alias_or_url or resource_md.get('provider')

    # get the provider, by alias metadata or by url
    provider_md = to_resource(provider_ref)

    # check if the provider is a jdbc connection, if so set it
    jdbc_database, jdbc_table, jdbc_path = path_to_jdbc(provider_md, True)
    provider_md['database'] = jdbc_database
    provider_md['table'] = jdbc_table
    provider_md['path'] = jdbc_path

    # merge provider and resource metadata
    md = merge(provider_md, resource_md)

    # a table supersedes the path; otherwise concatenate the two paths
    md['path'] = None if md['table'] else os.path.join(
        provider_md['path'] or '', resource_md['path'] or '')

    # todo: verify resource
    # check format and other minimum requirements are met

    # process, then assemble the output
    return assemble_metadata(process_metadata(md))
def __init__(self, session_name=None, session_id=0, master='local[*]', timezone=None, jars=None, packages=None, pyfiles=None, files=None, repositories=None, services=None, conf=None):
    """
    Set up the 'spark' engine: collect the submit parameters, build the
    spark configuration and start the spark context.

    :param session_name: passed to the base class and set as the spark
        application name
    :param session_id: passed to the base class
    :param master: spark master, set on the spark configuration
    :param timezone: passed to ``set_conf_timezone`` with the configuration
    :param jars: a string or a list; a single string is wrapped in a list,
        a falsy value becomes an empty list
    :param packages: same convention as ``jars``
    :param pyfiles: same convention as ``jars``; stored under 'py-files'
    :param files: same convention as ``jars``
    :param repositories: same convention as ``jars``
    :param services: passed to ``detect_submit_params``
    :param conf: extra configuration; a falsy value becomes an empty dict
    """
    # call base class:
    # stop the previous instance,
    # register self as the new instance
    super().__init__('spark', session_name, session_id)

    # bundle all submit parameters in a dictionary
    self.submit = {
        'jars': [jars] if isinstance(jars, str) else jars or [],
        'packages': [packages] if isinstance(packages, str) else packages or [],
        'py-files': [pyfiles] if isinstance(pyfiles, str) else pyfiles or [],
        'files': [files] if isinstance(files, str) else files or [],
        'repositories': [repositories] if isinstance(repositories, str) else repositories or [],
        'conf': conf or {}
    }

    # suppress INFO logging for java_gateway
    python_logging.getLogger('py4j.java_gateway').setLevel(
        python_logging.ERROR)

    # collect info
    self.set_info()

    # detect packages and configuration from services
    detected = self.detect_submit_params(services)

    # merge up with those passed with the init:
    # list-valued entries are concatenated, de-duplicated and sorted
    for k in self.submit.keys() - {'conf'}:
        self.submit[k] = list(sorted(set(self.submit[k] + detected[k])))
    self.submit['conf'] = merge(detected['conf'], self.submit['conf'])

    # set submit args via env variable
    self.set_submit_args()

    # set other spark-related environment variables
    self.set_env_variables()

    # set spark conf object
    # (from here on `conf` is the SparkConf, no longer the argument)
    print(f"Connecting to spark master: {master}")
    conf = pyspark.SparkConf()
    self.set_conf_timezone(conf, timezone)

    # set session name
    conf.setAppName(session_name)

    # set master
    conf.setMaster(master)

    # config passed through the api call go via the config
    # NOTE(review): each entry is unpacked as (key, value, ...), with '' as
    # the value of a one-element entry. If submit['conf'] is a dict, this
    # iterates over its keys and list(c) splits a string key into
    # characters - confirm the shape that merge() returns here.
    for c in self.submit['conf']:
        k, v, *_ = list(c) + ['']
        # only scalar values are set on the spark configuration
        if isinstance(v, (bool, int, float, str)):
            conf.set(k, v)

    # stop the current session if running
    self._stop()

    # start spark
    spark_session = self.start_context(conf)

    # record the data in the engine object for debug and future references
    self.conf = YamlDict(dict(conf.getAll()))

    # NOTE(review): the nesting of the statements below was reconstructed
    # from a flattened source; the version and the success message are
    # assumed to depend on a started session - confirm.
    if spark_session:
        # prefer the configuration reported by the running context
        self.conf = dict(
            dict(spark_session.sparkContext.getConf().getAll()))

        # set version if spark is loaded
        self._version = spark_session.version
        print(
            f'Engine context {self.engine_type}:{self.version} successfully started'
        )

    # store the spark session
    self.context = spark_session

    # session is running
    self.stopped = False
def test_merge():
    """
    merge() keeps keys found on one side only, takes the right-hand value
    for a conflicting scalar and combines nested dictionaries.
    """
    left = {'a': 1, 'b': 4, 'c': {'merge1': 2}}
    right = {'d': 'add', 'b': 'override', 'c': {'merge2': 4}}

    merged = merge(left, right)

    expected = {
        'a': 1,
        'd': 'add',
        'b': 'override',
        'c': {'merge2': 4, 'merge1': 2},
    }
    assert merged == expected
def test_resource_provider_2path_absolute(self, tempdir):
    """
    Build resource metadata from a provider with an absolute path and a
    resource with a relative path, and compare it with the empty template
    merged with the expected fields.
    """
    provider_md = {
        'alias': 'p',
        'service': 'local',
        'format': 'csv',
        'path': '/absolute/path'
    }
    resource_md = {'alias': 'r', 'path': 'abc/def'}

    # noinspection PyProtectedMember
    actual = resource._build_resource_metadata(
        tempdir.path, pmd=provider_md, rmd=resource_md)

    # expected result: the resource path is appended to the absolute provider path
    baseline = self.empty(tempdir).copy()
    expected = merge(
        baseline,
        {
            'provider_path': '/absolute/path',
            'provider_alias': 'p',
            'resource_alias': 'r',
            'resource_path': 'abc/def',
            'service': 'local',
            'format': 'csv',
            'url': f'/absolute/path/abc/def'
        })

    assert actual == expected

# resource('SELECT 0 as result where 1 = 0', 'pagila')
# resource('foo.csv', '/bar')
# resource('foo.csv', 'bar')
# resource('foo.csv', 'hdfs')
# resource('/foo.abc', 'hdfs')
# resource('/foo.abc', 'test')
# resource('hello/foo.abc', 'test')
# resource('foo.abc', 'hdfs://*****:*****@1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL')
# resource('staff', 'jdbc:mysql://1.2.3.4/sakila', useSSL='false', serverTimezone='UTC', zeroDateTimeBehavior='CONVERT_TO_NULL')
# resource('staff', service='mysql', database='sakila', serverTimezone='UTC')
# resource('sakila/staff', service='mysql', serverTimezone='UTC', user='******', password='******')
# resource('foo/bar.tsv', service='s3a')
# resource('/foo/bar.tsv', service='s3a')
# resource('/apples/orange', service='minio')
# resource('SELECT count(*) as cnt from employees;', 'jdbc:mysql://1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', user='******', password='******')
# resource('ascombe')
# resource('ascombe', 'saywhat')
# resource('ascombe', 'hdfs://*****:*****@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', 'name/pass@'], user='******', password='******', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='')
# assert(parsed == urnparse(urn))
# urn = 'jdbc:oracle:thin:name@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc',
'oracle', 'thin', 'name@'], user='******', password='', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:oracle:thin:@//123.123.123/schema/database' # parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 'hdfs://123.123.123/schema/database' # parsed = Urn(scheme=['hdfs'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = '/schema/database' # parsed = Urn(scheme=[], user='', password='', host='', port='', path='/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 's3a://schema/database' # parsed = Urn(scheme=['s3a'], user='', password='', host='', port='', path='schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = '1.2.34/schema/database' # parsed = Urn(scheme=[], user='', password='', host='', port='', path='1.2.34/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 'file://1.2.34/schema/database' # parsed = Urn(scheme=['file'], user='', password='', host='1.2.34', port='', path='/schema/database', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:sqlite://localdir/a/b/c' # parsed = Urn(scheme=['jdbc', 'sqlite'], user='', password='', host='', port='', path='localdir/a/b/c', params=[], query='', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:oracle:thin:@ldap://xyz.acme.com:7777/sales,cn=salesdept,cn=OracleContext,dc=com' # parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@ldap'], user='', password='', host='xyz.acme.com', port='7777', path='/sales', params=[('cn', 'salesdept'), ('cn', 'OracleContext'), ('dc', 'com')], 
query='cn=salesdept&cn=OracleContext&dc=com', fragment='') # assert(parsed == urnparse(urn)) # urn = 'http://xyz.acme.com:7777/foo/bar?a=1&edf=abc#anchor1' # parsed = Urn(scheme=['http'], user='', password='', host='xyz.acme.com', port='7777', path='/foo/bar', params=[('a', '1'), ('edf', 'abc')], query='a=1&edf=abc', fragment='anchor1') # assert(parsed == urnparse(urn)) # urn = 'jdbc:sqlserver://localhost:1433;databaseName=AdventureWorks;integratedSecurity=true;' # parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='1433', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:sqlserver://localhost;databaseName=AdventureWorks;integratedSecurity=true;' # parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:postgresql://localhost/test?user=fred&password=secret&ssl=false' # parsed = Urn(scheme=['jdbc', 'postgresql'], user='', password='', host='localhost', port='', path='/test', params=[('user', 'fred'), ('password', 'secret'), ('ssl', 'false')], query='user=fred&password=secret&ssl=false', fragment='') # assert(parsed == urnparse(urn)) # urn = 'jdbc:mysql://localhost:3306/youdatabase?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL' # parsed = Urn(scheme=['jdbc', 'mysql'], user='', password='', host='localhost', port='3306', path='/youdatabase', params=[('useSSL', 'false'), ('serverTimezone', 'UTC'), ('zeroDateTimeBehavior', 'CONVERT_TO_NULL')], query='useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', fragment='') # assert(parsed == urnparse(urn))