Beispiel #1
0
def _build_resource_metadata(rootdir, pmd=None, rmd=None, user_md=None):
    """
    Build the metadata dictionary describing a resource.

    Provider and resource metadata are merged; the service is taken from an
    explicit 'service' entry or from a '<service>://' prefix on the provider
    or resource path (falling back to 'file'); then format, driver, host,
    port, url, options, mapping and hash entries are filled in.

    :param rootdir: directory used to resolve a relative provider path for
        the 'file' and 'sqlite' services
    :param pmd: provider metadata dict (optional)
    :param rmd: resource metadata dict (optional)
    :param user_md: metadata merged last, on top of everything computed here
    :return: dict of resource metadata
    """
    # None defaults: a mutable default dict would be shared between calls
    pmd = {} if pmd is None else pmd
    rmd = {} if rmd is None else rmd
    user_md = {} if user_md is None else user_md

    d = merge(pmd, rmd)

    d['provider_path'] = pmd.get('path', '')
    d['resource_path'] = rmd.get('path', '')
    d['provider_alias'] = pmd.get('alias')
    d['resource_alias'] = rmd.get('alias')
    # the merged 'alias'/'path' entries are replaced by the
    # provider_*/resource_* entries set above
    d.pop('alias', None)
    d.pop('path', None)

    d['rootdir'] = rootdir

    # without an explicit service, take it from a '<service>://' prefix:
    # the provider path is inspected first, then the resource path
    for path_key in ('provider_path', 'resource_path'):
        if not d.get('service'):
            parts = d[path_key].split('://')
            if len(parts) > 1:
                d['service'] = parts[0]
                d[path_key] = parts[1]

    if not d.get('service'):
        d['service'] = 'file'

    # if service is file or sqlite and both paths are relative,
    # the provider path is resolved against rootdir
    if d['service'] in ['file', 'sqlite'] and \
            not os.path.isabs(d['provider_path']) and \
            not os.path.isabs(d['resource_path']):
        d['provider_path'] = os.path.realpath(
            os.path.join(d['rootdir'], d['provider_path']))

    d['format'] = _format(d)
    d['driver'] = _driver(d)

    # default hostname is localhost
    d['host'] = d.get('hostname', d.get('host', '127.0.0.1'))

    # provider path can be used as database name.
    # 'path' has already been popped from d, so the value must be read from
    # 'provider_path'; an empty provider path still yields None
    if not d.get('database') and d.get('format') in ['jdbc', 'nosql']:
        d['database'] = d['provider_path'] or None

    d['port'] = d.get('port', _port(d['service']))
    d['url'] = _url(d)

    d['options'] = d['options'] if d.get('options') else {}
    d['mapping'] = d['mapping'] if d.get('mapping') else {}

    # override with function provided metadata
    d = merge(d, user_md)

    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so this value is presumably only meant to be
    # compared within a single run - confirm before persisting it
    d['hash'] = hash(d['url']) ^ hash(d['format']) ^ hash(d['resource_path'])
    d['hash'] = hex(ctypes.c_size_t(d['hash']).value)

    return d
Beispiel #2
0
def read(file_paths=None):
    """
    Read profiles from a list of yaml files into a nested dictionary.

    Every yaml document is a profile, named by its 'profile' entry
    ('default' when missing). Documents sharing a profile name are merged in
    reading order, so the order of the list determines what overrides what.
    Paths which are not existing files are ignored; a file which fails to
    parse is logged and skipped.

    :param file_paths: list of yaml files
    :return: dict of profiles
    """
    profiles = {}

    for filename in file_paths or []:
        if not os.path.isfile(filename):
            continue

        with open(filename, 'r') as f:
            try:
                # NOTE(review): yaml.load_all without an explicit safe loader
                # can construct arbitrary python objects; consider
                # yaml.safe_load_all if these files may be untrusted
                docs = list(yaml.load_all(f))
            except yaml.YAMLError as e:
                # always reset docs, also when the error carries no position
                docs = []
                mark = getattr(e, 'problem_mark', None)
                if mark is not None:
                    logging.error(
                        "Error loading yml file {} at position: ({}:{}): skipping file"
                        .format(filename, mark.line + 1, mark.column + 1))
                else:
                    logging.error(
                        "Error loading yml file {}: skipping file"
                        .format(filename))

        for doc in docs:
            # an empty yaml document is loaded as None: nothing to merge
            if doc is None:
                continue
            doc['profile'] = doc.get('profile', 'default')
            profiles[doc['profile']] = merge(
                profiles.get(doc['profile'], {}), doc)

    return profiles
Beispiel #3
0
def read(file_paths=None):
    """
    Read profiles from a list of yaml metadata files into a nested dictionary.

    Every yaml document is a profile, named by its 'profile' entry
    ('default' when missing). Documents sharing a profile name are merged in
    reading order, so the order of the metadata files determines how profile
    properties are overridden. Paths which are not existing files are
    ignored; a file which fails to parse is logged and skipped.

    The module-level globals `profiles` and `loaded_md_files` are reset and
    refilled on every call; `loaded_md_files` lists the files which were
    parsed successfully.

    :param file_paths: list of yaml files paths
    :return: dict of profiles
    """
    global loaded_md_files, profiles

    # empty profiles and loaded files, before start reading
    profiles = {}
    loaded_md_files = []

    for filename in file_paths or []:
        if not os.path.isfile(filename):
            continue

        with open(filename, 'r') as f:
            try:
                # NOTE(review): yaml.load_all without an explicit safe loader
                # can construct arbitrary python objects; consider
                # yaml.safe_load_all if these files may be untrusted
                docs = list(yaml.load_all(f))
            except yaml.YAMLError as e:
                # always reset docs, also when the error carries no position
                docs = []
                mark = getattr(e, 'problem_mark', None)
                if mark is not None:
                    logging.error(
                        "Error loading yml file {} at position: ({}:{}): skipping file"
                        .format(filename, mark.line + 1, mark.column + 1))
                else:
                    logging.error(
                        "Error loading yml file {}: skipping file"
                        .format(filename))
            else:
                loaded_md_files.append(filename)

        for doc in docs:
            # an empty yaml document is loaded as None: nothing to merge
            if doc is None:
                continue
            doc['profile'] = doc.get('profile', 'default')
            profiles[doc['profile']] = merge(
                profiles.get(doc['profile'], {}), doc)

    return profiles
Beispiel #4
0
def inherit(profiles):
    """
    Modify profiles in place so that they inherit from the default profile.

    For every top-level key of the 'default' profile, the value of each
    other profile is replaced by the merge of the default value with the
    profile's own value (None when the profile does not define the key).

    :param profiles: input dict of profiles
    :return: the same dict of profiles, modified
    """
    defaults = profiles.get('default', {})

    # every profile but 'default' itself; subtracting the bare string
    # 'default' from the key view would remove the single-character keys
    # 'd', 'e', 'f', ... instead of the 'default' profile
    heirs = [name for name in profiles if name != 'default']

    # inherit from default for all other profiles
    for k in defaults:
        for p in heirs:
            profiles[p][k] = merge(defaults[k], profiles[p].get(k))

    return profiles
Beispiel #5
0
def inherit(profiles):
    """
    Let every profile inherit the configuration of the default profile.

    Each top-level entry of the 'default' profile is merged with the
    matching entry of every other profile (None when the profile has no
    such entry) and the result is stored back in that profile.
    :param profiles: dict of profiles
    :return: dict of profiles
    """
    base = profiles.get('default', {})

    for entry in base.keys():
        # all profile names, 'default' excluded
        heirs = set(profiles.keys()).difference({'default'})
        for name in heirs:
            own_value = profiles[name].get(entry)
            profiles[name][entry] = merge(profiles['default'][entry], own_value)

    return profiles
Beispiel #6
0
    def test_minimal(self, tempdir):
        """Provider-only metadata: the provider path shows up as provider_path and url."""
        provider = dict(service='local', format='csv', path=tempdir.path)
        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(tempdir.path, pmd=provider)

        # expected result: the empty metadata template plus the provider values
        baseline = self.empty(tempdir).copy()
        expected = merge(
            baseline,
            {
                'provider_path': tempdir.path,
                'service': 'local',
                'format': 'csv',
                'url': f'{tempdir.path}',
            })

        assert actual == expected
Beispiel #7
0
def to_resource(url_alias=None, *args, **kwargs):
    """
    Resolve a dict, an alias or a url/path into a resource metadata dict.

    Resolution order: a dict is converted with resource_from_dict; when a
    metadata profile is loaded, url_alias is looked up in the profile
    'resources' and then in its 'providers'; otherwise a non-empty url_alias
    is parsed as a urn/path. If nothing is found, the default metadata is
    used. Keyword arguments are applied last through metadata_overrides.

    :param url_alias: dict, alias name, url or path
    :param args: not used
    :param kwargs: overrides forwarded to metadata_overrides
    :return: metadata dict, with 'hostname' and 'username' entries removed
    """
    # a dict is taken as the metadata itself
    md = resource_from_dict(url_alias) if isinstance(url_alias, dict) else None

    # with a loaded profile, aliases are searched in resources, then providers
    if metadata.profile():
        for section in ('resources', 'providers'):
            if not md and url_alias in metadata.profile().get(section,
                                                              {}).keys():
                md = metadata.profile()[section][url_alias]

    # still nothing: read the argument as a urn/path
    if not md and url_alias:
        md = resource_from_urn(urnparse(url_alias))

    # fall back on the empty default
    if not md:
        md = get_default_md()

    # a path may itself be a url or a query: parse it, let the metadata found
    # so far override the parsed values, but keep the parsed path
    if md['path']:
        parsed = resource_from_urn(urnparse(md['path']))
        md = merge(parsed, md)
        md['path'] = parsed['path']

    # keyword arguments have the last word
    md = metadata_overrides(md, **kwargs)

    # drop the alternative spellings of host and user
    for alt_key in ('hostname', 'username'):
        if alt_key in md:
            del md[alt_key]

    return md
Beispiel #8
0
 def process(self, msg, kwargs):
     """
     Process the logging message and keyword arguments passed in to
     a logging call to insert contextual information.

     The adapter's extra dict is copied and extended with 'dlf_funcname'
     and 'dlf_data'. A dict message is merged with the 'extra' keyword
     argument into 'dlf_data' and replaced by the text 'data'; for a str
     message 'dlf_data' is the 'extra' keyword argument alone.

     :param msg: log message, a str or a dict
     :param kwargs: keyword arguments of the logging call
     :return: tuple (msg, kwargs), with kwargs['extra'] replaced
     :raises ValueError: if msg is neither a str nor a dict
     """
     # work on a copy: updating self.extra in place would leave the data of
     # this call in the adapter state shared by all later (and concurrent)
     # logging calls
     d = dict(self.extra or {})

     # NOTE(review): 5 is presumably the stack depth from here up to the
     # caller of the logging call - confirm against func_name
     d['dlf_funcname'] = func_name(5)

     call_extra = kwargs.get('extra', {})

     if isinstance(msg, dict):
         d['dlf_data'] = merge(msg, call_extra)
         msg = 'data'
     elif isinstance(msg, str):
         d['dlf_data'] = call_extra
     else:
         raise ValueError('log message must be a str or a dict')

     kwargs["extra"] = d
     return msg, kwargs
Beispiel #9
0
def metadata_overrides(md,
                       host=None,
                       service=None,
                       port=None,
                       user=None,
                       password=None,
                       driver=None,
                       database=None,
                       schema=None,
                       table=None,
                       format=None,
                       hostname=None,
                       username=None,
                       **options):
    """
    Return a new metadata dict where truthy arguments override md values.

    For each entry the explicit argument wins when truthy, otherwise the
    value stored in md is used; 'hostname' and 'username' are accepted as
    alternatives for 'host' and 'user'. Extra keyword arguments are merged
    into md['options']. When a database or a table is given, the path is
    reset to None.

    :param md: metadata dict; must hold the keys read with md[...] below
    :param options: extra options, merged over md['options']
    :return: new dict with the overridden metadata
    """
    overridden = {
        'path': md.get('url') or md.get('path'),
        'provider': md.get('provider'),
        'host': host or hostname or md['host'] or md.get('hostname'),
        'port': port or md['port'],
        'service': service or md['service'],
        'format': format or md['format'],
        'user': user or username or md['user'] or md.get('username'),
        'password': password or md['password'],
        'database': database or md['database'],
        'schema': schema or md['schema'],
        'table': table or md['table'],
        'driver': driver or md['driver'],
        'options': merge(md['options'], options),
    }

    # an explicit database or table takes the place of the path
    if database or table:
        overridden['path'] = None

    return overridden
Beispiel #10
0
    def test_resource_provider_2path_absolute(self, tempdir):
        """Absolute provider path plus relative resource path: the url joins the two."""
        provider = dict(
            alias='p', service='local', format='csv', path='/absolute/path')
        res = dict(alias='r', path='abc/def')
        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(
            tempdir.path, pmd=provider, rmd=res)

        # expected result: the empty metadata template plus the values below
        baseline = self.empty(tempdir).copy()
        expected = merge(
            baseline,
            {
                'provider_path': '/absolute/path',
                'provider_alias': 'p',
                'resource_alias': 'r',
                'resource_path': 'abc/def',
                'service': 'local',
                'format': 'csv',
                'url': '/absolute/path/abc/def',
            })

        assert actual == expected
Beispiel #11
0
def test_merge():
    """merge keeps left-only keys, adds right-only keys, lets the right value
    override a scalar and combines nested dicts."""
    left = {'a': 1, 'b': 4, 'c': {'merge1': 2}}
    right = {'d': 'add', 'b': 'override', 'c': {'merge2': 4}}

    merged = merge(left, right)

    expected = {
        'a': 1,
        'd': 'add',
        'b': 'override',
        'c': {'merge2': 4, 'merge1': 2},
    }
    assert merged == expected
Beispiel #12
0
def _build_resource_metadata(rootdir, pmd=None, rmd=None, user_md=None):
    """
    Build the metadata dictionary describing a resource.

    Provider and resource metadata are merged; alias and path default to
    each other; the service is taken from an explicit 'service' entry or
    from a '<service>://' prefix on the provider or resource path (falling
    back to 'file'); then format, driver, host, port, url, options, mapping
    and hash entries are filled in. For the 'jdbc' format, table, database
    and schema are derived as well.

    :param rootdir: directory used to resolve a relative provider path for
        the 'file' and 'sqlite' services
    :param pmd: provider metadata dict (optional)
    :param rmd: resource metadata dict (optional)
    :param user_md: metadata merged last, on top of everything computed here
    :return: dict of resource metadata
    """
    # None defaults: a mutable default dict would be shared between calls
    pmd = {} if pmd is None else pmd
    rmd = {} if rmd is None else rmd
    user_md = {} if user_md is None else user_md

    d = merge(pmd, rmd)

    # alias falls back on path, and path falls back on alias
    d['provider_alias'] = pmd.get('alias', pmd.get('path', ''))
    d['resource_alias'] = rmd.get('alias', rmd.get('path', ''))

    d['provider_path'] = pmd.get('path', pmd.get('alias', ''))
    d['resource_path'] = rmd.get('path', rmd.get('alias', ''))

    # the merged 'alias'/'path' entries are replaced by the
    # provider_*/resource_* entries set above
    d.pop('alias', None)
    d.pop('path', None)

    d['rootdir'] = rootdir

    # without an explicit service, take it from a '<service>://' prefix:
    # the provider path is inspected first, then the resource path
    for path_key in ('provider_path', 'resource_path'):
        if not d.get('service'):
            parts = d[path_key].split('://')
            if len(parts) > 1:
                d['service'] = parts[0]
                d[path_key] = parts[1]

    if not d.get('service'):
        d['service'] = 'file'

    # if service is file or sqlite and both paths are relative,
    # the provider path is resolved against rootdir
    if d['service'] in ['file', 'sqlite'] and \
            not os.path.isabs(d['provider_path']) and \
            not os.path.isabs(d['resource_path']):
        d['provider_path'] = os.path.realpath(
            os.path.join(d['rootdir'], d['provider_path']))

    d['format'] = _format(d)
    d['driver'] = _driver(d)

    # default hostname is localhost
    d['host'] = d.get('hostname', d.get('host', '127.0.0.1'))

    if d['format'] == 'jdbc':
        # the resource path is the table (or a query);
        # an empty one becomes a query which returns no rows
        d['table'] = d['resource_path'] or 'SELECT 0 as result where 1 = 0'

        # provider path can be used as database name, if database is undefined
        # if both database and path are provided, path is interpreted
        # as a database schema
        if d.get('database'):
            d['schema'] = d['provider_path'] or ''
        else:
            d['database'] = d['provider_path']
            d['schema'] = ''

        # if schema is not yet defined, take the default for each service
        if not d['schema']:
            service = d.get('service')
            if service == 'mysql':
                d['schema'] = d['database']
            elif service == 'mssql':
                d['schema'] = 'dbo'
            elif service == 'postgres':
                d['schema'] = 'public'
            elif service == 'oracle':
                d['schema'] = d.get('username', '')
            else:
                # use postgres default if service unknown
                d['schema'] = 'public'

        # if format is jdbc and an SQL query is detected,
        # wrap the resource path as a temp table.
        # str.split() without arguments splits on any whitespace run, so the
        # query is collapsed to single spaces before trailing ';' are dropped
        sql_query = ' '.join(d['table'].split()).rstrip(';')

        if ' from ' in sql_query.lower():
            d['table'] = '( {} ) as _query'.format(sql_query)

    d['port'] = d.get('port', _port(d['service']))
    d['url'] = _url(d)

    d['options'] = d['options'] if d.get('options') else {}
    d['mapping'] = d['mapping'] if d.get('mapping') else {}

    # override with function provided metadata
    d = merge(d, user_md)

    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so this value is presumably only meant to be
    # compared within a single run - confirm before persisting it
    d['hash'] = hash(d['url']) ^ hash(d['format']) ^ hash(d['resource_path'])
    d['hash'] = hex(ctypes.c_size_t(d['hash']).value)

    return d
Beispiel #13
0
def _override_metadata(access, param, pmd=None, rmd=None):
    """
    Merge one nested parameter of the provider and resource metadata.

    The value stored under md[access][param] is looked up in both dicts
    (an empty dict when missing) and the provider value is merged with the
    resource value.

    :param access: first-level key of the metadata dicts
    :param param: second-level key, looked up under `access`
    :param pmd: provider metadata dict (optional)
    :param rmd: resource metadata dict (optional)
    :return: result of merging the provider value with the resource value
    """

    def lookup(md):
        # a missing dict, or an access section left empty (None, as an empty
        # yaml key is loaded), is handled as an empty section
        section = (md or {}).get(access) or {}
        return section.get(param, {})

    return merge(lookup(pmd), lookup(rmd))
Beispiel #14
0
    def test_resource_provider_2path_absolute(self, tempdir):
        """Absolute provider path plus relative resource path: the url joins the two."""
        provider = dict(
            alias='p', service='local', format='csv', path='/absolute/path')
        res = dict(alias='r', path='abc/def')
        # noinspection PyProtectedMember
        actual = resource._build_resource_metadata(
            tempdir.path, pmd=provider, rmd=res)

        # expected result: the empty metadata template plus the values below
        baseline = self.empty(tempdir).copy()
        expected = merge(
            baseline,
            {
                'provider_path': '/absolute/path',
                'provider_alias': 'p',
                'resource_alias': 'r',
                'resource_path': 'abc/def',
                'service': 'local',
                'format': 'csv',
                'url': '/absolute/path/abc/def',
            })

        assert actual == expected


# resource('SELECT 0 as result where 1 = 0', 'pagila')
# resource('foo.csv', '/bar')
# resource('foo.csv', 'bar')
# resource('foo.csv', 'hdfs')
# resource('/foo.abc', 'hdfs')
# resource('/foo.abc', 'test')
# resource('hello/foo.abc', 'test')
# resource('foo.abc', 'hdfs://*****:*****@1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL')
# resource('staff', 'jdbc:mysql://1.2.3.4/sakila', useSSL='false', serverTimezone='UTC', zeroDateTimeBehavior='CONVERT_TO_NULL')
# resource('staff', service='mysql', database='sakila', serverTimezone='UTC')
# resource('sakila/staff', service='mysql', serverTimezone='UTC', user='******', password='******')
# resource('foo/bar.tsv', service='s3a')
# resource('/foo/bar.tsv', service='s3a')
# resource('/apples/orange', service='minio')
# resource('SELECT count(*) as cnt from employees;', 'jdbc:mysql://1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', user='******', password='******')
# resource('ascombe')
# resource('ascombe', 'saywhat')
# resource('ascombe', 'hdfs://*****:*****@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', 'name/pass@'], user='******', password='******', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:name@//123.123.123:345/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', 'name@'], user='******', password='', host='123.123.123', port='345', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:@//123.123.123/schema/database'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'hdfs://123.123.123/schema/database'
# parsed = Urn(scheme=['hdfs'], user='', password='', host='123.123.123', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = '/schema/database'
# parsed = Urn(scheme=[], user='', password='', host='', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 's3a://schema/database'
# parsed = Urn(scheme=['s3a'], user='', password='', host='', port='', path='schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = '1.2.34/schema/database'
# parsed = Urn(scheme=[], user='', password='', host='', port='', path='1.2.34/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'file://1.2.34/schema/database'
# parsed = Urn(scheme=['file'], user='', password='', host='1.2.34', port='', path='/schema/database', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlite://localdir/a/b/c'
# parsed = Urn(scheme=['jdbc', 'sqlite'], user='', password='', host='', port='', path='localdir/a/b/c', params=[], query='', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:oracle:thin:@ldap://xyz.acme.com:7777/sales,cn=salesdept,cn=OracleContext,dc=com'
# parsed = Urn(scheme=['jdbc', 'oracle', 'thin', '@ldap'], user='', password='', host='xyz.acme.com', port='7777', path='/sales', params=[('cn', 'salesdept'), ('cn', 'OracleContext'), ('dc', 'com')], query='cn=salesdept&cn=OracleContext&dc=com', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'http://xyz.acme.com:7777/foo/bar?a=1&edf=abc#anchor1'
# parsed = Urn(scheme=['http'], user='', password='', host='xyz.acme.com', port='7777', path='/foo/bar', params=[('a', '1'), ('edf', 'abc')], query='a=1&edf=abc', fragment='anchor1')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlserver://localhost:1433;databaseName=AdventureWorks;integratedSecurity=true;'
# parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='1433', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:sqlserver://localhost;databaseName=AdventureWorks;integratedSecurity=true;'
# parsed = Urn(scheme=['jdbc', 'sqlserver'], user='', password='', host='localhost', port='', path='', params=[('databaseName', 'AdventureWorks'), ('integratedSecurity', 'true')], query='databaseName=AdventureWorks&integratedSecurity=true', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:postgresql://localhost/test?user=fred&password=secret&ssl=false'
# parsed = Urn(scheme=['jdbc', 'postgresql'], user='', password='', host='localhost', port='', path='/test', params=[('user', 'fred'), ('password', 'secret'), ('ssl', 'false')], query='user=fred&password=secret&ssl=false', fragment='')

# assert(parsed == urnparse(urn))

# urn = 'jdbc:mysql://localhost:3306/youdatabase?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL'
# parsed = Urn(scheme=['jdbc', 'mysql'], user='', password='', host='localhost', port='3306', path='/youdatabase', params=[('useSSL', 'false'), ('serverTimezone', 'UTC'), ('zeroDateTimeBehavior', 'CONVERT_TO_NULL')], query='useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', fragment='')

# assert(parsed == urnparse(urn))
Beispiel #15
0
def Resource(path_or_alias_or_url=None,
             provider_path_or_alias_or_url=None,
             host=None,
             service=None,
             port=None,
             user=None,
             password=None,
             driver=None,
             database=None,
             schema=None,
             table=None,
             format=None,
             hostname=None,
             username=None,
             **options):
    """
    Build the metadata of a resource from a resource and a provider reference.

    The resource is resolved first (with all keyword overrides applied),
    then the provider, taken from the second argument or from the
    'provider' entry of the resource. Provider and resource metadata are
    merged, the path is set to None when a table is defined and to the
    joined provider/resource paths otherwise, and the result goes through
    process_metadata and assemble_metadata.

    :param path_or_alias_or_url: resource reference passed to to_resource
    :param provider_path_or_alias_or_url: provider reference (optional)
    :param options: extra options forwarded to to_resource
    :return: assembled metadata
    """
    overrides = {
        'host': host,
        'service': service,
        'port': port,
        'user': user,
        'password': password,
        'driver': driver,
        'database': database,
        'schema': schema,
        'table': table,
        'format': format,
        'hostname': hostname,
        'username': username,
    }

    # resource metadata, by alias or by url, with the keyword overrides
    rmd = to_resource(path_or_alias_or_url, **overrides, **options)

    # provider metadata: explicit reference first, else the one named by the resource
    pmd = to_resource(provider_path_or_alias_or_url or rmd.get('provider'))

    # let path_to_jdbc set database, table and path of the provider
    jdbc_database, jdbc_table, jdbc_path = path_to_jdbc(pmd, True)
    pmd['database'] = jdbc_database
    pmd['table'] = jdbc_table
    pmd['path'] = jdbc_path

    # resource metadata is merged over provider metadata
    md = merge(pmd, rmd)

    # a table replaces the path; without one the two paths are concatenated
    md['path'] = None if md['table'] else os.path.join(
        pmd['path'] or '', rmd['path'] or '')

    # todo: verify resource
    # check format and other minimum requirements are met

    # process, then assemble the output
    return assemble_metadata(process_metadata(md))