def cubetl_config(ctx): #ctx.include('${ ctx.library_path }/geo.py') ctx.add('cubetl.net.domain.tld', Dimension(name='tld', label='TLD')) # TODO: Reference GEO/country if available (add "N/A" to Geo/Country) ctx.add('cubetl.net.domain.domain', Dimension(name='domain', label='Domain')) ctx.add('cubetl.net.domain.subdomain', Dimension(name='subdomain', label='Subdomain')) ctx.add( 'cubetl.net.domain', HierarchyDimension( name='domain3', label='Domain', #info={"cv-formatter": "'.'.join()"}, hierarchies=[ Hierarchy(name='domain3', label='Domain', levels=['tld', 'domain', 'subdomain']) ], attributes=[ DimensionAttribute(ctx.get('cubetl.net.domain.tld')), DimensionAttribute(ctx.get('cubetl.net.domain.domain')), DimensionAttribute(ctx.get('cubetl.net.domain.subdomain')) ]))
def sql2olap(ctx, debug=False, prefix="sql2olap"): """ This method generates a CubETL OLAP schema from an SQL schema defined by CubETL SQL components (such a schema can automatically be generated from an existing SQL database using `sql2cubetl` function). The process can be controlled via a dictionary of options passed via the `options` argument. Options: * `<object_uri>.type=ignore` ignores the given SQL column. * `<object_uri>.type=attribute` forces the SQL column to be used as fact attribute. * `<object_uri>.type=dimension` forces the SQL column to be used as dimension. Details: This method works by walking objects of class SQLTable in the context, and generating an cubetl.olap.Fact for each. Tables referenced via foreign keys are included as dimensions. """ # TODO: New generation refactor # Create a new Dimension for each found field, unless configuration says they are the same dimension # (or can be deduced: ie: same column name + size + same user type (raw dates, 0/1 boolean...). # Then, instance an olap sql-to-olap (process tables and columns, generate olap and olap mappings) # Implement querying # Move these SQL/OLAP method to Cubetl components. # Normalize/formalize column/name/id/schema/database usage # (optionally, at the end, export to cubes) # (should theorically be able to create olap-2-star-schema mappings, then create tables and load) # (theorically, we should be able to generate the same mappings from the generated star-schema (would require identifying split dims/hierarchies) #exclude_columns = ['key', 'entity_id'] #force_dimensions = dimensions if dimensions else [] # Load datetime ctx.include(ctx.library_path + "/datetime.py") # Mappings for datetime datedimension = ctx.get("cubetl.datetime.date") facts = {} factattributes = [] olapmappers = [] logger.info("Generating CubETL Olap schema from SQL schema.") sqltables = ctx.find(type=cubetl.sql.sql.SQLTable) for sqltable in sqltables: olap_type_table = _match_config( ctx.props, 'sql2olap.table.%s.type' % sqltable.name, None) logger.info("Fact: %s" % sqltable.name) if olap_type_table == 'ignore': logger.info("SQL2OLAP ignoring SQL table: %s", sqltable) continue factmappings = [] factattributes = [] key_count = 0 for dbcol in sqltable.columns: olap_type = _match_config(ctx.props, 'sql2olap.%s.type' % dbcol.urn, None) if olap_type: logger.info("Column: %s (forced type: %s)" % (dbcol, olap_type)) else: logger.info("Column: %s" % (dbcol)) if olap_type == 'ignore': logger.info("SQL2OLAP ignoring SQL column: %s", dbcol) continue if dbcol.pk: key_urn = "%s.fact.%s.key.%s" % (prefix, sqltable.name, dbcol.name) key = ctx.add( key_urn, Key(name=dbcol.name, type=dbcol.type, label=dbcol.label)) factattributes.append(key) factmapping = OlapMapping(path=[key], sqlcolumn=dbcol) factmappings.append(factmapping) key_count += 1 if isinstance(dbcol, cubetl.sql.sql.SQLColumnFK): #if len(dbcol.foreign_keys) > 1: # raise Exception("Multiple foreign keys found for column: %s" % (dbcol.name)) related_fact_name = dbcol.fk_sqlcolumn.sqltable.name if related_fact_name == sqltable.name: # Reference to self # TODO: This does not account for circular dependencies across other entities logger.warning( "Ignoring foreign key reference to self: %s", dbcol.name) continue related_fact = facts.get(related_fact_name, None) if related_fact is None: logger.warning( "Ignoring foreign key reference from %s.%s to not available entity: %s", dbcol.sqltable.name, dbcol.name, related_fact_name) continue # Create dimension attribute dimension_attribute = olap.DimensionAttribute( related_fact, name=dbcol.name, label=dbcol.label) factattributes.append(dimension_attribute) # Create a mapping factdimensionmapping = OlapMapping( path=[dimension_attribute], sqlcolumn=dbcol) factmappings.append(factdimensionmapping) if not dbcol.pk and not isinstance( dbcol, cubetl.sql.sql.SQLColumnFK) and ( olap_type == 'dimension' or (olap_type is None and dbcol.type == "String") ): # or (dbcol.name in force_dimensions) # Embedded dimension (single column, string or integer, treated as a dimension) dimension_attribute = olap.Attribute(name=dbcol.name, type=dbcol.type, label=dbcol.label) dimension = olap.Dimension( name=dbcol.name, label=dbcol.label, attributes=[dimension_attribute]) factattributes.append( DimensionAttribute(dimension, dimension.name, dimension.label)) # This dimension is mapped in the parent table factmapping = OlapMapping( path=[dimension, dimension_attribute], sqlcolumn=dbcol) factmappings.append(factmapping) if not dbcol.pk and not isinstance( dbcol, cubetl.sql.sql.SQLColumnFK) and ( olap_type == 'attribute'): # Attribute (detail) attribute = Attribute(name=dbcol.name, type=dbcol.type, label=dbcol.label) factattributes.append(attribute) factmapping = OlapMapping(path=[attribute], sqlcolumn=dbcol) factmappings.append(factmapping) if not dbcol.pk and not isinstance( dbcol, cubetl.sql.sql.SQLColumnFK) and ( olap_type == 'measure' or (olap_type is None and dbcol.type in ("Float", "Integer"))): measure = Measure(name=dbcol.name, type=dbcol.type, label=dbcol.label) factattributes.append(measure) factmapping = OlapMapping(path=[measure], sqlcolumn=dbcol) factmappings.append(factmapping) elif dbcol.type in ("DateTime"): # Date dimension datedimension = ctx.get("cubetl.datetime.date") # Create dimension attribute dimension_attribute = olap.DimensionAttribute( datedimension, name=dbcol.name, label=dbcol.label) factattributes.append(dimension_attribute) # TODO: This shall be common #mapper = olap.sql.EmbeddedDimensionMapper(entity=datedimension, sqltable=None) #olapmapper.mappers.append(mapper) mapping = OlapMapping(path=[ dimension_attribute, dimension_attribute.dimension.attribute('year') ], sqlcolumn=dbcol, function=OlapMapping.FUNCTION_YEAR) factmappings.append(mapping) #mapping = OlapMapping(entity=datedimension, attribute=datedimension.attribute("quarter"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_QUARTER) #factmappings.append(mapping) mapping = OlapMapping(path=[ dimension_attribute, dimension_attribute.dimension.attribute('month') ], sqlcolumn=dbcol, function=OlapMapping.FUNCTION_MONTH) factmappings.append(mapping) mapping = OlapMapping(path=[ dimension_attribute, dimension_attribute.dimension.attribute('day') ], sqlcolumn=dbcol, function=OlapMapping.FUNCTION_DAY) factmappings.append(mapping) mapping = OlapMapping(path=[ dimension_attribute, dimension_attribute.dimension.attribute('week') ], sqlcolumn=dbcol, function=OlapMapping.FUNCTION_WEEK) factmappings.append(mapping) # Create an alias for this dimension seen from this datetime field point of view # This approach creates a dimension for each different foreign key column name used ''' aliasdimension_urn = "%s.dim.datetime.%s.alias.%s" % (prefix, datedimension.name, dbcol.name) aliasdimension = ctx.get(aliasdimension_urn, False) if not aliasdimension: aliasdimension = ctx.add(aliasdimension_urn, olap.AliasDimension(dimension=datedimension, name=dbcol.name, label=dbcol.label)) fact.dimensions.append(olap.DimensionAttribute(aliasdimension)) # Create a mapping aliasdimensionmapping = OlapMapping(entity=aliasdimension, sqlcolumn=dbcol) factmappings.append(aliasdimensionmapping) mapper = olap.sql.AliasDimensionMapper(entity=aliasdimension) mapper.mappings = [ # These mappings don't have a sqlcolumn because they are meant to be embedded OlapMapping(entity=ctx.get("cubetl.datetime.year"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_YEAR), OlapMapping(entity=ctx.get("cubetl.datetime.quarter"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_QUARTER), OlapMapping(entity=ctx.get("cubetl.datetime.month"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_MONTH), OlapMapping(entity=ctx.get("cubetl.datetime.day"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_DAY), OlapMapping(entity=ctx.get("cubetl.datetime.week"), sqlcolumn=dbcol, function=OlapMapping.FUNCTION_WEEK) ] olapmapper.mappers.append(mapper) ''' ''' if len(factmappings) == 0: factmappings = [ { 'name': 'index', 'pk': True, 'type': 'Integer' } ] ''' # Ignore table if more than one primary key was found if key_count > 1: logger.warning( "Multiple primary key found in table %s (not supported, ignoring table)", sqltable.name) continue # Ignore table if it contains no primary key if key_count == 0: logger.warning( "No primary key found in table %s (not supported, ignoring table)", sqltable.name) continue # Define fact fact_urn = "%s.fact.%s" % (prefix, sqltable.name) fact = ctx.add( fact_urn, olap.Fact(name=sqltable.name, label=sqltable.label, attributes=factattributes)) facts[fact.name] = fact # Create an olapmapper for this fact olapmapper = olap.OlapMapper( ) # TODO: review whether this is necessary or we could use a single mapper mapper = olap.sql.TableMapper(entity=fact, sqltable=sqltable, mappings=factmappings) olapmapper.mappers.append(mapper) olapmappers.append(olapmapper) #ctx.register(mapper) #, uri='%s:fact' % ctx.uri(sqltable) # IDs should be defined in mappings, not entity Keys # mappings: # - name: id # pk: True # type: Integer # value: ${ int(m["id"]) } #printconfig = PrintConfig() #printflow = Chain(fork=True, steps=[printconfig]) #result = ctx.process(printflow) ''' process = sql.StoreRow(sqltable) result = ctx.process(process) connection = ctx.find(sql.Connection)[0] process = sql.Query(connection, lambda: "SELECT * FROM fin_account_accountmovement", embed=True) result = ctx.process(process) print(result) ''' ''' process = olap.OlapQueryAggregate() result = ctx.process(process, {'fact': 'fin_account_accountmovement', 'cuts': None, 'drill': None}) print result ''' olapmapper = olap.OlapMapper() olapmapper.include = [i for i in olapmappers] olapmapper_urn = "%s.olapmapper" % (prefix) ctx.add(olapmapper_urn, olapmapper) return ctx
def cubetl_config(ctx): ctx.add( 'cubetl.datetime.year', Dimension( name='year', label='Year', role='year', attributes=[Attribute(name='year', type='Integer', label='Year')])) ctx.add( 'cubetl.datetime.half', Dimension( name='half', label='Half', role='half', attributes=[Attribute(name='half', type='Integer', label='Half')])) ctx.add( 'cubetl.datetime.quarter', Dimension(name='quarter', label='Quarter', role='quarter', attributes=[ Attribute(name='quarter', type='Integer', label='Quarter') ])) ctx.add( 'cubetl.datetime.month', Dimension(name='month', label='Month', role='month', attributes=[ Attribute(name='month', type='Integer', label='Month') ])) ctx.add( 'cubetl.datetime.week', Dimension( name='week', label='Week', role='week', attributes=[Attribute(name='week', type='Integer', label='Week')])) ctx.add( 'cubetl.datetime.day', Dimension( name='day', label='Day', role='day', attributes=[Attribute(name='day', type='Integer', label='Day')])) ctx.add( 'cubetl.datetime.date', HierarchyDimension( name='date', label='Date', role='date', hierarchies=[ Hierarchy(name='daily', label='Daily', levels=['year', 'month', 'day']), Hierarchy(name='weekly', label='Weekly', levels=['year', 'week']) ], attributes=[ DimensionAttribute(dimension=ctx.get('cubetl.datetime.year')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.month')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.day')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.week')) ])) ctx.add( 'cubetl.datetime.datemonthly', HierarchyDimension( name='datemonthly', label='Month', role='date', hierarchies=[ Hierarchy(name='monthly', label='Monthly', levels=['year', 'quarter', 'month']) ], attributes=[ DimensionAttribute(dimension=ctx.get('cubetl.datetime.year')), DimensionAttribute( dimension=ctx.get('cubetl.datetime.quarter')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.month')) ])) ctx.add( 'cubetl.datetime.dateyqmd', HierarchyDimension( name='dateyqmd', label='Date', role='date', hierarchies=[ Hierarchy(name='daily', label='Daily', levels=['year', 'quarter', 'month', 'day']), Hierarchy(name='weekly', label='Weekly', levels=['year', 'week']) ], attributes=[ DimensionAttribute(dimension=ctx.get('cubetl.datetime.year')), DimensionAttribute( dimension=ctx.get('cubetl.datetime.quarter')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.month')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.day')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.week')) ])) ctx.add( 'cubetl.datetime.dateyh', HierarchyDimension( name='dateyh', label='Date', role='date', hierarchies=[ Hierarchy(name='yearhalf', label='Year + Half', levels=['year', 'half']) ], attributes=[ DimensionAttribute(dimension=ctx.get('cubetl.datetime.year')), DimensionAttribute(dimension=ctx.get('cubetl.datetime.half')) ]))
def cubetl_config(ctx): ctx.include('${ ctx.library_path }/datetime.py') ctx.include('${ ctx.library_path }/geo.py') ctx.include('${ ctx.library_path }/net.py') ctx.add('cubetl.http.request.client_address', Dimension(name='client_address', label='Address')) ctx.add('cubetl.http.request.client_domain', Dimension(name='client_domain', label='Domain')) ctx.add('cubetl.http.request.username', Dimension(name='username', label='Username')) ctx.add('cubetl.http.request.method', Dimension(name='http_method', label='HTTP Method')) ctx.add('cubetl.http.request.path', HierarchyDimension( name='request_path', label='Path', hierarchies=[Hierarchy(name='path14', label='Path', levels=["path1", "path2", "path3", "path4"])], attributes=[DimensionAttribute(dimension=Dimension(name='path1', label='Path 1')), DimensionAttribute(dimension=Dimension(name='path2', label='Path 2')), DimensionAttribute(dimension=Dimension(name='path3', label='Path 3')), DimensionAttribute(dimension=Dimension(name='path4', label='Path 4'))])) ctx.add('cubetl.http.protocol', Dimension(name='protocol', label='Protocol')) ctx.add('cubetl.http.response.status.code', Dimension(name='status_code', label='Status', attributes=[ Attribute(name='status_code', type='Integer', label='Status Code'), Attribute(name='status_description', type='String', label='Status Description')])) ctx.add('cubetl.http.response.status.type', Dimension(name='status_type', label='Status Type', attributes=[ Attribute(name='status_type_label', type='String', label='Status Type'), Attribute(name='status_type_code', type='String', label='Status Type Code')])) ctx.add('cubetl.http.response.status', HierarchyDimension( name='response_status', label='Status', hierarchies=[Hierarchy(name='http_status', label='Status', levels=['status_type', 'status_code'])], attributes=[DimensionAttribute(ctx.get('cubetl.http.response.status.type')), DimensionAttribute(ctx.get('cubetl.http.response.status.code'))])) ctx.add('cubetl.http.request.referer_path', Dimension(name='referer_path', label='Referer Path')) ''' !!python/object:cubetl.olap.HierarchyDimension id: cubetl.http.referer name: referer label: Referer hierarchies: - name: referer label: Referer levels: referer_domain, referer_path levels: - !ref cubetl.http.referer.domain - !ref cubetl.http.referer.path ''' ctx.add('cubetl.http.user_agent', HierarchyDimension( name='user_agent', label='User Agent', attributes=[DimensionAttribute(Dimension(name='user_agent_family', label='User Agent')), DimensionAttribute(Dimension(name='user_agent_version', label='Version')) ])) ctx.add('cubetl.os.operating_system', HierarchyDimension( name='operating_system', label='Operating System', attributes=[DimensionAttribute(Dimension(name='operating_system_family', label='OS')), DimensionAttribute(Dimension(name='operating_system_version', label='Version')) ])) ctx.add('cubetl.http.mimetype', HierarchyDimension( name='mimetype', label='MIME Type', attributes=[DimensionAttribute(Dimension(name='mimetype_type', label='Type')), DimensionAttribute(Dimension(name='mimetype_subtype', label='Subtype')) ])) ctx.add('cubetl.os.device', Dimension(name='device', label='Device')) ctx.add('cubetl.http.request.is_bot', Dimension(name='is_bot', label='Is Bot', attributes=[ Attribute(name='is_bot', type='Boolean', label='Is Bot')])) ctx.add('cubetl.http.request.is_mobile', Dimension(name='is_mobile', label='Is Mobile', attributes=[ Attribute(name='is_mobile', type='Boolean', label='Is Mobile')])) ctx.add('cubetl.http.request.is_pc', Dimension(name='is_pc', label='Is PC', attributes=[ Attribute(name='is_pc', type='Boolean', label='Is PC')])) ctx.add('cubetl.http.request.is_tablet', Dimension(name='is_tablet', label='Is Tablet', attributes=[ Attribute(name='is_tablet', type='Boolean', label='Is Tablet')])) ctx.add('cubetl.http.request.file_extension', Dimension(name='file_extension', label='File Extension')) ctx.add('cubetl.http.request.referer_origin', Dimension(name='referer_origin', label='Referer Origin')) ctx.add('cubetl.http.response.is_download', Dimension(name='is_download', label='Is Download', attributes=[ Attribute(name='is_download', type='Boolean', label='Is Download')])) ctx.add('cubetl.http.request', Fact( name='http_request', label='HTTP Request', #natural_key= #notes='', attributes=[ DimensionAttribute(ctx.get('cubetl.datetime.date'), name='request_date', label="Request Date"), #ctx.get('cubetl.datetime.date'), DimensionAttribute(ctx.get('cubetl.http.protocol')), DimensionAttribute(ctx.get('cubetl.http.request.client_address')), DimensionAttribute(ctx.get('cubetl.http.request.username')), DimensionAttribute(ctx.get('cubetl.http.request.method')), DimensionAttribute(ctx.get('cubetl.http.request.path')), DimensionAttribute(ctx.get('cubetl.http.request.file_extension')), DimensionAttribute(ctx.get('cubetl.http.request.referer_origin')), DimensionAttribute(ctx.get('cubetl.http.request.is_bot')), DimensionAttribute(ctx.get('cubetl.http.request.is_pc')), DimensionAttribute(ctx.get('cubetl.http.request.is_tablet')), DimensionAttribute(ctx.get('cubetl.http.request.is_mobile')), DimensionAttribute(ctx.get('cubetl.geo.contcountry')), DimensionAttribute(ctx.get('cubetl.http.response.status')), DimensionAttribute(ctx.get('cubetl.http.mimetype')), DimensionAttribute(ctx.get('cubetl.http.response.is_download')), DimensionAttribute(ctx.get('cubetl.net.domain'), name="referer_domain", label="Referer Domain"), DimensionAttribute(ctx.get('cubetl.http.request.referer_path')), #ctx.get('cubetl.http.referer'), # TODO: try nested hierarchydimensions in "referer" hierarchydimension DimensionAttribute(ctx.get('cubetl.http.user_agent')), DimensionAttribute(ctx.get('cubetl.os.operating_system')), DimensionAttribute(ctx.get('cubetl.os.device')), Attribute(name='user_agent_string', type='String', label='User Agent String'), Attribute(name='verb', type='String', label='Verb'), #Attribute(name='referer', type='String', label='Referer') Measure(name='served_bytes', type='Integer', label="Served Bytes"), ])) ''' - !ref #- !ref cubetl.http.request.client_domain #- !ref cubetl.http.request.rlogname #- !ref cubetl.http.request.username ''' ctx.add('cubetl.http.parse.apache_combined', RegExp(regexp=r'([\d\.]+) (-) (-) \[(.*?)\] "(.*?)" (\d+) (\S+) "(.*?)" "(.*?)"', errors=RegExp.ERRORS_WARN, names='address, rlogname, username, date_string, verb, status_code, served_bytes, referer, user_agent_string')) ctx.add('cubetl.http.status.table', table.CSVMemoryTable( data=''' status_code,status_description,status_type 100,Continue,Informational 101,Switching Protocols,Informational 200,OK,Successful 201,Created,Successful 202,Accepted,Successful 203,Non-Authoritative Information,Successful 204,No Content,Successful 205,Reset Content,Successful 206,Partial Content,Successful 300,Multiple Choices,Redirection 301,Moved Permanently,Redirection 302,Found,Redirection 303,See Other,Redirection 304,Not Modified,Redirection 305,Use Proxy,Redirection 307,Temporary Redirect,Redirection 400,Bad Request,Client Error 401,Unauthorized,Client Error 402,Payment Required,Client Error 403,Forbidden,Client Error 404,Not Found,Client Error 405,Method Not Allowed,Client Error 406,Not Acceptable,Client Error 407,Proxy Authentication Required,Client Error 408,Request Timeout,Client Error 409,Conflict,Client Error 410,Gone,Client Error 411,Length Required,Client Error 412,Precondition Failed,Client Error 413,Request Entity Too Large,Client Error 414,Request-URI Too Long,Client Error 415,Unsupported Media Type,Client Error 416,Requested Range Not Satisfiable,Client Error 417,Expectation Failed,Client Error 500,Internal Server Error,Server Error 501,Not Implemented,Server Error 502,Bad Gateway,Server Error 503,Service Unavailable,Server Error 504,Gateway Timeout,Server Error 505,HTTP Version Not Supported,Server Error '''))
def cubetl_config(ctx): ctx.add( 'cubetl.geo.continent', Dimension(name='continent', label='Continent', attributes=[ Attribute(name='continent_code', type='String', label='Continent Code'), Attribute(name='continent_name', type='String', label='Continent') ])) ctx.add( 'cubetl.geo.country', Dimension(name='country', label='Country', role="geo", info={ 'cv-flag-field': 'country_iso2', 'cv-geo-ref-model-attribute': 'country_iso2', 'cv-geo-ref-feature-attribute': 'iso_a2', 'cv-geo-ref-layer': 'countries', 'cv-geo-map-layers': [{ "name": "countries", "type": "vector", "attribution": "© NaturalEarth", "params": { "url": "maps/ne_110m_admin_0_countries.geojson", "format": "geojson", "wrapX": True } }] }, attributes=[ Attribute(name='country_iso2', type='String', label='Country Code'), Attribute(name='country_name', type='String', label='Country') ])) ctx.add( 'cubetl.geo.contcountry', HierarchyDimension( name='contcountry', label='Country', hierarchies=[ Hierarchy(name='contcountry', label='Country', levels=['continent', 'country']) ], attributes=[ DimensionAttribute(ctx.get('cubetl.geo.continent')), DimensionAttribute(ctx.get('cubetl.geo.country')) ])) ctx.add( 'cubetl.geo.region_3', Dimension(name='region_3', label='Region', attributes=[ Attribute(name='region_3_code', type='String', label='Region Code'), Attribute(name='region_3_name', type='String', label='Region') ])) ctx.add( 'cubetl.geo.region_4', Dimension(name='region_4', label='Province', attributes=[ Attribute(name='region_4_code', type='String', label='Province Code'), Attribute(name='region_4_name', type='String', label='Province') ])) ctx.add( 'cubetl.geo.region_5', Dimension(name='region_5', label='Town', attributes=[ Attribute(name='region_5_code', type='String', label='Town Code'), Attribute(name='region_5_name', type='String', label='Town') ])) ctx.add( 'cubetl.geo.region_3_4', HierarchyDimension( name='Region', label='Region', hierarchies=[ Hierarchy(name='region_3_4', label='Region', levels=['region_3', 'region_4']) ], attributes=[ DimensionAttribute(ctx.get('cubetl.geo.region_3')), DimensionAttribute(ctx.get('cubetl.geo.region_4')) ])) ctx.add( 'cubetl.geo.region_3_5', HierarchyDimension( name='Region', label='Region', hierarchies=[ Hierarchy(name='region_3_5', label='Region', levels=['region_3', 'region_4', 'region_5']) ], attributes=[ DimensionAttribute(ctx.get('cubetl.geo.region_3')), DimensionAttribute(ctx.get('cubetl.geo.region_4')), DimensionAttribute(ctx.get('cubetl.geo.region_5')) ]))
def cubetl_config(ctx): ctx.include('${ ctx.library_path }/datetime.py') ctx.include('${ ctx.library_path }/person.py') ctx.add('ine.sql.connection', sql.Connection(url='sqlite:///ine.sqlite3')) ctx.add( 'ine.autonomy_province.table', table.CSVMemoryTable(data=''' province,autonomy Albacete,Castilla la Mancha Alicante/Alacant,Comunidad Valenciana Almería,Andalucía Araba/Álava,País Vasco Asturias,Asturias Ávila,Castilla y León Badajoz,Extremadura "Balears, Illes",Comunidad Balear Barcelona,Cataluña Bizkaia,País Vasco Burgos,Castilla y León Cáceres,Extremadura Cádiz,Andalucía Cantabria,Cantabria Castellón/Castelló,Comunidad Valenciana Ciudad Real,Castilla la Mancha Córdoba,Andalucía "Coruña, A",Galicia Cuenca,Castilla la Mancha Gipuzkoa,País Vasco Girona,Cataluña Granada,Andalucía Guadalajara,Castilla la Mancha Huelva,Andalucía Huesca,Aragón Jaén,Andalucía León,Castilla y León Lleida,Cataluña Lugo,Galicia Madrid,Madrid Málaga,Andalucía Murcia,Murcia Navarra,Aragón Ourense,Galicia Palencia,Castilla y León "Palmas, Las",Canarias Pontevedra,Galicia "Rioja, La",Rioja Salamanca,Castilla y León Santa Cruz de Tenerife,Canarias Segovia,Castilla y León Sevilla,Andalucía Soria,Castilla y León Tarragona,Cataluña Teruel,Aragón Toledo,Castilla la Mancha Valencia/València,Comunidad Valenciana Valladolid,Castilla y León Zamora,Castilla y León Zaragoza,Aragón Ceuta,Ciudades Autónomas Melilla,Ciudades Autónomas ''')) ctx.add( 'ine.autonomy', olap.Dimension(name='autonomy', label='Autonomy', attributes=[olap.Attribute('autonomy', type='String')])) ctx.add( 'ine.province', olap.Dimension(name='province', label='Province', attributes=[olap.Attribute('province', type='String')])) ctx.add( 'ine.autonomyprovince', olap.HierarchyDimension(name='autonomyprovince', label='Province', attributes=[ DimensionAttribute( ctx.get('ine.autonomy')), DimensionAttribute(ctx.get('ine.province')) ])) ctx.add( 'ine.nationality', olap.Dimension( name='nationality', label='Nationality', attributes=[olap.Attribute('nationality', type='String')])) ctx.add( 'ine.census', olap.Fact( name='census', label='Census', #must_slice=ctx.get('cubetl.datetime.datemonthly'), # study when and how dimensions can be aggregated, this cube requires slicing by date or results are invalid #natural_key= #notes='', attributes=[ DimensionAttribute(ctx.get('cubetl.datetime.datemonthly')), DimensionAttribute(ctx.get('ine.autonomyprovince')), DimensionAttribute(ctx.get('ine.nationality')), DimensionAttribute(ctx.get('cubetl.person.gender')), DimensionAttribute(ctx.get('cubetl.person.age_range')), Measure(name='census', type='Integer', label="Population") ])) # TODO: Should not present avg/max/min # Generate a SQL star schema and mappings automatically sqlschema.OLAPToSQL.olap2sql(ctx, connection=ctx.get('ine.sql.connection')) ctx.get('olap2sql.olapmapper').entity_mapper( ctx.get('ine.census')).store_mode = TableMapper.STORE_MODE_INSERT # Define the data load process ctx.add( 'ine.process', flow.Chain(steps=[ #ctx.get('cubetl.config.print'), # Generate a Cubes model cubes10.Cubes10ModelWriter(olapmapper=ctx.get( 'olap2sql.olapmapper'), model_path="ine.cubes-model.json", config_path="ine.cubes-config.ini"), sql.Transaction(connection=ctx.get('ine.sql.connection')), fs.FileReader(path='census-2002.px', encoding=None), pcaxis.PCAxisParser(), flow.Chain( fork=True, steps=[ pcaxis.PCAxisIterator(), script.Delete(['data', 'pcaxis']), flow.Filter(condition="${ m['Sexo'] != 'Ambos sexos' }"), flow.Filter( condition= "${ m['Grupo quinquenal de edad'] != 'Total' }"), #flow.Filter(condition="${ m['Grupo de edad'] != 'Total' }"), flow.Filter(condition="${ m['Nacionalidad'] != 'Total' }"), flow.Filter( condition="${ m['Provincias'] != 'Total Nacional' }"), #flow.Skip(skip="${ random.randint(1, 1000) }"), #flow.Limit(limit=5000), script.Function(process_data), #flow.Filter(condition="${ m['date'].year < 2002 }"), cache.CachedTableLookup( table=ctx.get("ine.autonomy_province.table"), lookup={'province': lambda m: m['province_name']}), ctx.get('cubetl.util.print'), olap.Store(entity=ctx.get('ine.census'), mapper=ctx.get('olap2sql.olapmapper')), log.LogPerformance(), ]), ]))