Exemple #1
0
    def olap2sql(ctx, connection):
        """
        Generate OLAP to SQL mappings (normalized approach) for every fact in the context.

        The mapper registered in `ctx` under `olap2sql.olapmapper` is reused when one is
        found; otherwise a new `olap.OlapMapper` is created and registered under that name.

        For each `cubetl.olap.Fact` found in `ctx`, an entity mapper is generated through
        `OLAPToSQL.generate_star_schema_mapper_entity`, the mapper's SQL table is added to
        the context under the table's own URN, and the entity mapper is appended to the
        shared mapper's `mappers` list.

        Nothing is returned: results are left in `ctx`.
        """
        mapper_urn = 'olap2sql.olapmapper'

        # Look the shared mapper up without failing, creating it on first use.
        shared_mapper = ctx.get(mapper_urn, fail=False)
        if not shared_mapper:
            shared_mapper = olap.OlapMapper()
            ctx.add(mapper_urn, shared_mapper)

        for olap_fact in ctx.find(type=cubetl.olap.Fact):
            fact_mapper = OLAPToSQL.generate_star_schema_mapper_entity(
                ctx, connection, shared_mapper, olap_fact)

            # Register the generated SQL table so it is reachable through the context.
            generated_table = fact_mapper.sqltable
            ctx.add(generated_table.urn, generated_table)

            shared_mapper.mappers.append(fact_mapper)
Exemple #2
0
    def sql2olap(ctx, debug=False, prefix="sql2olap"):
        """
        Generate a CubETL OLAP schema from an SQL schema defined by CubETL SQL components
        (such a schema can automatically be generated from an existing SQL database using the
        `sql2cubetl` function).

        The process is controlled through context properties (`ctx.props`), which are
        resolved with `_match_config`. Note that the property keys always start with the
        literal `sql2olap.`; they do not follow the `prefix` argument.

        Options:

          * `sql2olap.table.<table_name>.type=ignore` ignores the given SQL table.
          * `sql2olap.<column_urn>.type=ignore` ignores the given SQL column.
          * `sql2olap.<column_urn>.type=attribute` forces the SQL column to be used as fact attribute.
          * `sql2olap.<column_urn>.type=dimension` forces the SQL column to be used as dimension.
          * `sql2olap.<column_urn>.type=measure` forces the SQL column to be used as measure.

        Details:

        This method works by walking objects of class SQLTable in the context, and generating a
        cubetl.olap.Fact for each. Tables referenced via foreign keys are included as dimensions,
        but only when the referenced table has already been turned into a fact earlier in the
        walk; other foreign keys (including self references) are skipped with a warning.

        When no type is forced, "String" columns become embedded dimensions, "Float" and
        "Integer" columns become measures and "DateTime" columns are mapped onto the
        `cubetl.datetime.date` dimension (year, month, day and week).

        Tables with no primary key column, or with more than one, are skipped with a warning.

        Parameters:

          * `ctx`: CubETL context holding the SQLTable components; generated components are
            added to it.
          * `debug`: currently unused.
          * `prefix`: prefix of the URNs under which generated keys, facts and the final
            OLAP mapper (`<prefix>.olapmapper`) are added to the context.

        Returns the same `ctx` object.
        """

        # TODO: New generation refactor

        # Create a new Dimension for each found field, unless configuration says they are the same dimension
        # (or can be deduced: ie: same column name + size + same user type (raw dates, 0/1 boolean...).

        # Then, instance an olap sql-to-olap (process tables and columns, generate olap and olap mappings)

        # Implement querying

        # Move these SQL/OLAP method to Cubetl components.

        # Normalize/formalize column/name/id/schema/database usage

        # (optionally, at the end, export to cubes)
        # (should theoretically be able to create olap-2-star-schema mappings, then create tables and load)
        # (theoretically, we should be able to generate the same mappings from the generated star-schema
        #  (would require identifying split dims/hierarchies)

        # Load datetime
        ctx.include(ctx.library_path + "/datetime.py")

        # Mappings for datetime
        datedimension = ctx.get("cubetl.datetime.date")

        # Date dimension attributes mapped for each DateTime column, with the SQL
        # function used to extract each one from the column.
        date_part_functions = (
            ('year', OlapMapping.FUNCTION_YEAR),
            ('month', OlapMapping.FUNCTION_MONTH),
            ('day', OlapMapping.FUNCTION_DAY),
            ('week', OlapMapping.FUNCTION_WEEK),
        )

        facts = {}  # Facts generated so far, indexed by table name
        olapmappers = []

        logger.info("Generating CubETL Olap schema from SQL schema.")

        sqltables = ctx.find(type=cubetl.sql.sql.SQLTable)
        for sqltable in sqltables:

            olap_type_table = _match_config(
                ctx.props, 'sql2olap.table.%s.type' % sqltable.name, None)

            logger.info("Fact: %s", sqltable.name)
            if olap_type_table == 'ignore':
                logger.info("SQL2OLAP ignoring SQL table: %s", sqltable)
                continue

            factmappings = []
            factattributes = []
            key_count = 0

            for dbcol in sqltable.columns:

                olap_type = _match_config(ctx.props,
                                          'sql2olap.%s.type' % dbcol.urn, None)
                if olap_type:
                    logger.info("Column: %s (forced type: %s)", dbcol,
                                olap_type)
                else:
                    logger.info("Column: %s", dbcol)

                if olap_type == 'ignore':
                    logger.info("SQL2OLAP ignoring SQL column: %s", dbcol)
                    continue

                if dbcol.pk:
                    # NOTE: the key is added to the context right away, so it stays
                    # registered even if the table is skipped further below.
                    key_urn = "%s.fact.%s.key.%s" % (prefix, sqltable.name,
                                                     dbcol.name)
                    key = ctx.add(
                        key_urn,
                        Key(name=dbcol.name,
                            type=dbcol.type,
                            label=dbcol.label))
                    factattributes.append(key)

                    factmapping = OlapMapping(path=[key], sqlcolumn=dbcol)
                    factmappings.append(factmapping)

                    key_count += 1

                is_foreign_key = isinstance(dbcol, cubetl.sql.sql.SQLColumnFK)

                if is_foreign_key:
                    related_fact_name = dbcol.fk_sqlcolumn.sqltable.name
                    if related_fact_name == sqltable.name:
                        # Reference to self
                        # TODO: This does not account for circular dependencies across other entities
                        logger.warning(
                            "Ignoring foreign key reference to self: %s",
                            dbcol.name)
                        continue

                    # Only tables already processed in this walk are available as facts
                    related_fact = facts.get(related_fact_name, None)
                    if related_fact is None:
                        logger.warning(
                            "Ignoring foreign key reference from %s.%s to not available entity: %s",
                            dbcol.sqltable.name, dbcol.name, related_fact_name)
                        continue

                    # Create dimension attribute
                    dimension_attribute = olap.DimensionAttribute(
                        related_fact, name=dbcol.name, label=dbcol.label)
                    factattributes.append(dimension_attribute)

                    # Create a mapping
                    factdimensionmapping = OlapMapping(
                        path=[dimension_attribute], sqlcolumn=dbcol)
                    factmappings.append(factdimensionmapping)

                # Columns that are neither primary keys nor foreign keys
                is_plain_column = not dbcol.pk and not is_foreign_key

                if is_plain_column and (
                        olap_type == 'dimension' or
                        (olap_type is None and dbcol.type == "String")):
                    # Embedded dimension (single column treated as a dimension)

                    dimension_attribute = olap.Attribute(name=dbcol.name,
                                                         type=dbcol.type,
                                                         label=dbcol.label)
                    dimension = olap.Dimension(
                        name=dbcol.name,
                        label=dbcol.label,
                        attributes=[dimension_attribute])

                    factattributes.append(
                        DimensionAttribute(dimension, dimension.name,
                                           dimension.label))

                    # This dimension is mapped in the parent table
                    factmapping = OlapMapping(
                        path=[dimension, dimension_attribute], sqlcolumn=dbcol)
                    factmappings.append(factmapping)

                if is_plain_column and olap_type == 'attribute':
                    # Attribute (detail)
                    attribute = Attribute(name=dbcol.name,
                                          type=dbcol.type,
                                          label=dbcol.label)
                    factattributes.append(attribute)

                    factmapping = OlapMapping(path=[attribute],
                                              sqlcolumn=dbcol)
                    factmappings.append(factmapping)

                # NOTE(review): the `elif` below pairs only with this measure test, so
                # DateTime columns that are primary keys, foreign keys or forced to
                # 'attribute'/'dimension' also receive the date mappings. Confirm
                # whether that is intended.
                if is_plain_column and (
                        olap_type == 'measure' or
                        (olap_type is None
                         and dbcol.type in ("Float", "Integer"))):

                    measure = Measure(name=dbcol.name,
                                      type=dbcol.type,
                                      label=dbcol.label)
                    factattributes.append(measure)

                    factmapping = OlapMapping(path=[measure], sqlcolumn=dbcol)
                    factmappings.append(factmapping)

                elif dbcol.type == "DateTime":
                    # Exact comparison: the previous `in ("DateTime")` was a substring
                    # test against a plain string, so it also matched values such as
                    # "Date" or "Time" and raised TypeError for non-string types.

                    # Date dimension
                    datedimension = ctx.get("cubetl.datetime.date")

                    # Create dimension attribute
                    dimension_attribute = olap.DimensionAttribute(
                        datedimension, name=dbcol.name, label=dbcol.label)
                    factattributes.append(dimension_attribute)

                    # TODO: This shall be common
                    #mapper = olap.sql.EmbeddedDimensionMapper(entity=datedimension, sqltable=None)
                    #olapmapper.mappers.append(mapper)

                    # One mapping per date part, all reading from the same SQL column
                    # (quarter is currently not mapped).
                    for part_name, part_function in date_part_functions:
                        mapping = OlapMapping(
                            path=[
                                dimension_attribute,
                                dimension_attribute.dimension.attribute(
                                    part_name)
                            ],
                            sqlcolumn=dbcol,
                            function=part_function)
                        factmappings.append(mapping)

                    # TODO: consider creating an alias of the date dimension for each
                    # datetime column (one dimension per column name used) instead of
                    # sharing the date dimension directly.

            # Ignore table if more than one primary key was found
            if key_count > 1:
                logger.warning(
                    "Multiple primary key found in table %s (not supported, ignoring table)",
                    sqltable.name)
                continue

            # Ignore table if it contains no primary key
            if key_count == 0:
                logger.warning(
                    "No primary key found in table %s (not supported, ignoring table)",
                    sqltable.name)
                continue

            # Define fact
            fact_urn = "%s.fact.%s" % (prefix, sqltable.name)
            fact = ctx.add(
                fact_urn,
                olap.Fact(name=sqltable.name,
                          label=sqltable.label,
                          attributes=factattributes))
            facts[fact.name] = fact

            # Create an olapmapper for this fact
            # TODO: review whether this is necessary or we could use a single mapper
            olapmapper = olap.OlapMapper()

            mapper = olap.sql.TableMapper(entity=fact,
                                          sqltable=sqltable,
                                          mappings=factmappings)
            olapmapper.mappers.append(mapper)
            olapmappers.append(olapmapper)
            #ctx.register(mapper)  #, uri='%s:fact' % ctx.uri(sqltable)

            # IDs should be defined in mappings, not entity Keys
            #  mappings:
            #  - name: id
            #    pk: True
            #    type: Integer
            #    value: ${ int(m["id"]) }

        # Top level mapper that includes every per-fact mapper
        olapmapper = olap.OlapMapper()
        olapmapper.include = list(olapmappers)
        olapmapper_urn = "%s.olapmapper" % (prefix)
        ctx.add(olapmapper_urn, olapmapper)

        return ctx
Exemple #3
0
def sql2cubes(db_url,
              model_path=None,
              tables=None,
              dimensions=None,
              debug=False):
    """
    Generate a Cubes model file from the schema of an existing SQL database.

    The database schema is reflected with SQLAlchemy. For every table (tables whose
    name starts with `sqlite_` are skipped) a CubETL fact and OLAP mapper are built:

      * primary key columns (INTEGER, TEXT or VARCHAR) become fact key mappings,
      * foreign key columns become alias dimensions pointing to the referenced fact,
      * string-like columns, and columns listed in `dimensions`, become embedded
        dimensions,
      * FLOAT, REAL, DECIMAL and INTEGER columns become measures,
      * DATETIME columns become aliases of the `cubetl.datetime.date` dimension
        (year, quarter, month, week and day extracted from the column),
      * any other column is reported on standard output and left unmapped.

    Columns named `key` are always skipped. The resulting components are exported
    through `cubes.Cubes10ModelWriter` and the produced JSON is written to disk
    encoded as UTF-8.

    Parameters:

      * `db_url`: database URL, passed to SQLAlchemy `create_engine`.
      * `model_path`: path where the model is written. When empty, a temporary file
        (`cubesext-model-*.json`) is created instead.
      * `tables`: currently unused.
      * `dimensions`: optional list of column names forced to be dimensions.
      * `debug`: debug flag passed to the CubETL bootstrap and set on the context.

    Returns the path of the written model file.

    Raises `Exception` if a primary key column has an unsupported type or if a column
    holds more than one foreign key.
    """

    exclude_columns = ['key']
    force_dimensions = dimensions if dimensions else []

    # NOTE: no explicit connection is opened here; `reflect` obtains what it needs
    # from the engine (the connection formerly opened was never used nor closed).
    engine = create_engine(db_url)

    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)

    connection = sql.Connection()
    connection.id = "cubesutils.connection"
    connection.url = engine.url

    # Create Cubetl context
    cubesbootstrap = Bootstrap()
    ctx = cubesbootstrap.init(debug=debug)
    # Honour the caller's flag (it was previously forced to True regardless)
    ctx.debug = debug

    # Load yaml library definitions that are dependencies
    cubetlconfig.load_config(
        ctx,
        os.path.dirname(__file__) + "/cubetl-datetime.yaml")

    olapmappers = {}  # Indexed by table name
    factdimensions = {}  # Indexed by table_name
    facts = {}  # Indexed by table name

    def coltype(dbcol):
        """Return the CubETL type name for a column, or None if not recognized."""
        sqltype = str(dbcol.type)
        if sqltype in ("FLOAT", "REAL", "DECIMAL"):
            return "Float"
        if sqltype in ("INTEGER", "BIGINT"):
            return "Integer"
        if sqltype in ("BOOLEAN", "TEXT") or sqltype.startswith("VARCHAR"):
            return "String"
        return None

    # Foreign keys are resolved against facts/mappers of tables already visited,
    # so this relies on `sorted_tables` yielding referenced tables first.
    for dbtable in metadata.sorted_tables:

        if dbtable.name.startswith('sqlite_'):
            continue

        print("Table: %s" % dbtable.name)

        tablename = slugify.slugify(dbtable.name, separator="_")

        # Define fact
        fact = olap.Fact()
        fact.id = "cubesutils.%s.fact" % (tablename)
        fact.name = tablename
        fact.label = dbtable.name
        fact.dimensions = []
        fact.measures = []
        fact.attributes = []

        facts[dbtable.name] = fact

        olapmapper = olap.OlapMapper()
        olapmapper.id = "cubesutils.%s.olapmapper" % (tablename)
        olapmapper.mappers = []
        olapmapper.include = []

        factmappings = []

        for dbcol in dbtable.columns:

            if dbcol.name in exclude_columns:
                continue

            print("  Column: %s [type=%s, null=%s, pk=%s, fk=%s]" %
                  (dbcol.name, dbcol.type, dbcol.nullable, dbcol.primary_key,
                   dbcol.foreign_keys))

            sqltype = str(dbcol.type)
            colname = slugify.slugify(dbcol.name, separator="_")

            if dbcol.primary_key:
                if sqltype == "INTEGER":
                    pk_type = 'Integer'
                elif sqltype == "TEXT" or sqltype.startswith("VARCHAR"):
                    pk_type = 'String'
                else:
                    raise Exception(
                        "Unknown column type (%s) for primary key column: %s" %
                        (dbcol.type, dbcol.name))

                factmappings.append({
                    'name': colname,
                    'pk': True,
                    'type': pk_type
                })

            elif dbcol.foreign_keys:

                if len(dbcol.foreign_keys) > 1:
                    raise Exception(
                        "Multiple foreign keys found for column: %s" %
                        (dbcol.name))

                related_fact = next(iter(
                    dbcol.foreign_keys)).column.table.name

                if related_fact == dbtable.name:
                    # Reference to self
                    # TODO: This does not account for circular dependencies across other entities
                    continue

                related_slug = slugify.slugify(related_fact, separator="_")

                # One fact dimension per referenced table, shared by all referrers
                if related_fact in factdimensions:
                    factdimension = factdimensions[related_fact]
                else:
                    factdimension = olap.FactDimension()
                    factdimension.id = "cubesutils.%s.dim.%s" % (tablename,
                                                                 related_slug)
                    factdimension.name = related_slug
                    factdimension.label = related_fact
                    factdimension.fact = facts[related_fact]
                    cubetl.container.add_component(factdimension)

                    factdimensions[related_fact] = factdimension

                # Create an alias (named after table, referenced table and column)
                alias_suffix = colname.replace("_id", "")
                alias_name = tablename + "_" + related_fact + "_" + alias_suffix

                aliasdimension = olap.AliasDimension()
                aliasdimension.dimension = factdimension
                aliasdimension.id = "cubesutils.%s.dim.%s.%s" % (
                    tablename, related_slug, colname)
                aliasdimension.name = alias_name
                aliasdimension.label = tablename + " " + related_fact + " " + alias_suffix
                cubetl.container.add_component(aliasdimension)

                fact.dimensions.append(aliasdimension)

                mapper = olap.sql.FactDimensionMapper()
                mapper.entity = aliasdimension
                mapper.mappings = [{
                    'name': alias_name,
                    'column': dbcol.name,
                    'pk': True
                }]
                olapmapper.include.append(olapmappers[related_fact])
                olapmapper.mappers.append(mapper)

            elif (dbcol.name
                  in force_dimensions) or coltype(dbcol) == "String":

                # Create dimension
                dimension_name = tablename + "_" + colname

                dimension = olap.Dimension()
                dimension.id = "cubesutils.%s.dim.%s" % (tablename, colname)
                dimension.name = dimension_name
                dimension.label = dbcol.name
                dimension.attributes = [{
                    "pk": True,
                    "name": dimension_name,
                    "type": coltype(dbcol)
                }]

                cubetl.container.add_component(dimension)
                fact.dimensions.append(dimension)

                mapper = olap.sql.EmbeddedDimensionMapper()
                mapper.entity = dimension
                # NOTE(review): 'column' is the slugified column name here, while the
                # foreign key mapping above uses the raw column name - confirm.
                mapper.mappings = [{
                    'name': dimension_name,
                    'column': colname
                }]
                olapmapper.mappers.append(mapper)

            elif sqltype in ("FLOAT", "REAL", "DECIMAL", "INTEGER"):

                measure = {
                    "name": dbcol.name,
                    "label": dbcol.name,
                    "type": "Integer" if sqltype == "INTEGER" else "Float"
                }
                fact.measures.append(measure)

                # TODO: Also add dimension if integer, but not too many

            elif sqltype == "DATETIME":
                # Exact comparison: the previous `in ("DATETIME")` was a substring
                # test against a plain string, so DATE and TIME columns matched too.

                datedimension = cubetl.container.get_component_by_id(
                    "cubetl.datetime.date")

                # Create an alias to a datetime dimension
                alias_suffix = colname.replace("_id", "")

                aliasdimension = olap.AliasDimension()
                aliasdimension.dimension = datedimension
                aliasdimension.id = "cubesutils.%s.dim.%s.%s" % (
                    tablename, "datetime", colname)
                aliasdimension.name = tablename + "_" + alias_suffix
                aliasdimension.label = tablename + " " + alias_suffix
                cubetl.container.add_component(aliasdimension)

                fact.dimensions.append(aliasdimension)

                mapper = olap.sql.EmbeddedDimensionMapper()
                mapper.entity = aliasdimension
                # Each date part is extracted from the same datetime column
                mapper.mappings = [{
                    'name': part,
                    'column': dbcol.name,
                    'extract': part
                } for part in ('year', 'quarter', 'month', 'week', 'day')]
                #olapmapper.include.append(olapmappers[related_fact])
                olapmapper.mappers.append(mapper)

            else:

                print("    Cannot map column '%s' (type: %s)" %
                      (dbcol.name, dbcol.type))

        mapper = olap.sql.FactMapper()
        mapper.entity = fact
        mapper.table = dbtable.name
        mapper.connection = connection
        if factmappings:
            mapper.mappings = factmappings
        else:
            # No primary key was mapped: fall back to a default 'index' key
            mapper.mappings = [{
                'name': 'index',
                'pk': True,
                'type': 'Integer'
            }]
        olapmapper.mappers.append(mapper)

        #  mappings:
        #  - name: id
        #    pk: True
        #    type: Integer
        #    value: ${ int(m["id"]) }

        cubetl.container.add_component(fact)
        olapmappers[dbtable.name] = olapmapper

    # Export process
    modelwriter = cubes.Cubes10ModelWriter()
    modelwriter.id = "cubesutils.export-cubes"
    modelwriter.olapmapper = olap.OlapMapper()
    modelwriter.olapmapper.include = list(olapmappers.values())
    cubetl.container.add_component(modelwriter)

    # Launch process
    ctx.start_node = "cubesutils.export-cubes"
    result = cubesbootstrap.run(ctx)
    model_json = result["cubesmodel_json"]

    # Write model (UTF-8 in both cases)
    if model_path:
        with open(model_path, "w", encoding="utf-8") as model_file:
            model_file.write(model_json)
    else:
        (model_fd, model_path) = tempfile.mkstemp(suffix='.json',
                                                  prefix='cubesext-model-')
        # fdopen takes ownership of the descriptor: it is closed even if the
        # write fails, and the whole payload is written.
        with os.fdopen(model_fd, "wb") as model_file:
            model_file.write(model_json.encode("utf-8"))

    return model_path