Esempio n. 1
0
def cubetl_config(ctx):
    """
    Configure the Eurostat (SDMX) example.

    Registers the 'estat.sql.connection' component, derives OLAP entities
    from the SDMX DSD file, generates a SQL schema for them and registers
    the 'estat.process' chain that reads the SDMX data file and stores it.
    """

    sqlite_connection = sql.Connection(url='sqlite:///estat.sqlite3')
    ctx.add('estat.sql.connection', sqlite_connection)

    # Load SDMX schema and transform it to CubETL OLAP entities
    sdmx.SDMXToOLAP.sdmx2olap(
        ctx,
        path_dsd='data/eip_ext1.dsd.xml',
        fact_name='estat_eip',
        fact_label='Eurostat / Entrepreneurship Indicator Programme',
    )

    # Generate a SQL schema from the OLAP schema, then switch the mapper of
    # the generated fact to the INSERT store mode.
    sqlschema.OLAPToSQL.olap2sql(ctx, connection=ctx.get('estat.sql.connection'))
    fact_mapper = ctx.get('olap2sql.olapmapper').entity_mapper(
        ctx.get('smdx2olap.fact.estat_eip'))
    fact_mapper.store_mode = TableMapper.STORE_MODE_INSERT

    # Steps of the data load process, built in the order they run.
    # (ctx.get('cubetl.config.print') can be prepended when debugging.)

    # Generate a Cubes model
    model_writer = cubes10.Cubes10ModelWriter(
        olapmapper=ctx.get('olap2sql.olapmapper'),
        model_path="estat.cubes-model.json",
        config_path="estat.cubes-config.ini",
    )

    transaction = sql.Transaction(connection=ctx.get('estat.sql.connection'))

    # Forked sub-chain: read SDMX messages, print them, store them and log
    # performance figures.
    load_steps = [
        sdmx.SDMXFileReader(path_dsd='data/eip_ext1.dsd.xml',
                            path_sdmx='data/eip_ext1.sdmx.xml'),
        ctx.get('cubetl.util.print'),
        olap.Store(entity=ctx.get('smdx2olap.fact.estat_eip'),
                   mapper=ctx.get('olap2sql.olapmapper')),
        log.LogPerformance(),
    ]
    loader = flow.Chain(fork=True, steps=load_steps)

    # Define the data load process
    ctx.add('estat.process',
            flow.Chain(steps=[model_writer, transaction, loader]))
Esempio n. 2
0
def cubetl_config(ctx):
    """
    Configure the Elasticsearch example.

    Registers 'es.connection' and two chains: 'es.process', which reads the
    file named by the 'file_path' property line by line, parses each line
    and indexes it, and 'es.search', which runs a search on the same index.
    """

    # Keep a 'file_path' supplied by the caller, otherwise use the default.
    default_log_path = '../loganalyzer/access.log'
    ctx.props['file_path'] = ctx.props.get('file_path', default_log_path)

    for library_file in ('${ ctx.library_path }/datetime.py',
                         '${ ctx.library_path }/http.py'):
        ctx.include(library_file)

    es_connection = elasticsearch.ElasticsearchConnection(
        url='http://localhost:9200')
    ctx.add('es.connection', es_connection)

    # Define the data load process
    indexing_steps = [
        fs.FileLineReader(path='${ ctx.props["file_path"] }', encoding=None),
        ctx.get('cubetl.http.parse.apache_combined'),
        util.PrettyPrint(depth=4),
        # The document id is taken from the message's 'data' field.
        elasticsearch.Index(
            es=ctx.get("es.connection"),
            index="test-index",
            doc_type="logline",
            data_id=lambda m: m['data'],
        ),
        log.LogPerformance(),
    ]
    ctx.add('es.process', flow.Chain(steps=indexing_steps))

    # Define the search process (no query is given)
    search_steps = [
        elasticsearch.Search(
            es=ctx.get("es.connection"),
            index="test-index",
            query=None,
        ),
        util.PrettyPrint(depth=4),
        log.LogPerformance(),
    ]
    ctx.add('es.search', flow.Chain(steps=search_steps))
Esempio n. 3
0
def cubetl_config(ctx):
    """
    Configure the directory-to-CSV example.

    Registers the 'directorycsv.process' chain: list a directory, gather
    file information, run the module-level `process_data` callable on each
    message, print it and write it out as CSV.
    """

    # Generates a message for each file in the given directory; the path is
    # supplied as a callable reading the 'path' property (default "/").
    directory_lister = fs.DirectoryList(
        path=lambda ctx: ctx.props.get("path", "/"),
        maxdepth=0,
    )

    file_info = fs.FileInfo()  # path=lambda m: m['path'])

    custom_step = script.Function(process_data)

    # Print the message
    printer = util.Print()

    # Generates CSV header and rows and writes them
    csv_writer = csv.CsvFileWriter()  # path="/tmp/files", overwrite=True

    performance_log = log.LogPerformance()

    pipeline = flow.Chain(steps=[
        directory_lister,
        file_info,
        custom_step,
        printer,
        csv_writer,
        performance_log,
    ])
    ctx.add('directorycsv.process', pipeline)
Esempio n. 4
0
def cubetl_config(ctx):
    """
    Minimal ETL process: log a message, list the files found in the
    library path and print each resulting message to standard output.

    The chain is registered as 'my_app.process' with fork disabled.
    """

    # Log a message through the logging system
    banner = log.Log(message='CubETL Example (Calling CubETL from Python)',
                     level=log.Log.LEVEL_WARN)

    # Generates a message for each file in the given directory
    library_listing = fs.DirectoryList(path=ctx.library_path)

    # Print the message (use -q when calling cubetl to hide print output)
    printer = ctx.get('cubetl.util.print')

    process = flow.Chain(fork=False,
                         steps=[banner, library_listing, printer])
    ctx.add('my_app.process', process)
Esempio n. 5
0
def cubetl_config(ctx):
    """
    Simple ETL process: log a message, list the files found in the library
    path, pass every message through the module-level `process_data`
    callable and print the resulting messages to standard output.

    The chain is registered as 'directorylist.process'.
    """

    # Log a message through the logging system
    banner = log.Log(message='CubETL Example', level=log.Log.LEVEL_WARN)

    # Generates a message for each file in the given directory
    library_listing = fs.DirectoryList(path=ctx.library_path)

    # Manipulate each message with a custom function
    custom_step = script.Function(process_data)

    # Print the message (use -q when calling cubetl to hide print output)
    printer = ctx.get('cubetl.util.print')

    steps = [banner, library_listing, custom_step, printer]
    ctx.add('directorylist.process', flow.Chain(steps=steps))
Esempio n. 6
0
def cubetl_config(ctx):
    """
    Configure the Odoo example.

    Registers a SQLite connection ('odoo.sql.connection'), an Odoo
    connection ('odoo.conn') and the 'odoo.process' chain, which opens a
    SQL transaction, runs the module-level `test` and `test2` callables and
    then pretty-prints messages in a forked sub-chain.
    """

    ctx.include('${ ctx.library_path }/datetime.py')

    sqlite_connection = sql.Connection(url='sqlite:///odoo.sqlite3')
    ctx.add('odoo.sql.connection', sqlite_connection)

    # The bare string literal below is never used: it only keeps a disabled
    # OLAP model definition (carried over from the INE example) out of
    # execution.
    '''
    ctx.add('ine.autonomy', olap.Dimension(
        name='autonomy',
        label='Autonomy',
        attributes=[olap.Attribute('autonomy', type='String')]))

    ctx.add('ine.province', olap.Dimension(
        name='province',
        label='Province',
        attributes=[olap.Attribute('province', type='String')]))

    ctx.add('ine.autonomyprovince', olap.HierarchyDimension(
        name='autonomyprovince',
        label='Province',
        attributes=[DimensionAttribute(ctx.get('ine.autonomy')),
                    DimensionAttribute(ctx.get('ine.province'))]))

    ctx.add('ine.nationality', olap.Dimension(
        name='nationality',
        label='Nationality',
        attributes=[olap.Attribute('nationality', type='String')]))

    ctx.add('ine.census', olap.Fact(
        name='census',
        label='Census',
        #must_slice=ctx.get('cubetl.datetime.datemonthly'),  # study when and how dimensions can be aggregated, this cube requires slicing by date or results are invalid
        #natural_key=
        #notes='',
        attributes=[DimensionAttribute(ctx.get('cubetl.datetime.datemonthly')),
                    DimensionAttribute(ctx.get('ine.autonomyprovince')),
                    DimensionAttribute(ctx.get('ine.nationality')),
                    Measure(name='census', type='Integer', label="Population")]))  # TODO: Should not present avg/max/min
    '''

    # Disabled: generate a SQL star schema and mappings automatically
    #sqlschema.OLAPToSQL.olap2sql(ctx, connection=ctx.get('ine.sql.connection'))
    #ctx.get('olap2sql.olapmapper').entity_mapper(ctx.get('ine.census')).store_mode = TableMapper.STORE_MODE_INSERT

    odoo_connection = odoo.OdooConnection(
        url="http://127.0.0.1:8069",
        database="test",
        username="******",
        password="******",
    )
    ctx.add("odoo.conn", odoo_connection)

    # Steps of the data load process, built in the order they run.
    # Disabled alternatives: ctx.get('cubetl.config.print'),
    # cubes10.Cubes10ModelWriter(...), odoo.Execute(),
    # odoo.Dump('account.move').
    transaction = sql.Transaction(connection=ctx.get('odoo.sql.connection'))
    first_step = script.Function(test)
    second_step = script.Function(test2)

    # Forked sub-chain. Disabled alternatives: script.Delete,
    # script.Function(process_data), flow.Filter on m['date'].year,
    # cache.CachedTableLookup, ctx.get('cubetl.util.print'), util.Print()
    # and olap.Store.
    report_steps = [
        util.PrettyPrint(depth=4),
        log.LogPerformance(),
    ]
    reporter = flow.Chain(fork=True, steps=report_steps)

    # Define the data load process
    ctx.add(
        'odoo.process',
        flow.Chain(steps=[transaction, first_step, second_step, reporter]))
Esempio n. 7
0
def cubetl_config(ctx):
    """
    Configure the INE census example.

    Registers a SQLite connection, an in-memory province -> autonomy lookup
    table, the OLAP dimensions and the 'ine.census' fact, generates the SQL
    schema and mappings for them, and registers the 'ine.process' chain
    that parses the 'census-2002.px' PC-Axis file and stores its rows.
    """

    ctx.include('${ ctx.library_path }/datetime.py')
    ctx.include('${ ctx.library_path }/person.py')

    ctx.add('ine.sql.connection', sql.Connection(url='sqlite:///ine.sqlite3'))

    # Province -> autonomous community lookup table. Single-province
    # communities map to their own name (Asturias, Cantabria, Madrid,
    # Murcia, Navarra).
    ctx.add(
        'ine.autonomy_province.table',
        table.CSVMemoryTable(data='''
            province,autonomy
            Albacete,Castilla la Mancha
            Alicante/Alacant,Comunidad Valenciana
            Almería,Andalucía
            Araba/Álava,País Vasco
            Asturias,Asturias
            Ávila,Castilla y León
            Badajoz,Extremadura
            "Balears, Illes",Comunidad Balear
            Barcelona,Cataluña
            Bizkaia,País Vasco
            Burgos,Castilla y León
            Cáceres,Extremadura
            Cádiz,Andalucía
            Cantabria,Cantabria
            Castellón/Castelló,Comunidad Valenciana
            Ciudad Real,Castilla la Mancha
            Córdoba,Andalucía
            "Coruña, A",Galicia
            Cuenca,Castilla la Mancha
            Gipuzkoa,País Vasco
            Girona,Cataluña
            Granada,Andalucía
            Guadalajara,Castilla la Mancha
            Huelva,Andalucía
            Huesca,Aragón
            Jaén,Andalucía
            León,Castilla y León
            Lleida,Cataluña
            Lugo,Galicia
            Madrid,Madrid
            Málaga,Andalucía
            Murcia,Murcia
            Navarra,Navarra
            Ourense,Galicia
            Palencia,Castilla y León
            "Palmas, Las",Canarias
            Pontevedra,Galicia
            "Rioja, La",Rioja
            Salamanca,Castilla y León
            Santa Cruz de Tenerife,Canarias
            Segovia,Castilla y León
            Sevilla,Andalucía
            Soria,Castilla y León
            Tarragona,Cataluña
            Teruel,Aragón
            Toledo,Castilla la Mancha
            Valencia/València,Comunidad Valenciana
            Valladolid,Castilla y León
            Zamora,Castilla y León
            Zaragoza,Aragón
            Ceuta,Ciudades Autónomas
            Melilla,Ciudades Autónomas
        '''))

    ctx.add(
        'ine.autonomy',
        olap.Dimension(name='autonomy',
                       label='Autonomy',
                       attributes=[olap.Attribute('autonomy', type='String')]))

    ctx.add(
        'ine.province',
        olap.Dimension(name='province',
                       label='Province',
                       attributes=[olap.Attribute('province', type='String')]))

    # Hierarchy combining the autonomy and province dimensions
    ctx.add(
        'ine.autonomyprovince',
        olap.HierarchyDimension(
            name='autonomyprovince',
            label='Province',
            attributes=[
                DimensionAttribute(ctx.get('ine.autonomy')),
                DimensionAttribute(ctx.get('ine.province')),
            ]))

    ctx.add(
        'ine.nationality',
        olap.Dimension(
            name='nationality',
            label='Nationality',
            attributes=[olap.Attribute('nationality', type='String')]))

    ctx.add(
        'ine.census',
        olap.Fact(
            name='census',
            label='Census',
            #must_slice=ctx.get('cubetl.datetime.datemonthly'),  # study when and how dimensions can be aggregated, this cube requires slicing by date or results are invalid
            #natural_key=
            #notes='',
            attributes=[
                DimensionAttribute(ctx.get('cubetl.datetime.datemonthly')),
                DimensionAttribute(ctx.get('ine.autonomyprovince')),
                DimensionAttribute(ctx.get('ine.nationality')),
                DimensionAttribute(ctx.get('cubetl.person.gender')),
                DimensionAttribute(ctx.get('cubetl.person.age_range')),
                Measure(name='census', type='Integer', label="Population")
            ]))  # TODO: Should not present avg/max/min

    # Generate a SQL star schema and mappings automatically, then switch the
    # census fact mapper to the INSERT store mode.
    sqlschema.OLAPToSQL.olap2sql(ctx, connection=ctx.get('ine.sql.connection'))
    ctx.get('olap2sql.olapmapper').entity_mapper(
        ctx.get('ine.census')).store_mode = TableMapper.STORE_MODE_INSERT

    # Define the data load process
    ctx.add(
        'ine.process',
        flow.Chain(steps=[

            #ctx.get('cubetl.config.print'),

            # Generate a Cubes model
            cubes10.Cubes10ModelWriter(
                olapmapper=ctx.get('olap2sql.olapmapper'),
                model_path="ine.cubes-model.json",
                config_path="ine.cubes-config.ini"),
            sql.Transaction(connection=ctx.get('ine.sql.connection')),
            fs.FileReader(path='census-2002.px', encoding=None),
            pcaxis.PCAxisParser(),
            flow.Chain(
                fork=True,
                steps=[
                    pcaxis.PCAxisIterator(),
                    script.Delete(['data', 'pcaxis']),

                    # Filter on the source's total rows ("Ambos sexos",
                    # "Total", "Total Nacional").
                    flow.Filter(condition="${ m['Sexo'] != 'Ambos sexos' }"),
                    flow.Filter(
                        condition=
                        "${ m['Grupo quinquenal de edad'] != 'Total' }"),
                    #flow.Filter(condition="${ m['Grupo de edad'] != 'Total' }"),
                    flow.Filter(condition="${ m['Nacionalidad'] != 'Total' }"),
                    flow.Filter(
                        condition="${ m['Provincias'] != 'Total Nacional' }"),

                    #flow.Skip(skip="${ random.randint(1, 1000) }"),
                    #flow.Limit(limit=5000),
                    script.Function(process_data),

                    #flow.Filter(condition="${ m['date'].year < 2002 }"),

                    # Look up the table row whose 'province' matches the
                    # message's 'province_name'.
                    cache.CachedTableLookup(
                        table=ctx.get("ine.autonomy_province.table"),
                        lookup={'province': lambda m: m['province_name']}),
                    ctx.get('cubetl.util.print'),
                    olap.Store(entity=ctx.get('ine.census'),
                               mapper=ctx.get('olap2sql.olapmapper')),
                    log.LogPerformance(),
                ]),
        ]))
Esempio n. 8
0
def cubetl_config(ctx):
    """
    Configure the log analyzer example.

    Sets the process properties, registers the SQL connection, generates
    the SQL schema and mappings, and registers two chains:
    'loganalyzer.process' (parse an access log and store requests) and
    'loganalyzer.query' (run a test aggregate query and print it).
    """

    # Other library modules (datetime.py, geo.py, net.py) are not included
    # explicitly here.
    ctx.include('${ ctx.library_path }/http.py')

    # Process configuration

    ctx.props['db_url'] = 'sqlite:///loganalyzer.sqlite3'

    ctx.props['domain'] = 'cubesviewer.com'  #  ctx.interpolate('${ }')
    ctx.props['file_path'] = ctx.props.get('file_path', 'access.log')

    # Comma-separated extension list, plus a normalised (stripped,
    # lower-cased) list form of the same value.
    ctx.props['download_extensions'] = 'zip, tgz, gz, 7z, rar, iso, msi, exe, avi, mp3, mp4, ogg, mkv, pdf'
    raw_extensions = ctx.props['download_extensions'].split(',')
    ctx.props['download_extensions_list'] = [
        extension.strip().lower() for extension in raw_extensions
    ]
    ctx.props['download_size_bytes'] = 10 * 1024 * 1024

    # Database connection for loaded OLAP data
    db_url = ctx.interpolate(None, '${ ctx.props["db_url"] }')
    ctx.add('loganalyzer.sql.connection', sql.Connection(url=db_url))

    # Generate a SQL star schema and mappings automatically
    sqlschema.OLAPToSQL.olap2sql(
        ctx, connection=ctx.get('loganalyzer.sql.connection'))
    request_mapper = ctx.get('olap2sql.olapmapper').entity_mapper(
        ctx.get('cubetl.http.request'))
    request_mapper.store_mode = TableMapper.STORE_MODE_INSERT

    # Processes a log file and loads the database for OLAP
    load_steps = [
        ctx.get('cubetl.config.print'),

        # Generate a Cubes model
        cubes10.Cubes10ModelWriter(
            olapmapper=ctx.get('olap2sql.olapmapper'),
            model_path="loganalyzer.cubes-model.json",
            config_path="loganalyzer.cubes-config.ini",
        ),
        script.Delete(['cubesmodel', 'cubesmodel_json']),

        sql.Transaction(connection=ctx.get('loganalyzer.sql.connection')),

        fs.FileLineReader(path='${ ctx.props["file_path"] }', encoding=None),

        ctx.get('cubetl.http.parse.apache_combined'),

        geoip.GeoIPFromAddress(data="${ m['address'] }"),
        useragent.UserAgentParse(data="${ m['user_agent_string'] }"),

        # Status lookup by 'status_code', with an 'Unknown' description
        # passed as the default.
        cache.CachedTableLookup(
            table=ctx.get("cubetl.http.status.table"),
            lookup={'status_code': lambda m: m['status_code']},
            default={'status_description': 'Unknown'},
        ),

        script.Function(process_data),

        ctx.get('cubetl.util.print'),

        olap.Store(entity=ctx.get('cubetl.http.request'),
                   mapper=ctx.get('olap2sql.olapmapper')),

        log.LogPerformance(),
    ]
    ctx.add('loganalyzer.process', flow.Chain(steps=load_steps))

    # This node runs several test queries
    query_steps = [

        #ctx.get('cubetl.config.print'),

        query.OlapQueryAggregate(
            fact=ctx.get('cubetl.http.request'),
            mapper=ctx.get('olap2sql.olapmapper'),
            #drills=['referer.source'],
            cuts={'contcountry.id': 16},
        ),

        #query.OlapQueryFacts(fact=ctx.get('cubetl.http.request'),
        #                     mapper=ctx.get('olap2sql.olapmapper'),
        #                     cuts={'contcountry': 16}),

        #query.OlapQueryDimension(fact=ctx.get('cubetl.http.request'),
        #                         mapper=ctx.get('olap2sql.olapmapper'),
        #                         drill=['contcountry.country']),

        ctx.get('cubetl.util.print'),
    ]
    ctx.add('loganalyzer.query', flow.Chain(steps=query_steps))
Esempio n. 9
0
def cubetl_config(ctx):
    """
    Configure the Osmium -> Elasticsearch example.

    Registers 'osmiumelastic.connection' and two chains:
    'osmiumelastic.process', which creates the 'osm-index' index with the
    mappings defined below, reads nodes from the OSM file named by the
    'file_path' property and indexes them, and 'osmiumelastic.search',
    which searches the same index using the message's 'q' value.
    """

    # Keep a 'file_path' supplied by the caller, otherwise use the default
    # extract. This value is the file the process chain reads.
    ctx.props['file_path'] = ctx.props.get('file_path', 'spain-latest.osm.pbf')

    ctx.props['osmiumelastic.elastic.mappings'] = {
        "mappings": {
            "properties": {
                "description": {
                    "type": "text",
                },
                "location": {
                    "type": "geo_point"
                },
                "name": {
                    "type": "text",
                },
                "tagkeys": {
                    "type": "keyword",
                },
                "timestamp": {
                    "type": "date"
                },
                "type": {
                    "type": "keyword",
                },
                "uid": {
                    "type": "long"
                },
                #"user": {
                #  "type": "keyword",
                #  #"index": "not_analyzed"
                #}
            }
        }
    }

    #ctx.include('${ ctx.library_path }/datetime.py')

    ctx.add('osmiumelastic.connection',
            elasticsearch.ElasticsearchConnection(url='http://localhost:9200'))

    ctx.add(
        'osmiumelastic.process',
        flow.Chain(steps=[
            elasticsearch.IndexCreate(
                es=ctx.get("osmiumelastic.connection"),
                index="osm-index",
                mappings=ctx.props["osmiumelastic.elastic.mappings"]),
            # Read the configured file instead of a hard-coded name, so a
            # caller-provided 'file_path' is honoured.
            osm_osmium.OsmiumNode(filename=ctx.props['file_path']),
            util.PrettyPrint(depth=4),
            log.LogPerformance(),
            # The document id is taken from the message's 'id' field.
            elasticsearch.Index(es=ctx.get("osmiumelastic.connection"),
                                index="osm-index",
                                doc_type="osm",
                                data_id=lambda m: m['id']),
        ]))

    ctx.add(
        'osmiumelastic.search',
        flow.Chain(steps=[
            elasticsearch.Search(es=ctx.get("osmiumelastic.connection"),
                                 index="osm-index",
                                 query="${ m.get('q', None) }"),
            util.PrettyPrint(depth=4),
            log.LogPerformance(),
        ]))