def postflow(self):
    steps = []
    logger.info('Publisher Flow Preparing')
    full_name = '{}_{}'.format(
        self.config.get(CONFIG_TAXONOMY_ID),
        slugify(self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
                separator='_', lowercase=True),
    )
    db_table = 'dgp__{}'.format(full_name)
    source = get_source(self.config)
    steps.extend([
        add_field('_source', 'string', source),
        append_to_primary_key('_source'),
        clear_by_source(engine, db_table, source, '_source'),
        conditional(lambda pkg: True,
                    lambda pkg: self.normalize(pkg, full_name, db_table)),
        update_stats(dict(
            view_url='https://api.openfiscal.org/api/3/cubes/{}/model'
                     .format(full_name))),
    ])
    logger.info('Publisher Flow Prepared')
    return Flow(*steps)
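
# --- sketch -----------------------------------------------------------------
# `clear_by_source` above is defined elsewhere; the sketch below shows the
# idea, assuming a SQLAlchemy engine: delete previously loaded rows for this
# source before the new rows are appended, so re-publishing the same source
# stays idempotent. The name, signature, and SQL here are assumptions, not
# the canonical helper.
from sqlalchemy import text


def clear_by_source_sketch(engine, db_table, source, source_field='_source'):
    def step(package):
        yield package.pkg  # pass the datapackage descriptor through untouched
        with engine.begin() as conn:
            # table/column names cannot be bound parameters, only the value can
            conn.execute(
                text('DELETE FROM {} WHERE {} = :source'
                     .format(db_table, source_field)),
                {'source': source})
        yield from package  # stream the resource rows unchanged
    return step
# -----------------------------------------------------------------------------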
def postflow(self):
    steps = []
    for ct in self.config.get(CONFIG_TAXONOMY_CT):
        name = ct['name'].replace(':', '-')
        dataType = ct['dataType']
        unique = ct.get('unique')
        if unique:
            flow = Flow(
                add_field(name, dataType, '-', resources=RESOURCE_NAME),
                append_to_primary_key(name)
            )
        else:
            flow = Flow(
                add_field(name, dataType, None, resources=RESOURCE_NAME),
            )
        steps.append(
            conditional(
                self.no_such_field(name),
                flow
            )
        )
    return Flow(*steps)
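
# --- sketch -----------------------------------------------------------------
# `self.no_such_field(name)` is a predicate factory consumed by `conditional`;
# it is defined on the class elsewhere. Its likely shape (an assumption,
# mirroring the `missing_field` helper inside objeto_del_gasto below) is a
# function over the datapackage that is true when the field is absent from
# the first resource's schema, so the add_field step only runs when needed.
def no_such_field_sketch(field_name):  # on the class this would take `self`
    def predicate(dp):
        return all(f.name != field_name
                   for f in dp.resources[0].schema.fields)
    return predicate
# -----------------------------------------------------------------------------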
def flow(self):
    steps = []
    if not self.config.get(CONFIG_PUBLISH_ALLOWED):
        return None
    logger.info('Publisher Flow Preparing')
    if self.output_datapackage:
        logger.info('Publisher Flow: Dump To Path Denorm...')
        steps.extend([
            dump_to_path(self.output_datapackage)
        ])
    if self.output_db:
        db_table = 'dgp__{}_{}'.format(
            self.config.get(CONFIG_TAXONOMY_ID),
            self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
        )
        logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
        primary_key = self.config.get(CONFIG_PRIMARY_KEY)
        mapping = self.config.get(CONFIG_MODEL_MAPPING)
        for m in mapping:
            if 'columnType' in m and m['columnType']:
                m['slug'] = self.slugify(m['title'])
                m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                m['column'] = self.column(m['columnType'])
                m['primaryKey'] = m['columnType'] in primary_key
                m['measure'] = m['hierarchy'] == 'value'
                m['full_column'] = (
                    m['column'] if m['measure']
                    else '{}_{hierarchy}.{column}'.format(db_table, **m)
                )
                m['label'] = self.fetch_label(m['columnType'])
                m['dataType'] = self.fetch_datatype(m['columnType'])
        prefixes = set(
            m['hierarchy']
            for m in mapping
            if m.get('measure') is False
        )
        prefixed = dict(
            (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
            for p in prefixes
        )
        groups = [
            NormGroup(
                [m['column'] for m in prefixed_items],
                self.ref_column(prefix),
                self.id_column(),
                db_table='{}_{}'.format(db_table, prefix))
            for prefix, prefixed_items in prefixed.items()
        ]
        babbage_model = dict(
            dimensions=dict(
                (m['slug'], dict(
                    label=m['title'],
                    key_attribute=m['slug'],
                    attributes=dict([
                        (m['slug'], dict(
                            column=m['full_column'],
                            label=m['title'],
                            type=m['dataType'],
                        ))
                    ] + ([
                        (m['label']['slug'], dict(
                            column=m['label']['full_column'],
                            label=m['label']['title'],
                            type=m['label']['dataType'],
                        ))
                    ] if m.get('label') else [])),
                    join_column=[
                        self.ref_column(m['hierarchy']),
                        self.id_column()
                    ],
                    **(dict(
                        label_attribute=m['label']['slug']
                    ) if m.get('label') else {})
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is False and m.get('primaryKey') is True
            ),
            fact_table=db_table,
            measures=dict(
                (m['slug'], dict(
                    column=m['column'],
                    label=m['title'],
                    type='number'
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is True
            ),
            hierarchies=dict(
                (prefix, dict(
                    label=prefix,
                    levels=[
                        m['slug']
                        for m in prefixed_items
                        if m.get('primaryKey') is True
                    ]
                ))
                for prefix, prefixed_items in prefixed.items()
            ),
        )
        steps.append(
            update_package(babbage_model=babbage_model)
        )
        source = self.config.get(CONFIG_URL)
        logger.info('Publisher Flow: _source Handling...')
        steps.extend([
            add_field('_source', 'string', source),
            append_to_primary_key(['_source']),
            clear_by_source(self.lazy_engine(), db_table, source),
        ])
        logger.info('Publisher Flow: Normalize...')
        steps.extend([
            normalize_to_db(
                groups,
                db_table,
                RESOURCE_NAME,
                self.output_db,
                'append'
            ),
        ])
        if self.output_datapackage:
            logger.info('Publisher Flow: Dump To Path Norm...')
            steps.extend([
                dump_to_path(self.output_datapackage + '-norm')
            ])
    if self.output_es:
        logger.info('Publisher Flow: ES...')
        steps.extend([
            self.update_es()
        ])
    logger.info('Publisher Flow Prepared')
    return Flow(*steps)
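
# --- sketch -----------------------------------------------------------------
# The naming helpers used above (`ref_column`, `id_column`) are defined on
# the class elsewhere. A minimal sketch of the convention they appear to
# encode: each hierarchy is normalized by NormGroup into its own
# `<fact_table>_<hierarchy>` dimension table, the fact table keeps one FK
# column per hierarchy, and each dimension table carries a surrogate key that
# babbage joins against via `join_column`. The exact column names below are
# assumptions, not the canonical implementation.
def ref_column_sketch(prefix):
    return '{}_id'.format(prefix)   # FK column kept on the fact table


def id_column_sketch():
    return '_id'                    # surrogate-key column on dimension tables
# -----------------------------------------------------------------------------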
def objeto_del_gasto(config):
    logging.info('PREPARING objeto_del_gasto processing')
    CT = COLUMN_MAPPING
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__),
                     'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])
        # Years from 2019 onwards already come with the split columns;
        # for earlier years derive them from the raw "objeto del gasto" code.
        if year < 2019:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto and objeto != '-':
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])

            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]
                row[CN['DESC_PARTIDA_GENERICA']] = \
                    lookup['partida_generica'].get(
                        row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(
                            row.get(CN['ID_PARTIDA_ESPECIFICA']))

    def missing_field(mf):
        def func(dp):
            return all(f.name != mf for f in dp.resources[0].schema.fields)
        return func

    def sort_by_ct():
        def func(package):
            ct_indexes = dict(
                (ct['name'], i)
                for i, ct in enumerate(config.get(CONFIG_TAXONOMY_CT)))
            fields = sorted(
                ((ct_indexes.get(f.get('columnType'), 1000), f)
                 for f in package.pkg.descriptor['resources'][0]
                                               ['schema']['fields']),
                key=lambda x: x[0])
            package.pkg.descriptor['resources'][0]['schema']['fields'] = [
                f[1] for f in fields
            ]
            yield package.pkg
            yield from package
        return func

    return Flow(
        *[
            conditional(
                missing_field(CN[f]),
                Flow(
                    add_field(CN[f], 'string', columnType=ct, title=f),
                    append_to_primary_key(CN[f]) if 'ID_' in f else None
                ))
            for f, ct in CT.items()
        ],
        sort_by_ct(),
        process)
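
# --- usage sketch ------------------------------------------------------------
# Wiring assumed for illustration: `objeto_del_gasto(config)` returns a Flow,
# and dataflows allows nesting one Flow as a step of another, so the
# enrichment can be chained after a load and before a dump. The source URL
# and output path below are placeholders, not part of the pipeline.
from dataflows import Flow, dump_to_path, load


def enrich(config, source_url):
    Flow(
        load(source_url, name=RESOURCE_NAME),
        objeto_del_gasto(config),
        dump_to_path('enriched'),
    ).process()
# -----------------------------------------------------------------------------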