def update_model_in_registry(pkg, loaded):
    # Closure over the enclosing method's scope: `self` provides the
    # config, owner id and Elasticsearch connection.
    try:
        registry = PackageRegistry(self.output_es)
    except Exception as exception:
        logger.info('STATUS: FAILED TO UPDATE MODEL')
        logger.exception(exception)
        return
    owner = self.owner_id
    dataset_name = '{}_{}'.format(
        self.config.get(CONFIG_TAXONOMY_ID),
        self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
    )
    dataset_id = '{}:{}'.format(owner, dataset_name)
    private = self.config.get(CONFIG_EXTRA_PRIVATE)
    # TODO: replace by real URL
    datapackage_url = 'datapackage-url'
    datapackage = copy.deepcopy(pkg.descriptor)
    datapackage.update(dict(
        private=private,
        owner=owner,
    ))
    registry.save_model(
        dataset_id,
        datapackage_url,
        datapackage,
        datapackage.get('babbage_model'),
        dataset_name,
        'openspending',
        'done' if loaded else 'loading-data',
        loaded,
    )
    logger.info('STATUS: UPDATED MODEL')
def func(package):
    # dataflows package processor: emit the datapackage descriptor first,
    # then re-yield every resource, wrapping each one with `progress` and
    # updating the registry before and after the data is streamed.
    logger.info('STATUS: STARTING')
    yield package.pkg
    count = dict(i=0)
    update_model_in_registry(package.pkg, loaded=False)
    for res in package:
        yield progress(res, count)
    update_model_in_registry(package.pkg, loaded=True)
    logger.info('STATUS: DONE')
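# Illustrative, self-contained sketch (an assumption, not from the original
# source) of the package-processor protocol used by `func` above. dataflows
# selects the processor kind by the parameter name ('package' / 'rows' /
# 'row'); a package processor yields the datapackage first and then streams
# every resource.
def example_package_processor(package):
    yield package.pkg    # emit the (possibly modified) descriptor
    yield from package   # then stream each resource unchanged


def example_run():
    from dataflows import Flow
    data = [{'a': 1}, {'a': 2}]  # invented sample rows
    datapackage, stats = Flow(data, example_package_processor).process()
    return datapackage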
def dedup(rows):
    if rows.res.name == RESOURCE_NAME:
        # Track primary-key tuples already emitted, keeping only the first
        # row seen for each key.
        used = set()
        key_field_names = rows.res.descriptor['schema']['primaryKey']
        logger.info('DEDUPLICATING with KEYS %r', key_field_names)
        for row in rows:
            key = tuple(row.get(k) for k in key_field_names)
            if key not in used:
                used.add(key)
                yield row
    else:
        yield from rows
def func(rows):
    if rows.res.name == RESOURCE_NAME:
        used = set()
        # Deduplicate on the resource's declared primary key.
        key_field_names = rows.res.descriptor['schema']['primaryKey']
        logger.info('DEDUPLICATING with KEYS %r', key_field_names)
        for row in rows:
            key = tuple(row.get(k) for k in key_field_names)
            if key not in used:
                used.add(key)
                yield row
    else:
        yield from rows
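# Illustrative descriptor (an assumption, not from the original source):
# the deduplicator above is driven by the resource's declared primary key,
# i.e. a schema shaped roughly like this. All field names are invented.
EXAMPLE_DEDUP_RESOURCE = {
    'name': 'out',  # would need to match RESOURCE_NAME
    'schema': {
        'fields': [
            {'name': 'year', 'type': 'integer'},
            {'name': 'code', 'type': 'string'},
            {'name': 'amount', 'type': 'number'},
        ],
        # rows repeating a (year, code) pair are dropped after the first
        'primaryKey': ['year', 'code'],
    },
}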
def run(self):
    # Values
    values = [
        x for x in self.config.get(CONFIG_TAXONOMY_CT)
        if x['name'].startswith('value:')
    ]
    mapping = self.config.get(CONFIG_MODEL_MAPPING)
    existing_cts = set(x.get('columnType') for x in mapping)
    logger.info('EXISTING CTS %r', existing_cts)
    missing = []
    for x in values:
        if x['name'] not in existing_cts:
            missing.append(dict(
                title=x['title'],
                name=x['name'].replace('value:', 'MONTO_'),
                columnType=x['name'],
                enriched=True,
                dataType=x.get('dataType', 'string'),
            ))
    mapping.extend(missing)
    logger.info('MISSING CTS VALUES %r', missing)

    # Objeto Del Gasto
    title_mapping = dict((v, k) for k, v in COLUMN_MAPPING.items())
    missing_cts = [
        x for x in COLUMN_MAPPING.values()
        if x not in existing_cts
    ]
    missing_cts = [
        x for x in self.config.get(CONFIG_TAXONOMY_CT)
        if x['name'] in missing_cts
    ]
    missing = [
        dict(
            title=title_mapping[x['name']],
            name=title_mapping[x['name']],
            columnType=x['name'],
            enriched=True,
            dataType=x.get('dataType', 'string'),
        )
        for x in missing_cts
    ]
    logger.info('MISSING CTS OBJETO %r', missing)
    mapping.extend(missing)
    self.config.set(CONFIG_MODEL_MAPPING, mapping)
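# Illustrative before/after (an assumption, not from the original source):
# given a taxonomy column type 'value:approved' that is absent from the
# user's mapping, `run` appends an enriched entry shaped like this
# (title and dataType invented):
EXAMPLE_ENRICHED_ENTRY = dict(
    title='Approved',
    name='MONTO_approved',       # 'value:' prefix swapped for 'MONTO_'
    columnType='value:approved',
    enriched=True,
    dataType='number',
)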
def progress(res, count):
    # `count` is a mutable dict (not an int) so that the generators created
    # for each resource all advance the same shared total.
    for row in res:
        yield row
        if count['i'] % 1000 == 0:
            logger.info('STATUS: PROGRESS %d', count['i'])
        count['i'] += 1
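# Minimal demonstration (an assumption, not from the original source) of
# why `count` is a dict: closures cannot rebind an outer integer, but they
# can mutate a shared dict, so the total survives across generators.
def example_shared_counter():
    count = dict(i=0)

    def consume(rows):
        for row in rows:
            count['i'] += 1
            yield row

    list(consume([{}] * 3))
    list(consume([{}] * 2))
    return count['i']  # -> 5, accumulated across both generators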
def flow(self):
    steps = []
    if not self.config.get(CONFIG_PUBLISH_ALLOWED):
        return None
    logger.info('Publisher Flow Preparing')
    if self.output_datapackage:
        logger.info('Publisher Flow: Dump To Path Denorm...')
        steps.extend([
            dump_to_path(self.output_datapackage)
        ])
    if self.output_db:
        db_table = 'dgp__{}_{}'.format(
            self.config.get(CONFIG_TAXONOMY_ID),
            self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
        )
        logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
        primary_key = self.config.get(CONFIG_PRIMARY_KEY)
        mapping = self.config.get(CONFIG_MODEL_MAPPING)
        # Enrich every mapped field with the DB-oriented attributes used
        # below (slug, hierarchy, physical column, measure flag, label).
        for m in mapping:
            if 'columnType' in m and m['columnType']:
                m['slug'] = self.slugify(m['title'])
                m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                m['column'] = self.column(m['columnType'])
                m['primaryKey'] = m['columnType'] in primary_key
                m['measure'] = m['hierarchy'] == 'value'
                m['full_column'] = (
                    m['column'] if m['measure']
                    else '{}_{hierarchy}.{column}'.format(db_table, **m)
                )
                m['label'] = self.fetch_label(m['columnType'])
                m['dataType'] = self.fetch_datatype(m['columnType'])
        # Group the non-measure fields by hierarchy prefix; each group
        # becomes a normalized side table.
        prefixes = set(
            m['hierarchy']
            for m in mapping
            if m.get('measure') is False
        )
        prefixed = dict(
            (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
            for p in prefixes
        )
        groups = [
            NormGroup(
                [m['column'] for m in prefixed_items],
                self.ref_column(prefix),
                self.id_column(),
                db_table='{}_{}'.format(db_table, prefix)
            )
            for prefix, prefixed_items in prefixed.items()
        ]
        # Build the babbage model: primary-key dimensions, 'value:' fields
        # as measures, and one hierarchy per prefix.
        babbage_model = dict(
            dimensions=dict(
                (m['slug'], dict(
                    label=m['title'],
                    key_attribute=m['slug'],
                    attributes=dict([
                        (m['slug'], dict(
                            column=m['full_column'],
                            label=m['title'],
                            type=m['dataType'],
                        ))
                    ] + ([
                        (m['label']['slug'], dict(
                            column=m['label']['full_column'],
                            label=m['label']['title'],
                            type=m['label']['dataType'],
                        ))
                    ] if m.get('label') else [])),
                    join_column=[
                        self.ref_column(m['hierarchy']),
                        self.id_column()
                    ],
                    **(dict(
                        label_attribute=m['label']['slug']
                    ) if m.get('label') else {})
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is False and m.get('primaryKey') is True
            ),
            fact_table=db_table,
            measures=dict(
                (m['slug'], dict(
                    column=m['column'],
                    label=m['title'],
                    type='number'
                ))
                for m in self.config.get(CONFIG_MODEL_MAPPING)
                if m.get('measure') is True
            ),
            hierarchies=dict(
                (prefix, dict(
                    label=prefix,
                    levels=[
                        m['slug']
                        for m in prefixed_items
                        if m.get('primaryKey') is True
                    ]
                ))
                for prefix, prefixed_items in prefixed.items()
            ),
        )
        steps.append(
            update_package(babbage_model=babbage_model)
        )
        source = self.config.get(CONFIG_URL)
        logger.info('Publisher Flow: _source Handling...')
        steps.extend([
            add_field('_source', 'string', source),
            append_to_primary_key(['_source']),
            clear_by_source(self.lazy_engine(), db_table, source),
        ])
        logger.info('Publisher Flow: Normalize...')
        steps.extend([
            normalize_to_db(
                groups,
                db_table,
                RESOURCE_NAME,
                self.output_db,
                'append'
            ),
        ])
        if self.output_datapackage:
            logger.info('Publisher Flow: Dump To Path Norm...')
            steps.extend([
                dump_to_path(self.output_datapackage + '-norm')
            ])
    if self.output_es:
        logger.info('Publisher Flow: ES...')
        steps.extend([
            self.update_es()
        ])
    logger.info('Publisher Flow Prepared')
    return Flow(*steps)
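# Illustrative output (an assumption, not from the original source): for a
# single primary-key dimension and one measure, the generated babbage_model
# comes out shaped roughly like this. Every name below is invented, and the
# join/id column names depend on ref_column()/id_column().
EXAMPLE_BABBAGE_MODEL = dict(
    fact_table='dgp__taxonomy_dataset',
    dimensions=dict(
        activity=dict(
            label='Activity',
            key_attribute='activity',
            attributes=dict(
                activity=dict(
                    column='dgp__taxonomy_dataset_activity.activity',
                    label='Activity',
                    type='string',
                ),
            ),
            join_column=['activity_id', 'id'],
        ),
    ),
    measures=dict(
        approved=dict(
            column='MONTO_approved',
            label='Approved',
            type='number',
        ),
    ),
    hierarchies=dict(
        activity=dict(label='activity', levels=['activity']),
    ),
)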
def handler(pipeline=None):
    logger.info('Submitted pipeline %r', pipeline)
def postflow(self):
    # Pull the nested 'extra.metadata' mapping out of the flattened config
    # and merge it into the output datapackage descriptor.
    metadata = self.config._unflatten().get('extra', {}).get('metadata')
    logger.info('UPDATING WITH METADATA %r', metadata)
    return Flow(
        update_package(**metadata)
    )
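# Illustrative config shape (an assumption, not from the original source):
# a flattened entry such as 'extra.metadata.title' unflattens to a mapping
# like this, which update_package then merges into the descriptor. The
# values are invented.
EXAMPLE_UNFLATTENED_CONFIG = {
    'extra': {
        'metadata': {
            'title': 'My Dataset',
            'description': 'Published via the publisher flow',
        },
    },
}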
def handler(pipeline=None):
    logger.info('Accepted pipeline %r', pipeline)
def handler(pipeline=None):
    logger.info('New pipeline %r', pipeline)
def test(self):
    logger.info('DEDUPLICATING %r', self.config.get('extra.deduplicate'))
    return self.config.get('extra.deduplicate')