def update_model_in_registry(pkg, loaded):
     # Nested helper: `self` is captured from the enclosing method's scope.
     try:
         registry = PackageRegistry(self.output_es)
     except Exception as exception:
         logger.info('STATUS: FAILED TO UPDATE MODEL')
         logger.exception(exception)
         return
     owner = self.owner_id
     dataset_name = '{}_{}'.format(
         self.config.get(CONFIG_TAXONOMY_ID),
         self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
     )
     dataset_id = '{}:{}'.format(
         owner,
         dataset_name
     )
     private = self.config.get(CONFIG_EXTRA_PRIVATE)
     # TODO: replace with the real URL
     datapackage_url = 'datapackage-url'
     datapackage = copy.deepcopy(pkg.descriptor)
     datapackage.update(dict(
         private=private,
         owner=owner
     ))
     registry.save_model(
         dataset_id,
         datapackage_url,
         datapackage,
         datapackage.get('babbage_model'),
         dataset_name,
         'openspending',
         'done' if loaded else 'loading-data',
         loaded
     )
     logger.info('STATUS: UPDATED MODEL')
def func(package):
     logger.info('STATUS: STARTING')
     # Emit the datapackage descriptor first (dataflows package-processor
     # protocol), mark the model as loading, stream the rows with progress
     # logging, then mark it as done.
     yield package.pkg
     count = dict(i=0)  # mutable counter shared by all resources
     update_model_in_registry(package.pkg, loaded=False)
     for res in package:
         yield progress(res, count)
     update_model_in_registry(package.pkg, loaded=True)
     logger.info('STATUS: DONE')
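A minimal usage sketch, assuming the helpers above are closed over an instance providing config and output_es, and a placeholder input file; dataflows dispatches custom steps on the parameter name, so `func(package)` receives the whole PackageWrapper:

from dataflows import Flow, load

Flow(
    load('data.csv'),  # hypothetical input resource
    func,              # the package-level step defined above
).process()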
def dedup(rows):
     if rows.res.name == RESOURCE_NAME:
         used = set()  # keys already seen in this resource
         key_field_names = rows.res.descriptor['schema']['primaryKey']
         logger.info('DEDUPLICATING with KEYS %r', key_field_names)
         for row in rows:
             key = tuple(row.get(k) for k in key_field_names)
             if key not in used:
                 used.add(key)
                 yield row
     else:
         yield from rows
 def func(rows):
     if rows.res.name == RESOURCE_NAME:
         used = set()
         key_field_names = rows.res.descriptor['schema']['primaryKey']
         logger.info('DEDUPLICATING with KEYS %r', key_field_names)
         for row in rows:
             key = tuple(row.get(k) for k in key_field_names)
             if key not in used:
                 used.add(key)
                 yield row
     else:
         yield from rows
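A usage sketch with assumed file and key names; because the parameter is named `rows`, dataflows feeds the step one resource at a time, and set_primary_key declares the schema primaryKey that the step reads:

from dataflows import Flow, load, set_primary_key

Flow(
    load('transactions.csv', name=RESOURCE_NAME),  # hypothetical input
    set_primary_key(['doc_id', 'line_no']),        # hypothetical key fields
    func,
).process()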
    def run(self):
        # Values: ensure every 'value:' column type in the taxonomy has a mapping entry
        values = [
            x for x in self.config.get(CONFIG_TAXONOMY_CT)
            if x['name'].startswith('value:')
        ]
        mapping = self.config.get(CONFIG_MODEL_MAPPING)
        existing_cts = set(x.get('columnType') for x in mapping)
        logger.info('EXISTING CTS %r', existing_cts)
        missing = []
        for x in values:
            if x['name'] not in existing_cts:
                missing.append(
                    dict(
                        title=x['title'],
                        name=x['name'].replace('value:', 'MONTO_'),
                        columnType=x['name'],
                        enriched=True,
                        dataType=x.get('dataType', 'string'),
                    ))
        mapping.extend(missing)
        logger.info('MISSING CTS VALUES %r', missing)

        # Objeto Del Gasto: map any COLUMN_MAPPING column types still missing
        title_mapping = dict((v, k) for k, v in COLUMN_MAPPING.items())
        missing_cts = [
            x for x in COLUMN_MAPPING.values() if x not in existing_cts
        ]
        missing_cts = [
            x for x in self.config.get(CONFIG_TAXONOMY_CT)
            if x['name'] in missing_cts
        ]
        missing = [
            dict(
                title=title_mapping[x['name']],
                name=title_mapping[x['name']],
                columnType=x['name'],
                enriched=True,
                dataType=x.get('dataType', 'string'),
            ) for x in missing_cts
        ]
        logger.info('MISSING CTS OBJETO %r', missing)
        mapping.extend(missing)
        self.config.set(CONFIG_MODEL_MAPPING, mapping)
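For illustration (hypothetical taxonomy entry), a column type 'value:approved' absent from the mapping would be appended as:

dict(
    title='Approved',             # copied from the taxonomy entry
    name='MONTO_approved',        # 'value:' prefix replaced with 'MONTO_'
    columnType='value:approved',
    enriched=True,
    dataType='number',            # falls back to 'string' when absent
)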
def progress(res, count):
     # Pass rows through unchanged, logging a progress line every 1000 rows.
     # `count` is a shared mutable dict, so the total accumulates across resources.
     for row in res:
         yield row
         if count['i'] % 1000 == 0:
             logger.info('STATUS: PROGRESS %d', count['i'])
         count['i'] += 1
 def flow(self):
     steps = []
     if not self.config.get(CONFIG_PUBLISH_ALLOWED):
         return None
     logger.info('Publisher Flow Preparing')
     if self.output_datapackage:
         logger.info('Publisher Flow: Dump To Path Denorm...')
         steps.extend([
             dump_to_path(self.output_datapackage)
         ])
     if self.output_db:
         db_table = 'dgp__{}_{}'.format(
             self.config.get(CONFIG_TAXONOMY_ID),
             self.config.get(CONFIG_EXTRA_METADATA_DATASET_NAME),
         )
         logger.info('Publisher Flow: Dump To DB... (%s)', db_table)
         primary_key = self.config.get(CONFIG_PRIMARY_KEY)
         mapping = self.config.get(CONFIG_MODEL_MAPPING)
         for m in mapping:
             if 'columnType' in m and m['columnType']:
                 m['slug'] = self.slugify(m['title'])
                 m['hierarchy'] = self.slugify(m['columnType'].split(':')[0])
                 m['column'] = self.column(m['columnType'])
                 m['primaryKey'] = m['columnType'] in primary_key
                 m['measure'] = m['hierarchy'] == 'value'
                 m['full_column'] = (
                     m['column'] if m['measure']
                     else '{}_{hierarchy}.{column}'.format(db_table, **m)
                 )
                 m['label'] = self.fetch_label(m['columnType'])
                 m['dataType'] = self.fetch_datatype(m['columnType'])
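         # Group non-measure columns by hierarchy prefix; each group is
         # normalized out into its own lookup table next to the fact table.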
         prefixes = set(
             m['hierarchy']
             for m in mapping
             if m.get('measure') is False
         )
         prefixed = dict(
             (p, list(filter(lambda m: m.get('hierarchy') == p, mapping)))
             for p in prefixes
         )
         groups = [
             NormGroup([
                     m['column']
                     for m in prefixed_items
                 ], self.ref_column(prefix), self.id_column(),
                 db_table='{}_{}'.format(db_table, prefix))
             for prefix, prefixed_items in prefixed.items()
         ]
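         # Assemble the babbage OLAP model: one dimension per primary-key
         # column, one measure per 'value:' column, one hierarchy per prefix.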
         babbage_model = dict(
             dimensions=dict(
                 (m['slug'], dict(
                     label=m['title'],
                     key_attribute=m['slug'],
                     attributes=dict([
                         (m['slug'], dict(
                             column=m['full_column'],
                             label=m['title'],
                             type=m['dataType'],
                         ))
                     ] + ([
                         (m['label']['slug'], dict(
                             column=m['label']['full_column'],
                             label=m['label']['title'],
                             type=m['label']['dataType'],
                         ))
                     ] if m.get('label') else [])),
                     join_column=[
                         self.ref_column(m['hierarchy']),
                         self.id_column()
                     ],
                     **(dict(
                         label_attribute=m['label']['slug']
                     ) if m.get('label') else {})
                 ))
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is False and m.get('primaryKey') is True
             ),
             fact_table=db_table,
             measures=dict(
                 (
                     m['slug'],
                     dict(
                         column=m['column'],
                         label=m['title'],
                         type='number'
                     )
                 )
                 for m in self.config.get(CONFIG_MODEL_MAPPING)
                 if m.get('measure') is True
             ),
             hierarchies=dict(
                 (prefix, dict(
                     label=prefix,
                     levels=[
                         m['slug']
                         for m in prefixed_items
                         if m.get('primaryKey') is True
                     ]
                 ))
                 for prefix, prefixed_items in prefixed.items()
             ),
         )
         steps.append(
             update_package(babbage_model=babbage_model)
         )
         source = self.config.get(CONFIG_URL)
         logger.info('Publisher Flow: _source Handling...')
         steps.extend([
             add_field('_source', 'string', source),
             append_to_primary_key(['_source']),
             clear_by_source(self.lazy_engine(), db_table, source),
         ])
         logger.info('Publisher Flow: Normalize...')
         steps.extend([
             normalize_to_db(
                 groups,
                 db_table,
                 RESOURCE_NAME,
                 self.output_db,
                 'append'
             ),
         ])
         if self.output_datapackage:
             logger.info('Publisher Flow: Dump To Path Norm...')
             steps.extend([
                 dump_to_path(self.output_datapackage + '-norm')
             ])
     if self.output_es:
         logger.info('Publisher Flow: ES...')
         steps.extend([
             self.update_es()
         ])
     logger.info('Publisher Flow Prepared')
     return Flow(*steps)
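A hedged usage sketch, assuming an instance of the enclosing class named publisher; flow() returns None when publishing is not allowed, so callers should guard before running:

f = publisher.flow()
if f is not None:
    f.process()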
Example #8
def handler(pipeline=None):
    logger.info('Submitted pipeline %r', pipeline)
 def postflow(self):
     metadata = self.config._unflatten().get('extra', {}).get('metadata')
     logger.info('UPDATING WITH METADATA %r', metadata)
     return Flow(
         update_package(**metadata)
     )
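For illustration (hypothetical keys): a flattened config holding 'extra.metadata.title' and 'extra.metadata.author' unflattens to {'extra': {'metadata': {'title': ..., 'author': ...}}}, so update_package(**metadata) merges exactly those keys into the datapackage descriptor.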
Example #10
def handler(pipeline=None):
    logger.info('Accepted pipeline %r', pipeline)
Example #11
def handler(pipeline=None):
    logger.info('New pipeline %r', pipeline)
 def test(self):
     logger.info('DEDUPLICATING %r', self.config.get('extra.deduplicate'))
     return self.config.get('extra.deduplicate')