Example #1
    def generate_pipeline(cls, source):
        project_id = slugify(source['project'])
        schedule = SCHEDULE_DAILY

        discovered_steps = cls._get_pipeline_steps()

        for k, config in source['config'].items():
            # `k` corresponds with `label` in pipeline_steps module.
            if k in discovered_steps.keys():
                pipeline_id = slugify('{}-{}'.format(project_id, k))

                common_steps = [('add_metadata', {
                    'project': project_id,
                    'name': pipeline_id
                })]

                k_steps = discovered_steps[k](common_steps, pipeline_id,
                                              project_id, config)
                _steps = steps(*k_steps)
            else:
                log.warn('No {} pipeline generator available for {}'.format(
                    k, project_id))
                continue

            pipeline_details = {'pipeline': _steps}
            if schedule is not None:
                pipeline_details['schedule'] = {'crontab': schedule}

            yield pipeline_id, pipeline_details
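A hedged consumption sketch for the generator method above. The enclosing class (called SomeGenerator here) and the source dict are hypothetical; a pipeline is yielded only for config keys that match a discovered pipeline-step module:

source = {'project': 'My Project', 'config': {'example-step': {}}}
for pipeline_id, details in SomeGenerator.generate_pipeline(source):
    # details always carries a 'pipeline' key; 'schedule' is added when a crontab is set
    print(pipeline_id, sorted(details))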
Example #2
def _get_safe_entity(entity):
    '''Get a URL-safe version of the entity, based on its starting character.'''
    if entity.startswith('@'):
        return 'at-{}'.format(slugify(entity))
    elif entity.startswith('#'):
        return 'hash-{}'.format(slugify(entity))
    elif entity.startswith('url:'):
        return slugify(entity)
    else:
        raise ValueError(ENTITY_VALUE_ERROR_MSG.format(entity))
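A minimal usage sketch for _get_safe_entity; ENTITY_VALUE_ERROR_MSG and slugify are assumed to be defined elsewhere in the module, and the exact slugs depend on the slugify implementation in use:

_get_safe_entity('@okfn')            # 'at-' + slugify('@okfn'), e.g. 'at-okfn'
_get_safe_entity('#opendata')        # 'hash-' + slugify('#opendata'), e.g. 'hash-opendata'
_get_safe_entity('url:example.org')  # slugify('url:example.org'), no prefix
_get_safe_entity('plain')            # raises ValueError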
Example #3
def process_row(row, row_index, spec, resource_index, parameters, stats):
    resource_matcher = ResourceMatcher(parameters['resource-name'])
    if resource_matcher.match(spec['name']):
        fingerprint_field = parameters['fingerprint-field']
        name_field = parameters['name-field']
        row[fingerprint_field] = slugify(row[name_field], to_lower=True)

    return row
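A hedged sketch of calling the row processor above with hypothetical parameters; in a real pipeline these come from the step's spec, and ResourceMatcher decides whether the current resource is touched at all:

parameters = {
    'resource-name': 'country-codes',   # hypothetical resource to process
    'name-field': 'name',
    'fingerprint-field': 'fingerprint',
}
spec = {'name': 'country-codes'}
row = process_row({'name': 'United Kingdom'}, 0, spec, 0, parameters, stats={})
# row['fingerprint'] is now slugify('United Kingdom', to_lower=True)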
Example #4
def process_resource(res):
    all_fingerprints = set()
    for row in res:
        name = None
        for src_field in source_fields:
            src_value = row[src_field]
            if src_value:
                if name is None:
                    name = src_value
                fingerprint = slugify(src_value, to_lower=True)
                if fingerprint in all_fingerprints:
                    continue
                all_fingerprints.add(fingerprint)
                yield {name_field: name, fingerprint_field: fingerprint}
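A minimal sketch of the deduplication above, assuming the module-level source_fields, name_field and fingerprint_field are taken from the processor's parameters (the values below are made up):

source_fields = ['name', 'official_name_en']
name_field = 'name'
fingerprint_field = 'fingerprint'

rows = [
    {'name': 'Germany', 'official_name_en': 'Federal Republic of Germany'},
    {'name': 'GERMANY', 'official_name_en': ''},   # same fingerprint as 'Germany', so it is skipped
]
for out in process_resource(iter(rows)):
    print(out)   # one {name_field: ..., fingerprint_field: ...} row per distinct fingerprint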
Example #5
def dumper_flow(source, base):

    _, _, resource_name = extract_names(source)
    dataset_id, db_table, _ = extract_storage_ids(source)

    kinds = sorted(
        set(f['columnType'].split(':')[0]
            for f in source['fields']) - {'value'})

    resources = [slugify(kind, separator='_') for kind in kinds]

    deps = ['dimension_flow_{}'.format(res) for res in resources]

    for i, resource, dep, kind in zip(range(len(kinds)), resources, deps,
                                      kinds):
        res_db_table = '{}_{}'.format(db_table, i)
        steps = [('load_resource', {
            'url': 'dependency://' + base + '/' + dep,
            'resource': resource
        }), ('set_types', ), ('fiscal.helpers.fix_null_pks', ),
                 ('dump.to_sql', {
                     'tables': {
                         res_db_table: {
                             'resource-name': resource
                         }
                     }
                 })]
        yield steps, [dep], resource

    steps = [('load_resource', {
        'url': 'dependency://' + base + '/normalized_flow',
        'resource': resource_name
    }), ('fiscal.helpers.fix_null_pks', ),
             ('dump.to_sql', {
                 'tables': {
                     db_table: {
                         'resource-name': resource_name
                     }
                 }
             })]
    yield steps, ['normalized_flow'], ''

    yield [
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': True
        }),
    ], ['dumper_flow'], 'update_status'
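A hedged sketch of iterating the flow above: each yielded triple is (pipeline steps, dependency list, pipeline-id suffix). The source dict is assumed to be a fiscal datapackage description carrying the 'fields' and storage identifiers that extract_names and extract_storage_ids expect:

for flow_steps, dependencies, suffix in dumper_flow(source, base='my-dataset'):
    print(suffix or '<main>', dependencies, len(flow_steps))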
Example #6
def dimension_flow(source, base):

    title, dataset_name, resource_name = extract_names(source)

    kinds = sorted(
        set(f['columnType'].split(':')[0]
            for f in source['fields']) - {'value'})

    resources = [slugify(kind, separator='_') for kind in kinds]

    pipeline_ids = ['dimension_{}'.format(res) for res in resources]

    for resource, pipeline_id, kind in zip(resources, pipeline_ids, kinds):
        headers = [
            f['header'] for f in source['fields']
            if f['columnType'].startswith(kind +
                                          ':') or f['columnType'] == kind
        ]
        steps = [('load_resource', {
            'url': 'dependency://' + base + '/denormalized_flow',
            'resource': resource_name
        }),
                 ('concatenate', {
                     'target': {
                         'name': resource
                     },
                     'fields': dict((h, []) for h in headers)
                 }), ('fiscal.helpers.save_primarykey', ),
                 ('join', {
                     'source': {
                         'name': resource,
                         'key': headers,
                         'delete': True
                     },
                     'target': {
                         'name': resource,
                         'key': None
                     },
                     'fields': dict((h, None) for h in headers)
                 }), ('fiscal.helpers.load_primarykey', ),
                 ('fiscal.helpers.enumerate', ),
                 ('dump.to_path', {
                     'out-path': 'normalized/' + resource
                 })]
        yield steps, ['denormalized_flow'], resource
Example #7
    def generate_pipeline(cls, source, wp):
        pipeline_id = dataset_name = slugify(source['name'])
        host = source['udata-instance']
        action = source['data-kind']

        if action == 'datasets-list':
            schedule = SCHEDULE_MONTHLY
            pipeline_steps = steps(*[('udata.catalog', {
                'udata-instance': host
            }), ('add_metadata', {
                'name': dataset_name
            }), ('dump.to_zip', {
                'out-file': 'udata-list.zip'
            })])

            pipeline_details = {
                'pipeline': pipeline_steps,
                'schedule': {
                    'crontab': schedule
                }
            }

            yield pipeline_id, pipeline_details

        if action == 'dataset':

            pipeline_steps = steps(*[('udata.fetch_metadata', {
                'host': source['udata-instance'],
                'kind': 'dataset',
                'id': source['dataset']
            }), ('add_metadata', {
                'name': source['name']
            }),
                                     ('dump.to_path', {
                                         'handle-non-tabular': 'true',
                                         'pretty-descriptor': 'true'
                                     })])

            pipeline_details = {'pipeline': pipeline_steps}

            yield pipeline_id, pipeline_details
Example #8
    def generate_pipeline(cls, source):
        for pipeline_id_prefix, defs in source.items():
            repository = defs['repository']
            base_path = defs.get('base-path', 'pipelines/')

            # issues
            issue_policy = defs.get('issues', {})

            # pull requests
            pr_policy = defs.get('pull-requests')

            # code
            code_policy = defs.get('code')

            if code_policy is not None:
                yield from cls.fetch_code(code_policy, repository, base_path)

            issues_url = '/repos/{}/issues'.format(repository)
            issues = URL_GETTER.get(issues_url)

            if issues is not None:
                for issue in issues:
                    for pipeline_id_format, pipeline_steps in \
                            cls.handle_combined_issue(repository, base_path, issue, issue_policy, pr_policy):

                        title_slug = slugify(issue['title'])
                        fmt = {
                            'issue-id': issue['number'],
                            'title-slug': title_slug
                        }
                        pipeline_id = pipeline_id_format.format(**fmt)
                        pipeline_id = urljoin(pipeline_id_prefix, pipeline_id)
                        pipeline_details = {
                            'title': issue['title'],
                            'pipeline': pipeline_steps
                        }
                        if issue.get('body') is not None:
                            pipeline_details['description'] = issue['body']
                        yield pipeline_id, pipeline_details
Example #9
def form_collector(source_id, source_type, latest_date):
    start_date = FAR_PAST_START_DATE
    if latest_date:
        start_date = latest_date.date()

    response = _request_data_from_google_spreadsheet(start_date)

    resource_content = []
    headers = response['table']['cols']
    headers = [slugify(h['label'].lower()) for h in headers]
    for r in response['table']['rows']:
        row = r['c']
        row_dict = {}
        for i, v in enumerate(row):
            if v is not None:
                row_dict[headers[i]] = v.get('f') or v.get('v')
            else:
                row_dict[headers[i]] = None
        output_date = dateutil.parser.parse(row_dict.get('date')).date() \
            if row_dict.get('date') is not None else None
        res_row = {
            'source_id': source_id,
            'source_type': source_type,
            'source': 'gsheets',
            'source_timestamp':
            dateutil.parser.parse(row_dict.get('timestamp')),
            'source_email': row_dict.get('email-address'),
            'output_title': row_dict.get('title'),
            'output_type': row_dict.get('type-of-output'),
            'output_organization': row_dict.get('for-what-organisation'),
            'output_person': row_dict.get('who-did-this'),
            'output_link': row_dict.get('link-if-published'),
            'output_additional_information':
            row_dict.get('additional-information'),
            'output_date': output_date
        }
        resource_content.append(res_row)

    return resource_content
Example #10
            latest_date, latest_iter = get_latest_date(next(res_iter))
            yield latest_iter
        else:
            latest_date = None
    yield from res_iter
    yield form_collector(source_id, source_type, latest_date)


parameters, datapackage, res_iter = ingest()

sheet_id = parameters['sheet_id']
gid = parameters['gid']
source_type = parameters['source_type']
source_id = '{0}/{1}'.format(sheet_id, gid)
resource = {
    'name': slugify(sheet_id).lower(),
    'path': 'data/{}.csv'.format(slugify(sheet_id))
}

headers = [
    'source', 'source_type', 'source_timestamp', 'source_email',
    'output_title', 'output_type', 'output_organization', 'output_person',
    'output_link', 'output_additional_information', 'output_date'
]
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}
Example #11
    metadata = r.json()

    return metadata


dataset_metadata = metadata(url)

datapackage['udata'] = dataset_metadata

# logging.info(datapackage)

for resource in dataset_metadata['resources']:

    # logging.info(resource)

    name = slugify(resource["title"].lower())
    path = resource["url"].split('/')[-1]
    format = resource["format"]
    url = resource["url"]

    logging.info(path)

    if path != '':

        datapackage['resources'].append({
            'name': name,
            PROP_STREAMED_FROM: url,
            'format': format,
            'path': path
        })
Example #12
            growth = _request_growth_history_from_mailchimp(
                list_id,
                '{}-{:02d}'.format(activity_date.year, activity_date.month)
            )
            res_row['subscribers'] = growth['existing']

        resource_content.append(res_row)

    return resource_content


parameters, datapackage, res_iter = ingest()

list_id = parameters['list_id']
resource = {
    'name': slugify(list_id),
    'path': 'data/{}.csv'.format(slugify(list_id))
}

headers = ['source', 'date', 'list_id', 'subs', 'unsubs', 'subscribers',
           'campaigns_sent']
resource['schema'] = {'fields': [{'name': h, 'type': 'string'}
                                 for h in headers]}

datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, list_id):

    def get_latest_row(first):
        latest_row = None
Example #13
    def generate_pipeline(cls, source):
        for item in source:
            entity_slug = slugify(item['entity'], to_lower=True, separator='_')
            ids = [entity_slug, item['year']]
            if 'subsidiary' in item:
                ids.append(item['subsidiary'])
            pipeline_id = '_'.join(str(i) for i in ids)

            pipeline = [
                {
                    'run': 'add_metadata',
                    'parameters': {
                        'name':
                        pipeline_id,
                        'title':
                        'CRD/IV data for {entity} in the year {year}'.format(
                            **item)
                    },
                },
            ]
            for input in item['inputs']:
                if input['kind'] == 'pdf':
                    parameters = input['parameters']
                    parameters['transpose'] = input.get('transpose', False)
                    parameters['url'] = input['url']
                    parameters['headers'] = item['model']['headers']
                    pipeline.append({
                        'run': 'od4tj.tabula_resource',
                        'parameters': parameters
                    })
            pipeline.append({
                'run': 'concatenate',
                'parameters': {
                    'sources':
                    'tabula-.+',
                    'target': {
                        'name': 'crdiv_data'
                    },
                    'fields':
                    dict((h['mapping'], [])
                         for h in (item['model']['headers'] + [{
                             'mapping': 'url'
                         }]))
                }
            })
            pipeline.extend([
                {
                    'run': 'od4tj.clean_locations',
                    'parameters': {
                        'resource_name': 'crdiv_data',
                        'raw_field': 'country',
                        'clean_field_code': 'country_code',
                        'clean_field_name': 'country_name',
                    }
                },
                {
                    'run': 'od4tj.add_constants',
                    'parameters': {
                        'year': item['year'],
                        'entity': item['entity'],
                        'subsidiary': item.get('subsidiary'),
                        'currency': item['model']['currency'].upper()
                    }
                },
                {
                    'run': 'od4tj.validate_countries',
                    'parameters': {
                        'resource_name': 'crdiv_data',
                        'raw_field': 'country',
                        'clean_field': 'country_code',
                    }
                },
                {
                    'run': 'od4tj.fix_numbers',
                    'parameters': {
                        'factor': item['model']['factor'],
                        'group_char': item['model'].get('group_char', ','),
                        'decimal_char': item['model'].get('decimal_char', '.'),
                    }
                },
                {
                    'run': 'set_types',
                },
                {
                    'run': 'od4tj.validate_totals',
                    'parameters': {
                        'totals': item.get('processing', {}).get('totals', {}),
                        'factor': item['model']['factor'],
                    }
                },
            ])
            pipeline.append({
                'run': 'aws.dump.to_s3',
                'parameters': {
                    'bucket':
                    'od4tj-filestore.okfn.org',
                    'path':
                    'crd_iv_datapackages/{}_{}'.format(entity_slug,
                                                       item['year'])
                }
            })
            pipeline.append({
                'run': 'dump.to_path',
                'parameters': {
                    'out-path': '/tmp/',
                }
            })
            yield pipeline_id, {'pipeline': pipeline}
Example #14
def normalized_flow(source, base):

    _, _, resource_name = extract_names(source)
    dataset_id, db_table, _ = extract_storage_ids(source)

    kinds = sorted(set(
        f['columnType'].split(':')[0]
        for f in source['fields']
    ) - {'value'})

    resources = [
        slugify(kind, separator='_')
        for kind in kinds
    ]
    db_tables = dict(
        (res, '{}_{}'.format(db_table, i))
        for i, res in enumerate(resources)
    )
    db_tables[''] = db_table

    deps = [
        'dimension_flow_{}'.format(res)
        for res in resources
        ]

    steps = [
        ('load_metadata', {
            'url': 'dependency://' + base + '/denormalized_flow',
        }),
    ]
    steps.extend([
        ('load_resource', {
            'url': 'dependency://' + base + '/' + dep,
            'resource': resource
        })
        for resource, dep in zip(resources, deps)
    ])
    steps.extend([
        ('load_resource', {
            'url': 'dependency://' + base + '/denormalized_flow',
            'resource': resource_name
        }),
        ('fiscal.create_babbage_model', {
            'db-tables': db_tables
        }),
    ])
    for resource, kind in zip(resources, kinds):
        headers = [
            f['header']
            for f in source['fields']
            if f['columnType'].startswith(kind+':') or f['columnType'] == kind
        ]
        steps.extend([
            ('join', {
                'source': {
                    'name': resource,
                    'key': headers,
                    'delete': True
                },
                'target': {
                    'name': resource_name,
                    'key': headers
                },
                'fields': {
                    resource + '_id': {
                        'name': ID_COLUMN_NAME
                    }
                }
            }),
            ('delete_fields', {
                'resources': resource_name,
                'fields': headers
            }),
        ])
    steps.extend([
        ('add_metadata', {
            'savedPk': [resource + '_id' for resource in resources]
        }),
        ('fiscal.helpers.load_primarykey', {}),
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': False
        }),
        ('dump.to_path', {
            'out-path': 'normalized/final'
        })
    ])
    yield steps, deps + ['denormalized_flow'], ''
Example #15
    def generate_pipeline(cls, source):
        for item in source:
            entity_slug = slugify(item['entity'], to_lower=True, separator='_')
            pipeline_id = '{}/{}'.format(entity_slug, item['year'])

            pipeline = [{
                'run': 'add_metadata',
                'parameters': {
                    'name':
                    '{}_{}'.format(entity_slug, item['year']),
                    'title':
                    'CRD/IV data for {entity} in the year {year}'.format(
                        **item)
                },
            }, {
                'run': 'add_resource',
                'parameters': {
                    'name':
                    'country-codes',
                    'url':
                    'https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv'
                },
            }, {
                'run': 'stream_remote_resources',
            }, {
                'run': 'od4tj.prepare_country_fingerprints',
                'parameters': {
                    'resource-name':
                    'country-codes',
                    'source-fields':
                    ['name', 'official_name_en', 'official_name_fr'],
                    'name-field':
                    'name',
                    'fingerprint-field':
                    'fingerprint'
                }
            }]
            for input in item['inputs']:
                if input['kind'] == 'pdf':
                    for dimension in input['parameters']['dimensions']:
                        parameters = {}
                        parameters['dimensions'] = dimension
                        parameters['url'] = input['url']
                        parameters['headers'] = item['model']['headers']
                        pipeline.append({
                            'run': 'od4tj.tabula_resource',
                            'parameters': parameters
                        })
            pipeline.append({
                'run': 'concatenate',
                'parameters': {
                    'sources':
                    'tabula-.+',
                    'target': {
                        'name': 'crdiv_data'
                    },
                    'fields':
                    dict((h['name'], []) for h in item['model']['headers'])
                }
            })
            pipeline.extend([
                {
                    'run': 'od4tj.fingerprint_countries',
                    'parameters': {
                        'resource-name': 'crdiv_data',
                        'name-field': 'country',
                        'fingerprint-field': 'country-name-fingerprint'
                    }
                },
                {
                    'run': 'join',
                    'parameters': {
                        'source': {
                            'name': 'country-codes',
                            'key': ['fingerprint'],
                            'delete': True
                        },
                        'target': {
                            'name': 'crdiv_data',
                            'key': ['country-name-fingerprint'],
                        },
                        'fields': {
                            'country_name': {
                                'name': 'name'
                            }
                        },
                        'full': True,
                    }
                },
                {
                    'run': 'od4tj.add_constants',
                    'parameters': {
                        'year': item['year'],
                        'entity': item['entity']
                    }
                },
                {
                    'run': 'od4tj.validate_countries'
                },
                {
                    'run': 'od4tj.fix_numbers',
                },
                {
                    'run': 'set_types',
                },
            ])
            pipeline.append({
                'run': 'aws.dump.to_s3',
                'parameters': {
                    'bucket':
                    'od4tj-filestore.okfn.org',
                    'path':
                    'crd_iv_datapackages/{}_{}'.format(entity_slug,
                                                       item['year'])
                }
            })
            yield pipeline_id, {'pipeline': pipeline}
Example #16
        # add active_users to today's value
        if date == today:
            res_row['active_users'] = active_users_response
        # preserve active_users value in latest_row
        if date == latest_date and latest_row['active_users']:
            res_row['active_users'] = latest_row['active_users']
        resource_content.append(res_row)

    return resource_content


parameters, datapackage, res_iter = ingest()

domain = parameters['domain']
resource = {
    'name': slugify(domain),
    'path': 'data/{}.csv'.format(slugify(domain))
}

headers = [
    'domain', 'source', 'date', 'new_users', 'new_topics', 'new_posts',
    'visits', 'active_users'
]
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}

datapackage['resources'].append(resource)
Example #17
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:

    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'codepackaging',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'package', 'source']
    }))

    if 'npm' in config:
        for package in config['npm']['packages']:
            steps.append(('measure.add_npm_resource', {
                'package': slugify(package)
            }))

    if 'pypi' in config:
        for package in config['pypi']['packages']:
            steps.append(('measure.add_pypi_resource', {
                'package': slugify(package)
            }))

    if 'rubygems' in config:
        for gem in config['rubygems']['gems']:
            steps.append(('measure.add_rubygems_resource', {'gem_id': gem}))

    if 'packagist' in config:
        for package in config['packagist']['packages']:
            steps.append(('measure.add_packagist_resource', {
                'package': package
            }))

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    steps.append(('concatenate', {
        'target': {
            'name': 'code-packaging',
            'path': 'data/code-packaging.csv'
        },
        'fields': {
            'date': [],
            'downloads': [],
            'total_downloads': [],
            'source': [],
            'package': []
        }
    }))

    steps.append(('set_types', {
        'types': {
            'downloads': {
                'type': 'integer'
            },
            'total_downloads': {
                'type': 'integer'
            },
            'source': {
                'type': 'string'
            },
            'date': {
                'type': 'date'
            },
            'package': {
                'type': 'string'
            }
        }
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp', ))
    steps.append(('measure.add_uuid', ))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': {
            'codepackaging': {
                'resource-name': 'code-packaging',
                'mode': 'update',
                'update_keys': ['project_id', 'date', 'package', 'source']
            }
        }
    }))

    return steps
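A hypothetical invocation of add_steps; settings, DOWNLOADS_PATH and the measure.* processors come from the surrounding project, and the project and package names below are made up:

config = {'npm': {'packages': ['left-pad']}, 'pypi': {'packages': ['requests']}}
pipeline_steps = add_steps([], 'example-code-packaging', 'example-project', config)
# pipeline_steps starts with measure.datastore_get_latest and ends with dump.to_sql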
Example #18
    def generate_pipeline(cls, source):
        title = source['title']
        dataset_name = source.get('dataset-name', title)
        dataset_name = slugify(dataset_name).lower()
        pipeline_id = dataset_name
        resource_name = source.get('resource-name', dataset_name)

        for data_source in source['sources']:
            if data_source['url'].endswith('.csv'):
                data_source['mediatype'] = 'text/csv'
            if 'name' not in data_source:
                data_source['name'] = slugify(
                    os.path.basename(data_source['url'])
                )

        model_params = {
            'options': dict(
                (f['header'], f['options'])
                for f in source['fields']
                if 'options' in f
            ),
            'os-types': dict(
                (f['header'], f['osType'])
                for f in source['fields']
            ),
            'titles': dict(
                (f['header'], f['title'])
                for f in source['fields']
                if 'title' in f
            ),
        }
        extra_measures = []
        measure_handling = []
        if 'measures' in source:
            measures = source['measures']
            normalise_measures = ('fiscal.normalise_measures', {
                'measures': measures['mapping']
            })
            if 'title' in measures:
                normalise_measures[1]['title'] = measures['title']
            measure_handling.append(normalise_measures)
            model_params['os-types']['value'] = 'value'
            model_params['options']['value'] = {
                'currency': measures['currency']
            }
            extra_measures = [
                (measure, [])
                for measure in source['measures']['mapping'].keys()
            ]
            if 'currency-conversion' in measures:
                currency_conversion = measures['currency-conversion']
                date_measure = currency_conversion.get('date_measure')
                if date_measure is None:
                    date_measure = [
                        f['header']
                        for f in source['fields']
                        if f.get('osType', '').startswith('date:')
                    ][0]
                currencies = measures.get('currencies', ['USD'])
                normalise_currencies = ('fiscal.normalise_currencies', {
                    'measures': ['value'],
                    'date-field': date_measure,
                    'to-currencies': currencies,
                    'from-currency': measures['currency']
                })
                if 'title' in currency_conversion:
                    normalise_currencies[1]['title'] = measures['title']
                measure_handling.append(normalise_currencies)
                for currency in currencies:
                    measure_name = 'value_{}'.format(currency)
                    model_params['os-types'][measure_name] = 'value'
                    model_params['options'][measure_name] = {
                        'currency': currency
                    }

        dedpulicate_lines = source.get('deduplicate') is True
        dedpulicate_steps = []
        if dedpulicate_lines:
            dedpulicate_steps.append((
                'set_types',
                {
                    'types': dict(
                        (f['header'],
                         dict(
                            type='number',
                            **f.get('options', {})
                         )
                        )
                        for f in source['fields']
                        if f['osType'] == 'value'
                    )
                }
            ))
            dedpulicate_steps.append((
                'join',
                {
                    'source': {
                        'name': resource_name,
                        'key': [
                            f['header']
                            for f in source['fields']
                            if f['osType'] != 'value'
                        ],
                        'delete': True
                    },
                    'target': {
                        'name': resource_name,
                        'key': None
                    },
                    'fields': dict(
                        (f['header'],
                         {
                             'name': f['header'],
                             'aggregate': 'any' if f['osType'] != 'value' else 'sum'
                         })
                        for f in source['fields']
                    )
                }
            ))


        partial_output_file = '{}.fdp.partial.zip'.format(pipeline_id)
        output_file = '{}.fdp.zip'.format(pipeline_id)
        pipeline_steps = [
            (
                'add_metadata',
                {
                   'title': title,
                   'name': dataset_name,
                }
            )
        ] + [
            ('add_resource', source)
            for source in source['sources']
        ] + [
            ('stream_remote_resources', {}, True),
            ('concatenate', {
                'target': {
                    'name': resource_name
                },
                'fields': dict(
                    [
                        (f['header'], f.get('aliases', []))
                        for f in source['fields']
                    ] + extra_measures
                )
            }),
        ] + dedpulicate_steps + [
            (step['processor'], step.get('parameters', {}))
            for step in source.get('postprocessing', [])
        ] + measure_handling + [
            ('fiscal.model', model_params),
            ('dump.to_zip', {
                'out-file': partial_output_file,
            }),
            ('fiscal.split_resource_per_fiscal_year_and_dump_to_zip', {
                'in-file': partial_output_file,
                'out-file': output_file,
            }),
            ('fiscal.upload', {
                'in-file': output_file,
                'publish': True
            }),
        ]

        pipeline_details = {
            'pipeline': steps(*pipeline_steps),
        }
        yield pipeline_id, pipeline_details
Example #19
def add_steps(steps: list, pipeline_id: str,
              project_id: str, config: dict) -> list:
    for repo in config['github']['repositories']:
        steps.append(('measure.add_github_resource', {
            'name': slugify(repo),
            'repo': repo,
            'map_fields': {
                'repository': 'name',
                'watchers': 'subscribers_count',
                'stars': 'stargazers_count'
            }
        }))

    steps.append(('concatenate', {
        'sources':
            [slugify(repo) for repo in config['github']['repositories']],
        'target': {
            'name': 'code-hosting',
            'path': 'data/code-hosting.json'},
        'fields': {
            'repository': [],
            'watchers': [],
            'stars': [],
            'source': [],
            'date': []}
    }))

    steps.append(('set_types', {
        'types': {
            'repository': {
                'type': 'string',
            },
            'watchers': {
                'type': 'integer'
            },
            'stars': {
                'type': 'integer'
            },
            'date': {
                'type': 'date',
            },
        }
    }))

    steps.append(('measure.add_project_name', {'name': project_id}))
    steps.append(('measure.add_timestamp', ))
    steps.append(('measure.add_uuid', ))

    # Dump to path if in development mode
    if settings.get('DEVELOPMENT', False):
        steps.append(('dump.to_path', {
            'out-path': '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        }))

    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': {
            'codehosting': {
                'resource-name': 'code-hosting',
                'mode': 'update',
                'update_keys': ['repository', 'source', 'project_id', 'date']
            }
        }
    }))

    return steps
Example #20
            since=start_date_frame.strftime(FACEBOOK_API_DATE_RANGE_FORMAT),
            until=end_date_frame.strftime(FACEBOOK_API_DATE_RANGE_FORMAT)
        )
        for metric in daily_metrics:
            aggregated_metrics[metric['facebook_metric']] += \
                _add_collected_metric_to_aggregation(frame_response, metric)
        start_date_frame = end_date_frame

    return aggregated_metrics


parameters, datapackage, res_iter = ingest()

project_id = parameters['project_id']
entity = parameters['entity']
safe_entity = slugify(entity).lower()
resource = {
    'name': safe_entity,
    'path': 'data/{}.csv'.format(safe_entity)
}
entity_type = 'page'

resource_content = []

row = {
    'entity': entity,
    'entity_type': entity_type,
    'source': 'facebook'
}

lifetime_metrics = _get_lifetime_metrics_from_source(entity)
Example #21
        row = {
            'package': package,
            'source': 'npm',
            'date': dateutil.parser.parse(response['start']).date(),
            'downloads': response['downloads']
        }
        resource_content.append(row)

    return resource_content


parameters, datapackage, res_iter = ingest()

package = parameters['package']
resource = {
    'name': slugify(package),
    'path': 'data/{}.csv'.format(slugify(package))
}

headers = ['package', 'source', 'date', 'downloads']
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in headers]
}

datapackage['resources'].append(resource)


def process_resources(res_iter, datapackage, package):
Example #22
def modify_datapackage(dp, parameters, *_):
    db_tables = parameters['db-tables']
    model = dp['model']
    field_types = dict((x['slug'], x['type'])
                       for x in dp['resources'][-1]['schema']['fields'])

    bbg_hierarchies = {}
    bbg_dimensions = {}
    bbg_measures = {}

    # Iterate on dimensions
    for hierarchy_name, h_props in model['dimensions'].items():

        # Append to hierarchies
        hierarchy_name = slugify(hierarchy_name, separator='_')
        hierarchy = dict(label=hierarchy_name, levels=h_props['primaryKey'])
        bbg_hierarchies[hierarchy_name] = hierarchy

        # Get all hierarchy columns
        attributes = h_props['attributes']
        attributes = list(attributes.items())

        # Separate to codes and labels
        codes = dict(filter(lambda x: 'labelfor' not in x[1], attributes))
        labels = dict(
            map(lambda y: (y[1]['labelfor'], y[1]),
                filter(lambda x: 'labelfor' in x[1], attributes)))

        # For each code, create a babbage dimension
        for fieldname, attribute in codes.items():
            dimension_name = fieldname

            bbg_attributes = {
                fieldname:
                dict(column='.'.join([db_tables[hierarchy_name], fieldname]),
                     label=attribute.get('title', attribute['source']),
                     type=field_types[fieldname])
            }
            bbg_dimension = dict(
                attributes=bbg_attributes,
                key_attribute=fieldname,
                label=attribute.get('title'),
                join_column=[hierarchy_name + '_id', ID_COLUMN_NAME])

            label = labels.get(fieldname)
            if label is not None:
                fieldname = label['source']
                attribute = label
                bbg_attributes.update({
                    fieldname:
                    dict(column='.'.join(
                        [db_tables[hierarchy_name], fieldname]),
                         label=attribute.get('title', attribute['source']),
                         type=field_types[fieldname])
                })
                bbg_dimension.update(dict(label_attribute=fieldname))
            bbg_dimensions[dimension_name] = bbg_dimension

    # Iterate on measures
    for measurename, measure in model['measures'].items():
        bbg_measures[measurename] = dict(column=measurename,
                                         label=measure.get(
                                             'title', attribute['source']),
                                         type=field_types[measurename])

    dp['babbageModel'] = dict(fact_table=db_tables[''],
                              dimensions=bbg_dimensions,
                              hierarchies=bbg_hierarchies,
                              measures=bbg_measures)

    return dp
Example #23
ckan_error = get_ckan_error(response)
if ckan_error:
    if 'Not found: Resource was not found.' in ckan_error.get('message', []):
        log.exception('CKAN resource {} was not found.'.format(resource_id))
    else:
        log.exception('CKAN returned an error: ' + json.dumps(ckan_error))

    raise Exception

resource = response['result']

if 'name' in resource:
    if 'title' not in resource:
        resource['title'] = resource['name']
    resource['name'] = slugify(resource['name']).lower()

if 'format' in resource:
    resource['format'] = resource['format'].lower()

if 'url' in resource:
    resource['path'] = PATH_PLACEHOLDER
    resource[PROP_STREAMED_FROM] = resource['url']
    del resource['url']

del resource['hash']

resource.update(parameters)

datapackage['resources'].append(resource)
Example #24
def denormalized_flow(source, base):

    title, dataset_name, resource_name = extract_names(source)
    dataset_id, _, _ = extract_storage_ids(source)

    original_datapackage_url = source.get('datapackage-url')

    for data_source in source['sources']:
        if data_source['url'].endswith('.csv'):
            data_source['mediatype'] = 'text/csv'
        if 'name' not in data_source:
            data_source['name'] = slugify(os.path.basename(data_source['url']),
                                          separator='_').lower()

    model_params = {
        'options':
        dict((f['header'], f['options']) for f in source['fields']
             if 'options' in f),
        'os-types':
        dict((f['header'], f['columnType']) for f in source['fields']),
        'titles':
        dict((f['header'], f['title']) for f in source['fields']
             if 'title' in f),
    }
    extra_measures = []
    measure_handling = []
    if 'measures' in source:
        measures = source['measures']
        normalise_measures = ('fiscal.normalise_measures', {
            'measures': measures['mapping']
        })
        if 'title' in measures:
            normalise_measures[1]['title'] = measures['title']
        measure_handling.append(normalise_measures)
        model_params['os-types']['value'] = 'value'
        model_params['options']['value'] = {'currency': measures['currency']}
        extra_measures = [(measure, [])
                          for measure in source['measures']['mapping'].keys()]
        if 'currency-conversion' in measures:
            currency_conversion = measures['currency-conversion']
            date_measure = currency_conversion.get('date_measure')
            if date_measure is None:
                date_measure = [
                    f['header'] for f in source['fields']
                    if f.get('columnType', '').startswith('date:')
                ][0]
            currencies = measures.get('currencies', ['USD'])
            normalise_currencies = ('fiscal.normalise_currencies', {
                'measures': ['value'],
                'date-field': date_measure,
                'to-currencies': currencies,
                'from-currency': measures['currency']
            })
            if 'title' in currency_conversion:
                normalise_currencies[1]['title'] = measures['title']
            measure_handling.append(normalise_currencies)
            for currency in currencies:
                measure_name = 'value_{}'.format(currency)
                model_params['os-types'][measure_name] = 'value'
                model_params['options'][measure_name] = {'currency': currency}

    dedpulicate_lines = source.get('deduplicate') is True
    dedpulicate_steps = []
    if dedpulicate_lines:
        dedpulicate_steps.append(('set_types', {
            'types':
            dict((f['header'], dict(type='number', **f.get('options', {})))
                 for f in source['fields'] if f['columnType'] == 'value')
        }))
        dedpulicate_steps.append((
            'join',
            {
                'source': {
                    'name':
                    resource_name,
                    'key': [
                        f['header'] for f in source['fields']
                        if f['columnType'] != 'value'
                    ],
                    'delete':
                    True
                },
                'target': {
                    'name': resource_name,
                    'key': None
                },
                'fields':
                dict((
                    f['header'],
                    {
                        'name': f['header'],
                        'aggregate':
                        'any' if f['columnType'] != 'value' else 'sum'  # noqa
                    }) for f in source['fields'])
            }))

    load_metadata_steps = []
    if original_datapackage_url:
        load_metadata_steps.append(('load_metadata', {
            'url': original_datapackage_url
        }))

    pipeline_steps = load_metadata_steps + [
        ('add_metadata', {
            'title': title,
            'name': dataset_name,
            'revision': source.get('revision', 0),
        }),
        ('fiscal.update_model_in_registry', {
            'dataset-id': dataset_id,
            'loaded': False
        }),
    ] + [('add_resource', source) for source in source['sources']] + [
        ('stream_remote_resources', {}),
        ('concatenate', {
            'target': {
                'name': resource_name
            },
            'fields':
            dict([(f['header'], f.get('aliases', []))
                  for f in source['fields']] + extra_measures)
        }),
    ] + dedpulicate_steps + [(step['processor'], step.get('parameters', {}))
                             for step in source.get('postprocessing', [])
                             ] + measure_handling + [
                                 ('fiscal.model', model_params),
                                 ('fiscal.collect-fiscal-years', ),
                                 ('set_types', ),
                                 ('dump.to_path', {
                                     'out-path': 'denormalized',
                                 }),
                             ]

    yield pipeline_steps, [], ''