Example #1
def post_flow(phase, poster, tasks, config: Config, cache=False):
    if cache:
        config = config._unflatten()

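        # Derive a stable checkpoint name by hashing the source/structure
        # parts of the config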
        config_json = [config.get('source'), config.get('structure')]
        config_json = json.dumps(config_json, sort_keys=True)
        print(config_json[:64], len(config_json))
        checkpoint_name = hashlib.md5(config_json.encode('utf8')).hexdigest()

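        # Append the source file's basename to keep the checkpoint identifiable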
        if config.get('source'):
            path = config.get('source').get('path')
            if path:
                checkpoint_name += '_' + os.path.basename(path)

        cache = [checkpoint(checkpoint_name)]
    else:
        cache = []
    steps = [
        row_validator(phase, poster, tasks)
    ] + cache + [
        row_sender(phase, poster, tasks)
    ]
    return Flow(
        *steps
    )
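For context, Example #3 below shows how `post_flow` is wired into a `SimpleDGP` instance; the call site, taken from that example:

dgp.post_flows = [
    post_flow(0, poster, tasks, config, cache=True),  # checkpointed phase
    post_flow(1, poster, tasks, config),              # no checkpoint
]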
Example #2
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
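        # Build the dgpConfig: force publishing and attach pipeline metadata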
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
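        # Merge any 'extra.*' parameters into the config via their dotted paths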
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s', 
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()
        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)

        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]

        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            raise AssertionError('dgp analysis failed')

        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s', 
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )
        logging.info('Running Flow')
        _, stats = flow.process()

        logging.info('Success')

        return stats
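A minimal, hypothetical invocation of `operator`; the shapes of `params` and `pipeline` are assumptions inferred from the keys the function reads:

params = {'dgpConfig': {}}  # hypothetical: normally populated by the caller
pipeline = {
    'id': 'my-pipeline',                     # hypothetical pipeline id
    '__updated_at': '2020-01-01T00:00:00Z',  # hypothetical timestamps
    '__created_at': '2020-01-01T00:00:00Z',
}
stats = operator('My Dataset', params, pipeline)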
Example #3
async def events(request: web.Request):
    uid = request.match_info['uid']
    error_code = None
    exception = None
    async with sse_response(request, headers=CORS_HEADERS) as resp:
        try:
            config = Config(path_for_uid(uid, 'config.yaml'))
            taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
            context = Context(config, taxonomy_registry)
            poster = Poster(uid, sender(resp))

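            # Background send tasks created while the flows run; awaited in
            # the finally block below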
            tasks = []
            dgp = SimpleDGP(
                config, context,
            )

            try:
                ret = dgp.analyze()
                print('ANALYZED')
                if config.dirty:
                    await poster.post_config(config._unflatten())
                if not ret:
                    await poster.post_errors(list(map(list, dgp.errors)))

                dgp.post_flows = [
                    post_flow(0, poster, tasks, config, cache=True),
                    post_flow(1, poster, tasks, config),
                ]
                flow = dgp.flow()
                await run_flow(flow, tasks)
            finally:
                await asyncio.gather(*tasks)
        except Exception:
            logging.exception('Error while executing')
        finally:
            await resp.send('close')
            return resp
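Since the handler reads `uid` from the URL, it would be registered on a parameterized aiohttp route; a minimal sketch (the exact path is an assumption):

from aiohttp import web

app = web.Application()
app.router.add_get('/events/{uid}', events)  # '{uid}' feeds request.match_info['uid']
web.run_app(app)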
Example #4
def main():
    config = Config(sys.argv[1] if len(sys.argv) > 1 else 'dgp.yaml')
    taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
    context = Context(config, taxonomy_registry)

    dgp = SimpleDGP(config, context)
    ret = dgp.analyze()
    if not ret:
        print('Errors:', '\n\t - '.join([str(x) for x in dgp.errors]))
        sys.exit(1)

    flow = dgp.flow()
    flow = Flow(flow, dump_to_path('output'))
    flow.process()

    print('----')
    print('Success:', ret)
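A minimal sketch of running this entry point directly; the script name is hypothetical, and the config path falls back to 'dgp.yaml' when no argument is given:

if __name__ == '__main__':
    main()  # e.g. `python run_dgp.py my-config.yaml`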
Example #5
def main(source_spec, config, taxonomy, output_datapackage, output_db,
         output_es):

    if not source_spec and not config:
        raise click.UsageError('Expecting to see either source-spec or config')

    configs = False
    if source_spec:
        # Load the spec and convert it into one Config per source; materialize
        # the generator so the configs can be iterated again below
        with open(source_spec) as spec_file:
            source_spec_obj = yaml.safe_load(spec_file)
        configs = list(convert_source_spec(source_spec_obj, taxonomy))
        for i, config in enumerate(configs):
            with open('{}.{:02d}.yaml'.format(source_spec, i), 'w',
                      encoding='utf-8') as out:
                yaml.dump(config._unflatten(), out,
                          default_flow_style=False,
                          indent=2,
                          allow_unicode=True)
    elif config:
        configs = [Config(config)]

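    # NOTE: the break below means only the first config is processed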
    for config in configs:
        process_source(config, output_datapackage, output_db, output_es)
        break
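The `click.UsageError` above suggests this is a `click` command; a plausible decorator stack, with option names inferred from the parameters (defaults and types are assumptions):

@click.command()
@click.option('--source-spec', default=None)
@click.option('--config', default=None)
@click.option('--taxonomy', default=None)
@click.option('--output-datapackage', default=None)
@click.option('--output-db', default=None)
@click.option('--output-es', default=None)
def main(source_spec, config, taxonomy, output_datapackage, output_db,
         output_es):
    ...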
Example #6
def convert_source_spec(source_spec, taxonomy_id):
    for source in source_spec['sources']:
        config = Config()
        context = Context(config, TaxonomyRegistry('taxonomies/index.yaml'))

        config.set(CONFIG_URL, source['url'])
        if 'encoding' in source:
            config.set(CONFIG_ENCODING, source['encoding'])
        config.set(CONFIG_TAXONOMY_ID, taxonomy_id)

        dgp = SimpleDGP(config, context)
        dgp.analyze()

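        # The first analysis pass detects the source's header fields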
        headers = config.get(CONFIG_HEADER_FIELDS)

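        # Match each detected header against the fields (and aliases)
        # declared in the spec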
        mapping = []
        for header in headers:
            found = False
            for field in source_spec['fields']:
                aliases = set([field['header']] + field.get('aliases', []))
                if header in aliases:
                    mapping.append(
                        dict(name=header,
                             header=field['header'],
                             title=field.get('title', field['header']),
                             columnType=field['columnType'],
                             options=field.get('options', {})))
                    found = True
                    break
            if not found:
                print('Failed to find mapping for header', header)
        assert len(mapping) == len(headers)
        config.set(CONFIG_MODEL_MAPPING, mapping)

        config.set('extra.deduplicate', source_spec.get('deduplicate') is True)

        title = source_spec['title']
        dataset_name = source_spec.get('dataset-name', title)
        dataset_name = slugify(dataset_name, separator='_').lower()
        resource_name = source_spec.get('resource-name', dataset_name)
        revision = source_spec.get('revision', 0)
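        # Datasets are private unless the spec explicitly sets 'private: false'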
        private = source_spec.get('private') is not False

        config.set(CONFIG_EXTRA_METADATA_TITLE, title)
        config.set(CONFIG_EXTRA_METADATA_DATASET_NAME, dataset_name)
        config.set(CONFIG_EXTRA_METADATA_REVISION, revision)
        config.set(CONFIG_EXTRA_RESOURCE_NAME, resource_name)
        config.set(CONFIG_EXTRA_PRIVATE, private)

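        # Re-run the analysis with the mapping and metadata in place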
        dgp = SimpleDGP(config, context)
        if not dgp.analyze():
            for error in dgp.errors:
                print(error)
            break
        else:
            yield config
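A hypothetical source spec matching the keys `convert_source_spec` reads; all values are illustrative only:

source_spec = {
    'title': 'My Dataset',
    'sources': [
        {'url': 'http://example.com/data.csv', 'encoding': 'utf-8'},
    ],
    'fields': [
        {'header': 'Name', 'columnType': 'person:name',  # hypothetical type
         'aliases': ['Full Name'], 'options': {}},
    ],
    'deduplicate': False,
    'private': True,
}
configs = list(convert_source_spec(source_spec, 'my-taxonomy'))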