def post_flow(phase, poster, tasks, config: Config, cache=False):
    if cache:
        # Derive a stable checkpoint name from the source/structure parts of the config
        config = config._unflatten()
        config_json = [config.get('source'), config.get('structure')]
        config_json = json.dumps(config_json, sort_keys=True)
        print(config_json[:64], len(config_json))
        checkpoint_name = hashlib.md5(config_json.encode('utf8')).hexdigest()
        if config.get('source'):
            path = config.get('source').get('path')
            if path:
                # Append the source filename so the checkpoint name is recognizable
                checkpoint_name += '_' + os.path.basename(path)
        cache = [checkpoint(checkpoint_name)]
    else:
        cache = []
    steps = [
        row_validator(phase, poster, tasks)
    ] + cache + [
        row_sender(phase, poster, tasks)
    ]
    return Flow(
        *steps
    )
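# Usage sketch (mirrors the call sites in the SSE handler below; `poster` and
# `tasks` are assumed to come from the surrounding request context):
#
#   dgp.post_flows = [
#       post_flow(0, poster, tasks, config, cache=True),  # cached validation pass
#       post_flow(1, poster, tasks, config),
#   ]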
def operator(name, params, pipeline):
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as config_file:
        # Force publishing and stamp pipeline metadata onto the configuration
        params['dgpConfig'].setdefault('publish', {})['allowed'] = True
        metadata = params['dgpConfig'].setdefault('extra', {}).setdefault('metadata', {})
        metadata['title'] = name
        metadata['dag_id'] = pipeline['id']
        metadata['updated_at'] = pipeline['__updated_at']
        metadata['created_at'] = pipeline['__created_at']
        # Copy dotted 'extra.*' parameters into the nested configuration
        for k, v in params.items():
            if k.startswith('extra.'):
                set_dots(params['dgpConfig'], k, v)
        logging.info('\nCONFIGURATION:\n--------------\n%s',
                     json.dumps(params['dgpConfig'], sort_keys=True, ensure_ascii=False, indent=2))

        # Serialize the configuration to a temporary file so it can be loaded as a Config
        yaml.dump(params['dgpConfig'], config_file)
        config_file.flush()
        config = Config(config_file.name)
        taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
        context = Context(config, taxonomy_registry)
        logging.getLogger().setLevel(logging.INFO)

        steps = [
            FileLoaderDGP,
            LoaderDGP,
            PostLoaderDGP,
            TransformDGP,
            EnricherDGP,
            PublisherDGP,
        ]
        dgp = SimpleDGP(
            config, context,
            steps=steps
        )

        ret = dgp.analyze()
        if not ret:
            logging.error('Errors:')
            logging.error('\n\t - '.join([str(x) for x in dgp.errors]))
            assert False
        # logging.info('\nCONF (POST ANALYSIS):\n--------------\n%s',
        #              json.dumps(config._unflatten(), sort_keys=True, ensure_ascii=False, indent=2))

        logging.info('Creating Flow')
        flow = dgp.flow()
        flow = Flow(
            flow,
            printer(tablefmt='html')
        )
        logging.info('Running Flow')
        _, stats = flow.process()
        logging.info('Success')
        return stats
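# Example invocation (illustrative only; the real `params` and `pipeline`
# objects are supplied by the surrounding orchestrator, and the dotted
# 'extra.' key shown here is a hypothetical value copied into dgpConfig
# via set_dots()):
#
#   operator(
#       'My Dataset',
#       {
#           'dgpConfig': {'source': {'path': 'data/my-dataset.csv'}},
#           'extra.metadata.owner': 'data-team',
#       },
#       {'id': 'my_dag', '__updated_at': 1600000000, '__created_at': 1500000000},
#   )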
async def events(request: web.Request):
    loop = request.app.loop
    uid = request.match_info['uid']
    error_code = None
    exception = None
    async with sse_response(request, headers=CORS_HEADERS) as resp:
        try:
            config = Config(path_for_uid(uid, 'config.yaml'))
            taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
            context = Context(config, taxonomy_registry)
            poster = Poster(uid, sender(resp))

            tasks = []
            dgp = SimpleDGP(
                config, context,
            )
            try:
                ret = dgp.analyze()
                print('ANALYZED')
                # Push back the (possibly modified) configuration and any analysis errors
                if config.dirty:
                    await poster.post_config(config._unflatten())
                if not ret:
                    await poster.post_errors(list(map(list, dgp.errors)))
                # Phase 0 validates rows (with caching); phase 1 sends them to the client
                dgp.post_flows = [
                    post_flow(0, poster, tasks, config, cache=True),
                    post_flow(1, poster, tasks, config),
                ]
                flow = dgp.flow()
                await run_flow(flow, tasks)
            finally:
                # Drain any tasks spawned while the flow was running
                for task in tasks:
                    await asyncio.gather(task)
        except Exception:
            logging.exception('Error while executing')
        finally:
            await resp.send('close')
    return resp
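# Route wiring sketch (assumed; the actual aiohttp app setup lives elsewhere.
# The '{uid}' placeholder matches request.match_info['uid'] above):
#
#   app = web.Application()
#   app.router.add_get('/events/{uid}', events)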
def main():
    config = Config(sys.argv[1] if len(sys.argv) > 1 else 'dgp.yaml')
    taxonomy_registry = TaxonomyRegistry('taxonomies/index.yaml')
    context = Context(config, taxonomy_registry)
    dgp = SimpleDGP(config, context)
    ret = dgp.analyze()
    if not ret:
        print('Errors:', '\n\t - '.join([str(x) for x in dgp.errors]))
        sys.exit(1)
    flow = dgp.flow()
    flow = Flow(flow, dump_to_path('output'))
    flow.process()
    print('----')
    print('Success:', ret)
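# Invocation sketch (the script name is hypothetical; the config path argument
# is optional and defaults to 'dgp.yaml' as above):
#
#   python main.py my-config.yaml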
def main(source_spec, config, taxonomy, output_datapackage, output_db, output_es):
    if not source_spec and not config:
        raise click.UsageError('Expecting to see either source-spec or config')
    configs = False
    if source_spec:
        # Convert a source-spec into one or more DGP configurations and dump them to disk
        source_spec_obj = yaml.safe_load(open(source_spec))
        configs = convert_source_spec(source_spec_obj, taxonomy)
        for i, config in enumerate(configs):
            with open('{}.{:02d}.yaml'.format(source_spec, i), 'w', encoding='utf8') as out:
                yaml.dump(config._unflatten(), out,
                          default_flow_style=False, indent=2, allow_unicode=True)
    elif config:
        configs = [Config(config)]
    # Process only the first configuration
    for config in configs:
        process_source(config, output_datapackage, output_db, output_es)
        break
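# CLI sketch (option names inferred from the parameters above and the script
# name are assumptions; the taxonomy id is illustrative):
#
#   python convert.py --source-spec sources.yaml --taxonomy fiscal
#   python convert.py --config dgp.yaml --output-db postgresql://...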
def convert_source_spec(source_spec, taxonomy_id):
    for source in source_spec['sources']:
        config = Config()
        context = Context(config, TaxonomyRegistry('taxonomies/index.yaml'))
        config.set(CONFIG_URL, source['url'])
        if 'encoding' in source:
            config.set(CONFIG_ENCODING, source['encoding'])
        config.set(CONFIG_TAXONOMY_ID, taxonomy_id)

        # First analysis pass detects the source's header fields
        dgp = SimpleDGP(config, context)
        dgp.analyze()
        headers = config.get(CONFIG_HEADER_FIELDS)

        # Map each detected header to a field definition from the source-spec
        mapping = []
        for header in headers:
            found = False
            for field in source_spec['fields']:
                aliases = set([field['header']] + field.get('aliases', []))
                if header in aliases:
                    mapping.append(
                        dict(name=header,
                             header=field['header'],
                             title=field.get('title', field['header']),
                             columnType=field['columnType'],
                             options=field.get('options', {})))
                    found = True
                    break
            if not found:
                print('Failed to find mapping for header', header)
        assert len(mapping) == len(headers)
        config.set(CONFIG_MODEL_MAPPING, mapping)
        config.set('extra.deduplicate', source_spec.get('deduplicate') is True)

        title = source_spec['title']
        dataset_name = source_spec.get('dataset-name', title)
        dataset_name = slugify(dataset_name, separator='_').lower()
        resource_name = source_spec.get('resource-name', dataset_name)
        revision = source_spec.get('revision', 0)
        private = source_spec.get('private') is not False
        config.set(CONFIG_EXTRA_METADATA_TITLE, title)
        config.set(CONFIG_EXTRA_METADATA_DATASET_NAME, dataset_name)
        config.set(CONFIG_EXTRA_METADATA_REVISION, revision)
        config.set(CONFIG_EXTRA_RESOURCE_NAME, resource_name)
        config.set(CONFIG_EXTRA_PRIVATE, private)

        # Second analysis pass validates the fully configured pipeline
        dgp = SimpleDGP(config, context)
        if not dgp.analyze():
            for error in dgp.errors:
                print(error)
            break
        else:
            yield config
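# Minimal source_spec sketch, inferred from the keys convert_source_spec reads
# above (key names follow the code; all concrete values are illustrative):
#
#   sources:
#     - url: https://example.com/data.csv
#       encoding: utf-8
#   fields:
#     - header: Amount
#       aliases: [Total Amount]
#       title: Amount
#       columnType: value:amount
#       options: {}
#   title: My Dataset
#   dataset-name: my_dataset
#   resource-name: my_dataset
#   revision: 0
#   private: true
#   deduplicate: false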