def _get_denpendencies(dataset, all_datasets, include_indirect=False):
    """Return the list of datasets that *dataset* depends on.

    Parameters
    ----------
    dataset : str
        dataset name, used as a key into *all_datasets*.
    all_datasets : dict
        mapping of dataset name -> (etl_type, recipe/script filename).
    include_indirect : bool
        when True, also collect transitive dependencies recursively.

    Returns an empty list when *dataset* is not in *all_datasets*
    (i.e. not an open_numbers dataset) or its etl_type is not 'recipe'.

    NOTE: the name is misspelled ("denpendencies") but kept for
    backward compatibility with existing callers.
    """
    try:
        etl_type, fn = all_datasets[dataset]
    except KeyError:  # not open_numbers datasets
        return []

    # only recipe-based datasets declare dependencies we can discover
    if etl_type != 'recipe':
        return []

    dataset_path = osp.join(datasets_dir, dataset)
    etl_dir = osp.join(dataset_path, 'etl/scripts')
    recipe = osp.join(etl_dir, fn)
    logging.info("using recipe file: " + fn)
    chef = Chef.from_recipe(recipe, ddf_dir=datasets_dir)

    # collect into a set so duplicates are dropped as we go,
    # instead of a list(set(...)) pass at the end
    dependencies = set()
    for ingredient in chef.ingredients:
        if ingredient.dataset is not None:
            dependencies.add(ingredient.dataset)
            if include_indirect:
                dependencies.update(
                    _get_denpendencies(ingredient.dataset, all_datasets,
                                       include_indirect=True))

    dependencies = list(dependencies)
    logging.info("dependencies: {}".format(dependencies))
    return dependencies
def test_chef_load_recipe():
    """Loading the flatten test recipe should succeed; validation may raise."""
    path = os.path.join(wd, 'recipes/test_flatten.yml')
    loaded = Chef.from_recipe(path)
    try:
        loaded.validate()
    except ChefRuntimeError:
        # validation failure is acceptable here; we only test loading
        pass
    assert 1
def test_build_dictionary():
    from ddf_utils.chef.api import Chef
    from ddf_utils.chef.helpers import build_dictionary

    # a plain dict passes through unchanged
    mapping = {'China': 'chn', 'USA': 'usa'}
    chef = Chef()
    assert build_dictionary(chef, mapping) == mapping

    # a file name is resolved against the configured dictionaries_dir
    expected = {
        "imr_lower": "infant_mortality_lower",
        "imr_median": "infant_mortality_median",
        "imr_upper": "infant_mortality_upper"
    }
    dict_dir = os.path.join(wd, 'chef', 'translation_dictionaries')
    chef.add_config(dictionaries_dir=dict_dir)
    assert build_dictionary(chef, 'indicators_cme_to_sg.json') == expected
def build_recipe(recipe, format):
    """create a complete recipe by expanding all includes in the input recipe.

    Parameters
    ----------
    recipe : str
        path to the input recipe file.
    format : str
        output format, 'json' or 'yaml'.  Writes to stdout.
    """
    from ddf_utils.chef.api import Chef
    chef = Chef.from_recipe(recipe)
    fp = click.open_file('-', 'w')
    # BUG FIX: previously this dumped the *input argument* `recipe` (the
    # path/string passed in), not the chef's expanded recipe, so includes
    # were never expanded in the output.  Serialize the chef instead.
    if format == 'yaml':
        # NOTE(review): assumes Chef.to_recipe(fp) writes the expanded
        # recipe as YAML to the given file object — confirm against the API
        chef.to_recipe(fp)
    elif format == 'json':
        import io
        import json
        import yaml
        buf = io.StringIO()
        chef.to_recipe(buf)
        # to_recipe emits YAML text; round-trip through yaml to emit JSON
        json.dump(yaml.safe_load(buf.getvalue()), fp,
                  indent=4, ensure_ascii=False)
def run_recipe(recipe, outdir, ddf_dir, update, dry_run, gen_dp, show_tree):
    """generate new ddf dataset with recipe"""
    from ddf_utils.chef.api import Chef
    from ddf_utils.package import create_datapackage
    from ddf_utils.io import dump_json
    import json

    coloredlogs.install(logger=logging.getLogger('Chef'),
                        fmt='%(asctime)s %(name)s %(levelname)s %(message)s',
                        level=LOG_LEVEL)
    click.echo('building recipe...')

    # only pass ddf_dir through when the caller supplied one
    chef_kwargs = {'ddf_dir': ddf_dir} if ddf_dir else {}
    chef = Chef.from_recipe(recipe, **chef_kwargs)

    if show_tree:
        chef.dag.tree_view()
        return

    if update:
        # TODO(review): update flag is accepted but not implemented
        pass

    serve = not dry_run
    chef.run(serve=serve, outpath=outdir)

    if serve and gen_dp:
        click.echo('creating datapackage file...')
        datapackage_path = os.path.join(outdir, 'datapackage.json')
        if os.path.exists(datapackage_path):
            click.echo('backup old datapackage.json to datapackage.json.bak')
            shutil.copyfile(datapackage_path,
                            os.path.join(outdir, 'datapackage.json.bak'))
            dp_old = json.load(open(datapackage_path))
            # copy translations info. other info should be in the recipe.
            if 'translations' in dp_old.keys():
                chef = chef.add_metadata(translations=dp_old['translations'])
        dump_json(os.path.join(outdir, 'datapackage.json'),
                  create_datapackage(outdir, gen_schema=True, **chef.metadata))
    click.echo("Done.")
# -*- coding: utf-8 -*-

import os

from ddf_utils.chef.api import Chef

out_dir = '../../'
recipe_file = '../recipes/recipe_main.yaml'

# fall back to a relative default when DATASETS_DIR is not set
datasets_dir = os.environ.get('DATASETS_DIR', '../../../')

if __name__ == '__main__':
    chef = Chef.from_recipe(recipe_file, ddf_dir=datasets_dir)
    chef.run(serve=True, outpath=out_dir)
def test_ingredients():
    """Exercise value filtering ($in/$nin, wildcards) on ingredient loading."""
    chef = Chef()
    chef = chef.add_config(ddf_dir=os.path.join(wd, 'datasets'))

    def load(ing_id, dataset, key, value=None):
        # build the ingredient dictionary and load it with the chef's config
        d = {'id': ing_id, 'dataset': dataset, 'key': key}
        if value is not None:
            d['value'] = value
        return ingredient_from_dict(dictionary=d, **chef.config)

    def indicators(ingredient):
        return set(ingredient.get_data().keys())

    # wildcard $in / $nin
    assert indicators(load('ddf--cme', 'ddf--cme', 'country, year',
                           {'$in': ['*lower']})) == {'imr_lower'}
    assert indicators(load('ddf--cme', 'ddf--cme', 'country, year',
                           {'$nin': ['*lower']})) == {'imr_upper', 'imr_median'}

    # explicit indicator lists
    assert indicators(load('ddf--cme', 'ddf--cme', 'country, year',
                           {'$nin': ['imr_lower', 'imr_upper']})) == {'imr_median'}
    assert indicators(load('ddf--cme', 'ddf--cme', 'country, year',
                           {'$in': ['imr_lower', 'imr_upper']})) == {'imr_lower', 'imr_upper'}

    # unknown names in $in are simply ignored
    assert indicators(load('ddf--cme', 'ddf--cme', 'country, year',
                           {'$in': ['imr_lower', 'lsdf']})) == {'imr_lower'}

    # a $nin that excludes everything may raise IngredientError
    everything_excluded = load('ddf--cme', 'ddf--cme', 'country, year',
                               {'$nin': ['imr_*']})
    try:
        everything_excluded.get_data()
    except IngredientError:
        pass

    # no value filter at all
    assert indicators(load('ddf--dummy', 'ddf--gapminder--dummy_companies',
                           'synonym, region')) == {'region'}
def test_chef_api_call():
    """Walk through the main Chef API surface end to end."""
    from ddf_utils.chef.model.dag import DAG
    from ddf_utils.chef.model.ingredient import DataPointIngredient

    # constructing a Chef from explicit (empty) parts should work
    Chef(dag=DAG(), metadata={}, config={}, cooking={}, serving=[])

    # build a chef step by step instead of one fluent chain
    chef = Chef()
    chef.add_config(ddf_dir=os.path.join(wd, 'datasets'))
    chef.add_metadata(id='test_dataset', base=['ddf--bp--energy'])
    chef.add_ingredient(id='bp-datapoints', dataset='ddf--bp--energy',
                        key='geo, year', value='*')
    chef.add_procedure(collection='datapoints',
                       procedure='translate_header',
                       ingredients=['bp-datapoints'],
                       result='bp-datapoints-translate',
                       options={'dictionary': {'geo': 'country'}})

    def multiply_1000(chef, ingredients, result, **options):
        # custom procedure: scale every indicator's values by 1000
        source = ingredients[0]
        scaled = dict()
        for name, frame in source.get_data().items():
            out = frame.copy()
            out[name] = out[name] * 1000
            scaled[name] = out
        return DataPointIngredient.from_procedure_result(result, source.key, scaled)

    chef.register_procedure(multiply_1000)
    chef.add_procedure(collection='datapoints',
                       procedure='multiply_1000',
                       ingredients=['bp-datapoints-translate'],
                       result='res')

    chef.serving  # property access only; exercises the serving getter
    chef.add_dish(['bp-datapoints-translate'], options={})

    # exercise the inspection/output helpers
    chef.to_graph()
    chef.to_graph(node='res')
    chef.to_recipe()
    chef.dag.tree_view()
    chef.validate()
    res = chef.run()
    assert 1
# -*- coding: utf-8 -*-

import os

from ddf_utils.chef.api import Chef

recipe_file = '../recipes/etl.yml'

if __name__ == '__main__':
    # BUG FIX: previously the try/except KeyError wrapped both the env
    # lookup AND Chef.from_recipe, so a KeyError raised while loading the
    # recipe would be silently swallowed and the recipe re-loaded without
    # ddf_dir.  Keep the exception handling scoped to the env read only.
    datasets_dir = os.environ.get('DATASETS_DIR')
    if datasets_dir is not None:
        chef = Chef.from_recipe(recipe_file, ddf_dir=datasets_dir)
    else:
        chef = Chef.from_recipe(recipe_file)
    chef.run(serve=True, outpath='../../')
# coding: utf8

import os

from ddf_utils.chef.api import Chef

# TODO(review): recipe path left empty in the original — fill in before running
recipe_file = ''
out_dir = '../../'

# fall back to a relative default when DATASETS_DIR is not set
datasets_dir = os.environ.get('DATASETS_DIR', '../../../')

if __name__ == '__main__':
    chef = Chef.from_recipe(recipe_file)
    chef.add_config(ddf_dir=datasets_dir)
    chef.run(serve=True, outpath=out_dir)
def chef_fn(fn):
    """Load recipe *fn* from the test recipes dir, wired to the test datasets and procedures."""
    recipe_path = os.path.join(wd, 'recipes', fn)
    datasets_path = os.path.join(wd, 'datasets')
    procedures_path = os.path.join(wd, 'procedures')
    return Chef.from_recipe(recipe_path,
                            ddf_dir=datasets_path,
                            procedure_dir=procedures_path)