def read_data(row, basedir, transform=True):
    """Read the data object stored in ``row['data']`` from disk.

    The directory handed to ``row['data'].read`` is the ``output``
    sub-directory of ``params_dir(basedir, {'data': ...}, 'data')``, where the
    data parameters come from ``row['params']['data']``.

    Args:
        row: mapping-like object providing ``row['params']`` (itself indexed
            by ``'data'`` and ``'transform'``) and ``row['data']``, an object
            exposing ``read`` and ``transform`` methods.
        basedir: base directory passed through to ``params_dir``.
        transform: when true, ``row['data'].transform`` is called afterwards
            with ``row['params']['transform']`` expanded as keyword arguments.
    """
    data_params = {'data': row['params']['data']}
    step_dir = params_dir(basedir, data_params, 'data')
    output_dir = os.path.join(step_dir, 'output')
    row['data'].read(output_dir)

    if not transform:
        return
    transform_kwargs = row['params']['transform']
    row['data'].transform(**transform_kwargs)
# Preamble written at the top of every generated Drakefile: it sets an
# environment variable and declares the ``data`` and ``model`` methods that
# the generated steps refer to via ``[method:...]``.
# NOTE(review): each statement is laid out on its own line with the command
# indented beneath its method; the exact indent width is an assumption --
# confirm against a known-good generated Drakefile.
_DRAKE_HEADER = """
PYTHONUNBUFFERED=Y\n
data()
    python {python_args} {bindir}/read_write_data.py $INPUT $OUTPUT
model()
    python {python_args} {bindir}/run_model.py $INPUT $OUTPUT $INPUT1 \n
"""


def grid_search(params, outputdir, drakefile, drakein=None, tag=None, python_args=None, overwrite_tag=False, preview=False):
    """Write a Drakefile with one data step per data configuration and one
    model step per (data, transform, model) combination.

    Args:
        params: mapping with the keys ``'data'``, ``'transforms'``,
            ``'models'`` and ``'metrics'``; each value is expanded with
            ``list_dict_product``.
        outputdir: base directory under which step directories and the
            optional tag directory live.
        drakefile: object with a ``write`` method receiving the generated
            text.
        drakein: optional path of another Drakefile; when given, a ``BASE``
            variable and a ``%include`` line referring to it are written
            first.
        tag: optional tag name. When given, ``<outputdir>/tag/<tag>`` is
            created and every model step is passed a ``tagdir`` inside it,
            named by ``util.hash_yaml_dict`` of the step parameters.
        python_args: extra arguments placed right after ``python`` in the
            generated commands. ``None`` is treated as no extra arguments.
        overwrite_tag: when true, an existing tag directory is removed before
            being recreated.
        preview: when true, this function neither removes nor creates the tag
            directory; the flag is also forwarded to ``drake_step``.
    """
    data = list_dict_product(params['data'])
    transforms = list_dict_product(params['transforms'])
    models = list_dict_product(params['models'])
    metrics = list_dict_product(params['metrics'])

    if drakein is not None:
        dirname, basename = os.path.split(os.path.abspath(drakein))
        drakefile.write("BASE={}\n".format(dirname))
        drakefile.write("%include $[BASE]/{}\n".format(basename))

    #TODO include a project specific Drakefile via cmd arg
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    # Substitute an empty string for a missing python_args; formatting None
    # directly would emit the literal command "python None <script>".
    drakefile.write(_DRAKE_HEADER.format(bindir=bindir, python_args=python_args or ''))

    # data steps
    for data_params in data:
        step_params = {'data': data_params}
        drakefile.write(drake_step(outputdir, step_params, 'data', preview=preview))

    if tag is not None and not preview:
        tag_root = os.path.join(outputdir, 'tag', tag)
        # Only remove the tag directory if it is actually there: rmtree raises
        # on a missing path, which used to break the first run with
        # overwrite_tag set.
        if overwrite_tag and os.path.exists(tag_root):
            shutil.rmtree(tag_root)
        if not os.path.exists(tag_root):
            os.makedirs(tag_root)

    # model steps
    for data_params, transform_params, model_params in itertools.product(data, transforms, models):
        step_params = {'data': data_params, 'transform': transform_params, 'model': model_params, 'metrics': metrics}
        # use data dir for drake dependency
        datadir = os.path.join(params_dir(outputdir, {'data': data_params}, 'data'), 'output/')
        if tag is not None:
            tagdir = os.path.join(outputdir, 'tag', tag, util.hash_yaml_dict(step_params))
        else:
            tagdir = None
        drakefile.write(drake_step(outputdir, step_params, 'model', inputs=[datadir], tagdir=tagdir, preview=preview))
def drake_step(basedir, params, method, inputs=None, tagdir=None, preview=False):
    """Prepare the directory for one step and return its Drakefile rule.

    The step directory is ``params_dir(basedir, params, method)``. Unless
    ``preview`` is set, the directory is created if missing, ``params`` is
    dumped to ``params.yaml`` inside it when ``params_new`` reports a change,
    and ``tagdir`` (if given and not already present) becomes a symlink to the
    step directory.

    Args:
        basedir: base directory passed through to ``params_dir``.
        params: step parameters; ``params[method]['name']`` is resolved with
            ``util.get_attr``.
        method: name of the Drake method to run for this step; also the key
            into ``params``.
        inputs: optional iterable of extra input paths for the rule. The
            caller's object is never modified.
        tagdir: optional path at which to create a symlink to the step
            directory.
        preview: when true, nothing is created or written on disk.

    Returns:
        The rule text ``!<step>/output/ <- !<step>/params.yaml[, !<input>...]
        [method:<method>]`` followed by a blank line. If the resolved object
        has a ``DEPENDENCIES`` attribute, its entries are appended to the
        inputs.
    """
    step_dir = params_dir(basedir, params, method)
    if not os.path.exists(step_dir) and not preview:
        os.makedirs(step_dir)

    output_dir = os.path.join(step_dir, 'output/')
    params_file = os.path.join(step_dir, 'params.yaml')

    if params_new(params, params_file) and not preview:
        with open(params_file, 'w') as f:
            yaml.dump(params, f)

    # lexists also reports a dangling link, which exists() misses and which
    # would make os.symlink fail because the name is already taken.
    if tagdir is not None and not os.path.lexists(tagdir) and not preview:
        # A relative link target is resolved against the link's own directory
        # rather than the working directory, so point at the absolute path.
        os.symlink(os.path.abspath(step_dir), tagdir)

    # Work on a copy so neither the caller's list nor the class attribute is
    # touched, and so DEPENDENCIES may be any iterable (list or tuple).
    dependencies = list(inputs) if inputs is not None else []
    cls = util.get_attr(params[method]['name'])
    if hasattr(cls, 'DEPENDENCIES'):
        dependencies.extend(cls.DEPENDENCIES)

    extra_inputs = ''.join(', !' + dep for dep in dependencies)
    return '!{} <- !{}{} [method:{}]\n\n'.format(output_dir, params_file, extra_inputs, method)
def read_estimator(row, basedir):
    """Load the estimator saved for ``row`` and store it in ``row['estimator']``.

    The file read is ``estimator.pkl`` inside the ``output`` sub-directory of
    ``params_dir(basedir, row['params'], 'model')``; it is loaded with
    ``joblib.load``.

    Args:
        row: mapping-like object providing ``row['params']``; its
            ``'estimator'`` entry is assigned the loaded object.
        basedir: base directory passed through to ``params_dir``.
    """
    step_dir = params_dir(basedir, row['params'], 'model')
    estimator_path = os.path.join(step_dir, 'output', 'estimator.pkl')
    row['estimator'] = joblib.load(estimator_path)