def add_workflows(args, other_args, subparser=None):
    """
    Import workflow definitions into the GeneFlow database.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.workflow_yaml: GeneFlow definition with workflows.
            args.config: GeneFlow config file path.
            args.environment: config environment.
        other_args: not used by this function.
        subparser: not used by this function.

    Returns:
        On success: True.
        On failure: False.

    """
    definition = args.workflow_yaml
    config_path = args.config
    env_name = args.environment

    # read the config file, then pick the requested environment section
    loader = Config()
    if not loader.load(config_path):
        Log.an().error('cannot load config file: %s', config_path)
        return False

    settings = loader.config(env_name)
    if not settings:
        Log.an().error('invalid config environment: %s', env_name)
        return False

    # open the data source described by the 'database' section
    try:
        source = DataSource(settings['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import the workflows; a falsy result is treated as a failed load
    imported = source.import_workflows_from_def(definition)
    if not imported:
        Log.an().error('workflow definition load failed: %s', definition)
        return False

    source.commit()

    # log each imported entry together with the value the importer
    # returned for it
    for workflow_name in imported:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow_name,
            imported[workflow_name]
        )

    return True
def add_apps(args):
    """
    Import app definitions into the GeneFlow database.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.app_yaml: GeneFlow definition with apps.
            args.config_file: GeneFlow config file path.
            args.environment: config environment.

    Returns:
        On success: True.
        On failure: False.

    """
    definition = args.app_yaml
    config_path = args.config_file
    env_name = args.environment

    # read the config file, then pick the requested environment section
    loader = Config()
    if not loader.load(config_path):
        Log.an().error('cannot load config file: %s', config_path)
        return False

    settings = loader.config(env_name)
    if not settings:
        Log.an().error('invalid config environment: %s', env_name)
        return False

    # open the data source described by the 'database' section
    try:
        source = DataSource(settings['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import the apps; a falsy result is treated as a failed load
    imported = source.import_apps_from_def(definition)
    if not imported:
        Log.an().error('app definition load failed: %s', definition)
        return False

    source.commit()

    # log each imported entry together with the value the importer
    # returned for it
    for app_name in imported:
        Log.some().info('app loaded: %s -> %s', app_name, imported[app_name])

    return True
def migrate_db(args, other_args, subparser=None):
    """
    Migrate SQL DB schema. Currently only works for MySQL databases.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.config: GeneFlow config file path.
            args.environment: config environment.
        other_args: not used by this function.
        subparser: not used by this function.

    Returns:
        On success: True.
        On failure: False.

    """
    # local import keeps this block self-contained
    from urllib.parse import quote

    config = args.config
    environment = args.environment

    cfg = Config()
    if not cfg.load(config):
        Log.an().error('cannot load config file: %s', config)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    db_config = config_dict['database']
    if db_config['type'] != 'mysql':
        Log.an().error('only mysql databases can be migrated')
        return False

    migrations_path = str(Path(GF_PACKAGE_PATH, 'data/migrations'))

    try:
        # Percent-encode the credentials: a raw '@', ':', '/' or '%' in
        # the user name or password would otherwise corrupt the URL.
        # NOTE(review): assumes the backend URL parser percent-decodes
        # user and password - confirm for the installed version.
        database = get_backend('{}://{}:{}@{}/{}'.format(
            db_config['type'],
            quote(str(db_config['user']), safe=''),
            quote(str(db_config['password']), safe=''),
            db_config['host'],
            db_config['database']
        ))
        migrations = read_migrations(migrations_path)
        # hold the backend lock while the outstanding migrations run
        with database.lock():
            database.apply_migrations(database.to_apply(migrations))
    except Exception as err:
        # command boundary: report any failure (including a missing
        # config key) and signal it through the return value
        Log.an().error('cannot migrate database [%s]', str(err))
        return False

    return True
def before_all(context):
    """
    Prepare the shared test context before any test runs.

    Loads './test.conf', selects the environment named by the
    'environment' userdata entry (default 'local'), stores the resulting
    config on the context, calls clear_database, and creates an empty
    dict for agave connection data.

    Args:
        context: test context object; context.config.userdata is read,
            context.geneflow_config and context.agave are set.

    Raises:
        AssertionError: if the config file cannot be loaded.
        ValueError: if the selected environment yields no config.

    """
    loader = Config()
    assert loader.load('./test.conf')

    # environment comes from userdata, falling back to 'local'
    env_name = context.config.userdata.get('environment', 'local')
    selected = loader.config(env=env_name)
    if not selected:
        raise ValueError('Error loading geneflow config file')

    context.geneflow_config = selected

    # delete all items from database
    clear_database(context)

    # placeholder for agave connection data
    context.agave = {}
def init_db(args, other_args, subparser=None):
    """
    Initialize the SQLite database schema.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.config: GeneFlow config file path.
            args.environment: config environment.
        other_args: not used by this function.
        subparser: not used by this function.

    Returns:
        On success: True.
        On failure: False.

    """
    config_path = args.config
    env_name = args.environment

    # read the config file, then pick the requested environment section
    loader = Config()
    if not loader.load(config_path):
        Log.an().error('cannot load config file: %s', config_path)
        return False

    settings = loader.config(env_name)
    if not settings:
        Log.an().error('invalid config environment: %s', env_name)
        return False

    # only sqlite databases are handled by this command
    db_settings = settings['database']
    if db_settings['type'] != 'sqlite':
        Log.an().error('only sqlite databases can be initialized')
        return False

    sqlite_path = db_settings['path']
    if Environment.init_sqlite_db(sqlite_path):
        return True

    Log.an().error('cannot initialize sqlite database: %s', sqlite_path)
    return False
def install_workflow(args):
    """
    Install a GeneFlow workflow.

    Args:
        args: contains all command-line arguments. The following
            attributes are read:
            args.config, args.environment: optional config file and the
                environment to select from it. If no config file is
                given, a default config ('database.db', environment
                'local') is used.
            args.agave_params: optional path to a YAML file with agave
                parameters.
            args.agave_apps_prefix, args.agave_execution_system,
            args.agave_deployment_system, args.agave_apps_dir,
            args.agave_test_data_dir: optional overrides for entries
                under the 'agave' key of the agave parameters.
            args.workflow_path, args.git, args.git_branch, args.force,
            args.name, args.asset, args.prefix, args.clean,
            args.agave_username, args.agave_publish, args.make_apps:
                passed through to WorkflowInstaller.
            args.agave_test_data: if truthy, agave test data is uploaded
                after the apps are installed.

    Returns:
        On success: True.
        On failure: False.

    """
    # load config if specified
    config_dict = None
    cfg = Config()
    if args.config:
        if not args.environment:
            Log.an().error(
                'must specify environment if specifying a config file'
            )
            return False
        if not cfg.load(Path(args.config).resolve()):
            Log.an().error('cannot load config file: %s', args.config)
            return False
        config_dict = cfg.config(args.environment)
        if not config_dict:
            Log.an().error(
                'invalid config environment: %s', args.environment
            )
            return False
    else:
        # load default config
        cfg.default('database.db')
        config_dict = cfg.config('local')

    # load agave params if specified
    agave_params = {}
    if args.agave_params:
        try:
            # plain text mode; the legacy 'U' flag is rejected by
            # Python 3.11+
            with open(args.agave_params, 'r') as yaml_file:
                yaml_data = yaml_file.read()
        except IOError as err:
            Log.an().error(
                'cannot read agave params file: %s [%s]',
                args.agave_params, str(err)
            )
            return False

        try:
            agave_params = yaml.safe_load(yaml_data)
        except yaml.YAMLError as err:
            Log.an().error('invalid yaml: %s [%s]', yaml_data, str(err))
            return False

        # an empty file parses to None; anything other than a mapping
        # cannot hold the 'agave' section
        if agave_params is None:
            agave_params = {}
        elif not isinstance(agave_params, dict):
            Log.an().error(
                'invalid agave params file, expected a mapping: %s',
                args.agave_params
            )
            return False

    # make sure the 'agave' section exists before applying overrides,
    # whether or not a params file was given
    if not agave_params.get('agave'):
        agave_params['agave'] = {}

    # override any agave_params keys with command line options
    cli_overrides = (
        ('appsPrefix', args.agave_apps_prefix),
        ('executionSystem', args.agave_execution_system),
        ('deploymentSystem', args.agave_deployment_system),
        ('appsDir', args.agave_apps_dir),
        ('testDataDir', args.agave_test_data_dir),
    )
    for agave_key, cli_value in cli_overrides:
        if cli_value:
            agave_params['agave'][agave_key] = cli_value

    # initialize workflow installer object and install apps
    wf_installer = WorkflowInstaller(
        str(Path(args.workflow_path).resolve()),
        git=args.git,
        git_branch=args.git_branch,
        force=args.force,
        app_name=args.name,
        app_asset=args.asset,
        copy_prefix=args.prefix,
        clean=args.clean,
        config=config_dict,
        agave_params=agave_params,
        agave_username=args.agave_username,
        agave_publish=args.agave_publish,
        make_apps=args.make_apps
    )

    if not wf_installer.initialize():
        Log.an().error('cannot initialize workflow installer')
        return False

    if not wf_installer.install_apps():
        Log.an().error('cannot install workflow apps')
        return False

    if args.agave_test_data:
        if not wf_installer.upload_agave_test_data():
            Log.an().error('cannot upload agave test data')
            return False

    return True
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    The workflow is imported into a default SQLite-backed data source,
    the remaining command-line arguments are parsed against the
    workflow's inputs and parameters, a job definition is assembled
    (from a job file or from defaults), and the resulting jobs are run
    in a process pool.

    Args:
        args: result of the initial argument parse. The following
            attributes are read:
            args.workflow_path: workflow definition or package directory.
            args.gui: if truthy and gooey is loaded, show a GUI for the
                dynamic arguments.
            args.log_level: log level passed to each job.
        other_args: list of remaining args from the initial parse; parsed
            here against the workflow definition.
        subparser: not used by this function.

    Returns:
        On failure before any job is started: False.
        Otherwise: the list of per-job results returned by the pool.

    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error(
            'cannot find workflow definition: %s', args.workflow_path
        )
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    ### define arg parsing methods
    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        The remaining command-line args (other_args of the enclosing
        function) are parsed; unrecognized args are ignored.

        Args:
            workflow_dict: Workflow dictionary

        Returns:
            Namespace of parsed arguments.

        """
        # parse dynamic args. these are determined from workflow definition
        dynamic_parser = argparse.ArgumentParser()

        dynamic_parser.add_argument(
            '-j', '--job',
            type=str,
            default=None,
            dest='job_path',
            help='Job Definition(s)'
        )
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label']
            )
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label']
            )
        dynamic_parser.add_argument(
            '-o', '--output',
            type=str,
            default='~/geneflow-output',
            help='Output Folder'
        )
        dynamic_parser.add_argument(
            '-n', '--name',
            type=str,
            default='geneflow-job',
            help='Name of Job'
        )
        dynamic_parser.add_argument(
            '-w', '--work',
            nargs='+',
            type=str,
            default=[],
            help='Work Directory'
        )
        dynamic_parser.add_argument(
            '--exec-context', '--ec',
            nargs='+',
            type=str,
            dest='exec_context',
            default=[],
            help='Execution Contexts'
        )
        dynamic_parser.add_argument(
            '--exec-method', '--em',
            nargs='+',
            type=str,
            dest='exec_method',
            default=[],
            help='Execution Methods'
        )
        dynamic_parser.add_argument(
            '--exec-param', '--ep',
            nargs='+',
            type=str,
            dest='exec_param',
            default=[],
            help='Execution Parameters'
        )

        # parse_known_args returns (namespace, leftover args)
        dynamic_args = dynamic_parser.parse_known_args(other_args)

        return dynamic_args[0]

    if 'gooey' in sys.modules:
        @Gooey(
            program_name='GeneFlow: {}'.format(workflow_dict['name']),
            program_description=workflow_dict['description'],
            target='gf --log-level={} run {}'.format(
                args.log_level, args.workflow_path
            ),
            monospace_display=True
        )
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            The remaining command-line args (other_args of the enclosing
            function) are parsed. Unlike the non-GUI parser, this one
            uses parse_args and defines no job definition option.

            Args:
                workflow_dict: Workflow dictionary

            Returns:
                Namespace of parsed arguments.

            """
            # parse dynamic args. these are determined from workflow
            # definition
            dynamic_parser = GooeyParser()
            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow"
            )
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget
                )
            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow"
            )
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label']
                )
            job_group = dynamic_parser.add_argument_group(
                "Job Options",
                "Output/intermediate folders and job name"
            )
            job_group.add_argument(
                '-o', '--output',
                type=str,
                default='~/geneflow-output',
                help='Output Folder',
                widget='DirChooser'
            )
            job_group.add_argument(
                '-n', '--name',
                type=str,
                default='geneflow-job',
                help='Name of Job'
            )
            job_group.add_argument(
                '-w', '--work',
                nargs='+',
                type=str,
                default=[],
                help='Work Directory'
            )
            exec_group = dynamic_parser.add_argument_group(
                "Execution Options",
                "Customize workflow execution"
            )
            exec_group.add_argument(
                '--exec-context', '--ec',
                nargs='+',
                type=str,
                dest='exec_context',
                default=[],
                help='Execution Contexts'
            )
            exec_group.add_argument(
                '--exec-method', '--em',
                nargs='+',
                type=str,
                dest='exec_method',
                default=[],
                help='Execution Methods'
            )
            exec_group.add_argument(
                '--exec-param', '--ep',
                nargs='+',
                type=str,
                dest='exec_param',
                default=[],
                help='Execution Parameters'
            )

            dynamic_args = dynamic_parser.parse_args(other_args)

            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided
    job_path = None
    if dynamic_args.job_path:
        job_path = Path(dynamic_args.job_path).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(
        jobs_dict,
        [
            'name={}'.format(dynamic_args.name),
            'output_uri={}'.format(dynamic_args.output)
        ]
    )

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith(('inputs.', 'parameters.'))
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[parsed_work_uri['scheme']] \
                = parsed_work_uri['chopped_uri']

    apply_job_modifiers(
        jobs_dict,
        [
            'work_uri.{}={}'.format(context, work_uris[context])
            for context in work_uris
        ]
    )

    # add execution options to job definition. each option must have the
    # form "key:value"; reject anything else with a clear message rather
    # than failing while formatting the modifier
    exec_modifiers = []
    for option_name, option_values in (
            ('context', dynamic_args.exec_context),
            ('method', dynamic_args.exec_method),
            ('parameters', dynamic_args.exec_param)
    ):
        for exec_arg in option_values:
            exec_key, separator, exec_value = exec_arg.partition(':')
            if not separator:
                Log.an().error(
                    'invalid execution %s option, expected "key:value": %s',
                    option_name, exec_arg
                )
                return False
            exec_modifiers.append(
                'execution.{}.{}={}'.format(
                    option_name, exec_key, exec_value
                )
            )

    apply_job_modifiers(jobs_dict, exec_modifiers)

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():

        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )

        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                # report the offending URI, not the whole mapping
                Log.an().error(
                    'invalid work uri: %s', job['work_uri'][context]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    if not job_ids:
        # a pool cannot be created with zero worker processes
        Log.an().error('no jobs to run')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    # NOTE(review): this is the list of per-job results, which is truthy
    # even when some jobs failed - confirm how callers interpret it.
    return result
def run(args):
    """
    Run GeneFlow workflow engine.

    The workflow is imported into a default SQLite-backed data source, a
    job definition is assembled (from a job file or from defaults, then
    modified by any command-line data), and the resulting jobs are run
    in a process pool.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.workflow: workflow definition or package directory.
            args.job_yaml: job definition (optional).
            args.data: job modifiers applied to the job definition
                (optional).
            args.log_level: log level passed to each job.

    Returns:
        On failure before any job is started: False.
        Otherwise: the list of per-job results returned by the pool.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # get absolute path to job file if provided
    job_yaml = None
    if args.job_yaml:
        job_yaml = Path(args.job_yaml).absolute()

    # setup environment
    env = Environment(workflow_path=workflow_yaml)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_yaml:
        if not gf_def.load(job_yaml):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with cli parameters
    if args.data:
        apply_job_modifiers(jobs_dict, args.data)

    # insert workflow name, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # extract workflow defaults for inputs and parameters if not provided
    # in job definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():

        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )

        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                # report the offending URI, not the whole mapping
                Log.an().error(
                    'invalid work uri: %s', job['work_uri'][context]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    if not job_ids:
        # a pool cannot be created with zero worker processes
        Log.an().error('no jobs to run')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    # NOTE(review): this is the list of per-job results, which is truthy
    # even when some jobs failed - confirm how callers interpret it.
    return result
def run_pending(args):
    """
    Run the jobs that the data source reports as pending.

    Args:
        args: parsed command-line arguments. The following attributes
            are read:
            args.config_file: GeneFlow config file path.
            args.environment: config environment.
            args.log_location: directory for per-job log files; each job
                logs to '<job id>.log' inside it.
            args.log_level: log level passed to each job.

    Returns:
        On failure before any job is started: False.
        If no pending jobs are found: True.
        Otherwise: the list of per-job results returned by the pool.

    """
    config_path = args.config_file
    env_name = args.environment
    log_dir = args.log_location

    # read the config file, then pick the requested environment section
    loader = Config()
    if not loader.load(config_path):
        Log.an().error('cannot load config file: %s', config_path)
        return False

    settings = loader.config(env_name)
    if not settings:
        Log.an().error('invalid config environment: %s', env_name)
        return False

    # open the data source described by the 'database' section
    try:
        source = DataSource(settings['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # False signals a failed query; any other falsy value means there is
    # nothing to run
    pending = source.get_pending_jobs()
    if pending is False:
        Log.an().error('cannot query for pending jobs')
        return False

    if not pending:
        return True

    Log.some().info('pending jobs found:\n%s', pprint.pformat(pending))

    # at most five worker processes, one job description per pending job
    workers = Pool(min(5, len(pending)))

    job_specs = []
    for entry in pending:
        log_file = Path(log_dir) / (entry['id'] + '.log')
        job_specs.append({
            'name': entry['name'],
            'id': entry['id'],
            'log': str(log_file)
        })

    runner = partial(
        geneflow.cli.common.run_workflow,
        config=settings,
        log_level=args.log_level
    )
    outcome = workers.map(runner, job_specs)

    workers.close()
    workers.join()

    if not all(outcome):
        Log.an().error('some jobs failed')

    return outcome