Example #1
def main():
    parser = argparse.ArgumentParser(
        description='dump and restore xpctl databases')
    parser.add_argument('--from_dbtype',
                        help='backend type, `from` database',
                        default='mongo')
    parser.add_argument('--from_cred',
                        help='credential for backend, `from` database',
                        default=os.path.expanduser('~/xpctlcred.json'))
    parser.add_argument('--to_dbtype',
                        help='backend type, `to` database',
                        default='mongo')
    parser.add_argument(
        '--to_cred',
        help='credential for backend, `to` database',
        default=os.path.expanduser('~/xpctlcred-localhost.json'))
    args = parser.parse_args()

    d1 = read_config_file(args.from_cred)
    d1.update({'dbtype': args.from_dbtype})
    from_backend = ExperimentRepo().create_repo(**d1)
    dump_file = from_backend.dump()

    d2 = read_config_file(args.to_cred)
    d2.update({'dbtype': args.to_dbtype})
    to_backend = ExperimentRepo().create_repo(**d2)
    to_backend.restore(dump_file)
Example #2
    def restore(self, dump):
        """If the dump is in zip format, unzip it. Expects the following directory
        structure in the unzipped file:
        <root>
         - <task>
           - <experiment>
             - <id>-meta.yml
             - <id>-config.yml
             - <id>-reporting.log
        """
        dump_dir = unzip_files(dump)
        for task in os.listdir(dump_dir):
            task_dir = os.path.join(dump_dir, task)
            for exp in os.listdir(task_dir):
                exp_dir = os.path.join(task_dir, exp)
                meta = [os.path.join(exp_dir, x) for x in os.listdir(exp_dir) if x.endswith('meta.yml')]
                reporting = [os.path.join(exp_dir, x) for x in os.listdir(exp_dir) if x.endswith('reporting.log')]
                config = [os.path.join(exp_dir, x) for x in os.listdir(exp_dir) if x.endswith('config.yml')]
                try:
                    assert len(config) == 1
                    assert len(reporting) == 1
                    assert len(meta) == 1
                    config = read_config_file(config[0])
                    meta = read_config_file(meta[0])
                    reporting = log2json(reporting[0])
                except AssertionError:
                    raise RuntimeError('There should be exactly one meta file, one config file and one reporting log '
                                       'in {}'.format(exp_dir))
                self._put_result(task, config_obj=config, events_obj=reporting, **meta)
        if dump_dir != dump:
            shutil.rmtree(dump_dir)
Example #3
def get_configs(path):
    """Get configs from disk, if it's a dir read all configs in it."""
    configs = []
    if os.path.isdir(path):
        for file_name in os.listdir(path):
            configs.append(read_config_file(os.path.join(path, file_name)))
    else:
        configs.append(read_config_file(path))
    return configs
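A minimal usage sketch (both paths are hypothetical): a directory yields one config per file it contains, while a single file yields a one-element list.

# Hypothetical paths: a directory of config files vs. a single config file
all_configs = get_configs('configs/')
one_config = get_configs('configs/sst2.json')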
Example #4
def main():
    parser = argparse.ArgumentParser(description='xpctl server')
    parser.add_argument('--ll', help='Log level', type=str, default='INFO')
    parser.add_argument('--backend', help='backend', type=str, default='mongo')
    parser.add_argument(
        '--cred',
        help='credential for backend',
        default=os.path.expanduser('~/xpctlcred-mongo-local.json'))
    parser.add_argument('--user', help='user for backend')
    parser.add_argument('--passwd', help='password for backend')
    parser.add_argument('--dbhost', help='host for backend')
    parser.add_argument('--dbport', help='port for backend')
    parser.add_argument('--port', help='port', default='5310')
    args = parser.parse_args()

    logging.basicConfig(level=get_logging_level(args.ll))
    app = connexion.App(__name__, specification_dir='./swagger/')
    if args.user is not None:
        d = {
            'user': args.user,
            'passwd': args.passwd,
            'dbhost': args.dbhost,
            'dbport': args.dbport
        }
    else:
        d = read_config_file(args.cred)
    d.update({'dbtype': args.backend})
    app.app.backend = ExperimentRepo().create_repo(**d)
    app.app.json_encoder = encoder.JSONEncoder
    app.add_api('swagger.yaml', arguments={'title': 'xpctl'})
    app.run(port=args.port)
Example #5
    def __init__(self, **kwargs):
        super(WordEmbeddingsModel, self).__init__()
        self.vocab = kwargs.get('vocab')
        self.vsz = kwargs.get('vsz')
        self.dsz = kwargs.get('dsz')
        self.weights = kwargs.get('weights')
        if 'md_file' in kwargs:
            md = read_config_file(kwargs['md_file'])
            self.vocab = md['vocab']
            self.vsz = md['vsz']
            self.dsz = md['dsz']
        if 'weights_file' in kwargs:
            self.weights = np.load(kwargs['weights_file']).get('arr_0')

        if self.weights is not None:
            if self.vsz is None:
                self.vsz = self.weights.shape[0]
            else:
                assert self.vsz == self.weights.shape[0]
            if self.dsz is None:
                self.dsz = self.weights.shape[1]
            else:
                assert self.dsz == self.weights.shape[1]

        elif self.vsz is not None and self.dsz is not None:
            self.weights = np.zeros((self.vsz, self.dsz))
Example #6
def read_config_file_or_json(config, name=''):
    if isinstance(config, (dict, list)):
        return config
    config = os.path.expanduser(config)
    if os.path.exists(config):
        return read_config_file(config)
    raise Exception('Expected {} config file or a JSON object.'.format(name))
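A short usage sketch (the dict and the path are hypothetical): an already-parsed object passes through untouched, while a path is expanded and read from disk.

# Hypothetical inputs: a parsed object is returned as-is, a path is read from disk
inline = read_config_file_or_json({'task': 'classify'}, 'experiment')
from_disk = read_config_file_or_json('~/mead/config/sst2.json', 'experiment')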
Example #7
def main():
    parser = argparse.ArgumentParser(description='Export a model')
    parser.add_argument('--config', help='JSON Configuration for an experiment', required=True, type=convert_path)
    parser.add_argument('--settings', help='JSON Configuration for mead', required=False, default='config/mead-settings.json', type=convert_path)
    parser.add_argument('--modules', help='modules to load', default=[], nargs='+', required=False)
    parser.add_argument('--datasets', help='json library of dataset labels', default='config/datasets.json', type=convert_path)
    parser.add_argument('--logging', help='json file for logging', default='config/logging.json', type=convert_path)
    parser.add_argument('--task', help='task to run', choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--exporter_type', help='exporter type', default='default')
    parser.add_argument('--model', help='model name', required=True, type=unzip_model)
    parser.add_argument('--model_version', help='model_version', default=1)
    parser.add_argument('--output_dir', help='output dir', default='./models')
    parser.add_argument('--beam', help='beam_width', default=30, type=int)
    parser.add_argument('--is_remote', help='if True, separate items for remote server and client. If False bundle everything together',
                        default=True, type=str2bool)


    args = parser.parse_args()

    config_params = read_config_file(args.config)

    task_name = config_params.get('task', 'classify') if args.task is None else args.task

    if task_name == 'seq2seq' and 'beam' not in config_params:
        config_params['beam'] = args.beam

    config_params['modules'] = config_params.get('modules', []) + args.modules

    task = mead.Task.get_task_specific(task_name, args.logging, args.settings)
    task.read_config(config_params, args.datasets, exporter_type=args.exporter_type)

    exporter = create_exporter(task, args.exporter_type)
    exporter.run(args.model, args.output_dir, args.model_version, remote=args.is_remote)
Example #8
def putresult(user, log, task, config, label, cbase, cstore):

    logf = log.format(task)
    if not os.path.exists(logf):
        click.echo("the log file at {} doesn't exist, provide a valid location".format(logf))
        return
    if not os.path.exists(config):
        click.echo("the config file at {} doesn't exist, provide a valid location".format(config))
        return

    config_file = config
    config_mem = read_config_file(config_file)
    events_mem = log2json(logf)

    RepoManager.get().put_result(task,
                                 config_mem,
                                 events_mem,
                                 username=user,
                                 label=label,
                                 print_fn=click.echo,
                                 checkpoint_base=cbase,
                                 checkpoint_store=cstore)
Example #9
def to_experiment(task, config, log, **kwargs):
    if not isinstance(log, str):  # this is a log object and not a file
        events_obj = log
    else:
        events_obj = read_logs(log)
    train_events = flatten(
        [convert_to_result(event) for event in list(filter(lambda x: x['phase'] == 'Train', events_obj))]
    )
    valid_events = flatten(
        [convert_to_result(event) for event in list(filter(lambda x: x['phase'] == 'Valid', events_obj))]
    )
    test_events = flatten(
        [convert_to_result(event) for event in list(filter(lambda x: x['phase'] == 'Test', events_obj))]
    )
    if not isinstance(config, str):  # this is a config object and not a file
        config = json.dumps(config)
    else:
        config = json.dumps(read_config_file(config))
    d = kwargs
    d.update({'task': task,
              'config': config,
              'train_events': train_events,
              'valid_events': valid_events,
              'test_events': test_events
              })
    
    return Experiment(**d)
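A hedged usage sketch with hypothetical in-memory inputs; per the type checks above, file paths for config and log would work the same way.

# Hypothetical in-memory inputs; 'phase' is the key the filters above dispatch on
config = {'task': 'classify', 'dataset': 'SST2'}
events = [{'phase': 'Test', 'acc': 0.88}]
exp = to_experiment('classify', config, events, username='alice', label='sst2-run')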
Example #10
def main():
    parser = argparse.ArgumentParser(description='Export a model')
    parser.add_argument('--config', help='JSON Configuration for an experiment', required=True, type=convert_path)
    parser.add_argument('--settings', help='JSON Configuration for mead', required=False, default='config/mead-settings.json', type=convert_path)
    parser.add_argument('--modules', help='modules to load', default=[], nargs='+', required=False)
    parser.add_argument('--datasets', help='json library of dataset labels', default='config/datasets.json', type=convert_path)
    parser.add_argument('--logging', help='json file for logging', default='config/logging.json', type=convert_path)
    parser.add_argument('--task', help='task to run', choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--exporter_type', help="exporter type (default 'default')", default=None)
    parser.add_argument('--return_labels', help='if true, the exported model returns actual labels, else '
                                                'the indices into the label vocab (default False)', default=None)
    parser.add_argument('--model', help='model name', required=True, type=unzip_files)
    parser.add_argument('--model_version', help='model_version', default=None)
    parser.add_argument('--output_dir', help="output dir (default './models')", default=None)
    parser.add_argument('--project', help='Name of project, used in path first', default=None)
    parser.add_argument('--name', help='Name of the model, used second in the path', default=None)
    parser.add_argument('--beam', help='beam_width', default=30, type=int)
    parser.add_argument('--is_remote', help='if True, separate items for remote server and client. If False bundle everything together (default True)', default=None)

    args = parser.parse_args()
    configure_logger(args.logging)

    config_params = read_config_file(args.config)

    try:
        args.settings = read_config_stream(args.settings)
    except Exception:
        logger.warning('no mead-settings file was found at [{}]'.format(args.settings))
        args.settings = {}

    task_name = config_params.get('task', 'classify') if args.task is None else args.task

    # Remove multigpu references
    os.environ['CUDA_VISIBLE_DEVICES'] = ""
    os.environ['NV_GPU'] = ""
    if 'gpus' in config_params.get('train', {}):
        del config_params['train']['gpus']

    if task_name == 'seq2seq' and 'beam' not in config_params:
        config_params['beam'] = args.beam

    config_params['modules'] = config_params.get('modules', []) + args.modules

    task = mead.Task.get_task_specific(task_name, args.settings)

    output_dir, project, name, model_version, exporter_type, return_labels, is_remote = get_export_params(
        config_params.get('export', {}),
        args.output_dir,
        args.project, args.name,
        args.model_version,
        args.exporter_type,
        args.return_labels,
        args.is_remote,
    )
    task.read_config(config_params, args.datasets, exporter_type=exporter_type)
    feature_exporter_field_map = create_feature_exporter_field_map(config_params['features'])
    exporter = create_exporter(task, exporter_type, return_labels=return_labels,
                               feature_exporter_field_map=feature_exporter_field_map)
    exporter.run(args.model, output_dir, project, name, model_version, remote=is_remote)
Example #11
def generate_chore_yaml(
    template_loc: str,
    slack: bool = True,
    slack_web_hook: Optional[str] = None,
    git_commit: bool = False,
    k8s_bump: bool = False,
    selected: bool = False,
) -> List:
    """Generate the yaml for the chore file.

    :param template_loc: The location of the various pre-defined chore files.
    :param slack: Should we have a slack chore?
    :param slack_web_hook: A custom endpoint for final slack messages.
    :param git_commit: Should we have a git commit chore?
    :param k8s_bump: Should we have a chore to bump a k8s version?
    :param selected: Should we send a message about what was selected to export?

    :returns:
        List, the chore definitions. If there are no chores, an empty list is returned.
    """
    chores = []
    git_depends = []
    if git_commit:
        chores.extend(
            read_config_file(os.path.join(template_loc, 'git-chore.yml')))
        git_depends = [c['name'] for c in chores]
    if slack:
        slack_chore = read_config_file(
            os.path.join(template_loc, 'slack-chore.yml'))
        if slack_web_hook is not None:
            slack_chore['webhook'] = slack_web_hook
        if git_depends:
            slack_chore['depends'] = deepcopy(
                listify(slack_chore.get('depends', [])) + git_depends)
        chores.append(slack_chore)
    if selected:
        selected_chore = read_config_file(
            os.path.join(template_loc, 'selected-chore.yml'))
        if slack_web_hook is not None:
            selected_chore['webhook'] = slack_web_hook
        chores.append(selected_chore)
    return chores
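A hedged usage sketch (the template location is hypothetical): with both flags set, the git-commit chores are read first and the slack chore is made to depend on them.

# Hypothetical template dir; the slack chore will depend on the git chores
chores = generate_chore_yaml('/pipelines/templates', slack=True, git_commit=True)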
Example #12
    def __init__(self, **kwargs):
        super(XPCtlReporting, self).__init__(**kwargs)
        # throw exception if the next three can't be read from kwargs
        self.cred = read_config_file(
            os.path.expanduser(kwargs['hook_setting']['cred']))
        self.exp_config = read_config_file(
            os.path.expanduser(kwargs['config_file']))
        self.task = kwargs['task']
        self.print_fn = print
        self.username = kwargs['hook_setting'].get('user', getpass.getuser())
        self.hostname = kwargs['hook_setting'].get('host',
                                                   socket.gethostname())
        self.checkpoint_base = None
        self.checkpoint_store = kwargs['hook_setting'].get(
            'checkpoint_store', '/data/model-checkpoints')
        self.save_model = kwargs['hook_setting'].get(
            'save_model', False)  # optionally save the model

        self.repo = ExperimentRepo().create_repo(**self.cred)
        self.log = []
Example #13
def putresult(task, config, log, dataset, user, label, cbase, cstore):
    """
    Put the results in a database. Provide the task name, config file, reporting log file, and the dataset index file
    used in the experiment. Optionally, the model files can be stored in persistent storage.
    """
    logf = log.format(task)
    if not os.path.exists(logf):
        click.echo(click.style("the log file at {} doesn't exist, provide a valid location".format(logf), fg='red'))
        return
    if not os.path.exists(config):
        click.echo(click.style("the config file at {} doesn't exist, provide a valid location".format(config), fg='red'))
        return
    if not os.path.exists(dataset):
        click.echo(click.style("the dataset file at {} doesn't exist, provide a valid location".format(dataset), fg='red'))
        return
    config_obj = read_config_file(config)
    datasets_set = index_by_label(read_config_file(dataset))
    dataset_key = config_obj['dataset']
    dataset_key = get_dataset_from_key(dataset_key, datasets_set)
    config_obj['dataset'] = dataset_key['label']
    ServerManager.get()
    result = ServerManager.api.put_result(task, to_swagger_experiment(task, config_obj, log, username=user, label=label))
    if result.response_type == 'success':
        eid = result.message
        click.echo(click.style('results stored with experiment: {}'.format(result.message), fg='green'))
        if cbase is None:
            return
        result = store_model(checkpoint_base=cbase, config_sha1=hash_config(read_config_file(config)),
                             checkpoint_store=cstore, print_fn=click.echo)
        if result is not None:
            click.echo(click.style('model stored at {}'.format(result), fg='green'))
            update_result = ServerManager.api.update_property(task, eid, prop='checkpoint', value=result)
            if update_result.response_type == 'success':
                click.echo(click.style(update_result.message, fg='green'))
            else:
                click.echo(click.style(update_result.message, fg='red'))
        else:
            click.echo(click.style('failed to store model', fg='red'))
    else:
        click.echo(click.style(result.message, fg='red'))
Example #14
    def put_result(self, label):
        # Wait to create the experiment repo until after the fork
        if self.repo is None:
            try:
                self.repo = ExperimentRepo.create_repo(**self.xpctl_config)
            except Exception as e:
                return str(e)
        loc = os.path.join(label.exp, label.sha1, label.name)
        config_loc = os.path.join(loc, 'config.json')
        config = read_config_file(config_loc)
        task = config.get('task')
        log_loc = glob.glob(os.path.join(loc, 'reporting-*.log'))[0]
        logs = read_logs(log_loc)
        return str(self.repo.put_result(task, config, logs, print_fn=dummy_print, label=self.name))
Example #15
def add(args):
    conn = create_db(args.db)
    config = read_config_file(args.config)
    speeds = parse_logs(args.log)
    if not speeds:
        return
    si = {}
    si['framework_version'] = get_framework_version(config['backend'])
    si['cuda'], si['cudnn'] = get_cuda_version(config['backend'])
    si['gpu_name'], si['gpu_mem'] = get_gpu_info(args.gpu)
    si['cpu_name'], si['cpu_mem'], si['cpu_cores'] = get_cpu_info()
    si['python'] = get_python_version()
    si['baseline'] = version_str_to_tuple(baseline.__version__)

    save_data(conn, speeds, config, si)
Example #16
    def read_config(self, config_file, datasets_index):
        """
        Read the config file and the datasets index

        Between the config file and the dataset index, we have enough information
        to configure the backend and the models.  We can also initialize the data readers

        :param config_file: The config file
        :param datasets_index: The index of datasets
        :return:
        """
        datasets_set = mead.utils.index_by_label(datasets_index)
        self.config_params = read_config_file(config_file)
        self._setup_task()
        self._configure_reporting()
        self.dataset = datasets_set[self.config_params['dataset']]
        self.reader = self._create_task_specific_reader()
Example #17
def main():
    parser = argparse.ArgumentParser(description='Train a text classifier')
    parser.add_argument('--config', help='JSON Configuration for an experiment', required=True, type=convert_path)
    parser.add_argument('--settings', help='JSON Configuration for mead', default='config/mead-settings.json', type=convert_path)
    parser.add_argument('--datasets', help='json library of dataset labels', default='config/datasets.json', type=convert_path)
    parser.add_argument('--embeddings', help='json library of embeddings', default='config/embeddings.json', type=convert_path)
    parser.add_argument('--logging', help='json file for logging', default='config/logging.json', type=convert_path)
    parser.add_argument('--task', help='task to run', choices=['classify', 'tagger', 'seq2seq', 'lm'])
    args = parser.parse_known_args()[0]

    config_params = read_config_file(args.config)
    task_name = config_params.get('task', 'classify') if args.task is None else args.task
    print('Task: [{}]'.format(task_name))
    task = mead.Task.get_task_specific(task_name, args.logging, args.settings)
    task.read_config(config_params, args.datasets)
    task.initialize(args.embeddings)
    task.train()
Example #18
def main():
    parser = argparse.ArgumentParser(description='Export a model')
    parser.add_argument('--config',
                        help='JSON Configuration for an experiment',
                        required=True,
                        type=convert_path)
    parser.add_argument('--settings',
                        help='JSON Configuration for mead',
                        required=False,
                        default='config/mead-settings.json',
                        type=convert_path)
    parser.add_argument('--datasets',
                        help='json library of dataset labels',
                        default='config/datasets.json',
                        type=convert_path)
    parser.add_argument('--embeddings',
                        help='json library of embeddings',
                        default='config/embeddings.json',
                        type=convert_path)
    parser.add_argument('--logging',
                        help='json file for logging',
                        default='config/logging.json',
                        type=convert_path)
    parser.add_argument('--task',
                        help='task to run',
                        choices=['classify', 'tagger', 'seq2seq', 'lm'])
    parser.add_argument('--exporter_type',
                        help='exporter type',
                        default='default')
    parser.add_argument('--model',
                        help='model name',
                        required=True,
                        type=unzip_model)
    parser.add_argument('--model_version', help='model_version', default=1)
    parser.add_argument('--output_dir', help='output dir', default='./models')
    args = parser.parse_args()

    config_params = read_config_file(args.config)
    task_name = config_params.get(
        'task', 'classify') if args.task is None else args.task
    task = mead.Task.get_task_specific(task_name, args.logging, args.settings)
    task.read_config(config_params, args.datasets)
    exporter = create_exporter(task, args.exporter_type)
    exporter.run(args.model, args.embeddings, args.output_dir,
                 args.model_version)
Example #19
def get_images(
    template_loc: Path,
    mead: Optional[str] = None,
    odin: Optional[str] = None,
    claim: Optional[str] = None,
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Get image names with a fallback to the images file."""
    defaults = read_config_file(os.path.join(template_loc, 'images.yml'))
    mead = mead if mead is not None else defaults['mead-image']
    odin = odin if odin is not None else defaults['odin-image']
    claim = claim if claim is not None else defaults['claim-name']
    images = {
        'mead': mead,
        'odin': odin,
        'template': defaults['template-image'],
        'hpctl': defaults['hpctl-image']
    }
    claims = {'data': claim}
    return images, claims
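A brief usage sketch (the image name and template path are hypothetical): explicit arguments win, and anything left as None falls back to the values in images.yml.

# Hypothetical override: pin the mead image, fall back to images.yml for the rest
images, claims = get_images('templates', mead='meadml/mead2:latest')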
Example #20
def putresult(user, cbase, cstore, label, task, config, log):
    '''
    Put the results in a database. Provide the task name, config file, and the reporting log file.
    Optionally, the model files can be stored in persistent storage.
    '''
    logf = log.format(task)
    if not os.path.exists(logf):
        click.echo("the log file at {} doesn't exist, provide a valid location".format(logf))
        return
    if not os.path.exists(config):
        click.echo("the config file at {} doesn't exist, provide a valid location".format(config))
        return

    config_file = config
    config_mem = read_config_file(config_file)
    events_mem = log2json(logf)

    RepoManager.get().put_result(task, config_mem, events_mem,
                                 username=user, label=label, print_fn=click.echo,
                                 checkpoint_base=cbase, checkpoint_store=cstore)
Example #21
def test_read_config_yaml_dispatch():
    pytest.importorskip('yaml')
    file_name = 'example.yml'
    with mock.patch('baseline.utils.read_yaml') as read_patch:
        read_config_file(file_name)
    read_patch.assert_called_once_with(file_name, strict=True)
Example #22
def test_read_config_json_dispatch():
    file_name = 'example.json'
    with mock.patch('baseline.utils.read_json') as read_patch:
        read_config_file(file_name)
    read_patch.assert_called_once_with(file_name, strict=True)
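Together, these two tests pin down the dispatch contract. A minimal sketch of a read_config_file consistent with them follows; the real baseline.utils implementation may differ in details.

from baseline.utils import read_json, read_yaml

# Sketch only: dispatch on file extension, defaulting to strict parsing
def read_config_file(config_file, strict=True):
    if config_file.endswith(('.yml', '.yaml')):
        return read_yaml(config_file, strict=strict)
    return read_json(config_file, strict=strict)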
Example #23
def cli(host, config):
    if host is not None:
        ServerManager.host = host
    else:
        ServerManager.host = read_config_file(
            os.path.expanduser(config))['host']
Example #24
def generate_pipeline(  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    root_path: str,
    uname: str,
    pipeline_name: str,
    configs: Dict[str, Dict],
    datasets: Optional[Union[str, List[Dict[str, str]]]],
    embeddings: Optional[Union[str, List[Dict[str, str]]]],
    models: int = 1,
    gpus: int = 1,
    metric: str = 'acc',
    export_policy: Optional[str] = None,
    slack: bool = True,
    slack_web_hook: Optional[str] = None,
    git_commit: bool = False,
    mead_image: Optional[str] = None,
    odin_image: Optional[str] = None,
    claim_name: Optional[str] = None,
    pull_policy: str = ALWAYS,
    clobber: bool = False,
    addons: Dict[str, Dict[str, str]] = None,
    mead_eval_dataset: Optional[str] = None,
    template: Optional[Union[Dict, List, str]] = None,
    hpctl: bool = False,
    hpctl_addons: List[Dict[str, str]] = None,
    seed: Optional[int] = None,
    **kwargs,
) -> str:
    """Generate a pipeline.

    This function can generate an odin pipeline from a mead config and some configuration options.

    The pipeline will be named {uname}/{pipeline_name}

    If you use the hpctl flag then hpctl will be used in the pipeline to generate
    configs for each model on the fly. This records its seed (which is also settable)
    to allow for reproducibility.

    If a mead_eval dataset is provided a mead-eval task is created for each training task.

    If an export_type is provided then an export task will be created that depends on all the
    previous mead jobs. All of the models are considered by the export decider.

    If any chores are requested (slack, git_commit, etc.) a chore task is added. This depends on
    either the export job (if it exists) otherwise it depends on the mead tasks.

    :param root_path: The location to write the pipeline to.
    :param uname: The username of the person generating the pipeline.
    :param pipeline_name: The name of the pipeline.
    :param configs: The mead configs for each model, mapped from name to config.
    :param datasets: A custom datasets index definition (or location) to use.
    :param embeddings: A custom embeddings index definition (or location) to use.
    :param models: The number of models to train.
    :param gpus: The number of gpus to train each model with.
    :param metric: The metric to compare models when exporting.
    :param export_policy: The export criteria to use.
    :param slack: Should we send a slack notification?
    :param slack_web_hook: A custom endpoint for final slack messages.
    :param git_commit: Should we commit an exported model to git?
    :param mead_image: The docker image for the mead config.
    :param odin_image: The docker image for the odin config.
    :param claim_name: The name of the pvc to use.
    :param pull_policy: Should k8s repull your containers.
    :param clobber: Should you overwrite an old pipeline?
    :param addons: A mapping of addon file names to source code is contained
       in a mapping keyed by the model name.
    :param mead_eval_dataset: The dataset to use in mead-eval.
    :param template: The raven-template sampling directives or a file.
    :param hpctl: Should we run hpctl sampling at the start of the file?
    :param hpctl_addons: Sampling addons needed from hpctl, not supported at the moment.
    :param seed: A seed to set hpctl's RNG.
    :param kwargs: Absorb extra args for now.
    :returns: str, The name of the generated pipeline
    """

    pipeline_loc = make_pipeline_dir(root_path, uname, pipeline_name, clobber)

    pipeline = {}
    pipeline['name'] = pipeline_name if len(configs) == 1 else f"{pipeline_name}-auto"

    # Write out any addons that any config needs
    addons = addons if addons is not None else {}
    for _, addon in addons.items():
        for addon_file, addon_source in addon.items():
            addon_file = os.path.join(pipeline_loc, addon_file)
            with open(addon_file, 'w') as wf:
                wf.write(addon_source)
                set_permissions(addon_file)
    for config in configs.values():
        config.pop('modules', None)

    # Write out datasets
    if datasets:
        if isinstance(datasets, list):
            dataset_file = os.path.join(pipeline_loc, 'datasets.yml')
            write_yaml(datasets, dataset_file)
            set_permissions(dataset_file)
            datasets = os.path.join("${WORK_PATH}", "datasets.yml")
    # Write out embeddings
    if embeddings:
        if isinstance(embeddings, list):
            embeddings_file = os.path.join(pipeline_loc, 'embeddings.yml')
            write_yaml(embeddings, embeddings_file)
            set_permissions(embeddings_file)
            embeddings = os.path.join("${WORK_PATH}", "embeddings.yml")

    template_loc = os.path.join(root_path, 'templates')
    images, claims = get_images(template_loc, mead_image, odin_image,
                                claim_name)

    templating_template = read_config_file(
        os.path.join(template_loc, 'templating-template.yml'))
    mead_template = read_config_file(
        os.path.join(template_loc, 'mead-task-template.yml'))
    mead_eval_template = read_config_file(
        os.path.join(template_loc, 'mead-eval-template.yml'))
    export_template = read_config_file(
        os.path.join(template_loc, 'export-template.yml'))
    hpctl_template = read_config_file(
        os.path.join(template_loc, 'hpctl-template.yml'))
    chore_template = read_config_file(
        os.path.join(template_loc, 'chore-template.yml'))

    task = find_const_config_prop('task', configs.values())
    dataset = find_const_config_prop('dataset', configs.values())

    all_tasks = []
    data_files = {}
    dep = None
    # Create the template task to generate datasets
    if template:
        if isinstance(template, str):
            template_file = template
        else:
            file_name = 'sample-template.yml'
            write_yaml(template, os.path.join(pipeline_loc, file_name))
            template_file = os.path.join("${WORK_PATH}", file_name)
        template_task, output_file = generate_template_task(
            templating_template,
            images['template'],
            claims['data'],
            template_file,
            task,
            pull_policy=pull_policy)
        all_tasks.append(template_task)
        data_files = {
            dataset: f"{output_file}.{dataset}"
            for dataset in ("train", "valid", "test")
        }
        dep = template_task['name']

    trained_models = []
    evals = []
    for config_name, config in configs.items():
        addon = addons[config_name]
        config_file = os.path.join(pipeline_loc, f"{config_name}.yml")
        write_yaml(config, config_file)
        set_permissions(config_file)
        config_file = os.path.join("${WORK_PATH}", f"{config_name}.yml")
        if hpctl:
            hpctl_task = generate_hpctl_task(
                hpctl_template,
                images['hpctl'],
                claims['data'],
                f"{config_name}-sample",
                config_file,
                [f"{config_name}-{i}" for i in range(models)],
                seed=seed,
                pull_policy=pull_policy,
            )
            all_tasks.append(hpctl_task)
            dep = hpctl_task['name']
            config_file = os.path.join("${TASK_PATH}", "config.yml")
        for i in range(models):
            task_name = f"{config_name}-{i}"
            train_task = generate_mead_task(
                mead_template,
                task_name,
                config_file,
                images['mead'],
                claims['data'],
                datasets=datasets,
                embeddings=embeddings,
                data_files=data_files,
                gpus=gpus,
                addons=addons[config_name],
                depends=dep,
                pull_policy=pull_policy,
            )
            all_tasks.append(train_task)
            trained_models.append(train_task['name'])
            if mead_eval_dataset:
                name = f"{task_name}-eval"
                eval_task = generate_eval_task(
                    template=mead_eval_template,
                    task_name=name,
                    image=images['odin'],
                    claim=claims['data'],
                    eval_task=task_name,
                    eval_dataset=mead_eval_dataset,
                    config=config,
                    depends=task_name,
                    addons=addons[config_name],
                    pull_policy=pull_policy,
                )
                all_tasks.append(eval_task)
                evals.append(eval_task['name'])

    prev_tasks = list(chain(trained_models, evals))
    export_task = generate_export_task(
        export_template,
        images['odin'],
        claims['data'],
        trained_models,
        task,
        dataset,
        depends=prev_tasks,
        metric=metric,
        export_policy=export_policy,
        pull_policy=pull_policy,
    )
    chore_depends = prev_tasks
    if export_task:
        all_tasks.append(export_task)
        chore_depends = 'export'
    chores = generate_chore_yaml(template_loc,
                                 slack,
                                 slack_web_hook,
                                 git_commit,
                                 selected=export_task)
    if chores:
        chore_task = generate_chore_task(chore_template,
                                         images['odin'],
                                         claims['data'],
                                         depends=chore_depends,
                                         pull_policy=pull_policy)
        all_tasks.append(chore_task)
        chore_file = os.path.join(pipeline_loc, 'chores.yml')
        write_yaml({'chores': chores}, chore_file)
        set_permissions(chore_file)

    pipeline['tasks'] = all_tasks
    main_file = os.path.join(pipeline_loc, 'main.yml')
    write_yaml(pipeline, main_file)
    set_permissions(main_file)

    return os.path.join(uname, pipeline_name)
Example #25
def preprocess_arguments(args: argparse.Namespace) -> Dict:  # pylint: disable=too-many-branches
    """Update the cli args be doing things like reading in the files they point to and things like that.

    :params args: The command line args.
    :returns: The command line args with values updated.
    """
    if args.pipeline_name is None:
        args.pipeline_name, _ = os.path.splitext(
            os.path.basename(args.configs[0]))
    configs = {
        os.path.splitext(os.path.basename(config))[0]: read_config_file(config)
        for config in args.configs
    }
    for config in configs.values():
        if args.task is None and 'task' not in config:
            LOGGER.warning("No task specified, defaulting to `classify`")
            config['task'] = 'classify'
    # If there is requested export type make sure the export section is filled in in the config.
    if args.export_policy is not None:
        export = guess_export_loc(
            list(configs.values())[0], args.output_dir, args.project,
            args.name)
        if export:
            for config in configs.values():
                config['export'] = export
        else:
            exit(1)
    if args.embeddings is not None:
        if os.path.exists(args.embeddings):
            args.embeddings = read_config_file(args.embeddings)
    if args.datasets is not None:
        if os.path.exists(args.datasets):
            args.datasets = read_config_file(args.datasets)
    # If train, valid, or tests files are provided use them to populate the datasets.
    if args.train_file is not None or args.valid_file is not None or args.test_file is not None:
        # If we are not overwriting entries in a dataset index and we don't have enough datasets listed
        if args.datasets is None and (args.train_file is None
                                      or args.valid_file is None):
            LOGGER.warning("Both a train file and a valid file are required.")
            exit(1)
        args.datasets = args.datasets if args.datasets is not None else [{}]
        dataset_label = find_const_config_prop('dataset', configs.values())
        index, dataset = next(((i, d) for i, d in enumerate(args.datasets)
                               if d.get('label') == dataset_label), (0, {}))
        # This populates the label if we are building from scratch; it will be the same otherwise
        dataset['label'] = dataset_label
        dataset['train_file'] = args.train_file if args.train_file is not None else dataset.get('train_file')
        dataset['valid_file'] = args.valid_file if args.valid_file is not None else dataset.get('valid_file')
        dataset['test_file'] = args.test_file if args.test_file is not None else dataset.get('test_file')
        args.datasets[index] = dataset
    dataset = find_const_config_prop('dataset', configs.values())
    if args.datasets is None and ":" not in dataset:
        LOGGER.warning(
            "You did not provide a custom dataset file and the dataset (%s) appears to be an old style dataset."
            " This means the server will most likely not be able to find this dataset.",
            dataset,
        )
    config = list(configs.values())[0]  # Hack
    config_gpus = config['train'].get(
        'gpus', config['model'].get('gpus', config.get('gpus', None)))
    config_gpus = int(config_gpus) if config_gpus is not None else config_gpus
    # If they don't pass gpus via cli, set it to match the config, with a default of 1
    if args.gpus is None:
        args.gpus = config_gpus if config_gpus is not None else 1
    if config_gpus is not None and args.gpus != config_gpus:
        LOGGER.warning(
            "The number of gpus requested via cli [%d] is not equal to number requested in the config file [%d]",
            args.gpus,
            config_gpus,
        )
        exit(1)
    addons = {}
    for config_name, config in configs.items():
        config['modules'] = list(
            set(chain(config.get('modules', []), args.modules)))
        addons[config_name] = {
            os.path.basename(mod.__file__): inspect.getsource(mod)
            for mod in (import_user_module(m) for m in config['modules'])
        }
    if args.template is not None and os.path.isfile(args.template):
        args.template = read_config_file(args.template)

    config = {
        'uname': args.user,
        'pipeline_name': args.pipeline_name,
        'configs': configs,
        'datasets': args.datasets,
        'embeddings': args.embeddings,
        'models': args.models,
        'gpus': args.gpus,
        'metric': args.metric,
        'export_policy': args.export_policy,
        'slack': args.slack,
        'slack_web_hook': args.slack_web_hook,
        'git_commit': False,
        'mead_image': args.mead_image,
        'odin_image': args.odin_image,
        'claim_name': args.claim_name,
        'pull_policy': args.pull_policy,
        'clobber': args.clobber,
        'addons': addons,
        'mead_eval_dataset': args.mead_eval_dataset,
        'template': args.template,
        'hpctl': args.hpctl,
        'seed': args.seed,
    }
    return config
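The keys of the returned dict line up with the keyword parameters of generate_pipeline above, so a plausible (hypothetical) wiring is:

# Hypothetical caller: args and root_path come from the surrounding CLI driver
params = preprocess_arguments(args)
pipeline = generate_pipeline(root_path, **params)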