Beispiel #1
0
def kill_f(pattern):
    """Terminate all EC2 instances whose Name tag contains *pattern*.

    Unnamed instances are offered for termination only when they belong to
    one of the configured security groups, and each one is confirmed
    interactively.  A final yes/no confirmation is asked before any
    instance is actually terminated.

    Args:
        pattern: Substring matched against each instance's Name tag.
    """
    print("trying to kill the pattern: ", pattern)
    to_kill = []      # display names of instances slated for termination
    to_kill_ids = {}  # region -> list of instance ids to terminate there
    for instance in get_all_instances():
        name = get_name_tag(instance)
        if name is None or pattern in name:
            instance_id = instance['InstanceId']
            region = instance['Region']
            if name is None:
                # Unnamed instance: only consider it if it is in one of our
                # security groups, and ask the user explicitly about it.
                if any(x['GroupName'] in AUTOCONFIG.aws_security_groups()
                       for x in instance['SecurityGroups']):
                    if query_yes_no(question="Kill instance {} without name in region {} (security groups {})?".format(
                            instance_id, region, [x['GroupName'] for x in instance['SecurityGroups']])):
                        name = instance_id
            if name:
                to_kill_ids.setdefault(region, []).append(instance_id)
                to_kill.append(name)

    print("This will kill the following jobs:")
    print(", ".join(sorted(to_kill)))
    if query_yes_no(question="Proceed?", default="no"):
        for client in get_clients():
            print("Terminating instances in region", client.region)
            ids = to_kill_ids.get(client.region, [])
            if ids:  # reuse the ids computed above instead of a second lookup
                client.terminate_instances(InstanceIds=ids)
Beispiel #2
0
def run_sweep(run_experiment, sweep_params, exp_name, parser, instance_type='c4.xlarge'):
    """Parse run-mode CLI arguments and launch *run_experiment* over *sweep_params*.

    Depending on ``--mode`` the sweep runs locally (serially or in parallel),
    inside Docker or Singularity, across multiple local GPUs, or on AWS EC2
    (with an interactive confirmation before launching).

    Args:
        run_experiment: Callable executed for each parameter combination.
        sweep_params: Dict mapping parameter names to lists of values to sweep.
        exp_name: Experiment name, used for output directories and S3 logs.
        parser: Pre-built ``argparse.ArgumentParser``; mode/resource flags are
            appended here so callers can register their own arguments first.
        instance_type: EC2 instance type used when ``--mode ec2`` is selected.

    Raises:
        NotImplementedError: If ``--mode`` is not one of the supported modes.
    """
    parser.add_argument('--mode', type=str, default='local',
                        help='Mode for running the experiments - local: runs on local machine, '
                             'ec2: runs on AWS ec2 cluster (requires a proper configuration file)')

    parser.add_argument('--num_gpu', '-g', type=int, default=1,
                        help='Number of GPUs to use for running the experiments')

    parser.add_argument('--exps_per_gpu', '-e', type=int, default=1,
                        help='Number of experiments per GPU simultaneously')

    parser.add_argument('--num_cpu', '-c', type=int, default=multiprocessing.cpu_count(),
                        help='Number of threads to use for running experiments')

    args = parser.parse_args(sys.argv[1:])

    # Mount the project tree (with PYTHONPATH) and route outputs per mode.
    local_mount = mount.MountLocal(local_dir=config.BASE_DIR, pythonpath=True)

    docker_mount_point = os.path.join(config.DOCKER_MOUNT_DIR, exp_name)

    sweeper = launcher.DoodadSweeper([local_mount], docker_img=config.DOCKER_IMAGE,
                                     docker_output_dir=docker_mount_point,
                                     local_output_dir=os.path.join(config.DATA_DIR, 'local', exp_name))

    if args.mode == 'ec2':
        # Merged the two consecutive identical `mode == 'ec2'` checks of the
        # original into one branch; behavior is unchanged.
        sweeper.mount_out_s3 = mount.MountS3(s3_path='', mount_point=docker_mount_point, output=True)

        print("\n" + "**********" * 10 + "\nexp_prefix: {}\nvariants: {}".format(exp_name, len(
            list(itertools.product(*[value for value in sweep_params.values()])))))

        # Confirm before incurring AWS costs.
        if query_yes_no("Continue?"):
            sweeper.run_sweep_ec2(run_experiment, sweep_params, bucket_name=config.S3_BUCKET_NAME,
                                  instance_type=instance_type,
                                  region='us-west-2', s3_log_name=exp_name, add_date_to_logname=False)

    elif args.mode == 'local_docker':
        mode_docker = dd.mode.LocalDocker(
            image=sweeper.image,
        )
        run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_docker,
                         mounts=sweeper.mounts)

    elif args.mode == 'local':
        sweeper.run_sweep_serial(run_experiment, sweep_params)

    elif args.mode == 'local_par':
        sweeper.run_sweep_parallel(run_experiment, sweep_params)

    elif args.mode == 'multi_gpu':
        run_sweep_multi_gpu(run_experiment, sweep_params, num_gpu=args.num_gpu, exps_per_gpu=args.exps_per_gpu)

    elif args.mode == 'local_singularity':
        mode_singularity = dd.mode.LocalSingularity(
            image='~/maml_zoo.simg')
        run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_singularity,
                         mounts=sweeper.mounts)
    else:
        raise NotImplementedError
Beispiel #3
0
def kill(job):
    """Terminate the EC2 instance whose Name tag equals *job*.

    Asks for confirmation before terminating.

    NOTE(review): the ``break`` stops after the first match, so at most one
    instance is terminated even if several share the same name — confirm
    that job names are unique per instance.

    Args:
        job: Exact Name tag of the instance to terminate.
    """
    to_kill = []      # display names of instances slated for termination
    to_kill_ids = {}  # region -> list of instance ids to terminate there
    for instance in get_all_instances():
        name = get_name_tag(instance)
        if name == job:
            region = instance['Region']
            to_kill_ids.setdefault(region, []).append(instance['InstanceId'])
            to_kill.append(name)
            break

    print("This will kill the following jobs:")
    print(", ".join(sorted(to_kill)))
    if query_yes_no(question="Proceed?", default="no"):
        for client in get_clients():
            print("Terminating instances in region", client.region)
            ids = to_kill_ids.get(client.region, [])
            if ids:  # reuse the ids computed above instead of a second lookup
                client.terminate_instances(InstanceIds=ids)
Beispiel #4
0
def launch_experiment(
    exp_name,
    variant,
    sweep_values=None,
    num_seeds=1,
    get_confirmation=True,

    # arguments specifying where the code to run the experiment is
    experiment_class=None,
    get_config=None,
    get_algorithm=None,
    get_offline_algorithm=None,
    load_config=None,

    # misc arguments
    instance_type='c4.2xlarge',
    use_gpu=False,
    include_date=True,
):
    """Configure and launch one experiment (or a sweep) via the doodad sweeper.

    Builds the experiment config from either *experiment_class* or the
    individually supplied getters, expands *variant* over *sweep_values*,
    asks for confirmation, and dispatches according to the ``--mode`` CLI
    argument (local, local_par, local_docker, multi_gpu, or ec2).

    Args:
        exp_name: Experiment prefix; date-stamped when *include_date* is True.
        variant: Base variant dict for the experiment.
        sweep_values: Optional dict of parameter values to sweep over; when
            None, only *variant* is run.
        num_seeds: Seeds per variant (used by ``generate_variants``).
        get_confirmation: If True, prompt the user before launching.
        experiment_class: Optional class holding ``get_config`` (and possibly
            ``get_algorithm`` / ``get_offline_algorithm``); kept for backwards
            compatibility with the older class-based API.
        get_config / get_algorithm / get_offline_algorithm: Direct function
            overrides; take precedence over *experiment_class* attributes.
        load_config: Called after config-dict initialization to mutate it
            in place; must be passed directly.
        instance_type: EC2 instance type for ``--mode ec2``.
        use_gpu: Whether to run on GPU (forced off for EC2).
        include_date: Prefix *exp_name* with a MM-DD timestamp.

    Raises:
        NotImplementedError: If ``--mode`` is not a recognized run mode.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        type=str,
        default='local',
        help='Mode for running the experiments - local: runs on local machine, '
        'ec2: runs on AWS ec2 cluster (requires a proper configuration file)')

    parser.add_argument(
        '--gpu_id',
        '-id',
        type=int,
        default=0,
        help='GPU id for running experiments (if using single GPU)')

    parser.add_argument(
        '--num_gpu',
        '-g',
        type=int,
        default=3,
        help='Number of GPUs to use for running the experiments')

    parser.add_argument('--exps_per_gpu',
                        '-e',
                        type=int,
                        default=1,
                        help='Number of experiments per GPU simultaneously')

    parser.add_argument(
        '--num_cpu',
        '-c',
        type=int,
        default=multiprocessing.cpu_count(),
        help='Number of threads to use for running experiments')

    # BUG FIX: the original used type=bool, but bool('False') is True, so
    # "-w False" silently enabled wandb.  Parse the string explicitly while
    # keeping the same "-w True"/"-w False" CLI shape.
    parser.add_argument('--log_to_wandb',
                        '-w',
                        type=lambda s: str(s).strip().lower() in ('true', '1', 'yes', 'y'),
                        default=False,
                        help='Whether or not to log to Weights and Biases')

    args = parser.parse_args(sys.argv[1:])
    """
    Generating experiment from specified functions:
    
    If the user specifies experiment_class, it is assumed that if get_algorithm and/or
        get_offline_algorithm are specified, then they are located there. This is mostly
        just for backwards compatibility.
    Otherwise, get_algorithm and get_offline_algorithm should be fed into launch_experiment,
        which is generally more modular than specifying the class. get_config must be
        specified, either in experiment_class or in the method call.
    load_config is called after the initialization of the config dict, so it can modify any
        values of the dict in place as needed, and must be fed directly.
    """

    experiment_config = dict()
    if experiment_class is not None:
        experiment_config['get_config'] = experiment_class.get_config
        if hasattr(experiment_class, 'get_algorithm'):
            experiment_config['get_algorithm'] = experiment_class.get_algorithm
        if hasattr(experiment_class, 'get_offline_algorithm'):
            experiment_config['get_offline_algorithm'] = \
                experiment_class.get_offline_algorithm

    # Direct function arguments take precedence over the class attributes.
    if get_config is not None:
        experiment_config['get_config'] = get_config
    if get_algorithm is not None:
        experiment_config['get_algorithm'] = get_algorithm
    if get_offline_algorithm is not None:
        experiment_config['get_offline_algorithm'] = get_offline_algorithm

    if load_config is not None:
        experiment_config['load_config'] = load_config

    if sweep_values is None:
        variants = [variant]
    else:
        variants = generate_variants(variant,
                                     sweep_values,
                                     num_seeds=num_seeds)
    """
    Setup in the form to feed into the doodad sweeper.
    """

    if include_date:
        timestamp = datetime.now().strftime('%m-%d')
        exp_name = '%s-%s' % (timestamp, exp_name)

    gpu_id = args.gpu_id
    log_to_wandb = args.log_to_wandb
    sweep_params = dict(
        experiment_config=[experiment_config],
        exp_prefix=[exp_name],
        variant=variants,
        gpu_kwargs=[{
            'mode':
            use_gpu if args.mode != 'ec2' else False,  # don't use GPU with EC2
            'gpu_id': gpu_id
        }],
        log_to_wandb=[log_to_wandb],
    )
    """
    Confirmation
    """

    print('\n')
    print('=' * 50)
    print('Launching experiment: %s' % exp_name)
    print('num variants: %d, num seeds: %d' %
          (len(variants) // num_seeds, num_seeds))
    print('About to launch %d total experiments' % (len(variants)))
    print('=' * 50)
    # BUG FIX: sweep_values may be None (single-variant launch); the original
    # iterated it unconditionally and raised TypeError.
    if sweep_values is not None:
        for k in sweep_values:
            print('%s:' % k, sweep_values[k])
    print('=' * 50)
    print('\n')

    if get_confirmation and not query_yes_no('Confirm?'):
        return
    """
    Standard run_sweep
    """

    local_mount = mount.MountLocal(local_dir=config.BASE_DIR, pythonpath=True)

    docker_mount_point = os.path.join(config.DOCKER_MOUNT_DIR, exp_name)

    sweeper = launcher.DoodadSweeper([local_mount],
                                     docker_img=config.DOCKER_IMAGE,
                                     docker_output_dir=docker_mount_point,
                                     local_output_dir=os.path.join(
                                         config.DATA_DIR, 'local', exp_name))

    # it's annoying to have to set up s3 if we don't want to use it
    # TODO: if you want to use S3, uncomment this
    sweeper.mount_out_s3 = None  # mount.MountS3(s3_path='', mount_point=docker_mount_point, output=True)

    if args.mode == 'ec2':
        print("\n" + "**********" * 10 +
              "\nexp_prefix: {}\nvariants: {}".format(
                  exp_name,
                  len(
                      list(
                          itertools.product(
                              *[value for value in sweep_params.values()])))))

        # Confirm again before incurring AWS costs.
        if query_yes_no("Continue?"):
            sweeper.run_sweep_ec2(run_experiment,
                                  sweep_params,
                                  bucket_name=config.S3_BUCKET_NAME,
                                  instance_type=instance_type,
                                  region='us-east-2',
                                  s3_log_name=exp_name,
                                  add_date_to_logname=False)

    elif args.mode == 'local_docker':
        mode_docker = dd.mode.LocalDocker(image=sweeper.image, )
        run_sweep_doodad(run_experiment,
                         sweep_params,
                         run_mode=mode_docker,
                         mounts=sweeper.mounts)

    elif args.mode == 'local':
        sweeper.run_sweep_serial(run_experiment, sweep_params)

    elif args.mode == 'local_par':
        sweeper.run_sweep_parallel(run_experiment, sweep_params)

    elif args.mode == 'multi_gpu':
        run_sweep_multi_gpu(run_experiment,
                            sweep_params,
                            num_gpu=args.num_gpu,
                            exps_per_gpu=args.exps_per_gpu)

    else:
        raise NotImplementedError('experiment run mode not recognized')
Beispiel #5
0
    # NOTE(review): this is the interior of a function whose signature is not
    # visible here; `args`, `EXP_NAME`, and `run_experiment` are presumably
    # defined by the enclosing scope — confirm against the full file.
    print(config.BASE_DIR)
    # Mount the project source tree into the container; pythonpath=True makes
    # project modules importable inside Docker.
    local_mount = mount.MountLocal(local_dir=config.BASE_DIR, pythonpath=True)
    docker_mount_point = os.path.join(config.DOCKER_MOUNT_DIR, EXP_NAME)

    sweeper = launcher.DoodadSweeper([local_mount],
                                     docker_img=config.DOCKER_IMAGE,
                                     docker_output_dir=docker_mount_point,
                                     local_output_dir=os.path.join(
                                         config.DATA_DIR, 'local', EXP_NAME))
    # S3 output mount is configured unconditionally here (unlike the other
    # examples, which only set it for ec2 mode).
    sweeper.mount_out_s3 = mount.MountS3(s3_path='',
                                         mount_point=docker_mount_point,
                                         output=True)

    if args.mode == 'ec2':
        # EC2 launch of a single trivial sweep ({'alg': [0]}); confirm before
        # incurring AWS costs.
        if query_yes_no("Continue?"):
            sweeper.run_sweep_ec2(run_experiment, {'alg': [0]},
                                  bucket_name=config.S3_BUCKET_NAME,
                                  instance_type='c4.xlarge',
                                  region='us-west-1',
                                  s3_log_name=EXP_NAME,
                                  add_date_to_logname=True)
    elif args.mode == 'local_docker':
        # Same single configuration, but run inside a local Docker container.
        mode_docker = dd.mode.LocalDocker(image=sweeper.image, )
        run_sweep_doodad(run_experiment, {'alg': [0]},
                         run_mode=mode_docker,
                         mounts=sweeper.mounts)

    else:
        # Fallback: run directly in the current process.
        run_experiment()