Example #1
def log_variant(log_file, variant_data):
    mkdir_p(os.path.dirname(log_file))
    if hasattr(variant_data, "dump"):
        variant_data = variant_data.dump()
    variant_json = stub_to_json(variant_data)
    with open(log_file, "w") as f:
        json.dump(variant_json, f, indent=2, sort_keys=True, cls=MyEncoder)
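Usage sketch (hedged): assuming log_variant and its helpers (mkdir_p, stub_to_json, MyEncoder) are imported from the surrounding module, a plain dict can be logged like this; the path and keys below are illustrative.
variant = {"seed": 0, "batch_size": 4000, "env_name": "CartPole-v0"}
# Creates data/local/exp_001/ if needed, then writes the variant as indented,
# key-sorted JSON; objects exposing a dump() method would be serialized via dump().
log_variant("data/local/exp_001/variant.json", variant)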
Example #2
def to_docker_command(params, docker_image, python_command="python", script='scripts/run_experiment.py',
                      pre_commands=None, use_tty=False,
                      post_commands=None, dry=False, use_gpu=False, env=None, local_code_dir=None):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    # script = 'rllab/' + script
    if not dry:
        mkdir_p(log_dir)
    # create volume for logging directory
    if use_gpu:
        command_prefix = "nvidia-docker run"
    else:
        command_prefix = "docker run"
    docker_log_dir = config.DOCKER_LOG_DIR
    if env is not None:
        for k, v in env.items():
            command_prefix += " -e \"{k}={v}\"".format(k=k, v=v)
    command_prefix += " -v {local_mujoco_key_dir}:{docker_mujoco_key_dir}".format(
        local_mujoco_key_dir=config.MUJOCO_KEY_PATH, docker_mujoco_key_dir='/root/.mujoco')
    command_prefix += " -v {local_log_dir}:{docker_log_dir}".format(
        local_log_dir=log_dir,
        docker_log_dir=docker_log_dir
    )
    if local_code_dir is None:
        local_code_dir = config.PROJECT_PATH
    command_prefix += " -v {local_code_dir}:{docker_code_dir}".format(
        local_code_dir=local_code_dir,
        docker_code_dir=config.DOCKER_CODE_DIR
    )
    params = dict(params, log_dir=docker_log_dir)
    if use_tty:
        command_prefix += " -ti " + docker_image + " /bin/bash -c "
    else:
        command_prefix += " -i " + docker_image + " /bin/bash -c "
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(to_local_command(
        params, python_command=python_command, script=osp.join(config.DOCKER_CODE_DIR, script), use_gpu=use_gpu))
    # We sleep for 2 min after termination to allow for last syncs.
    if post_commands is None:
        post_commands = ['sleep 120']
    command_list.extend(post_commands)
    return command_prefix + "'" + "; ".join(command_list) + "'"
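Usage sketch (hedged): the config.* constants and to_local_command come from the surrounding codebase; the image name and parameter values below are illustrative.
params = {"log_dir": "data/local/exp_001", "exp_name": "exp_001", "seed": 0}
cmd = to_docker_command(
    params,
    docker_image="myorg/rllab:latest",
    pre_commands=["echo 'preparing experiment'"],
    env={"OMP_NUM_THREADS": "1"},
)
# cmd is a single shell string of the form
#   docker run -e "OMP_NUM_THREADS=1" -v ... myorg/rllab:latest /bin/bash -c '...; sleep 120'
# and can be executed with, e.g., subprocess.run(cmd, shell=True).
print(cmd)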
Example #3
def to_docker_command(params, docker_image, python_command="python", script='scripts/run_experiment.py',
                      pre_commands=None, use_tty=False,
                      post_commands=None, dry=False, use_gpu=False, env=None, local_code_dir=None):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    # script = 'rllab/' + script
    if not dry:
        mkdir_p(log_dir)
    # create volume for logging directory
    if use_gpu:
        command_prefix = "nvidia-docker run"
    else:
        command_prefix = "docker run"
    docker_log_dir = config.DOCKER_LOG_DIR
    if env is not None:
        for k, v in env.items():
            command_prefix += " -e \"{k}={v}\"".format(k=k, v=v)
    command_prefix += " -v {local_mujoco_key_dir}:{docker_mujoco_key_dir}".format(
        local_mujoco_key_dir=config.MUJOCO_KEY_PATH, docker_mujoco_key_dir='/root/.mujoco')
    command_prefix += " -v {local_log_dir}:{docker_log_dir}".format(
        local_log_dir=log_dir,
        docker_log_dir=docker_log_dir
    )
    if local_code_dir is None:
        local_code_dir = config.PROJECT_PATH
    command_prefix += " -v {local_code_dir}:{docker_code_dir}".format(
        local_code_dir=local_code_dir,
        docker_code_dir=config.DOCKER_CODE_DIR
    )
    params = dict(params, log_dir=docker_log_dir)
    if use_tty:
        command_prefix += " -ti " + docker_image + " /bin/bash -c "
    else:
        command_prefix += " -i " + docker_image + " /bin/bash -c "
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(to_local_command(
        params, python_command=python_command, script=osp.join(config.DOCKER_CODE_DIR, script), use_gpu=use_gpu))
    # We sleep for 2 min after termination to allow for last syncs.
    if post_commands is None:
        post_commands = ['sleep 120']
    command_list.extend(post_commands)
    return command_prefix + "'" + "; ".join(command_list) + "'"
Example #4
def log_parameters(log_file, args, classes):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in classes.items():
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name]["_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)
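Usage sketch (hedged): args is normally the argparse namespace built by the experiment launcher, and the values in classes are real classes or constructed objects; the stand-ins below only illustrate the grouping behaviour.
from types import SimpleNamespace

args = SimpleNamespace(seed=0, n_itr=100, algo_batch_size=4000)

class DummyAlgo:
    pass

algo = DummyAlgo()
# log_parameters looks up a literal "__kwargs" attribute on non-class values,
# so we attach one directly to the instance for this sketch.
setattr(algo, "__kwargs", {"batch_size": 4000})

# "algo_batch_size" is grouped under the "algo" entry instead of being logged
# at the top level, because its name starts with a key of classes.
log_parameters("data/local/exp_001/params.json", args, {"algo": algo})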
Example #5
def log_parameters(log_file, args, classes):
    log_params = {}
    for param_name, param_value in list(args.__dict__.items()):
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in list(classes.items()):
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name]["_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)
Example #6
def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in list(args.__dict__.items()):
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)
Example #7
def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)
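Usage sketch (hedged) for the simple path where args_data is None, so the base64/pickle branch is skipped and the remaining attributes are dumped as-is; the field names are illustrative.
from types import SimpleNamespace

args = SimpleNamespace(exp_name="exp_001", seed=0, n_parallel=1, args_data=None)
log_parameters_lite("data/local/exp_001/params.json", args)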
Example #8
def setup_ec2():
    for region in ["us-east-1", "us-west-1", "us-west-2"]:
        print("Setting up region %s" % region)

        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='rllab-sg', Description='Security group for rllab')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(vpc.security_groups.filter(GroupNames=['rllab-sg']))
                security_group = sgs[0]
            else:
                raise e

        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]

        ec2_client.create_tags(Resources=[security_group.id],
                               Tags=[{
                                   'Key': 'Name',
                                   'Value': 'rllab-sg'
                               }])
        try:
            security_group.authorize_ingress(FromPort=22,
                                             ToPort=22,
                                             IpProtocol='tcp',
                                             CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))

        key_name = 'rllab-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not query_yes_no(
                        "Key pair with name %s exists. Proceed to delete and recreate?"
                        % key_name, "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e

        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private",
                                            "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)

        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600),
                       'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')

        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)

        ALL_REGION_AWS_KEY_NAMES[region] = key_name
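Setup sketch (hedged): the function relies on module-level globals, presumably supplied by the project config in the original script; for illustration they are read from environment variables here, and the variable names are placeholders.
import os

ACCESS_KEY = os.environ["AWS_ACCESS_KEY"]
ACCESS_SECRET = os.environ["AWS_ACCESS_SECRET"]
# Filled in per region as security groups and key pairs are created.
ALL_REGION_AWS_SECURITY_GROUP_IDS = {}
ALL_REGION_AWS_KEY_NAMES = {}

setup_ec2()
print(ALL_REGION_AWS_SECURITY_GROUP_IDS)
print(ALL_REGION_AWS_KEY_NAMES)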
Example #9
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        python_command="python",
        script='scripts/run_experiment.py',
        is_gpu=False,
        sync_s3_pkl=False,
        periodic_sync=True,
        periodic_sync_interval=15,
        sync_all_data_node_to_s3=False,
        terminate_machine=True
):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")

    kube_env = [
        {"name": k, "value": v}
        for k, v in (params.pop("env", None) or dict()).items()
        ]
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    s3_mujoco_key_path = config.AWS_CODE_SYNC_S3_PATH + '/.mujoco/'
    pre_commands.append(
        'aws s3 cp --recursive {} {}'.format(s3_mujoco_key_path, '~/.mujoco'))

    if config.FAST_CODE_SYNC:
        pre_commands.append('aws s3 cp %s /tmp/rllab_code.tar.gz' % code_full_path)
        pre_commands.append('mkdir -p %s' % config.DOCKER_CODE_DIR)
        pre_commands.append('tar -zxvf /tmp/rllab_code.tar.gz -C %s' % config.DOCKER_CODE_DIR)
    else:
        pre_commands.append('aws s3 cp --recursive %s %s' %
                            (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % config.DOCKER_CODE_DIR)
    pre_commands.append('mkdir -p %s' %
                        (log_dir))

    if sync_all_data_node_to_s3:
        print('Syncing all data from node to s3.')
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                                                 aws_region=config.AWS_REGION_NAME,
                                                                 periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                                                 aws_region=config.AWS_REGION_NAME,
                                                                 periodic_sync_interval=periodic_sync_interval))
    else:
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' --include '*.pkl' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                                         aws_region=config.AWS_REGION_NAME,
                                                         periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                                         aws_region=config.AWS_REGION_NAME,
                                                         periodic_sync_interval=periodic_sync_interval))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' %
                         (log_dir,
                          remote_log_dir))
    if not terminate_machine:
        post_commands.append('sleep infinity')
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, python_command=python_command, script=script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
                "dnsPolicy": "Default",
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "env": kube_env,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
            "dnsPolicy": "Default",
        }
    }
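Usage sketch (hedged): the required params keys mirror the ones popped above, and the function also relies on module globals such as timestamp and config; all values, bucket names and selectors below are illustrative.
import json

params = {
    "exp_name": "exp_001_seed_0",
    "exp_prefix": "exp_001",
    "log_dir": "data/local/exp_001_seed_0",
    "remote_log_dir": "s3://my-bucket/rllab/exp_001_seed_0",
    "resources": {"requests": {"cpu": "1"}},
    "node_selector": {"aws/type": "m4.xlarge"},
    "seed": 0,
}
pod = to_lab_kube_pod(
    params,
    docker_image="myorg/rllab:latest",
    code_full_path="s3://my-bucket/code/exp_001.tar.gz",
)
# Pod manifests are plain JSON/YAML, so the returned dict can be written out
# and submitted with `kubectl create -f pod.json`.
with open("pod.json", "w") as f:
    json.dump(pod, f, indent=2)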
Example #10
def to_lab_kube_pod(params,
                    docker_image,
                    code_full_path,
                    python_command="python",
                    script='scripts/run_experiment.py',
                    is_gpu=False,
                    sync_s3_pkl=False,
                    periodic_sync=True,
                    periodic_sync_interval=15,
                    sync_all_data_node_to_s3=False,
                    terminate_machine=True):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")

    kube_env = [{
        "name": k,
        "value": v
    } for k, v in (params.pop("env", None) or dict()).items()]
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" %
        config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" %
        config.AWS_ACCESS_SECRET)
    s3_mujoco_key_path = config.AWS_CODE_SYNC_S3_PATH + '/.mujoco/'
    pre_commands.append('aws s3 cp --recursive {} {}'.format(
        s3_mujoco_key_path, '~/.mujoco'))

    if config.FAST_CODE_SYNC:
        pre_commands.append('aws s3 cp %s /tmp/rllab_code.tar.gz' %
                            code_full_path)
        pre_commands.append('mkdir -p %s' % config.DOCKER_CODE_DIR)
        pre_commands.append('tar -zxvf /tmp/rllab_code.tar.gz -C %s' %
                            config.DOCKER_CODE_DIR)
    else:
        pre_commands.append('aws s3 cp --recursive %s %s' %
                            (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % config.DOCKER_CODE_DIR)
    pre_commands.append('mkdir -p %s' % (log_dir))

    if sync_all_data_node_to_s3:
        print('Syncing all data from node to s3.')
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    else:
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' --include '*.pkl' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' %
                         (log_dir, remote_log_dir))
    if not terminate_machine:
        post_commands.append('sleep infinity')
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" %
        (to_local_command(params, python_command=python_command,
                          script=script), "%s/stdouterr.log" % log_dir))
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [{
                    "name":
                    "foo",
                    "image":
                    docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources":
                    resources,
                    "imagePullPolicy":
                    "Always",
                }],
                "restartPolicy":
                "Never",
                "nodeSelector":
                node_selector,
                "dnsPolicy":
                "Default",
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [{
                "name":
                "foo",
                "image":
                docker_image,
                "env":
                kube_env,
                "command": [
                    "/bin/bash",
                    "-c",
                    "-li",  # to load conda env file
                    command,
                ],
                "resources":
                resources,
                "imagePullPolicy":
                "Always",
                # gpu specific
                "volumeMounts": [{
                    "name": "nvidia",
                    "mountPath": "/usr/local/nvidia",
                    "readOnly": True,
                }],
                "securityContext": {
                    "privileged": True,
                }
            }],
            "volumes": [{
                "name": "nvidia",
                "hostPath": {
                    "path":
                    "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                }
            }],
            "restartPolicy":
            "Never",
            "nodeSelector":
            node_selector,
            "dnsPolicy":
            "Default",
        }
    }
Example #11
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        script='scripts/run_experiment.py', is_gpu=False
):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    pre_commands.append('aws s3 cp --recursive %s %s' %
                        (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' %
                        (config.DOCKER_CODE_DIR))
    pre_commands.append('mkdir -p %s' %
                        (log_dir))
    pre_commands.append("""
        while /bin/true; do
            aws s3 sync --exclude *.pkl {log_dir} {remote_log_dir} --region {aws_region}
            sleep 5
        done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                             aws_region=config.AWS_REGION_NAME))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' %
                         (log_dir,
                          remote_log_dir))
    # command = to_docker_command(params, docker_image=docker_image, script=script,
    #                             pre_commands=pre_commands,
    #                             post_commands=post_commands)
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print "Is gpu: ", is_gpu
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
        }
    }
Example #12
parser.add_argument(
    'file',
    type=str,
    help='Path to the snapshot file. Usually it is a ".pkl" file')
parser.add_argument('--env', type=str, help='Name of the environment')
parser.add_argument('--visualize_conv',
                    type=bool,
                    default=True,
                    help='Visualize convolution layer')
parser.add_argument('--max_path_length',
                    type=int,
                    default=np.inf,
                    help='Maximal path length ')
args = parser.parse_args()

analysis_log_dir = os.path.join('analysis', args.env)
mkdir_p(analysis_log_dir)

# If the snapshot file uses tensorflow, do:
# import tensorflow as tf
# with tf.Session():
#     [rest of the code]
with tf.Session() as sess:
    data = joblib.load(args.file)
    policy = data['policy']
    if args.env:
        agent_history_length = 4
        resized_shape = (84, 84)
        env = GymEnv(env_name=args.env,
                     record_video=True,
                     log_dir=analysis_log_dir,
                     record_log=True,
Example #13
def to_lab_kube_pod(
        params, docker_image, code_full_path,
        script='scripts/run_experiment.py', is_gpu=False
):
    """
    :param params: The parameters for the experiment. If logging directory parameters are provided, we will create
    a docker volume mapping to make sure that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append(
        "echo \"aws_access_key_id = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" % config.AWS_ACCESS_SECRET)
    pre_commands.append('cd %s' % '/root/code/')
    pre_commands.append('mkdir -p %s' % log_dir)
    pre_commands.append("""
        while /bin/true; do
            aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
            sleep 15
        done & echo sync initiated""".format(log_dir=log_dir, remote_log_dir=remote_log_dir,
                                             aws_region=config.AWS_REGION_NAME))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append('aws s3 cp --recursive %s %s' % (log_dir, remote_log_dir))
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append(
        "%s 2>&1 | tee -a %s" % (
            to_local_command(params, script),
            "%s/stdouterr.log" % log_dir
        )
    )
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [
                    {
                        "name": "foo",
                        "image": docker_image,
                        "command": [
                            "/bin/bash",
                            "-c",
                            "-li",  # to load conda env file
                            command,
                        ],
                        "resources": resources,
                        "imagePullPolicy": "Always",
                    }
                ],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
            }
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [
                {
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                    # gpu specific
                    "volumeMounts": [
                        {
                            "name": "nvidia",
                            "mountPath": "/usr/local/nvidia",
                            "readOnly": True,
                        }
                    ],
                    "securityContext": {
                        "privileged": True,
                    }
                }
            ],
            "volumes": [
                {
                    "name": "nvidia",
                    "hostPath": {
                        "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                    }
                }
            ],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
        }
    }
Example #14
def _add_output(file_name, arr, fds, mode='a'):
    if file_name not in arr:
        mkdir_p(os.path.dirname(file_name))
        arr.append(file_name)
        fds[file_name] = open(file_name, mode)
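Usage sketch (hedged): in the logger module arr and fds are shared bookkeeping structures (the list of registered files and their open handles), so registering the same path twice is a no-op; the path below is illustrative.
text_outputs = []
text_fds = {}

_add_output("data/local/exp_001/debug.log", text_outputs, text_fds, mode='a')
_add_output("data/local/exp_001/debug.log", text_outputs, text_fds, mode='a')  # already registered, no-op

text_fds["data/local/exp_001/debug.log"].write("experiment started\n")
text_fds["data/local/exp_001/debug.log"].flush()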
Example #15
def to_img(obs, frame_size=(100, 100)):
    return cv2.resize(np.cast['uint8'](obs), frame_size)
    # return cv2.resize(np.cast['uint8']((obs / 2 + 0.5) * 255.0), frame_size)
    # return obs


with tf.Session() as sess:
    np.random.seed(0)
    random.seed(0)

    pkl_file = osp.join(
        config.PROJECT_PATH,
        '/Users/florensacc/Library/goal-rl/rllab_goal_rl/sandbox/dave/upload/maze_best/itr_199/itr_4.pkl'
    )
    output_path = osp.join(config.PROJECT_PATH, "data/video/goalGAN_maze")
    console.mkdir_p(output_path)

    # import pdb; pdb.set_trace()
    data = joblib.load(pkl_file)

    policy = data["policy"]

    env = data["env"]
    # env = SwimmerEnv()

    for idx in range(7, 8):

        encoder = ImageEncoder(output_path=osp.join(
            output_path, '%d_goalGAN_maze.mp4' % idx),
                               frame_shape=frame_size + (3, ),
                               frames_per_sec=15)
Example #16
def _add_output(file_name, arr, fds, mode='a'):
    if file_name not in arr:
        mkdir_p(os.path.dirname(file_name))
        arr.append(file_name)
        fds[file_name] = open(file_name, mode)
Example #17
def setup_ec2():
    #for region in ["us-east-1", "us-west-1", "us-west-2"]:
    for region in ["us-west-1"]:
        print("Setting up region %s" % region)

        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=ACCESS_KEY,
            aws_secret_access_key=ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='rllab-sg', Description='Security group for rllab'
            )
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(vpc.security_groups.filter(GroupNames=['rllab-sg']))
                security_group = sgs[0]
            else:
                raise e

        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]

        import pdb; pdb.set_trace()

        ec2_client.create_tags(Resources=[security_group.id], Tags=[{'Key': 'Name', 'Value': 'rllab-sg'}])
        try:
            security_group.authorize_ingress(FromPort=22, ToPort=22, IpProtocol='tcp', CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))

        key_name = 'rllab-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not query_yes_no("Key pair with name %s exists. Proceed to delete and recreate?" % key_name, "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e

        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private", "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)

        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600), 'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')

        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)

        ALL_REGION_AWS_KEY_NAMES[region] = key_name