Example #1
def log_variant(log_file, variant_data):
    mkdir_p(os.path.dirname(log_file))
    if hasattr(variant_data, "dump"):
        variant_data = variant_data.dump()
    variant_json = stub_to_json(variant_data)
    with open(log_file, "w") as f:
        json.dump(variant_json, f, indent=2, sort_keys=True, cls=MyEncoder)
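Every example on this page calls a mkdir_p helper that creates a directory (and any missing parents) without failing when the directory already exists. The project's own implementation is not shown here; a minimal sketch, assuming it simply wraps os.makedirs, would be:

import errno
import os

def mkdir_p(path):
    # Create `path` and all missing parent directories; succeed silently
    # if the directory already exists.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise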
Example #2
def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        log_params["json_args"] = dict()
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)
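The args object here is expected to expose its parameters through __dict__, as an argparse.Namespace does. A hypothetical call, assuming mkdir_p and MyEncoder are available in the same module and using a made-up output path:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--args_data", default=None)
args = parser.parse_args([])

# Every attribute of `args` ends up as a top-level key in the JSON file.
log_parameters_lite("data/local/experiment/params.json", args)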
Example #3
def log_parameters(log_file, args):
    """Log parameters to file."""
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        log_params['json_args'] = dict()
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, 'w') as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=LogEncoder)
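Several of these examples pass a custom encoder (LogEncoder here, MyEncoder elsewhere) to json.dump so that values that are not natively JSON-serializable do not abort the dump. The real encoders are project-specific; a minimal sketch that falls back to a string representation for unknown types could be:

import json

class LogEncoder(json.JSONEncoder):
    def default(self, o):
        # Called only for objects json cannot serialize natively; represent
        # classes by their qualified name and everything else by repr().
        if isinstance(o, type):
            return {"$class": o.__module__ + "." + o.__name__}
        return repr(o)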
Example #4
    def set_dir(self, dir_name):
        if not dir_name:
            if self._writer:
                self._writer.close()
                self._writer = None
        else:
            mkdir_p(dirname(dir_name))
            self._writer_dir = dir_name
            self._writer = tf.summary.FileWriter(dir_name)

            self._layout_writer_dir = dirname(dirname(
                abspath(dir_name))) + '/custom_scalar_config'

            self._default_step = 0
            assert self._writer is not None
Example #5
def log_parameters_lite(log_file, args):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        log_params[param_name] = param_value
    if args.args_data is not None:
        stub_method = pickle.loads(base64.b64decode(args.args_data))
        method_args = stub_method.kwargs
        log_params["json_args"] = dict()
        for k, v in list(method_args.items()):
            log_params["json_args"][k] = stub_to_json(v)
        kwargs = stub_method.obj.kwargs
        for k in ["baseline", "env", "policy"]:
            if k in kwargs:
                log_params["json_args"][k] = stub_to_json(kwargs.pop(k))
        log_params["json_args"]["algo"] = stub_to_json(stub_method.obj)
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True, cls=MyEncoder)
Example #6
def log_parameters(log_file, args, classes):
    log_params = {}
    for param_name, param_value in args.__dict__.items():
        if any([param_name.startswith(x) for x in list(classes.keys())]):
            continue
        log_params[param_name] = param_value
    for name, cls in classes.items():
        if isinstance(cls, type):
            params = get_all_parameters(cls, args)
            params["_name"] = getattr(args, name)
            log_params[name] = params
        else:
            log_params[name] = getattr(cls, "__kwargs", dict())
            log_params[name][
                "_name"] = cls.__module__ + "." + cls.__class__.__name__
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, "w") as f:
        json.dump(log_params, f, indent=2, sort_keys=True)
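Example #6 depends on a get_all_parameters(cls, args) helper that is not shown. A hypothetical stand-in, assuming it reads the constructor signature of cls and pulls matching attributes from args, might look like:

import inspect

def get_all_parameters(cls, args):
    # Hypothetical sketch: collect the constructor parameters of `cls`
    # whose values were supplied on `args`, matched by parameter name.
    sig = inspect.signature(cls.__init__)
    params = {}
    for name in sig.parameters:
        if name == "self":
            continue
        if hasattr(args, name):
            params[name] = getattr(args, name)
    return params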
Example #7
def setup_ec2():
    for region in ["us-east-1", "us-west-1", "us-west-2"]:
        print("Setting up region %s" % region)

        ec2 = boto3.resource(
            "ec2",
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_ACCESS_SECRET,
        )
        ec2_client = boto3.client(
            "ec2",
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_ACCESS_SECRET,
        )
        existing_vpcs = list(ec2.vpcs.all())
        assert len(existing_vpcs) >= 1
        vpc = existing_vpcs[0]
        print("Creating security group in VPC %s" % str(vpc.id))
        try:
            security_group = vpc.create_security_group(
                GroupName='garage-sg', Description='Security group for garage')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
                sgs = list(
                    vpc.security_groups.filter(GroupNames=['garage-sg']))
                security_group = sgs[0]
            else:
                raise e

        ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id]

        ec2_client.create_tags(Resources=[security_group.id],
                               Tags=[{
                                   'Key': 'Name',
                                   'Value': 'garage-sg'
                               }])
        try:
            security_group.authorize_ingress(FromPort=22,
                                             ToPort=22,
                                             IpProtocol='tcp',
                                             CidrIp='0.0.0.0/0')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidPermission.Duplicate':
                pass
            else:
                raise e
        print("Security group created with id %s" % str(security_group.id))

        key_name = 'garage-%s' % region
        try:
            print("Trying to create key pair with name %s" % key_name)
            key_pair = ec2_client.create_key_pair(KeyName=key_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate':
                if not config.query_yes_no(
                    ("Key pair with name %s exists. "
                     "Proceed to delete and recreate?") % key_name, "no"):
                    sys.exit()
                print("Deleting existing key pair with name %s" % key_name)
                ec2_client.delete_key_pair(KeyName=key_name)
                print("Recreating key pair with name %s" % key_name)
                key_pair = ec2_client.create_key_pair(KeyName=key_name)
            else:
                raise e

        key_pair_folder_path = os.path.join(config.PROJECT_PATH, "private",
                                            "key_pairs")
        file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name)

        print("Saving keypair file")
        console.mkdir_p(key_pair_folder_path)
        with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600),
                       'w') as handle:
            handle.write(key_pair['KeyMaterial'] + '\n')

        # adding pem file to ssh
        os.system("ssh-add %s" % file_name)

        ALL_REGION_AWS_KEY_NAMES[region] = key_name
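setup_ec2 reads credentials and writes its results into module-level names that are not part of the snippet (AWS_ACCESS_KEY, AWS_ACCESS_SECRET, ALL_REGION_AWS_SECURITY_GROUP_IDS, ALL_REGION_AWS_KEY_NAMES). A hedged sketch of that surrounding state, with placeholder values:

AWS_ACCESS_KEY = "placeholder-access-key-id"          # normally read from project config
AWS_ACCESS_SECRET = "placeholder-secret-access-key"   # normally read from project config
ALL_REGION_AWS_SECURITY_GROUP_IDS = {}  # region -> [security group id], filled by setup_ec2()
ALL_REGION_AWS_KEY_NAMES = {}           # region -> key pair name, filled by setup_ec2()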
Example #8
def _add_output(file_name, arr, fds, mode='a'):
    if file_name not in arr:
        mkdir_p(os.path.dirname(file_name))
        arr.append(file_name)
        fds[file_name] = open(file_name, mode)
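_add_output is a small bookkeeping helper: it lazily opens a file the first time a given path is registered and remembers the open handle. A hypothetical caller for tabular (CSV) logging might look like:

_tabular_outputs = []   # file paths registered for tabular output
_tabular_fds = {}       # file path -> open file handle

def add_tabular_output(file_name):
    # Hypothetical wrapper around _add_output for CSV logging targets.
    _add_output(file_name, _tabular_outputs, _tabular_fds, mode='a')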
Example #9
def dump_variant(log_file, variant_data):
    """Dump the variant file."""
    mkdir_p(os.path.dirname(log_file))
    with open(log_file, 'w') as f:
        json.dump(variant_data, f, indent=2, sort_keys=True, cls=LogEncoder)
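A hypothetical call site, assuming the variant is a plain dict of experiment settings (anything not JSON-serializable is handled by LogEncoder) and using a made-up path:

variant = {"env": "HalfCheetah-v2", "seed": 1, "batch_size": 4000}
dump_variant("data/local/experiment/variant.json", variant)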
Example #10
    def __init__(self, file_name, mode='w'):
        mkdir_p(os.path.dirname(file_name))
        # Open the log file in child class
        self._log_file = open(file_name, mode)
Example #11
    def set_snapshot_dir(self, dir_name):
        mkdir_p(dir_name)
        self._snapshot_dir = dir_name
Example #12
def to_lab_kube_pod(params,
                    docker_image,
                    code_full_path,
                    python_command="python",
                    script='scripts/run_experiment.py',
                    is_gpu=False,
                    sync_s3_pkl=False,
                    periodic_sync=True,
                    periodic_sync_interval=15,
                    sync_all_data_node_to_s3=False,
                    terminate_machine=True):
    """
    :param params: The parameters for the experiment. If logging directory
    parameters are provided, we will create docker volume mapping to make sure
    that the logging files are created at the correct locations
    :param docker_image: docker image to run the command on
    :param script: script command for running experiment
    :return:
    """
    log_dir = params.get("log_dir")
    remote_log_dir = params.pop("remote_log_dir")
    resources = params.pop("resources")
    node_selector = params.pop("node_selector")
    exp_prefix = params.pop("exp_prefix")

    kube_env = [{
        "name": k,
        "value": v
    } for k, v in (params.pop("env", None) or dict()).items()]
    mkdir_p(log_dir)
    pre_commands = list()
    pre_commands.append('mkdir -p ~/.aws')
    pre_commands.append('mkdir ~/.mujoco')
    # fetch credentials from the kubernetes secret file
    pre_commands.append('echo "[default]" >> ~/.aws/credentials')
    pre_commands.append("echo \"aws_access_key_id = %s\" >> ~/.aws/credentials"
                        % config.AWS_ACCESS_KEY)
    pre_commands.append(
        "echo \"aws_secret_access_key = %s\" >> ~/.aws/credentials" %
        config.AWS_ACCESS_SECRET)
    s3_mujoco_key_path = config.AWS_CODE_SYNC_S3_PATH + '/.mujoco/'
    pre_commands.append('aws s3 cp --recursive {} {}'.format(
        s3_mujoco_key_path, '~/.mujoco'))

    if config.FAST_CODE_SYNC:
        pre_commands.append(
            'aws s3 cp %s /tmp/garage_code.tar.gz' % code_full_path)
        pre_commands.append('mkdir -p %s' % config.DOCKER_CODE_DIR)
        pre_commands.append(
            'tar -zxvf /tmp/garage_code.tar.gz -C %s' % config.DOCKER_CODE_DIR)
    else:
        pre_commands.append('aws s3 cp --recursive %s %s' %
                            (code_full_path, config.DOCKER_CODE_DIR))
    pre_commands.append('cd %s' % config.DOCKER_CODE_DIR)
    pre_commands.append('mkdir -p %s' % (log_dir))

    if sync_all_data_node_to_s3:
        print('Syncing all data from node to s3.')
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                            while /bin/true; do
                                aws s3 sync {log_dir} {remote_log_dir} --region {aws_region} --quiet
                                sleep {periodic_sync_interval}
                            done & echo sync initiated""".format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    else:
        if periodic_sync:
            if sync_s3_pkl:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' --include '*.pkl' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
            else:
                pre_commands.append("""
                    while /bin/true; do
                        aws s3 sync --exclude '*' --include '*.csv' --include '*.json' {log_dir} {remote_log_dir} --region {aws_region} --quiet
                        sleep {periodic_sync_interval}
                    done & echo sync initiated""".format(  # noqa: E501
                    log_dir=log_dir,
                    remote_log_dir=remote_log_dir,
                    aws_region=config.AWS_REGION_NAME,
                    periodic_sync_interval=periodic_sync_interval))
    # copy the file to s3 after execution
    post_commands = list()
    post_commands.append(
        'aws s3 cp --recursive %s %s' % (log_dir, remote_log_dir))
    if not terminate_machine:
        post_commands.append('sleep infinity')
    command_list = list()
    if pre_commands is not None:
        command_list.extend(pre_commands)
    command_list.append("echo \"Running in docker\"")
    command_list.append("{} 2>&1 | tee -a {}".format(
        to_local_command(params, python_command=python_command, script=script),
        "{}/stdouterr.log".format(log_dir)))
    if post_commands is not None:
        command_list.extend(post_commands)
    command = "; ".join(command_list)
    pod_name = config.KUBE_PREFIX + params["exp_name"]
    # underscore is not allowed in pod names
    pod_name = pod_name.replace("_", "-")
    print("Is gpu: ", is_gpu)
    if not is_gpu:
        return {
            "apiVersion": "v1",
            "kind": "Pod",
            "metadata": {
                "name": pod_name,
                "labels": {
                    "owner": config.LABEL,
                    "expt": pod_name,
                    "exp_time": timestamp,
                    "exp_prefix": exp_prefix,
                },
            },
            "spec": {
                "containers": [{
                    "name": "foo",
                    "image": docker_image,
                    "command": [
                        "/bin/bash",
                        "-c",
                        "-li",  # to load conda env file
                        command,
                    ],
                    "resources": resources,
                    "imagePullPolicy": "Always",
                }],
                "restartPolicy": "Never",
                "nodeSelector": node_selector,
                "dnsPolicy": "Default",
            },
        }
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {
            "name": pod_name,
            "labels": {
                "owner": config.LABEL,
                "expt": pod_name,
                "exp_time": timestamp,
                "exp_prefix": exp_prefix,
            },
        },
        "spec": {
            "containers": [{
                "name": "foo",
                "image": docker_image,
                "env": kube_env,
                "command": [
                    "/bin/bash",
                    "-c",
                    "-li",  # to load conda env file
                    command,
                ],
                "resources": resources,
                "imagePullPolicy": "Always",
                # gpu specific
                "volumeMounts": [{
                    "name": "nvidia",
                    "mountPath": "/usr/local/nvidia",
                    "readOnly": True,
                }],
                "securityContext": {
                    "privileged": True,
                },
            }],
            "volumes": [{
                "name": "nvidia",
                "hostPath": {
                    "path": "/var/lib/docker/volumes/nvidia_driver_352.63/_data",
                },
            }],
            "restartPolicy": "Never",
            "nodeSelector": node_selector,
            "dnsPolicy": "Default",
        },
    }
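to_lab_kube_pod only builds the pod specification (timestamp and config are module-level globals of the launcher module); submitting the pod to the cluster is left to the caller. A hedged usage sketch, with placeholder bucket, image, and node-selector values, that writes the spec out for kubectl:

import yaml  # PyYAML

params = {
    "exp_name": "trpo_cartpole_1",
    "exp_prefix": "trpo_cartpole",
    "log_dir": "/tmp/garage/trpo_cartpole_1",
    "remote_log_dir": "s3://my-bucket/logs/trpo_cartpole_1",
    "resources": {"requests": {"cpu": "1"}},
    "node_selector": {"aws/type": "m4.xlarge"},
}
pod = to_lab_kube_pod(params,
                      docker_image="gcr.io/my-project/garage:latest",
                      code_full_path="s3://my-bucket/code/garage.tar.gz")

# The returned dict follows the v1 Pod schema, so it can be written to YAML
# and submitted with `kubectl create -f pod.yaml`.
with open("pod.yaml", "w") as f:
    yaml.safe_dump(pod, f, default_flow_style=False)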