Example 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='ppaquette/SuperMarioBros-1-1-v0')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    args = parser.parse_args()

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])

    icm_model = make_icm(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])

    env_name = args.env
    actions = np.arange(14).tolist()
    master = Agent(model, icm_model, len(actions), name='global')

    global_step = tf.Variable(0, dtype=tf.int64, name='global_step')

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    workers = []
    for i in range(args.threads):
        render = False
        if args.render and i == 0:
            render = True
        worker = Worker('worker{}'.format(i), model, icm_model, global_step, env_name, render=render)
        workers.append(worker)

    summary_writer = tf.summary.FileWriter('log', sess.graph)

    if args.render:
        sample_worker = workers.pop(0)

    initialize()

    coord = tf.train.Coordinator()
    threads = []
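    # launch one training thread per worker, staggering the starts slightly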
    for i in range(len(workers)):
        # bind the current worker through a default argument; a bare closure
        # over `i` is subject to late binding and can start the wrong worker
        worker_thread = lambda worker=workers[i]: worker.run(sess, summary_writer, saver)
        thread = threading.Thread(target=worker_thread)
        thread.start()
        threads.append(thread)
        time.sleep(0.1)

    if args.render:
        sample_worker.run(sess, summary_writer)

    coord.join(threads)
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='MyAnt-v1')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    env = gym.make(args.env)

    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    network = make_network([100, 100, 100])
    # critic = make_critic_network()

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions, None)

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    global_step = 0
    episode = 0

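    # run episodes indefinitely, printing the cumulative reward of each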
    while True:
        sum_of_rewards = 0
        done = False
        step = 0
        state = env.reset()

        while True:
            if args.render:
                env.render()

            action = agent.act(state)

            if done:
                break
            print(action)
            state, reward, done, info = env.step(action)

            sum_of_rewards += reward
            step += 1
            global_step += 1

        episode += 1

    print('Episode: {}, Step: {}, Reward: {}'.format(
                episode, global_step, sum_of_rewards))
Example 3
def build_vr_loss(convs,
                  fcs,
                  padding,
                  lstm,
                  obs_t,
                  actions_tm1,
                  rewards_t,
                  num_actions,
                  lstm_unit,
                  returns_t):
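    # value function replay (VR) loss: regress the value output of the shared
    # network (reused via scope='model') onto the sampled returns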
    init_state = tf.zeros((1, lstm_unit), dtype=tf.float32)
    rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(init_state, init_state)
    _, value_t, _ = make_network(convs, fcs, padding, lstm, obs_t, actions_tm1,
                                 rewards_t, rnn_state_tuple, num_actions,
                                 lstm_unit, scope='model', reuse=True)
    returns_t = tf.reshape(returns_t, [-1, 1])
    loss = tf.reduce_sum((returns_t - value_t) ** 2)
    return loss
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='ppaquette/SuperMarioBros-1-1-v0')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--load', type=str)
    args = parser.parse_args()

    sess = tf.Session()
    sess.__enter__()

    model = make_network([[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0],
                          [32, 3, 2, 0]])
    icm_model = make_icm([[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0],
                          [32, 3, 2, 0]])

    env_name = args.env
    actions = np.arange(14).tolist()

    global_step = tf.Variable(0, dtype=tf.int64, name='global_step')

    worker = Worker('global',
                    model,
                    icm_model,
                    global_step,
                    env_name,
                    render=args.render,
                    training=False)
    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')

    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    summary_writer = tf.summary.FileWriter('log', sess.graph)

    worker.run(sess, summary_writer, saver)
Example 5
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    # box environment
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(env_name)
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            # atari specific preprocessing
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS,
        lstm=constants.LSTM, padding=constants.PADDING)

    # share Adam optimizer with all threads!
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)


    master = make_agent(
        model, actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            model, actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

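    # decay the learning rate (when linear decay is configured) and checkpoint
    # the model every 10**6 shared steps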
    def after_action(state, reward, shared_step, global_step, local_step):
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
Example 6
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0]
        else:
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver()
    if args.load:
        saver.restore(sess, args.load)

    # create environments
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)

    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

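    # checkpoint the model every 10**6 global steps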
    def after_action(state, reward, global_step, local_step):
        if global_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo
    )
    trainer.start()
Example 7
def build_train(convs,
                fcs,
                padding,
                lstm,
                num_actions,
                optimizer,
                lstm_unit=256,
                state_shape=[84, 84, 1],
                grad_clip=40.0,
                value_factor=0.5,
                policy_factor=1.0,
                entropy_factor=0.01,
                rp_frame=3,
                scope='a3c',
                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # placeholders
        obs_t_ph = tf.placeholder(tf.float32, [None] + state_shape, name='obs_t')
        rnn_state_ph0 = tf.placeholder(
            tf.float32, [1, lstm_unit], name='rnn_state_0')
        rnn_state_ph1 = tf.placeholder(
            tf.float32, [1, lstm_unit], name='rnn_state_1')
        actions_tm1_ph = tf.placeholder(tf.int32, [None], name="action_tm1")
        rewards_t_ph = tf.placeholder(tf.float32, [None], name="reward_t")

        # placeholders for A3C update
        actions_t_ph = tf.placeholder(tf.uint8, [None], name='action_t')
        returns_t_ph = tf.placeholder(tf.float32, [None], name='return_t')
        advantages_t_ph = tf.placeholder(tf.float32, [None], name='advantage_t')

        # placeholders for reward prediction update
        rp_obs_ph = tf.placeholder(
            tf.float32, [rp_frame] + state_shape, name='rp_obs')
        rp_reward_tp1_ph = tf.placeholder(tf.int32, [], name='rp_reward_tp1')

        # placeholders for value function replay update
        vr_obs_t_ph = tf.placeholder(
            tf.float32, [None] + state_shape, name='vr_obs_t')
        vr_actions_tm1_ph = tf.placeholder(tf.int32, [None], name='vr_action_tm1')
        vr_rewards_t_ph = tf.placeholder(tf.float32, [None], name='vr_reward_t')
        vr_returns_t_ph = tf.placeholder(tf.float32, [None], name='vr_returns_t')

        # rnn state in tuple
        rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(
            rnn_state_ph0, rnn_state_ph1)

        # network outputs
        actions_tm1_one_hot = tf.one_hot(
            actions_tm1_ph, num_actions, dtype=tf.float32)
        policy_t, value_t, state_out = make_network(
            convs, fcs, padding, lstm, obs_t_ph, actions_tm1_one_hot,
            rewards_t_ph, rnn_state_tuple, num_actions, lstm_unit, scope='model')

        actions_t_one_hot = tf.one_hot(actions_t_ph, num_actions, dtype=tf.float32)
        log_policy_t = tf.log(tf.clip_by_value(policy_t, 1e-20, 1.0))
        log_prob = tf.reduce_sum(
            log_policy_t * actions_t_one_hot, axis=1, keep_dims=True)

        # A3C loss
        advantages_t = tf.reshape(advantages_t_ph, [-1, 1])
        returns_t = tf.reshape(returns_t_ph, [-1, 1])
        with tf.variable_scope('value_loss'):
            value_loss = tf.reduce_sum((returns_t - value_t) ** 2)
        with tf.variable_scope('entropy_penalty'):
            entropy = -tf.reduce_sum(policy_t * log_policy_t)
        with tf.variable_scope('policy_loss'):
            policy_loss = tf.reduce_sum(log_prob * advantages_t)
        a3c_loss = value_factor * value_loss\
            - policy_factor * policy_loss - entropy_factor * entropy

        # reward prediction loss
        rp_loss = build_rp_loss(
            convs, padding, rp_frame, rp_obs_ph, rp_reward_tp1_ph)

        vr_actions_tm1_one_hot = tf.one_hot(
            vr_actions_tm1_ph, num_actions, dtype=tf.float32)
        vr_loss = build_vr_loss(convs, fcs, padding, lstm, vr_obs_t_ph,
                                vr_actions_tm1_one_hot, vr_rewards_t_ph,
                                num_actions, lstm_unit, vr_returns_t_ph)

        # final loss
        loss = a3c_loss + rp_loss + vr_loss

        # local network weights
        local_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        # global network weights
        global_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')

        # gradients
        gradients = tf.gradients(loss, local_vars)
        gradients, _ = tf.clip_by_global_norm(gradients, grad_clip)

        optimize_expr = optimizer.apply_gradients(zip(gradients, global_vars))

        update_local_expr = []
        for local_var, global_var in zip(local_vars, global_vars):
            update_local_expr.append(local_var.assign(global_var))
        update_local_expr = tf.group(*update_local_expr)

        def update_local():
            sess = tf.get_default_session()
            sess.run(update_local_expr)

        def train(obs_t, rnn_state0, rnn_state1, actions_t, rewards_t,
                  actions_tm1, returns_t, advantages_t, rp_obs, rp_reward_tp1,
                  vr_obs_t, vr_actions_tm1, vr_rewards_t, vr_returns_t):
            feed_dict = {
                obs_t_ph: obs_t,
                rnn_state_ph0: rnn_state0,
                rnn_state_ph1: rnn_state1,
                actions_t_ph: actions_t,
                actions_tm1_ph: actions_tm1,
                rewards_t_ph: rewards_t,
                returns_t_ph: returns_t,
                advantages_t_ph: advantages_t,
                rp_obs_ph: rp_obs,
                rp_reward_tp1_ph: rp_reward_tp1,
                vr_obs_t_ph: vr_obs_t,
                vr_actions_tm1_ph: vr_actions_tm1,
                vr_rewards_t_ph: vr_rewards_t,
                vr_returns_t_ph: vr_returns_t
            }
            sess = tf.get_default_session()
            return sess.run([loss, optimize_expr], feed_dict=feed_dict)[0]

        def act(obs_t, actions_tm1, rewards_t, rnn_state0, rnn_state1):
            feed_dict = {
                obs_t_ph: obs_t,
                actions_tm1_ph: actions_tm1,
                rewards_t_ph: rewards_t,
                rnn_state_ph0: rnn_state0,
                rnn_state_ph1: rnn_state1
            }
            sess = tf.get_default_session()
            return sess.run([policy_t, value_t, state_out], feed_dict=feed_dict)

    return act, train, update_local
Example 8
import tensorflow as tf
import numpy as np

from network import make_network
from data_provider import DataProvider
from tensorflow.core.protobuf import saver_pb2

import time
import os

from IPython import embed

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    network = make_network()
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V2)
    saver.restore(sess, './data/step-10500.ckpt')

    val_provider = DataProvider('val.tfrecords', sess)

    one_batch = val_provider.get_minibatch()

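    # run inference one image (plus its speed input) at a time over the validation minibatch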
    for i in range(120):
        one_image = one_batch.images[i, ...][None]
        one_speed = one_batch.data[0][i][None]
        a = time.time()
        target_control, = sess.run(network['outputs'],
                                   feed_dict={
                                       network['inputs'][0]: one_image,
                                       network['inputs'][1]: one_speed
Example 9
                                                            [0.229, 0.224, 0.225])])
data_transforms = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

training_dataset = datasets.ImageFolder(train_dir, transform=training_transforms)
validation_dataset = datasets.ImageFolder(valid_dir, transform=data_transforms)

trainloader = torch.utils.data.DataLoader(training_dataset, batch_size=64, shuffle=True)
validationloader = torch.utils.data.DataLoader(validation_dataset, batch_size=32, shuffle=True)

with open('cat_to_name.json', 'r') as f:
    cat_to_name = json.load(f)
    
model = make_network(arch, hidden_units)
   
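# train the classifier head with negative log-likelihood loss and the Adam optimizer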
def train_network(model):
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)
    print_every = 40
    if gpu:
        model.to('cuda')
    curr_epoch = 0
    for e in range(epochs): 
        curr_epoch += 1
        training_loss = 0
        steps = 0
        for images, labels in iter(trainloader):
            steps += 1
            if gpu:
Example 10
        #beta = np.broadcast_to(beta[:, np.newaxis, np.newaxis, np.newaxis], shp)
        alpha1.set_value(alpha)
        alpha2.set_value(1 - alpha)
        beta1.set_value(beta)
        beta2.set_value(1 - beta)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    os.system("rm -r tbdata/")
    tb = TB("tbdata/")

    with TrainingEnv(name="lyy.{}.test".format(net_name),
                     part_count=2,
                     custom_parser=parser) as env:
        net, SS_list = make_network(minibatch_size=minibatch_size)
        preloss = net.loss_var
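        # apply weight decay (1e-4) to conv/fc/bnaff/offset parameters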
        net.loss_var = WeightDecay(net.loss_var, {
            "*conv*:W": 1e-4,
            "*fc*:W": 1e-4,
            "*bnaff*:k": 1e-4,
            "*offset*": 1e-4
        })

        train_func = env.make_func_from_loss_var(net.loss_var,
                                                 "train",
                                                 train_state=True)

        lr = 0.1
        optimizer = Momentum(lr, 0.9)
        optimizer(train_func)
Example 11
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    env = gym.make(args.env)

    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    network = make_network([512, 256, 128])
    # unresolved merge alternative: make_network([100, 100, 100])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()
    agent.sync_old()

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)
Example 12
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str) # checkpoint path to load
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true') # run in demo mode (no training)
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0] # for a continuous action space, num_actions is the action dimensionality
        else:
            num_actions = tmp_env.action_space.n # for a discrete action space, num_actions is the number of selectable actions
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]
        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0
        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0]) # a transformation function

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box) # 'gym.spaces.Box' means continuous action space
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # make_network returns a function instance (a lambda): an MLP for
    # continuous action spaces, a CNN for discrete ones
    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model, # !!!
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver(max_to_keep=5)
    if args.load:
        saver.restore(sess, args.load)
    else:
        # initialize variables only when no checkpoint is loaded, so the
        # restored weights are not overwritten
        sess.run(tf.global_variables_initializer())
    # create environments
    envs = []
    for i in range(constants.ACTORS): # 8 actors
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        ) 
        envs.append(wrapped_env) # append all wrapped_envs
    batch_env = BatchEnvWrapper(envs) # envs is a list

    # note: do not call tf.global_variables_initializer() here, or it would
    # overwrite the weights restored from the checkpoint above

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s) # record the reward of each episode

    # after each action, check whether the model needs to be saved
    def after_action(state, reward, global_step, local_step):
        # save roughly every 10**5 steps (demo mode never saves); a small
        # window (<= 10) is used because global_step may skip exact multiples of 10**5
        if global_step % 10**5 <= 10 and not args.demo:
            path = os.path.join(outdir, 'model.ckpt')
            print('model saved, global step:{}'.format(global_step))
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent, # Agent instance
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP, # final_step is a total time step limit
        # final_step=12345,
        after_action=after_action, # callback function after an action
        end_episode=end_episode,
        training=not args.demo # with --demo, run without training the policy and value networks
    )
    trainer.start()
Example 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--outdir',
                        type=str,
                        default="/home/aditya/NIPS18/output")
    parser.add_argument('--logdir',
                        type=str,
                        default="/home/aditya/NIPS18/output")
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-steps', type=int, default=10**7)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    #env = gym.make(args.env)
    env = ProstheticsEnv(visualize=False)
    obs_dim = 160
    n_actions = env.action_space.shape[0]

    network = make_network([128, 128, 128])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()
    agent.sync_old()

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    reward_summary = tf.placeholder(tf.int32, (), name='reward_summary')
    tf.summary.scalar('reward_summary', reward_summary)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.logdir, sess.graph)

    global_step = 0
    episode = 0
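    # outer loop: collect experience until roughly 2048 steps have elapsed,
    # then update the agent on randomly sampled minibatches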
    while True:
        local_step = 0

        while True:
            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(last_obs, last_action,
                                                    last_value, reward, obs)

                last_obs = obs
                last_action = action
                last_value = value
                # clip with numpy; tf.clip_by_value would return an
                # unevaluated tensor, which env.step cannot consume
                action = np.clip(action, 1e-10, 1.0)
                print(action)
                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % 10**6 == 0:
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # the end of episode
                if done:
                    summary, _ = sess.run(
                        [merged, reward_summary],
                        feed_dict={reward_summary: sum_of_reward})
                    train_writer.add_summary(summary, global_step)
                    agent.stop_episode(last_obs, last_action, last_value,
                                       reward)
                    print('Episode: {}, Step: {}, Reward: {}'.format(
                        episode, global_step, sum_of_reward))
                    episode += 1
                    break

            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)
        for epoch in range(args.epoch):
            indices = random.sample(range(len(obs)), args.batch)
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]
            ratio = agent.train(sampled_obs, sampled_actions, sampled_returns,
                                sampled_deltas)

        if args.final_steps < global_step:
            break
Example 14
])
#test_dataset = datasets.ImageFolder('flowers/test', transform=data_transforms)
#testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)
if cat_names:
    with open(cat_names, 'r') as f:
        cat_to_name = json.load(f)

#Import pretrained model

if gpu:
    checkpoint = torch.load('checkpoint.pth')
else:
    checkpoint = torch.load('checkpoint.pth',
                            map_location=lambda storage, loc: storage)

model = make_network(checkpoint['arch'], checkpoint['hidden_units'])
model.load_state_dict(checkpoint['model_state_dict'])
class_to_idx = checkpoint['class_to_idx']


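# load an image, apply the validation transforms, and return it as a numpy array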
def process_image(image):
    image = Image.open(image)
    image = data_transforms(image)
    return image.numpy()


def predict(image_path, model, topk=3):
    model.eval()
    image = process_image(image_path)
    image = torch.from_numpy(image)
    image.unsqueeze_(0)
Example 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--logdir', type=str, default=None)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-steps', type=int, default=10**7)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    env = gym.make(args.env)
    dam = gym.make("MyAntdam-v1")
    heal = env
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    network = make_network([64, 64])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()
    agent.sync_old()

    saver = tf.train.Saver(max_to_keep=50)
    if args.load is not None:
        saver.restore(sess, args.load)

    reward_summary = tf.placeholder(tf.int32, (), name='reward_summary')
    tf.summary.scalar('reward_summary', reward_summary)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.logdir, sess.graph)

    global_step = 0
    episode = 0
    prob = 0.9
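    # prob is the chance of sampling the healthy environment; it is lowered as
    # global_step grows so the damaged variant is seen more often later on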
    while True:
        local_step = 0
        if global_step > 100000:
            prob = 0.5
        elif global_step > 50000:
            prob = 0.7
        elif global_step > 30000:
            prob = 0.8
        while True:
            # sample the damaged environment with probability (1 - prob);
            # np.random.rand() replaces randint, which effectively ignored prob
            if np.random.rand() > prob:
                env = dam
            else:
                env = heal

            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(last_obs, last_action,
                                                    last_value, reward, obs)
                if np.isnan(action).any():
                    print("NaN found")
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)
                    local_step = 3000
                    global_step = args.final_steps
                    break
                last_obs = obs
                last_action = action
                last_value = value
                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % (5000) == 0:
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # the end of episode
                if done:
                    summary, _ = sess.run(
                        [merged, reward_summary],
                        feed_dict={reward_summary: sum_of_reward})
                    train_writer.add_summary(summary, global_step)
                    agent.stop_episode(last_obs, last_action, last_value,
                                       reward)
                    print('Episode: {}, Step: {}, Reward: {}, Dam: {}'.format(
                        episode, global_step, sum_of_reward, last_obs[-1]))
                    episode += 1
                    break

            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)
        print("Now Training")
        for epoch in range(args.epoch):
            indices = random.sample(range(len(obs)), min(len(obs), args.batch))
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]

            ratio = agent.train(sampled_obs, sampled_actions, sampled_returns,
                                sampled_deltas)

        if args.final_steps < global_step:
            break
Example 16
    labels = []
    for i in range(size):
        #a = p.get()
        #(img, label) = msgpack.unpackb(a, object_hook = m.decode)
        (img, label) = p.get()
        data.append(img)
        labels.append(label)
    return {
        "data": np.array(data).astype(np.float32),
        "label": np.array(labels)
    }


if __name__ == '__main__':
    with TrainingEnv(name="lyy.{}.test".format(net_name), part_count=2) as env:
        net = make_network(minibatch_size=minibatch_size)
        preloss = net.loss_var
        net.loss_var = WeightDecay(net.loss_var, {
            "*conv*:W": 1e-4,
            "*fc*:W": 1e-4,
            "*bnaff*:k": 1e-4,
            "*offset*": 1e-4
        })

        train_func = env.make_func_from_loss_var(net.loss_var,
                                                 "train",
                                                 train_state=True)
        valid_func = env.make_func_from_loss_var(net.loss_var,
                                                 "val",
                                                 train_state=False)
Example 17
def main():
    date = datetime.now().strftime("%Y%m%d%H%M%S")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--outdir', type=str, default=date)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--eval-render', action='store_true')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    # learned model path settings
    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # log path settings
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env = gym.make(args.env)

    # box environment
    if len(env.observation_space.shape) == 1:
        constants = box_constants
        actions = range(env.action_space.n)
        state_shape = [env.observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda state: state
        # (window_size, dim) -> (dim, window_size)
        phi = lambda state: np.transpose(state, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(args.env)
        state_shape = [84, 84, constants.STATE_WINDOW]
        def state_preprocess(state):
            state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
            state = cv2.resize(state, (84, 84))
            return np.array(state, dtype=np.float32) / 255.0
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda state: np.transpose(state, [1, 2, 0])

    # save constant variables
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    # exploration
    if constants.EXPLORATION_TYPE == 'linear':
        duration = constants.EXPLORATION_DURATION
        explorer = LinearDecayExplorer(final_exploration_step=duration)
    else:
        explorer = ConstantExplorer(constants.EXPLORATION_EPSILON)

    # wrap gym environment
    env = EnvWrapper(
        env,
        s_preprocess=state_preprocess,
        r_preprocess=lambda r: np.clip(r, -1, 1)
    )

    # create encoder network
    network = make_network(
        constants.CONVS,
        constants.FCS,
        constants.DND_KEY_SIZE
    )

    replay_buffer = NECReplayBuffer(constants.REPLAY_BUFFER_SIZE)

    sess = tf.Session()
    sess.__enter__()

    # create DNDs
    dnds = []
    for i in range(len(actions)):
        dnd = DND(
            constants.DND_KEY_SIZE,
            constants.DND_CAPACITY,
            constants.DND_P,
            device=constants.DEVICES[i],
            scope='dnd{}'.format(i)
        )
        dnd._init_vars()
        dnds.append(dnd)

    # create NEC agent
    agent = Agent(
        network,
        dnds,
        actions,
        state_shape,
        replay_buffer,
        explorer,
        constants,
        phi=phi,
        run_options=run_options,
        run_metadata=run_metadata
    )

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    # tensorboard logger
    train_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(train_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    # json logger
    trainlogger = JsonLogger(os.path.join(outdir, 'train.json'))
    evallogger = JsonLogger(os.path.join(outdir, 'evaluation.json'))

    # callback on the end of episode
    def end_episode(reward, step, episode):
        tflogger.plot('reward', reward, step)
        trainlogger.plot(reward=reward, step=step, episode=episode)

    evaluator = Evaluator(
        env=copy.deepcopy(env),
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        eval_episodes=constants.EVAL_EPISODES,
        recorder=Recorder(outdir) if args.record else None,
        record_episodes=constants.RECORD_EPISODES,
        render=args.eval_render
    )
    def should_eval(step, episode):
        return step > 0 and step % constants.EVAL_INTERVAL == 0
    def end_eval(step, episode, rewards):
        mean_rewards = np.mean(rewards)
        tflogger.plot('eval_reward', mean_rewards, step)
        evallogger.plot(reward=mean_rewards, step=step, episode=episode)

    trainer = Trainer(
        env=env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1], # ignore last channel
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        end_episode=end_episode,
        training=not args.demo,
        evaluator=evaluator,
        should_eval=should_eval,
        end_eval=end_eval
    )
    trainer.start()