def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)

    env = envs.make(args.env, 'classic_control')
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = models.mlp([args.num_units] * args.num_layers,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                print_freq=args.nb_epoch_steps,
                checkpoint_freq=int(args.nb_train_steps / 5),
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  # callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'classic_control',
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
    if args.record == 1:
        env.moviewriter.finish()
def main():
    env = envs.create_env(None)
    model = models.mlp([64])
    act = simple.learn(env,
                       q_func=model,
                       lr=1e-3,
                       max_timesteps=100000,
                       buffer_size=50000,
                       exploration_fraction=0.01,
                       exploration_final_eps=0.0,
                       print_freq=10,
                       callback=callback,
                       prioritized_replay=True)
    print("Saving model to {}_model.pkl".format(envs.VSTR))
    act.save("{}_model.pkl".format(envs.VSTR))
def main():
    env = gym.make("CartPole-v0")
    # env = gym.make("MountainCar-v0")
    model = models.mlp([256, 20])
    act = learn(env,
                q_func=model,
                lr=1e-2,
                max_timesteps=100000,
                buffer_size=90000,
                exploration_fraction=0.1,
                exploration_final_eps=0.1,
                print_freq=25,
                checkpoint_path='model_chkpoints/cart_model',
                callback=callback,
                param_noise=True)
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
def train(hparams):
    # wandb.init(project="ebm-gaussians")
    seed_everything(hparams.seed)
    model = mlp(sizes=[2, 100, 100, 1], activation=nn.ReLU)
    optimizer = Adam(model.parameters(), lr=hparams.lr)

    # load dataset
    N_train = 5000
    X_train = sample_data(N_train)
    train_dl = DataLoader(X_train, batch_size=100, shuffle=True, num_workers=8)

    losses = []
    for _ in range(hparams.n_epochs):
        for x in train_dl:
            # draw negative samples by running Langevin dynamics from noise
            neg_x = torch.randn_like(x)
            neg_x = sample_langevin(neg_x, model, hparams.stepsize, hparams.n_steps)

            optimizer.zero_grad()
            pos_out = model(x)
            neg_out = model(neg_x)

            # contrastive loss with an L2 regularizer on the energies
            loss = (pos_out - neg_out) + hparams.alpha * (pos_out**2 + neg_out**2)
            loss = loss.mean()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
            optimizer.step()

            losses.append(loss.item())
            # wandb.log({'loss': loss.item()})

    print('saving a trained model')
    torch.save(model, hparams.model_path)
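The helper sample_langevin is not shown in this excerpt. Under the usual convention that the scalar network output is an energy (which is what the pos_out - neg_out loss above assumes), a minimal sketch of such a sampler might look like the following; the name and exact step rule are illustrative, not the original implementation.

import torch

def sample_langevin_sketch(x, model, stepsize, n_steps):
    # Unadjusted Langevin dynamics: drift toward low-energy regions of `model`
    # while injecting Gaussian noise at each step.
    x = x.clone().detach().requires_grad_(True)
    noise_scale = (2 * stepsize) ** 0.5
    for _ in range(n_steps):
        energy = model(x).sum()
        grad = torch.autograd.grad(energy, x)[0]
        x = x - stepsize * grad + noise_scale * torch.randn_like(x)
        x = x.detach().requires_grad_(True)
    return x.detach()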
def main():
    # Prepare data
    x_list, y_list = generate_training_data_lists()
    steps = len(x_list) // batch_size
    train_sequence = TrainSequence(x_list, y_list, batch_size)

    # Prepare model
    model = mlp(10000)
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])

    # Plot model
    plot_model(model, to_file='./{}.png'.format(model_name))

    # Fit model
    history = model.fit_generator(train_sequence,
                                  epochs=epochs,
                                  steps_per_epoch=steps,
                                  verbose=1).history

    # Plot loss vs accuracy
    plot_performance(history, model_name, epochs, batch_size)

    # Write loss and acc to file
    dump_pickle('./history.pkl', history)

    # Save model
    model.save_weights('./{}_weights.h5'.format(model_name))
def main(): run = True state = 2 env_name = 'HumanoidFlagrunBulletEnv-v0' if state == 0: env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name), episode_life=True, clip_rewards=True, frame_stack=True, scale=True) else: env = gym.make(env_name) if isinstance(env.action_space, Box): output_size = env.action_space.shape[0] else: output_size = env.action_space.n with tf.Session() as sess: name = 'flag_rnd3' with tf.variable_scope(name): input = tf.placeholder(tf.float32, [None, *env.observation_space.shape]) state_rms = RunningMeanStd(sess, shape=env.observation_space.shape) norm_input = tf.clip_by_value( (input - state_rms._mean) / tf.sqrt(state_rms._var), -5, 5) if state == 0: with tf.variable_scope('policy'): network = models.nature_cnn(input) norm_input = norm_input[:, :, :, 0] with tf.variable_scope('target'): target_net = models.add_dense( models.nature_cnn(norm_input), 256, name='dense1') with tf.variable_scope('predict'): predict_net = models.add_dense( models.nature_cnn(norm_input), 256, name='dense1') with tf.variable_scope('value'): value_net = models.nature_cnn(input) with tf.variable_scope('value_in'): value_in_net = models.nature_cnn(input) model = RND(sess, input, state_rms, network, actiontype.Discrete, output_size, target_net, predict_net, value_in_net,\ value_network=value_net, gamma=0.999, learning_rate=lambda f : 0.0001, epochs=4, minibatch_size=4, beta2=0.01, name=name) else: if state == 1: with tf.variable_scope('policy'): network, seq_len, init_state, last_state = models.lstm( models.mlp(input), 64) with tf.variable_scope('target'): target_net = models.add_dense(models.mlp(norm_input), 256, name='dense2') with tf.variable_scope('predict'): predict_net = models.add_dense(models.mlp(norm_input), 256, name='dense2') with tf.variable_scope('value_in'): value_in_net = models.mlp(input) model = RND(sess, input, state_rms, network, actiontype.Discrete, output_size, target_net, predict_net, value_in_net, epochs=4, minibatch_size=8, gamma=0.99, beta2=0.01, epsilon=0.1,\ coef_in=1., learning_rate=lambda f : 2.5e-4*(1-f), name=name, ) elif state == 2: with tf.variable_scope('policy'): network = models.mlp(norm_input) with tf.variable_scope('target'): target_net = models.add_dense(models.mlp(norm_input), 256, name='dense2') with tf.variable_scope('predict'): predict_net = models.add_dense(models.mlp(norm_input), 256, name='dense2') with tf.variable_scope('value'): value_net = models.mlp(norm_input) with tf.variable_scope('value_in'): value_in_net = models.mlp(norm_input) model = RND(sess, input, state_rms, network, actiontype.Continuous, output_size, target_net, predict_net, value_in_net, value_network=value_net, epochs=10, minibatch_size=32, gamma=0.99, beta2=0.000, epsilon=0.2, \ coef_in=.5, learning_rate=lambda f : 3e-4*(1-f), name=name) if run: run_only(sess, model, env, render=True) else: if state == 0: train(sess, model, env_name, 10000000, 256, num_envs=16, atari=True) elif state == 1: train(sess, model, env_name, 5e6, 128, num_envs=16) elif state == 2: train(sess, model, env_name, 100e6, 2048, num_envs=24, log_interval=5) env.close()
def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record),
                    ros=bool(args.ros),
                    dirname=directory,
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    im_size=args.im_size,
                    )
    hiddens = args.hiddens.split(':')
    hiddens = [int(h) for h in hiddens]
    with tf.device(args.device):
        if args.env == 'TargetTracking-v5':
            import simple_imtracking as simple
            model = models.cnn_plus_mlp(
                convs=[(8, 4, 2), (16, 3, 1)],
                hiddens=hiddens,
                dueling=bool(args.dueling),
                init_mean=args.init_mean,
                init_sd=args.init_sd,
            )
        else:
            import simple_tracking as simple
            model = models.mlp(hiddens,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)
        act, records = simple.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.learning_rate_decay_factor,
            lr_growth_factor=args.learning_rate_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            batch_size=args.batch_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            target_network_update_freq=args.target_update_freq,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            callback=None,  # callback,
            epoch_steps=args.nb_epoch_steps,
            noise=args.noise,
            varTH=args.varth,
            alg=args.alg,
            gpu_memory=args.gpu_memory,
            act_policy=args.act_policy,
            save_dir=directory,
            nb_test_steps=args.nb_test_steps,
            scope=args.scope,
            test_eps=args.test_eps,
            render=(bool(args.render) or bool(args.ros)),
            map_name=args.map,
            num_targets=args.nb_targets,
            im_size=args.im_size,
        )
    print("Saving model to model.pkl")
    act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)
    memo = input("Memo for this experiment?: ")
    f = open(os.path.join(directory, "memo.txt"), 'w')
    f.write(memo)
    f.close()
    if args.record == 1:
        env.moviewriter.finish()
                                     ccp_alpha=0.005, min_samples_split=2)
    if option == "Perceptron":
        model = models.single_layer_perceptron(X, Y, labels, dataset_name,
                                               eta0=0.1, random_state=0,
                                               max_iter=100)
    if option == "MLP":
        model = models.mlp(X, Y, labels, dataset_name,
                           random_state=0, learning_rate=0.05,
                           activation='logistic',
                           hidden_layer_sizes=(6, ), max_iter=500)
    if option == "XGBoost":
        model = models.xgboost_model(X, Y, labels, dataset_name)
    else:
        pass

    menu = st.sidebar.checkbox("About Info")
    if menu:
        st.write(
            "Supervised ML for Airbnb dataset. Using Streamlit for visualisation and applying Naive Bayes, Decision Tree, Single and Multi-layer Perceptron, XGBoost"
        )
        st.write(
    train_losses, test_losses = [], []
    for iteration in range(args.batch_size * args.num_batches + 1):
        if iteration % args.batch_size == 0:
            params = get_params(opt_state)
            train_loss = loss(params, (data['x'], data['dx']))
            train_losses.append(train_loss)
            test_loss = loss(params, (data['test_x'], data['test_dx']))
            test_losses.append(test_loss)
            if iteration % (args.batch_size * args.test_every) == 0:
                print(f"iteration={iteration}, train_loss={train_loss:.6f}, test_loss={test_loss:.6f}")
        opt_state = update_derivative(iteration, opt_state, (data['x'], data['dx']))
    params = get_params(opt_state)
    return params, train_losses, test_losses


if __name__ == "__main__":
    args = ObjectView(get_args())
    dblpend.get_dataset(t_span=[0, args.dataset_size], fps=1, samples=1)
    mlp = lagrangian_nns.mlp
    rng = jax.random.PRNGKey(args.seed)
    init_random_params, nn_forward_fn = mlp(args)
    _, init_params = init_random_params(rng, (-1, 4))
    model = (nn_forward_fn, init_params)
    data = dblpend.get_dataset(t_span=[0, args.dataset_size], fps=1, samples=1)
    result = train(args, model, data)
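The training loop above uses opt_state, get_params, loss, and update_derivative without showing how they are constructed. A hedged sketch of the usual wiring with JAX's bundled optimizers follows; the squared-error loss is a placeholder for the actual Lagrangian fitting objective, and the function names are illustrative.

import jax
import jax.numpy as jnp
from jax.example_libraries import optimizers

def make_update_fn(nn_forward_fn, init_params, lr=1e-3):
    def loss(params, batch):
        x, dx = batch
        pred = nn_forward_fn(params, x)  # placeholder forward pass
        return jnp.mean((pred - dx) ** 2)

    opt_init, opt_update, get_params = optimizers.adam(lr)
    opt_state = opt_init(init_params)

    @jax.jit
    def update_derivative(i, state, batch):
        # one optimizer step on the gradient of the loss
        return opt_update(i, jax.grad(loss)(get_params(state), batch), state)

    return opt_state, get_params, loss, update_derivative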
def train_VGG_classifier(use_validation=False, use_val_for_training = False, num_features=4096, learning_rate=0.0001, epochs=3000, threshold=0.5, exp='', batch_norm=True, mini_batch_size=64, save_plots=True, save_features=False, classification_method='MLP', val_size=10, weight_0=1, dataset_name='', features_file='', labels_file=''): # ======================================================================== # FETCH FEATURE EXTRACTOR # ======================================================================== model = VGG16(num_features) # ======================================================================== # WEIGHT INITIALIZATION # ======================================================================== layerscaffe = ['conv1_1', 'conv1_2', 'conv2_1', 'conv2_2', 'conv3_1', 'conv3_2', 'conv3_3', 'conv4_1', 'conv4_2', 'conv4_3', 'conv5_1', 'conv5_2', 'conv5_3', 'fc6', 'fc7', 'fc8'] h5 = h5py.File(vgg_16_weights, 'r') layer_dict = dict([(layer.name, layer) for layer in model.layers]) # Copy the weights stored in the 'vgg_16_weights' file to the # feature extractor part of the VGG16 for layer in layerscaffe[:-3]: w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1'] w2 = np.transpose(np.asarray(w2), (2,3,1,0)) w2 = w2[::-1, ::-1, :, :] b2 = np.asarray(b2) layer_dict[layer].set_weights((w2, b2)) # Copy the weights of the first fully-connected layer (fc6) layer = layerscaffe[-3] w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1'] w2 = np.transpose(np.asarray(w2), (1,0)) b2 = np.asarray(b2) layer_dict[layer].set_weights((w2, b2)) # ======================================================================== # FEATURE EXTRACTION # ======================================================================== if save_features: saveFeatures(model, features_file, labels_file, features_key, labels_key, num_features) # ======================================================================== # TRAINING # ======================================================================== adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08) model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy']) h5features = h5py.File(features_file, 'r') h5labels = h5py.File(labels_file, 'r') # X_full will contain all the feature vectors extracted # from optical flow images X_full = h5features[features_key] _y_full = np.asarray(h5labels[labels_key]) zeroes_full = np.asarray(np.where(_y_full==0)[0]) ones_full = np.asarray(np.where(_y_full==1)[0]) zeroes_full.sort() ones_full.sort() # Use a 5 fold cross-validation kf_falls = KFold(n_splits=5, shuffle=True) kf_falls.get_n_splits(X_full[zeroes_full, ...]) kf_nofalls = KFold(n_splits=5, shuffle=True) kf_nofalls.get_n_splits(X_full[ones_full, ...]) sensitivities = [] specificities = [] fars = [] mdrs = [] accuracies = [] fold_number = 1 # CROSS-VALIDATION: Stratified partition of the dataset into # train/test sets for ((train_index_falls, test_index_falls), (train_index_nofalls, test_index_nofalls)) in zip( kf_falls.split(X_full[zeroes_full, ...]), kf_nofalls.split(X_full[ones_full, ...]) ): train_index_falls = np.asarray(train_index_falls) test_index_falls = np.asarray(test_index_falls) train_index_nofalls = np.asarray(train_index_nofalls) test_index_nofalls = np.asarray(test_index_nofalls) X = np.concatenate(( X_full[zeroes_full, ...][train_index_falls, ...], X_full[ones_full, ...][train_index_nofalls, ...] )) _y = np.concatenate(( _y_full[zeroes_full, ...][train_index_falls, ...], _y_full[ones_full, ...][train_index_nofalls, ...] 
)) X_test = np.concatenate(( X_full[zeroes_full, ...][test_index_falls, ...], X_full[ones_full, ...][test_index_nofalls, ...] )) y_test = np.concatenate(( _y_full[zeroes_full, ...][test_index_falls, ...], _y_full[ones_full, ...][test_index_nofalls, ...] )) if use_validation: # Create a validation subset from the training set zeroes = np.asarray(np.where(_y==0)[0]) ones = np.asarray(np.where(_y==1)[0]) zeroes.sort() ones.sort() trainval_split_0 = StratifiedShuffleSplit(n_splits=1, test_size=int(val_size/2), random_state=7) indices_0 = trainval_split_0.split(X[zeroes,...], np.argmax(_y[zeroes,...], 1)) trainval_split_1 = StratifiedShuffleSplit(n_splits=1, test_size=int(val_size/2), random_state=7) indices_1 = trainval_split_1.split(X[ones,...], np.argmax(_y[ones,...], 1)) train_indices_0, val_indices_0 = indices_0.__next__() train_indices_1, val_indices_1 = indices_1.__next__() X_train = np.concatenate([X[zeroes,...][train_indices_0,...], X[ones,...][train_indices_1,...]],axis=0) y_train = np.concatenate([_y[zeroes,...][train_indices_0,...], _y[ones,...][train_indices_1,...]],axis=0) X_val = np.concatenate([X[zeroes,...][val_indices_0,...], X[ones,...][val_indices_1,...]],axis=0) y_val = np.concatenate([_y[zeroes,...][val_indices_0,...], _y[ones,...][val_indices_1,...]],axis=0) else: X_train = X y_train = _y # Balance the number of positive and negative samples so that # there is the same amount of each of them all0 = np.asarray(np.where(y_train==0)[0]) all1 = np.asarray(np.where(y_train==1)[0]) if len(all0) < len(all1): all1 = np.random.choice(all1, len(all0), replace=False) else: all0 = np.random.choice(all0, len(all1), replace=False) allin = np.concatenate((all0.flatten(),all1.flatten())) allin.sort() X_train = X_train[allin,...] y_train = y_train[allin] # ==================== CLASSIFIER ======================== if classification_method == 'MLP': classifier = mlp(num_features, batch_norm) else: # TODO: handle case where validation is not done new_feature_length = int(num_features / 4) data = sample_data([X_train, X_test, X_val], new_feature_length) X_train = data[0] X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1])) X_test = data[1] X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1])) X_val = data[2] X_val = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1])) classifier = lstm(seq_length=1, feature_length=new_feature_length, nb_classes=1) fold_best_model_path = best_model_path + '{}_fold_{}.h5'.format( dataset_name, fold_number ) classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy']) if not use_checkpoint: # ==================== TRAINING ======================== # weighting of each class: only the fall class gets # a different weight class_weight = {0: weight_0, 1: 1} callbacks = None if use_validation: # callback definition metric = 'val_loss' e = EarlyStopping(monitor=metric, min_delta=0, patience=2, mode='auto') c = ModelCheckpoint(fold_best_model_path, monitor=metric, save_best_only=True, save_weights_only=False, mode='auto') callbacks = [e, c] validation_data = None if use_validation: validation_data = (X_val,y_val) _mini_batch_size = mini_batch_size if mini_batch_size == 0: _mini_batch_size = X_train.shape[0] history = classifier.fit( X_train, y_train, validation_data=validation_data, batch_size=_mini_batch_size, epochs=epochs, shuffle=True, class_weight=class_weight, callbacks=callbacks ) #if not use_validation: # classifier.save(fold_best_model_path) plot_training_info(plots_folder + exp, ['accuracy', 'loss'], 
save_plots, history.history) if use_validation and use_val_for_training: #classifier = load_model(fold_best_model_path) # Use full training set (training+validation) X_train = np.concatenate((X_train, X_val), axis=0) y_train = np.concatenate((y_train, y_val), axis=0) history = classifier.fit( X_train, y_train, validation_data=validation_data, batch_size=_mini_batch_size, epochs=epochs, shuffle='batch', class_weight=class_weight, callbacks=callbacks ) classifier.save(fold_best_model_path) # ==================== EVALUATION ======================== # TODO: Load model as required # Load best model #print('Model loaded from checkpoint') #classifier = load_model(fold_best_model_path) predicted = classifier.predict(np.asarray(X_test)) for i in range(len(predicted)): if predicted[i] < threshold: predicted[i] = 0 else: predicted[i] = 1 # Array of predictions 0/1 predicted = np.asarray(predicted).astype(int) # Compute metrics and print them cm = confusion_matrix(y_test, predicted,labels=[0,1]) tp = cm[0][0] fn = cm[0][1] fp = cm[1][0] tn = cm[1][1] tpr = tp/float(tp+fn) fpr = fp/float(fp+tn) fnr = fn/float(fn+tp) tnr = tn/float(tn+fp) precision = tp/float(tp+fp) recall = tp/float(tp+fn) specificity = tn/float(tn+fp) f1 = 2*float(precision*recall)/float(precision+recall) accuracy = accuracy_score(y_test, predicted) print('FOLD {} results:'.format(fold_number)) print('TP: {}, TN: {}, FP: {}, FN: {}'.format(tp,tn,fp,fn)) print('TPR: {}, TNR: {}, FPR: {}, FNR: {}'.format( tpr,tnr,fpr,fnr)) print('Sensitivity/Recall: {}'.format(recall)) print('Specificity: {}'.format(specificity)) print('Precision: {}'.format(precision)) print('F1-measure: {}'.format(f1)) print('Accuracy: {}'.format(accuracy)) # Store the metrics for this epoch sensitivities.append(tp/float(tp+fn)) specificities.append(tn/float(tn+fp)) fars.append(fpr) mdrs.append(fnr) accuracies.append(accuracy) fold_number += 1 print('5-FOLD CROSS-VALIDATION RESULTS ===================') print("Sensitivity: %.2f%% (+/- %.2f%%)" % (np.mean(sensitivities)*100., np.std(sensitivities)*100.)) print("Specificity: %.2f%% (+/- %.2f%%)" % (np.mean(specificities)*100., np.std(specificities)*100.)) print("FAR: %.2f%% (+/- %.2f%%)" % (np.mean(fars)*100., np.std(fars)*100.)) print("MDR: %.2f%% (+/- %.2f%%)" % (np.mean(mdrs)*100., np.std(mdrs)*100.)) print("Accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracies)*100., np.std(accuracies)*100.))
def main(): env = envstandalone.BallCatch() max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 # buffer_size=1 exploration_fraction = 0.2 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 500 learning_alpha = 0.2 batch_size = 32 train_freq = 1 obsShape = (8, 8, 1) # deicticShape = (3,3,1) # deicticShape = (3,3,2) # deicticShape = (4,4,1) # deicticShape = (4,4,2) deicticShape = (4, 4, 3) # deicticShape = (3,3,4) num_deictic_patches = 25 # num_actions = 4 num_actions = 3 episode_rewards = [0.0] num_cpu = 16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Extract deictic patches for an input obs. Each deictic patch has a low level # and a foveated view. # input: n x n x 1 # output: dn x dn x 4 def getDeicticObs(obs): windowLen = deicticShape[0] obsShape = np.shape(obs) obsPadded = np.zeros( (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen)) obsPadded[windowLen:windowLen + obsShape[0], windowLen:windowLen + obsShape[1]] = obs[:, :, 0] deicticObsThis = np.zeros( (windowLen, windowLen, 4) ) # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window deicticObs = [] for i in range(obsShape[0] - windowLen + 1): for j in range(obsShape[1] - windowLen + 1): deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen, 0] == 1 # agent zoomin deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen, 0] == 2 # ball zoomin patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen] for k in range(1, 3): # THE VERSION BELOW USES A FIXED VIEW # deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], # [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])], # [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]] # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL # deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], # [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], # [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]] # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]), (k in patch[0:3, 3:6]), (k in patch[0:3, 6:9])], [(k in patch[3:6, 0:3]), (k in patch[3:6, 3:6]), (k in patch[3:6, 6:9])], [(k in patch[6:9, 0:3]), (k in patch[6:9, 3:6]), (k in patch[6:9, 6:9])]] deicticObs.append( deicticObsThis.copy() ) # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT... 
return np.array(deicticObs) # Same as getDeicticObs, but it operates on a batch rather than a single obs # input: obs -> batches x glances x 3 x 3 x 4 def getDeicticObsBatch(obs): obsShape = np.shape(obs) deicticObsBatch = [] for batch in range(obsShape[0]): deicticObsBatch.append(getDeicticObs(obs[batch])) shape = np.shape(deicticObsBatch) return (np.reshape( np.array(deicticObsBatch), [shape[0] * shape[1], shape[2], shape[3], shape[4]])) # CNN version # conv model parameters: (num_outputs, kernel_size, stride) # model = models.cnn_to_mlp( # convs=[(16,4,1)], # hiddens=[16], # dueling=True # ) # MLP version model = models.mlp([16, 32]) q_func = model lr = 0.001 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # CNN version # return U.BatchInput(deicticShape, name=name) # MLP version return U.BatchInput( [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade) targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr)) getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. U.initialize() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): # obsDeictic = getDeicticObs(obs) obsDeictic = getDeic([obs]) # obsDeictic, patchesTiledStacked2 = getDeic([obs]) # # CNN version # qCurr = getq(np.array(obsDeictic)) # MLP version qCurr = getq( np.reshape( obsDeictic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, -1, :], 0)) selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1)) if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeicticObsBatch(obses_t) # obses_tp1_deic = getDeicticObsBatch(obses_tp1) # Reshape everything to (1152,) form donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # # Get curr, next values: CNN version # qNext = getq(obses_tp1_deic) # qCurr = getq(obses_t_deic) # Get curr, next values: MLP version qNext = getq( np.reshape( obses_tp1_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) qCurr = getq( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])) # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:, -1, :], 1) # # This version takes the max over all glimpses # qNextTiled = 
np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade]) qCurrTargets = np.copy(qCurr) # # Copy into cascade without pruning # for i in range(num_cascade): # qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets # Copy into cascade with pruning. qCurrTargets[range(batch_size * num_deictic_patches), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(batch_size * num_deictic_patches), i, actionsTiled] qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] # # CNN version # td_error_out, obses_deic_out, targets_out = targetTrain( # obses_t_deic, # qCurrTargets # ) # MLP version td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]), qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
def dense_concat_net(*args, **varargs):
    return models.mlp(out_size=16, output_activation=tf.tanh,
                      scope="concat_net", flatten=False, *args, **varargs)
    #                                    nomal=True,
    #                                    fill_mode='constant')
    # generator = train_datagen.flow_from_directory(file_path=train_file_path,
    #                                               data_dir=data_dir, data_suffix=data_suffix,
    #                                               label_dir=label_dir, label_suffix=label_suffix,
    #                                               target_size=target_shape, color_mode='grayscale',
    #                                               batch_size=batch_size, shuffle=True,
    #                                               loss_shape=None)
    scheduler = LearningRateScheduler(lr_scheduler)
    callbacks = [scheduler]

    # ################### checkpoint saver #######################
    checkpoint = ModelCheckpoint(filepath=os.path.join(save_path, 'checkpoint_weights.h5'),
                                 save_weights_only=True)  # .{epoch:d}
    callbacks.append(checkpoint)

    # model = srcnn(input_shape=input_shape, kernel_size=[3, 3])
    model = mlp()
    # model.load_weights('unet_optics_l2.h5')
    model.compile(loss=mean_squared_error, optimizer='adadelta')
    model.summary()
    history = model.fit(input_data, input_label,
                        batch_size=batch_size, nb_epoch=epochs,
                        callbacks=callbacks, verbose=1)
    model.save_weights('mlp_noise100_64.h5')
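lr_scheduler is referenced above but not defined in this excerpt. A minimal step-decay schedule with the signature LearningRateScheduler expects might look like this; the base rate, decay factor, and interval are assumptions rather than the original values.

def lr_scheduler(epoch, base_lr=1.0, drop=0.5, epochs_per_drop=20):
    # Keras calls this with the epoch index and uses the returned learning rate.
    return base_lr * (drop ** (epoch // epochs_per_drop))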
def run_with_random_hyperparameters(_): # Loading within the process to save time from rnn_ppo import RNN_PPO import tensorflow as tf import loggy import random import schedules import models import discrete_maze.maze tf.reset_default_graph() log = loggy.Log("maze-hyperparam-search", autosave_freq = 15.0, autosave_vars_freq = 180.0, continuing = False) lr = 10 ** random.uniform(-5.5, -2.5) value_prop = 10 ** random.uniform(-1.5, 1.5) if random.random() > 0.8: separate_value_network = (lambda *args, **varargs: tf.squeeze(models.mlp(scope = "value_network", out_size = 1, hiddens = [64, 64], flatten = False, *args, **varargs), axis = 2)) else: separate_value_network = None history_size = 1 if random.random() > 0.15 else random.randint(2, 5) id_size = 1 if random.random() > 0.15 else random.randint(2, 8) reward_type = random.choice(discrete_maze.maze.ExploreTask.reward_types) scale_reward_by_difficulty = random.random() > 0.5 place_agent_far_from_dest = random.random() > 0.2 agent_placement_prop = random.uniform(0.2, 0.9) time_penalty = 10 ** random.uniform(-2.3, -.8) invalid_move_penalty = 10 ** random.uniform(-1, 0.5) def dense_concat_net(*args, **varargs): return models.mlp(out_size = 16, output_activation = tf.tanh, scope = "concat_net", flatten = False, *args, **varargs) concat_net = dense_concat_net if random.random() > 0.5 else None params = { # 'env_creator': schedules.GridMazeSchedule(), 'env_creator': schedules.ExploreCreatorSchedule(is_tree = False, history_size = history_size, id_size = id_size, reward_type = reward_type, scale_reward_by_difficulty = scale_reward_by_difficulty, place_agent_far_from_dest = place_agent_far_from_dest, agent_placement_prop = agent_placement_prop, time_penalty = time_penalty, invalid_move_penalty = invalid_move_penalty), 'clip_ratio': random.uniform(0.18, 0.22), # this seems to be set well 'max_policy_steps': random.randint(50, 100), 'max_kl': random.uniform(0.01, 0.02), 'lambda_gae': random.uniform(0.95, 1.0), 'lr_schedule': (lambda t: lr), 'value_prop_schedule': (lambda t: value_prop), 'log': log, 'gamma': random.uniform(0.95, 1.0), 'min_observations_per_step': 4000, 'render': False, 'rnn_stacks': random.randint(1, 3), 'hidden_units': 2 ** random.randint(4, 8), 'separate_value_network': separate_value_network, 'concat_net': concat_net } params = log.process_params(params) log.add_hyperparams(params) print("Running with parameters:", params) ppo = RNN_PPO(**params) ppo.initialize_variables() def early_stop(policy): maze_size = policy.log.get_last('current maze size', 4) steps = policy.log.get_last('simulation steps', 0) return maze_size == 4 and steps >= 50000 ppo.optimize(500000, early_stop = early_stop) log.close()
from __future__ import division, print_function, absolute_import

import tflearn
import tflearn.datasets.mnist as mnist

from models import mlp


def train(net, trainX, trainY, testX, testY):
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=20, validation_set=(testX, testY),
              show_metric=True, run_id="dense_model")


if __name__ == '__main__':
    trainX, trainY, testX, testY = mnist.load_data(one_hot=True)
    net = mlp()
    train(net, trainX, trainY, testX, testY)
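The mlp builder imported from models is not shown here. For tflearn, a definition along the following lines would produce a graph that tflearn.DNN can fit on flattened MNIST; the layer sizes and optimizer are assumptions, not the original code.

import tflearn

def mlp():
    # 784-dim MNIST input -> two hidden layers -> 10-way softmax
    net = tflearn.input_data(shape=[None, 784])
    net = tflearn.fully_connected(net, 128, activation='relu')
    net = tflearn.fully_connected(net, 64, activation='relu')
    net = tflearn.fully_connected(net, 10, activation='softmax')
    net = tflearn.regression(net, optimizer='adam',
                             loss='categorical_crossentropy')
    return net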
    drop_constant_features(epigenomes)
    robust_zscoring(epigenomes)
    run_correlation_tests(epigenomes, labels)
    scores = extremely_correlated(epigenomes)
    seaborn_plot_most_correlated(epigenomes, labels, scores, cell_line)
    seaborn_plot_least_correlated(epigenomes, labels, scores, cell_line)
    get_top_most_different(epigenomes, labels, cell_line)
    get_top_most_different_tuples(epigenomes, cell_line)
    pca_plot(epigenomes, labels, cell_line)
    tsne_plot(epigenomes, labels, cell_line)

    for region in ['enhancers', 'promoters']:
        set_shape(epigenomes, region)
        modelperc, kwargsperc = perceptron(500, 1024)
        modeltree, kwargstree = decision_tree(500)
        modelmlp, kwargsmlp = mlp(500, 1024)
        modelffnn, kwargsffnn = ffnn(500, 1024)
        models.extend([modeltree, modelperc, modelmlp, modelffnn])
        kwargs.extend([kwargstree, kwargsperc, kwargsmlp, kwargsffnn])
        train_result = train(epigenomes, labels, models, kwargs, region, cell_line)
        barplot(train_result, cell_line, region)
        models.clear()
        kwargs.clear()
        print('Wilcoxon ' + region + ':')
        wilcoxon_test(train_result, 'FFNN', 'DecisionTreeClassifier')
        wilcoxon_test(train_result, 'FFNN', 'Perceptron')
        wilcoxon_test(train_result, 'FFNN', 'MLP')
        wilcoxon_test(train_result, 'Perceptron', 'DecisionTreeClassifier')
        wilcoxon_test(train_result, 'Perceptron', 'MLP')
        wilcoxon_test(train_result, 'MLP', 'DecisionTreeClassifier')
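The wilcoxon_test helper used above is project-specific and not shown. Assuming train_result maps each model name to a list of per-holdout scores, a sketch of such a comparison using SciPy's paired Wilcoxon signed-rank test could be:

from scipy.stats import wilcoxon

def wilcoxon_test_sketch(train_result, model_a, model_b, alpha=0.01):
    # Paired signed-rank test over the per-holdout scores of the two models.
    scores_a, scores_b = train_result[model_a], train_result[model_b]
    _, p_value = wilcoxon(scores_a, scores_b)
    if p_value < alpha:
        better = model_a if sum(scores_a) > sum(scores_b) else model_b
        print(f"{model_a} vs {model_b}: p={p_value:.4f} ({better} is better)")
    else:
        print(f"{model_a} vs {model_b}: p={p_value:.4f} (no significant difference)")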
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)

    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=save_dir_0,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    im_size=args.im_size,
                    )
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            hiddens = args.hiddens.split(':')
            hiddens = [int(h) for h in hiddens]
            if args.env == 'TargetTracking-v5':
                model = models.cnn_plus_mlp(
                    convs=[(4, 8, 4), (8, 4, 2)],
                    hiddens=hiddens,
                    dueling=bool(args.dueling),
                    init_mean=args.init_mean,
                    init_sd=args.init_sd,
                    inpt_dim=(args.im_size, args.im_size),
                )
            else:
                model = models.mlp(hiddens,
                                   init_mean=args.init_mean,
                                   init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                checkpoint_freq=args.checkpoint_freq,
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  # callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   env_type='target_tracking',
                                   save_dir=save_dir_0,
                                   render=bool(args.render),
                                   figID=1,
                                   ros=bool(args.ros),
                                   map_name=args.map,
                                   num_targets=args.nb_targets,
                                   im_size=args.im_size,
                                   eval_type=args.eval_type,
                                   init_file_path=args.init_file_path,
                                   ),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=(bool(args.render) or bool(args.ros)),
            )
    print("Saving model to model.pkl")
    act.save(os.path.join(save_dir_0, "model.pkl"))
    if args.record == 1:
        env.moviewriter.finish()
def main(): # env = gym.make("CartPoleRob-v0") # env = gym.make("CartPole-v0") # env = gym.make("CartPole-v1") # env = gym.make("Acrobot-v1") # env = gym.make("MountainCarRob-v0") # env = gym.make("FrozenLake-v0") # env = gym.make("FrozenLake8x8-v0") env = gym.make("FrozenLake8x8nohole-v0") # robShape = (2,) # robShape = (3,) # robShape = (200,) # robShape = (16,) robShape = (64,) def make_obs_ph(name): # return U.BatchInput(env.observation_space.shape, name=name) return U.BatchInput(robShape, name=name) # # these params are specific to mountaincar # def getOneHotObs(obs): # obsFraction = (obs[0] + 1.2) / 1.8 # idx1 = np.int32(np.trunc(obsFraction*100)) # obsFraction = (obs[1] + 0.07) / 0.14 # idx2 = np.int32(np.trunc(obsFraction*100)) # ident = np.identity(100) # return np.r_[ident[idx1,:],ident[idx2,:]] # these params are specific to frozenlake def getOneHotObs(obs): # ident = np.identity(16) ident = np.identity(64) return ident[obs,:] model = models.mlp([32]) # model = models.mlp([64]) # model = models.mlp([64], layer_norm=True) # model = models.mlp([16, 16]) # parameters q_func=model lr=1e-3 # max_timesteps=100000 max_timesteps=50000 # max_timesteps=10000 buffer_size=50000 exploration_fraction=0.1 # exploration_fraction=0.3 exploration_final_eps=0.02 # exploration_final_eps=0.1 train_freq=1 batch_size=32 print_freq=10 checkpoint_freq=10000 learning_starts=1000 gamma=1.0 target_network_update_freq=500 # prioritized_replay=False prioritized_replay=True prioritized_replay_alpha=0.6 prioritized_replay_beta0=0.4 prioritized_replay_beta_iters=None prioritized_replay_eps=1e-6 num_cpu=16 # # try mountaincar w/ different input dimensions # inputDims = [50,2] sess = U.make_session(num_cpu) sess.__enter__() act, train, update_target, debug = build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() obs = getOneHotObs(obs) # with tempfile.TemporaryDirectory() as td: model_saved = False # model_file = os.path.join(td, "model") for t in range(max_timesteps): # Take action and update exploration to the newest value action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) new_obs = getOneHotObs(new_obs) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() obs = getOneHotObs(obs) episode_rewards.append(0.0) if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # if done: print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t)))) # if done and print_freq is not None and len(episode_rewards) % print_freq == 0: # logger.record_tabular("steps", t) # logger.record_tabular("episodes", num_episodes) # logger.record_tabular("mean 100 episode reward", mean_100ep_reward) # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.dump_tabular() # sess num2avg = 20 rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg plt.plot(rListAvg) # plt.plot(episode_rewards) plt.show() sess
def main(): env = envstandalone.MultiGhostEvade() # env = envstandalone.GhostEvade() # env = envstandalone.BallCatch() max_timesteps=40000 # max_timesteps=80000 learning_starts=1000 # buffer_size=50000 buffer_size=1000 # exploration_fraction=0.2 exploration_fraction=0.4 exploration_final_eps=0.02 print_freq=10 gamma=.98 # target_network_update_freq=500 # target_network_update_freq=100 # target_network_update_freq=10 target_network_update_freq=1 learning_alpha = 0.2 # batch_size=32 # batch_size=64 batch_size=512 # batch_size=1024 train_freq=1 obsShape = (8,8,1) # deicticShape = (3,3,2) # deicticShape = (3,3,4) # deicticShape = (4,4,2) # deicticShape = (4,4,4) deicticShape = (5,5,2) # deicticShape = (6,6,2) # deicticShape = (8,8,2) # num_deictic_patches = 36 # num_deictic_patches = 25 num_deictic_patches = 16 # num_deictic_patches = 9 # num_deictic_patches = 1 # num_actions = 4 # num_actions = 3 num_actions = env.action_space.n episode_rewards = [0.0] num_cpu=16 num_cascade = 5 # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # # CNN version # # conv model parameters: (num_outputs, kernel_size, stride) # model = models.cnn_to_mlp( ### model = models.cnn_to_mlp_2pathways( ### convs=[(16,3,1)], # convs=[(32,3,1)], ### convs=[(32,4,1)], ### convs=[(16,4,1)], ## hiddens=[16], # hiddens=[32], # dueling=True # ) # MLP version # model = models.mlp([8, 16]) # model = models.mlp([16, 16]) # model = models.mlp([16, 32]) # model = models.mlp([16, 16]) # model = models.mlp([32, 32]) # model = models.mlp([32]) model = models.mlp([]) q_func=model # lr=0.01 lr=0.001 # lr=0.0005 def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): # # CNN version # return U.BatchInput(deicticShape, name=name) # MLP version return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name) def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade,num_actions], name=name) sess = U.make_session(num_cpu) sess.__enter__() getq = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func" ) getqTarget = build_getq( make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func_target" ) update_target = build_update_target(scope="deepq", qscope="q_func", qscopeTarget="q_func_target") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1. # grad_norm_clipping=0.1 ) getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) # Initialize the parameters and copy them to the target network. 
U.initialize() update_target() replay_buffer = ReplayBuffer(buffer_size) obs = env.reset() timerStart = time.time() for t in range(max_timesteps): obsDeictic = getDeic([obs]) ## CNN version # qCurr = getq(np.array(obsDeictic)) # MLP version qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # select action qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # MONTE CARLO VERSION # update rewards to actual monte carlo experiences if done: replay_buffer.update_montecarlo(gamma) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # obses_t_deic = getDeic(obses_t)[:,:,:,0:2] # obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2] # Reshape everything to (1152,) form donesTiled = np.repeat(dones,num_deictic_patches) rewardsTiled = np.repeat(rewards,num_deictic_patches) actionsTiled = np.repeat(actions,num_deictic_patches) # # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION # qNextTarget = getqTarget(obses_tp1_deic) # qNext = getq(obses_tp1_deic) # qCurr = getq(obses_t_deic) # Get curr, next values: MLP version qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS # obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2)) # obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2)) # obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2)) # obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] # obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2)) # obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2)) # obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2)) # obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] # qCurr = getq(np.array(obses_t_deic)) # qNext = getq(np.array(obses_tp1_deic)) # actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3] # actionsTiled = actionsTiled - 4 * (actionsTiled>3) # rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled] # donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled] # This version pairs a glimpse with the same glimpse on the next time step qNextmax = np.max(qNext[:,-1,:],1) # standard # actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q # qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext] # # This version takes the max over all glimpses # qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions]) # qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches) # BELLMAN VERSION targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax # MONTE CARLO VERSION targets = rewardsTiled # # Take min over targets in same group # obses_t_deic_reshape = 
np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]) # unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0) # for i in range(np.shape(uniqueCounts)[0]): # targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i]) qCurrTargets = np.copy(qCurr) # Copy into cascade with pruning. expLen = np.shape(qCurr)[0] qCurrTargets[range(expLen),0,actionsTiled] = targets for i in range(num_cascade-1): mask = targets < qCurrTargets[range(expLen),i,actionsTiled] qCurrTargets[range(expLen),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled] # # CNN version # td_error_out, obses_deic_out, targets_out = targetTrain( # obses_t_deic, # qCurrTargets # ) # MLP version td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]), qCurrTargets ) # Update target network periodically. if t > learning_starts and t % target_network_update_freq == 0: update_target() # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
    # log = loggy.Log("miniworld-ppo", autosave_freq = 30.0)
    log = loggy.Log("dense-gerem8", autosave_freq=15.0,
                    autosave_vars_freq=60.0, continuing=False)
    params = {
        'clip_ratio': 0.2,
        'max_policy_steps': 80,
        'max_val_steps': 80,
        'max_kl': 0.015,
        'model': (lambda *args, **varargs: models.mlp(*args, **varargs)),
        # 'model': (lambda *args, **varargs: models.mlp(models.miniworld_preprocess(*args, time = False), **varargs)),
        'value_model': (lambda *args, **varargs: tf.squeeze(
            models.mlp(out_size=1, *args, **varargs), axis=1)),
        # 'env_creator': schedules.GridMazeSchedule(),
        # 'env_creator': schedules.ExploreCreatorSchedule(is_tree = False, history_size = 1,
        #                    id_size = 1, reward_type = 'penalty+finished', scale_reward_by_difficulty = False),
        # 'env_creator': schedules.DummyGymSchedule('LunarLander-v2'),
        # 'env_creator': schedules.DummyGymSchedule('MiniWorld-Hallway-v0'),
        'env_creator': schedules.ConstantMazeSchedule('saved_mazes/gerem8.dill'),
        'lr_schedule': (lambda t: 3e-4),
        'min_observations_per_step': 5000,
        'log': log,
def main(): # ******* Deictic parameters ******** # deicticShape is the shape of the patch that is used. For example, a 3,3,2 patch # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of deicticShape # patches in an entire image. # For example, there are 36 3x3 patches that are contained in an 8x8 observation space # (assuming no zero padding). You must set this number to correspond to deicticShape. # deicticShape = (3,3,2) # deicticShape = (3,3,4) deicticShape = (4, 4, 2) # deicticShape = (4,4,4) # num_deictic_patches = 36 num_deictic_patches = 25 # Desired network type. So far, I've done better w/ CNN WHICH_Q = "CNN" # WHICH_Q = "MLP" # Method used to evaluate value of next state. So far, I've found that PAIRED_NEXT works # much better than MAX_NEXT. MAX_NEXT only works if you also set MIN_OVER_BATCH to True. # OW, it doesn't converge. # PAIRED_NEXT -> use value of corresponding patch on the next step # MAX_NEXT -> use max value over all next-step patches NEXT_PATCH = "PAIRED_NEXT" # NEXT_PATCH = "MAX_NEXT" # If MIN_OVER_BATCH is true, then we find the min value over all targets that have # the same corresponding patch. In principle, this should always help. The larger # the batch size, the more it should help. However, in practice, I find that # it seems to cap the maximum achievable performance. On the other hand, it can # help convergence when using NEXT_PATCH = "MAX_NEXT". # MIN_OVER_BATCH = True MIN_OVER_BATCH = False # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via the cascade. # OW (if "AVG"), we use the standard expected value Q value. "MIN" should work. "AVG" is # equivalent to the standard DQN backup applied to the patches. # best here. MIN_OR_AVG_Q = "MIN" # MIN_OR_AVG_Q = "AVG" # If true, ROTATION_AUGMENTATION augments the agent's experience with # rotated versions of the patches. I typically turn this off. 
# ROTATION_AUGMENTATION = True ROTATION_AUGMENTATION = False # ******* Load the environment ******** env = envstandalone.StandaloneEnv() obsShape = env.observation_space.shape num_actions = env.action_space.n # ******* Standard DQN parameters ******** max_timesteps = 40000 learning_starts = 1000 buffer_size = 50000 exploration_fraction = 0.4 exploration_final_eps = 0.02 print_freq = 10 gamma = .98 target_network_update_freq = 1 lr = 0.001 batch_size = 32 train_freq = 1 num_cascade = 5 # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair num_cpu = 16 replay_buffer = ReplayBuffer(buffer_size) exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) if MIN_OR_AVG_Q == "MIN": minoravg = -1 elif MIN_OR_AVG_Q == "AVG": minoravg = 0 else: print("error") # ******* Create neural network model ******** if WHICH_Q == "CNN": # conv model parameters: (num_outputs, kernel_size, stride) model = models.cnn_to_mlp(convs=[(32, 3, 1)], hiddens=[32], dueling=True) networkShapeOfObservation = [ -1, deicticShape[0], deicticShape[1], deicticShape[2] ] elif WHICH_Q == "MLP": # MLP version # model = models.mlp([8, 16]) model = models.mlp([16, 32]) # model = models.mlp([32]) # model = models.mlp([]) networkShapeOfObservation = [ -1, deicticShape[0] * deicticShape[1] * deicticShape[2] ] else: print("WHICH_Q error: must select valid q-function") q_func = model # ******* Build tensorflow functions ******** def make_obs_ph(name): return U.BatchInput(obsShape, name=name) def make_obsDeic_ph(name): if WHICH_Q == "CNN": return U.BatchInput(deicticShape, name=name) elif WHICH_Q == "MLP": return U.BatchInput( [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name) else: print("WHICH_Q error: must select valid q-function") def make_target_ph(name): # return U.BatchInput([num_actions], name=name) return U.BatchInput([num_cascade, num_actions], name=name) getq = build_getq(make_obsDeic_ph=make_obsDeic_ph, q_func=q_func, num_actions=num_actions, num_cascade=num_cascade, scope="deepq", qscope="q_func") targetTrain = build_targetTrain( make_obsDeic_ph=make_obsDeic_ph, make_target_ph=make_target_ph, q_func=q_func, num_actions=env.action_space.n, num_cascade=num_cascade, optimizer=tf.train.AdamOptimizer(learning_rate=lr), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr), scope="deepq", qscope="q_func", grad_norm_clipping=1.) 
getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph, deicticShape=deicticShape) # getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape) sess = U.make_session(num_cpu) sess.__enter__() obs = env.reset() U.initialize() episode_rewards = [0.0] timerStart = time.time() for t in range(max_timesteps): # get q-values for current deictic patches obsDeictic = getDeic([obs]) qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation)) # select action qCurrNoise = qCurr + np.random.random(np.shape( qCurr)) * 0.01 # add small amount of noise to break ties randomly action = np.argmax(np.max(qCurrNoise[:, minoravg, :], 0)) # USE CASCADE # action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE if np.random.rand() < exploration.value(t): action = np.random.randint(env.action_space.n) # take action new_obs, rew, done, _ = env.step(action) replay_buffer.add(obs, action, rew, new_obs, float(done)) # sample from replay buffer and train if t > learning_starts and t % train_freq == 0: # Sample from replay buffer obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) # Put observations in deictic form obses_t_deic = getDeic(obses_t) obses_tp1_deic = getDeic(obses_tp1) # Reshape such that patches and batches are interleaved in the same column donesTiled = np.repeat(dones, num_deictic_patches) rewardsTiled = np.repeat(rewards, num_deictic_patches) actionsTiled = np.repeat(actions, num_deictic_patches) # # Get curr, next values: NO ROTATION-AUGMENTATION qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation)) qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation)) # # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS if ROTATION_AUGMENTATION: obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2)) obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2)) obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2)) obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3] obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2)) obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2)) obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2)) obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3] qCurr = getq(np.array(obses_t_deic)) qNext = getq(np.array(obses_tp1_deic)) actionsTiled = np.r_[actionsTiled, actionsTiled + 1, actionsTiled + 2, actionsTiled + 3] actionsTiled = actionsTiled - 4 * (actionsTiled > 3) rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled, rewardsTiled] donesTiled = np.r_[donesTiled, donesTiled, donesTiled, donesTiled] # Get value of next state if NEXT_PATCH == "PAIRED_NEXT": qNextmax = np.max(qNext[:, minoravg, :], 1) # standard elif NEXT_PATCH == "MAX_NEXT": qNextTiled = np.reshape(qNext[:, minoravg, :], [-1, num_deictic_patches, num_actions]) qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1), num_deictic_patches) else: print("error") # Compute Bellman estimate targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax # Take min over targets in same group if MIN_OVER_BATCH: obses_t_deic_reshape = np.reshape( obses_t_deic, [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]) unique_deic, uniqueIdx, uniqueCounts = np.unique( obses_t_deic_reshape, return_inverse=True, return_counts=True, axis=0) for i in range(np.shape(uniqueCounts)[0]): targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i]) # Copy into cascade with pruning. 
qCurrTargets = np.copy(qCurr) expLen = np.shape(qCurr)[0] qCurrTargets[range(expLen), 0, actionsTiled] = targets for i in range(num_cascade - 1): mask = targets < qCurrTargets[range(expLen), i, actionsTiled] qCurrTargets[range(expLen),i+1,actionsTiled] = \ mask*targets + \ (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled] td_error_out, obses_deic_out, targets_out = targetTrain( np.reshape(obses_t_deic, networkShapeOfObservation), qCurrTargets) # bookkeeping for storing episode rewards episode_rewards[-1] += rew if done: new_obs = env.reset() episode_rewards.append(0.0) mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: timerFinal = time.time() print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart)) timerStart = timerFinal obs = new_obs
def mlp_mnist_sgd_experiment(rng, sample_size, hidden_size, depth, initializer,
                             learning_rate, momentum, nesterov, epochs, batch_size):
    X, Y = mix_datasets(*mnist_mlp_connector(load_mnist()))
    x_train, y_train, inds = sample_train((X, Y), sample_size, rng)
    assert len(x_train) == len(y_train)
    print(f"Sampled {len(x_train)} datapoints iid")

    model = mlp(Y.shape[1], depth=depth, hidden=hidden_size, initializer=initializer)
    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                  momentum=momentum, nesterov=nesterov)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0)
    model.compile(optimizer=opt, loss=loss,
                  metrics=[tf.keras.metrics.CategoricalAccuracy(name="accuracy", dtype=None)],
                  loss_weights=None, weighted_metrics=None,
                  run_eagerly=None, steps_per_execution=None)
    model_extra_summary(model)

    print(f"Training model for {epochs} epochs and with {batch_size} batch size.")
    model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
              verbose=0, callbacks=None, validation_split=0.0, validation_data=None,
              shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0,
              steps_per_epoch=None, validation_steps=None, validation_freq=1,
              max_queue_size=10, workers=1, use_multiprocessing=False)

    # DEBUG
    model.summary()

    # measure generalization error
    train_results = model.evaluate(x=x_train, y=y_train, batch_size=None, verbose=0,
                                   sample_weight=None, steps=None, callbacks=None,
                                   max_queue_size=10, workers=1,
                                   use_multiprocessing=False, return_dict=True)
    expected_results = model.evaluate(x=X, y=Y, batch_size=None, verbose=0,
                                      sample_weight=None, steps=None, callbacks=None,
                                      max_queue_size=10, workers=1,
                                      use_multiprocessing=False, return_dict=True)
    (Xtr_uniq, Ytr_uniq), (Xtest, Ytest) = retrieve_split((X, Y), inds)
    train_unique_results = model.evaluate(x=Xtr_uniq, y=Ytr_uniq, batch_size=None,
                                          verbose=0, sample_weight=None, steps=None,
                                          callbacks=None, max_queue_size=10, workers=1,
                                          use_multiprocessing=False, return_dict=True)

    train_risk = 1 - train_results["accuracy"]
    expected_risk = 1 - expected_results["accuracy"]
    train_unique_risk = 1 - train_unique_results["accuracy"]
    test_risk = 1. / len(Ytest) * (len(Y) * expected_risk -
                                   len(Ytr_uniq) * train_unique_risk)
    generalization = expected_risk - train_risk
    return {
        "train_risk": train_risk,
        "expected_risk": expected_risk,
        "generalization": generalization,
        "test_risk": test_risk,
        "train_unique_risk": train_unique_risk,
    }
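The test_risk expression above follows from a counting argument, assuming retrieve_split partitions (X, Y) into the unique sampled training points and the held-out remainder: the errors on the full set split into errors on those two parts, so len(Y) * expected_risk = len(Ytr_uniq) * train_unique_risk + len(Ytest) * test_risk, and solving for test_risk gives the expression computed in the code.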
            'average reward': np.mean(path['reward_totals']),
            'std of reward': np.std(path['reward_totals']),
            'approximate action entropy': approximate_entropy,
            'simulation steps': steps,
            'value loss': value_loss
        }
        self.env_creator.add_logging_data(log_data)
        self.log.step(log_data)
        self.log.print_step()


if __name__ == '__main__':
    log = loggy.Log("maze-h1-pggae")
    vpgae = VanillaPolicyGAE(
        model=(lambda *args, **varargs: models.mlp(*args, **varargs)),
        value_model=(lambda *args, **varargs: tf.squeeze(
            models.mlp(out_size=1, *args, **varargs), axis=1)),
        env_creator=schedules.ExploreCreatorSchedule(
            is_tree=False, history_size=1, id_size=1,
            reward_type='penalty+finished',
            scale_reward_by_difficulty=False),
        # env_creator=schedules.DummyGymSchedule('Acrobot-v1'),
        lr_schedule=(lambda t: 2e-4),
        value_lr_schedule=(lambda t: 2.4e-3),
        lambda_gae=.97,
        min_observations_per_step=4000,
        log=log,
        gamma=0.999,
        render=False,
        render_mod=128
    )
    vpgae.initialize_variables()
        self.log.step(log_data)
        self.log.print_step()


if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser(
        description="Train a network to navigate a discrete maze.")
    argument_parser.add_argument(
        "--history-size", default=1, type=int,
        help="Number of previous frames to give the network.")
    options = argument_parser.parse_args()

    log = loggy.Log("pg")
    vp = VanillaPolicy(
        model=(lambda *args, **varargs: models.mlp(
            hiddens=[64, 64], *args, **varargs)),
        # env_creator=schedules.ExploreCreatorSchedule(is_tree=False, history_size=options.history_size),
        env_creator=schedules.DummyGymSchedule('CartPole-v1'),
        lr_schedule=lambda t: 5e-3,
        min_observations_per_step=1000,
        log=log,
        gamma=1.0,
        render=True,
        render_mod=256)
    vp.initialize_variables()
    vp.optimize(100000)
    log.close()