def load_policy(model_path, input_dim, output_dim, num_hidden, num_layers,
                init_logstd=1., discrete=False, beta=1.0):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                trainable_variance=True,
                                state_dependent_variance=True,
                                beta=beta,
                                init_logstd=init_logstd)()
    U.initialize()
    policy_train.load(model_path)
    return policy_train
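

# Usage sketch for the load_policy above. The checkpoint path, dimensions,
# and network sizes are hypothetical placeholders; the step() unpacking
# follows the pi.step(...) call that appears later in this section.
def _example_load_and_step():
    policy = load_policy('checkpoints/bc_policy',  # hypothetical path
                         input_dim=10, output_dim=2,
                         num_hidden=32, num_layers=2, discrete=False)
    obs = np.zeros((1, 10), dtype=np.float32)
    logits, action, state, neglogp = policy.step(obs, stochastic=True)
    return action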
def load_policy(X_dim, model, num_actions=4, continuous=False, n_bases=50,
                beta=1., trainable_variance=False, init_logstd=-0.4,
                linear=False, num_layers=0, num_hidden=16):
    print(num_layers)
    if linear:
        # linear Gaussian controller: weights come straight from disk
        policy_train = LinearGaussianPolicy()
        with open(model, "rb") as f:
            if model.endswith('.npy'):
                K = np.load(f)
            else:
                print(f)
                K = pickle.load(f)
        policy_train.set_weights(K.T, np.e**init_logstd)
        return policy_train
    observation_space = Box(low=-np.inf, high=np.inf, shape=(X_dim, ))
    if continuous:
        action_space = Box(low=-1 * np.ones(num_actions),
                           high=np.ones(num_actions))
    else:
        action_space = Discrete(num_actions)
    tf.reset_default_graph()
    sess = U.make_session(make_default=True)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    if 'checkpoint' in model:
        pi = build_policy(observation_space, action_space, network,
                          train=False, beta=beta,
                          trainable_variance=trainable_variance,
                          init_logstd=init_logstd)
        with tf.variable_scope('pi'):
            policy_train = pi()
        U.initialize()
        policy_train.load(model)
    else:
        try:
            pi = build_policy(observation_space, action_space, network,
                              train=False, beta=beta,
                              trainable_variance=trainable_variance,
                              init_logstd=init_logstd)
            policy_train = pi()
            U.initialize()
            policy_train.load(model)
        except KeyError:
            # loading failed; rebuild the graph inside the 'pi' variable
            # scope and retry
            sess.close()
            tf.reset_default_graph()
            sess = U.make_session(make_default=True)
            network = mlp(num_hidden=num_hidden, num_layers=num_layers)
            pi = build_policy(observation_space, action_space, network,
                              train=False, beta=beta,
                              trainable_variance=trainable_variance,
                              init_logstd=init_logstd)
            with tf.variable_scope('pi'):
                policy_train = pi()
            U.initialize()
            policy_train.load(model)
    return policy_train
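

# Usage sketch for the linear branch above: loading a linear Gaussian
# controller stored as a .npy weight matrix. The path and dimensions are
# hypothetical placeholders.
def _example_load_linear():
    return load_policy(X_dim=4, model='weights/K.npy', num_actions=1,
                       continuous=True, linear=True, init_logstd=-0.4)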
def bc_twitter(states, actions, args, tf_path, model_path):
    tweet_states = states[actions == 1]
    tweet_actions = actions[actions == 1]
    nop_states = states[actions == 0]
    nop_actions = actions[actions == 0]
    num_tweet = (actions == 1).sum()
    num_nop = (actions == 0).sum()
    # oversample the minority class until the two classes are within
    # args.ratio of each other (at most 10 duplication rounds)
    if num_nop > args.ratio * num_tweet:
        for i in range(min(int((num_nop / num_tweet) // 2), 10)):
            states = np.concatenate((states, tweet_states))
            actions = np.concatenate((actions, tweet_actions))
    elif num_tweet > args.ratio * num_nop:
        for i in range(min(int((num_tweet / num_nop) // 2), 10)):
            states = np.concatenate((states, nop_states))
            actions = np.concatenate((actions, nop_actions))
    num_tweet = (actions == 1).sum()
    num_nop = (actions == 0).sum()
    if args.weight_classes:
        ratio = num_tweet / (num_tweet + num_nop)
        class_weights = [ratio, 1 - ratio]
    else:
        class_weights = None
    dataset = list(zip(states, actions))
    random.shuffle(dataset)
    observation_space = Box(low=-np.inf, high=np.inf, shape=(len(states[0]), ))
    action_space = Discrete(len(ACTIONS))
    tf.reset_default_graph()
    sess = U.make_session(make_default=True)
    network = mlp(num_hidden=args.num_hidden, num_layers=args.num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                l2=args.l2,
                                lr=args.lr,
                                train=True,
                                class_weights=class_weights)()
    U.initialize()
    writer = tf.summary.FileWriter(tf_path)
    if args.validation > 0.:
        k = math.floor(args.validation * len(dataset))
        dataset_training = dataset[:-k]
        dataset_validation = dataset[-k:]
    else:
        dataset_training = dataset[:]
    # pre-processing statistics
    num_batches = len(dataset_training) // args.batch_size
    if len(dataset_training) % args.batch_size > 0:
        num_batches += 1
    print('# batches: ', num_batches)
    print('# training samples: ', len(dataset_training))
    logger = {
        'training_samples': len(dataset_training),
        'batch_size': args.batch_size,
        'num_batches': num_batches,
        'num_epochs': args.num_epochs
    }
    if args.validation > 0.:
        print('# validation samples: ', len(dataset_validation))
        logger['validation_samples'] = len(dataset_validation)
        # validation samples built, split per action class
        X_val, y_val = zip(*dataset_validation)
        X_val, y_val = np.array(X_val), np.array(y_val)
        XX_val, yy_val = [], []
        for i in range(len(ACTIONS)):
            XX_val.append(X_val[y_val == i])
            yy_val.append(y_val[y_val == i])
    # train + accuracy over epochs
    counter = 0
    best_accuracy = 0
    for epoch in trange(args.num_epochs):
        # train batches built
        random.shuffle(dataset_training)
        batches = []
        for i in range(num_batches):
            base = args.batch_size * i
            batches.append(dataset_training[base:base + args.batch_size])
        # train
        try:
            for batch in batches:
                batch_X, batch_y = zip(*batch)
                output = policy_train.fit(batch_X, batch_y)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="loss", simple_value=output[0]),
                ])
                writer.add_summary(summary, counter)
                counter += 1
        except Exception as e:
            print("Error:", e)
        # validation
        if args.validation > 0.:
            overall_accuracy = 0
            for i in range(len(ACTIONS)):
                accuracy = 0.  # default so a failed evaluation does not leave it unbound
                try:
                    accuracy, _, _, _ = policy_train.evaluate(
                        XX_val[i], yy_val[i], args.stochastic_eval)
                except Exception as e:
                    print(e)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="accuracy_" + ACTIONS[i],
                                     simple_value=accuracy),
                ])
                writer.add_summary(summary, epoch)
                overall_accuracy += accuracy
            overall_accuracy /= len(ACTIONS)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy_overall",
                                 simple_value=overall_accuracy),
            ])
            writer.add_summary(summary, epoch)
            if epoch % 10 == 0 and best_accuracy <= overall_accuracy:
                policy_train.save(model_path + 'best')
                best_accuracy = overall_accuracy
    with open(tf_path + '/log.txt', 'w') as log_file:
        log_file.write(str(logger))
    return policy_train, sess
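

# Usage sketch for bc_twitter above, with a minimal argparse-style
# namespace. All hyperparameter values and paths are hypothetical.
def _example_bc_twitter(states, actions):
    from argparse import Namespace
    args = Namespace(ratio=3., weight_classes=True, validation=0.2,
                     num_hidden=64, num_layers=2, l2=0., lr=1e-4,
                     batch_size=128, num_epochs=100, stochastic_eval=False)
    return bc_twitter(states, actions, args,
                      tf_path='logs/bc_twitter',        # TensorBoard dir
                      model_path='models/bc_twitter_')  # checkpoint prefix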
def load_policy(model_path, input_dim, output_dim, num_hidden, num_layers,
                init_logstd=1., discrete=False, beta=1.0, use_bias=True,
                X=None, Y=None):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                trainable_variance=True,
                                trainable_bias=use_bias,
                                state_dependent_variance=True,
                                beta=beta,
                                init_logstd=init_logstd)()
    U.initialize()
    if model_path != '':
        policy_train.load(model_path)
    accuracy_nops = accuracy_left = accuracy_right = -1.
    distribution = np.zeros(3)  # defined up front so the return works when X is None
    if X is not None:
        states = np.array(X)
        actions = np.array(Y)
        nops_state = states[actions == 0]
        right_state = states[actions == 2]
        left_state = states[actions == 1]
        nops_action = actions[actions == 0]
        right_action = actions[actions == 2]
        left_action = actions[actions == 1]
        classes = np.unique(Y)
        class_counts = np.array([np.sum(Y == cl) for cl in classes])
        for j, c in enumerate(classes):
            distribution[int(c)] = class_counts[j] / states.shape[0]
        accuracy_nops, _, loss, _ = policy_train.evaluate(
            nops_state, nops_action, False)
        print("Nops Accuracy:", accuracy_nops)
        if right_state.shape[0] > 0:
            accuracy_right, _, loss, _ = policy_train.evaluate(
                right_state, right_action, False)
            print("Right Accuracy:", accuracy_right)
        if left_state.shape[0] > 0:
            accuracy_left, _, loss, _ = policy_train.evaluate(
                left_state, left_action, False)
            print("Left Accuracy:", accuracy_left)
    return policy_train, [accuracy_nops, accuracy_left, accuracy_right], distribution
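

# Usage sketch for the load_policy above: loading a driving policy and
# reporting per-class accuracies on a held-out set X, Y with actions
# {0: nop, 1: left, 2: right}. The path and sizes are hypothetical.
def _example_load_with_eval(X, Y):
    policy, accuracies, distribution = load_policy(
        'checkpoints/driving_bc', input_dim=X.shape[-1], output_dim=3,
        num_hidden=64, num_layers=2, discrete=True, X=X, Y=Y)
    print('per-class accuracy [nop, left, right]:', accuracies)
    print('empirical class distribution:', distribution)
    return policy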
def behavioral_cloning_nn(num_epochs, num_layers, num_hidden, X, Y,
                          validation=0.2, lr=1e-4, l2=0., batch_size=128,
                          starting_point='', name='', beta=1.0):
    input_dim = X.shape[-1]
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    action_space = Discrete(n=np.max(np.unique(Y)) + 1)
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                l2=l2,
                                lr=lr,
                                beta=beta)()
    U.initialize()
    if starting_point != '':
        policy_train.load(starting_point)
    model_name = str(num_epochs) + '_' + str(num_layers) + '_' + str(num_hidden)
    tf_path = 'logs/tensorboards/' + name + '/' + model_name + '_' + str(
        time.time()) + '/'
    writer = tf.summary.FileWriter(tf_path)
    states = np.array(X)
    actions = np.array(Y)
    nops_state = states[actions == 0]
    right_state = states[actions == 2]
    left_state = states[actions == 1]
    not_nops_state = np.concatenate([right_state, left_state])
    nops_action = actions[actions == 0]
    right_action = actions[actions == 2]
    left_action = actions[actions == 1]
    not_nops_action = np.concatenate([right_action, left_action])
    dataset_not_nops = list(
        zip(np.array(not_nops_state), np.array(not_nops_action)))
    # check accuracy on lane changes only, since they are a tiny fraction
    # of the actions performed
    X_val, y_val = zip(*dataset_not_nops)
    print("Original Dataset Size:", states.shape[0])
    classes = np.unique(Y)
    class_counts = np.array([np.sum(Y == cl) for cl in classes])
    max_count = max(class_counts)
    ratios = class_counts / max_count
    print("Class Distribution:", class_counts / states.shape[0])
    print("Class ratios:", ratios)
    # oversample each minority class until it roughly matches the most
    # frequent class: int(1 / ratio) full copies plus a random remainder
    states_to_add = []
    actions_to_add = []
    for j, ratio in enumerate(ratios):
        if ratio != 1:
            for i in range(int(1 / ratio)):
                states_to_add += states[actions == classes[j]].tolist()
                actions_to_add += actions[actions == classes[j]].tolist()
            remaining = int((1 / ratio - int(1 / ratio)) * class_counts[j])
            all_indexes = np.arange(class_counts[j])
            random.shuffle(all_indexes)
            shuffled_indexes = all_indexes[0:remaining]
            states_to_add += states[actions == classes[j]][shuffled_indexes].tolist()
            actions_to_add += actions[actions == classes[j]][shuffled_indexes].tolist()
    states_to_add = np.array(states_to_add)
    actions_to_add = np.array(actions_to_add)
    states = np.concatenate([states, states_to_add], axis=0)
    actions = np.concatenate([actions, actions_to_add], axis=0)
    print("Oversampled Dataset Size", states.shape[0])
    logger = {'batch_size': batch_size, 'num_epochs': num_epochs}
    dataset = list(zip(states, actions))
    random.shuffle(dataset)
    dataset_training = dataset[:]
    # pre-processing statistics
    num_batches = len(dataset_training) // batch_size
    num_batches += (0 if len(dataset_training) % batch_size == 0 else 1)
    print('# batches: ', num_batches)
    print('# training samples: ', len(dataset_training))
    logger['num_batches'] = num_batches
    logger['training_samples'] = len(dataset_training)
    counter = 0
    for epoch in trange(num_epochs):
        # train batches built
        random.shuffle(dataset_training)
        batches = []
        for i in range(num_batches):
            base = batch_size * i
            batches.append(dataset_training[base:base + batch_size])
        # per-epoch accuracy on the lane-change validation set
        accuracy, _, loss, _ = policy_train.evaluate(X_val[:], y_val[:], False)
        summary = tf.Summary(value=[
            tf.Summary.Value(tag="accuracy validation", simple_value=accuracy),
        ])
        writer.add_summary(summary, epoch)
        # train
        for batch in batches:
            batch_X, batch_y = zip(*batch)
            target = batch_y
            output = policy_train.fit(batch_X, target)
            summaries = [
                tf.Summary.Value(tag="loss", simple_value=output[0]),
                tf.Summary.Value(tag="r2", simple_value=output[1]),
                tf.Summary.Value(tag="entropy", simple_value=output[2]),
                tf.Summary.Value(tag="stochastic_accuracy",
                                 simple_value=output[3])
            ]
            summary = tf.Summary(value=summaries)
            writer.add_summary(summary, counter)
            counter += 1
        # validation
        if validation > 0.:
            accuracy, _, loss, _ = policy_train.evaluate(X_val[:], y_val[:], False)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy no nops", simple_value=accuracy),
            ])
            writer.add_summary(summary, epoch)
    # save the final model
    policy_train.save(tf_path + '/best')
    accuracy, _, loss, _ = policy_train.evaluate(nops_state, nops_action, False)
    print("Nops Accuracy:", accuracy)
    if right_state.shape[0] > 0:
        accuracy, _, loss, _ = policy_train.evaluate(right_state, right_action, False)
        print("Right Accuracy:", accuracy)
    if left_state.shape[0] > 0:
        accuracy, _, loss, _ = policy_train.evaluate(left_state, left_action, False)
        print("Left Accuracy:", accuracy)
    return policy_train, logger, tf_path
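

# Usage sketch for the discrete behavioral_cloning_nn above; X is an
# (N, obs_dim) float array, Y an (N,) int array of action labels. The
# hyperparameters and run name are hypothetical.
def _example_bc_discrete(X, Y):
    policy, logger, tf_path = behavioral_cloning_nn(
        num_epochs=200, num_layers=2, num_hidden=64, X=X, Y=Y,
        validation=0.2, lr=1e-4, batch_size=128, name='driving')
    print('training log:', logger, 'tensorboard dir:', tf_path)
    return policy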
    dx, dy = [0, 0]
    return dx, dy

policy = input_policy

if args.run_policy or args.debug_model:
    state_space = np.prod(env.observation_space.shape)
    observation_space = Box(low=-np.inf, high=np.inf, shape=(state_space, ))
    action_space = Discrete(action_dim)
    tf.reset_default_graph()
    network = mlp(num_hidden=state_space, num_layers=0)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                train=False,
                                init_bias=0.,
                                trainable_bias=False)
    pi = policy_train()
    U.initialize()
    if not args.random:
        log("loading_model")
        theta = np.load(args.model)
        print(theta.shape)
        pi.set_theta(np.ravel(theta))

    def linear_policy():
        #if (np.random.uniform(0, 1) < args.epsilon):
        #    return np.random.randint(0, action_dim)
        s = env.get_state(rbf=True)
        logits, a, state, neglogp = pi.step(s, stochastic=True)
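
# Sketch (assumption): linear_policy is truncated above; given the
# (logits, action, state, neglogp) tuple returned by pi.step, a typical
# completion would simply return the sampled action.
def _example_step_to_action(pi, s):
    logits, a, state, neglogp = pi.step(s, stochastic=True)
    return a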
def behavioral_cloning_nn(num_epochs, num_layers, num_hidden, X, Y,
                          validation=0.2, lr=1e-4, l2=0., batch_size=128,
                          init_logstd=1., state_dependent_variance=True,
                          starting_point='', discrete=False, beta=1.0):
    input_dim = X.shape[-1]
    output_dim = Y.shape[-1]  # only meaningful for continuous actions (Y of shape (N, act_dim))
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=len(np.unique(Y)))
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(
        observation_space,
        action_space,
        network,
        l2=l2,
        lr=lr,
        trainable_variance=state_dependent_variance,
        init_logstd=init_logstd,
        beta=beta,
        state_dependent_variance=state_dependent_variance)()
    U.initialize()
    if starting_point != '':
        policy_train.load(starting_point)
    # dataset build
    states = X
    actions = Y
    if discrete:
        # oversample minority classes so every class roughly matches the
        # most frequent one
        print("Original Dataset Size:", states.shape[0])
        classes = np.unique(Y)
        class_counts = np.array([np.sum(Y == cl) for cl in classes])
        max_count = max(class_counts)
        ratios = class_counts / max_count
        print("Class Distribution:", class_counts / states.shape[0])
        print("Class ratios:", ratios)
        states_to_add = []
        actions_to_add = []
        for j, ratio in enumerate(ratios):
            if ratio != 1:
                for i in range(int(1 / ratio)):
                    states_to_add += states[actions == classes[j]].tolist()
                    actions_to_add += actions[actions == classes[j]].tolist()
                remaining = int((1 / ratio - int(1 / ratio)) * class_counts[j])
                all_indexes = np.arange(class_counts[j])
                random.shuffle(all_indexes)
                shuffled_indexes = all_indexes[0:remaining]
                states_to_add += states[actions == classes[j]][shuffled_indexes].tolist()
                actions_to_add += actions[actions == classes[j]][shuffled_indexes].tolist()
        states_to_add = np.array(states_to_add)
        actions_to_add = np.array(actions_to_add)
        states = np.concatenate([states, states_to_add], axis=0)
        actions = np.concatenate([actions, actions_to_add], axis=0)
        print("Oversampled Dataset Size", states.shape[0])
    dataset = list(zip(states, actions))
    random.shuffle(dataset)
    if validation > 0.:
        k = math.floor(validation * len(dataset))
        dataset_training = dataset[:-k]
        dataset_validation = dataset[-k:]
    else:
        dataset_training = dataset[:]
    # pre-processing statistics
    num_batches = len(dataset_training) // batch_size
    num_batches += (0 if len(dataset_training) % batch_size == 0 else 1)
    print('# batches: ', num_batches)
    print('# training samples: ', len(dataset_training))
    logger = {
        'training_samples': len(dataset_training),
        'batch_size': batch_size,
        'num_batches': num_batches,
        'num_epochs': num_epochs
    }
    if validation > 0.:
        print('# validation samples: ', len(dataset_validation))
        logger['validation_samples'] = len(dataset_validation)
        # validation samples built
        X_val, y_val = zip(*dataset_validation)
        X_val, y_val = np.array(X_val), np.array(y_val)
    # train + accuracy over epochs
    counter = 0
    best_loss = np.inf
    for epoch in trange(num_epochs):
        # train batches built
        random.shuffle(dataset_training)
        batches = []
        for i in range(num_batches):
            base = batch_size * i
            batches.append(dataset_training[base:base + batch_size])
        # track the best validation loss before this epoch's updates
        if validation > 0.:
            target = y_val
            accuracy, _, loss, _ = policy_train.evaluate(X_val[:], target, False)
            if loss <= best_loss:
                best_loss = loss
        # train
        for batch in batches:
            batch_X, batch_y = zip(*batch)
            target = batch_y
            output = policy_train.fit(batch_X, target)
            summaries = [
                tf.Summary.Value(tag="loss", simple_value=output[0]),
                tf.Summary.Value(tag="r2", simple_value=output[1])
            ]
            if not discrete:
                summaries += [
                    tf.Summary.Value(tag="mean_std", simple_value=output[2]),
                    tf.Summary.Value(tag="min_std", simple_value=output[3]),
                    tf.Summary.Value(tag="max_std", simple_value=output[4])
                ]
            else:
                summaries += [
                    tf.Summary.Value(tag="entropy", simple_value=output[2]),
                    tf.Summary.Value(tag="stochastic_accuracy",
                                     simple_value=output[3])
                ]
            # NOTE: this variant creates no FileWriter, so these summaries
            # are built but never written to disk
            counter += 1
        # validation
        if validation > 0.:
            target = y_val
            accuracy, _, loss, _ = policy_train.evaluate(X_val[:], target, False)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy", simple_value=accuracy),
                tf.Summary.Value(tag="test_loss", simple_value=loss)
            ])
            if loss <= best_loss:
                best_loss = loss
    # final evaluation on the full dataset
    batch_X, batch_Y = zip(*dataset)
    _, _, loss, ll = policy_train.evaluate(batch_X[:], batch_Y[:], False)
    logger['cost'] = loss
    logger['ll'] = ll
    return policy_train, logger, None
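

# Usage sketch for the continuous variant above; X is (N, obs_dim), Y is
# (N, act_dim) real-valued actions. The hyperparameters are hypothetical.
def _example_bc_continuous(X, Y):
    policy, logger, _ = behavioral_cloning_nn(
        num_epochs=100, num_layers=2, num_hidden=64, X=X, Y=Y,
        validation=0.2, init_logstd=-1., state_dependent_variance=True)
    print('final cost:', logger['cost'], 'log-likelihood:', logger['ll'])
    return policy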