def _get_operands_from(field):
    if isinstance(field, list):
        return [flat(map(_get_operands_from, field))]
    if not isinstance(field, dict):
        return [[]]
    if 'input' in field:
        return [[field['input']]]
    if 'agg' in field:
        return [field['operands']]
    ops = [x for op in field['operands'] for x in _get_operands_from(op) if x]
    return [flat(ops)] if field['op'] not in ('AND', 'OR') else ops

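# Hedged usage sketch for _get_operands_from (not part of the original module).
# The two cases below never reach the module-level `flat` helper; for any op
# other than AND/OR the operand groups are additionally merged via flat(ops).
def _example_get_operands():
    # A plain input field wraps the input name in a nested list.
    assert _get_operands_from({'input': 'col_a'}) == [['col_a']]
    # AND/OR keep each operand's names as a separate group.
    assert _get_operands_from(
        {'op': 'AND', 'operands': [{'input': 'a'}, {'input': 'b'}]}) == [['a'], ['b']]
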
def encoder(h, noise_std, update_BN):
    # Perform encoding for each layer
    h += tf.random_normal(tf.shape(h)) * noise_std
    h = tf.identity(h, "h0")
    for i, layer_spec in enumerate(layers):
        with tf.variable_scope("encoder_bloc_" + str(i + 1), reuse=tf.AUTO_REUSE):
            # Create an encoder bloc if the layer type is dense or conv2d
            if layer_spec["type"] == "flat":
                h = flat(h, output_name="h")
            elif layer_spec["type"] == "max_pool_2x2":
                h = max_pool_2x2(h, output_name="h")
            else:
                if i == L - 1:
                    activation = tf.nn.softmax  # Only for the last layer
                else:
                    activation = tf.nn.relu
                h = encoder_bloc(h, layer_spec, noise_std, update_BN=update_BN,
                                 activation=activation)
    y = tf.identity(h, name="y")
    return y

def _clipped_minimize(self, optimizer, loss, vars, grad_name=None):
    grads, _ = zip(*optimizer.compute_gradients(loss, vars))
    grads, _ = tf.clip_by_global_norm(grads, clip_norm=self.dconfig.clip_gradient)
    if grad_name is not None:
        tf.summary.histogram(grad_name, utils.flat(grads))
    return optimizer.apply_gradients(zip(grads, vars))

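# Hedged usage sketch for _clipped_minimize (TF1 graph mode assumed; `loss` is
# an existing scalar tensor and `self.dconfig.clip_gradient` must be set on the
# owning object; the optimizer and variable list below are illustrative):
#   opt = tf.train.AdamOptimizer(learning_rate=1e-3)
#   train_op = self._clipped_minimize(opt, loss, tf.trainable_variables(),
#                                     grad_name='policy_grads')
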
def process_dataset(dataset, n_clusters, pca_enabled=False):
    X = np.load("./data/" + dataset + '.npy')
    if pca_enabled:
        pca = utils.load_model(dataset + '_pca')
        X = pca.transform(utils.flat(X)).reshape(X.shape[0], X.shape[1], -1)
    segments = segmentation(X, n_clusters)
    np.save("./results/" + dataset + "_segments.npy", segments)
    color_segments = color_true_map(segments, back_color=[1, 1, 1])
    save_image(color_segments, dataset + "_segments")
    print("Segments:", len(np.bincount(segments.reshape(-1))) - 1)

def generate_work_data(dataset, labels, colors, parameters, pca_enabled=False):
    X_img = np.load('./data/' + dataset + '.npy')
    y_img = np.load('./data/' + dataset + '_labels.npy')
    save_image(color_true_map(y_img, labels_colors=colors), dataset + "_labels")
    X = utils.flat(X_img)
    y = utils.flat(y_img)
    train_ratio, val_ratio = 0.1, 0.1
    test_ratio = 1 - (train_ratio + val_ratio)
    tv_mask, test_mask = utils.balanced_train_test_mask(
        y, np.isin(y, labels), test_ratio)
    train_mask, val_mask = utils.balanced_train_test_mask(
        y, tv_mask, val_ratio / (val_ratio + train_ratio))
    np.save("./data/" + dataset + "_train_mask.npy", train_mask)
    np.save("./data/" + dataset + "_val_mask.npy", val_mask)
    np.save("./data/" + dataset + "_test_mask.npy", test_mask)
    if pca_enabled:
        pca = utils.pca(X[tv_mask, :], 0.99)
        utils.save_model(pca, dataset + '_pca')
        train = pca.transform(X[train_mask, :])
        test = pca.transform(X[test_mask])
        flat = pca.transform(X)
    else:
        train = X[train_mask, :]
        test = X[test_mask, :]
        flat = X
    svc = utils.svc(train, y[train_mask], parameters["C"], parameters["gamma"])
    utils.save_model(svc, dataset + '_svc')
    test_pred = svc.predict(test)
    np.save("./data/" + dataset + "_test_pred.npy", test_pred)
    classification = svc.predict(flat).reshape(y_img.shape).astype(np.uint8)
    np.save("./data/" + dataset + "_clasification.npy", classification)
    save_image(color_true_map(classification, labels_colors=colors),
               dataset + "_clasification")
    score = utils.balanced_score(y[test_mask], test_pred)
    utils.save_json({"original": score}, dataset + "_original_score")
    print("Test Score:", score)

def cpu_p_v2(*F, N=(500, 5000, 500)):
    '''Shows a plot of the running time of the functions *F, for inputs whose
    size varies over the range N=(500, 5000, 500). A parameter E = g defines
    the function that generates the inputs.'''
    X = [x for x in range(N[0], N[1] + 1, N[2])]
    Y = list(map(lambda f: [(f, sample(range(0, i), i)) for i in X], F))
    Y = flat(Y)
    inicio = time()
    pool = multiprocessing.Pool()
    Y = pool.map(executar_teste, Y)
    print('\nTotal time: %.1fs' % (time() - inicio))
    Y = [[Y[j + i * (len(Y) // len(F))] for j in range(len(Y) // len(F))]
         for i in range(len(F))]
    mostrar_gráfico(X, Y, F)

def action_loss(logits, action, criterion, log=None):
    """
    Sum of losses of one hot vectors encoding an action
    :param logits: network output vector of [action, [[type_i, ent_i], for i in ents]]
    :param action: target vector size [7]
    :param criterion: loss function
    :return:
    """
    losses = []
    for idx, action_part in enumerate(flat(action)):
        tgt = _variable(torch.LongTensor([action_part]))
        losses.append(criterion(logits[idx], tgt))
    loss = torch.stack(losses, 0).mean()
    if log is not None:
        sl.log_loss(losses, loss)
    return loss

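# Hedged smoke-test sketch for action_loss (assumes the module-level `flat`
# flattens the nested action to one class index per logit head and that
# `_variable` wraps a tensor for the current device; shapes are illustrative):
#   heads = [torch.randn(1, 10, requires_grad=True) for _ in range(7)]
#   target = [3, [1, 4], [0, 2], [5, 6]]     # flattens to 7 class indices
#   loss = action_loss(heads, target, nn.CrossEntropyLoss())
#   loss.backward()
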
def __init__(self, config, init_vars):
    import tensorflow as tf
    dconfig = utils.DotDict(config)
    plasma.load_plasma_tensorflow_op()
    store_socket = utils.get_store_socket()

    self.var_oid = None
    self.obj_vars = [
        tf.Variable(init_var, name='obj_var', dtype=tf.float32)
        for init_var in init_vars
    ]
    self.plasma_grads_oids = tf.placeholder(shape=[dconfig.agent_count],
                                            dtype=tf.string,
                                            name="plasma_grads_oids")
    self.plasma_vars_oid = tf.placeholder(shape=[], dtype=tf.string,
                                          name="plasma_vars_oids")
    shapes = [v.shape for v in self.obj_vars]
    grads = utils.reverse_flat(
        tf.reduce_mean([
            plasma.tf_plasma_op.plasma_to_tensor(
                self.plasma_grads_oids[a], dtype=tf.float32,
                plasma_store_socket_name=store_socket)
            for a in range(dconfig.agent_count)
        ], axis=0), shapes)
    obj_optimizer = tf.train.AdamOptimizer(
        learning_rate=dconfig.obj_func_learning_rate)
    self.train_obj_op = obj_optimizer.apply_gradients(zip(grads, self.obj_vars))
    with tf.control_dependencies([self.train_obj_op]):
        self.update_vars = plasma.tf_plasma_op.tensor_to_plasma(
            [utils.flat(self.obj_vars)], self.plasma_vars_oid,
            plasma_store_socket_name=store_socket)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())

def segmentation(img, n_clusters, sigma=0.3):
    fcm = FCM(n_clusters=n_clusters, max_iter=10000, m=2)
    fcm.fit(utils.flat(img))
    abun = fcm.u.reshape((img.shape[0], img.shape[1], n_clusters))
    masks = np.empty(abun.shape, dtype=bool)
    for i in range(n_clusters):
        thresh = filters.threshold_otsu(abun[:, :, i])
        filters.gaussian(abun[:, :, i], sigma=sigma, output=abun[:, :, i])
        masks[:, :, i] = abun[:, :, i] > thresh
    masks[masks.sum(axis=2) > 1, :] = 0
    label_imgs = [np.zeros(img.shape, dtype=np.uint8)]
    for i in range(n_clusters):
        binary_opening(masks[:, :, i], out=masks[:, :, i])
        label_img = label(masks[:, :, i])
        label_img[label_img > 0] += np.max(label_imgs[-1])
        label_imgs.append(label_img)
    return np.dstack(label_imgs).sum(axis=2)

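# Hedged illustration of the label-offset trick used above: offsetting each
# cluster's connected-component labels by the running maximum keeps region ids
# unique, so the per-cluster label images can simply be stacked and summed.
# Small numpy example (not from the original module):
#   a = np.array([[1, 0], [0, 0]])     # labels of cluster 0
#   b = np.array([[0, 0], [0, 1]])     # labels of cluster 1, before offset
#   b[b > 0] += a.max()                # -> [[0, 0], [0, 2]]
#   np.dstack([a, b]).sum(axis=2)      # -> [[1, 0], [0, 2]]
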
def vec_to_ix(self, _vec):
    """
    Input target vec representing cross entropy loss target [1 0 2 0 0 0 0]
    Returns a one hot version of it as training input [01 00, 100, 000, 000, 000]
    :param _vec:
    :return:
    """
    merged = flat(_vec)
    action = self.expr_o_h[merged[0]].unsqueeze(0)
    ix_ent = []
    merged = merged[1:]
    if self.mapping is not None:
        expr_str = EXPRESSIONS[_vec[0]]
        # print(expr_str)
        mp = self.mapping[expr_str]
        permute = mp['permute'] if 'permute' in mp else None
        insert_ = mp['insert'] if 'insert' in mp else None
        test = mp['tst'] if 'tst' in mp else None
        idx = mp['idx'] if 'idx' in mp else None
        if insert_ is not None and test is not None and idx is not None:
            text_ix = _vec[1:][test[0]]
            for i, insrt_ent in zip(idx, insert_):
                if test[1] in self.lookup_ix_to_expr(text_ix):
                    merged.insert(i, insrt_ent)
                else:
                    merged.append(insrt_ent)
        elif insert_ is not None and idx is None:
            for insrt_ent in insert_:
                merged.append(insrt_ent)
        if permute is not None:
            nperm = np.argsort(permute)
            merged = np.asarray(merged)[nperm]
    else:
        # print(len(self.ents_o_h))
        for idx in range(len(self.ents_o_h) - len(merged)):
            merged.append(0)
    for idx, value in enumerate(merged):
        ix_ent.append(self.ents_o_h[idx][value].unsqueeze(0))
    return torch.cat([action, torch.cat(ix_ent, 1)], 1)

def combined_ent_loss(logits, action, criterion, log=None):
    """
    Some hand tuning of penalties for illegal actions... trying to force
    learning of types. action type => type_e...
    :param logits: network output vector of one_hot distributions [action, [type_i, ent_i], for i in ents]
    :param action: target vector size [7]
    :param criterion: loss function
    :return:
    """
    losses = []
    for idx, action_part in enumerate(flat(action)):
        tgt = _variable(torch.Tensor([action_part]).float())
        losses.append(criterion(logits[idx], tgt))
    lfs = [losses[0]]
    n = 2
    for l in (losses[i:i + n] for i in range(1, len(losses), n)):
        lfs.append(torch.stack(l, 0).sum())
    loss = torch.stack(lfs, 0).mean()
    if log is not None:
        sl.log_loss(losses, loss)
    return loss

def merge_clauses(clauses):
    return Or(*flat(clause.sub_formulas for clause in clauses))

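# Hedged illustration (assumes Or(*args) exposes its arguments as
# .sub_formulas, as the call above implies; p, q, r are arbitrary formulas):
#   merge_clauses([Or(p, q), Or(r)])   # -> Or(p, q, r)
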
def visitConnector(self, obj):
    return flat(self.visit(form) for form in obj.sub_formulas)

def train_plan(args, data, DNC, lstm_state, optimizer):
    """
    Things to test after some iterations:
    - on planning phase and on with goals
    - chose a goal and work toward that
    :param args:
    :return:
    """
    criterion = nn.CrossEntropyLoss().cuda() if args.cuda is True else nn.CrossEntropyLoss()
    cum_correct, cum_total, prob_times, n_success = [], [], [], 0
    penalty = 1.1
    for trial in range(args.iters):
        start_prob = time.time()
        phase_masks = data.make_new_problem()
        n_total, n_correct, prev_action, loss, stats = 0, 0, None, 0, []
        dnc_state = DNC.init_state(grad=False)
        lstm_state = DNC.init_rnn(grad=False)  # lstm_state,
        optimizer.zero_grad()
        for phase_idx in phase_masks:
            if phase_idx == 0 or phase_idx == 1:
                inputs = _variable(data.getitem_combined())
                logits, dnc_state, lstm_state = DNC(inputs, lstm_state, dnc_state)
                _, prev_action = data.strip_ix_mask(logits)
            elif phase_idx == 2:
                mask = _variable(data.getmask())
                inputs = torch.cat([mask, prev_action], 1)
                logits, dnc_state, lstm_state = DNC(inputs, lstm_state, dnc_state)
                _, prev_action = data.strip_ix_mask(logits)
            else:
                # sample from best moves
                actions_star, all_actions = data.get_actions(mode='both')
                if not actions_star:
                    break
                if args.zero_at == 'step':
                    optimizer.zero_grad()
                mask = data.getmask()
                prev_action = prev_action.cuda() if args.cuda is True else prev_action
                pr = u.depackage(prev_action)
                final_inputs = _variable(torch.cat([mask, pr], 1))
                logits, dnc_state, lstm_state = DNC(final_inputs, lstm_state, dnc_state)
                exp_logits = data.ix_input_to_ixs(logits)
                guided = random.random() < args.beta  # thing 1
                if guided:  # guided loss
                    final_action, lstep = L.naive_loss(exp_logits, actions_star,
                                                       criterion, log=True)
                else:  # pick own move
                    final_action, lstep = L.naive_loss(exp_logits, all_actions,
                                                       criterion, log=True)
                # penalty for todo tests this !!!!
                action_own = u.get_prediction(exp_logits)
                if args.penalty and action_own not in [tuple(flat(t)) for t in all_actions]:
                    final_loss = lstep * _variable([args.penalty])
                else:
                    final_loss = lstep
                if args.opt_at == 'problem':
                    loss += final_loss
                else:
                    final_loss.backward(retain_graph=args.ret_graph)
                    if args.clip:
                        torch.nn.utils.clip_grad_norm(DNC.parameters(), args.clip)
                    optimizer.step()
                    loss = lstep
                data.send_action(final_action)
                if (trial + 1) % args.show_details == 0:
                    action_accs = u.human_readable_res(data, all_actions, actions_star,
                                                       action_own, guided, lstep.data[0])
                    stats.append(action_accs)
                n_total, _ = tick(n_total, n_correct, action_own, flat(final_action))
                n_correct += 1 if action_own in [tuple(flat(t)) for t in actions_star] else 0
                prev_action = data.vec_to_ix(final_action)
        if stats:
            arr = np.array(stats)
            correct = len([1 for i in list(arr.sum(axis=1)) if i == len(stats[0])]) / len(stats)
            sl.log_acc(list(arr.mean(axis=0)), correct)
        if args.opt_at == 'problem':
            floss = loss / n_total
            floss.backward(retain_graph=args.ret_graph)
            if args.clip:
                torch.nn.utils.clip_grad_norm(DNC.parameters(), args.clip)
            optimizer.step()
            sl.writer.add_scalar('losses.end', floss.data[0], sl.global_step)
        n_success += 1 if n_correct / n_total > args.passing else 0
        cum_total.append(n_total)
        cum_correct.append(n_correct)
        sl.add_scalar('recall.pct_correct', n_correct / n_total, sl.global_step)
        print("trial {}, step {} trial accy: {}/{}, {:0.2f}, running total {}/{}, "
              "running avg {:0.4f}, loss {:0.4f} ".format(
                  trial, sl.global_step, n_correct, n_total, n_correct / n_total,
                  n_success, trial, running_avg(cum_correct, cum_total), loss.data[0]))
        end_prob = time.time()
        prob_times.append(end_prob - start_prob)
    print("solved {} out of {} -> {}".format(n_success, args.iters, n_success / args.iters))
    return DNC, optimizer, lstm_state, running_avg(cum_correct, cum_total)

def visitAnd(self, obj):
    return And(*flat(self.visit(form).sub_formulas for form in obj.sub_formulas))

def future_policy_value(self, x, a, trans, seq_len, seq_mask, agent, opt,
                        create_summary=False):
    """
    Computes the value of a policy according to the critic when updated using the objective function
    :param x: observations
    :param a: actions
    :param trans: entire tuple of transition (s_t, a_t, r_t, d_t, s_{t+1})
    :param seq_len: Length of trajectories
    :param seq_mask: Binary mask of trajectories
    :param agent: agent to compute value for
    :param opt: optimizer to use for the policy update
    :param create_summary: whether to create summary ops
    :return: tensor of batched future policy value
    """
    with tf.variable_scope('future_policy_value'):
        policy = agent.main.policy
        policy_vars = policy.trainable_variables
        # The replace manager can replace the policy variables with updated variables
        replace_manager = policy.variable_scope.custom_getter

        use_adam = self.dconfig.obj_func_second_order_adam
        step_size = self.dconfig.obj_func_second_order_stepsize
        step_count = self.dconfig.obj_func_second_order_steps + 1
        batch_size = self.dconfig.buffer_sample_size

        # Split tensors according to number of inner gradient descent steps
        x_s = tf.split(x, step_count, axis=0)
        a_s = tf.split(a, step_count, axis=0)
        if seq_len is not None:
            seq_len_s = tf.split(seq_len, step_count, axis=0)
            seq_mask_s = tf.split(seq_mask, step_count, axis=0)
        else:
            seq_len_s = utils.ConstArray()
            seq_mask_s = utils.ConstArray(seq_mask)
        trans_s = list(zip(*(tf.split(e, step_count, axis=0) for e in trans)))

        objective_val = None
        policy_grads = None
        opt_args_dict = {}
        current_vars = policy_vars
        var_names = [var.op.name for var in policy_vars]
        for i in range(step_count - 1):
            # Run policy
            policy_result = policy(x_s[i], seq_len=seq_len_s[i])
            # Run objective
            objective_val = self.objective(x_s[i], a_s[i], trans_s[i], seq_len_s[i],
                                           seq_mask_s[i], agent, policy_result,
                                           create_summary)
            # Compute policy gradients
            policy_grads = tf.gradients(objective_val * seq_mask_s[i], current_vars)

            if use_adam:
                def grad_transform(grad, var, var_name):
                    if var_name in opt_args_dict:
                        opt_args = opt_args_dict[var_name]
                    else:
                        opt_args = []
                    new_grad, *opt_args = opt.adapt_gradients(grad, var, *opt_args,
                                                              lr=step_size)
                    opt_args_dict[var_name] = opt_args
                    return new_grad
            else:
                def grad_transform(grad, *args):
                    return step_size * grad

            # Use adam or vanilla SGD for inner gradient step
            transformed_grads = [
                grad_transform(grad, var, var_name)
                for grad, var, var_name in zip(policy_grads, current_vars, var_names)
            ]
            one_step_updated_policy_vars = [
                var - grad for var, grad in zip(current_vars, transformed_grads)
            ]
            one_step_updated_policy_vars_dict = OrderedDict(
                zip(var_names, one_step_updated_policy_vars))
            # Updates replace manager to run policy with updated variables in the next loop iteration
            replace_manager.replace_dict = one_step_updated_policy_vars_dict
            current_vars = one_step_updated_policy_vars

        # Run policy with final parameters
        future_policy = policy(x, seq_len=seq_len)
        replace_manager.replace_dict = None

        # Estimate the final policy value
        future_policy_value = agent.main.critic(x, future_policy.action) * seq_mask

        if create_summary:
            orig_policy = policy(x_s[-1], seq_len=seq_len_s[-1])
            partial_future_policy_value = future_policy_value[-batch_size:]
            tf.summary.histogram('objective_value', objective_val)
            tf.summary.histogram('policy_grads', utils.flat(policy_grads))
            tf.summary.histogram('policy_value', orig_policy.value)
            tf.summary.histogram('future_policy_value', partial_future_policy_value)
            tf.summary.histogram('policy_value_gain',
                                 partial_future_policy_value - orig_policy.value)
            sample_axis = [0, 1] if self.dconfig.recurrent_time_steps > 1 else 0
            cor = utils.correlation(-orig_policy.value, objective_val, sample_axis)
            tf.summary.scalar('objective_critic_correlation', tf.squeeze(cor))
            grad, = tf.gradients(objective_val, policy_result.value)
            if grad is not None:
                tf.summary.histogram('objective_critic_grads', grad)

        return future_policy_value

def __getitem__(self, key):
    if key not in self:
        return self._wrapper([key])
    else:
        return self._wrapper(flat([self[x] for x in self._aliases[key]]))

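# Hedged illustration (assumes membership in the container tracks the alias
# table, `_aliases` maps a known key to its member keys, and `_wrapper` wraps
# the resolved, flattened list; names are taken from the code above):
#   obj['x']      # unknown key -> self._wrapper(['x'])
#   obj['rgb']    # alias key   -> self._wrapper(flat([obj['r'], obj['g'], obj['b']]))
#                 #                where _aliases['rgb'] == ['r', 'g', 'b']
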
def _setup(self, dconfig, logdir):
    """
    Create tensorflow graph and summary writer
    :param dconfig: configuration to use to build the graph
    :param logdir: log directory to write tensorflow logs to
    """
    env = gym.make(dconfig.env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    agent = Agent(dconfig, env)
    objective = Objective(dconfig)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 size=dconfig.buffer_size,
                                 discount_factor=dconfig.discount_factor)
    time = dconfig.recurrent_time_steps if dconfig.recurrent_time_steps > 1 else None

    # Create datasets from replay buffer
    replay_buffer_dataset = replay_buffer.create_dataset(
        dconfig.buffer_sample_size, time)
    replay_buffer_dataset_iterator = replay_buffer_dataset.make_initializable_iterator()

    # If we perform multiple gradient steps in the inner loop, provide different data for each step
    large_batch_size = (self.dconfig.obj_func_second_order_steps + 1) * dconfig.buffer_sample_size
    large_replay_buffer_dataset = replay_buffer.create_dataset(
        large_batch_size, time)
    large_replay_buffer_dataset_iterator = large_replay_buffer_dataset.make_initializable_iterator()

    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(
        handle, replay_buffer_dataset.output_types,
        replay_buffer_dataset.output_shapes)
    itr_elem = utils.DotDict(iterator.get_next())
    x_ph, a_ph, x2_ph, r_ph, d_ph, lens_ph = itr_elem.obs1, itr_elem.acts, itr_elem.obs2, \
        itr_elem.rews, itr_elem.done, itr_elem.lens

    # Mask for different trajectory lengths
    if lens_ph is not None:
        seq_mask = tf.sequence_mask(lens_ph, time, dtype=tf.float32)
    else:
        seq_mask = tf.ones([], dtype=tf.float32)

    x_ph_behv = placeholder(obs_dim, name='ObsBehavior')
    timestep = tf.placeholder(tf.float32, [], 'timestep')

    if dconfig.policy_is_recurrent:
        state_shape = [2, 1, dconfig.policy_units]
        init_policy_state = tf.placeholder_with_default(
            tf.zeros(state_shape), [2, 1, dconfig.policy_units])
    else:
        init_policy_state = None

    transition = [x_ph, a_ph, x2_ph, r_ph[..., tf.newaxis], d_ph[..., tf.newaxis]]

    # Learning rate annealing
    if dconfig.policy_update_start:
        base = dconfig.policy_lr_annealing_base
        lr_progress = (base**tf.minimum(
            1.0, timestep / dconfig.policy_update_start) - 1) / (base - 1)
    else:
        lr_progress = 1

    # Optimizers
    pi_optimizer = utils.TensorAdamOptimizer(
        learning_rate=dconfig.policy_learning_rate * lr_progress)
    q_optimizer = tf.train.AdamOptimizer(
        learning_rate=dconfig.critic_learning_rate)
    obj_optimizer = tf.train.AdamOptimizer(
        learning_rate=dconfig.obj_func_learning_rate)

    # Main outputs from computation graph
    main = agent.main
    policy = main.policy(x_ph, seq_len=lens_ph)
    pi_action = policy.action
    q1_pi = policy.value
    pi_behv = main.policy(x_ph_behv[:, tf.newaxis], initial_state=init_policy_state)
    q1 = main.critic(x_ph, a_ph)
    q2 = main.critic2(x_ph, a_ph)
    obj = objective.objective(x_ph, a_ph, transition, lens_ph, seq_mask, agent, policy)

    # Target policy network
    pi_action_targ = agent.target.policy(x2_ph, seq_len=lens_ph).action

    # Target Q networks
    # Target policy smoothing, by adding clipped noise to target actions
    epsilon = tf.random_normal(tf.shape(pi_action_targ), stddev=dconfig.critic_noise)
    epsilon = tf.clip_by_value(epsilon, -dconfig.critic_noise_clip,
                               dconfig.critic_noise_clip)
    a2 = pi_action_targ + epsilon
    a2 = tf.clip_by_value(a2, -act_limit, act_limit)
    q1_targ = agent.target.critic(x2_ph, a2)
    q2_targ = agent.target.critic2(x2_ph, a2)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    gamma = dconfig.discount_factor
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ + d_ph)

    # Objective function annealing
    if dconfig.obj_func_anneal_steps:
        progress = tf.minimum(1.0, timestep / dconfig.obj_func_anneal_steps)
        obj = progress * obj - (1 - progress) * q1_pi

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi * seq_mask)
    pi_obj_loss = tf.reduce_mean(obj * seq_mask)
    q1_loss = tf.reduce_mean((q1 - backup)**2 * seq_mask)
    q2_loss = tf.reduce_mean((q2 - backup)**2 * seq_mask)
    q_loss = q1_loss + q2_loss

    main_vars = sorted(get_vars('main', trainable_only=False), key=lambda v: v.name)
    target_vars = sorted(get_vars('target', trainable_only=False), key=lambda v: v.name)

    # Train policy directly using critic
    train_pi_op = self._clipped_minimize(pi_optimizer, pi_loss,
                                         get_vars('main/policy'),
                                         grad_name='ddpg_policy_grads')
    # Train policy using objective function
    train_pi_obj_op = self._clipped_minimize(pi_optimizer, pi_obj_loss,
                                             get_vars('main/policy'),
                                             grad_name='objective_policy_grads')
    # Train critic
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/critic'))

    tf.summary.histogram('policy_params', utils.flat(get_vars('main/policy')))

    # Objective function loss
    q1_obj = objective.future_policy_value(x_ph, a_ph, transition, lens_ph, seq_mask,
                                           agent, pi_optimizer,
                                           create_summary=dconfig.obj_func_enabled)
    obj_loss = -tf.reduce_mean(q1_obj)

    # Objective function optimization using ray (send gradients to ObjectiveServer)
    obj_vars = get_vars('objective')
    store_socket = utils.get_store_socket()
    shapes = [v.shape for v in obj_vars]
    plasma_var_oid = tf.placeholder(shape=[], dtype=tf.string, name="plasma_var_oid")
    retrieved_vars = utils.reverse_flat(
        plasma.tf_plasma_op.plasma_to_tensor(
            plasma_var_oid, dtype=tf.float32,
            plasma_store_socket_name=store_socket), shapes)
    # Op to read new objective parameters from ray object store
    plasma_read_vars = [
        var.assign(retrieved) for var, retrieved in zip(obj_vars, retrieved_vars)
    ]

    grads, vars = zip(*obj_optimizer.compute_gradients(obj_loss, obj_vars))
    grads, _ = tf.clip_by_global_norm(grads, clip_norm=dconfig.clip_gradient)
    tf.summary.histogram('objective_params', utils.flat(vars))
    tf.summary.histogram('objective_param_grads', utils.flat(grads))
    objective_grads = grads
    # Op to send gradients to ObjectiveServer
    train_obj_op = obj_optimizer.apply_gradients(zip(objective_grads, vars))
    plasma_grad_oid = tf.placeholder(shape=[], dtype=tf.string, name="plasma_grad_oid")
    # Op to send gradients to ObjectiveServer
    plasma_write_grads = plasma.tf_plasma_op.tensor_to_plasma(
        [utils.flat(objective_grads)], plasma_grad_oid,
        plasma_store_socket_name=store_socket)

    # Print number of parameters
    print(f'''
===================================================================
Parameters
Policy {np.sum(np.prod(v.shape) for v in get_vars('main/policy'))}
Critic {np.sum(np.prod(v.shape) for v in get_vars('main/critic'))}
Objective {np.sum(np.prod(v.shape) for v in obj_vars)}
===================================================================
''')

    # Polyak averaging for target variables
    polyak = 1 - dconfig.target_network_update_speed
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(main_vars, target_vars)
    ])

    # Initializing target networks to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(main_vars, target_vars)
    ])

    # Ops for copying and resetting the policy (currently not used)
    reset_policy = tf.variables_initializer(get_vars('main'))
    copy_policy = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Summaries
    tflog_utils.log_scalars(policy_loss=pi_loss, q_loss=q_loss)
    if dconfig.obj_func_enabled:
        tflog_utils.log_scalars(policy_obj_loss=pi_obj_loss, objective_loss=obj_loss)

    self.restore_savers = self._create_restore_savers(dconfig)
    self.saver = tf.train.Saver(max_to_keep=1000, save_relative_paths=True)
    self.summary = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(f'{logdir}_agent{self.worker_index}')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    init_ops = [target_init]
    self.sess.run(init_ops)

    rb_handle, large_rb_handle = self.sess.run([
        replay_buffer_dataset_iterator.string_handle(),
        large_replay_buffer_dataset_iterator.string_handle()
    ])

    # Return all created tf ops
    return utils.DotDict(locals())

def _antialias(cls, columns):
    return set(flat(map(cls._aliases, columns)))