def create_actor_network(self, state_size, action_dim):
    log('[DDPG] Building the actor model')
    S = Input(shape=[state_size])
    # Hidden layers use the fan-in uniform initialization from the DDPG paper:
    # U(-1/sqrt(fan_in), 1/sqrt(fan_in))
    h0 = Dense(HIDDEN1_UNITS, activation='relu',
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(state_size),
                                                maxval=1.0 / np.sqrt(state_size)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(state_size),
                                              maxval=1.0 / np.sqrt(state_size)))(S)
    h1 = Dense(HIDDEN2_UNITS, activation='relu',
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN1_UNITS),
                                                maxval=1.0 / np.sqrt(HIDDEN1_UNITS)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN1_UNITS),
                                              maxval=1.0 / np.sqrt(HIDDEN1_UNITS)))(h0)
    # Output layers are initialized with small weights so initial actions stay near 0.5
    left_gain_factor = Dense(1, activation='sigmoid',  # sigmoid bounds the output to (0, 1)
                             kernel_initializer=RandomUniform(minval=-0.003, maxval=0.003),
                             bias_initializer=RandomUniform(minval=-0.003, maxval=0.003))(h1)
    right_gain_factor = Dense(1, activation='sigmoid',  # sigmoid bounds the output to (0, 1)
                              kernel_initializer=RandomUniform(minval=-0.003, maxval=0.003),
                              bias_initializer=RandomUniform(minval=-0.003, maxval=0.003))(h1)
    # Keras 2: Concatenate (from keras.layers) replaces the deprecated merge(..., mode='concat')
    V = Concatenate()([left_gain_factor, right_gain_factor])
    model = Model(inputs=S, outputs=V)
    return model, model.trainable_weights, S
def create_critic_network(self, state_size, action_dim):
    log('[DDPG] Building the critic model')
    S = Input(shape=[state_size])
    A = Input(shape=[action_dim], name='action2')
    # Hidden layers use the fan-in uniform initialization from the DDPG paper,
    # with L2 weight decay on the critic as in the original work
    w1 = Dense(HIDDEN1_UNITS, activation='relu',
               kernel_regularizer=regularizers.l2(0.01),
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(state_size),
                                                maxval=1.0 / np.sqrt(state_size)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(state_size),
                                              maxval=1.0 / np.sqrt(state_size)))(S)
    a1 = Dense(HIDDEN2_UNITS, activation='relu',
               kernel_regularizer=regularizers.l2(0.01),
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(action_dim),
                                                maxval=1.0 / np.sqrt(action_dim)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(action_dim),
                                              maxval=1.0 / np.sqrt(action_dim)))(A)
    h1 = Dense(HIDDEN2_UNITS, activation='relu',
               kernel_regularizer=regularizers.l2(0.01),
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN1_UNITS),
                                                maxval=1.0 / np.sqrt(HIDDEN1_UNITS)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN1_UNITS),
                                              maxval=1.0 / np.sqrt(HIDDEN1_UNITS)))(w1)
    # Keras 2: Add (from keras.layers) replaces the deprecated merge(..., mode='sum');
    # the action pathway joins the state pathway at the second hidden layer
    h2 = Add()([h1, a1])
    h3 = Dense(HIDDEN2_UNITS, activation='relu',
               kernel_regularizer=regularizers.l2(0.01),
               kernel_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN2_UNITS),
                                                maxval=1.0 / np.sqrt(HIDDEN2_UNITS)),
               bias_initializer=RandomUniform(minval=-1.0 / np.sqrt(HIDDEN2_UNITS),
                                              maxval=1.0 / np.sqrt(HIDDEN2_UNITS)))(h2)
    # The critic estimates a single scalar Q-value, so the output is one linear unit
    V = Dense(1, activation='linear',
              kernel_initializer=RandomUniform(minval=-0.003, maxval=0.003),
              bias_initializer=RandomUniform(minval=-0.003, maxval=0.003))(h3)
    model = Model(inputs=[S, A], outputs=V)
    adam = Adam(lr=self.LEARNING_RATE)
    model.compile(loss='mse', optimizer=adam)
    return model, A, S
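# The ActorNetwork/CriticNetwork wrappers used later call target_train() and
# critic.gradients(states, actions), which are not shown in this section. A minimal
# sketch of the soft target update they rely on (an assumed helper; the repository's
# own implementation may differ): theta_target <- TAU * theta + (1 - TAU) * theta_target.
def soft_update_target(model, target_model, tau):
    # Blend the online weights into the target weights, layer by layer
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    for k in range(len(weights)):
        target_weights[k] = tau * weights[k] + (1.0 - tau) * target_weights[k]
    target_model.set_weights(target_weights)

# critic.gradients() is typically a TF1-style dQ/da, e.g. tf.gradients(critic.model.output, A),
# evaluated in the session and fed to the actor's policy-gradient update.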
def gait_eval(position_vector, description, serial, oscillator_nw,
              max_evals=max_evals, max_duration=max_duration):
    # Run the oscillator network max_evals times and log the metrics of each run
    for i in range(max_evals):
        result = oscillator_nw(position_vector, max_time=max_duration)
        log('[EVAL] Description: {0}, Serial#: {1}, Run#: {2}, '
            'Result: << {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11} >>'
            .format(description, serial, i + 1,
                    result['fitness'], result['fallen'], result['up'],
                    result['x_distance'], result['abs_y_deviation'],
                    result['avg_footstep_x'], result['var_torso_alpha'],
                    result['var_torso_beta'], result['var_torso_gamma']))
    log('#################################################')
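# Hypothetical usage sketch: gait_eval expects an oscillator function that takes the
# whole solution vector as its first argument. The candidate vector below is
# illustrative only, not a tuned solution.
candidate = [0.3, 0.4, 0.1, 0.2, 0.5, 0.3, 0.4, -0.1, 0.1, -0.1, 0.5]
gait_eval(candidate, description='illustrative_candidate', serial=1, oscillator_nw=oscillator_nw)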
def oscillator_nw(kf, GAIN1, GAIN2, GAIN3, GAIN4, GAIN5, GAIN6,
                  BIAS1, BIAS2, BIAS3, BIAS4,
                  max_time=15.0, fitness_option=1):

    # Try to connect to VREP, retrying up to try_max times
    vrep = None
    try_counter = 0
    try_max = 5
    while vrep is None:
        try:
            log('Trying to create robot handle (attempt: {0} of {1})'.format(try_counter + 1, try_max))
            try_counter += 1
            vrep = VrepIO(vrep_host='127.0.0.1', vrep_port=19997, scene=None, start=False)
        except Exception as e:
            log('Could not connect to VREP')
            log('Error: {0}'.format(e))
            time.sleep(1.0)
        if try_counter > try_max:
            log('Unable to create robot handle after {0} tries'.format(try_max))
            exit(1)
    if vrep is not None:
        log('Successfully connected to VREP')

    # Start the simulation
    vrep.start_simulation()

    # Start the monitoring thread
    monitor_thread = RobotMonitorThread(portnum=19998, objname='torso_11_respondable', height_threshold=0.3)
    monitor_thread.start()
    log('Started monitoring thread')

    # Note the current (starting) position
    start_pos_x = monitor_thread.x
    start_pos_y = monitor_thread.y
    start_pos_z = monitor_thread.z
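# RobotMonitorThread is not defined in this section. A minimal sketch of the interface
# the code above relies on (a daemon thread exposing live x/y/z of the tracked object);
# the pypot VrepIO polling shown here is an assumption, not the repository's exact code.
import threading
import time

from pypot.vrep.io import VrepIO


class RobotMonitorThreadSketch(threading.Thread):

    def __init__(self, portnum, objname, height_threshold):
        threading.Thread.__init__(self)
        self.daemon = True
        self.objname = objname
        self.height_threshold = height_threshold
        self.fallen = False
        # Second VREP client connection, on its own port, used only for monitoring
        self.io = VrepIO(vrep_host='127.0.0.1', vrep_port=portnum, scene=None, start=False)
        self.x, self.y, self.z = self.io.get_object_position(objname)

    def run(self):
        while True:
            # Poll the tracked object's position and flag a fall when the torso
            # drops below the height threshold
            self.x, self.y, self.z = self.io.get_object_position(self.objname)
            if self.z < self.height_threshold:
                self.fallen = True
            time.sleep(0.01)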
import os

import numpy as np

from matsuoka_walk import Logger, log
from matsuoka_walk.oscillator_3_test_yaw import oscillator_nw as oscillator_3_test_yaw

# Set the home directory
home_dir = os.path.expanduser('~')

# Set the logging variables
# This also creates a new log file
Logger(log_dir=os.path.join(home_dir, '.bio_walk/logs/'), log_flag=True)

LOWEST_POSSIBLE_GAIN = 0.4
log('[STATIC TEST] LOWEST_POSSIBLE_GAIN: {}'.format(LOWEST_POSSIBLE_GAIN))

# Solution chromosomes obtained from earlier optimization runs
wtmpc23_run3_best30 = [0.3178385532762875, 0.3777451259604342, 0.023411599863716586,
                       0.013217696615302215, 0.4566963469455763, 0.20194162123716233,
                       0.3309010463046798, -0.05187677829896087, 0.09633745660574622,
                       -0.11559976203529859, 0.4814311312157089, 1.5364038978521224]

asus_run1_bestall = [0.7461913734531209, 0.8422944031253159, 0.07043758116681641,
                     0.14236621222553963, 0.48893497409925746, 0.5980055418720059,
                     0.740811806645801, -0.11618361090424223, 0.492832184960149,
                     -0.2949145038394889, 0.175450703085948, -0.3419733470484183]

best_chromosome = asus_run1_bestall
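# Hypothetical usage sketch: run the stored best chromosome once and log the outcome.
# This assumes the imported oscillator_nw accepts the chromosome list as its first
# argument and returns the same result dict used by gait_eval; max_time is illustrative.
result = oscillator_3_test_yaw(best_chromosome, max_time=20.0)
log('[STATIC TEST] fitness={0}, fallen={1}'.format(result['fitness'], result['fallen']))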
def deviation_controller(train_indicator=0, identifier=''):  # 1 means Train, 0 means simply Run
    # np.random.seed(1337)

    # train_indicator is switched internally to test the model after every n episodes,
    # so a separate flag indicates whether the entire run is a test run, in which case
    # train_indicator always stays 0
    only_test_run = False
    if train_indicator == 0:
        log('[DDPG TEST] This is a test run')
        only_test_run = True
    else:
        log('[DDPG] This is a training run')

    done = False
    step = 0
    epsilon = 1

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Create the actor and critic models and the replay buffer
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    # Register the matsuoka environment
    ENV_NAME = 'matsuoka_env-v0'
    gym.undo_logger_setup()
    env = gym.make(ENV_NAME)
    env._max_episode_steps = max_steps
    np.random.seed(SEED_FOR_RANDOM)
    env.seed(SEED_FOR_RANDOM)

    # Load existing weights if this is a test run
    if only_test_run:
        log('[DDPG TEST] Loading existing weights')
        try:
            actor.model.load_weights(os.path.join(model_dir, 'actormodel_' + identifier + '.h5'))
            critic.model.load_weights(os.path.join(model_dir, 'criticmodel_' + identifier + '.h5'))
            actor.target_model.load_weights(os.path.join(model_dir, 'actormodel_' + identifier + '.h5'))
            critic.target_model.load_weights(os.path.join(model_dir, 'criticmodel_' + identifier + '.h5'))
            log('[DDPG TEST] Weights loaded successfully')
        except Exception as e:
            log('[DDPG TEST] Cannot find the weights: {}'.format(e))

    # This flag indicates if a test has just been done
    just_tested = False

    # Counter for episodes
    i = 1

    # While the max number of episodes is not over
    episode_count = train_episode_count if train_indicator == 1 else test_episode_count
    log('[DDPG ' + ('' if train_indicator else 'TEST') + '] Number of max episodes: {}'.format(episode_count))
    while i <= episode_count:

        # Test the policy after every n episodes
        # So after episode 20 completes, i will be 21 and the if will evaluate to True
        # If train_indicator is initially set to 0, then only the else block executes
        # This switching of train_indicator is only needed during a training run
        if not only_test_run:
            if not just_tested and (i - 1) > 0 and ((i - 1) % TEST_AFTER_N_EPISODES == 0):
                train_indicator = 0
                # We are testing for the last episode
                i -= 1
                just_tested = True
                log('[DDPG TEST] Testing network after episode {}'.format(i))
            else:
                train_indicator = 1
                just_tested = False

        log('[DDPG ' + ('' if train_indicator else 'TEST') + '] Episode : ' + str(i) +
            ' Replay Buffer ' + str(buff.count()))

        ob = env.reset()
        s_t = ob

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            # Include Ornstein-Uhlenbeck exploration noise only during training
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0], 0.0, 0.15, 0.2)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1], 0.0, 0.15, 0.2)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            # Step the environment and fetch the observation, reward and terminal flag
            ob, r_t, done, info = env.step(a_t[0])

            # Set the new state
            s_t1 = ob

            # Add to replay buffer
            buff.add(s_t, a_t[0], r_t, s_t1, done)

            # Sample a batch and compute the TD targets
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # One scalar Q-target per transition (the critic outputs a single Q-value)
            y_t = np.zeros((len(batch), 1))

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                log('[DDPG] Updating the models')
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            log('[DDPG ' + ('' if train_indicator else 'TEST') +
                '] Episode: {0} Step: {1} Action: {2} Reward: {3} Loss: {4}'.format(i, step, a_t, r_t, loss))

            step += 1
            if done:
                break

        # Save the model after every n episodes
        if i > 0 and np.mod(i, TEST_AFTER_N_EPISODES) == 0:
            if train_indicator:
                log('[DDPG] Saving the model')
                actor.model.save_weights(
                    os.path.join(model_dir, 'actormodel_' + identifier + '_{}'.format(i) + '.h5'),
                    overwrite=True)
                with open(os.path.join(model_dir, 'actormodel_' + identifier + '_{}'.format(i) + '.json'),
                          'w') as outfile:
                    json.dump(actor.model.to_json(), outfile)
                critic.model.save_weights(
                    os.path.join(model_dir, 'criticmodel_' + identifier + '_{}'.format(i) + '.h5'),
                    overwrite=True)
                with open(os.path.join(model_dir, 'criticmodel_' + identifier + '_{}'.format(i) + '.json'),
                          'w') as outfile:
                    json.dump(critic.model.to_json(), outfile)

        # Reinitialize step count after an episode is done
        step = 0

        log('[DDPG ' + ('' if train_indicator else 'TEST') + '] TOTAL REWARD @ ' + str(i) +
            '-th Episode : Reward ' + str(total_reward))
        log('')

        # Increment the episode count
        i += 1

    env.close()
    log('[DDPG] Finish')
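# OU.function(x, mu, theta, sigma) is used above but not defined in this section.
# A plausible minimal implementation of one step of Ornstein-Uhlenbeck drift noise
# with that signature (a sketch; the repository's own OU helper may differ):
import numpy as np


class OU(object):

    @staticmethod
    def function(x, mu, theta, sigma):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting exploration noise
        return theta * (mu - x) + sigma * np.random.randn()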
if __name__ == "__main__":

    from matsuoka_walk.matsuoka_env import MatsuokaEnv
    from gym.envs.registration import register

    register(
        id='matsuoka_env-v0',
        entry_point='matsuoka_walk.matsuoka_env:MatsuokaEnv',  # gym entry points use 'module.path:ClassName'
        max_episode_steps=40,
    )

    # Set the logging variables
    # This also creates a new log file
    Logger(log_dir=os.path.join(home_dir, '.bio_walk/logs/'), log_flag=True)

    # Identifier used for saving model weights
    identifier = Logger.datetime_str
    log('[DDPG MAIN] Model weight identifier is {}'.format(identifier))

    # Start the DDPG algorithm
    # Set train_indicator=1 for training and train_indicator=0 for testing
    # For testing, set identifier to that of the desired weights to be loaded
    # identifier = '20171027_144930'
    deviation_controller(train_indicator=1, identifier=identifier)
def main():
    # random.seed(64)

    # Create an initial population of POP_SIZE individuals
    # (each individual is a list of floats)
    pop = toolbox.population(n=POP_SIZE)

    # CXPB is the probability with which two individuals are crossed
    # MUTPB is the probability for mutating an individual
    CXPB, MUTPB = 0.8, 0.1

    log('[GA] Starting genetic algorithm')

    # Evaluate the entire population and store the fitness of each individual
    log('[GA] Finding the fitness of individuals in the initial generation')
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        print(ind, fit)
        ind.fitness.values = (fit,)

    # Extract all the fitnesses
    fits = [ind.fitness.values[0] for ind in pop]

    # Variable keeping track of the number of generations
    g = 0

    best_ind_ever = None
    best_fitness_ever = 0.0

    # Begin the evolution
    while max(fits) < 100 and g < MAX_GEN:
        # A new generation
        g = g + 1
        log('[GA] Running generation {0}'.format(g))

        # Select the next generation individuals
        log('[GA] Selecting the next generation')
        offspring = toolbox.select(pop, len(pop))
        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))

        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            # Cross two individuals with probability CXPB
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                # The fitness values of the children must be recalculated later
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            # Mutate an individual with probability MUTPB
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Since the content of some offspring changed during the last step, their
        # fitnesses must be re-evaluated. To save time and resources, only the
        # offspring whose fitnesses were marked invalid are evaluated.
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = list(map(toolbox.evaluate, invalid_ind))
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = (fit,)

        log('[GA] Evaluated {0} individuals (invalid fitness)'.format(len(invalid_ind)))

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]

        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - mean ** 2) ** 0.5

        log('[GA] Results for generation {0}'.format(g))
        log('[GA] Min %s' % min(fits))
        log('[GA] Max %s' % max(fits))
        log('[GA] Avg %s' % mean)
        log('[GA] Std %s' % std)

        best_ind_g = tools.selBest(pop, 1)[0]

        # Store the best individual over all generations
        if best_ind_g.fitness.values[0] > best_fitness_ever:
            best_fitness_ever = best_ind_g.fitness.values[0]
            best_ind_ever = best_ind_g

        log('[GA] Best individual for generation {0}: {1}, {2}'.format(g, best_ind_g, best_ind_g.fitness.values[0]))
        log('[GA] ############################# End of generation {0} #############################'.format(g))

    log('[GA] ===================== End of evolution =====================')

    best_ind = tools.selBest(pop, 1)[0]
    log('[GA] Best individual in the population: %s, %s' % (best_ind, best_ind.fitness.values[0]))
    log('[GA] Best individual ever: %s, %s' % (best_ind_ever, best_fitness_ever))
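# main() relies on a DEAP toolbox configured elsewhere in the file. A minimal sketch
# of the registrations it assumes; the operator choices here (cxTwoPoint, mutGaussian,
# selTournament) and the placeholder evaluate are illustrative, not necessarily the
# ones used in this repository.
import random

from deap import base, creator, tools

creator.create('FitnessMax', base.Fitness, weights=(1.0,))  # single maximized objective
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register('attr_float', random.uniform, 0.01, 1.0)  # illustrative gene bounds
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_float, n=11)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('evaluate', sum)  # placeholder scalar fitness; the real one runs oscillator_nw in VREP
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutGaussian, mu=0.0, sigma=0.1, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)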
import os

from deap import base
from deap import creator
from deap import tools

from matsuoka_walk import oscillator_nw, Logger, log

# Set the home directory
home_dir = os.path.expanduser('~')

# Set the logging variables
# This also creates a new log file
Logger(log_dir=os.path.join(home_dir, '.bio_walk/logs/'), log_flag=True)

# Create the position bounds of the individual
log('[GA] Creating position bounds')
FLT_MIN_KF, FLT_MAX_KF = 0.2, 0.5
FLT_MIN_GAIN1, FLT_MAX_GAIN1 = 0.01, 1.0
FLT_MIN_GAIN2, FLT_MAX_GAIN2 = 0.01, 1.0
FLT_MIN_GAIN3, FLT_MAX_GAIN3 = 0.01, 1.0
FLT_MIN_GAIN4, FLT_MAX_GAIN4 = 0.01, 1.0
FLT_MIN_GAIN5, FLT_MAX_GAIN5 = 0.01, 1.0
FLT_MIN_GAIN6, FLT_MAX_GAIN6 = 0.01, 1.0
FLT_MIN_BIAS1, FLT_MAX_BIAS1 = -0.6, 0.0
FLT_MIN_BIAS2, FLT_MAX_BIAS2 = 0.0, 0.5
FLT_MIN_BIAS3, FLT_MAX_BIAS3 = -0.5, 0.0
FLT_MIN_BIAS4, FLT_MAX_BIAS4 = 0.0, 1.0

log('[GA] Logging position bounds')
log('[GA] FLT_MIN_KF={0}, FLT_MAX_KF={1}'.format(FLT_MIN_KF, FLT_MAX_KF))
log('[GA] FLT_MIN_GAIN1={0}, FLT_MAX_GAIN1={1}'.format(FLT_MIN_GAIN1, FLT_MAX_GAIN1))
import os
import random

from deap import base
from deap import creator
from deap import tools

from matsuoka_walk import Logger, log
from matsuoka_walk.oscillator_4 import oscillator_nw

# Set the home directory
home_dir = os.path.expanduser('~')

# Set the logging variables
# This also creates a new log file
Logger(log_dir=os.path.join(home_dir, '.bio_walk/logs/'), log_flag=True)

log('[GA] Running ga_4')

# Create the position bounds of the individual
log('[GA] Creating position bounds')
FLT_MIN_KF, FLT_MAX_KF = 0.2, 1.0
FLT_MIN_GAIN1, FLT_MAX_GAIN1 = 0.01, 1.0
FLT_MIN_GAIN2, FLT_MAX_GAIN2 = 0.01, 1.0
FLT_MIN_GAIN3, FLT_MAX_GAIN3 = 0.01, 1.0
FLT_MIN_GAIN4, FLT_MAX_GAIN4 = 0.01, 1.0
FLT_MIN_GAIN5, FLT_MAX_GAIN5 = 0.01, 1.0
FLT_MIN_GAIN6, FLT_MAX_GAIN6 = 0.01, 1.0
FLT_MIN_BIAS1, FLT_MAX_BIAS1 = -0.6, 0.0
FLT_MIN_BIAS2, FLT_MAX_BIAS2 = 0.0, 0.5
FLT_MIN_BIAS3, FLT_MAX_BIAS3 = -0.5, 0.0
FLT_MIN_BIAS4, FLT_MAX_BIAS4 = 0.0, 1.0
FLT_MIN_K_HIP_Y, FLT_MAX_K_HIP_Y = -2.5, 2.5
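# With per-gene bounds like these, each gene gets its own uniform initializer.
# A sketch of how the bounds could feed into DEAP's initCycle (assumed wiring;
# only three of the genes are spelled out here):
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register('attr_kf', random.uniform, FLT_MIN_KF, FLT_MAX_KF)
toolbox.register('attr_gain1', random.uniform, FLT_MIN_GAIN1, FLT_MAX_GAIN1)
toolbox.register('attr_k_hip_y', random.uniform, FLT_MIN_K_HIP_Y, FLT_MAX_K_HIP_Y)
# ... one attr_* registration per remaining gene ...
toolbox.register('individual', tools.initCycle, creator.Individual,
                 (toolbox.attr_kf, toolbox.attr_gain1, toolbox.attr_k_hip_y), n=1)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)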