Example #1
import numpy as np
import tensorflow as tf
# `tc` (training-config constants) and `Robot` are project-specific modules,
# assumed to be imported elsewhere in the original script.

UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE  # Number of points to be fed in stochastic gradient
NH1 = tc.NH1
NH2 = tc.NH2  # Hidden layer size
range_esp = tc.range_esp
time_step = tc.time_step

reward_weights = [1., 0.0, 0.00]

sim_number = 2
RANDSET = 0
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=sim_number)  #for rendering

env.RANDSET = 0

step_expl = 0.
epi_expl = 0.

NX = 3  # State dimension (sin q, cos q, qdot); training also converges with (q, qdot) but needs about 2x more neurons.
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    return min(x % (2 * np.pi), abs(x % (2 * np.pi) - 2 * np.pi))
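
# Quick illustration (added here, not in the original): angle_normalize(x) returns
# the absolute angular distance from x to the nearest multiple of 2*pi, so it is 0
# at the target and at most pi:
#   angle_normalize(0.1)              -> 0.1
#   angle_normalize(2 * np.pi - 0.1)  -> 0.1
#   angle_normalize(np.pi)            -> pi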


class QValueNetwork:
    def __init__(self):
        nvars = len(tf.compat.v1.trainable_variables())
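        # `nvars` records how many trainable variables exist before this network
        # builds its own layers; DDPG implementations commonly use it afterwards to
        # slice out just this network's parameters, e.g.
        # tf.compat.v1.trainable_variables()[nvars:]. The rest of the class is
        # truncated in this excerpt.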
Example #2
# The tip of the Acrobot is taken to be at the center of gravity of link 2
# -> goal position

length = .035 + .01 + 0.1088
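# length = 0.035 + 0.01 + 0.1088 = 0.1538 m -- assumed to be the base-to-link-2-CoG
# distance with the double pendulum fully extended, so the goal is that point
# directly above the base.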
goal = np.array([0, 0, length])

env = Robot("double_pendulum.urdf")
env_rend = Robot("double_pendulum.urdf", sim_number=SIM_NUMBER)  # for rendering


env.GUI_ENABLED = 0
env.RANDSET = 0
env.SINCOS = 1
env.time_step = time_step
env.actuated_index = [2]
env.max_torque = 5.0
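# Notes on the flags above (inferred from their use elsewhere in this project, so
# treat them as assumptions): SINCOS = 1 makes observations encode joint angles as
# sin/cos pairs, actuated_index = [2] applies torque only to the second joint
# (Acrobot-style underactuation), and RANDSET = 0 disables randomized initial states.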

env_rend.SINCOS = 1
env_rend.GUI_ENABLED = 1
env_rend.actuated_index = [2]

step_expl = 0.
Example #3
from stable_baselines import DDPG  # assumed import; a stable-baselines DDPG model is loaded below
# `tc` (training-config constants) and `Robot` are project-specific modules assumed to be imported elsewhere.

POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE  # Discount factor
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE  # Number of points to be fed in stochastic gradient
NH1 = NH2 = tc.NH1  # Hidden layer sizes (both set to tc.NH1 in this script)
range_esp = tc.range_esp
time_step = tc.time_step

SIM_NUMBER = 1.2

model = DDPG.load("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))

robot = Robot("single_pendulum.urdf")
robot.sim_number = 1
robot.RANDSET = 0
robot.GUI_ENABLED = 1
robot.SINCOS = 1
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

# Evaluate the policy (see the rollout sketch below)
# env.robot.stopSim()
# env = PendulumPyB()

# Checks: convergence, comparison between runs, training time, average reward
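
# --- Rollout sketch (added; not part of the original script) ---------------------
# Shows one way the loaded stable-baselines DDPG policy could be evaluated on the
# simulated pendulum. model.predict() is standard stable-baselines API; the
# observation layout [sin(q), cos(q), qdot] mirrors the training scripts above;
# robot.simulateDyn(action) is a HYPOTHETICAL stepping call standing in for the
# project's actual simulation-step method.
import numpy as np

robot.resetRobot()
for t in range(500):
    obs = np.array([robot.states_sincos[1][0],
                    robot.states_sincos[1][1],
                    robot.states_dot[1][3]])
    action, _states = model.predict(obs, deterministic=True)
    robot.simulateDyn(action)  # hypothetical: advance the simulation one time step
print("Rollout finished after", t + 1, "steps")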
Example #4
            rsum += reward
            # print(rsum)
        print(x)
        if verbose: print('Lasted', i, 'timesteps -- total reward:', rsum)

    signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())  # Roll out a trial when CTRL-Z (SIGTSTP) is pressed
 

    # Logging buffers filled during the training loop (judging by the names:
    # rewards, Q-values, and step counts)
    h_rwd = []
    h_qva = []
    h_ste = []
 
 
    ##  Environment Configuration ##
    env.GUI_ENABLED = 0
    env.RANDSET = RANDSET
    env.SINCOS = 1
    env.time_step = time_step
    env.setupSim()

    ##  Training Loop ##
    start_time = time.time()
    for episode in range(1, NEPISODES):
        env.resetRobot()
        x = np.array([[env.states_sincos[1][0], env.states_sincos[1][1],
                       env.states_dot[1][3]]])  # remove .T
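        # Observation assumed (given SINCOS = 1 above) to be [sin(q), cos(q), qdot]
        # for the pendulum joint, shaped (1, 3) as the network input.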
        
        rsum = 0.0