# Build the actor (policy) and critic (Q-value) networks together with their target copies
policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)
qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)

# Restore the trained DDPG agent from its checkpoint
SIM_NUMBER = 1.5
model_save = "/home/pasquale/Desktop/thesis/thesis-code/2D_Acrobot/ddpg/trined_agents/DDPG_saved_" \
             + str(SIM_NUMBER) + ".chkpt"
sess = tf.compat.v1.InteractiveSession()
tf.compat.v1.global_variables_initializer().run()
tf.compat.v1.train.Saver().restore(sess, model_save)

# Set up the PyBullet simulation of the single pendulum
robot = Robot("single_pendulum.urdf")
robot.sim_number = SIM_NUMBER
robot.RANDSET = 0        # disable randomized initial states
robot.GUI_ENABLED = 0    # run headless (no GUI)
robot.SINCOS = 1         # encode the joint angle as (sin, cos)
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/continuous/"
robot.time_step = time_step
robot.setupSim()

# Evaluate policy (a rollout sketch follows below)
# env.robot.stopSim()
# env = PendulumPyB()

# Check convergence
c = 100000000
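# A minimal sketch of the evaluation rollout the comment above refers to. It assumes the
# PolicyNetwork exposes an input placeholder `x` and an output tensor `policy`, and that
# Robot provides observe()/step() methods; these names are illustrative assumptions, not
# necessarily the interface of the thesis code.
x = robot.observe()                                            # hypothetical: read the current state
for _ in range(500):                                           # arbitrary rollout length
    u = sess.run(policy.policy, feed_dict={policy.x: [x]})[0]  # greedy action from the restored actor
    x = robot.step(u)                                          # hypothetical: apply the torque for one step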
# Hyperparameters read from the training configuration module tc
QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE                      # Discount factor
UPDATE_RATE = tc.UPDATE_RATE                    # Homotopy rate used to update the target networks
REPLAY_SIZE = tc.REPLAY_SIZE                    # Size of the replay buffer
BATCH_SIZE = tc.BATCH_SIZE                      # Number of samples fed to each stochastic gradient step
NH1 = NH2 = tc.NH1                              # Hidden layer sizes
range_esp = tc.range_esp
time_step = tc.time_step                        # Simulation time step

# Load the agent trained with the Stable Baselines DDPG implementation
SIM_NUMBER = 1.2
model = DDPG.load("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))

# Set up the PyBullet simulation of the single pendulum
robot = Robot("single_pendulum.urdf")
robot.sim_number = 1
robot.RANDSET = 0        # disable randomized initial states
robot.GUI_ENABLED = 1    # show the PyBullet GUI
robot.SINCOS = 1         # encode the joint angle as (sin, cos)
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

# Evaluate policy (a rollout sketch follows below)
# env.robot.stopSim()
# env = PendulumPyB()

# Check convergence
# compare convergence and training time
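# A minimal sketch of an evaluation rollout with the loaded Stable Baselines agent, assuming
# PendulumPyB is the gym-style wrapper hinted at by the commented line above and follows the
# standard gym reset()/step() API; the 500-step rollout length is arbitrary.
env = PendulumPyB()
obs = env.reset()
for _ in range(500):
    action, _ = model.predict(obs, deterministic=True)  # greedy action from the trained actor
    obs, reward, done, info = env.step(action)          # advance the simulation by one time step
    if done:
        break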