def train(args):
    total_training_episodes = 0
    total_reward_list = []

    if not os.path.exists(args.path_to_model + args.model_name +
                          args.model_date):
        os.makedirs(args.path_to_model + args.model_name + args.model_date)

    env = Husky()

    if args.model_name == 'TD3':
        model = TD3(a_dim=env.action_dim, s_dim=env.state_dim)
    elif args.model_name == 'DDPG':
        model = DDPG(a_dim=env.action_dim, s_dim=env.state_dim)
    else:
        raise ValueError('unknown model_name: ' + str(args.model_name))

    #load pre_trained model
    if args.pre_train:
        model.load_model(args.path_to_model + args.model_name,
                         args.model_date_ + '/')
        #load memory data
        memory = np.loadtxt(args.path_to_model + args.model_name +
                            args.model_date_ + '/memory.txt')
        model.memory = memory
        counter = np.loadtxt(args.path_to_model + args.model_name +
                             args.model_date_ + '/memory_counter.txt')
        model.memory_counter = int(counter)

        training_episodes_temp_2 = np.loadtxt(args.path_to_model +
                                              args.model_name +
                                              args.model_date_ +
                                              '/total_training_episodes.txt')
        total_training_episodes = int(training_episodes_temp_2)
        # total_reward_list_temp = np.loadtxt(args.path_to_model+args.model_name+args.model_date_+'/total_reward_list.txt')
        # total_reward_list = total_reward_list_temp.tolist()
        print('load model successfully')
    # Reference distribution for KL filtering: mean and covariance of the
    # first 15 state dimensions across the replay memory.
    a1 = np.array([np.mean(model.memory[:, :15], axis=0)])  # shape (1, 15)
    b1 = np.cov(model.memory[:, :15].T)  # shape (15, 15)
    print("memory shape is " + str(model.memory.shape))
    # np.linalg.pinv(b1)
    # print("Okay")

    model.mode(mode='train')

    print_once = True
    print_memory_counter = True
    print_return_learning = False

    start_time = time.time()
    for epoch in range(args.train_epoch):
        state = env.reset()
        total_reward = 0
        episode_step = 0
        online_experience = []
        while True:

            #print('model.memory_counter: ' + str(model.memory_counter))
            # if(model.memory_counter < 1000):
            #     action = env.get_joy_action()
            # else:
            #     checkJoyAction, receivedJoyAction = env.check_joy_action()
            #     #print('checkJoyAction : ' + str(checkJoyAction))
            #     if(checkJoyAction):
            #         action = receivedJoyAction
            #         print('user intervention')
            #         print_return_learning = True
            #     else:
            #         if(print_return_learning):
            #             print('return to learning')
            #         print_return_learning = False
            #         action = model.choose_action(state)

            #action = env.get_joy_action()

            action = model.choose_action(state)

            action_bounded = np.multiply(action, np.array(args.action_bound))
            #print('action[0]: ' + str(action[0]) + '   action[1]: ' + str(action[1]))
            #state_, reward, terminal = env.step(np.multiply(action,np.array(args.action_bound)))
            state_, reward, terminal = env.step(action_bounded)
            #print('reward: ' + str(reward))
            # model.store_transition(state,action,reward,state_,terminal)
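            # A transition row is [state | action | reward | next_state | terminal];
            # the episode's KL value is appended as one extra column before the
            # episode is merged into the replay memory below.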
            online_experience.append(
                np.hstack((state, action, [reward], state_, [terminal])))
            state = state_
            total_reward += reward
            episode_step += 1
            #start optimization once the replay memory holds more than 1000 transitions
            if (print_memory_counter):
                print('memory_counter: ' + str(model.memory_counter))
            if model.memory_counter > 1000:
                model.Learn()
                # if(print_once):
                #     print('memory_counter reached 1000. learning...')
                #     print_once = False
                #     print_memory_counter = False
            if terminal or episode_step > args.train_step:
                break
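
        # Fit a Gaussian to the first 15 state dimensions of this episode's
        # transitions and compare it against the replay-memory distribution;
        # the episode is only appended to memory when the KL divergence is
        # below the threshold of 800.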
        a2 = np.array([np.mean(np.array(online_experience)[:, :15], axis=0)])
        # print(a2)
        b2 = np.cov(np.array(online_experience)[:, :15].T)
        # print(b2)
        kl = KL(a1, b1, a2, b2, 14)
        if kl < 800:
            print("online shape", np.array(online_experience).shape)
            print("append shape",
                  np.array([[float(kl)] * len(online_experience)]).shape)
            online_experience = np.column_stack(
                (np.array(online_experience),
                 np.array([[float(kl)] * len(online_experience)]).T))
            print("online_experience shape is", online_experience.shape)
            model.memory = np.concatenate(
                (model.memory, np.array(online_experience)), axis=0)
            model.memory_counter += len(online_experience)
        total_reward_list.append(total_reward)
        total_training_episodes += 1

        print('epoch: ' + str(epoch) + ' || Reward: ' + str(total_reward) +
              ' || total_training_episodes: ' + str(total_training_episodes) +
              ' || memory_counter: ' + str(model.memory_counter))

        if (epoch + 1) % args.epoch_store == 0:
            model.save_model(args.path_to_model + args.model_name,
                             args.model_date + '/')
            #save memory
            memory = model.memory
            np.savetxt(
                args.path_to_model + args.model_name + args.model_date +
                '/memory.txt', memory)
            model.plot_loss(args.path_to_model + args.model_name,
                            args.model_date + '/')
            counter = [model.memory_counter]
            np.savetxt(
                args.path_to_model + args.model_name + args.model_date +
                '/memory_counter.txt', counter)
            training_episodes_temp = [total_training_episodes]
            np.savetxt(
                args.path_to_model + args.model_name + args.model_date +
                '/total_training_episodes.txt', training_episodes_temp)
            np.savetxt(
                args.path_to_model + args.model_name + args.model_date +
                '/total_reward_list.txt', total_reward_list)

            plt.figure('total_reward_list')
            plt.plot(np.arange(len(total_reward_list)), total_reward_list)
            plt.ylabel('Total_reward')
            plt.xlabel('training epoch')
            plt.savefig(args.path_to_model + args.model_name +
                        args.model_date + '/reward.png')
            get_time(start_time)
            plt.clf()
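

# The KL(a1, b1, a2, b2, dim) call above is assumed to compute the KL
# divergence between two multivariate Gaussians, one fitted to the replay
# memory and one to the current episode. A minimal sketch of such a helper
# (the name kl_mvn_sketch, argument order, and formula are assumptions, not
# the project's actual KL implementation):
def kl_mvn_sketch(mu0, cov0, mu1, cov1, dim):
    """KL( N(mu0, cov0) || N(mu1, cov1) ) between dim-dimensional Gaussians."""
    mu0 = np.asarray(mu0).reshape(-1)
    mu1 = np.asarray(mu1).reshape(-1)
    cov1_inv = np.linalg.pinv(cov1)  # pseudo-inverse guards against a singular covariance
    diff = mu1 - mu0
    trace_term = np.trace(cov1_inv.dot(cov0))
    quad_term = diff.dot(cov1_inv).dot(diff)
    _, logdet0 = np.linalg.slogdet(cov0)  # log|cov|, numerically safer than log(det(...))
    _, logdet1 = np.linalg.slogdet(cov1)
    return 0.5 * (trace_term + quad_term - dim + logdet1 - logdet0)
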
# Earlier variant of the training loop: it stores every transition directly
# via store_transition() and does not apply the KL-based episode filtering
# used in train() above.
def train_without_kl_filter(args):

    if not os.path.exists(args.path_to_model+args.model_name+args.model_date):
        os.makedirs(args.path_to_model+args.model_name+args.model_date)

    env = Husky()

    if args.model_name == 'TD3': model = TD3(a_dim=env.action_dim,s_dim=env.state_dim)
    if args.model_name == 'DDPG': model = DDPG(a_dim=env.action_dim,s_dim=env.state_dim)

    #load pre_trained model
    if args.pre_train:
        model.load_model(args.path_to_model+args.model_name, args.model_date_+'/')
        #load memory data
        memory = np.loadtxt(args.path_to_model+args.model_name+args.model_date_+'/memory.txt')
        model.memory = memory
        counter = np.loadtxt(args.path_to_model+args.model_name+args.model_date_+'/memory_counter.txt')
        model.memory_counter = int(counter)
        print('load model successfully')



    model.mode(mode='train')

    print_once = True
    print_memory_counter = True
    print_return_learning = False

    total_reward_list = []
    start_time = time.time()
    #state = env.reset()
    for epoch in range(args.train_epoch):
        state = env.reset()
        total_reward = 0
        for i in range(args.train_step):

            #print('model.memory_counter: ' + str(model.memory_counter))
            # if(model.memory_counter < 1000):
            #     action = env.get_joy_action()
            # else:
            #     checkJoyAction, receivedJoyAction = env.check_joy_action()
            #     #print('checkJoyAction : ' + str(checkJoyAction))
            #     if(checkJoyAction):
            #         action = receivedJoyAction
            #         print('user intervention')
            #         print_return_learning = True
            #     else:
            #         if(print_return_learning):
            #             print('return to learning')
            #         print_return_learning = False
            #         action = model.choose_action(state)

            #action = env.get_joy_action()

            action = model.choose_action(state)

            action_bounded = np.multiply(action,np.array(args.action_bound))
            #print('action[0]: ' + str(action[0]) + '   action[1]: ' + str(action[1]))
            #state_, reward, terminal = env.step(np.multiply(action,np.array(args.action_bound)))
            state_, reward, terminal = env.step(action_bounded)
            #print('reward: ' + str(reward))
            model.store_transition(state,action,reward,state_,terminal)
            state = state_
            total_reward += reward
            #start optimization once the replay memory holds more than 1000 transitions
            if(print_memory_counter):
                print('memory_counter: ' + str(model.memory_counter))
            if model.memory_counter > 1000:
                model.Learn()
                if(print_once):
                    print('memory_counter reached 1000. learning...')
                    print_once = False
                    print_memory_counter = False
            if terminal:
                state = env.reset()
            '''
            if (i+1) % 200 == 0:
                env.target_generate()
                env.target_vis(env.target_pos)
            '''
        total_reward_list.append(total_reward)
        print('epoch: ' + str(epoch) + ' || Reward: ' + str(total_reward))

        if (epoch+1) % args.epoch_store == 0:
            model.save_model(args.path_to_model+args.model_name, args.model_date+'/')
            #save memory
            memory = model.memory
            np.savetxt(args.path_to_model+args.model_name+args.model_date+'/memory.txt',memory)
            model.plot_loss(args.path_to_model+args.model_name, args.model_date+'/')
            counter = [model.memory_counter]
            np.savetxt(args.path_to_model+args.model_name+args.model_date+'/memory_counter.txt',counter)

            plt.figure()
            plt.plot(np.arange(len(total_reward_list)), total_reward_list)
            plt.ylabel('Total_reward')
            plt.xlabel('training epoch')
            plt.savefig(args.path_to_model+args.model_name+args.model_date+'/reward.png')
            get_time(start_time)
            plt.clf()
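

# Both training functions read a number of fields off `args`. A minimal
# sketch of an argument parser that would supply them, inferred from the
# attribute accesses above; every default value here is an assumption, not a
# value taken from the project:
def build_arg_parser_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path_to_model', type=str, default='./weights/')   # root folder for checkpoints
    parser.add_argument('--model_name', type=str, default='TD3')             # 'TD3' or 'DDPG'
    parser.add_argument('--model_date', type=str, default='/run')            # output sub-folder for this run
    parser.add_argument('--model_date_', type=str, default='/run_prev')      # sub-folder of the pre-trained run
    parser.add_argument('--pre_train', action='store_true')                  # resume from saved model and memory
    parser.add_argument('--train_epoch', type=int, default=1000)             # number of training episodes
    parser.add_argument('--train_step', type=int, default=300)               # max environment steps per episode
    parser.add_argument('--epoch_store', type=int, default=10)               # checkpoint interval in episodes
    parser.add_argument('--action_bound', type=float, nargs='+', default=[1.0, 1.0])  # per-dimension action scaling
    return parser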