Code Example #1
def modelTest(test=False, chkpt=None, device='cuda'):
    """
    modelTest loads a saved model checkpoint and runs it to inspect the results.
    :param test: bool, set to True to run in evaluation/render mode
    :param chkpt: string path to the saved model checkpoint
    :param device: 'cuda' or 'cpu'
    :return: None
    """
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if not test:
        wandb.init(project="MultiSection Continum",
                   name="Reaching Task 32 Per Layer")

    robot = Robot()
    robot.newSection()
    robot.newSection()

    env = Environment(robot)
    if test:
        env.staticPoint([-75, 150])
        env.render()
    else:
        env.staticPoint([-75, 150])

    lastObs = env.getObservation()

    rb = ReplayBuffer()

    minRBSize = 10000          # minimum replay-buffer size before training starts
    sampleSize = 2500          # batch size sampled from the replay buffer
    envStepsBeforeTrain = 100  # environment steps between training steps
    targetModelUpdate = 150    # training steps between target-network updates

    epsMin = 0.01       # exploration-rate floor
    epsDecay = 0.99998  # per-step exploration decay base

    model = Model(len(lastObs.state), len(env.robot.actions))
    if chkpt is not None:
        model.load_state_dict(
            torch.load(chkpt, map_location=torch.device('cpu')))

    targetModel = Model(len(lastObs.state), len(env.robot.actions))
    updateTGTModel(model, targetModel)

    stepSinceTrain = 0
    stepSinceTGTUpdate = 0
    stepNum = -1 * minRBSize

    episodeRewards = []
    rollingReward = 0

    # Copying over the weights
    tq = tqdm()
    # Work in progress
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)
        eps = max(epsMin, epsDecay**(stepNum / 100))  # decayed exploration rate, floored at epsMin
        if test:
            eps = 0

        if random() < eps:
            action = env.robot.randomAction()
        else:
            actNum = model(torch.tensor(lastObs.state)).max(-1)[-1].item()
            action = env.robot.actions[actNum]

        obs = env.robotStep(action[0], action[1])
        lastObs = env.getObservation()  # refresh the observation used to pick the next action
        rollingReward = obs.reward

        episodeRewards.append(rollingReward)

        obs.reward = obs.reward / 100  # scale the reward before storing the transition

        stepSinceTrain += 1
        stepNum += 1
        rb.insert(obs)
        if (not test and len(rb.buffer) >= minRBSize
                and stepSinceTrain > envStepsBeforeTrain):
            stepSinceTGTUpdate += 1
            loss = trainStep(rb.sample(sampleSize), model, targetModel,
                             len(env.robot.actions), device)
            wandb.log(
                {
                    "Loss": loss.detach().item(),
                    "eps": eps,
                    "Step Rewards:": np.mean(episodeRewards)
                },
                step=stepNum)
            stepSinceTrain = 0

            if stepSinceTGTUpdate > targetModelUpdate:
                print("Updating Target Model")
                updateTGTModel(model, targetModel)
                stepSinceTGTUpdate = 0
                torch.save(targetModel.state_dict(), f"Models/{stepNum}.pth")
                episodeRewards = []
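
As a minimal usage sketch: once modelTest and its project dependencies (Robot, Environment, ReplayBuffer, Model, updateTGTModel, trainStep) are importable, together with torch, numpy, wandb, tqdm, time, and random, the function can be driven as below. The checkpoint filename is a hypothetical placeholder, not something taken from the original code.

if __name__ == "__main__":
    # Evaluate a previously saved checkpoint on the CPU with rendering enabled.
    # "Models/150000.pth" is a placeholder path -- substitute whichever
    # checkpoint the training run actually wrote under Models/.
    modelTest(test=True, chkpt="Models/150000.pth", device="cpu")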
Code Example #2
def main(test=False, chkpt=None, device='cuda'):
    """
    main starts and performs the training in non-render mode.
    :param test: optional bool, set to True to run in evaluation/render mode
    :param chkpt: optional string path to a checkpoint to resume training from
    :param device: string ('cuda' or 'cpu')
    :return: None
    """
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if not test:
        wandb.init(project="MultiSection Continum",
                   name="Reaching Task 32 Per Layer")

    robot = Robot()
    robot.newSection()
    robot.newSection()

    env = Environment(robot)
    if test:
        # env.staticPoint([-9.966711079379195, 99.3346653975306])
        env.render()
    # else:
    #     env.staticPoint([-9.966711079379195, 99.3346653975306])

    lastObs = env.getObservation()

    rb = ReplayBuffer()

    memorySize = 500000        # intended replay-buffer capacity (not passed to ReplayBuffer here)
    minRBSize = 20000          # minimum replay-buffer size before training starts
    sampleSize = 750           # batch size sampled from the replay buffer
    envStepsBeforeTrain = 250  # environment steps between training steps
    targetModelUpdate = 500    # training steps between target-network updates

    epsMin = 0.01       # exploration-rate floor
    epsDecay = 0.99999  # per-step exploration decay base

    model = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    if chkpt is not None:
        model.load_state_dict(torch.load(chkpt))

    targetModel = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    updateTGTModel(model, targetModel)

    stepSinceTrain = 0
    # stepSinceTrain counts the environment steps since the last main-network
    # training step; the main network is trained once every envStepsBeforeTrain steps.

    stepSinceTGTUpdate = 0
    # stepSinceTGTUpdate counts the training steps since the last target-network
    # update (i.e. copying over the main network's weights); the target network
    # is updated once every targetModelUpdate training steps.

    stepNum = -1 * minRBSize

    episodeRewards = []
    rollingReward = 0

    # Copying over the weights
    tq = tqdm()
    # Work in progress
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)
        eps = max(epsMin, epsDecay**(stepNum / 10))  # decayed exploration rate, floored at epsMin
        if test:
            eps = 0

        if random() < eps:
            # print("Taking random action")
            action = env.robot.randomAction()
        else:
            actNum = model(torch.tensor(
                lastObs.state).to(device)).max(-1)[-1].item()
            action = env.robot.actions[actNum]

        obs = env.robotStep(action[0], action[1])
        lastObs = env.getObservation()  # refresh the observation used to pick the next action

        rollingReward = obs.reward

        episodeRewards.append(rollingReward)

        if env.done():
            env.reset()
            lastObs = env.getObservation()  # start the new episode from a fresh observation
            # env.staticPoint([-9.966711079379195, 99.3346653975306])

        # obs.reward = obs.reward / 100

        stepSinceTrain += 1
        stepNum += 1
        rb.insert(obs)
        if (not test and rb.index >= minRBSize
                and stepSinceTrain > envStepsBeforeTrain):
            stepSinceTGTUpdate += 1
            loss = trainStep(rb.sample(sampleSize), model, targetModel,
                             len(env.robot.actions), device)
            wandb.log(
                {
                    "Loss": loss.detach().cpu().item(),
                    "eps": eps,
                    "Step Rewards:": np.mean(episodeRewards)
                },
                step=stepNum)
            stepSinceTrain = 0

            if stepSinceTGTUpdate > targetModelUpdate:
                print("Updating Target Model")
                updateTGTModel(model, targetModel)
                stepSinceTGTUpdate = 0
                torch.save(
                    targetModel.state_dict(),
                    f"/u/meharabd/research/CRLMachineLearningProject/Models/{stepNum}.pth"
                )
                episodeRewards = []
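
Neither listing shows the Model class, updateTGTModel, or trainStep. The sketch below is one plausible way to fill them in, assuming a standard DQN setup in which each sampled transition exposes state, action, reward, nextState, and done attributes; those attribute names, the 32-unit layer width, the learning rate, and the discount factor are assumptions rather than details taken from the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    """Small fully connected Q-network: observation vector -> one Q-value per action."""

    def __init__(self, obsSize, numActions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obsSize, 32),
            nn.ReLU(),
            nn.Linear(32, numActions),
        )
        # Keeping the optimizer on the model lets trainStep reuse it across calls.
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-4)

    def forward(self, x):
        return self.net(x)


def updateTGTModel(model, targetModel):
    """Copy the main network's weights into the target network."""
    targetModel.load_state_dict(model.state_dict())


def trainStep(transitions, model, targetModel, numActions, device, gamma=0.99):
    """One DQN gradient step on a batch of sampled transitions."""
    states = torch.stack(
        [torch.tensor(t.state, dtype=torch.float32) for t in transitions]).to(device)
    actions = torch.tensor(
        [t.action for t in transitions], dtype=torch.int64).to(device)
    rewards = torch.tensor(
        [t.reward for t in transitions], dtype=torch.float32).to(device)
    nextStates = torch.stack(
        [torch.tensor(t.nextState, dtype=torch.float32) for t in transitions]).to(device)
    notDone = torch.tensor(
        [0.0 if t.done else 1.0 for t in transitions], dtype=torch.float32).to(device)

    # Bootstrapped targets come from the frozen target network.
    with torch.no_grad():
        nextQ = targetModel(nextStates).max(-1)[0]
    targets = rewards + gamma * notDone * nextQ

    # Q-values of the actions that were actually taken in each transition.
    qVals = model(states)
    chosenQ = torch.sum(qVals * F.one_hot(actions, numActions).float(), dim=-1)

    loss = F.smooth_l1_loss(chosenQ, targets)
    model.opt.zero_grad()
    loss.backward()
    model.opt.step()
    return loss

Keeping the optimizer on the model means trainStep only needs the networks, the sampled batch, the action count, and the device, which matches the call signature used by the training loops above.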