Example #1
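# NOTE: the imports below are assumptions added to make the example read as
# self-contained; typing and torch are standard, while every other name used
# here (Game, Mover, Teleporter, ReplayBuffer, CFAgent, CFReplayBuffer,
# Simulator, SimBuffer, Collector, Save, loop, Data, BayesionNN, LayerType,
# environments, states, transform, transformNot, getInterventions,
# getInterventionsmodel, device) comes from the surrounding project and is
# not shown in this example.
from typing import List

from torch import cat, tensor
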
def CFagentv2(defaults):
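    """Train a mover, teleporter, board simulator, and counterfactual (CF)
    agent together on a single Game environment."""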
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    simulator = Simulator(env, **defaults)
    simbuffer = SimBuffer(**defaults)
    collector = Collector(**defaults)

    with Save(env, collector, mover, teleporter, CFagent, simulator, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        CF_dones, cfs = None, None
        for frame in loop(env, collector, save, teleporter):
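            # Update the CF agent's counterfactuals and apply the teleporter's
            # intervention to the board before the mover picks actions.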
            CFagent.counterfact2(env, dones, teleporter, simulator, CF_dones, cfs)
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
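            # Relabel the transition with the teleporter's modifications, store
            # it for replay, and train the mover on the modified signal.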
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
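            # Train the teleporter on a sampled batch of intervention transitions.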
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
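            # Train the board simulator on sampled (board_before, intervention, board_after) data.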
            simbuffer.simulator_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            board_before, board_after, intervention = simbuffer.sample_data()
            lossboard = simulator.learn(board_before, board_after, intervention)
            collector.collect_loss(lossboard)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
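            # Check which counterfactuals are done and train the CF agent from its replay buffer.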
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)

def graphTrain(defaults):
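    """Train a mover whose teleporter interventions are chosen from learned
    state-transition structure (optionally via the BayesionNN model)."""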
    layers: List[LayerType] = environments[defaults['level']][2]
    explorationN = defaults['K1']
    data = Data(layers, defaults['graphMode'])
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    collector = Collector(**defaults)
    model = BayesionNN(layers,
                       depth=defaults['depth'],
                       exploration=defaults['model_explore'],
                       samples=defaults['samples'])
    use_model = defaults['use_model']
    with Save(env, collector, mover, data, **defaults) as save:
        convert = [env.layers.types.index(layer) for layer in layers]
        player = env.layers.types.index(LayerType.Player)
        goal = env.layers.types.index(LayerType.Goal)
        old_states = list(states(env.board, convert, layers))
        dones = tensor([0 for _ in range(env.batch)])
        rewards = tensor([0 for _ in range(env.batch)])
        eatCheese, interventions = ([True] * env.batch, [None] * env.batch)
        for frame in loop(env, collector, save, teleporter=teleporter):
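            # Track the current frame and re-read the discrete layer states.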
            data.t = frame
            new_states = list(states(env.board, convert, layers))
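            # Update the transition data / model from the observed state change;
            # only the loss returned by transformNot is logged below.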
            loss = transform(old_states,
                             new_states,
                             dones,
                             rewards,
                             data,
                             layers,
                             model,
                             use_model=use_model)
            loss = transformNot(env.board,
                                new_states,
                                player,
                                goal,
                                convert,
                                data,
                                layers,
                                model,
                                use_model=use_model)
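            # Re-sample an intervention only where the layer state changed or the
            # previous intervention was reached ("eatCheese").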
            stateChanged = [
                old != new for old, new in zip(old_states, new_states)
            ]
            shouldInterviene = [
                cond1 or cond2 for cond1, cond2 in zip(stateChanged, eatCheese)
            ]
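            # Exploration decays linearly over the first K1 frames, floored at softmax_cap.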
            exploration = max((explorationN - frame) / explorationN,
                              defaults['softmax_cap'])
            if use_model:
                interventions = [
                    (getInterventionsmodel(
                        state, env.layers.types, layers, model, env,
                        data.layers_not_in(state), frame) if should else old)
                    for state, should, old in zip(new_states, shouldInterviene,
                                                  interventions)
                ]
            else:
                interventions = [
                    (getInterventions(env, state, data, exploration)
                     if should else old) for state, should, old in zip(
                         new_states, shouldInterviene, interventions)
                ]
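            # Append the chosen cell as an extra board channel for the mover and
            # hand the teleporter its flat intervention indices.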
            modification = env.board[tensor(interventions)].unsqueeze(1)
            teleporter.interventions = tensor(
                [m.flatten().argmax().item() for m in list(modification)],
                device=device)
            modified_board = cat((env.board, modification), dim=1)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, _, _ = teleporter.modify(
                observations, rewards, dones, info)
            mover.learn(modified_board, actions, modified_rewards,
                        modified_dones)
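            # Flatten each player's (x, y) position into the same flat index
            # space as the interventions.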
            playerPositions = [
                (t := env.layers.dict[LayerType.Player].positions[i][0])[1] *
                env.layers.width + t[0] for i in range(env.batch)
            ]
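            # An intervention counts as eaten when the player now stands on the intervened cell.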
            eatCheese = [
                intervention == player_pos for intervention, player_pos in zip(
                    teleporter.interventions, playerPositions)
            ]
            old_states = new_states
            collector.collect([rewards, modified_rewards],
                              [dones, modified_dones])
            collector.collect_loss(loss)