def CFagentv2(defaults):
    """Run the v2 counterfactual-agent training loop.

    Builds the environment, agents, buffers and a simulator from *defaults*
    (presumably a config dict — every constructor takes ``**defaults``), then
    trains the mover, teleporter, simulator and CF agent jointly inside a
    ``Save`` context so results are persisted on exit.
    """
    # Environment and the agents/buffers that train against it.
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)  # _extra_dim=1: room for the intervention channel appended to the board
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    simulator = Simulator(env, **defaults)
    simbuffer = SimBuffer(**defaults)
    collector = Collector(**defaults)
    with Save(env, collector, mover, teleporter, CFagent, simulator, **defaults) as save:
        # One-time setup before the frame loop.
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        # No counterfactual results exist yet on the first frame.
        CF_dones, cfs = None, None
        for frame in loop(env, collector, save, teleporter):
            # Feed last frame's counterfactual outcome back into the CF agent.
            CFagent.counterfact2(env, dones, teleporter, simulator, CF_dones, cfs)
            # Teleporter applies its intervention to the board the mover sees.
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            # Teleporter reinterprets the transition (rewards/dones) around its intervention.
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            # --- teleporter training: store transition, learn mover, sample + learn teleporter ---
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            # --- simulator training: same transition goes into the sim buffer ---
            simbuffer.simulator_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            board_before, board_after, intervention = simbuffer.sample_data()
            lossboard = simulator.learn(board_before, board_after, intervention)
            collector.collect_loss(lossboard)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
            # --- counterfactual agent: check, store, sample, learn ---
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
def graphTrain(defaults):
    """Train the mover with graph-based (optionally Bayesian-NN-modelled) interventions.

    Each frame, per-batch-element interventions are chosen — either from the
    learned ``BayesionNN`` model (``use_model``) or from the graph ``Data``
    with epsilon-style exploration — appended to the board as an extra
    channel, and used to drive/teach the mover. Runs inside a ``Save``
    context so collected results are persisted on exit.
    """
    # Layer types relevant for this level; environments[...] layout is project-defined.
    layers: List[LayerType] = environments[defaults['level']][2]
    explorationN = defaults['K1']  # frames over which exploration decays linearly
    data = Data(layers, defaults['graphMode'])
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)  # _extra_dim=1: the appended intervention channel
    teleporter = Teleporter(env, **defaults)
    collector = Collector(**defaults)
    model = BayesionNN(layers, depth=defaults['depth'], exploration=defaults['model_explore'], samples=defaults['samples'])
    use_model = defaults['use_model']
    with Save(env, collector, mover, data, **defaults) as save:
        # Map the tracked LayerTypes to their channel indices in env.board.
        convert = [env.layers.types.index(layer) for layer in layers]
        player = env.layers.types.index(LayerType.Player)
        goal = env.layers.types.index(LayerType.Goal)
        old_states = [state for state in states(env.board, convert, layers)]
        # Zero-initialised per-batch done/reward tensors for the first transform() call.
        dones = tensor([0 for _ in range(env.batch)])
        rewards = tensor([0 for _ in range(env.batch)])
        # Start by intervening everywhere (eatCheese all True); no interventions chosen yet.
        eatCheese, interventions = ([True] * env.batch, [None] * env.batch)
        for frame in loop(env, collector, save, teleporter=teleporter):
            data.t = frame
            new_states = [state for state in states(env.board, convert, layers)]
            # NOTE(review): this first loss is immediately overwritten by the
            # transformNot() result below — only the second loss is collected.
            # Presumably transform() is kept for its side effects on data/model; verify.
            loss = transform(old_states, new_states, dones, rewards, data, layers, model, use_model=use_model)
            loss = transformNot(env.board, new_states, player, goal, convert, data, layers, model, use_model=use_model)
            # Re-pick an intervention only where the state changed or the previous one succeeded.
            stateChanged = [old != new for old, new in zip(old_states, new_states)]
            shouldInterviene = [cond1 or cond2 for cond1, cond2 in zip(stateChanged, eatCheese)]
            # Linear decay from 1 toward 0 over K1 frames, floored at softmax_cap.
            exploration = max((explorationN - frame) / explorationN, defaults['softmax_cap'])
            if use_model:
                interventions = [(getInterventionsmodel(state, env.layers.types, layers, model, env, data.layers_not_in(state), frame) if should else old) for state, should, old in zip(new_states, shouldInterviene, interventions)]
            else:
                interventions = [(getInterventions(env, state, data, exploration) if should else old) for state, should, old in zip(new_states, shouldInterviene, interventions)]
            # Slice the chosen intervention channel out of the board and append it as an extra channel.
            modification = env.board[tensor(interventions)].unsqueeze(1)
            # Flattened argmax of each modification map = the target cell index per batch element.
            teleporter.interventions = tensor([m.flatten().argmax().item() for m in list(modification)], device=device)
            modified_board = cat((env.board, modification), dim=1)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, _, _ = teleporter.modify(observations, rewards, dones, info)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            # Player's flattened cell index per batch element (positions appear to be (x, y) — confirm).
            playerPositions = [(t := env.layers.dict[LayerType.Player].positions[i][0])[1] * env.layers.width + t[0] for i in range(env.batch)]
            # "Cheese eaten" = the player reached the intervened cell this frame.
            eatCheese = [intervention == player_pos for intervention, player_pos in zip(teleporter.interventions, playerPositions)]
            old_states = new_states
            collector.collect([rewards, modified_rewards], [dones, modified_dones])
            collector.collect_loss(loss)