    # return stdscr


if __name__ == '__main__':
    # Curses standard screen
    stdscr = curses.initscr()

    # Init environment
    width, height = 10, 20  # standard tetris friends rules
    env = TetrisEngine(width, height)

    # Play games on repeat
    while True:
        init()
        stdscr.clear()
        env.clear()
        db = play_game()

        # Return to terminal
        terminate()

        # Should the game info be saved?
        if save_game():
            try:
                fr = open('training_data.npy', 'rb')
                x = np.load(fr)
                fr.close()

                fw = open('training_data.npy', 'wb')
                x = np.concatenate((x, db))
                # print('Saving {0} moves...'.format(len(db)))
                np.save(fw, x)
                print('{0} data points in the training set'.format(len(x)))
            except FileNotFoundError:
                # First run: no training file exists yet, so start one
                # with this game's moves.
                fw = open('training_data.npy', 'wb')
                np.save(fw, db)
                print('{0} data points in the training set'.format(len(db)))
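######################################################################
# A minimal sketch of reading the collected data back for offline use.
# It assumes only what the loop above shows: the recorded moves are
# appended to 'training_data.npy' with np.save. The helper name
# load_training_data is illustrative, and allow_pickle=True is needed
# only if the stored entries are Python objects (e.g. tuples of board
# state and action) rather than a plain numeric array.

import numpy as np


def load_training_data(path='training_data.npy'):
    """Load the accumulated human-play data saved by the loop above."""
    with open(path, 'rb') as f:
        data = np.load(f, allow_pickle=True)
    print('{0} data points loaded'.format(len(data)))
    return data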
def main(episode, load, learn, debug, random_rate, session):
    load_model = load
    print("load model", load_model, "learn", learn, "debug", debug, "episode", episode)

    width, height = 7, 14  # reduced board (standard Tetris Friends rules are 10 x 20)
    env = TetrisEngine(width, height)
    action_count = 7
    agent = Agent(lr=1e-4, input_dims=width * height, gamma=0.5,
                  n_actions=action_count, l1_size=512, l2_size=128)

    if session:
        model_filename = "%s-trained_model.torch" % session
    else:
        model_filename = "trained_model.torch"

    # Total number of trainable parameters in the policy network
    parameter_size = sum(p.numel() for p in agent.policy.parameters())
    print("network parameter size:", parameter_size)

    action_idx = 0
    if load_model:
        agent.policy.load_state_dict(T.load(model_filename))

    for i in range(episode):
        done = False
        score = 0
        state = env.clear()
        counter = 0

        while not done:
            counter += 1
            action, probs = agent.choose_action(state)
            prob = probs[action].item()
            state, reward, done = env.step(action)
            agent.store_rewards(reward)
            score += reward

            if debug:
                stdscr = curses.initscr()
                stdscr.clear()
                stdscr.addstr(str(env))
                stdscr.addstr('\ncumulative reward: ' + str(score))
                stdscr.addstr('\nreward: ' + str(reward))
                stdscr.refresh()
                time.sleep(.2)
                continue

            if not debug and i % 100 == 0 and counter % 100 == 1:
                idx2direction = {
                    0: "left",
                    1: "right",
                    2: "hard_drop",
                    3: "soft_drop",
                    4: "rotate_left",
                    5: "rotate_right",
                    6: "idle"
                }
                probs_str = ""
                for z, item in enumerate(probs):
                    probs_str += "%s:%0.2f, " % (idx2direction[z], item.item())
                print(probs_str)
                print('episode: ', i, 'counter: ', counter,
                      'reward %0.3f' % reward,
                      'action: %s (%0.2f)' % (action, prob))
                writer.add_scalar("action prob", prob, action_idx)
                action_idx += 1

        if not debug and i % 100 == 0:
            print('episode: ', i, 'score %0.3f' % score)
            writer.add_scalar("final score", score, i)

        if learn:
            agent.learn()
        if i % 1000 == 0:
            T.save(agent.policy.state_dict(), model_filename)

    writer.close()
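######################################################################
# main() above is only a function definition; how it is invoked is not
# shown in this excerpt. The snippet below is one possible entry point,
# a sketch using argparse with flag names chosen here for illustration
# (the project may wire these arguments up differently, e.g. with click).

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train the Tetris policy agent')
    parser.add_argument('--episode', type=int, default=10000,
                        help='number of episodes to run')
    parser.add_argument('--load', action='store_true',
                        help='load a previously saved model before training')
    parser.add_argument('--learn', action='store_true',
                        help='update the policy after each episode')
    parser.add_argument('--debug', action='store_true',
                        help='render the board with curses while playing')
    parser.add_argument('--random-rate', type=float, default=0.0,
                        help='exploration rate (unused in the excerpt above)')
    parser.add_argument('--session', type=str, default='',
                        help='session name used to prefix the saved model file')
    args = parser.parse_args()

    main(args.episode, args.load, args.learn, args.debug,
         args.random_rate, args.session)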
        print("=> loaded checkpoint '{}' (epoch {})".format(
            CHECKPOINT_FILE, start_epoch))
    else:
        print("=> no checkpoint found at '{}'".format(CHECKPOINT_FILE))

######################################################################
#
# Below, you can find the main training loop. At the beginning we reset
# the environment and initialize the ``state`` variable. Then, we sample
# an action, execute it, observe the next board state and the reward,
# and optimize our model once. When the episode ends (our model fails),
# we restart the loop.

f = open('log.out', 'w+')
for i_episode in count(start_epoch):
    # Initialize the environment and state
    state = FloatTensor(engine.clear()[None, None, :, :])
    score = 0
    for t in count():
        # Select and perform an action
        action = select_action(state).type(LongTensor)

        # Observations
        last_state = state
        state, reward, done = engine.step(action[0, 0])
        state = FloatTensor(state[None, None, :, :])

        # Accumulate reward
        score += int(reward)

        reward = FloatTensor([float(reward)])
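######################################################################
# The excerpt above stops before the "optimize our model once" step that
# the comment describes. Below is a self-contained sketch of that step
# under standard DQN assumptions: transitions are pushed into a replay
# memory and a Q-network is fit to one-step bootstrapped targets. The
# names here (Transition, ReplayMemory, QNetwork, optimize_model) and
# the fully connected network over flattened board states are
# illustrative, not necessarily what this project uses.

import random
from collections import deque, namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


class QNetwork(nn.Module):
    def __init__(self, n_inputs, n_actions):
        super().__init__()
        self.fc1 = nn.Linear(n_inputs, 128)
        self.fc2 = nn.Linear(128, n_actions)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


def optimize_model(model, memory, optimizer, batch_size=32, gamma=0.99):
    """One gradient step on a minibatch sampled from the replay memory."""
    if len(memory) < batch_size:
        return
    batch = Transition(*zip(*memory.sample(batch_size)))
    states = torch.stack(batch.state)        # each state: flat board tensor
    actions = torch.tensor(batch.action).unsqueeze(1)
    rewards = torch.tensor(batch.reward, dtype=torch.float32)
    next_states = torch.stack(batch.next_state)
    dones = torch.tensor(batch.done, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken
    q_values = model(states).gather(1, actions).squeeze(1)

    # Bootstrapped target: r + gamma * max_a' Q(s', a'), zeroed at episode end
    with torch.no_grad():
        next_q = model(next_states).max(1)[0]
    targets = rewards + gamma * next_q * (1.0 - dones)

    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()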
def print_placement(state):
    s = np.asarray(state)
    s = np.swapaxes(s, 1, 0)
    print(s)


agent = FixedPolicyAgent()

if __name__ == '__main__':
    # Check if user specified to resume from a checkpoint
    start_epoch = 0
    best_score = float('-inf')

    for i_episode in count(start_epoch):
        # Initialize the environment and state
        state = engine.clear()
        last_state = None
        score = 0
        cl = 0
        for t in count():
            # Select and perform an action
            actions_name, placement, actions = agent.select_action(
                engine, engine.shape, engine.anchor, engine.board)

            # Observations
            state, reward, done, cleared_lines, sent_lines = engine.step_to_final(actions)
            if not done:
                last_state = state

            # Accumulate reward
            score += reward
            cl += cleared_lines