def __getitem__(self, idx):
    """Return the example at position ``idx`` as a dict of arrays.

    If ``self.examples`` is non-empty, the example is decoded from the JSON
    string at ``self.examples[idx]``.  Otherwise ``self.file_offsets`` (a
    sorted sequence of ``(offset, fnames_idx)`` pairs) is searched for the
    file that contains ``idx`` and that file is streamed record by record
    until the wanted one is reached.

    The returned dict holds one-element arrays for ``actions``,
    ``advantages``, ``value_preds`` and ``returns``, the optional ``probs``
    / ``log_probs`` entries copied from the record, and every feature
    produced by ``self.policy.extract`` under a ``features_`` prefix.

    Raises:
        ValueError: if the selected file ends before the wanted record.
    """
    if self.examples:
        item = json.loads(self.examples[idx])
    else:
        # The probe compares greater than any stored entry with the same
        # offset (assuming stored file indices are < len(self.fnames)), so
        # the slot just before the insertion point is the last entry whose
        # offset is <= idx.
        probe = (idx, len(self.fnames))
        slot = bisect.bisect_left(self.file_offsets, probe) - 1
        offset, fnames_idx = self.file_offsets[slot]
        fname = self.fnames[fnames_idx]
        to_go = idx - offset

        found = False
        for i, item in enumerate(jsons.stream(fname)):
            if i == to_go:
                found = True
                break
        if not found:
            raise ValueError('offset too high: %s, %s, %s' % (idx, offset, fname))

    if self.transform:
        item = self.transform(item)

    fs = self.policy.extract(item['state'], item['candidates'])

    scalar_fields = (
        ('actions', 'action'),
        ('advantages', 'advantage'),
        ('value_preds', 'value_pred'),
        ('returns', 'return'),
    )
    rv = {
        out_key: np.asarray([item[in_key]], dtype=config.nt())
        for out_key, in_key in scalar_fields
    }
    # Optional per-action distributions are passed through untouched.
    rv.update((k, item[k]) for k in ('probs', 'log_probs') if k in item)
    rv.update(('features_' + k, v) for k, v in fs.items())
    return rv
def box(s):
    """Wrap ``s`` into a numpy array, casting non-integer data to ``config.nt()``.

    A value that is not a list, tuple or ndarray is first wrapped in a
    one-element tuple.  Arrays with an integer dtype are returned as-is;
    everything else is converted with ``astype(config.nt())``.
    """
    wrapped = s if isinstance(s, (list, tuple, np.ndarray)) else (s, )
    arr = np.asarray(wrapped)
    if np.issubdtype(arr.dtype, np.integer):
        return arr
    return arr.astype(config.nt())
def nhot(self, names):
    """Return a length-``self.size`` vector with ones at the given names.

    ``names`` may be a single name or a list/tuple of names; each one is
    mapped to a position with ``self.to_index``.  The vector uses the dtype
    returned by ``config.nt()``.
    """
    if isinstance(names, (list, tuple)):
        wanted = names
    else:
        wanted = (names,)

    encoded = np.zeros(self.size, dtype=config.nt())
    for name in wanted:
        position = self.to_index(name)
        encoded[position] = 1
    return encoded
def extract(self, state, candidates):
    """Build the feature dict for ``state`` plus a legality mask.

    State features come from the module-level ``extract`` called with
    ``self.poke_features``.  ``candidates`` is copied and right-padded with
    ``None`` until it is as long as ``parser.all_actions_singles()``; the
    ``mask`` entry is 1.0 for every truthy candidate and 0.0 otherwise.
    """
    features = extract(state, self.poke_features)

    padded = list(candidates)  # copy so the caller's sequence is not mutated
    while len(padded) < len(parser.all_actions_singles()):
        padded.append(None)

    flags = [1.0 if c else 0.0 for c in padded]
    features['mask'] = np.asarray(flags).astype(config.nt())
    return features
def _convert(self, item):
    """Turn one raw record into the dict of arrays used for training.

    ``self.transform`` is applied first when it is set.  The result holds
    one-element arrays for ``actions``, ``advantages``, ``value_preds`` and
    ``returns`` (dtype ``config.nt()``), the optional ``probs`` /
    ``log_probs`` entries copied from the record, and every feature from
    ``self.policy.extract`` stored under a ``features_`` prefix.
    """
    if self.transform:
        item = self.transform(item)

    fs = self.policy.extract(item['state'], item['candidates'])

    def as_scalar_array(field):
        # Wrap a single record value into a 1-element array.
        return np.asarray([item[field]], dtype=config.nt())

    rv = {
        'actions': as_scalar_array('action'),
        'advantages': as_scalar_array('advantage'),
        'value_preds': as_scalar_array('value_pred'),
        'returns': as_scalar_array('return'),
    }

    for optional in ('probs', 'log_probs'):
        if optional in item:
            rv[optional] = item[optional]

    rv.update(('features_' + name, value) for name, value in fs.items())
    return rv
def gen7rb_level(base):
    """Find the level at which ``F.mbst(base, level)`` reaches the weakest level-100 value.

    The target is the minimum of ``F.mbst(base, 100.)``.  Each entry starts
    from the estimate ``floor(100 * target / mbst_at_100)`` and its level is
    raised by one per iteration until ``F.mbst(base, level)`` is at least
    the target or the level has reached 100.

    NOTE(review): the exact meaning of ``F.mbst`` is not visible here;
    presumably a level-scaled stat total -- confirm against ``F``.

    Returns:
        The resulting level(s), with the same shape as ``F.mbst(base, 100.)``.
    """
    mbst_at_100 = F.mbst(base, 100.)
    mbstmin = np.min(mbst_at_100)
    level = np.floor(100 * mbstmin / mbst_at_100)
    # The original evaluated F.mbst(base, level) once more before the loop;
    # that result was overwritten by the first loop statement, so the
    # redundant call has been dropped.
    while True:
        mbst = F.mbst(base, level)
        # An entry is finished once it meets the target or is capped at 100.
        done = np.dtype(bool).type((mbst >= mbstmin) | (level >= 100))
        if np.all(done):
            break
        # Bump only the unfinished entries by one level.
        level += np.dtype(config.nt()).type(~done)
    return level
def act(self, state, candidates):
    """Choose one legal action heuristically and return it as one-hot ``probs``.

    Every legal move is scored with ``compute_power`` against the opposing
    active pokemon; every legal switch is scored by summing
    ``compute_multiplier`` over the opposing active pokemon's types.  When
    ``self._type_aware`` is false the opposing pokemon is ignored (treated
    as ``None``).  If at least one move is legal, the best move according to
    ``self._move_sort_key`` is taken; otherwise the first switch according
    to ``self._switch_sort_key``.  Candidates are shuffled before the stable
    sort so that ties are broken at random.

    Returns:
        ``dict(probs=...)`` where ``probs`` has ``len(candidates)`` entries
        and a single 1 at the selected index.
    """
    me = nav.get_player_side(state)
    opp = nav.get_opponent_side(state)

    active = nav.get_active_pokes(me)
    active = active[0] if active else active
    moves = [track[0] for track in active['moveTrack']] if active else active

    opp_active = nav.get_active_pokes(opp)
    opp_active = opp_active[0] if opp_active else opp_active
    if not self._type_aware:
        opp_active = None

    move_candidates = []
    switch_candidates = []
    for i, action in enumerate(actions.GEN7SINGLES):
        if not candidates[i]:
            continue
        if action.type == 'move':
            move_key = moves[action.slot - 1]
            power = compute_power(move_key, opp_active)
            move_candidates.append((power, action, i))
            continue
        # action.type == 'switch'
        poke = me['pokemon'][action.slot - 1]
        weakness = 0.
        if opp_active:
            for opp_type in opp_active['types']:
                weakness += compute_multiplier(opp_type, poke['types'])
        switch_candidates.append((weakness, action, i))

    # Moves take priority; switches are only considered when no move is legal.
    if move_candidates:
        pool = move_candidates
        sort_key = self._move_sort_key
        descending = True
    else:
        pool = switch_candidates
        sort_key = self._switch_sort_key
        descending = False

    random.shuffle(pool)
    pool.sort(key=sort_key, reverse=descending)
    selected = pool[0][-1]

    rv = np.zeros(len(candidates), dtype=config.nt())
    rv[selected] = 1.
    return dict(probs=rv)
def fn():
    """Decide on an action for the current request and publish it.

    Closure: ``self``, ``rv`` and ``block_updates`` come from the enclosing
    scope (not visible here).  The chosen action string is delivered via
    ``rv.set`` and the full decision record is appended to ``self.blocks``.
    """
    state = self.engine.fetch(self.gid, self.request)
    if self.request.get('teamPreview') and self.candidates == 'teampreview':
        # Team preview without explicit candidates: send a random team order.
        order = [str(i + 1) for i in range(self.request['maxTeamSize'])]
        random.shuffle(order)
        action_string = 'team ' + ','.join(order)
        result = dict(state=state,
                      actionString=action_string,
                      _updates=block_updates)
    else:
        result = self.policy.act(state, self.candidates)
        # 1.0 for every legal (truthy) candidate, 0.0 otherwise.
        mask = np.asarray([1. if c else 0. for c in self.candidates])
        # The policy may answer asynchronously; block until it is ready.
        if isinstance(result, gevent.event.AsyncResult):
            result = result.get()
        probs = result['probs']
        # Epsilon-mix the policy distribution with a uniform distribution
        # over the legal candidates.
        probs = (1. - self._epsilon) * probs + (
            self._epsilon * mask / sum(mask)).astype(config.nt())
        if self._play_best_move:
            action = np.argmax(probs)
        else:
            action = np.random.choice(len(self.candidates), p=probs)
        # TODO: why not just use self.candidates?
        if self.request.get('teamPreview'):
            all_actions = _teampreview_actions
        else:
            all_actions = _singles_actions
        action_string = all_actions[action]
        # Record everything about the decision on the policy's result dict.
        result['candidates'] = self.candidates
        result['state'] = state
        result['action'] = action
        result['actionString'] = action_string
        result['_updates'] = block_updates
    self.blocks.append(result)
    # NOTE(review): rv is presumably an AsyncResult-like object -- confirm
    # against the enclosing function.
    rv.set(action_string)
def extract(self, state, candidates):
    """Return ``state`` and a legality mask as arrays of dtype ``config.nt()``.

    ``mask`` is 1.0 for each truthy candidate and 0.0 otherwise; ``state``
    is wrapped in a one-element list before conversion, which adds a
    leading axis of length one.
    """
    flags = [1.0 if c else 0.0 for c in candidates]
    mask = np.asarray(flags).astype(config.nt())
    wrapped_state = np.asarray([state]).astype(config.nt())
    return {'state': wrapped_state, 'mask': mask}
def rollup(policy,
           iter_dir,
           gamma,
           lam,
           reward_shaper=None,
           num_workers=0,
           progress_type='bar'):
    """Concatenate the rollouts of one iteration into parallel arrays.

    The result stores, row-aligned across all files:
      - Features (one ``features_<name>`` array per extracted feature)
      - Action taken (as an index)
      - Actual return
      - Advantage
      - Value predictions, and ``probs`` / ``log_probs`` when the parsed
        records contain them

    Args:
        policy: object providing ``extract(state, candidates)`` and ``pkl()``.
        iter_dir: either a directory (string) searched for ``*.jsons.gz``
            files, or an explicit list of file names.
        gamma, lam, reward_shaper: forwarded to ``battlelogs.parse`` and to
            the workers.
        num_workers: number of worker processes.  ``0`` (or a negative
            value) runs a single worker thread in this process instead.
        progress_type: ``'bar'`` (tqdm), ``'log'`` (one log line per
            percentage point) or ``'none'``.

    Returns:
        dict mapping array name to a numpy array whose first dimension is
        the total number of rows.

    Raises:
        Exception: any exception object a worker puts on the output queue
            is re-raised here.
    """
    assert progress_type in {'bar', 'log', 'none'}
    if isinstance(iter_dir, six.string_types):
        fnames = list(utils.find(iter_dir, '*.jsons.gz'))
    else:
        assert isinstance(iter_dir, list)
        fnames = iter_dir
    fnames = sorted(fnames)
    logger.info('Rollup has %s files', len(fnames))

    # Count lines in parallel; always release the pool, even if map() fails.
    pool = mulproc.Pool()
    try:
        linecount = dict(zip(fnames, pool.map(utils.linecount, fnames)))
    finally:
        pool.close()
        pool.join()

    # First output row of every file.  Each file contributes one row fewer
    # than it has lines (presumably the last line is not a training row --
    # confirm against battlelogs.parse).
    start_rows = {}
    nrows = 0
    for fname in fnames:
        start_rows[fname] = nrows
        nrows += (linecount[fname] - 1)
    logger.info('Rollup has %s rows', nrows)

    # read the first file, to see what the sizes are
    t = battlelogs.parse(fnames[0],
                         gamma=gamma,
                         lam=lam,
                         reward_shaper=reward_shaper)[-1]
    fs = policy.extract(t['state'], t['candidates'])
    n_actions = 0
    if 'mask' in fs:
        n_actions = fs['mask'].shape[0]

    # name -> (shape, dtype) of every output array
    type_info = {
        'actions': ((nrows, ), 'int64'),
        'advantages': ((nrows, ), config.nt()),
        'returns': ((nrows, ), config.nt()),
        'value_preds': ((nrows, ), config.nt()),
    }
    for k in ['probs', 'log_probs']:
        if k in t:
            na = max(t[k].shape[0], n_actions)
            type_info[k] = ((nrows, na), config.nt())
    for k, v in fs.items():
        type_info['features_' + k] = ((nrows, ) + v.shape, v.dtype)

    # Worker processes need buffers and queues from the multiprocessing
    # module; the in-process thread can use plain ones.  A negative
    # num_workers used to select the in-process buffers but then start no
    # worker at all, blocking forever on out_queue.get(); it is now treated
    # like 0.
    use_processes = num_workers > 0
    if use_processes:
        mk_buf_fn = _mk_RawArray
        queue_mod = mulproc
    else:
        mk_buf_fn = _mk_volatile_buffer
        queue_mod = queue

    underlying = {}
    data = {}
    for k, (shape, dtype) in type_info.items():
        size = six.moves.reduce(lambda x, y: x * y, shape, 1)
        u = underlying[k] = mk_buf_fn(shape, dtype)
        # numpy view over the raw buffer; workers write through `underlying`.
        d = np.frombuffer(u, dtype=dtype, count=size)
        d.shape = shape
        data[k] = d

    in_queue = queue_mod.Queue()
    for kv in start_rows.items():
        in_queue.put(kv)
    out_queue = queue_mod.Queue()

    kwargs = dict(
        type_info=type_info,
        underlying=underlying,
        policy_pkl=policy.pkl(),
        gamma=gamma,
        lam=lam,
        reward_shaper=reward_shaper,
        in_queue=in_queue,
        out_queue=out_queue,
    )

    # Read the rest of the files.  One `None` sentinel is queued per worker.
    if not use_processes:
        in_queue.put(None)
        worker = threading.Thread(target=_worker_loop, kwargs=kwargs)
        worker.daemon = True
        worker.start()
        workers = [worker]
    else:
        workers = [
            mulproc.Process(target=_worker_loop, kwargs=kwargs)
            for _ in six.moves.range(num_workers)
        ]
        for worker in workers:
            worker.daemon = True
            in_queue.put(None)
            worker.start()

    pbar = fnames
    total = len(fnames)
    if progress_type == 'bar':
        pbar = tqdm.tqdm(pbar)

    # Exactly one message is expected on out_queue per input file: either
    # the finished file name or an exception raised by a worker.
    for i, _ in enumerate(pbar):
        fname = out_queue.get()
        if isinstance(fname, Exception):
            raise fname
        if progress_type == 'bar':
            pbar.set_description(fname)
        elif progress_type == 'log':
            current_pct = int(100 * (i + 1) / total)
            prev_pct = int(100 * i / total)
            # Log only when a whole percentage point has been crossed.
            if current_pct > prev_pct:
                logger.info('Rolled up: [%d/%d (%d%%)] %s', i + 1, total,
                            current_pct, fname)

    for worker in workers:
        worker.join()

    return data
def extract(self, state, candidates):
    """Split ``state`` into two flattened indicator planes plus a mask.

    ``pp`` marks the cells of ``state`` equal to +1 and ``pm`` the cells
    equal to -1; ``mask`` is ``candidates`` converted to an array.  All
    three are cast to the dtype returned by ``config.nt()``.
    """
    plus_plane = (state == +1).flatten().astype(config.nt())
    minus_plane = (state == -1).flatten().astype(config.nt())
    mask = np.asarray(candidates).astype(config.nt())
    return {'pp': plus_plane, 'pm': minus_plane, 'mask': mask}
def zeros(*args, **kwargs):
    """Construct an object via ``_ctor(config.nt(), cuda)(*args)`` filled with 0.

    Positional arguments are forwarded to the constructor unchanged.  Only
    the ``cuda`` keyword is read; any other keyword is ignored.  Returns
    whatever ``fill_(0)`` returns on the constructed object.
    """
    use_cuda = kwargs.get('cuda')
    constructor = _ctor(config.nt(), use_cuda)
    instance = constructor(*args)
    return instance.fill_(0)
def gen7_base_stats():
    """Load ``base-stats.tsv`` from ``self._dirname`` as a numeric frame.

    ``self`` is taken from the enclosing scope.  The tab-separated file is
    read with pandas, the ``Pokemon`` column is removed, and the remaining
    columns are cast to the dtype returned by ``config.nt()``.
    """
    path = os.path.join(self._dirname, 'base-stats.tsv')
    table = pd.read_csv(path, sep='\t')
    numeric = table.drop('Pokemon', axis=1)
    return numeric.astype(config.nt())
def extract(self, state, candidates):
    """Return only a legality mask; ``state`` is not used.

    ``mask`` holds 1.0 for each truthy candidate and 0.0 otherwise, cast to
    the dtype returned by ``config.nt()``.
    """
    flags = [1.0 if c else 0.0 for c in candidates]
    return {'mask': np.asarray(flags).astype(config.nt())}
def test_gradient_step_direction(self):
    """Test that good actions are boosted and bad actions are dampened

    A two-row batch is built from the same board: action 4 with advantage
    -1 and action 2 with advantage +1.  After one PPO update the
    probability of action 2 must have gone up and that of action 4 down.
    """
    net = Policy().type(config.tt())
    updater = PPOUpdater(
        policy=net,
        opt_lr=1e-1,
        num_epochs=1,
        vbatch_size=2,
        clip_param=0.1,
    )

    def twice(array):
        # Stack two copies of `array` along a new leading batch axis.
        return np.repeat(np.expand_dims(array, axis=0), 2, axis=0)

    # Fake a training example
    net = net.eval()
    state = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]], dtype=int)
    mask = np.array([1] * 9)
    before = net.act(state, mask)
    features = net.extract(state, mask)
    old_probs = before['probs']
    old_log_probs = before['log_probs']

    batched_features = {name: twice(value) for name, value in features.items()}

    extras = dict(
        advantages=np.array([-1., +1.], dtype=config.nt()),
        log_probs=twice(old_log_probs),
        actions=np.array([4, 2], dtype='int64'),
        value_preds=np.zeros(2, dtype=config.nt()),
        returns=np.zeros(2, dtype=config.nt()),
    )
    extras.update(
        ('features_' + name, value) for name, value in batched_features.items())

    learner.post_prepare(extras)
    # Reset the advantages after post_prepare so the test controls their
    # exact values.
    extras['advantages'] = np.array([-1., +1.], dtype=config.nt())
    dataset = TTensorDictDataset(
        {key: torch.from_numpy(value) for key, value in extras.items()})
    updater.update(dataset)

    net = net.eval()
    after = net.act(state, mask)
    new_probs = after['probs']
    new_log_probs = after['log_probs']

    self.assertTrue(np.allclose(np.log(old_probs), old_log_probs))
    self.assertTrue(np.allclose(np.log(new_probs), new_log_probs))
    self.assertGreater(new_probs[2], old_probs[2])
    self.assertLess(new_probs[4], old_probs[4])
def convert(replay):
    '''Deprecated: always raises ``ValueError('Deprecated')``.

    Historical contract, kept for reference: the function returned two
    sequences of Blocks, one for p1 and one for p2.

    Each Block (that is not the last) contained:
      - _updates: The sequence of log messages that that user sees on that
        turn
      - request: The request object that is associated with this block.
      - state: The current state.
      - action: an integer in [0, N) (now N = 10, soon N = 10 + 12 for
        mega/zmove/ultra) representing the action that was actually taken
      - candidates: a list of N action names (directly sent to PS)
          - the name in the slot will be None if action is not allowed

    The last block contained the winner of the battle, and the logs that
    lead up until that log.

    The two sequences of blocks might not be of equal length; there are
    some turns where only one player is required to make a decision.

    The implementation that used to follow the ``raise`` was unreachable
    and has been removed.

    Raises:
        ValueError: unconditionally.
    '''
    raise ValueError('Deprecated')