def preprocess_samples_discrete( self, samples: Samples, minibatch_size: int, one_hot_action: bool = True) -> List[TrainingDataPage]: logger.info("Shuffling...") samples.shuffle() logger.info("Preprocessing...") net = core.Net("gridworld_preprocessing") C2.set_net(net) preprocessor = PreprocessorNet(True) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "state_norm", False, False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "next_state_norm", False, False, False, ) workspace.RunNetOnce(net) logger.info("Converting to Torch...") actions_one_hot = torch.tensor((np.array(samples.actions).reshape( -1, 1) == np.array(self.ACTIONS)).astype(np.int64)) actions = actions_one_hot.argmax(dim=1, keepdim=True) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) action_probabilities = torch.tensor(samples.action_probabilities, dtype=torch.float32).reshape( -1, 1) next_actions_one_hot = torch.tensor( (np.array(samples.next_actions).reshape(-1, 1) == np.array( self.ACTIONS)).astype(np.int64)) logger.info("Converting PNA to Torch...") possible_next_action_strings = np.array( list( itertools.zip_longest(*samples.possible_next_actions, fillvalue=""))).T possible_next_actions_mask = torch.zeros( [len(samples.next_actions), len(self.ACTIONS)]) for i, action in enumerate(self.ACTIONS): possible_next_actions_mask[:, i] = torch.tensor( np.max(possible_next_action_strings == action, axis=1).astype(np.int64)) terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1) not_terminals = 1 - terminals episode_values = None logger.info("Converting RT to Torch...") episode_values = torch.tensor(samples.episode_values, dtype=torch.float32).reshape(-1, 1) time_diffs = torch.ones([len(samples.states), 1]) logger.info("Preprocessing...") preprocessor = Preprocessor(self.normalization, False) states_ndarray = workspace.FetchBlob(state_matrix) states_ndarray = preprocessor.forward(states_ndarray) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_states_ndarray = preprocessor.forward(next_states_ndarray) logger.info("Batching...") tdps = [] for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_one_hot[start:end] if one_hot_action else actions[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], not_terminals=not_terminals[start:end], next_actions=next_actions_one_hot[start:end], possible_next_actions=possible_next_actions_mask[start:end], episode_values=episode_values[start:end] if episode_values is not None else None, time_diffs=time_diffs[start:end], ) tdp.set_type(torch.FloatTensor) tdps.append(tdp) return tdps
def preprocess_samples( self, samples: Samples, minibatch_size: int, use_gpu: bool = False, one_hot_action: bool = True, normalize_actions: bool = True, ) -> List[TrainingDataPage]: logger.info("Shuffling...") samples.shuffle() logger.info("Sparse2Dense...") net = core.Net("gridworld_preprocessing") C2.set_net(net) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") sorted_state_features, _ = sort_features_by_normalization(self.normalization) state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_state_features ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_state_features ) sorted_action_features, _ = sort_features_by_normalization( self.normalization_action ) saa = StackedAssociativeArray.from_dict_list(samples.actions, "action") action_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) saa = StackedAssociativeArray.from_dict_list( samples.next_actions, "next_action" ) next_action_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) action_probabilities = torch.tensor( samples.action_probabilities, dtype=torch.float32 ).reshape(-1, 1) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) pnas_lengths_list = [] pnas_flat: List[List[str]] = [] for pnas in samples.possible_next_actions: pnas_lengths_list.append(len(pnas)) pnas_flat.extend(pnas) saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions") pnas_lengths = torch.tensor(pnas_lengths_list, dtype=torch.int32) pna_lens_blob = "pna_lens_blob" workspace.FeedBlob(pna_lens_blob, pnas_lengths.numpy()) possible_next_actions_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob) workspace.RunNetOnce(net) logger.info("Preprocessing...") state_preprocessor = Preprocessor(self.normalization, False) action_preprocessor = Preprocessor(self.normalization_action, False) states_ndarray = workspace.FetchBlob(state_matrix) states_ndarray = state_preprocessor.forward(states_ndarray) actions_ndarray = torch.from_numpy(workspace.FetchBlob(action_matrix)) if normalize_actions: actions_ndarray = action_preprocessor.forward(actions_ndarray) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_states_ndarray = state_preprocessor.forward(next_states_ndarray) next_actions_ndarray = torch.from_numpy(workspace.FetchBlob(next_action_matrix)) if normalize_actions: next_actions_ndarray = action_preprocessor.forward(next_actions_ndarray) logged_possible_next_actions = action_preprocessor.forward( workspace.FetchBlob(possible_next_actions_matrix) ) state_pnas_tile = state_preprocessor.forward( workspace.FetchBlob(state_pnas_tile_blob) ) logged_possible_next_state_actions = torch.cat( (state_pnas_tile, logged_possible_next_actions), dim=1 ) logger.info("Reward Timeline to Torch...") possible_next_actions_ndarray = logged_possible_next_actions possible_next_actions_state_concat = logged_possible_next_state_actions time_diffs = torch.ones([len(samples.states), 1]) tdps = [] pnas_start = 0 logger.info("Batching...") for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + torch.sum(pnas_lengths[start:end]) pnas = possible_next_actions_ndarray[pnas_start:pnas_end] pnas_concat = possible_next_actions_state_concat[pnas_start:pnas_end] pnas_start = pnas_end tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], possible_next_actions=None, not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1), time_diffs=time_diffs[start:end], possible_next_actions_lengths=pnas_lengths[start:end], possible_next_actions_state_concat=pnas_concat, ) tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor) tdps.append(tdp) return tdps
def preprocess_samples_discrete( self, states: List[Dict[int, float]], actions: List[str], rewards: List[float], next_states: List[Dict[int, float]], next_actions: List[str], is_terminals: List[bool], possible_next_actions: List[List[str]], reward_timelines: Optional[List[Dict[int, float]]], minibatch_size: int, ) -> List[TrainingDataPage]: # Shuffle if reward_timelines is None: merged = list( zip(states, actions, rewards, next_states, next_actions, is_terminals, possible_next_actions)) random.shuffle(merged) states, actions, rewards, next_states, next_actions, \ is_terminals, possible_next_actions = zip(*merged) else: merged = list( zip(states, actions, rewards, next_states, next_actions, is_terminals, possible_next_actions, reward_timelines)) random.shuffle(merged) states, actions, rewards, next_states, next_actions, \ is_terminals, possible_next_actions, reward_timelines = zip(*merged) net = core.Net('gridworld_preprocessing') C2.set_net(net) preprocessor = PreprocessorNet(net, True) saa = StackedAssociativeArray.from_dict_list(states, 'states') state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'state_norm', ) saa = StackedAssociativeArray.from_dict_list(next_states, 'next_states') next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'next_state_norm', ) workspace.RunNetOnce(net) actions_one_hot = np.zeros( [len(actions), len(self.ACTIONS)], dtype=np.float32) for i, action in enumerate(actions): actions_one_hot[i, self.ACTIONS.index(action)] = 1 rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1) next_actions_one_hot = np.zeros( [len(next_actions), len(self.ACTIONS)], dtype=np.float32) for i, action in enumerate(next_actions): if action == '': continue next_actions_one_hot[i, self.ACTIONS.index(action)] = 1 possible_next_actions_mask = [] for pna in possible_next_actions: pna_mask = [0] * self.num_actions for action in pna: pna_mask[self.ACTIONS.index(action)] = 1 possible_next_actions_mask.append(pna_mask) possible_next_actions_mask = np.array(possible_next_actions_mask, dtype=np.float32) is_terminals = np.array(is_terminals, dtype=np.bool).reshape(-1, 1) not_terminals = np.logical_not(is_terminals) if reward_timelines is not None: reward_timelines = np.array(reward_timelines, dtype=np.object) states_ndarray = workspace.FetchBlob(state_matrix) next_states_ndarray = workspace.FetchBlob(next_state_matrix) tdps = [] for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_one_hot[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], not_terminals=not_terminals[start:end], next_actions=next_actions_one_hot[start:end], possible_next_actions=possible_next_actions_mask[ start:end], reward_timelines=reward_timelines[start:end] if reward_timelines is not None else None, )) return tdps
def preprocess_samples_discrete( self, samples: Samples, minibatch_size: int) -> List[TrainingDataPage]: samples.shuffle() net = core.Net("gridworld_preprocessing") C2.set_net(net) preprocessor = PreprocessorNet(True) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "state_norm", False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "next_state_norm", False, False, ) workspace.RunNetOnce(net) actions_one_hot = np.zeros( [len(samples.actions), len(self.ACTIONS)], dtype=np.float32) for i, action in enumerate(samples.actions): actions_one_hot[i, self.action_to_index(action)] = 1 rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1) propensities = np.array(samples.propensities, dtype=np.float32).reshape(-1, 1) next_actions_one_hot = np.zeros( [len(samples.next_actions), len(self.ACTIONS)], dtype=np.float32) for i, action in enumerate(samples.next_actions): if action == "": continue next_actions_one_hot[i, self.action_to_index(action)] = 1 possible_next_actions_mask = [] for pna in samples.possible_next_actions: pna_mask = [0] * self.num_actions for action in pna: pna_mask[self.action_to_index(action)] = 1 possible_next_actions_mask.append(pna_mask) possible_next_actions_mask = np.array(possible_next_actions_mask, dtype=np.float32) is_terminals = np.array(samples.is_terminal, dtype=np.bool).reshape(-1, 1) not_terminals = np.logical_not(is_terminals) if samples.reward_timelines is not None: reward_timelines = np.array(samples.reward_timelines, dtype=np.object) states_ndarray = workspace.FetchBlob(state_matrix) next_states_ndarray = workspace.FetchBlob(next_state_matrix) tdps = [] for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_one_hot[start:end], propensities=propensities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], not_terminals=not_terminals[start:end], next_actions=next_actions_one_hot[start:end], possible_next_actions=possible_next_actions_mask[ start:end], reward_timelines=reward_timelines[start:end] if reward_timelines is not None else None, )) return tdps
def preprocess_samples( self, samples: Samples, minibatch_size: int, use_gpu: bool = False, one_hot_action: bool = True, normalize_actions: bool = True, ) -> List[TrainingDataPage]: logger.info("Shuffling...") samples = shuffle_samples(samples) logger.info("Sparse2Dense...") net = core.Net("gridworld_preprocessing") C2.set_net(net) sorted_state_features, _ = sort_features_by_normalization( self.normalization) sorted_action_features, _ = sort_features_by_normalization( self.normalization_action) state_sparse_to_dense_processor = Caffe2SparseToDenseProcessor( sorted_state_features) action_sparse_to_dense_processor = Caffe2SparseToDenseProcessor( sorted_action_features) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") state_matrix, state_matrix_presence, _ = state_sparse_to_dense_processor( saa) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, next_state_matrix_presence, _ = state_sparse_to_dense_processor( saa) saa = StackedAssociativeArray.from_dict_list( # type: ignore samples.actions, "action") action_matrix, action_matrix_presence, _ = action_sparse_to_dense_processor( saa) saa = StackedAssociativeArray.from_dict_list( # type: ignore samples.next_actions, "next_action") next_action_matrix, next_action_matrix_presence, _ = action_sparse_to_dense_processor( saa) action_probabilities = torch.tensor(samples.action_probabilities, dtype=torch.float32).reshape( -1, 1) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) max_action_size = 4 pnas_mask_list: List[List[int]] = [] pnas_flat: List[Dict[str, float]] = [] for pnas in samples.possible_next_actions: pnas_mask_list.append([1] * len(pnas) + [0] * (max_action_size - len(pnas))) pnas_flat.extend(pnas) # type: ignore for _ in range(max_action_size - len(pnas)): pnas_flat.append({}) # Filler saa = StackedAssociativeArray.from_dict_list( # type: ignore pnas_flat, "possible_next_actions") pnas_mask = torch.Tensor(pnas_mask_list) possible_next_actions_matrix, possible_next_actions_matrix_presence, _ = action_sparse_to_dense_processor( saa) workspace.RunNetOnce(net) logger.info("Preprocessing...") state_preprocessor = Preprocessor(self.normalization, False) action_preprocessor = Preprocessor(self.normalization_action, False) states_ndarray = state_preprocessor( torch.from_numpy(workspace.FetchBlob(state_matrix)), torch.from_numpy( workspace.FetchBlob(state_matrix_presence)).float(), ) if normalize_actions: actions_ndarray = action_preprocessor( torch.from_numpy(workspace.FetchBlob(action_matrix)), torch.from_numpy( workspace.FetchBlob(action_matrix_presence)).float(), ) else: actions_ndarray = torch.from_numpy( workspace.FetchBlob(action_matrix)) next_states_ndarray = torch.from_numpy( workspace.FetchBlob(next_state_matrix)) next_states_ndarray = state_preprocessor( next_states_ndarray, (next_states_ndarray != MISSING_VALUE).float()) state_pnas_tile = next_states_ndarray.repeat( 1, max_action_size).reshape(-1, next_states_ndarray.shape[1]) if normalize_actions: next_actions_ndarray = action_preprocessor( torch.from_numpy(workspace.FetchBlob(next_action_matrix)), torch.from_numpy( workspace.FetchBlob(next_action_matrix_presence)).float(), ) else: next_actions_ndarray = torch.from_numpy( workspace.FetchBlob(next_action_matrix)) if normalize_actions: logged_possible_next_actions = action_preprocessor( torch.from_numpy( workspace.FetchBlob(possible_next_actions_matrix)), torch.from_numpy( workspace.FetchBlob( possible_next_actions_matrix_presence)).float(), ) else: logged_possible_next_actions = torch.from_numpy( workspace.FetchBlob(possible_next_actions_matrix)) assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[ 0], ("Invalid shapes: " + str(state_pnas_tile.shape) + " != " + str(logged_possible_next_actions.shape)) logged_possible_next_state_actions = torch.cat( (state_pnas_tile, logged_possible_next_actions), dim=1) logger.info("Reward Timeline to Torch...") time_diffs = torch.ones([len(samples.states), 1]) tdps = [] pnas_start = 0 logger.info("Batching...") for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + (minibatch_size * max_action_size) tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True) > 0), time_diffs=time_diffs[start:end], possible_next_actions_mask=pnas_mask[start:end, :], possible_next_actions_state_concat= logged_possible_next_state_actions[pnas_start:pnas_end, :], ) pnas_start = pnas_end tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch. FloatTensor # type: ignore ) tdps.append(tdp) return tdps
def preprocess_samples_discrete( self, samples: Samples, minibatch_size: int, one_hot_action: bool = True, use_gpu: bool = False, do_shuffle: bool = True, ) -> List[TrainingDataPage]: if do_shuffle: logger.info("Shuffling...") samples = shuffle_samples(samples) logger.info("Preprocessing...") sparse_to_dense_processor = Caffe2SparseToDenseProcessor() if self.sparse_to_dense_net is None: self.sparse_to_dense_net = core.Net("gridworld_sparse_to_dense") C2.set_net(self.sparse_to_dense_net) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") sorted_features, _ = sort_features_by_normalization(self.normalization) self.state_matrix, _ = sparse_to_dense_processor(sorted_features, saa) saa = StackedAssociativeArray.from_dict_list( samples.next_states, "next_states" ) self.next_state_matrix, _ = sparse_to_dense_processor(sorted_features, saa) C2.set_net(None) else: StackedAssociativeArray.from_dict_list(samples.states, "states") StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") workspace.RunNetOnce(self.sparse_to_dense_net) logger.info("Converting to Torch...") actions_one_hot = torch.tensor( (np.array(samples.actions).reshape(-1, 1) == np.array(self.ACTIONS)).astype( np.int64 ) ) actions = actions_one_hot.argmax(dim=1, keepdim=True) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) action_probabilities = torch.tensor( samples.action_probabilities, dtype=torch.float32 ).reshape(-1, 1) next_actions_one_hot = torch.tensor( ( np.array(samples.next_actions).reshape(-1, 1) == np.array(self.ACTIONS) ).astype(np.int64) ) logger.info("Converting PA to Torch...") possible_action_strings = np.array( list(itertools.zip_longest(*samples.possible_actions, fillvalue="")) ).T possible_actions_mask = torch.zeros([len(samples.actions), len(self.ACTIONS)]) for i, action in enumerate(self.ACTIONS): possible_actions_mask[:, i] = torch.tensor( np.max(possible_action_strings == action, axis=1).astype(np.int64) ) logger.info("Converting PNA to Torch...") possible_next_action_strings = np.array( list(itertools.zip_longest(*samples.possible_next_actions, fillvalue="")) ).T possible_next_actions_mask = torch.zeros( [len(samples.next_actions), len(self.ACTIONS)] ) for i, action in enumerate(self.ACTIONS): possible_next_actions_mask[:, i] = torch.tensor( np.max(possible_next_action_strings == action, axis=1).astype(np.int64) ) terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1) not_terminal = 1 - terminals logger.info("Converting RT to Torch...") time_diffs = torch.ones([len(samples.states), 1]) logger.info("Preprocessing...") preprocessor = Preprocessor(self.normalization, False) states_ndarray = workspace.FetchBlob(self.state_matrix) states_ndarray = preprocessor.forward(states_ndarray) next_states_ndarray = workspace.FetchBlob(self.next_state_matrix) next_states_ndarray = preprocessor.forward(next_states_ndarray) logger.info("Batching...") tdps = [] for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_one_hot[start:end] if one_hot_action else actions[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], not_terminal=not_terminal[start:end], next_actions=next_actions_one_hot[start:end], possible_actions_mask=possible_actions_mask[start:end], possible_next_actions_mask=possible_next_actions_mask[start:end], time_diffs=time_diffs[start:end], ) tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor) tdps.append(tdp) return tdps
def preprocess_samples(self, samples: Samples, minibatch_size: int) -> List[TrainingDataPage]: samples.shuffle() net = core.Net("gridworld_preprocessing") C2.set_net(net) preprocessor = PreprocessorNet(True) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "state_norm", False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "next_state_norm", False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.actions, "action") action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "action_norm", False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_actions, "next_action") next_action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "next_action_norm", False, False, ) propensities = np.array(samples.propensities, dtype=np.float32).reshape(-1, 1) rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1) pnas_lengths_list = [] pnas_flat: List[List[str]] = [] for pnas in samples.possible_next_actions: pnas_lengths_list.append(len(pnas)) pnas_flat.extend(pnas) saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions") pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32) possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "possible_next_action_norm", False, False, ) workspace.RunNetOnce(net) states_ndarray = workspace.FetchBlob(state_matrix) actions_ndarray = workspace.FetchBlob(action_matrix) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_actions_ndarray = workspace.FetchBlob(next_action_matrix) possible_next_actions_ndarray = workspace.FetchBlob( possible_next_actions_matrix) tdps = [] pnas_start = 0 for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + np.sum(pnas_lengths[start:end]) pnas = possible_next_actions_ndarray[pnas_start:pnas_end] pnas_start = pnas_end tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], propensities=propensities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], possible_next_actions=StackedArray(pnas_lengths[start:end], pnas), not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1), reward_timelines=samples.reward_timelines[start:end] if samples.reward_timelines else None, )) return tdps
def export( cls, trainer, state_normalization_parameters, action_normalization_parameters, model_on_gpu=False, normalize_actions=True, ): """Export caffe2 preprocessor net and pytorch DQN forward pass as one caffe2 net. :param trainer ParametricDQNTrainer :param state_normalization_parameters state NormalizationParameters :param action_normalization_parameters action NormalizationParameters :param model_on_gpu boolean indicating if the model is a GPU model or CPU model """ input_dim = trainer.num_features if isinstance(trainer.q_network, DataParallel): trainer.q_network = trainer.q_network.module buffer = PytorchCaffe2Converter.pytorch_net_to_buffer( trainer.q_network, input_dim, model_on_gpu) qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef( buffer) torch_workspace = caffe2_netdef.workspace parameters = torch_workspace.Blobs() for blob_str in parameters: workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str)) # Remove the input blob from parameters since it's not a real # input (will be calculated by preprocessor) parameters.remove(qnet_input_blob) torch_init_net = core.Net(caffe2_netdef.init_net) torch_predict_net = core.Net(caffe2_netdef.predict_net) # While converting to metanetdef, the external_input of predict_net # will be recomputed. Add the real output of init_net to parameters # to make sure they will be counted. parameters.extend( set(caffe2_netdef.init_net.external_output) - set(caffe2_netdef.init_net.external_input)) # ensure state and action IDs have no intersection assert (len( set(state_normalization_parameters.keys()) & set(action_normalization_parameters.keys())) == 0) model = model_helper.ModelHelper(name="predictor") net = model.net C2.set_model(model) workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32)) workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64)) workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32)) input_feature_lengths = "input_feature_lengths" input_feature_keys = "input_feature_keys" input_feature_values = "input_feature_values" C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths]) C2.net().Copy(["input/float_features.keys"], [input_feature_keys]) C2.net().Copy(["input/float_features.values"], [input_feature_values]) preprocessor = PreprocessorNet() sparse_to_dense_processor = Caffe2SparseToDenseProcessor() sorted_state_features, _ = sort_features_by_normalization( state_normalization_parameters) state_dense_matrix, new_parameters = sparse_to_dense_processor( sorted_state_features, StackedAssociativeArray(input_feature_lengths, input_feature_keys, input_feature_values), ) parameters.extend(new_parameters) state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix( state_dense_matrix, sorted_state_features, state_normalization_parameters, "state_norm", False, ) parameters.extend(new_parameters) sorted_action_features, _ = sort_features_by_normalization( action_normalization_parameters) action_dense_matrix, new_parameters = sparse_to_dense_processor( sorted_action_features, StackedAssociativeArray(input_feature_lengths, input_feature_keys, input_feature_values), ) parameters.extend(new_parameters) if normalize_actions: action_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix( action_dense_matrix, sorted_action_features, action_normalization_parameters, "action_norm", False, ) parameters.extend(new_parameters) else: action_normalized_dense_matrix = action_dense_matrix state_action_normalized = "state_action_normalized" state_action_normalized_dim = "state_action_normalized_dim" net.Concat( [state_normalized_dense_matrix, action_normalized_dense_matrix], [state_action_normalized, state_action_normalized_dim], axis=1, ) net.Copy([state_action_normalized], [qnet_input_blob]) workspace.RunNetOnce(model.param_init_net) workspace.RunNetOnce(torch_init_net) net.AppendNet(torch_predict_net) new_parameters, q_values = RLPredictor._forward_pass( model, trainer, state_action_normalized, ["Q"], qnet_output_blob) parameters.extend(new_parameters) flat_q_values_key = ( "output/string_weighted_multi_categorical_features.values.values") num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1]) q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1]) # Get 1 x n (number of examples) action index tensor under the max_q policy max_q_act_idxs = "max_q_policy_actions" C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs]) max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0) # Get 1 x n (number of examples) action index tensor under the softmax policy temperature = C2.NextBlob("temperature") parameters.append(temperature) workspace.FeedBlob( temperature, np.array([trainer.rl_temperature], dtype=np.float32)) tempered_q_values = C2.Div(q_value_blob, temperature, broadcast=1) softmax_values = C2.Softmax(tempered_q_values) softmax_act_idxs_nested = "softmax_act_idxs_nested" C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested]) softmax_act_blob = C2.Tile(C2.FlattenToVec(softmax_act_idxs_nested), num_examples, axis=0) # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]] # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...] max_q_act_blob = C2.Cast(max_q_act_blob, to=caffe2_pb2.TensorProto.INT64) softmax_act_blob = C2.Cast(softmax_act_blob, to=caffe2_pb2.TensorProto.INT64) max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1]) softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob, shape=[1, -1]) C2.net().Append([max_q_act_blob_nested, softmax_act_blob_nested], [max_q_act_blob_nested]) transposed_action_idxs = C2.Transpose(max_q_act_blob_nested) flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs) output_values = "output/int_single_categorical_features.values" workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64)) C2.net().Copy([flat_transposed_action_idxs], [output_values]) output_lengths = "output/int_single_categorical_features.lengths" workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32)) C2.net().ConstantFill( [flat_q_values_key], [output_lengths], value=2, dtype=caffe2_pb2.TensorProto.INT32, ) output_keys = "output/int_single_categorical_features.keys" workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64)) output_keys_tensor, _ = C2.Concat( C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64), C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64), axis=0, ) output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0) C2.net().FlattenToVec([output_key_tile], [output_keys]) workspace.CreateNet(net) return _ParametricDQNPredictor(net, torch_init_net, parameters)
def preprocess_samples(self, samples: Samples, minibatch_size: int) -> List[TrainingDataPage]: samples.shuffle() net = core.Net("gridworld_preprocessing") C2.set_net(net) preprocessor = PreprocessorNet(True) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "state_norm", False, False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, "next_state_norm", False, False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.actions, "action") action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "action_norm", False, False, False, ) saa = StackedAssociativeArray.from_dict_list(samples.next_actions, "next_action") next_action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "next_action_norm", False, False, False, ) propensities = np.array(samples.propensities, dtype=np.float32).reshape(-1, 1) rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1) pnas_lengths_list = [] pnas_flat: List[List[str]] = [] for pnas in samples.possible_next_actions: pnas_lengths_list.append(len(pnas)) pnas_flat.extend(pnas) saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions") pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32) pna_lens_blob = "pna_lens_blob" workspace.FeedBlob(pna_lens_blob, pnas_lengths) possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, "possible_next_action_norm", False, False, False, ) state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob) workspace.RunNetOnce(net) state_preprocessor = Preprocessor(self.normalization, False) action_preprocessor = Preprocessor(self.normalization_action, False) states_ndarray = workspace.FetchBlob(state_matrix) states_ndarray = state_preprocessor.forward(states_ndarray).numpy() actions_ndarray = workspace.FetchBlob(action_matrix) actions_ndarray = action_preprocessor.forward(actions_ndarray).numpy() next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_states_ndarray = state_preprocessor.forward( next_states_ndarray).numpy() next_actions_ndarray = workspace.FetchBlob(next_action_matrix) next_actions_ndarray = action_preprocessor.forward( next_actions_ndarray).numpy() logged_possible_next_actions = action_preprocessor.forward( workspace.FetchBlob(possible_next_actions_matrix)) state_pnas_tile = state_preprocessor.forward( workspace.FetchBlob(state_pnas_tile_blob)) logged_possible_next_state_actions = torch.cat( (state_pnas_tile, logged_possible_next_actions), dim=1) possible_next_actions_ndarray = logged_possible_next_actions.cpu( ).numpy() next_state_pnas_concat = logged_possible_next_state_actions.cpu( ).numpy() time_diffs = np.ones(len(states_ndarray)) episode_values = None if samples.reward_timelines is not None: episode_values = np.zeros(rewards.shape, dtype=np.float32) for i, reward_timeline in enumerate(samples.reward_timelines): for time_diff, reward in reward_timeline.items(): episode_values[i, 0] += reward * (DISCOUNT**time_diff) tdps = [] pnas_start = 0 for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + np.sum(pnas_lengths[start:end]) pnas = possible_next_actions_ndarray[pnas_start:pnas_end] pnas_concat = next_state_pnas_concat[pnas_start:pnas_end] pnas_start = pnas_end tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], propensities=propensities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], possible_next_actions=StackedArray(pnas_lengths[start:end], pnas), not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1), episode_values=episode_values[start:end] if episode_values is not None else None, time_diffs=time_diffs[start:end], possible_next_actions_lengths=pnas_lengths[start:end], next_state_pnas_concat=pnas_concat, )) return tdps
def preprocess_samples( self, states: List[Dict[int, float]], actions: List[Dict[int, float]], rewards: List[float], next_states: List[Dict[int, float]], next_actions: List[Dict[int, float]], is_terminals: List[bool], possible_next_actions: List[List[Dict[int, float]]], reward_timelines: List[Dict[int, float]], minibatch_size: int, ) -> List[TrainingDataPage]: # Shuffle merged = list( zip(states, actions, rewards, next_states, next_actions, is_terminals, possible_next_actions, reward_timelines)) random.shuffle(merged) states, actions, rewards, next_states, next_actions, is_terminals, \ possible_next_actions, reward_timelines = zip(*merged) net = core.Net('gridworld_preprocessing') C2.set_net(net) preprocessor = PreprocessorNet(net, True) saa = StackedAssociativeArray.from_dict_list(states, 'states') state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'state_norm', ) saa = StackedAssociativeArray.from_dict_list(next_states, 'next_states') next_state_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization, 'next_state_norm', ) saa = StackedAssociativeArray.from_dict_list(actions, 'action') action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'action_norm', ) saa = StackedAssociativeArray.from_dict_list(next_actions, 'next_action') next_action_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'next_action_norm', ) rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1) pnas_lengths_list = [] pnas_flat = [] for pnas in possible_next_actions: pnas_lengths_list.append(len(pnas)) pnas_flat.extend(pnas) saa = StackedAssociativeArray.from_dict_list(pnas_flat, 'possible_next_actions') pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32) possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix( saa.lengths, saa.keys, saa.values, self.normalization_action, 'possible_next_action_norm', ) workspace.RunNetOnce(net) states_ndarray = workspace.FetchBlob(state_matrix) actions_ndarray = workspace.FetchBlob(action_matrix) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_actions_ndarray = workspace.FetchBlob(next_action_matrix) possible_next_actions_ndarray = workspace.FetchBlob( possible_next_actions_matrix) tdps = [] pnas_start = 0 for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + np.sum(pnas_lengths[start:end]) pnas = possible_next_actions_ndarray[pnas_start:pnas_end] pnas_start = pnas_end tdps.append( TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], possible_next_actions=StackedArray(pnas_lengths[start:end], pnas), not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1), reward_timelines=reward_timelines[start:end] if reward_timelines else None, )) return tdps
def export_actor( cls, trainer, state_normalization_parameters, action_feature_ids, min_action_range_tensor_serving, max_action_range_tensor_serving, model_on_gpu=False, ): """Export caffe2 preprocessor net and pytorch actor forward pass as one caffe2 net. :param trainer DDPGTrainer :param state_normalization_parameters state NormalizationParameters :param min_action_range_tensor_serving pytorch tensor that specifies min action value for each dimension :param max_action_range_tensor_serving pytorch tensor that specifies min action value for each dimension :param state_normalization_parameters state NormalizationParameters :param model_on_gpu boolean indicating if the model is a GPU model or CPU model """ model = model_helper.ModelHelper(name="predictor") net = model.net C2.set_model(model) parameters: List[str] = [] workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32)) workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64)) workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32)) input_feature_lengths = "input_feature_lengths" input_feature_keys = "input_feature_keys" input_feature_values = "input_feature_values" C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths]) C2.net().Copy(["input/float_features.keys"], [input_feature_keys]) C2.net().Copy(["input/float_features.values"], [input_feature_values]) preprocessor = PreprocessorNet() sparse_to_dense_processor = Caffe2SparseToDenseProcessor() sorted_features, _ = sort_features_by_normalization( state_normalization_parameters) state_dense_matrix, new_parameters = sparse_to_dense_processor( sorted_features, StackedAssociativeArray(input_feature_lengths, input_feature_keys, input_feature_values), ) parameters.extend(new_parameters) state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix( state_dense_matrix, sorted_features, state_normalization_parameters, "state_norm", False, ) parameters.extend(new_parameters) torch_init_net, torch_predict_net, new_parameters, actor_input_blob, actor_output_blob, min_action_training_blob, max_action_training_blob, min_action_serving_blob, max_action_serving_blob = DDPGPredictor.generate_train_net( trainer, model, min_action_range_tensor_serving, max_action_range_tensor_serving, model_on_gpu, ) parameters.extend(new_parameters) net.Copy([state_normalized_dense_matrix], [actor_input_blob]) workspace.RunNetOnce(model.param_init_net) workspace.RunNetOnce(torch_init_net) net.AppendNet(torch_predict_net) # Scale actors actions from [-1, 1] to serving range prev_range = C2.Sub(max_action_training_blob, min_action_training_blob) new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob) subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob) div_by_prev_range = C2.Div(subtract_prev_min, prev_range) scaled_for_serving_actions = C2.Add( C2.Mul(div_by_prev_range, new_range), min_action_serving_blob) output_lengths = "output/float_features.lengths" workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32)) C2.net().ConstantFill( [C2.FlattenToVec(C2.ArgMax(actor_output_blob))], [output_lengths], value=trainer.actor.layers[-1].out_features, dtype=caffe2_pb2.TensorProto.INT32, ) action_feature_ids_blob = C2.NextBlob("action_feature_ids") workspace.FeedBlob(action_feature_ids_blob, np.array(action_feature_ids, dtype=np.int64)) parameters.append(action_feature_ids_blob) output_keys = "output/float_features.keys" workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64)) num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"), shape=[1]) C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys], axis=0) output_values = "output/float_features.values" workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32)) C2.net().FlattenToVec([scaled_for_serving_actions], [output_values]) workspace.CreateNet(net) return DDPGPredictor(net, torch_init_net, parameters)
def preprocess_samples( self, samples: Samples, minibatch_size: int, use_gpu: bool = False, one_hot_action: bool = True, normalize_actions: bool = True, ) -> List[TrainingDataPage]: logger.info("Shuffling...") samples = shuffle_samples(samples) logger.info("Sparse2Dense...") net = core.Net("gridworld_preprocessing") C2.set_net(net) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") sorted_state_features, _ = sort_features_by_normalization(self.normalization) state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_state_features ) saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") next_state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_state_features ) sorted_action_features, _ = sort_features_by_normalization( self.normalization_action ) saa = StackedAssociativeArray.from_dict_list(samples.actions, "action") action_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) saa = StackedAssociativeArray.from_dict_list( samples.next_actions, "next_action" ) next_action_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) action_probabilities = torch.tensor( samples.action_probabilities, dtype=torch.float32 ).reshape(-1, 1) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) max_action_size = 4 pnas_mask_list: List[List[int]] = [] pnas_flat: List[Dict[str, float]] = [] for pnas in samples.possible_next_actions: pnas_mask_list.append([1] * len(pnas) + [0] * (max_action_size - len(pnas))) pnas_flat.extend(pnas) for _ in range(max_action_size - len(pnas)): pnas_flat.append({}) # Filler saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions") pnas_mask = torch.Tensor(pnas_mask_list) possible_next_actions_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_action_features ) workspace.RunNetOnce(net) logger.info("Preprocessing...") state_preprocessor = Preprocessor(self.normalization, False) action_preprocessor = Preprocessor(self.normalization_action, False) states_ndarray = workspace.FetchBlob(state_matrix) states_ndarray = state_preprocessor.forward(states_ndarray) actions_ndarray = torch.from_numpy(workspace.FetchBlob(action_matrix)) if normalize_actions: actions_ndarray = action_preprocessor.forward(actions_ndarray) next_states_ndarray = workspace.FetchBlob(next_state_matrix) next_states_ndarray = state_preprocessor.forward(next_states_ndarray) state_pnas_tile = next_states_ndarray.repeat(1, max_action_size).reshape( -1, next_states_ndarray.shape[1] ) next_actions_ndarray = torch.from_numpy(workspace.FetchBlob(next_action_matrix)) if normalize_actions: next_actions_ndarray = action_preprocessor.forward(next_actions_ndarray) logged_possible_next_actions = action_preprocessor.forward( workspace.FetchBlob(possible_next_actions_matrix) ) assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[0], ( "Invalid shapes: " + str(state_pnas_tile.shape) + " != " + str(logged_possible_next_actions.shape) ) logged_possible_next_state_actions = torch.cat( (state_pnas_tile, logged_possible_next_actions), dim=1 ) logger.info("Reward Timeline to Torch...") time_diffs = torch.ones([len(samples.states), 1]) tdps = [] pnas_start = 0 logger.info("Batching...") for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break pnas_end = pnas_start + (minibatch_size * max_action_size) tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_ndarray[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], next_actions=next_actions_ndarray[start:end], not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True) > 0), time_diffs=time_diffs[start:end], possible_next_actions_mask=pnas_mask[start:end, :], possible_next_actions_state_concat=logged_possible_next_state_actions[ pnas_start:pnas_end, : ], ) pnas_start = pnas_end tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor) tdps.append(tdp) return tdps
def preprocess_samples_discrete( self, samples: Samples, minibatch_size: int, one_hot_action: bool = True, use_gpu: bool = False, ) -> List[TrainingDataPage]: logger.info("Shuffling...") samples = shuffle_samples(samples) logger.info("Preprocessing...") if self.sparse_to_dense_net is None: self.sparse_to_dense_net = core.Net("gridworld_sparse_to_dense") C2.set_net(self.sparse_to_dense_net) saa = StackedAssociativeArray.from_dict_list(samples.states, "states") sorted_features, _ = sort_features_by_normalization(self.normalization) self.state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_features ) saa = StackedAssociativeArray.from_dict_list( samples.next_states, "next_states" ) self.next_state_matrix, _ = sparse_to_dense( saa.lengths, saa.keys, saa.values, sorted_features ) C2.set_net(None) else: StackedAssociativeArray.from_dict_list(samples.states, "states") StackedAssociativeArray.from_dict_list(samples.next_states, "next_states") workspace.RunNetOnce(self.sparse_to_dense_net) logger.info("Converting to Torch...") actions_one_hot = torch.tensor( (np.array(samples.actions).reshape(-1, 1) == np.array(self.ACTIONS)).astype( np.int64 ) ) actions = actions_one_hot.argmax(dim=1, keepdim=True) rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1) action_probabilities = torch.tensor( samples.action_probabilities, dtype=torch.float32 ).reshape(-1, 1) next_actions_one_hot = torch.tensor( ( np.array(samples.next_actions).reshape(-1, 1) == np.array(self.ACTIONS) ).astype(np.int64) ) logger.info("Converting PA to Torch...") possible_action_strings = np.array( list(itertools.zip_longest(*samples.possible_actions, fillvalue="")) ).T possible_actions_mask = torch.zeros([len(samples.actions), len(self.ACTIONS)]) for i, action in enumerate(self.ACTIONS): possible_actions_mask[:, i] = torch.tensor( np.max(possible_action_strings == action, axis=1).astype(np.int64) ) logger.info("Converting PNA to Torch...") possible_next_action_strings = np.array( list(itertools.zip_longest(*samples.possible_next_actions, fillvalue="")) ).T possible_next_actions_mask = torch.zeros( [len(samples.next_actions), len(self.ACTIONS)] ) for i, action in enumerate(self.ACTIONS): possible_next_actions_mask[:, i] = torch.tensor( np.max(possible_next_action_strings == action, axis=1).astype(np.int64) ) terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1) not_terminal = 1 - terminals logger.info("Converting RT to Torch...") time_diffs = torch.ones([len(samples.states), 1]) logger.info("Preprocessing...") preprocessor = Preprocessor(self.normalization, False) states_ndarray = workspace.FetchBlob(self.state_matrix) states_ndarray = preprocessor.forward(states_ndarray) next_states_ndarray = workspace.FetchBlob(self.next_state_matrix) next_states_ndarray = preprocessor.forward(next_states_ndarray) logger.info("Batching...") tdps = [] for start in range(0, states_ndarray.shape[0], minibatch_size): end = start + minibatch_size if end > states_ndarray.shape[0]: break tdp = TrainingDataPage( states=states_ndarray[start:end], actions=actions_one_hot[start:end] if one_hot_action else actions[start:end], propensities=action_probabilities[start:end], rewards=rewards[start:end], next_states=next_states_ndarray[start:end], not_terminal=not_terminal[start:end], next_actions=next_actions_one_hot[start:end], possible_actions_mask=possible_actions_mask[start:end], possible_next_actions_mask=possible_next_actions_mask[start:end], time_diffs=time_diffs[start:end], ) tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor) tdps.append(tdp) return tdps
def export( cls, trainer, actions, state_normalization_parameters, model_on_gpu=False, set_missing_value_to_zero=False, ): """Export caffe2 preprocessor net and pytorch DQN forward pass as one caffe2 net. :param trainer DQNTrainer :param state_normalization_parameters state NormalizationParameters :param model_on_gpu boolean indicating if the model is a GPU model or CPU model """ input_dim = trainer.num_features q_network = (trainer.q_network.module if isinstance( trainer.q_network, DataParallel) else trainer.q_network) buffer = PytorchCaffe2Converter.pytorch_net_to_buffer( q_network, input_dim, model_on_gpu) qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef( buffer) torch_workspace = caffe2_netdef.workspace parameters = torch_workspace.Blobs() for blob_str in parameters: workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str)) # Remove the input blob from parameters since it's not a real # input (will be calculated by preprocessor) parameters.remove(qnet_input_blob) torch_init_net = core.Net(caffe2_netdef.init_net) torch_predict_net = core.Net(caffe2_netdef.predict_net) logger.info("Generated ONNX predict net:") logger.info(str(torch_predict_net.Proto())) # While converting to metanetdef, the external_input of predict_net # will be recomputed. Add the real output of init_net to parameters # to make sure they will be counted. parameters.extend( set(caffe2_netdef.init_net.external_output) - set(caffe2_netdef.init_net.external_input)) model = model_helper.ModelHelper(name="predictor") net = model.net C2.set_model(model) workspace.FeedBlob("input/image", np.zeros([1, 1, 1, 1], dtype=np.int32)) workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32)) workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64)) workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32)) input_feature_lengths = "input_feature_lengths" input_feature_keys = "input_feature_keys" input_feature_values = "input_feature_values" C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths]) C2.net().Copy(["input/float_features.keys"], [input_feature_keys]) C2.net().Copy(["input/float_features.values"], [input_feature_values]) if state_normalization_parameters is not None: sorted_feature_ids = sort_features_by_normalization( state_normalization_parameters)[0] sparse_to_dense_processor = Caffe2SparseToDenseProcessor() dense_matrix, new_parameters = sparse_to_dense_processor( sorted_feature_ids, StackedAssociativeArray(input_feature_lengths, input_feature_keys, input_feature_values), set_missing_value_to_zero=set_missing_value_to_zero, ) parameters.extend(new_parameters) preprocessor_net = PreprocessorNet() state_normalized_dense_matrix, new_parameters = preprocessor_net.normalize_dense_matrix( dense_matrix, sorted_feature_ids, state_normalization_parameters, "state_norm_", True, ) parameters.extend(new_parameters) else: # Image input. Note: Currently this does the wrong thing if # more than one image is passed at a time. state_normalized_dense_matrix = "input/image" net.Copy([state_normalized_dense_matrix], [qnet_input_blob]) workspace.RunNetOnce(model.param_init_net) workspace.RunNetOnce(torch_init_net) net.AppendNet(torch_predict_net) new_parameters, q_values = RLPredictor._forward_pass( model, trainer, state_normalized_dense_matrix, actions, qnet_output_blob) parameters.extend(new_parameters) # Get 1 x n action index tensor under the max_q policy max_q_act_idxs = "max_q_policy_actions" C2.net().Flatten([C2.ArgMax(q_values)], [max_q_act_idxs], axis=0) shape_of_num_of_states = "num_states_shape" C2.net().FlattenToVec([max_q_act_idxs], [shape_of_num_of_states]) num_states, _ = C2.Reshape(C2.Size(shape_of_num_of_states), shape=[1]) # Get 1 x n action index tensor under the softmax policy temperature = C2.NextBlob("temperature") parameters.append(temperature) workspace.FeedBlob( temperature, np.array([trainer.rl_temperature], dtype=np.float32)) tempered_q_values = C2.Div(q_values, temperature, broadcast=1) softmax_values = C2.Softmax(tempered_q_values) softmax_act_idxs_nested = "softmax_act_idxs_nested" C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested]) softmax_act_idxs = "softmax_policy_actions" C2.net().Flatten([softmax_act_idxs_nested], [softmax_act_idxs], axis=0) action_names = C2.NextBlob("action_names") parameters.append(action_names) workspace.FeedBlob(action_names, np.array(actions)) # Concat action index tensors to get 2 x n tensor - [[max_q], [softmax]] # transpose & flatten to get [a1_maxq, a1_softmax, a2_maxq, a2_softmax, ...] max_q_act_blob = C2.Cast(max_q_act_idxs, to=caffe2_pb2.TensorProto.INT32) softmax_act_blob = C2.Cast(softmax_act_idxs, to=caffe2_pb2.TensorProto.INT32) C2.net().Append([max_q_act_blob, softmax_act_blob], [max_q_act_blob]) transposed_action_idxs = C2.Transpose(max_q_act_blob) flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs) workspace.FeedBlob(OUTPUT_SINGLE_CAT_VALS_NAME, np.zeros(1, dtype=np.int64)) C2.net().Gather([action_names, flat_transposed_action_idxs], [OUTPUT_SINGLE_CAT_VALS_NAME]) workspace.FeedBlob(OUTPUT_SINGLE_CAT_LENGTHS_NAME, np.zeros(1, dtype=np.int32)) C2.net().ConstantFill( [shape_of_num_of_states], [OUTPUT_SINGLE_CAT_LENGTHS_NAME], value=2, dtype=caffe2_pb2.TensorProto.INT32, ) workspace.FeedBlob(OUTPUT_SINGLE_CAT_KEYS_NAME, np.zeros(1, dtype=np.int64)) output_keys_tensor, _ = C2.Concat( C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64), C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64), axis=0, ) output_key_tile = C2.Tile(output_keys_tensor, num_states, axis=0) C2.net().FlattenToVec([output_key_tile], [OUTPUT_SINGLE_CAT_KEYS_NAME]) workspace.CreateNet(net) return DQNPredictor(net, torch_init_net, parameters)