Example #1
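The snippet omits its import block; the following is a plausible reconstruction from the names used below (`threadsafe_generator` and `DataHolder` are repo-local helpers assumed to be importable from the surrounding project):

import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
import keras
from keras import regularizers
from keras.layers import Conv2D, Dense, Flatten, MaxPool2D
from sklearn.linear_model import LinearRegression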
class FittedQEvaluation(object):
    def __init__(self,
                 data,
                 gamma,
                 frameskip=2,
                 frameheight=2,
                 modeltype='conv',
                 processor=None):
        self.data = data
        self.gamma = gamma
        self.frameskip = frameskip
        self.frameheight = frameheight
        self.modeltype = modeltype
        self.processor = processor

        # NB: setup() below is not called automatically on construction.

    def setup(self, dataset):
        '''Build tabular empirical estimates from a list of trajectories:
        the transition model P(x'|x,a), the set of terminal transitions,
        and the reward distributions R(r|x,a) and R(r|x,a,x',t).
        '''
        transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['x_prime'] for x in dataset]).reshape(-1, 1).T
        ]).T

        unique, idx, count = np.unique(transitions,
                                       return_index=True,
                                       return_counts=True,
                                       axis=0)

        partial_transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
        ]).T
        unique_a_given_x, idx_a_given_x, count_a_given_x = np.unique(
            partial_transitions, return_index=True, return_counts=True, axis=0)

        # key = (state, action); value = number of times action a was taken in state x
        all_counts_a_given_x = {
            tuple(key): value
            for key, value in zip(unique_a_given_x, count_a_given_x)
        }

        prob = {}
        for i, row in enumerate(unique):
            # Empirical transition probability P(x'|x,a).
            prob.setdefault(tuple(row[:-1]), {})[row[-1]] = (
                count[i] / all_counts_a_given_x[(row[0], row[1])])

        all_transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['x_prime'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['done'] for x in dataset]).reshape(-1, 1).T,
        ]).T
        self.terminal_transitions = {
            tuple([x, a, x_prime]): 1
            for x, a, x_prime in all_transitions[all_transitions[:, -1] ==
                                                 True][:, :-1]
        }

        self.P = prob

        transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            # np.array([x['x_prime'] for x in dataset]).reshape(-1,1).T,
            np.array([x['r'] for x in dataset]).reshape(-1, 1).T,
        ]).T
        unique, idxs, counts = np.unique(transitions,
                                         return_index=True,
                                         return_counts=True,
                                         axis=0)

        partial_transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            # np.array([x['x_prime'] for x in dataset]).reshape(-1,1).T,
        ]).T
        unique_a_given_x, idx_a_given_x, count_a_given_x = np.unique(
            partial_transitions, return_index=True, return_counts=True, axis=0)

        # key = (state, action); value = number of times action a was taken in state x
        all_counts_a_given_x = {
            tuple(key): value
            for key, value in zip(unique_a_given_x, count_a_given_x)
        }

        rew = {}
        for i, row in enumerate(unique):
            # Empirical reward distribution R(r|x,a).
            rew.setdefault(tuple(row[:-1]), {})[row[-1]] = (
                counts[i] / all_counts_a_given_x[tuple(row[:-1])])

        self.R = rew

        transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['x_prime'] for x in dataset]).reshape(-1, 1).T,
            np.array([range(len(x['x'])) for x in dataset]).reshape(-1, 1).T,
            np.array([x['r'] for x in dataset]).reshape(-1, 1).T,
        ]).T
        unique, idxs, counts = np.unique(transitions,
                                         return_index=True,
                                         return_counts=True,
                                         axis=0)

        partial_transitions = np.vstack([
            np.array([x['x'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['a'] for x in dataset]).reshape(-1, 1).T,
            np.array([x['x_prime'] for x in dataset]).reshape(-1, 1).T,
            np.array([range(len(x['x'])) for x in dataset]).reshape(-1, 1).T,
        ]).T
        unique_a_given_x, idx_a_given_x, count_a_given_x = np.unique(
            partial_transitions, return_index=True, return_counts=True, axis=0)

        # key = (state, action); value = number of times action a was taken in state x
        all_counts_a_given_x = {
            tuple(key): value
            for key, value in zip(unique_a_given_x, count_a_given_x)
        }

        rew = {}
        for i, row in enumerate(unique):
            # Reward distribution keyed additionally by timestep:
            # R(r|x,a,x',t).
            d = rew.setdefault(tuple(row[:-2]), {}).setdefault(row[-2], {})
            d[row[-1]] = counts[i] / all_counts_a_given_x[tuple(row[:-1])]

        self.R1 = rew

    def run(self, pi_b, pi_e, epsilon=0.001, max_epochs=10000, verbose=True):

        data = self.data.basic_transitions()

        action_space_dim = pi_b.action_space_dim
        state_space_dim = len(np.unique(data[:, [0, 3]].reshape(-1)))

        mapping = {
            state: idx
            for idx, state in enumerate(np.unique(data[:, [0, 3]].reshape(-1)))
        }

        U1 = np.zeros(shape=(state_space_dim, action_space_dim))

        df = pd.DataFrame(data,
                          columns=['x', 'a', 't', 'x_prime', 'r', 'done'])
        initial_states = Counter(df[df['t'] == 0]['x'])
        total = sum(initial_states.values())
        initial_states = {
            key: val / total
            for key, val in initial_states.items()
        }

        count = -1
        while True:
            U = U1.copy()
            delta = 0
            count += 1

            for (x, a), group in df.groupby(['x', 'a']):
                x, a = int(x), int(a)
                x = mapping[x]

                # Fitted-Q backup:
                # Q(x,a) <- mean[r + gamma * sum_a' pi_e(a'|x') * Q(x',a')]
                x_primes = np.array([mapping[key] for key in group['x_prime']])
                vals = np.array(group['r']) + self.gamma * np.sum(
                    pi_e.predict(x_primes) * U[x_primes, :],
                    axis=1) * (1 - np.array(group['done']))

                U1[x, a] = np.mean(vals)

                delta = max(delta, abs(U1[x, a] - U[x, a]))
            if verbose: print(count, delta)

            if self.gamma == 1:
                # TODO: weight the estimate by the initial state distribution
                if delta < epsilon:
                    return None, U1, mapping
            else:
                if delta < epsilon * (
                        1 - self.gamma) / self.gamma or count > max_epochs:
                    return None, U1, mapping

    @staticmethod
    def build_model(input_size, scope, action_space_dim=3, modeltype='conv'):

        inp = keras.layers.Input(input_size, name='frames')
        actions = keras.layers.Input((action_space_dim, ), name='mask')

        def init():
            return keras.initializers.TruncatedNormal(mean=0.0,
                                                      stddev=0.1,
                                                      seed=np.random.randint(
                                                          2**32))

        if modeltype == 'conv':
            conv1 = Conv2D(8, (7, 7),
                           strides=(3, 3),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(inp)
            pool1 = MaxPool2D(data_format='channels_first')(conv1)
            conv2 = Conv2D(16, (3, 3),
                           strides=(1, 1),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(pool1)
            pool2 = MaxPool2D(data_format='channels_first')(conv2)
            flat1 = Flatten(name='flattened')(pool2)
            out = Dense(256,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(flat1)
        elif modeltype == 'conv1':

            def init():
                return keras.initializers.TruncatedNormal(
                    mean=0.0, stddev=0.001, seed=np.random.randint(2**32))

            conv1 = Conv2D(16, (2, 2),
                           strides=(1, 1),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(inp)
            flat1 = Flatten(name='flattened')(conv1)
            out = Dense(8,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(flat1)
            out = Dense(8,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(out)
        else:

            def init():
                return keras.initializers.TruncatedNormal(
                    mean=0.0, stddev=.1, seed=np.random.randint(2**32))

            flat = Flatten()(inp)
            dense1 = Dense(64,
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init())(flat)
            dense3 = Dense(32,
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init())(dense1)
            out = Dense(8,
                        activation='elu',
                        name='out',
                        kernel_initializer=init(),
                        bias_initializer=init())(dense3)

        all_actions = Dense(action_space_dim,
                            name=scope + 'all_Q',
                            activation="linear",
                            kernel_initializer=init(),
                            bias_initializer=init())(out)

        output = keras.layers.dot([all_actions, actions], 1)

        model = keras.models.Model(inputs=[inp, actions], outputs=output)

        all_Q = keras.models.Model(inputs=[inp],
                                   outputs=model.get_layer(scope +
                                                           'all_Q').output)

        adam = keras.optimizers.Adam(clipnorm=1.)
        model.compile(loss='mse', optimizer=adam, metrics=['accuracy'])
        return model, all_Q

    @staticmethod
    def copy_over_to(source, target):
        target.set_weights(source.get_weights())

    @staticmethod
    def weight_change_norm(model, target_model):
        norm_list = []
        number_of_layers = len(model.layers)
        for i in range(number_of_layers):
            model_matrix = model.layers[i].get_weights()
            target_model_matrix = target_model.layers[i].get_weights()
            if len(model_matrix) > 0:
                #print "layer ", i, " has shape ", model_matrix[0].shape
                if model_matrix[0].shape[0] > 0:
                    norm_change = np.linalg.norm(model_matrix[0] -
                                                 target_model_matrix[0])
                    norm_list.append(norm_change)
        return sum(norm_list) * 1.0 / len(norm_list)

    def run_linear(self,
                   env,
                   pi_b,
                   pi_e,
                   max_epochs,
                   epsilon=.001,
                   fit_intercept=True):
        initial_states = self.data.initial_states()
        self.Q_k = LinearRegression(fit_intercept=fit_intercept)
        values = []

        states = self.data.states()
        states = states.reshape(-1, np.prod(states.shape[2:]))
        actions = self.data.actions().reshape(-1)
        actions = np.eye(env.n_actions)[actions]
        X = np.hstack([states, actions])

        next_states = self.data.next_states()
        next_states = next_states.reshape(-1, np.prod(next_states.shape[2:]))

        policy_action = self.data.target_propensity()
        lengths = self.data.lengths()
        omega = self.data.omega()
        rewards = self.data.rewards()

        not_dones = 1 - self.data.dones()

        for epoch in tqdm(range(max_epochs)):

            if epoch:
                inp = np.repeat(next_states, env.n_actions, axis=0)
                act = np.tile(np.arange(env.n_actions), len(next_states))
                inp = np.hstack([
                    inp.reshape(inp.shape[0], -1),
                    np.eye(env.n_actions)[act]
                ])
                Q_val = self.Q_k.predict(inp).reshape(policy_action.shape)
            else:
                Q_val = np.zeros_like(policy_action)
            Q = rewards + self.gamma * (Q_val *
                                        policy_action).sum(axis=-1) * not_dones
            Q = Q.reshape(-1)

            self.Q_k.fit(X, Q)

            # Check if converged
            actions = pi_e.sample(initial_states)
            Q_val = self.Q_k.predict(
                np.hstack([
                    initial_states.reshape(initial_states.shape[0], -1),
                    np.eye(env.n_actions)[actions]
                ]))
            values.append(np.mean(Q_val))
            M = 20
            if epoch > M and np.abs(
                    np.mean(values[-M:]) - np.mean(values[-(M + 1):-1])
            ) < 1e-4 * np.abs(np.mean(values[-(M + 1):-1])):
                break
        return self.Q_k

    def run_linear_value_iter(self, env, pi_b, pi_e, max_epochs, epsilon=.001):
        initial_states = self.data.initial_states()
        self.Q_k = LinearRegression()
        values = []

        states = self.data.states()
        states = states.reshape(-1, np.prod(states.shape[2:]))
        actions = self.data.actions().reshape(-1)
        actions = np.eye(env.n_actions)[actions]
        X = states  # state-only features: this variant regresses V(x), not Q(x,a)

        next_states = self.data.next_states()
        next_states = next_states.reshape(-1, np.prod(next_states.shape[2:]))

        policy_action = self.data.target_propensity()
        lengths = self.data.lengths()
        omega = self.data.omega()
        rewards = self.data.rewards()

        not_dones = 1 - self.data.dones()

        for epoch in tqdm(range(max_epochs)):

            if epoch:
                inp = next_states
                Q_val = self.Q_k.predict(inp).reshape(policy_action[...,
                                                                    0].shape)
            else:
                Q_val = np.zeros_like(policy_action[..., 0]) + 1
            Q = rewards + self.gamma * Q_val * not_dones
            Q = Q.reshape(-1)

            self.Q_k.fit(X, Q)

            # Check if converged
            actions = pi_e.sample(initial_states)
            Q_val = self.Q_k.predict(
                initial_states.reshape(initial_states.shape[0], -1))
            values.append(np.mean(Q_val))
            M = 20
            if epoch > M and np.abs(
                    np.mean(values[-M:]) - np.mean(values[-(M + 1):-1])
            ) < 1e-4 * np.abs(np.mean(values[-(M + 1):-1])):
                break

        return self.Q_k

    def run_NN(self,
               env,
               pi_b,
               pi_e,
               max_epochs,
               epsilon=0.001,
               perc_of_dataset=1.):

        initial_states = self.data.initial_states()
        if self.processor: initial_states = self.processor(initial_states)
        self.dim_of_actions = env.n_actions
        self.Q_k = None
        self.Q_k_minus_1 = None

        # EarlyStopping / ModelCheckpoint / ReduceLROnPlateau callbacks were
        # tried here originally but are disabled.
        self.more_callbacks = []

        im = self.data.states()[0]
        if self.processor: im = self.processor(im)
        self.Q_k, self.Q_k_all = self.build_model(
            im.shape[1:],
            'Q_k',
            modeltype=self.modeltype,
            action_space_dim=env.n_actions)
        self.Q_k_minus_1, self.Q_k_minus_1_all = self.build_model(
            im.shape[1:],
            'Q_k_minus_1',
            modeltype=self.modeltype,
            action_space_dim=env.n_actions)

        # Warm-up calls so each Keras predict function is built before the
        # (possibly threaded) generator uses the models.
        tmp_act = np.eye(env.n_actions)[[0]]
        self.Q_k.predict([[im[0]], tmp_act])
        self.Q_k_all.predict([[im[0]]])
        self.Q_k_minus_1.predict([[im[0]], tmp_act])
        self.Q_k_minus_1_all.predict([[im[0]]])

        self.copy_over_to(self.Q_k, self.Q_k_minus_1)
        values = []

        print('Training: FQE')
        losses = []
        self.processed_data = self.fill(env)
        self.Q_k_minus_1_all.epoch = 0
        for k in tqdm(range(max_epochs)):
            batch_size = 32

            dataset_length = self.data.num_tuples()
            perm = np.random.permutation(range(dataset_length))
            # NB: despite the name, the whole permutation is used for
            # training and the validation split is empty.
            eighty_percent_of_set = int(1. * len(perm))
            training_idxs = perm[:eighty_percent_of_set]
            validation_idxs = perm[eighty_percent_of_set:]
            training_steps_per_epoch = int(
                perc_of_dataset *
                np.ceil(len(training_idxs) / float(batch_size)))
            validation_steps_per_epoch = int(
                np.ceil(len(validation_idxs) / float(batch_size)))
            train_gen = self.generator(env,
                                       pi_e,
                                       training_idxs,
                                       fixed_permutation=True,
                                       batch_size=batch_size)
            M = 5
            hist = self.Q_k.fit_generator(
                train_gen,
                steps_per_epoch=training_steps_per_epoch,
                epochs=1,
                max_queue_size=50,
                workers=2,
                use_multiprocessing=False,
                verbose=1,
                callbacks=self.more_callbacks)

            norm_change = self.weight_change_norm(self.Q_k, self.Q_k_minus_1)
            self.copy_over_to(self.Q_k, self.Q_k_minus_1)

            losses.append(hist.history['loss'])
            actions = pi_e.sample(initial_states)
            assert len(actions) == initial_states.shape[0]
            Q_val = self.Q_k_all.predict(initial_states)[
                np.arange(len(actions)), actions]
            values.append(np.mean(Q_val))
            print(values[-1], norm_change, np.mean(values[-M:]),
                  np.abs(np.mean(values[-M:]) - np.mean(values[-(M + 1):-1])),
                  1e-4 * np.abs(np.mean(values[-(M + 1):-1])))
            if k > M and np.abs(
                    np.mean(values[-M:]) - np.mean(values[-(M + 1):-1])
            ) < 1e-4 * np.abs(np.mean(values[-(M + 1):-1])):
                break

        return np.mean(values[-10:]), self.Q_k, self.Q_k_all

    def fill(self, env):
        '''Flatten the trajectory data into per-transition arrays.'''
        states = self.data.states()
        states = states.reshape(-1, np.prod(states.shape[2:]))
        actions = self.data.actions().reshape(-1)
        actions = np.eye(env.n_actions)[actions]

        next_states = self.data.next_states()
        original_shape = next_states.shape
        next_states = next_states.reshape(-1, np.prod(next_states.shape[2:]))

        policy_action = self.data.next_target_propensity().reshape(
            -1, env.n_actions)
        rewards = self.data.rewards().reshape(-1)

        dones = self.data.dones()
        dones = dones.reshape(-1)

        return DataHolder(states, actions, rewards, next_states, dones,
                          policy_action, original_shape)

    @threadsafe_generator
    def generator(self,
                  env,
                  pi_e,
                  all_idxs,
                  fixed_permutation=False,
                  batch_size=64):
        data_length = len(all_idxs)
        steps = int(np.ceil(data_length / float(batch_size)))

        states = self.processed_data.states
        actions = self.processed_data.actions
        next_states = self.processed_data.next_states
        original_shape = self.processed_data.original_shape
        policy_action = self.processed_data.policy_action
        rewards = self.processed_data.rewards
        dones = self.processed_data.dones

        alpha = 1.

        while True:
            perm = np.random.permutation(all_idxs)
            for batch in np.arange(steps):
                batch_idxs = perm[(batch * batch_size):((batch + 1) *
                                                        batch_size)]

                x = states[batch_idxs].reshape(
                    tuple([-1]) + original_shape[2:])
                if self.processor: x = self.processor(x)

                acts = actions[batch_idxs]
                x_ = next_states[batch_idxs].reshape(
                    tuple([-1]) + original_shape[2:])
                if self.processor: x_ = self.processor(x_)

                pi_a_given_x = policy_action[batch_idxs]
                not_dones = 1 - dones[batch_idxs]
                rew = rewards[batch_idxs]

                # Bootstrapped target: r + gamma * E_{a'~pi_e}[Q_{k-1}(x',a')]
                Q_val = self.Q_k_minus_1_all.predict(x_).reshape(
                    pi_a_given_x.shape)
                Q_val = (Q_val * pi_a_given_x).sum(axis=-1)
                new_Q = rew + self.gamma * (Q_val * not_dones).reshape(-1)

                # Q-learning-style soft update; with alpha = 1 and old_Q = 0
                # this reduces to the plain fitted-Q target new_Q.
                old_Q = 0
                Q = old_Q + alpha * (new_Q - old_Q)

                yield ([x, acts], Q)
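A minimal usage sketch of the tabular path above (hypothetical: `data`, `pi_b`, and `pi_e` stand in for whatever experience-buffer and policy objects the surrounding repo provides; only the attributes and methods called above are assumed):

fqe = FittedQEvaluation(data, gamma=0.98)
_, Q_table, mapping = fqe.run(pi_b, pi_e, epsilon=1e-3, max_epochs=10000)
# Value estimate for pi_e from state 0 under the converged Q-table:
v_hat = np.sum(pi_e.predict([0])[0] * Q_table[mapping[0], :])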
Example #2
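As in the first example, the import block is omitted; a plausible reconstruction (`threadsafe_generator` and `DMModel` are repo-local helpers assumed to be importable from the surrounding project):

import numpy as np
from functools import partial
from tqdm import tqdm
import keras
import keras.backend as K
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Conv2D, Dense, Flatten, MaxPool2D
from sklearn.linear_model import LinearRegression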
class DirectMethodRegression(object):
    def __init__(self,
                 data,
                 gamma,
                 frameskip=2,
                 frameheight=2,
                 modeltype='conv',
                 processor=None):
        self.data = data
        self.gamma = gamma
        self.frameskip = frameskip
        self.frameheight = frameheight
        self.modeltype = modeltype
        self.processor = processor

    def wls_sherman_morrison(self,
                             phi_in,
                             rewards_in,
                             omega_in,
                             lamb,
                             omega_regularizer,
                             cond_number_threshold_A,
                             block_size=None):
        # The dense weighted least-squares solve (forming the full
        # block-diagonal omega matrix) was replaced by the incremental
        # Sherman-Morrison update below.
        feat_dim = phi_in.shape[1]
        b = np.zeros((feat_dim, 1))
        B = np.eye(feat_dim)
        data_count = len(omega_in)
        if np.isscalar(omega_in[0]):
            omega_size = 1
            I_a = 1
        else:
            omega_size = omega_in[0].shape[0]
            I_a = np.eye(omega_size)

        for i in range(data_count):
            if omega_in[i] is None:
                continue
            omeg_i = omega_in[i] + omega_regularizer * I_a

            feat = phi_in[i * omega_size:(i + 1) * omega_size, :]
            rews_i = np.reshape(
                rewards_in[i * omega_size:(i + 1) * omega_size],
                [omega_size, 1])

            b = b + feat.T.dot(omeg_i).dot(rews_i)

            # Sherman-Morrison-Woodbury formula:
            # (B + U C V)^-1 = B^-1 - B^-1 U (C^-1 + V B^-1 U)^-1 V B^-1
            # with U = feat.T, C = omeg_i, V = feat.
            if omega_size > 1:
                C_inv = np.linalg.inv(omeg_i)
            else:
                C_inv = 1 / omeg_i
            if np.linalg.norm(feat.dot(B).dot(feat.T)) < 1e-7:
                inner_inv = omeg_i
            else:
                inner_inv = np.linalg.inv(C_inv + feat.dot(B).dot(feat.T))

            B = B - B.dot(feat.T).dot(inner_inv).dot(feat).dot(B)

        weight_prim = B.dot(b)
        weight = weight_prim.reshape((-1, ))
        return weight

    def run(self, pi_b, pi_e, epsilon=0.001):

        dataset = self.data.all_transitions()
        frames = self.data.frames()
        omega = self.data.omega()
        rewards = self.data.rewards()

        omega = [np.cumprod(om) for om in omega]
        gamma_vec = self.gamma**np.arange(max([len(x) for x in omega]))

        factors, Rs = [], []
        for data in dataset:
            ts = data[-1]
            traj_num = data[-2]

            i, t = int(traj_num), int(ts)
            Rs.append(
                np.sum(omega[i][t:] / omega[i][t] * gamma_vec[t:] /
                       gamma_vec[t] * rewards[i][t:]))
            factors.append(gamma_vec[t] * omega[i][t])

        self.alpha = 1
        self.lamb = 1
        self.cond_number_threshold_A = 1
        block_size = len(dataset)

        phi = self.compute_grid_features()
        self.weight = self.wls_sherman_morrison(phi, Rs, factors, self.lamb,
                                                self.alpha,
                                                self.cond_number_threshold_A,
                                                block_size)

        return DMModel(self.weight, self.data)

    def compute_feature_without_time(self, state, action, step):
        T = max(self.data.lengths())
        n_dim = self.data.n_dim
        n_actions = self.data.n_actions

        # One-hot feature over (state, action) pairs; the `step` argument is
        # unused in this time-independent featurization.
        phi = np.zeros((n_dim, n_actions))
        phi[int(state), int(action)] = 1
        phi = phi.reshape(-1)

        return phi

    def compute_feature(self, state, action, step):
        return self.compute_feature_without_time(state, action, step)

    def compute_grid_features(self):

        T = max(self.data.lengths())
        n_dim = self.data.n_dim
        n_actions = self.data.n_actions

        n = len(self.data)

        data_dim = n * T

        phi = data_dim * [None]

        lengths = self.data.lengths()
        for i in range(n):
            states = self.data.states(False, i, i + 1)
            actions = self.data.actions()[i]

            for t in range(max(lengths)):
                if t < lengths[i]:
                    s = states[t]
                    action = int(actions[t])
                    phi[i * T + t] = self.compute_feature(s, action, t)
                else:
                    phi[i * T + t] = np.zeros(len(phi[0]))

        return np.array(phi, dtype='float')

    @staticmethod
    def build_model(input_size, scope, action_space_dim=3, modeltype='conv'):

        inp = keras.layers.Input(input_size, name='frames')
        actions = keras.layers.Input((action_space_dim, ), name='mask')
        factors = keras.layers.Input((1, ), name='weights')

        def init():
            return keras.initializers.TruncatedNormal(mean=0.0,
                                                      stddev=0.1,
                                                      seed=np.random.randint(
                                                          2**32))

        if modeltype == 'conv':
            conv1 = Conv2D(8, (7, 7),
                           strides=(3, 3),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(inp)
            pool1 = MaxPool2D(data_format='channels_first')(conv1)
            conv2 = Conv2D(16, (3, 3),
                           strides=(1, 1),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(pool1)
            pool2 = MaxPool2D(data_format='channels_first')(conv2)
            flat1 = Flatten(name='flattened')(pool2)
            out = Dense(256,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(flat1)
        elif modeltype == 'conv1':

            def init():
                return keras.initializers.TruncatedNormal(
                    mean=0.0, stddev=0.001, seed=np.random.randint(2**32))

            conv1 = Conv2D(16, (2, 2),
                           strides=(1, 1),
                           padding='same',
                           data_format='channels_first',
                           activation='elu',
                           kernel_initializer=init(),
                           bias_initializer=init(),
                           kernel_regularizer=regularizers.l2(1e-6))(inp)
            flat1 = Flatten(name='flattened')(conv1)
            out = Dense(8,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(flat1)
            out = Dense(8,
                        activation='elu',
                        kernel_initializer=init(),
                        bias_initializer=init(),
                        kernel_regularizer=regularizers.l2(1e-6))(out)
        else:

            def init():
                return keras.initializers.TruncatedNormal(
                    mean=0.0, stddev=.1, seed=np.random.randint(2**32))

            flat = Flatten()(inp)
            dense1 = Dense(16,
                           activation='relu',
                           kernel_initializer=init(),
                           bias_initializer=init())(flat)
            # dense2 = Dense(256, activation='relu',kernel_initializer=init(), bias_initializer=init())(dense1)
            dense3 = Dense(8,
                           activation='relu',
                           kernel_initializer=init(),
                           bias_initializer=init())(dense1)
            out = Dense(4,
                        activation='relu',
                        name='out',
                        kernel_initializer=init(),
                        bias_initializer=init())(dense3)

        all_actions = Dense(action_space_dim,
                            name=scope + 'all_Q',
                            activation="linear",
                            kernel_initializer=init(),
                            bias_initializer=init())(out)

        output = keras.layers.dot([all_actions, actions], 1)

        model = keras.models.Model(inputs=[inp, actions], outputs=output)

        model1 = keras.models.Model(inputs=[inp, actions, factors],
                                    outputs=output)

        all_Q = keras.models.Model(inputs=[inp],
                                   outputs=model.get_layer(scope +
                                                           'all_Q').output)

        adam = keras.optimizers.Adam()

        def DMloss(y_true, y_pred, weights):
            # Weighted squared error: each sample's loss is scaled by the
            # `factors` input tensor (gamma^t * cumulative importance weight).
            return K.sum(weights * K.square(y_pred - y_true))

        weighted_loss = partial(DMloss, weights=factors)

        model1.compile(loss=weighted_loss,
                       optimizer=adam,
                       metrics=['accuracy'])
        return model1, model, all_Q

    @staticmethod
    def copy_over_to(source, target):
        target.set_weights(source.get_weights())

    @staticmethod
    def weight_change_norm(model, target_model):
        norm_list = []
        number_of_layers = len(model.layers)
        for i in range(number_of_layers):
            model_matrix = model.layers[i].get_weights()
            target_model_matrix = target_model.layers[i].get_weights()
            if len(model_matrix) > 0:
                #print "layer ", i, " has shape ", model_matrix[0].shape
                if model_matrix[0].shape[0] > 0:
                    norm_change = np.linalg.norm(model_matrix[0] -
                                                 target_model_matrix[0])
                    norm_list.append(norm_change)
        return sum(norm_list) * 1.0 / len(norm_list)

    def run_linear(self, env, pi_b, pi_e, max_epochs, epsilon=.001):

        self.Q_k = LinearRegression()

        states = self.data.states()
        states = states.reshape(-1, np.prod(states.shape[2:]))
        lengths = self.data.lengths()
        omega = self.data.omega()
        rewards = self.data.rewards()
        actions = self.data.actions().reshape(-1)

        omega = [np.cumprod(om) for om in omega]
        gamma_vec = self.gamma**np.arange(max([len(x) for x in omega]))

        factors, Rs = [], []
        for traj_num, ts in enumerate(self.data.ts()):
            for t in ts:
                i, t = int(traj_num), int(t)
                if omega[i][t]:
                    Rs.append(
                        np.sum(omega[i][t:] / omega[i][t] * gamma_vec[t:] /
                               gamma_vec[t] * rewards[i][t:]))
                else:
                    Rs.append(0)
                factors.append(gamma_vec[t] * omega[i][t])

        Rs = np.array(Rs)
        factors = np.array(factors)

        actions = np.eye(self.data.n_actions)[actions]
        # `factors` is passed as per-sample weights (gamma^t * importance
        # weights) to the weighted least-squares fit.
        return self.Q_k.fit(np.hstack([states, actions]), Rs, factors)

    def run_NN(self, env, pi_b, pi_e, max_epochs, epsilon=0.001):

        self.dim_of_actions = env.n_actions
        self.Q_k = None
        self.Q_k_minus_1 = None

        earlyStopping = EarlyStopping(monitor='val_loss',
                                      min_delta=1e-4,
                                      patience=5,
                                      verbose=1,
                                      mode='min',
                                      restore_best_weights=True)
        mcp_save = ModelCheckpoint('dm_regression.hdf5',
                                   save_best_only=True,
                                   monitor='val_loss',
                                   mode='min')
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss',
                                           factor=0.1,
                                           patience=3,
                                           verbose=1,
                                           min_delta=1e-4,
                                           mode='min')

        self.more_callbacks = [earlyStopping, reduce_lr_loss]

        im = self.data.states()[0]
        if self.processor: im = self.processor(im)
        self.Q_k, self.Q, self.Q_k_all = self.build_model(
            im.shape[1:],
            'Q_k',
            modeltype=self.modeltype,
            action_space_dim=env.n_actions)

        print('Training: DM Regression')
        losses = []
        for k in tqdm(range(1)):
            batch_size = 32

            dataset_length = self.data.num_tuples()
            perm = np.random.permutation(range(dataset_length))
            eighty_percent_of_set = int(.8 * len(perm))
            training_idxs = perm[:eighty_percent_of_set]
            validation_idxs = perm[eighty_percent_of_set:]
            training_steps_per_epoch = int(
                1. * np.ceil(len(training_idxs) / float(batch_size)))
            validation_steps_per_epoch = int(
                np.ceil(len(validation_idxs) / float(batch_size)))
            train_gen = self.generator(env,
                                       pi_e,
                                       training_idxs,
                                       fixed_permutation=True,
                                       batch_size=batch_size)
            val_gen = self.generator(env,
                                     pi_e,
                                     validation_idxs,
                                     fixed_permutation=True,
                                     batch_size=batch_size,
                                     is_train=False)

            hist = self.Q_k.fit_generator(
                train_gen,
                steps_per_epoch=training_steps_per_epoch,
                validation_data=val_gen,
                validation_steps=validation_steps_per_epoch,
                epochs=max_epochs,
                max_queue_size=1,
                workers=1,
                use_multiprocessing=False,
                verbose=1,
                callbacks=self.more_callbacks)

        return self.Q_k, self.Q_k_all

    @threadsafe_generator
    def generator(self,
                  env,
                  pi_e,
                  all_idxs,
                  fixed_permutation=False,
                  batch_size=64,
                  is_train=True):
        data_length = len(all_idxs)
        steps = int(np.ceil(data_length / float(batch_size)))

        states = self.data.states()
        states = states.reshape(tuple([-1]) + states.shape[2:])
        lengths = self.data.lengths()
        omega = self.data.omega()
        rewards = self.data.rewards()
        actions = self.data.actions().reshape(-1)

        omega = [np.cumprod(om) for om in omega]
        gamma_vec = self.gamma**np.arange(max([len(x) for x in omega]))

        factors, Rs = [], []
        for traj_num, ts in enumerate(self.data.ts()):
            for t in ts:
                i, t = int(traj_num), int(t)
                if omega[i][t]:
                    Rs.append(
                        np.sum(omega[i][t:] / omega[i][t] * gamma_vec[t:] /
                               gamma_vec[t] * rewards[i][t:]))
                else:
                    Rs.append(0)
                factors.append(gamma_vec[t] * omega[i][t])

        Rs = np.array(Rs)
        factors = np.array(factors)

        dones = self.data.dones()
        alpha = 1.

        # Rebalanced sampling probabilities (computed but unused: the
        # uniform-permutation loop below does not sample with them).
        probs = np.hstack([
            np.zeros((dones.shape[0], 2)),
            dones,
        ])[:, :-2]
        if np.sum(probs):
            done_probs = probs / np.sum(probs)
            probs = 1 - probs + done_probs
        else:
            probs = 1 - probs
        probs = probs.reshape(-1)
        probs /= np.sum(probs)
        probs = probs[all_idxs]
        probs /= np.sum(probs)

        dones = dones.reshape(-1)

        while True:
            perm = np.random.permutation(all_idxs)
            for batch in np.arange(steps):
                batch_idxs = perm[(batch * batch_size):((batch + 1) *
                                                        batch_size)]

                x = states[batch_idxs]
                if self.processor: x = self.processor(x)
                weight = factors[batch_idxs]
                R = Rs[batch_idxs]
                acts = actions[batch_idxs]

                yield ([
                    x,
                    np.eye(env.n_actions)[acts],
                    np.array(weight).reshape(-1, 1)
                ], [np.array(R).reshape(-1, 1)])
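A self-contained sanity check of `wls_sherman_morrison` on its scalar-weight path (a sketch, assuming only NumPy; the class is constructed with placeholder arguments since the solver never touches `self.data`, and `lamb` and `cond_number_threshold_A` are unused on this path):

rng = np.random.RandomState(0)
phi = rng.randn(50, 4)        # 50 samples, 4 features
rewards = rng.randn(50)
omega = list(rng.rand(50))    # one scalar weight per sample

dm = DirectMethodRegression(data=None, gamma=0.98)
w_sm = dm.wls_sherman_morrison(phi, rewards, omega, lamb=1.0,
                               omega_regularizer=0.1,
                               cond_number_threshold_A=1.0)

# Direct solve of the same problem: (I + Phi^T W Phi) w = Phi^T W r with
# W = diag(omega_i + omega_regularizer); B starts at the identity above,
# which corresponds to a unit ridge term.
W = np.diag(np.array(omega) + 0.1)
w_direct = np.linalg.solve(np.eye(4) + phi.T.dot(W).dot(phi),
                           phi.T.dot(W).dot(rewards))
assert np.allclose(w_sm, w_direct, atol=1e-6)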
Example #3
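The `model`, `train_sequence`, `valid_sequence`, and `epochs` objects are defined elsewhere in the original script; a plausible import block for what is shown here:

import os
from keras.callbacks import CSVLogger, LearningRateScheduler, ModelCheckpoint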
model_name = 'schnet_edgeupdate_fixed'

if not os.path.exists(model_name):
    os.makedirs(model_name)

filepath = model_name + "/best_model.hdf5"
checkpoint = ModelCheckpoint(filepath,
                             save_best_only=True,
                             period=10,
                             verbose=1)
csv_logger = CSVLogger(model_name + '/log.csv')


def decay_fn(epoch, learning_rate):
    """Jorgensen et al. decay the learning rate to 0.96*lr every 100,000
    batches, which is approximately every 28 epochs here. Note that
    `epoch % 28 == 0` also fires once at epoch 0.
    """

    if (epoch % 28) == 0:
        return 0.96 * learning_rate
    else:
        return learning_rate


lr_decay = LearningRateScheduler(decay_fn)

hist = model.fit_generator(train_sequence,
                           validation_data=valid_sequence,
                           epochs=epochs,
                           verbose=1,
                           callbacks=[checkpoint, csv_logger, lr_decay])
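For reference, the in-place multiplicative rule above is equivalent to the stateless closed form below (a sketch; `INITIAL_LR` is a hypothetical stand-in for whatever learning rate `model` was compiled with):

INITIAL_LR = 1e-3  # assumption: the compile-time learning rate

def step_decay(epoch, lr):
    # LearningRateScheduler feeds the current lr back in, so decay_fn
    # compounds one 0.96 factor per 28 epochs, plus one extra factor
    # because it also fires at epoch 0.
    return INITIAL_LR * 0.96 ** (epoch // 28 + 1)

lr_decay = LearningRateScheduler(step_decay)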