def _create(self):
    observation = input(STATE_COUNT, np.float32, name="s")
    q_target = input(ACTION_COUNT, np.float32, name="q")

    # Two fully connected layers: a ReLU hidden layer of width H and a
    # linear output layer producing one Q-value per action.
    l1 = Dense(H, activation=relu)
    l2 = Dense(ACTION_COUNT)
    unbound_model = Sequential([l1, l2])
    model = unbound_model(observation)

    self.params = dict(W1=l1.W, b1=l1.b, W2=l2.W, b2=l2.b)

    lr = 0.00025

    # Keras equivalent kept for reference:
    # opt = RMSprop(lr=0.00025)
    # model.compile(loss='mse', optimizer=opt)

    # loss='mse'
    loss = reduce_mean(square(model - q_target), axis=0)
    meas = reduce_mean(square(model - q_target), axis=0)

    # optimizer=opt
    lr_schedule = learning_rate_schedule(lr, UnitType.minibatch)
    # learner = sgd(model.parameters, lr_schedule,
    #               gradient_clipping_threshold_per_sample=10)
    learner = adam(model.parameters, lr_schedule,
                   momentum=momentum_schedule(0.9),
                   gradient_clipping_threshold_per_sample=10)
    trainer = Trainer(model, (loss, meas), learner)

    # CNTK: return trainer and loss as well
    return model, trainer, loss
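# A minimal usage sketch for _create (hypothetical, not from the original
# source): it assumes the caller stored the results as self.model /
# self.trainer / self.loss and that STATE_COUNT and ACTION_COUNT are the
# same module constants used above. One training step feeds a batch of
# states and target Q-values to the two inputs named "s" and "q".
def _train_step(self, states, q_targets):
    # Look the inputs up by the names given in _create so argument
    # ordering does not matter.
    args = {a.name: a for a in self.loss.arguments}
    self.trainer.train_minibatch({args['s']: states.astype(np.float32),
                                  args['q']: q_targets.astype(np.float32)})
    return self.trainer.previous_minibatch_loss_average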
def create_inputs(vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input(shape=vocab_dim, sequence_axis=input_seq_axis)
    label_sequence = sequence.input(shape=vocab_dim, sequence_axis=input_seq_axis)

    return input_sequence, label_sequence
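# Hypothetical usage sketch (the model and vocab size below are
# illustrative, not from the original source): because both returned
# variables share the 'inputAxis' dynamic axis, a per-step criterion such
# as cross_entropy_with_softmax can compare model output and labels
# position by position along the sequence.
input_sequence, label_sequence = create_inputs(vocab_dim=256)
z = Sequential([Dense(128, activation=relu), Dense(256)])(input_sequence)
ce = cross_entropy_with_softmax(z, label_sequence)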
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    from cntk.ops import times, input, sequence

    if is_sequence:
        temp_input = sequence.input(input_shape, is_sparse=True)
    else:
        temp_input = input(input_shape, is_sparse=True)

    # Multiplying by an identity matrix densifies the sparse input
    # without changing its values.
    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))
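# Hypothetical usage sketch (not from the original source): a sparse
# one-hot sequence fed through the cached network comes back as plain
# dense arrays, since times() with np.eye is effectively an identity
# lookup. Value.one_hot is the standard CNTK 2.x way to build such a
# sparse batch.
from cntk import Value
net = _sparse_to_dense_network_cache((3,), is_sequence=True, device=None)
sparse_seq = Value.one_hot([[0, 2, 1]], num_classes=3)  # one sequence, 3 steps
dense = net.eval({net.arguments[0]: sparse_seq})        # dense one-hot rows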
def train_sequence_classifier(debug_output=False):
    input_dim = 2000
    cell_dim = 25
    hidden_dim = 25
    embedding_dim = 50
    num_output_classes = 5

    # Input variables denoting the features and label data
    features = sequence.input(shape=input_dim, is_sparse=True)
    label = input(num_output_classes)

    # Instantiate the sequence classification model
    classifier_output = LSTM_sequence_classifer_net(
        features, num_output_classes, embedding_dim, hidden_dim, cell_dim)

    ce = cross_entropy_with_softmax(classifier_output, label)
    pe = classification_error(classifier_output, label)

    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    reader = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        features: reader.streams.features,
        label: reader.streams.labels
    }

    lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample))

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 200

    training_progress_output_freq = 10

    if debug_output:
        # Integer division keeps the frequency usable as a modulus below
        training_progress_output_freq = training_progress_output_freq // 3

    for i in range(251):
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)
        print_training_progress(trainer, i, training_progress_output_freq)

    import copy

    evaluation_average = copy.copy(
        trainer.previous_minibatch_evaluation_average)
    loss_average = copy.copy(trainer.previous_minibatch_loss_average)

    return evaluation_average, loss_average
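# Hypothetical driver for the routine above (not part of the original
# snippet): the returned values are the last minibatch's average
# classification error and loss, so a smoke test can simply print them.
if __name__ == '__main__':
    error, loss = train_sequence_classifier()
    print("Error: %f, Loss: %f" % (error, loss))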
def policy_gradient():
    import cntk as C

    TOTAL_EPISODES = 2000 if isFast else 10000

    H = 100  # number of hidden layer neurons

    observations = input(STATE_COUNT, np.float32, name="obs")

    W1 = C.parameter(shape=(STATE_COUNT, H), init=C.glorot_uniform(), name="W1")
    b1 = C.parameter(shape=H, name="b1")
    layer1 = C.relu(C.times(observations, W1) + b1)

    W2 = C.parameter(shape=(H, ACTION_COUNT), init=C.glorot_uniform(), name="W2")
    b2 = C.parameter(shape=ACTION_COUNT, name="b2")
    score = C.times(layer1, W2) + b2
    # Up to this point the network is the same as in DQN
    probability = C.sigmoid(score, name="prob")

    input_y = input(1, np.float32, name="input_y")
    advantages = input(1, np.float32, name="advt")

    loss = -C.reduce_mean(C.log(C.square(input_y - probability) + 1e-4) * advantages,
                          axis=0, name='loss')

    lr = 1e-4
    lr_schedule = learning_rate_schedule(lr, UnitType.sample)
    sgd = C.sgd([W1, W2], lr_schedule)

    gradBuffer = dict((var.name, np.zeros(shape=var.shape))
                      for var in loss.parameters
                      if var.name in ['W1', 'W2', 'b1', 'b2'])

    xs, hs, label, drs = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1

    observation = env.reset()

    actionlist = [i for i in range(env.action_space.n)]

    while episode_number <= TOTAL_EPISODES:
        x = np.reshape(observation, [1, STATE_COUNT]).astype(np.float32)

        # Run the policy network and get an action to take.
        # prob = probability.eval(arguments={observations: x})[0][0][0]
        prob = probability.eval(arguments={observations: x})
        normalized_weights = (prob / np.sum(prob))[0][0]
        action = np.random.choice(actionlist, p=normalized_weights)
        # action = 1 if np.random.uniform() < prob else 0

        xs.append(x)  # observation
        # grad that encourages the action that was taken to be taken
        y = 1 if action == 0 else 0  # a "fake label"
        label.append(y)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += float(reward)

        # Record reward (has to be done after we call step() to get the
        # reward for the previous action)
        drs.append(float(reward))

        if done:
            # Stack together all inputs, hidden states, action gradients,
            # and rewards for this episode
            epx = np.vstack(xs)
            epl = np.vstack(label).astype(np.float32)
            epr = np.vstack(drs).astype(np.float32)
            xs, label, drs = [], [], []  # reset array memory

            # Compute the discounted reward backwards through time.
            discounted_epr = discount_rewards(epr)
            # Size the rewards to be unit normal (helps control the
            # gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= (np.std(discounted_epr) + 1e-12)

            # Forward pass
            arguments = {observations: epx, input_y: epl, advantages: discounted_epr}
            state, outputs_map = loss.forward(arguments, outputs=loss.outputs,
                                              keep_for_backward=loss.outputs)

            # Backward pass
            root_gradients = {v: np.ones_like(o) for v, o in outputs_map.items()}
            vargrads_map = loss.backward(state, root_gradients, variables=set([W1, W2]))

            for var, grad in vargrads_map.items():
                gradBuffer[var.name] += grad

            # Wait for some batches to finish to reduce noise
            if episode_number % BATCH_SIZE_BASELINE == 0:
                grads = {W1: gradBuffer['W1'].astype(np.float32),
                         W2: gradBuffer['W2'].astype(np.float32)}
                updated = sgd.update(grads, BATCH_SIZE_BASELINE)

                # reset the gradBuffer
                gradBuffer = dict((var.name, np.zeros(shape=var.shape))
                                  for var in loss.parameters
                                  if var.name in ['W1', 'W2', 'b1', 'b2'])

                print('Episode: %d. Average reward for episode %f.'
                      % (episode_number, reward_sum / BATCH_SIZE_BASELINE))

                if reward_sum / BATCH_SIZE_BASELINE > REWARD_TARGET:
                    print('Task solved in: %d' % episode_number)
                    break

                reward_sum = 0

            observation = env.reset()  # reset env
            episode_number += 1

    probability.save('pg.mod')
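# Hypothetical follow-up sketch (not part of the original): the policy
# saved above as 'pg.mod' can be restored with cntk.load_model and queried
# exactly like the live `probability` function. The dummy state assumes
# STATE_COUNT is still in scope.
import cntk as C
pg_policy = C.load_model('pg.mod')
dummy_state = np.zeros((1, STATE_COUNT), dtype=np.float32)
action_prob = pg_policy.eval({pg_policy.arguments[0]: dummy_state})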
def _sparse_to_dense_network_cache(input_shape):
    from cntk.ops import times, sequence

    temp_input = sequence.input(input_shape)
    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))