Example #1
    def _create(self):
        observation = input(STATE_COUNT, np.float32, name="s")
        q_target = input(ACTION_COUNT, np.float32, name="q")

        l1 = Dense(H, activation=relu)
        l2 = Dense(ACTION_COUNT)
        unbound_model = Sequential([l1, l2])
        model = unbound_model(observation)

        self.params = dict(W1=l1.W, b1=l1.b, W2=l2.W, b2=l2.b)

        lr = 0.00025
        # opt = RMSprop(lr=0.00025)
        # model.compile(loss='mse', optimizer=opt)

        # loss='mse'
        loss = reduce_mean(square(model - q_target), axis=0)
        # the same MSE is also used as the evaluation metric reported by the trainer
        meas = reduce_mean(square(model - q_target), axis=0)

        # optimizer=opt
        lr_schedule = learning_rate_schedule(lr, UnitType.minibatch)
        #learner = sgd(model.parameters, lr_schedule, gradient_clipping_threshold_per_sample=10)
        
        learner = adam(model.parameters, lr_schedule,
                       momentum=momentum_schedule(0.9),
                       gradient_clipping_threshold_per_sample=10)

        trainer = Trainer(model, (loss, meas), learner)

        # CNTK: return trainer and loss as well
        return model, trainer, loss
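A minimal sketch of how the returned trainer might be driven from a companion method, assuming the caller stores the returned objects as self.model, self.trainer and self.loss; the method name _train and the x/y arrays are assumptions, not part of the snippet above:

    def _train(self, x, y):
        # x: np.float32 array, shape (batch, STATE_COUNT) -- observed states
        # y: np.float32 array, shape (batch, ACTION_COUNT) -- target Q-values
        arguments = dict(zip(self.loss.arguments, [x, y]))  # bind the "s" and "q" inputs in order
        self.trainer.train_minibatch(arguments)             # one optimization step on the MSE loss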
Example #2
def create_inputs(vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input(shape=vocab_dim,
                                    sequence_axis=input_seq_axis)
    label_sequence = sequence.input(shape=vocab_dim,
                                    sequence_axis=input_seq_axis)

    return input_sequence, label_sequence
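Because both inputs share the same dynamic axis, a model built over input_sequence can be compared position by position against label_sequence. A small, hypothetical usage sketch (the vocab_dim value and the Dense layer are illustrative only, not part of the original snippet):

import cntk as C
from cntk.layers import Dense

vocab_dim = 128                                        # assumed vocabulary size
input_sequence, label_sequence = create_inputs(vocab_dim)
z = Dense(vocab_dim)(input_sequence)                   # toy per-step model
ce = C.cross_entropy_with_softmax(z, label_sequence)   # per-step training criterion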
Example #3
File: sanitize.py  Project: ondrocks/CNTK
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    from cntk.ops import times, input, sequence

    if is_sequence:
        temp_input = sequence.input(input_shape, is_sparse=True)
    else:
        temp_input = input(input_shape, is_sparse=True)

    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))
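Multiplying a sparse input by an identity matrix keeps the values unchanged but forces a dense result, so evaluating the returned function converts sparse data to dense. A rough, hypothetical usage sketch (the shape and the csr_matrix batch are made up for illustration):

import numpy as np
from scipy.sparse import csr_matrix

net = _sparse_to_dense_network_cache((3,), is_sequence=False, device=None)
sparse_batch = csr_matrix(np.array([[0, 1, 0],
                                    [1, 0, 0]], dtype=np.float32))
dense_batch = net.eval({net.arguments[0]: sparse_batch})  # dense 2x3 result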
Example #4
def train_sequence_classifier(debug_output=False):
    input_dim = 2000
    cell_dim = 25
    hidden_dim = 25
    embedding_dim = 50
    num_output_classes = 5

    # Input variables denoting the features and label data
    features = sequence.input(shape=input_dim, is_sparse=True)
    label = input(num_output_classes)

    # Instantiate the sequence classification model
    classifier_output = LSTM_sequence_classifer_net(features,
                                                    num_output_classes,
                                                    embedding_dim, hidden_dim,
                                                    cell_dim)

    ce = cross_entropy_with_softmax(classifier_output, label)
    pe = classification_error(classifier_output, label)

    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    reader = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        features: reader.streams.features,
        label: reader.streams.labels
    }

    lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)
    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output, (ce, pe),
                      sgd(classifier_output.parameters, lr=lr_per_sample))

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 200
    training_progress_output_freq = 10

    if debug_output:
        training_progress_output_freq = training_progress_output_freq // 3  # report more frequently when debugging

    for i in range(251):
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)
        print_training_progress(trainer, i, training_progress_output_freq)

    import copy

    evaluation_average = copy.copy(
        trainer.previous_minibatch_evaluation_average)
    loss_average = copy.copy(trainer.previous_minibatch_loss_average)

    return evaluation_average, loss_average
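A short sketch of how this function is typically driven; the returned values are the last minibatch's evaluation metric and loss, and the print format below is an assumption:

if __name__ == '__main__':
    error, loss = train_sequence_classifier()
    print("Last minibatch: classification error %f, loss %f" % (error, loss))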
Example #5
def policy_gradient():
    import cntk as C

    TOTAL_EPISODES = 2000 if isFast else 10000

    H = 100 # number of hidden layer neurons
    
    observations = input(STATE_COUNT, np.float32, name="obs")
    
    W1 = C.parameter(shape=(STATE_COUNT, H), init=C.glorot_uniform(), name="W1")
    b1 = C.parameter(shape=H, name="b1")
    layer1 = C.relu(C.times(observations, W1) + b1)
    
    W2 = C.parameter(shape=(H, ACTION_COUNT), init=C.glorot_uniform(), name="W2")
    b2 = C.parameter(shape=ACTION_COUNT, name="b2")
    score = C.times(layer1, W2) + b2
    # Up to this point the network mirrors the DQN one
    
    probability = C.sigmoid(score, name="prob")
    input_y = input(1, np.float32, name="input_y")
    advantages = input(1, np.float32, name="advt")
    
    loss = -C.reduce_mean(C.log(C.square(input_y - probability) + 1e-4) * advantages, axis=0, name='loss')
    
    lr = 1e-4
    lr_schedule = learning_rate_schedule(lr, UnitType.sample)
    sgd = C.sgd([W1, W2], lr_schedule)
    
    gradBuffer = dict((var.name, np.zeros(shape=var.shape)) for var in loss.parameters if var.name in ['W1', 'W2', 'b1', 'b2'])
    
    xs, hs, label, drs = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1
    
    observation = env.reset()
    actionlist = [i for i in range(env.action_space.n)]
    while episode_number <= TOTAL_EPISODES:
        x = np.reshape(observation, [1, STATE_COUNT]).astype(np.float32)
    
        # Run the policy network and get an action to take.
        #prob = probability.eval(arguments={observations: x})[0][0][0]
        prob = probability.eval(arguments={observations: x})
        normalized_weights = (prob / np.sum(prob))[0][0]
        action = np.random.choice(actionlist, p=normalized_weights)
        #action = 1 if np.random.uniform() < prob else 0
    
        xs.append(x)  # observation
        # grad that encourages the action that was taken to be taken
    
        y = 1 if action == 0 else 0  # a "fake label"
        label.append(y)
    
        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += float(reward)
    
        # Record reward (has to be done after we call step() to get reward for previous action)
        drs.append(float(reward))
    
        if done:
            # Stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epl = np.vstack(label).astype(np.float32)
            epr = np.vstack(drs).astype(np.float32)
            xs, label, drs = [], [], []  # reset array memory
    
            # Compute the discounted reward backwards through time.
            discounted_epr = discount_rewards(epr)
            # Standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= (np.std(discounted_epr) + 0.000000000001)
    
            # Forward pass
            arguments = {observations: epx, input_y: epl, advantages: discounted_epr}
            state, outputs_map = loss.forward(arguments, outputs=loss.outputs,
                                              keep_for_backward=loss.outputs)
    
            # Backward pass
            root_gradients = {v: np.ones_like(o) for v, o in outputs_map.items()}
            vargrads_map = loss.backward(state, root_gradients, variables=set([W1, W2]))
    
            for var, grad in vargrads_map.items():
                gradBuffer[var.name] += grad
    
            # Wait for some batches to finish to reduce noise
            if episode_number % BATCH_SIZE_BASELINE == 0:
                grads = {W1: gradBuffer['W1'].astype(np.float32),
                         W2: gradBuffer['W2'].astype(np.float32)}
                updated = sgd.update(grads, BATCH_SIZE_BASELINE)
    
                # reset the gradBuffer
                gradBuffer = dict((var.name, np.zeros(shape=var.shape))
                                  for var in loss.parameters if var.name in ['W1', 'W2', 'b1', 'b2'])
    
                print('Episode: %d. Average reward for episode %f.' % (episode_number, reward_sum / BATCH_SIZE_BASELINE))
    
                if reward_sum / BATCH_SIZE_BASELINE > REWARD_TARGET:
                    print('Task solved in: %d ' % episode_number)
                    break
    
                reward_sum = 0    
            observation = env.reset()  # reset env
            episode_number += 1    
    probability.save('pg.mod')
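The episode loop above calls discount_rewards, which is not defined in this snippet. A common implementation accumulates rewards backwards through time with a discount factor; gamma=0.99 here is an assumption:

def discount_rewards(r, gamma=0.99):
    """Return the per-step discounted return for a column vector of rewards."""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]   # accumulate from the end of the episode
        discounted_r[t] = running_add
    return discounted_r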
Example #6
def _sparse_to_dense_network_cache(input_shape):
    from cntk.ops import times, sequence

    temp_input = sequence.input(input_shape)
    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))