def get_lstm_layer(prior_layer, layer_name):
    # Assumes OUTPUT_DIM == C_DIM and INPUT_DIM == C_DIM throughout.

    # [BN x OUTPUT_DIM]
    prior_ht = Identity([], name='prior_' + layer_name + '_ht')
    # [BN x (OUTPUT_DIM + INPUT_DIM)]
    with_prior_ht = Concat([prior_layer, prior_ht])

    # Forget gate.
    # [(OUTPUT_DIM + INPUT_DIM) x C_DIM]
    ft_w = Identity([], name=layer_name + '_ft_w')
    # [C_DIM]
    ft_b = Identity([], name=layer_name + '_ft_b')
    # [BN x C_DIM]
    ft_mult = MatrixMult([with_prior_ht, ft_w])
    # [BN x C_DIM]
    ft = Sigmoid(MatrixAdd([ft_mult, ft_b]))

    # Input gate.
    # [(OUTPUT_DIM + INPUT_DIM) x C_DIM]
    it_w = Identity([], name=layer_name + '_it_w')
    # [C_DIM]
    it_b = Identity([], name=layer_name + '_it_b')
    # [BN x C_DIM]
    it_mult = MatrixMult([with_prior_ht, it_w])
    # [BN x C_DIM]
    it = Sigmoid(MatrixAdd([it_mult, it_b]))

    # Candidate cell update.
    # [(OUTPUT_DIM + INPUT_DIM) x C_DIM]
    delta_c_w = Identity([], name=layer_name + '_delta_c_w')
    # [C_DIM]
    delta_c_b = Identity([], name=layer_name + '_delta_c_b')
    # [BN x C_DIM]
    delta_c_mult = MatrixMult([with_prior_ht, delta_c_w])
    # [BN x C_DIM]
    delta_c = TanH(MatrixAdd([delta_c_mult, delta_c_b]))

    # Cell state: c_t = f_t * c_{t-1} + i_t * delta_c.
    prior_ct = Identity([], name='prior_' + layer_name + '_ct')
    # [BN x C_DIM] and [BN x C_DIM]
    ct_after_forget = ElementwiseMult([prior_ct, ft])
    # [BN x C_DIM] and [BN x C_DIM]
    ct = MatrixAddExact([ct_after_forget, ElementwiseMult([delta_c, it])])
    ct_pass = Identity([ct], name=layer_name + '_ct')

    # Output gate.
    # [(OUTPUT_DIM + INPUT_DIM) x OUTPUT_DIM]
    output_c_w = Identity([], name=layer_name + '_output_c_w')
    # [OUTPUT_DIM]
    output_c_b = Identity([], name=layer_name + '_output_c_b')
    # [BN x OUTPUT_DIM]
    output_mult = MatrixMult([with_prior_ht, output_c_w])
    # [BN x OUTPUT_DIM]
    output_before_cond = Sigmoid(MatrixAdd([output_mult, output_c_b]))
    # [BN x OUTPUT_DIM] times [BN x C_DIM], so OUTPUT_DIM == C_DIM.
    # Note: h_t = o_t * c_t here; the usual tanh(c_t) squashing is omitted.
    output = ElementwiseMult([output_before_cond, ct])
    output_pass = Identity([output], name=layer_name + '_ht')
    return output
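
# A hedged sketch (not one of the original tests) of how get_lstm_layer might
# be wired into RNNTrainer. Assumptions: the trainer's initial-hidden keys
# match the node names published above ('<layer>_ht' and '<layer>_ct'), and
# C_DIM == INPUT_DIM == OUTPUT_DIM == 3, as the dimension comments require.
# Hyperparameters mirror test_rnn_works_simple below.
def sketch_lstm_trainer():
    i = Identity([], name='input')
    output = Identity([get_lstm_layer(i, 'h1')], name='output')
    BN = 4
    C_DIM = 3  # == INPUT_DIM == OUTPUT_DIM
    # One weight matrix [2*C_DIM x C_DIM] and one bias [C_DIM] per gate.
    weights = {}
    for gate in ['ft', 'it', 'delta_c', 'output_c']:
        weights['h1_' + gate + '_w'] = 0.2 * (np.random.rand(2 * C_DIM, C_DIM) - 0.5)
        weights['h1_' + gate + '_b'] = 0.2 * (np.random.rand(C_DIM) - 0.5)
    return RNNTrainer(
        output, weights,
        {'h1_ht': np.zeros((BN, C_DIM)), 'h1_ct': np.zeros((BN, C_DIM))},
        running_rnn_loss('input', 'output', mean_squared_loss),
        get_sgd_optimizer(0.0025))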
def test_rnn_works_simple():
    i = Identity([], name='input')
    # Uses the module-level get_recursive_layer defined below.
    h1 = get_recursive_layer(i, 'h1', 'fc_w1', 'fc_b1')
    fcw2 = Identity([], name='fc_w2')
    fcb2 = Identity([], name='fc_b2')
    output = Relu(MatrixAdd([MatrixMult([h1, fcw2]), fcb2]), name='output')

    BN = 4
    NUM = 3
    H_SIZE = 6
    weights = {
        'fc_w1': 0.05 * np.random.rand(3 + H_SIZE, H_SIZE),
        'fc_b1': 0.05 * np.random.rand(H_SIZE),
        'fc_w2': 0.05 * np.random.rand(H_SIZE, NUM),
        'fc_b2': 0.05 * np.random.rand(NUM),
    }
    optimizer = get_sgd_optimizer(0.0025)
    trainer = RNNTrainer(
        output, weights, {'h1': np.zeros((BN, H_SIZE))},
        running_rnn_loss('input', 'output', mean_squared_loss),
        optimizer)

    def batch_gen():
        return {'input': stupid_fsm()}

    test_batch = batch_gen()
    initial_loss = trainer.test(test_batch)
    trainer.train_batch(300, batch_gen)
    other_loss = trainer.test(test_batch)
    assert other_loss * 3 < initial_loss

    # Generate from the trained model one step at a time, sampling the next
    # one-hot input from the (normalized) predicted distribution.
    trainer.initial_hidden = {'h1': np.zeros((1, H_SIZE))}
    num = 20
    initial = np.array([[1, 0, 0]])

    def concretizer(val):
        m = np.random.choice(
            np.array([0, 1, 2]),
            p=val['output'][0] / sum(val['output'][0]))
        ret = np.array([0, 0, 0])
        ret[m] = 1
        return {**val, 'input': np.array([ret])}

    predicted = trainer.predict(num, {'input': initial}, concretizer)
def get_recursive_layer(prior, layer_name, weight_name, bias_name):
    fcw1 = Identity([], name=weight_name)
    fcb1 = Identity([], name=bias_name)
    # Placeholder for this layer's hidden state from the previous timestep.
    ii = Identity([], name='prior_' + layer_name)
    joined = Concat([prior, ii])
    multed = MatrixMult([joined, fcw1])
    added = MatrixAdd([multed, fcb1])
    h1 = Relu(added, name=layer_name + '_internal')
    # Publish the hidden state under `layer_name` so the RNN machinery can
    # feed it back into 'prior_<layer_name>' at the next timestep.
    h11 = Identity([h1], name=layer_name)
    return h1
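
# A minimal sketch (hypothetical, not part of the original suite) of the
# recursion contract get_recursive_layer sets up: assuming get_inputs() walks
# back to the leaf Identity nodes, the layer demands a 'prior_<layer_name>'
# feed alongside its weights and input.
def sketch_recursion_contract():
    inp = Identity([], name='input')
    h = get_recursive_layer(inp, 'h1', 'fc_w1', 'fc_b1')
    required = h.get_inputs()
    for name in ['input', 'fc_w1', 'fc_b1', 'prior_h1']:
        assert name in required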
def test_rnn_multistep():
    i = Identity([], name='input')
    # Stack two recursive layers using the module-level helper above.
    h1 = get_recursive_layer(i, 'h1', 'fc_w1', 'fc_b1')
    h2 = get_recursive_layer(h1, 'h2', 'fc_w2', 'fc_b2')
    fcw3 = Identity([], name='fc_w3')
    fcb3 = Identity([], name='fc_b3')
    output = Exponent(MatrixAdd([MatrixMult([h2, fcw3]), fcb3]), name='output')

    BN = 4
    T = 15
    NUM = 3
    H_SIZE = 13
    weights = {
        'fc_w1': 0.2 * (np.random.rand(3 + H_SIZE, H_SIZE) - 0.5),
        'fc_b1': 0.2 * (np.random.rand(H_SIZE) - 0.5),
        'fc_w2': 0.2 * (np.random.rand(H_SIZE + H_SIZE, H_SIZE) - 0.5),
        'fc_b2': 0.2 * (np.random.rand(H_SIZE) - 0.5),
        'fc_w3': 0.2 * (np.random.rand(H_SIZE, NUM) - 0.5),
        'fc_b3': 0.2 * (np.random.rand(NUM) - 0.5),
    }
    optimizer = get_sgd_optimizer(0.004)
    trainer = RNNTrainer(
        output, weights,
        {
            'h1': np.zeros((BN, H_SIZE)),
            'h2': np.zeros((BN, H_SIZE))
        },
        running_rnn_loss('input', 'output', mean_squared_loss),
        optimizer)

    def batch_gen():
        return {'input': alt_patterns()}

    test_batch = batch_gen()
    initial_loss = trainer.test(test_batch)
    trainer.train_batch(500, batch_gen)
    final_loss = trainer.test(test_batch)
    assert final_loss * 3 < initial_loss

    trainer.initial_hidden = {
        'h1': np.zeros((1, H_SIZE)),
        'h2': np.zeros((1, H_SIZE))
    }
    num = 20
    initial = np.array([[0, 0, 1]])

    def concretizer(val):
        # Greedy decoding; sampling from the normalized output distribution
        # (as in test_rnn_works_simple) would also work here.
        m = np.argmax(val['output'])
        ret = np.array([0, 0, 0])
        ret[m] = 1
        return {**val, 'input': np.array([ret])}

    predicted = trainer.predict(num, {'input': initial}, concretizer)
    print([x['input'] for x in predicted])
def test_all_backprop_again():
    i = Input('input')
    iw = Parameter('fc_w1')
    ib = Parameter('fc_b1')
    h1 = Sigmoid([MatrixAdd([MatrixMult([i, iw], name='mult1'), ib],
                            name='add1')], name='h1')
    iw2 = Parameter('fc_w2')
    ib2 = Parameter('fc_b2')
    h2 = Sigmoid([MatrixAdd([MatrixMult([h1, iw2], name='mult2'), ib2],
                            name='add2')], name='h2')
    h3 = MatrixAddExact([h1, h2], name='added')
    iw3 = Parameter('fc_w3')
    ib3 = Parameter('fc_b3')
    h4 = Relu(MatrixAdd([MatrixMult([h3, iw3], name='mult3'), ib3],
                        name='add3'), name='h4')
    output = Exponent(h4, name='output')
    full = output

    rand = np.random.rand

    def input_generator():
        return {
            'input': rand(7, 10),
            'fc_w1': rand(10, 11),
            'fc_b1': rand(11),
            'fc_w2': rand(11, 11),
            'fc_b2': rand(11),
            'fc_w3': rand(11, 10),
            'fc_b3': rand(10),
        }

    skips = 0
    for n in range(100):
        inpp = input_generator()
        desired = rand(7, 10)
        forward1 = full.forw(inpp)
        loss1, deriv1 = mean_squared_loss(
            prediction=forward1['output'], truth=desired)
        derivatives = full.back(
            {'output': deriv1}, forward1, list(inpp.keys()))

        # Pick a random tensor and a random coordinate within it.
        r = np.random.choice(list(derivatives.keys()))
        indiv = inpp[r].copy()
        random_point = tuple(floor(dim * random()) for dim in indiv.shape)
        this_deriv = derivatives[r][random_point]
        if np.abs(this_deriv) < 0.001:
            skips += 1
            continue

        # Nudge that single coordinate against the gradient and verify the
        # loss drops by roughly LR times the partial derivative
        # (first-order Taylor approximation).
        LR = 0.001
        indiv[random_point] = indiv[random_point] - LR
        inpp[r] = indiv
        forward2 = full.forw(inpp)
        loss2, deriv2 = mean_squared_loss(
            prediction=forward2['output'], truth=desired)
        amount = loss1 - loss2
        assert (np.isclose(amount, this_deriv * LR, atol=0.01)
                or (loss1 == 0.0 and loss2 == 0.0))
    assert skips < 50
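
# The test above leans on the first-order identity
# loss(x - LR*e_i) ≈ loss(x) - LR * dloss/dx_i. A standalone numpy sketch of
# the same check (a hypothetical helper, not part of the library) against a
# function whose gradient is known analytically:
def sketch_finite_difference_check():
    x = np.random.rand(5)
    loss = lambda v: np.sum(v ** 2)  # analytic gradient: 2 * v
    grad = 2 * x
    idx = np.random.randint(len(x))
    LR = 0.001
    x2 = x.copy()
    x2[idx] -= LR
    # The drop in loss should match LR times the chosen partial derivative.
    assert np.isclose(loss(x) - loss(x2), LR * grad[idx], atol=1e-4)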
def test_names_manual():
    i = Identity([], name='input')
    iw = Identity([], name='fc_w1')
    ib = Identity([], name='fc_b1')
    h1 = Relu([MatrixAdd([MatrixMult([i, iw], name='mult1'), ib],
                         name='add1')], name='h1')
    iw2 = Identity([], name='fc_w2')
    ib2 = Identity([], name='fc_b2')
    h2 = Relu([MatrixAdd([MatrixMult([h1, iw2]), ib2])], name='h2')
    output = Probabilize(Exponent(h2))
    full = output

    includes = full.get_names()
    should_include = [
        'input', 'fc_w1', 'fc_b1', 'h1', 'fc_b2', 'fc_w2', 'h2']
    for name in should_include:
        assert name in includes

    predecessors = full.get_inputs_required_for(['h1'])
    assert len(predecessors) == 3
    should_include = ['input', 'fc_w1', 'fc_b1']
    for name in should_include:
        assert name in predecessors

    requires_input = full.get_inputs()
    assert len(requires_input) == 5

    for _ in range(200):
        i = np.random.rand(10, 21)
        w1 = np.random.rand(21, 13)
        b1 = np.random.rand(13)
        i_dict = {'input': i, 'fc_w1': w1, 'fc_b1': b1}
        results = full.forw(i_dict, ['h1'])
        desired_h1 = np.random.rand(*results['h1'].shape)
        old_loss, deriv = mean_squared_loss(
            prediction=results['h1'], truth=desired_h1)

        # Two SGD steps on fc_w1/fc_b1 against an 'h1' target...
        for __ in range(2):
            i_dict = {'input': i, 'fc_w1': w1, 'fc_b1': b1}
            results = full.forw(i_dict, ['h1'])
            loss, deriv = mean_squared_loss(
                prediction=results['h1'], truth=desired_h1)
            back_derivs = full.back(
                {'h1': deriv}, results, ['fc_w1', 'fc_b1'])
            w1 = w1 - back_derivs['fc_w1'] * 0.001
            b1 = b1 - back_derivs['fc_b1'] * 0.001

        # ...must reduce the loss.
        i_dict = {'input': i, 'fc_w1': w1, 'fc_b1': b1}
        results = full.forw(i_dict, ['h1'])
        new_loss, deriv = mean_squared_loss(
            prediction=results['h1'], truth=desired_h1)
        assert new_loss < old_loss
def get_layer(prev, weight_name, bias_name):
    iw = Parameter(weight_name)
    ib = Parameter(bias_name)
    mult = MatrixMult([prev, iw], name='mult_' + weight_name)
    add = MatrixAdd([mult, ib], name='add_' + bias_name)
    return Relu(add)
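
# A small sketch (not from the original suite) of composing get_layer into a
# two-layer MLP; the Input/Parameter names here are illustrative only.
def sketch_mlp_from_get_layer():
    i = Input('input')
    h1 = get_layer(i, 'fc_w1', 'fc_b1')
    h2 = get_layer(h1, 'fc_w2', 'fc_b2')
    return Exponent(h2, name='output')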
def test_basic_rnn():
    i = Identity([], name='input')
    fcw1 = Identity([], name='fc_w1')
    fcb1 = Identity([], name='fc_b1')
    ii = Identity([], name='prior_h1')
    joined = Concat([i, ii])
    h1 = LeakyRelu(MatrixAdd([MatrixMult([joined, fcw1]), fcb1]),
                   name='internal_h1')
    h11 = Identity([h1], name='h1')
    fcw2 = Identity([], name='fc_w2')
    fcb2 = Identity([], name='fc_b2')
    i2 = Identity([], name='prior_h2')
    joined2 = Concat([h1, i2])
    h2 = LeakyRelu(MatrixAdd([MatrixMult([joined2, fcw2]), fcb2]),
                   name='internal_h2')
    h22 = Identity([h2], name='h2')
    fcw3 = Identity([], name='fc_w3')
    fcb3 = Identity([], name='fc_b3')
    output = LeakyRelu(MatrixAdd([MatrixMult([h2, fcw3]), fcb3]),
                       name='output')

    BN = 4
    T = 15
    NUM = 3
    rnn = to_rnn(output)
    H_SIZE = 13
    weights = {
        'fc_w1': 0.2 * (np.random.rand(3 + H_SIZE, H_SIZE) - 0.5),
        'fc_b1': 0.2 * (np.random.rand(H_SIZE) - 0.5),
        'fc_w2': 0.2 * (np.random.rand(H_SIZE + H_SIZE, H_SIZE) - 0.5),
        'fc_b2': 0.2 * (np.random.rand(H_SIZE) - 0.5),
        'fc_w3': 0.2 * (np.random.rand(H_SIZE, NUM) - 0.5),
        'fc_b3': 0.2 * (np.random.rand(NUM) - 0.5),
    }

    first_loss = None
    last_loss = None
    for epoch in range(300):
        forward_data = alt_patterns()
        forward = rnn.forw({'input': forward_data}, weights, {
            'h1': np.zeros((BN, H_SIZE)),
            'h2': np.zeros((BN, H_SIZE))
        })

        # Next-step prediction: the output at step ii is scored against the
        # input at step ii + 1; the final step has no target, so it gets a
        # zero derivative.
        losses = []
        derivs = []
        for ii in range(0, T - 1):
            loss, deriv = mean_squared_loss(
                prediction=forward[ii]['output'],
                truth=forward_data[:, ii + 1, :])
            derivs.append({'output': deriv})
            losses.append(loss)
        derivs.append({'output': np.zeros(derivs[0]['output'].shape)})

        print("Loss at ", epoch, " is ", sum(losses))
        if first_loss is None:
            first_loss = sum(losses)
        last_loss = sum(losses)

        backwards = rnn.back(forward, derivs, [
            'fc_w1', 'fc_w2', 'fc_w3',
            'fc_b1', 'fc_b2', 'fc_b3',
            'prior_h1', 'prior_h2'
        ])
        for key in weights.keys():
            weights[key] = weights[key] - 0.005 * backwards[key]

    assert last_loss * 3 < first_loss