def __init__(self, input_shape, pool_shape, num_classes, name=None):
    self.input_shape = input_shape
    self.batch_size, self.h, self.w, self.fin = self.input_shape
    self.pool_shape = pool_shape
    self.num_classes = num_classes
    self.name = name

    l1 = AvgPool(size=self.input_shape, ksize=self.pool_shape, strides=self.pool_shape, padding='SAME')
    l2_input_shape = l1.output_shape()
    l2 = ConvToFullyConnected(input_shape=l2_input_shape)
    l3_input_shape = l2.output_shape()
    l3 = FullyConnected(input_shape=l3_input_shape, size=self.num_classes, init='alexnet', activation=Linear(), bias=0., name=self.name)

    self.B = Model(layers=[l1, l2, l3])
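# Hedged sketch (NumPy, assumed names rather than the repo's own layer classes):
# the feedback model B built above is just "average-pool -> flatten -> linear
# readout", so a conv activation AO of shape [batch, h, w, fin] is mapped to
# class scores of shape [batch, num_classes]; softmax(scores) - Y then gives a
# local error with the same shape as the labels. Assumes h and w divide evenly
# by the pool width.
import numpy as np

def local_readout_error(AO, B, Y, pool=4):
    # AO: [batch, h, w, fin]; B: [(h//pool)*(w//pool)*fin, num_classes]; Y: one-hot labels
    b, h, w, fin = AO.shape
    pooled = AO.reshape(b, h // pool, pool, w // pool, pool, fin).mean(axis=(2, 4))
    flat = pooled.reshape(b, -1)
    scores = flat @ B
    probs = np.exp(scores - scores.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    return probs - Y   # local error fed back into the conv layer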
l4 = MaxPool(size=[batch_size, 27, 27, 256], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="VALID")
l5 = FeedbackConv(size=[batch_size, 13, 13, 256], num_classes=1000, sparse=args.sparse, rank=args.rank, name='conv2_fb')

l6 = Convolution(input_shape=[batch_size, 13, 13, 256], filter_sizes=[3, 3, 256, 384], init=args.init, activation=act, bias=args.bias, load=weights_conv, name='conv3', train=train_conv)
l7 = FeedbackConv(size=[batch_size, 13, 13, 384], num_classes=1000, sparse=args.sparse, rank=args.rank, name='conv3_fb')

l8 = Convolution(input_shape=[batch_size, 13, 13, 384], filter_sizes=[3, 3, 384, 384], init=args.init, activation=act, bias=args.bias, load=weights_conv, name='conv4', train=train_conv)
l9 = FeedbackConv(size=[batch_size, 13, 13, 384], num_classes=1000, sparse=args.sparse, rank=args.rank, name='conv4_fb')

l10 = Convolution(input_shape=[batch_size, 13, 13, 384], filter_sizes=[3, 3, 384, 256], init=args.init, activation=act, bias=args.bias, load=weights_conv, name='conv5', train=train_conv)
l11 = MaxPool(size=[batch_size, 13, 13, 256], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="VALID")
l12 = FeedbackConv(size=[batch_size, 6, 6, 256], num_classes=1000, sparse=args.sparse, rank=args.rank, name='conv5_fb')

l13 = ConvToFullyConnected(input_shape=[6, 6, 256])

l14 = FullyConnected(input_shape=6*6*256, size=4096, init=args.init, activation=act, bias=args.bias, load=weights_fc, name='fc1', train=train_fc)
l15 = Dropout(rate=dropout_rate)
l16 = FeedbackFC(size=[6*6*256, 4096], num_classes=1000, sparse=args.sparse, rank=args.rank, name='fc1_fb')

l17 = FullyConnected(input_shape=4096, size=4096, init=args.init, activation=act, bias=args.bias, load=weights_fc, name='fc2', train=train_fc)
l18 = Dropout(rate=dropout_rate)
l19 = FeedbackFC(size=[4096, 4096], num_classes=1000, sparse=args.sparse, rank=args.rank, name='fc2_fb')

l20 = FullyConnected(input_shape=4096, size=1000, init=args.init, bias=args.bias, load=weights_fc, name='fc3', train=train_fc)

###############################################################

model = Model(layers=[l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20])

predict = tf.nn.softmax(model.predict(X=features))
weights = model.get_weights()
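# Hedged sketch (NumPy, assumed names): the FeedbackFC / FeedbackConv layers
# interleaved above implement direct feedback alignment: instead of
# backpropagating the error through the transposed forward weights, each hidden
# layer receives the output error e projected through a fixed random matrix B.
import numpy as np

def dfa_hidden_update(e, B, h_pre, a_prev, lr=1e-4):
    # e:      [batch, num_classes] output error (softmax - one-hot labels)
    # B:      [num_classes, hidden_dim] fixed random feedback matrix
    # h_pre:  [batch, hidden_dim] pre-activation of the hidden layer (ReLU assumed)
    # a_prev: [batch, prev_dim] activation feeding into the hidden layer
    delta = (e @ B) * (h_pre > 0)           # feedback projection times ReLU derivative
    grad_W = a_prev.T @ delta / e.shape[0]  # average over the batch
    return -lr * grad_W                     # weight update for the hidden layer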
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--alpha', type=float, default=1e-4)
    parser.add_argument('--beta', type=float, default=1e-4)          # feedback weights, B, learning rate
    parser.add_argument('--sigma', type=float, default=0.1)          # node pert standard deviation
    parser.add_argument('--l2', type=float, default=0.)
    parser.add_argument('--decay', type=float, default=1.)
    parser.add_argument('--eps', type=float, default=1e-5)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--act', type=str, default='tanh')
    parser.add_argument('--bias', type=float, default=0.1)
    parser.add_argument('--gpu', type=int, default=1)
    parser.add_argument('--dfa', type=int, default=1)
    parser.add_argument('--feedbacklearning', type=int, default=1)   # whether or not to learn feedback weights
    parser.add_argument('--sparse', type=int, default=0)
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--init', type=str, default="sqrt_fan_in")
    parser.add_argument('--opt', type=str, default="adam")
    parser.add_argument('--N', type=int, default=50)
    parser.add_argument('--save', type=int, default=0)
    parser.add_argument('--name', type=str, default="cifar10_conv_np")
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    if args.gpu >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    cifar10 = tf.keras.datasets.cifar10.load_data()

    ##############################################

    EPOCHS = args.epochs
    TRAIN_EXAMPLES = 50000
    TEST_EXAMPLES = 10000
    BATCH_SIZE = args.batch_size

    if args.act == 'tanh':
        act = Tanh()
    elif args.act == 'relu':
        act = Relu()
    else:
        assert (False)

    train_fc = True
    if args.load:
        train_conv = False
    else:
        train_conv = True

    weights_fc = None
    weights_conv = args.load

    # Set up the hyperparameter search space.
    attrs = ['sigma', 'alpha', 'beta']
    log_scale = [True, True, True]
    ranges = [[-4, -1], [-6, -3], [-6, -3]]
    params = []
    isnan = []
    train_accs = []
    test_accs = []

    # Run a number of trials, each with different randomly drawn parameters.
    for idx in range(args.N):
        # Choose some random parameters for this trial.
        param = set_random_hyperparameters(args, attrs, ranges, log_scale)
        params.append(param)

        if args.feedbacklearning == 0:
            args.beta = 0

        # Report the parameters for this trial.
        print('Alpha, beta, sigma are: ', args.alpha, args.beta, args.sigma)

        tf.set_random_seed(0)
        tf.reset_default_graph()

        batch_size = tf.placeholder(tf.int32, shape=())
        dropout_rate = tf.placeholder(tf.float32, shape=())
        learning_rate = tf.placeholder(tf.float32, shape=())
        sigma = tf.placeholder(tf.float32, shape=(), name="Sigma")

        X = tf.placeholder(tf.float32, [None, 32, 32, 3])
        X = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), X)
        Y = tf.placeholder(tf.float32, [None, 10])

        l0 = Convolution(input_sizes=[batch_size, 32, 32, 3], filter_sizes=[5, 5, 3, 96], num_classes=10, init_filters=args.init, strides=[1, 1, 1, 1], padding="SAME", alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name='conv1', load=weights_conv, train=train_conv)
        l1 = MaxPool(size=[batch_size, 32, 32, 96], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME")

        # Add a perturbation to the activity so the perturbed output can be used to train the feedback weights.
        l2p = NodePert(size=[batch_size, 16, 16, 96], sigma=sigma)
        l2 = FeedbackConv(size=[batch_size, 16, 16, 96], num_classes=10, sparse=args.sparse, rank=args.rank, name='conv1_fb')

        l3 = Convolution(input_sizes=[batch_size, 16, 16, 96], filter_sizes=[5, 5, 96, 128], num_classes=10, init_filters=args.init, strides=[1, 1, 1, 1], padding="SAME", alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name='conv2', load=weights_conv, train=train_conv)
        l4 = MaxPool(size=[batch_size, 16, 16, 128], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
        l5p = NodePert(size=[batch_size, 8, 8, 128], sigma=sigma)
        l5 = FeedbackConv(size=[batch_size, 8, 8, 128], num_classes=10, sparse=args.sparse, rank=args.rank, name='conv2_fb')

        l6 = Convolution(input_sizes=[batch_size, 8, 8, 128], filter_sizes=[5, 5, 128, 256], num_classes=10, init_filters=args.init, strides=[1, 1, 1, 1], padding="SAME", alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name='conv3', load=weights_conv, train=train_conv)
        l7 = MaxPool(size=[batch_size, 8, 8, 256], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
        l8p = NodePert(size=[batch_size, 4, 4, 256], sigma=sigma)
        l8 = FeedbackConv(size=[batch_size, 4, 4, 256], num_classes=10, sparse=args.sparse, rank=args.rank, name='conv3_fb')

        l9 = ConvToFullyConnected(shape=[4, 4, 256])

        l10p = NodePert(size=[batch_size, 4 * 4 * 256], sigma=sigma)
        l10 = FullyConnected(size=[4 * 4 * 256, 2048], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name='fc1', load=weights_fc, train=train_fc)
        l11 = Dropout(rate=dropout_rate)
        l12 = FeedbackFC(size=[4 * 4 * 256, 2048], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc1_fb')

        l13p = NodePert(size=[batch_size, 2048], sigma=sigma)
        l13 = FullyConnected(size=[2048, 2048], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name='fc2', load=weights_fc, train=train_fc)
        l14 = Dropout(rate=dropout_rate)
        l15 = FeedbackFC(size=[2048, 2048], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc2_fb')

        l16 = FullyConnected(size=[2048, 10], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=Linear(), bias=args.bias, last_layer=True, name='fc3', load=weights_fc, train=train_fc)

        ##############################################

        model = Model(layers=[l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16])
        model_perturbed = Model(layers=[l0, l1, l2p, l2, l3, l4, l5p, l5, l6, l7, l8p, l8, l9, l10p, l10, l11, l12, l13p,
                                        l13, l14, l15, l16])

        predict = model.predict(X=X)
        predict_perturbed = model_perturbed.predict(X=X)

        #######

        # Pairs of perturbations and feedback weights:
        # feedbackpairs = [[l2p, l2], [l5p, l5], [l8p, l8], [l10p, l12], [l13p, l15]]

        # Test one at a time... this works, so it must be the (l10p, l12) pair that fails.
        feedbackpairs = [[l2p, l2], [l5p, l5], [l8p, l8], [l13p, l15]]

        # Get the noise, the feedback matrices, and the perturbed and unperturbed loss functions,
        # to build the update rule for the feedback weights.
        loss = tf.reduce_sum(tf.pow(tf.nn.softmax(predict) - Y, 2), 1) / 2
        loss_perturbed = tf.reduce_sum(tf.pow(tf.nn.softmax(predict_perturbed) - Y, 2), 1) / 2

        train_B = []
        E = tf.nn.softmax(predict) - Y
        for idx, (noise, feedback) in enumerate(feedbackpairs):
            print(idx, batch_size, feedback.output_size)
            xi = tf.reshape(noise.get_noise(), (batch_size, feedback.output_size))
            B = feedback.B
            lambd = tf.matmul(tf.diag(loss_perturbed - loss) / args.sigma / args.sigma, xi)
            np_error = tf.matmul(E, B) - lambd
            grad_B = tf.matmul(tf.transpose(E), np_error)
            new_B = B.assign(B - args.beta * grad_B)
            train_B.append(new_B)

        #######

        weights = model.get_weights()

        if args.opt == "adam" or args.opt == "rms" or args.opt == "decay":
            if args.dfa:
                grads_and_vars = model.dfa_gvs(X=X, Y=Y)
            else:
                grads_and_vars = model.gvs(X=X, Y=Y)

            if args.opt == "adam":
                train = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=args.eps).apply_gradients(grads_and_vars=grads_and_vars)
            elif args.opt == "rms":
                train = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.99, epsilon=args.eps).apply_gradients(grads_and_vars=grads_and_vars)
            elif args.opt == "decay":
                train = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).apply_gradients(grads_and_vars=grads_and_vars)
            else:
                assert (False)
        else:
            if args.dfa:
                train = model.dfa(X=X, Y=Y)
            else:
                train = model.train(X=X, Y=Y)

        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
        total_correct = tf.reduce_sum(tf.cast(correct, tf.float32))

        ##############################################

        sess = tf.InteractiveSession()
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()

        (x_train, y_train), (x_test, y_test) = cifar10

        x_train = x_train.reshape(TRAIN_EXAMPLES, 32, 32, 3)
        y_train = keras.utils.to_categorical(y_train, 10)

        x_test = x_test.reshape(TEST_EXAMPLES, 32, 32, 3)
        y_test = keras.utils.to_categorical(y_test, 10)

        ##############################################

        filename = args.name + '.results'
        f = open(filename, "w")
        f.write(filename + "\n")
        f.write("total params: " + str(model.num_params()) + "\n")
        f.close()

        ##############################################

        for ii in range(EPOCHS):
            if args.opt == 'decay' or args.opt == 'gd':
                decay = np.power(args.decay, ii)
                lr = args.alpha * decay
            else:
                lr = args.alpha

            print(ii)

            #############################

            _count = 0
            _total_correct = 0

            # The training loop... here we also update the feedback weights with the node perturbation step.
            for jj in range(int(TRAIN_EXAMPLES / BATCH_SIZE)):
                xs = x_train[jj * BATCH_SIZE:(jj + 1) * BATCH_SIZE]
                ys = y_train[jj * BATCH_SIZE:(jj + 1) * BATCH_SIZE]
                _correct, _ = sess.run([total_correct, train], feed_dict={sigma: 0.0, batch_size: BATCH_SIZE, dropout_rate: args.dropout, learning_rate: lr, X: xs, Y: ys})

                # Add a step to update B...
                _ = sess.run([train_B], feed_dict={sigma: args.sigma, batch_size: BATCH_SIZE, dropout_rate: args.dropout, learning_rate: lr, X: xs, Y: ys})

                _total_correct += _correct
                _count += BATCH_SIZE

            train_acc = 1.0 * _total_correct / _count
            train_accs.append(train_acc)

            #############################

            _count = 0
            _total_correct = 0

            for jj in range(int(TEST_EXAMPLES / BATCH_SIZE)):
                xs = x_test[jj * BATCH_SIZE:(jj + 1) * BATCH_SIZE]
                ys = y_test[jj * BATCH_SIZE:(jj + 1) * BATCH_SIZE]
                _correct = sess.run(total_correct, feed_dict={sigma: 0.0, batch_size: BATCH_SIZE, dropout_rate: 0.0, learning_rate: 0.0, X: xs, Y: ys})

                _total_correct += _correct
                _count += BATCH_SIZE

            test_acc = 1.0 * _total_correct / _count
            test_accs.append(test_acc)

            isnan.append(None)

            # try:
            #     trainer.train()
            # except ValueError:
            #     print("Method fails to converge for these parameters")
            #     isnan[n, m] = 1

            # Save results...

            #############################

            print("train acc: %f test acc: %f" % (train_acc, test_acc))

            f = open(filename, "a")
            f.write("train acc: %f test acc: %f\n" % (train_acc, test_acc))
            f.close()

        # Save params after each run.
        fn = "./cifar10_conv_np_hyperparam_search_varalpha_septsearch_2_dfa_%d_fblearning_%d.npz" % (args.dfa, args.feedbacklearning)
        to_save = {
            'attr': attrs,
            'params': params,
            'train_accs': train_accs,
            'test_accs': test_accs,
            'isnan': isnan
        }
        pickle.dump(to_save, open(fn, "wb"))
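# Hedged sketch (NumPy, assumed names) of the feedback-weight update built in
# the train_B loop above: the perturbation-based estimate of the layer's error,
# lambda = ((L_perturbed - L) / sigma^2) * xi, is compared with the feedback
# projection E @ B, and B is nudged to reduce the mismatch.
import numpy as np

def node_pert_feedback_update(E, B, xi, loss, loss_perturbed, sigma, beta):
    # E:    [batch, num_classes] output error (softmax - labels)
    # B:    [num_classes, layer_dim] feedback matrix being learned
    # xi:   [batch, layer_dim] noise injected at this layer
    # loss, loss_perturbed: per-example losses without / with the noise
    lambd = ((loss_perturbed - loss) / sigma ** 2)[:, None] * xi
    np_error = E @ B - lambd
    grad_B = E.T @ np_error
    return B - beta * grad_B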
tf.set_random_seed(0)
tf.reset_default_graph()

batch_size = tf.placeholder(tf.int32, shape=())
dropout_rate = tf.placeholder(tf.float32, shape=())
lr = tf.placeholder(tf.float32, shape=())

X = tf.placeholder(tf.float32, [None, 32, 32, 3])
Y = tf.placeholder(tf.float32, [None, 10])

l0 = ConvToFullyConnected(input_shape=[32, 32, 3])
l1 = Dropout(rate=0.1)

l2 = FullyConnected(input_shape=3072, size=1000, init=args.init, activation=act, bias=args.bias, name='fc1')
l3 = Dropout(rate=dropout_rate)
l4 = FeedbackFC(size=[3072, 1000], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc1_fb')

l5 = FullyConnected(input_shape=1000, size=1000, init=args.init, activation=act, bias=args.bias, name='fc2')
tf.set_random_seed(0)
tf.reset_default_graph()

batch_size = tf.placeholder(tf.int32, shape=())
dropout_rate = tf.placeholder(tf.float32, shape=())
learning_rate = tf.placeholder(tf.float32, shape=())

Y = tf.placeholder(tf.float32, [None, 10])
X = tf.placeholder(tf.float32, [None, 3072])

l0 = Dropout(rate=dropout_rate / 5.)
l1 = FullyConnected(size=[3072, 1000], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=act, bias=args.bias, last_layer=False, name="fc1")
l2 = Dropout(rate=dropout_rate)
l3 = FeedbackFC(size=[3072, 1000], num_classes=10, sparse=args.sparse, rank=args.rank, name="fc1_fb")

l4 = FullyConnected(size=[1000, 1000], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=act,
                 name='conv2')
l3 = MaxPool(size=[batch_size, 28, 28, 64], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
l4 = FeedbackConv(size=[batch_size, 14, 14, 64], num_classes=10, sparse=args.sparse, rank=args.rank, name='conv2_fb')

l5 = ConvToFullyConnected(input_shape=[14, 14, 64])

l6 = FullyConnected(input_shape=14 * 14 * 64, size=128, init=args.init, activation=act, bias=args.bias, name='fc1')
l7 = Dropout(rate=dropout_rate)
l8 = FeedbackFC(size=[14 * 14 * 64, 128], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc1_fb')

l9 = FullyConnected(input_shape=128, size=10, init=args.init, bias=args.bias, name='fc2')
class LELPPO(Layer):

    def __init__(self, input_shape, pool_shape, nactions, name=None):
        self.input_shape = input_shape
        self.batch_size, self.h, self.w, self.fin = self.input_shape
        self.pool_shape = pool_shape
        self.nactions = nactions
        self.name = name
        self.action_name = self.name + '_action'
        self.value_name = self.name + '_value'
        self.nlp_name = self.name + '_nlp'

        self.pool = AvgPool(size=self.input_shape, ksize=self.pool_shape, strides=self.pool_shape, padding='SAME')
        l2_input_shape = self.pool.output_shape()
        self.conv2fc = ConvToFullyConnected(input_shape=l2_input_shape)
        l3_input_shape = self.conv2fc.output_shape()
        self.actions = FullyConnected(input_shape=l3_input_shape, size=self.nactions, init='alexnet', name=self.name + '_actions')
        self.values = FullyConnected(input_shape=l3_input_shape, size=1, init='alexnet', name=self.name + '_values')

        ####################################################

        self.logits_bias = tf.Variable(np.zeros(shape=(self.batch_size, self.nactions)), dtype=tf.float32)
        self.values_bias = tf.Variable(np.zeros(shape=(self.batch_size, 1)), dtype=tf.float32)

        # self.actions_model = Model(layers=[l1, l2, actions])
        # self.values_model = Model(layers=[l1, l2, values])

        ####################################################

        self.advantages = tf.placeholder("float", [None])
        self.rewards = tf.placeholder("float", [None])
        self.old_actions = tf.placeholder("int32", [None])
        self.old_values = tf.placeholder("float", [None])
        self.old_nlps = tf.placeholder("float", [None])

        ####################################################

    def get_weights(self):
        return []

    def output_shape(self):
        return self.input_shape

    def num_params(self):
        return 0

    def place_holders(self):
        place_holders_dict = {}
        place_holders_dict[self.name + '_advantages'] = self.advantages
        place_holders_dict[self.name + '_rewards'] = self.rewards
        place_holders_dict[self.name + '_old_actions'] = self.old_actions
        place_holders_dict[self.name + '_old_values'] = self.old_values
        place_holders_dict[self.name + '_old_nlps'] = self.old_nlps
        return place_holders_dict

    ###################################################################

    def forward(self, X):
        return X

    def predict(self, X):
        # [logits, logits_forward] = self.actions_model.forward(X)
        # [values, values_forward] = self.values_model.forward(X)

        pool = self.pool.forward(X)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        values = tf.reshape(values, (-1,))
        actions = sample(logits)
        nlps = neg_log_prob(logits, actions)

        # states, rewards, advantages, old_actions, old_values, old_nlps
        cache = {self.action_name: actions, self.value_name: values, self.nlp_name: nlps}
        return X, cache

    ###################################################################

    def backward(self, AI, AO, DO):
        return DO

    def gv(self, AI, AO, DO):
        return []

    ###################################################################

    def dfa_backward(self, AI, AO, E, DO):
        return DO

    def dfa_gv(self, AI, AO, E, DO):
        return []

    ###################################################################

    def lel_backward(self, AI, AO, DO, cache):
        pool = self.pool.forward(AI)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        # [logits, logits_forward] = self.actions_model.forward(AI)
        # [values, values_forward] = self.values_model.forward(AI)

        logits = logits + self.logits_bias
        values = values + self.values_bias
        values = tf.reshape(values, (-1,))

        nlps = neg_log_prob(logits, self.old_actions)
        ratio = tf.exp(nlps - self.old_nlps)
        ratio = tf.clip_by_value(ratio, 0, 10)
        surr1 = self.advantages * ratio
        surr2 = self.advantages * tf.clip_by_value(ratio, 1 - epsilon_decay, 1 + epsilon_decay)
        policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        entropy_loss = -policy_entropy(logits)

        clipped_value_estimate = self.old_values + tf.clip_by_value(values - self.old_values, -epsilon_decay, epsilon_decay)
        value_loss_1 = tf.squared_difference(clipped_value_estimate, self.rewards)
        value_loss_2 = tf.squared_difference(values, self.rewards)
        value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))

        ###################################################################

        loss = policy_loss + 0.01 * entropy_loss + 1. * value_loss

        # grads = tf.gradients(loss, [self.logits_bias, self.values_bias] + self.params)
        grads = tf.gradients(loss, [self.logits_bias, self.values_bias])
        do_logits = grads[0]
        do_values = grads[1]

        # We never call forward in lel until backward... forward just returns X.
        # This actually works out nicely; perhaps we don't need a cache, and a few
        # cheap redundant computations aren't so bad.

        dlogits = self.actions.backward(conv2fc, logits, do_logits)
        dvalues = self.values.backward(conv2fc, values, do_values)
        dconv2fc = self.conv2fc.backward(pool, conv2fc, dlogits + dvalues)
        dpool = self.pool.backward(AI, pool, dconv2fc)

        return dpool

    def lel_gv(self, AI, AO, DO, cache):
        pool = self.pool.forward(AI)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        # [logits, logits_forward] = self.actions_model.forward(AI)
        # [values, values_forward] = self.values_model.forward(AI)

        logits = logits + self.logits_bias
        values = values + self.values_bias
        values = tf.reshape(values, (-1,))

        nlps = neg_log_prob(logits, self.old_actions)
        ratio = tf.exp(nlps - self.old_nlps)
        ratio = tf.clip_by_value(ratio, 0, 10)
        surr1 = self.advantages * ratio
        surr2 = self.advantages * tf.clip_by_value(ratio, 1 - epsilon_decay, 1 + epsilon_decay)
        policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        entropy_loss = -policy_entropy(logits)

        clipped_value_estimate = self.old_values + tf.clip_by_value(values - self.old_values, -epsilon_decay, epsilon_decay)
        value_loss_1 = tf.squared_difference(clipped_value_estimate, self.rewards)
        value_loss_2 = tf.squared_difference(values, self.rewards)
        value_loss = 0.5 * tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))

        ###################################################################

        loss = policy_loss + 0.01 * entropy_loss + 1. * value_loss

        # grads = tf.gradients(loss, [self.logits_bias, self.values_bias] + self.params)
        grads = tf.gradients(loss, [self.logits_bias, self.values_bias])
        do_logits = grads[0]
        do_values = grads[1]

        # We never call forward in lel until backward... forward just returns X.
        # This actually works out nicely; perhaps we don't need a cache, and a few
        # cheap redundant computations aren't so bad.

        gvs = []
        dlogits = self.actions.gv(conv2fc, logits, do_logits)
        dvalues = self.values.gv(conv2fc, values, do_values)
        # dconv2fc = self.conv2fc.backward(pool, conv2fc, dlogits + dvalues)
        # dpool = self.pool.backward(AI, pool, dconv2fc)
        gvs.extend(dlogits + dvalues)

        return gvs
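# Hedged sketch (NumPy, assumed names): the clipped surrogate objective used in
# lel_backward/lel_gv above, written with log-probabilities. For each sample it
# takes the minimum of the unclipped and clipped importance-weighted advantage,
# so updates that push the policy ratio outside [1 - eps, 1 + eps] get no extra
# credit.
import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, advantages, eps=0.2):
    # logp_new, logp_old: log-probabilities of the taken actions under the new / old policy
    ratio = np.exp(logp_new - logp_old)                 # pi_new(a|s) / pi_old(a|s)
    surr1 = advantages * ratio
    surr2 = advantages * np.clip(ratio, 1 - eps, 1 + eps)
    return -np.mean(np.minimum(surr1, surr2))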
tf.set_random_seed(0)
tf.reset_default_graph()

batch_size = tf.placeholder(tf.int32, shape=())
dropout_rate = tf.placeholder(tf.float32, shape=())
lr = tf.placeholder(tf.float32, shape=())

X = tf.placeholder(tf.float32, [None, 28, 28, 1])
X = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), X)
Y = tf.placeholder(tf.float32, [None, 10])

l0 = ConvToFullyConnected(input_shape=[28, 28, 1])

l1 = FullyConnected(input_shape=784, size=400, init=args.init, activation=act, bias=args.bias, name='fc1')
l2 = Dropout(rate=dropout_rate)
l3 = FeedbackFC(size=[784, 400], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc1_fb')

l4 = FullyConnected(input_shape=400, size=10, init=args.init, bias=args.bias, name='fc2')
class LELFC(Layer):

    def __init__(self, input_shape, num_classes, name=None):
        self.num_classes = num_classes
        self.input_shape = input_shape
        self.name = name

        '''
        if load:
            weight_dict = np.load(load).item()
            self.B = tf.cast(tf.Variable(weight_dict[self.name]), tf.float32)
        elif std is not None:
            b = np.random.normal(loc=0., scale=std, size=(self.num_classes, self.output_size))
            self.B = tf.cast(tf.Variable(b), tf.float32)
        else:
            # var = 1. / self.output_size
            # std = np.sqrt(var)
            # b = np.random.normal(loc=0., scale=std, size=(self.num_classes, self.output_size))
            b = FeedbackMatrix(size=(self.num_classes, self.output_size), sparse=self.sparse, rank=self.rank)
            self.B = tf.cast(tf.Variable(b), tf.float32)
        '''

        # The problem was never the bias ... it was the fact that we weren't dividing by N.
        # l0 = FullyConnected(input_shape=input_shape, size=self.input_shape, init='alexnet', activation=Relu(), bias=1., name=self.name)
        self.l0 = FullyConnected(input_shape=input_shape, size=self.num_classes, init='alexnet', activation=Linear(), bias=0., name=self.name)
        # self.B = Model(layers=[l1])

    def get_weights(self):
        # return self.l0.get_weights()
        return []

    def get_feedback(self):
        return self.B

    def output_shape(self):
        return self.input_shape

    def num_params(self):
        return 0

    def forward(self, X):
        return X

    ###################################################################

    def backward(self, AI, AO, DO):
        return DO

    def gv(self, AI, AO, DO):
        return []

    def train(self, AI, AO, DO):
        return []

    ###################################################################

    def dfa_backward(self, AI, AO, E, DO):
        return DO

    def dfa_gv(self, AI, AO, E, DO):
        return []

    def dfa(self, AI, AO, E, DO):
        return []

    ###################################################################

    # > https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html
    # > https://www.ics.uci.edu/~pjsadows/notes.pdf
    # > https://deepnotes.io/softmax-crossentropy

    def lel_backward(self, AI, AO, E, DO, Y):
        '''
        S = tf.matmul(AO, tf.transpose(self.B))
        # Should be doing cross entropy here.
        # Is this right? Just adding softmax?
        ES = tf.subtract(tf.nn.softmax(S), Y)
        DO = tf.matmul(ES, self.B)
        # (* activation.gradient) and (* AI) occur in the actual layer itself.
        return DO
        '''
        # '''
        S = self.l0.forward(AI)
        ES = tf.subtract(tf.nn.softmax(S), Y)
        DI = self.l0.backward(AI, S, ES)
        # '''
        # DI = self.B.backwards(AI, Y)
        return DI

    def lel_gv(self, AI, AO, E, DO, Y):
        # '''
        S = self.l0.forward(AI)
        ES = tf.subtract(tf.nn.softmax(S), Y)
        gvs = self.l0.gv(AI, S, ES)
        # '''
        # gvs = self.B.gvs(AI, Y)
        return gvs

    def lel(self, AI, AO, E, DO, Y):
        assert (False)
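# Hedged sketch (NumPy, assumed names): lel_backward above uses the standard
# softmax cross-entropy gradient, dL/dS = softmax(S) - Y, as the local error
# that the auxiliary readout l0 pushes back into the layer below (see the
# references linked in the comments).
import numpy as np

def softmax_xent_error(S, Y):
    # S: [batch, num_classes] scores from the local readout; Y: one-hot labels
    exp = np.exp(S - S.max(axis=1, keepdims=True))
    probs = exp / exp.sum(axis=1, keepdims=True)
    return probs - Y   # gradient of the cross-entropy loss w.r.t. the scores S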
TEST_EXAMPLES = 10000
BATCH_SIZE = args.batch_size

##############################################

tf.set_random_seed(0)
tf.reset_default_graph()

batch_size = tf.placeholder(tf.int32, shape=())
dropout_rate = tf.placeholder(tf.float32, shape=())
learning_rate = tf.placeholder(tf.float32, shape=())

X = tf.placeholder(tf.float32, [None, 784])
Y = tf.placeholder(tf.float32, [None, 10])

l0 = FullyConnected(size=[784, 400], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=Tanh(), bias=args.bias, l2=args.l2, last_layer=False, name="fc1")
l1 = Dropout(rate=dropout_rate)
l2 = FeedbackFC(size=[784, 400], num_classes=10, sparse=args.sparse, rank=args.rank, name="fc1_fb")

l3 = FullyConnected(size=[400, 10], num_classes=10, init_weights=args.init, alpha=learning_rate, activation=Linear(), bias=args.bias, l2=args.l2, last_layer=True, name="fc2")

model = Model(layers=[l0, l1, l2, l3])

##############################################

predict = model.predict(X=X)
weights = model.get_weights()

if args.opt == "adam" or args.opt == "rms" or args.opt == "decay":
    if args.dfa:
                 name='conv3')
l7 = MaxPool(size=[batch_size, 8, 8, 256], ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME")
l8 = FeedbackConv(size=[batch_size, 4, 4, 256], num_classes=10, sparse=args.sparse, rank=args.rank, name='conv3_fb')

l9 = ConvToFullyConnected(input_shape=[4, 4, 256])

l10 = FullyConnected(input_shape=4 * 4 * 256, size=2048, init=args.init, activation=act, bias=args.bias, name='fc1')
l11 = Dropout(rate=dropout_rate)
l12 = FeedbackFC(size=[4 * 4 * 256, 2048], num_classes=10, sparse=args.sparse, rank=args.rank, name='fc1_fb')

l13 = FullyConnected(input_shape=2048, size=2048, init=args.init, activation=act, bias=args.bias, name='fc2')