def main():
    story_limit = 150
    epoch_batches_count = 64
    epochs_count = 1024
    lr = 1e-11
    optim = 1
    starting_epoch = -1
    bs = 32

    pgd = PreGenData(bs)

    task_dir = os.path.dirname(abspath(__file__))
    processed_data_dir = join(task_dir, 'data', "processed")
    lexicon_dictionary = pickle.load(
        open(join(processed_data_dir, 'lexicon-dict.pkl'), 'rb'))
    x = len(lexicon_dictionary)

    computer = DNC(x=x, v_t=x, bs=bs, W=64, L=64, R=32, h=256)
    # if loading a saved model:
    # computer, optim, starting_epoch = load_model(computer)
    computer = computer.cuda()

    if optim is None:
        optimizer = torch.optim.Adam(computer.parameters(), lr=lr)
    else:
        print('use Adadelta optimizer with learning rate ', lr)
        optimizer = torch.optim.Adadelta(computer.parameters(), lr=lr)

    # start with the epoch after the loaded one
    train(computer, optimizer, story_limit, bs, pgd, x,
          int(starting_epoch) + 1, epochs_count, epoch_batches_count)
def test_rnn_no_memory_pass():
    T.manual_seed(1111)

    input_size = 100
    hidden_size = 100
    rnn_type = 'gru'
    num_layers = 3
    num_hidden_layers = 5
    dropout = 0.2
    nr_cells = 12
    cell_size = 17
    read_heads = 3
    gpu_id = -1
    debug = True
    lr = 0.001
    sequence_max_length = 10
    batch_size = 10
    cuda = gpu_id
    clip = 20
    length = 13

    rnn = DNC(
        input_size=input_size,
        hidden_size=hidden_size,
        rnn_type=rnn_type,
        num_layers=num_layers,
        num_hidden_layers=num_hidden_layers,
        dropout=dropout,
        nr_cells=nr_cells,
        cell_size=cell_size,
        read_heads=read_heads,
        gpu_id=gpu_id,
        debug=debug)

    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    optimizer.zero_grad()

    input_data, target_output = generate_data(batch_size, length, input_size, cuda)
    target_output = target_output.transpose(0, 1).contiguous()

    (chx, mhx, rv) = (None, None, None)
    outputs = []
    for x in range(6):
        output, (chx, mhx, rv), v = rnn(
            input_data, (chx, mhx, rv), pass_through_memory=False)
        output = output.transpose(0, 1)
        outputs.append(output)

    output = functools.reduce(lambda x, y: x + y, outputs)
    loss = criterion(output, target_output)
    loss.backward()

    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    assert target_output.size() == T.Size([27, 10, 100])
    assert chx[0].size() == T.Size([num_hidden_layers, 10, 100])
    assert mhx['memory'].size() == T.Size([10, 12, 17])
    assert rv is None
def test_rnn_n():
    T.manual_seed(1111)

    input_size = 100
    hidden_size = 100
    rnn_type = 'rnn'
    num_layers = 3
    num_hidden_layers = 5
    dropout = 0.2
    nr_cells = 12
    cell_size = 17
    read_heads = 3
    gpu_id = -1
    debug = True
    lr = 0.001
    sequence_max_length = 10
    batch_size = 10
    cuda = gpu_id
    clip = 20
    length = 13

    rnn = DNC(
        input_size=input_size,
        hidden_size=hidden_size,
        rnn_type=rnn_type,
        num_layers=num_layers,
        num_hidden_layers=num_hidden_layers,
        dropout=dropout,
        nr_cells=nr_cells,
        cell_size=cell_size,
        read_heads=read_heads,
        gpu_id=gpu_id,
        debug=debug)

    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    optimizer.zero_grad()

    input_data, target_output = generate_data(batch_size, length, input_size, cuda)
    target_output = target_output.transpose(0, 1).contiguous()

    output, (chx, mhx, rv), v = rnn(input_data, None)
    output = output.transpose(0, 1)

    loss = criterion(output, target_output)
    loss.backward()

    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    assert target_output.size() == T.Size([27, 10, 100])
    assert chx[1].size() == T.Size([num_hidden_layers, 10, 100])
    assert mhx['memory'].size() == T.Size([10, 12, 17])
    assert rv.size() == T.Size([10, 51])
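# NOTE: both tests above rely on `criterion` and `generate_data` helpers
# defined elsewhere in the test module. The sketch below is an assumption
# reconstructed from the asserted shapes, not the original code: a copy task
# whose total sequence length is 2 * length + 1 = 27 for length = 13.
import numpy as np
import torch as T

def criterion(predictions, targets):
    # Mean-squared error over the whole sequence.
    return T.mean((predictions - targets) ** 2)

def generate_data(batch_size, length, size, cuda=-1):
    # Random binary sequence, then a delimiter flag, then silence while the
    # network must reproduce the sequence.
    input_data = np.zeros((batch_size, 2 * length + 1, size), dtype=np.float32)
    target_output = np.zeros((batch_size, 2 * length + 1, size), dtype=np.float32)
    sequence = np.random.binomial(1, 0.5, (batch_size, length, size - 1))
    input_data[:, :length, :size - 1] = sequence
    input_data[:, length, -1] = 1  # delimiter
    target_output[:, length + 1:, :size - 1] = sequence
    input_data, target_output = T.from_numpy(input_data), T.from_numpy(target_output)
    if cuda != -1:
        input_data, target_output = input_data.cuda(cuda), target_output.cuda(cuda)
    return input_data, target_output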
def _build_encoder_cell(self, hparams, num_layers, num_residual_layers, base_gpu=0):
    """Build a multi-layer RNN cell that can be used by the encoder."""
    if hparams.model == 'model3':
        if hparams.mann == 'ntm':
            return NTMCell(hparams.num_layers, hparams.num_units,
                           use_att_memory=False, att_memory=False,
                           att_memory_size=None, att_memory_vector_dim=None,
                           use_ext_memory=True,
                           ext_memory_size=hparams.num_memory_locations,
                           ext_memory_vector_dim=hparams.memory_unit_size,
                           ext_read_head_num=hparams.read_heads,
                           ext_write_head_num=hparams.write_heads,
                           dropout=hparams.dropout,
                           batch_size=hparams.batch_size,
                           mode=self.mode,
                           shift_range=1,
                           output_dim=hparams.num_units,
                           reuse=False,
                           record_w_history=hparams.record_w_history)
        elif hparams.mann == 'dnc':
            access_config = {
                'memory_size': hparams.num_memory_locations,
                'word_size': hparams.memory_unit_size,
                'num_reads': hparams.read_heads,
                'num_writes': hparams.write_heads,
            }
            controller_config = {
                'num_units': hparams.num_units,
                'num_layers': hparams.num_layers,
            }
            return DNC(access_config, controller_config, hparams.num_units,
                       20, hparams.dropout, self.mode, hparams.batch_size)
    else:
        return model_helper.create_rnn_cell(
            unit_type=hparams.unit_type,
            num_units=hparams.num_units,
            num_layers=num_layers,
            num_residual_layers=num_residual_layers,
            forget_bias=hparams.forget_bias,
            dropout=hparams.dropout,
            num_gpus=hparams.num_gpus,
            mode=self.mode,
            base_gpu=base_gpu,
            single_cell_fn=self.single_cell_fn,
            num_proj=None)
def main():
    # Set the random seed if given
    torch.manual_seed(RANDOM_SEED or torch.initial_seed())

    # Choose the dataset; it determines the sizes of the data's input and output
    dataset = RepeatCopy()  # default parameters

    # Initialize the DNC
    dnc = DNC(dataset.input_size, dataset.output_size,
              controller_config, memory_config)

    train(dnc, dataset)
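# NOTE: `controller_config` and `memory_config` are defined elsewhere in the
# original script. A plausible minimal sketch follows; every key and value
# here is an assumption for illustration, not taken from the source.
controller_config = {
    'hidden_size': 64,   # assumed controller width
    'num_layers': 1,     # assumed controller depth
}
memory_config = {
    'memory_size': 32,   # assumed number of memory slots
    'word_size': 8,      # assumed width of each slot
    'num_reads': 4,      # assumed number of read heads
    'num_writes': 1,     # assumed number of write heads
}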
def __init__(self, idim, cdim, num_heads, N, M, gpu):
    super(EncoderDNC, self).__init__()
    self.idim = idim
    self.hdim = idim
    self.rnn = DNC(
        input_size=idim,
        hidden_size=cdim,
        nr_cells=N,
        cell_size=M,
        read_heads=num_heads,
        batch_first=False,
        gpu_id=gpu)
def __init__(self, gpu_id=0, input_size=1, output_size=1, num_layers=4,
             hidden_size=128, rnn_type='gru', rnn_num_layers=2,
             rnn_hidden_size=128):
    super(ConflictMonitoringNet, self).__init__()
    self.gpu_id = gpu_id
    self.input_size = input_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.rnn_type = rnn_type
    self.rnn_num_layers = rnn_num_layers
    self.rnn_hidden_size = rnn_hidden_size
    self.cmd_regulariser = 0

    self.layers = nn.Sequential(
        CMDrop(nn.Conv1d(input_size, hidden_size, 1), nn.ReLU()),
        CMDrop(nn.Conv1d(hidden_size, hidden_size, 1), nn.ReLU()),
        CMDrop(nn.Conv1d(hidden_size, hidden_size, 1), nn.ReLU()),
        CMDrop(nn.Conv1d(hidden_size, output_size, 1)))

    if self.rnn_type == 'gru':
        self.rnn = nn.GRU(
            input_size=output_size,
            hidden_size=rnn_hidden_size,
            num_layers=rnn_num_layers,
            batch_first=True)
        self.rnn_hidden = None
    elif self.rnn_type == 'dnc':
        self.rnn = DNC(
            input_size=output_size,
            hidden_size=rnn_hidden_size,
            rnn_type='gru',
            num_layers=rnn_num_layers,
            nr_cells=10,
            cell_size=64,
            read_heads=4,
            batch_first=True,
            gpu_id=self.gpu_id)
        # (controller_hidden, memory, read_vectors)
        self.rnn_hidden = (None, None, None)

    self.rnn_output_layer = nn.Conv1d(rnn_hidden_size, num_layers, 1)
def __init__(self, vocab_size, emb_dim=64, device=torch.device('cpu:0')):
    super(DMNC, self).__init__()
    K = len(vocab_size)
    self.K = K
    self.vocab_size = vocab_size
    self.device = device
    self.token_start = vocab_size[2]
    self.token_end = vocab_size[2] + 1

    self.embeddings = nn.ModuleList([
        nn.Embedding(vocab_size[i] if i != 2 else vocab_size[2] + 2, emb_dim)
        for i in range(K)])
    self.dropout = nn.Dropout(p=0.5)

    self.encoders = nn.ModuleList([
        DNC(
            input_size=emb_dim,
            hidden_size=emb_dim,
            rnn_type='gru',
            num_layers=1,
            num_hidden_layers=1,
            nr_cells=16,
            cell_size=emb_dim,
            read_heads=1,
            batch_first=True,
            gpu_id=0,
            independent_linears=False)
        for _ in range(K - 1)])

    # input: (y, r1, r2); hidden: (hidden1, hidden2)
    self.decoder = nn.GRU(emb_dim + emb_dim * 2, emb_dim * 2, batch_first=True)
    # 2 read heads, each with (key, strength, mode)
    self.interface_weighting = nn.Linear(emb_dim * 2, 2 * (emb_dim + 1 + 3))
    self.decoder_r2o = nn.Linear(2 * emb_dim, emb_dim * 2)
    self.output = nn.Linear(emb_dim * 2, vocab_size[2] + 2)
def __init__(self, batch_size, input_seq_length, output_seq_length):
    self._encoder_inputs = [
        tf.placeholder(tf.float32, shape=[batch_size, data_point_dim],
                       name='inputs_{}'.format(i))
        for i in range(input_seq_length)]
    self._labels = [
        tf.placeholder(tf.float32, shape=[batch_size, data_point_dim],
                       name='labels_{}'.format(i))
        for i in range(output_seq_length)]
    # Decoder inputs are the labels shifted right by one, prefixed with a GO token.
    self._decoder_inputs = [
        tf.zeros_like(self._encoder_inputs[0], dtype=tf.float32, name='GO')
    ] + self._labels[:-1]

    rnn_cell = DNC(access_config, controller_config, data_point_dim, clip_value)
    model_outputs, states = legacy_seq2seq.tied_rnn_seq2seq(
        self._encoder_inputs, self._decoder_inputs, rnn_cell,
        loop_function=lambda prev, _: prev)

    self._batch_size = batch_size
    self._input_seq_length = input_seq_length
    self._output_seq_length = output_seq_length
    self._squashed_output = tf.nn.softmax(model_outputs)
    self._cost = loss_function(
        tf.reshape(self._squashed_output, [-1]),
        tf.reshape(self._labels, [-1]))
    self._step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self._cost)

    self._session = tf.Session()
    init = tf.global_variables_initializer()
    self._session.run(init)
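# NOTE: the constructor above assumes several module-level names defined
# elsewhere in the source file. The stand-ins below are assumptions for
# illustration only; the original values and loss may differ.
data_point_dim = 6   # assumed dimensionality of each sequence element
clip_value = 20      # assumed controller output clipping

access_config = {
    'memory_size': 16,
    'word_size': 16,
    'num_reads': 4,
    'num_writes': 1,
}
controller_config = {
    'hidden_size': 64,
}

def loss_function(predictions, labels):
    # A simple stand-in cost over the flattened outputs.
    return tf.reduce_mean(tf.square(predictions - labels))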
from tasks import CopyTask, RepeatCopyTask, AndTask, XorTask, MergeTask
from utils import *

INPUT_SIZE = 8
BATCH_SIZE = 32

memory = Memory(25, 6)
memory.add_head(NTMReadHead, shifts=[-1, 0, 1])
memory.add_head(NTMReadHead, shifts=[-1, 0, 1])
memory.add_head(NTMWriteHead, shifts=[-1, 0, 1])

input = tf.placeholder(tf.float32, shape=(None, None, INPUT_SIZE + 2))
# lstm = tf.nn.rnn_cell.MultiRNNCell([LSTMCell(256) for i in range(3)])
lstm = LSTMCell(100)
net = DNC(input, memory, INPUT_SIZE + 2, controller=lstm, log_memory=True)

targets = tf.placeholder(dtype=tf.float32, shape=[None, None, INPUT_SIZE + 2])
mask = tf.placeholder(dtype=tf.float32, shape=[None, None, INPUT_SIZE + 2])

output = net[0]
loss = tf.losses.sigmoid_cross_entropy(
    logits=output, weights=mask, multi_class_labels=targets)
cost = tf.reduce_sum(
    mask * ((1 - targets * (1 - tf.exp(-output))) * tf.sigmoid(output))
) / BATCH_SIZE

opt = tf.train.RMSPropOptimizer(1e-4, momentum=0.9)
train = minimize_and_clip(opt, loss)

img_summary = [
    tf.summary.image(key, concate_to_image(net[2][key]), max_outputs=1)
    for key in net[2]]
img_summary += [tf.summary.image("IO/input",
                                 concate_to_image(input), max_outputs=1)]
img_summary += [tf.summary.image("IO/targets",
                                 concate_to_image(targets), max_outputs=1)]
img_summary += [tf.summary.image("IO/output",
                                 tf.sigmoid(concate_to_image(net[0])), max_outputs=1)]
img_summary += [tf.summary.image("IO/output x mask",
                                 concate_to_image(tf.sigmoid(net[0]) * mask),
                                 max_outputs=1)]
img_summary = tf.summary.merge(img_summary)
# Step 01: Load configuration
try:
    with open(CONFIG_FILE, 'r') as fp:
        config = attrdict.AttrDict(yaml.safe_load(fp))
except IOError:
    log.error('Could not load configuration file: {}'.format(CONFIG_FILE))
    sys.exit(1)

# Step 02: Load the training and testing data
log.info('Loading data')
try:
    data = BabiDatasetLoader.load(
        config.data.cache_dir, config.data.data_dir, config.dataset)
    if not data:
        log.error('Could not load or reprocess the data. Aborting')
        sys.exit(1)
except IOError as exc:
    log.error('Failed to load the bAbI data set')
    print(exc)
    sys.exit(1)

# Step 03: Train the model
dnc = DNC(data, config.model)
dnc.build(config.model)
def sum2_task_single(args):
    dirname = os.path.dirname(os.path.abspath(__file__)) + \
        '/data/save/sum2/{}'.format(args.name)
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    print(dirname)
    ckpts_dir = dirname

    batch_size = 50
    vocab_lower = 2
    vocab_upper = 150
    length_from = 1
    length_to = args.seq_len
    input_size = vocab_upper
    output_size = vocab_upper
    words_count = 32
    word_size = 64
    read_heads = 1
    iterations = args.num_iter
    start_step = 0
    sequence_max_length = 100
    ntest = 10 if args.mode == 'train' else 50

    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(graph=graph) as session:
            llprint("Building Computational Graph ... ")
            ncomputer = DNC(StatelessRecurrentController,
                            input_size,
                            output_size,
                            sequence_max_length,
                            words_count,
                            word_size,
                            read_heads,
                            batch_size,
                            hidden_controller_dim=args.hidden_dim,
                            use_mem=args.use_mem,
                            dual_emb=True,
                            decoder_mode=True,
                            dual_controller=True,
                            write_protect=True)
            output, prob, loss, apply_gradients = \
                ncomputer.build_loss_function_mask()
            llprint("Done!\n")

            llprint("Initializing Variables ... ")
            session.run(tf.global_variables_initializer())
            if args.mode == 'test':
                ncomputer.restore(session, ckpts_dir, ncomputer.print_config())
                iterations = 1
            llprint("Done!\n")

            last_100_losses = []
            start = 0 if start_step == 0 else start_step + 1
            end = start_step + iterations + 1
            minloss = 1000

            start_time_100 = time.time()
            end_time_100 = None
            avg_100_time = 0.
            avg_counter = 0

            if args.mode == 'train':
                train_writer = tf.summary.FileWriter(
                    './data/summary/sum2/{}/'.format(ncomputer.print_config()),
                    session.graph)

            for i in range(start, end + 1):
                try:
                    llprint("\rIteration %d/%d" % (i, end))
                    if args.mode == 'train':
                        # integer division, matching the original Python 2 semantics
                        input_vec, output_vec, seq_len, decoder_point, masks, all_ose = \
                            sum2_sample_single_batch(
                                vocab_lower, vocab_upper // 3,
                                length_from, length_to,
                                vocab_size=vocab_upper, bs=batch_size)
                        loss_value, _ = session.run(
                            [loss, apply_gradients],
                            feed_dict={
                                ncomputer.input_data: input_vec,
                                ncomputer.target_output: output_vec,
                                ncomputer.sequence_length: seq_len,
                                ncomputer.decoder_point: decoder_point,
                                ncomputer.mask: masks
                            })
                        last_100_losses.append(loss_value)

                    summarize = (i % 100 == 0)
                    if summarize:
                        llprint("\n\t episode %d --> Avg. Cross-Entropy: %.7f\n"
                                % (i, np.mean(last_100_losses)))
                        if args.mode == 'train':
                            summary = tf.Summary()
                            summary.value.add(
                                tag='batch_train_loss',
                                simple_value=np.mean(last_100_losses))
                        trscores = []
                        mloss = 1000
                        for ii in range(ntest):
                            input_vec, output_vec, seq_len, decoder_point, masks, all_ose = \
                                sum2_sample_single_batch(
                                    vocab_lower, vocab_upper // 3,
                                    length_from, length_to,
                                    vocab_size=vocab_upper, bs=batch_size)
                            tloss, out = session.run(
                                [loss, prob],
                                feed_dict={
                                    ncomputer.input_data: input_vec,
                                    ncomputer.sequence_length: seq_len,
                                    ncomputer.decoder_point: decoder_point,
                                    ncomputer.target_output: output_vec,
                                    ncomputer.mask: masks
                                })
                            out = np.reshape(np.asarray(out),
                                             [-1, seq_len, vocab_upper])
                            out = np.argmax(out, axis=-1)
                            # Collect each batch element's decoded sequence,
                            # stopping at the first 0 (end-of-sequence) symbol.
                            bout_list = []
                            for b in range(out.shape[0]):
                                out_list = []
                                for io in range(decoder_point, out.shape[1]):
                                    if out[b][io] == 0:
                                        break
                                    out_list.append(out[b][io])
                                bout_list.append(out_list)
                            if tloss < mloss:
                                mloss = tloss
                            trscores.append(
                                exact_acc(np.asarray(all_ose),
                                          np.asarray(bout_list), 0.9))
                        if args.mode == 'train' and mloss < minloss:
                            minloss = mloss
                            print('save model')
                            ncomputer.save(session, ckpts_dir,
                                           ncomputer.print_config())
                        print('test bleu {}'.format(np.mean(trscores)))
                        if args.mode == 'train':
                            summary.value.add(tag='train_bleu',
                                              simple_value=np.mean(trscores))
                            train_writer.add_summary(summary, i)
                            train_writer.flush()

                        end_time_100 = time.time()
                        elapsed_time = (end_time_100 - start_time_100) / 60
                        avg_counter += 1
                        avg_100_time += (1. / avg_counter) * \
                            (elapsed_time - avg_100_time)
                        estimated_time = (avg_100_time * ((end - i) / 100.)) / 60.
                        print("\tAvg. 100 iterations time: %.2f minutes"
                              % avg_100_time)
                        print("\tApprox. time to completion: %.2f hours"
                              % estimated_time)
                        start_time_100 = time.time()
                        last_100_losses = []
                except KeyboardInterrupt:
                    sys.exit(0)

            llprint("\nSaving Checkpoint ... ")
input = tf.placeholder(tf.float32, shape=(None, None, task.input_size))

if args.controller == 'lstm':
    controller = LSTMCell(args.controller_size)
elif args.controller == 'multilstm':
    controller = tf.nn.rnn_cell.MultiRNNCell(
        [LSTMCell(args.controller_size) for i in range(3)])
elif args.controller == 'ff':
    controller = dnc.ff.FFWrapper(
        dnc.ff.simple_feedforward(hidden=[args.controller_size] * 2))

if not args.no_dnc:
    net = DNC(input, memory, output_size=task.output_size,
              controller=controller, log_memory=True)
    output = net[0]
else:
    output, _ = tf.nn.dynamic_rnn(controller, input, dtype=tf.float32)
    output = tf.layers.dense(output, task.output_size, use_bias=False)

targets = tf.placeholder(dtype=tf.float32, shape=[None, None, task.output_size])
mask = tf.placeholder(dtype=tf.float32, shape=[None, None, task.output_size])

if not args.no_mask:
    loss = tf.losses.sigmoid_cross_entropy(
        logits=output, weights=mask, multi_class_labels=targets)
    # The cost expression was truncated in the source; completed here to
    # mirror the unmasked variant used in the copy-task script above.
    cost = tf.reduce_sum(mask * (1 - targets * (1 - tf.exp(-output))) *
                         tf.sigmoid(output))
with open('trainset', 'rb') as f:
    trainset = pickle.load(f)
with open('trainlabels', 'rb') as f:
    trainlabels = pickle.load(f)

train_data_loader = T.utils.data.DataLoader(dataset=trainset,
                                            batch_size=1,
                                            shuffle=False)
trainset = iter(train_data_loader)

print('Defining model...')
diffy = DNC(25, 128, num_layers=2, independent_linears=True)
loss_fn = T.nn.MSELoss()
optimizer = T.optim.Adam(diffy.parameters(), lr=0.0001, betas=[0.9, 0.98])

# Find the longest item in the training set.
maxVal = 0
maxItem = []
print('Finding max...')
for item in trainset:
    if maxVal < len(item):
        maxVal = len(item)
        maxItem = item
summarize_freq = args.summarize_freq
check_freq = args.check_freq

# input_size = output_size = args.input_size
mem_slot = args.mem_slot
mem_size = args.mem_size
read_heads = args.read_heads

if args.memory_type == 'dnc':
    rnn = DNC(
        input_size=args.input_size,
        hidden_size=args.nhid,
        rnn_type=args.rnn_type,
        num_layers=args.nlayer,
        num_hidden_layers=args.nhlayer,
        dropout=args.dropout,
        nr_cells=mem_slot,
        cell_size=mem_size,
        read_heads=read_heads,
        gpu_id=args.cuda,
        debug=args.visdom,
        batch_first=True,
        independent_linears=True)
elif args.memory_type == 'sdnc':
    rnn = SDNC(
        input_size=args.input_size,
        hidden_size=args.nhid,
        rnn_type=args.rnn_type,
        num_layers=args.nlayer,
        num_hidden_layers=args.nhlayer,
        dropout=args.dropout,
        nr_cells=mem_slot,
        cell_size=mem_size,
np.random.seed(1)

g = tf.Graph()
with g.as_default():
    batch_size = 4
    output_size = 20
    input_size = 10

    memory_config = {'words_num': 256, 'word_size': 64, 'read_heads_num': 4}
    controller_config = {'hidden_size': 128}

    dnc = DNC(controller_config, memory_config, output_size)
    initial_state = dnc.initial_state(batch_size)

    example_input = np.random.uniform(
        0, 1, (batch_size, input_size)).astype(np.float32)
    output_op, _ = dnc(tf.convert_to_tensor(example_input), initial_state)

    init = tf.global_variables_initializer()

with tf.Session(graph=g) as sess:
    init.run()
    example_output = sess.run(output_op)

tf.summary.FileWriter("graphs", g).close()
mem_slot = args.mem_slot
mem_size = args.mem_size
read_heads = args.read_heads

# options, _ = getopt.getopt(sys.argv[1:], '', ['iterations='])
# for opt in options:
#     if opt[0] == '-iterations':
#         iterations = int(opt[1])

rnn = DNC(
    input_size=args.input_size,
    hidden_size=args.nhid,
    rnn_type='lstm',
    num_layers=args.nlayer,
    nr_cells=mem_slot,
    cell_size=mem_size,
    read_heads=read_heads,
    gpu_id=args.cuda)

if args.cuda != -1:
    rnn = rnn.cuda(args.cuda)

last_save_losses = []
optimizer = optim.Adam(rnn.parameters(), lr=args.lr)

for epoch in range(iterations + 1):
    llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations))
    optimizer.zero_grad()
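    # NOTE: the loop body is truncated here. With this DNC interface one
    # training step typically continues as sketched below; `generate_data`,
    # `criterion`, and the specific `args` fields used are assumptions.
    input_data, target_output = generate_data(
        args.batch_size, args.sequence_max_length, args.input_size, args.cuda)
    output, (chx, mhx, rv) = rnn(input_data, (None, None, None),
                                 reset_experience=True)
    loss = criterion(output, target_output)
    loss.backward()
    T.nn.utils.clip_grad_norm_(rnn.parameters(), args.clip)
    optimizer.step()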
parser = argparse.ArgumentParser()
parser.add_argument("task", type=str)
parser.add_argument("datadir", type=str,
                    help="Where do you want your datasets to be?")
parser.add_argument("--summary-dir", type=str, default=None,
                    help="Summary directory for tensorboard")
args = parser.parse_args()
args.task = args.task.lower()

if args.task not in SUPPORTED_TASKS:
    print("Unsupported task: {}".format(args.task))
    sys.exit(1)

if args.task == 'babi':
    print("== BABI ==")
    download_babi(args.datadir)
    X_train, X_test, y_train, y_test = load_babi(args.datadir, lesson=1)
elif args.task == "copy":
    print("== COPY ==")
    X_train, X_test, y_train, y_test = make_copy_dataset(args.datadir)
    # X_train = X_train[:, -2:, :]  # for debugging
    # X_test = X_test[:, -2:, :]  # for debugging

print("== DNC ==")
print()
machine = DNC(X_train, y_train, X_test, y_test,
              summary_dir=args.summary_dir,
              N=X_train.shape[1], W=10, R=3,
              # checkpoint_file="{}.ckpt".format(args.task),
              optimizer="RMSProp")

print("== Training ==")
print()
machine.train()
import torch as T
import torchvision as tv
from dnc import DNC

train = tv.datasets.MNIST('.', train=True, transform=tv.transforms.ToTensor())
test = tv.datasets.MNIST('.', transform=tv.transforms.ToTensor())

batch_size = 1
train_data_loader = T.utils.data.DataLoader(dataset=train,
                                            batch_size=batch_size,
                                            shuffle=True)
trainset = iter(train_data_loader)
trainsize = len(train_data_loader)

diffy = DNC(28, 128, num_layers=1, independent_linears=True)
loss_fn = T.nn.MSELoss()
optimizer = T.optim.Adam(diffy.parameters(), lr=0.0001, eps=1e-9,
                         betas=[0.9, 0.98])

(controller_hidden, memory, read_vectors) = (None, None, None)

ranges = 2 * trainsize
for it in range(ranges):
    optimizer.zero_grad()
    img, true_out = next(trainset)
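    # NOTE: the loop body is truncated here. A hypothetical continuation with
    # the DNC interface used above: treating each image row as one time step
    # (the reshape into a 28-step sequence of 28-dim rows is an assumption).
    inputs = img.view(batch_size, 28, 28)
    out, (controller_hidden, memory, read_vectors) = diffy(
        inputs, (controller_hidden, memory, read_vectors),
        reset_experience=True)
    # ... the loss computation against `true_out`, loss.backward(), and
    # optimizer.step() would follow here.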
# helper funcs
def binary_cross_entropy(y_hat, y):
    return tf.reduce_mean(-y * tf.log(y_hat) - (1 - y) * tf.log(1 - y_hat))

def llprint(message):
    sys.stdout.write(message)
    sys.stdout.flush()

# build graph
sess = tf.InteractiveSession()

llprint("building graph...\n")
optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, momentum=FLAGS.momentum)
dnc = DNC(RNNController, FLAGS, input_steps=FLAGS.ascii_steps)

llprint("defining loss...\n")
y_hat, outputs = dnc.get_outputs()
# TODO: fix this loss: l2 on [:, :, :2], then binary cross entropy on <EOS> tags
loss = tf.nn.l2_loss(dnc.y - y_hat) * 100. / \
    (FLAGS.batch_size * (FLAGS.ascii_steps + FLAGS.stroke_steps))

llprint("computing gradients...\n")
gradients = optimizer.compute_gradients(loss)
for i, (grad, var) in enumerate(gradients):
    if grad is not None:
        gradients[i] = (tf.clip_by_value(grad, -10, 10), var)
grad_op = optimizer.apply_gradients(gradients)
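# NOTE: FLAGS is assumed to come from tf.app.flags and is defined elsewhere
# in the script. A hypothetical minimal definition consistent with the usage
# above (all default values here are assumptions):
tf.app.flags.DEFINE_float('lr', 1e-4, 'learning rate')
tf.app.flags.DEFINE_float('momentum', 0.9, 'RMSProp momentum')
tf.app.flags.DEFINE_integer('batch_size', 32, 'minibatch size')
tf.app.flags.DEFINE_integer('ascii_steps', 64, 'length of the ASCII input sequence')
tf.app.flags.DEFINE_integer('stroke_steps', 256, 'length of the stroke output sequence')
FLAGS = tf.app.flags.FLAGS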
For the Associative task, input_size: seq_width + 2, output_size: seq_width
For the NGram task, input_size: 1, output_size: 1
For the Priority Sort task, input_size: seq_width + 1, output_size: seq_width
"""
has_tau = 0
if args.model == 'ntm':
    model = NTM(input_size=input_size,
                output_size=output_size,
                controller_size=args.lstm_size,
                memory_units=128,
                memory_unit_size=20,
                num_heads=1)  # task_params['num_heads']
elif args.model == 'dnc':
    model = DNC(input_size=input_size,
                output_size=output_size,
                hidden_size=args.lstm_size,
                nr_cells=128,
                cell_size=20,
                read_heads=1)  # task_params['num_heads']
    model.init_param()
elif args.model == 'sam':
    model = SAM(input_size=input_size,
                output_size=output_size,
                hidden_size=args.lstm_size,
                nr_cells=128,
                cell_size=20,
                read_heads=1)  # read_heads=4? task_params['num_heads']
    model.init_param()
elif args.model == 'lstm':
    marnn_config = args
    print('marnn_config:\n', marnn_config)
    model = MARNN(marnn_config, input_size=input_size,
def _build_model(self):
    if args.mann == 'none':
        def single_cell(num_units):
            return tf.contrib.rnn.BasicLSTMCell(num_units, forget_bias=1.0)

        cell = tf.contrib.rnn.OutputProjectionWrapper(
            tf.contrib.rnn.MultiRNNCell(
                [single_cell(args.num_units) for _ in range(args.num_layers)]),
            args.num_bits_per_vector,
            activation=None)
        initial_state = tuple(
            tf.contrib.rnn.LSTMStateTuple(
                c=expand(tf.tanh(learned_init(args.num_units)),
                         dim=0, N=args.batch_size),
                h=expand(tf.tanh(learned_init(args.num_units)),
                         dim=0, N=args.batch_size))
            for _ in range(args.num_layers))
    elif args.mann == 'ntm':
        cell = NTMCell(args.num_layers, args.num_units,
                       args.num_memory_locations, args.memory_size,
                       args.num_read_heads, args.num_write_heads,
                       addressing_mode='content_and_location',
                       shift_range=args.conv_shift_range,
                       reuse=False,
                       output_dim=args.num_bits_per_vector,
                       clip_value=args.clip_value,
                       init_mode=args.init_mode)
        initial_state = cell.zero_state(args.batch_size, tf.float32)
    elif args.mann == 'dnc':
        access_config = {
            'memory_size': args.num_memory_locations,
            'word_size': args.memory_size,
            'num_reads': args.num_read_heads,
            'num_writes': args.num_write_heads,
        }
        controller_config = {
            'hidden_size': args.num_units,
        }
        cell = DNC(access_config, controller_config,
                   args.num_bits_per_vector, args.clip_value)
        initial_state = cell.initial_state(args.batch_size)

    output_sequence, _ = tf.nn.dynamic_rnn(cell=cell,
                                           inputs=self.inputs,
                                           time_major=False,
                                           initial_state=initial_state)

    if args.task == 'copy' or args.task == 'repeat_copy':
        self.output_logits = output_sequence[:, self.max_seq_len + 1:, :]
    elif args.task == 'associative_recall':
        self.output_logits = output_sequence[:, 3 * (self.max_seq_len + 1) + 2:, :]
    elif args.task in ('traversal', 'shortest_path'):
        self.output_logits = output_sequence[:, -self.max_seq_len:, :]

    if args.task in ('copy', 'repeat_copy', 'associative_recall'):
        self.outputs = tf.sigmoid(self.output_logits)
    if args.task in ('traversal', 'shortest_path'):
        output_logits_split = tf.split(self.output_logits, 9, axis=2)
        self.outputs = tf.concat(
            [tf.nn.softmax(logits) for logits in output_logits_split], axis=2)
def model_fn(features, labels, mode, params):
    # ------------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------------
    access_config = {
        "memory_size": 16,
        "word_size": 16,
        "num_reads": 4,
        "num_writes": 1,
    }
    controller_config = {
        "hidden_size": 64,
    }
    clip_value = 20

    dnc_core = DNC(access_config, controller_config, 5, clip_value)
    initial_state = dnc_core.initial_state(params["batch_size"])
    output_logits, _ = tf.nn.dynamic_rnn(cell=dnc_core,
                                         inputs=features,
                                         time_major=True,
                                         initial_state=initial_state)

    # ------------------------------------------------------------------------
    # Build EstimatorSpec
    # ------------------------------------------------------------------------
    train_loss = params["dataset_" + mode].cost(output_logits,
                                                labels["target"],
                                                labels["mask"])

    # Set up the optimizer with global norm clipping.
    trainable_variables = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(train_loss, trainable_variables),
        params["max_grad_norm"])

    global_step = tf.get_variable(
        name="global_step",
        shape=[],
        dtype=tf.int64,
        initializer=tf.zeros_initializer(),
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, tf.GraphKeys.GLOBAL_STEP])

    optimizer = tf.train.RMSPropOptimizer(params["lr"],
                                          epsilon=params["optimizer_epsilon"])
    train_step = optimizer.apply_gradients(zip(grads, trainable_variables),
                                           global_step=global_step)

    # dataset_tensors_np, output_np = sess.run([dataset_tensors, output])
    # dataset_string = dataset.to_human_readable(dataset_tensors_np, output_np)

    output_sigmoid = tf.nn.sigmoid(output_logits)
    delta = tf.abs(output_sigmoid - labels["target"])
    tf.summary.histogram("delta", delta)

    equality = tf.cast(delta < 0.1, tf.float32) * tf.expand_dims(labels["mask"], -1)
    correct_elements = tf.reduce_mean(tf.reduce_sum(equality, [0, 2]))
    pct_correct = tf.reduce_mean(
        tf.reduce_sum(equality, [0, 2]) /
        tf.cast(labels["total_targ_batch"], tf.float32))

    eval_metric_ops = {
        "accuracy": tf.metrics.mean(pct_correct),
        "loss": tf.metrics.mean(train_loss),
        "correct_elements": tf.metrics.mean(correct_elements),
        "total_elements": tf.metrics.mean(
            tf.cast(labels["total_targ_batch"], tf.float32)),
    }

    image_mask = tf.expand_dims(tf.expand_dims(labels["mask"], -1), -1)
    xent = tf.expand_dims(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels["target"],
            logits=output_logits * tf.expand_dims(labels["mask"], -1)),
        -1)
    image = tf.concat(
        [
            # tf.expand_dims(output_logits, -1),
            output_sigmoid,
            labels["target"],
            # tf.expand_dims(equality, -1),
            # xent / tf.reduce_max(xent)
        ],
        -1)
    # tf.summary.image expects shape [batch_size, height, width, channels].
    image = tf.transpose(image, perm=[1, 0, 2])
    tf.summary.image("output_compare", tf.expand_dims(image, -1), 4)

    tf.summary.scalar("train_loss", tf.reduce_mean(train_loss))
    tf.summary.scalar("train_accuracy", pct_correct)
    tf.summary.scalar("correct_elements", correct_elements)
    tf.summary.scalar("total_elements",
                      tf.reduce_mean(labels["total_targ_batch"], axis=-1))
    tf.summary.scalar("max_length",
                      tf.convert_to_tensor(params["dataset_" + mode]._max_length))
    tf.summary.scalar("max_repeats",
                      tf.convert_to_tensor(params["dataset_" + mode]._max_repeats))

    return tf.estimator.EstimatorSpec(mode,
                                      loss=train_loss,
                                      train_op=train_step,
                                      eval_metric_ops=eval_metric_ops,
                                      scaffold=gen_scaffold(params))
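# NOTE: a hypothetical wiring of the model_fn above into an Estimator. The
# params keys mirror those read inside model_fn (mode strings are 'train' and
# 'eval', so the dataset keys below line up with params["dataset_" + mode]);
# the dataset objects and train_input_fn are assumed, not from the source.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='/tmp/dnc_repeat_copy',  # assumed checkpoint directory
    params={
        'batch_size': 16,
        'lr': 1e-4,
        'optimizer_epsilon': 1e-10,
        'max_grad_norm': 50,
        'dataset_train': train_dataset,  # assumed dataset instances exposing
        'dataset_eval': eval_dataset,    # cost(), _max_length, _max_repeats
    })
estimator.train(input_fn=train_input_fn, steps=10000)  # assumed input_fn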