def main():
    """Build a DNC model and train it on the pre-generated story data.

    Loads the preprocessed lexicon dictionary, constructs a DNC whose
    input/output width equals the vocabulary size, moves it to the GPU,
    picks an optimizer and runs the training loop.
    """
    # Training hyper-parameters.
    story_limit = 150
    epoch_batches_count = 64
    epochs_count = 1024
    lr = 1e-11           # NOTE(review): unusually small LR — presumably deliberate; confirm
    optim = 1            # non-None sentinel: selects the Adadelta branch below
    starting_epoch = -1  # -1 + 1 == 0 when no checkpoint is loaded
    bs = 32

    pgd = PreGenData(bs)

    task_dir = os.path.dirname(abspath(__file__))
    processed_data_dir = join(task_dir, 'data', "processed")
    # Fix: use a context manager so the pickle file handle is closed
    # deterministically (the original leaked it to the GC).
    with open(join(processed_data_dir, 'lexicon-dict.pkl'), 'rb') as f:
        lexicon_dictionary = pickle.load(f)
    x = len(lexicon_dictionary)  # vocabulary size drives input/output width

    computer = DNC(x=x, v_t=x, bs=bs, W=64, L=64, R=32, h=256)

    # if load model
    # computer, optim, starting_epoch = load_model(computer)

    computer = computer.cuda()

    if optim is None:
        optimizer = torch.optim.Adam(computer.parameters(), lr=lr)
    else:
        print('use Adadelta optimizer with learning rate ', lr)
        optimizer = torch.optim.Adadelta(computer.parameters(), lr=lr)

    # starting with the epoch after the loaded one
    train(computer, optimizer, story_limit, bs, pgd, x,
          int(starting_epoch) + 1, epochs_count, epoch_batches_count)
def test_rnn_no_memory_pass():
    """With pass_through_memory=False the DNC must still run end-to-end,
    allocate hidden/memory state of the expected shapes, and return no
    read vectors (rv stays None)."""
    T.manual_seed(1111)

    # Model / run configuration.
    input_size = 100
    hidden_size = 100
    rnn_type = 'gru'
    num_layers = 3
    num_hidden_layers = 5
    dropout = 0.2
    nr_cells = 12
    cell_size = 17
    read_heads = 3
    gpu_id = -1
    debug = True
    lr = 0.001
    batch_size = 10
    cuda = gpu_id
    clip = 20
    length = 13

    rnn = DNC(input_size=input_size, hidden_size=hidden_size,
              rnn_type=rnn_type, num_layers=num_layers,
              num_hidden_layers=num_hidden_layers, dropout=dropout,
              nr_cells=nr_cells, cell_size=cell_size,
              read_heads=read_heads, gpu_id=gpu_id, debug=debug)

    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    optimizer.zero_grad()

    input_data, target_output = generate_data(batch_size, length, input_size, cuda)
    target_output = target_output.transpose(0, 1).contiguous()

    # Feed the same input six times, carrying state but skipping memory.
    (chx, mhx, rv) = (None, None, None)
    outputs = []
    for _ in range(6):
        output, (chx, mhx, rv), v = rnn(input_data, (chx, mhx, rv),
                                        pass_through_memory=False)
        output = output.transpose(0, 1)
        outputs.append(output)

    output = functools.reduce(lambda a, b: a + b, outputs)
    loss = criterion((output), target_output)
    loss.backward()

    # Fix: clip_grad_norm was deprecated (and later removed) in favor of
    # the in-place clip_grad_norm_.
    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    assert target_output.size() == T.Size([27, 10, 100])
    assert chx[0].size() == T.Size([num_hidden_layers, 10, 100])
    assert mhx['memory'].size() == T.Size([10, 12, 17])
    # Fix: identity comparison for None (PEP 8), not equality.
    assert rv is None
def test_rnn_n():
    """A single forward/backward pass through a vanilla-RNN DNC:
    output, controller hidden state, memory and read vectors must all
    come back with the expected shapes."""
    T.manual_seed(1111)

    # Model / run configuration.
    input_size = 100
    hidden_size = 100
    rnn_type = 'rnn'
    num_layers = 3
    num_hidden_layers = 5
    dropout = 0.2
    nr_cells = 12
    cell_size = 17
    read_heads = 3
    gpu_id = -1
    debug = True
    lr = 0.001
    batch_size = 10
    cuda = gpu_id
    clip = 20
    length = 13

    rnn = DNC(
        input_size=input_size,
        hidden_size=hidden_size,
        rnn_type=rnn_type,
        num_layers=num_layers,
        num_hidden_layers=num_hidden_layers,
        dropout=dropout,
        nr_cells=nr_cells,
        cell_size=cell_size,
        read_heads=read_heads,
        gpu_id=gpu_id,
        debug=debug
    )

    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    optimizer.zero_grad()

    input_data, target_output = generate_data(batch_size, length, input_size, cuda)
    target_output = target_output.transpose(0, 1).contiguous()

    output, (chx, mhx, rv), v = rnn(input_data, None)
    output = output.transpose(0, 1)

    loss = criterion((output), target_output)
    loss.backward()

    # Fix: clip_grad_norm was deprecated (and later removed) in favor of
    # the in-place clip_grad_norm_.
    T.nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    assert target_output.size() == T.Size([27, 10, 100])
    assert chx[1].size() == T.Size([num_hidden_layers, 10, 100])
    assert mhx['memory'].size() == T.Size([10, 12, 17])
    # rv width = read_heads * cell_size = 3 * 17 = 51
    assert rv.size() == T.Size([10, 51])
# NOTE(review): fragment of a training loop — the enclosing function and the `if`
# matching the orphan `else:` are outside this chunk; do not edit in isolation.
# Visible behavior: run the RNN with reset_experience=True, compute the loss,
# register it with the model when optim_type == 'ldni', backprop, clip gradients
# (clip_grad_norm is deprecated upstream — presumably a legacy PyTorch version;
# TODO confirm), skip optimizer.step() for 'dni', derive periodic flags from
# `epoch`, and detach memory tensors from the autograd graph between iterations.
# `loss.data[0]` implies PyTorch < 0.4 semantics — verify against project deps.
output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) else: output, (chx, mhx, rv) = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True) loss = criterion((output), target_output) if args.optim_type == 'ldni': rnn.register_loss(loss) loss.backward() T.nn.utils.clip_grad_norm(rnn.parameters(), args.clip) if args.optim_type != 'dni': optimizer.step() loss_value = loss.data[0] summarize = (epoch % summarize_freq == 0) take_checkpoint = (epoch != 0) and (epoch % check_freq == 0) increment_curriculum = (epoch != 0) and (epoch % args.curriculum_freq == 0) # detach memory from graph if mhx is not None: mhx = { k: (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() }
# NOTE(review): fragment — starts mid-call (`independent_linears=False)` closes a
# model constructor whose opening is outside this chunk). Visible behavior:
# finish model selection (falls back to LSTMModel for 'lstm', raises for unknown
# memory types), then pick the optimizer from args.optim among
# adam/adamax/rmsprop/sgd. The trailing `# 0.0001` / `# 0.01` comments look like
# the authors' reference learning rates — confirm before relying on them.
independent_linears=False) elif args.memory_type == 'lstm': rnn = LSTMModel(args.input_size, args.nhid, num_layers=args.nhlayer, dropout=args.dropout, batch_first=True) else: raise Exception('Not recognized type of memory') # register_nan_checks(rnn) last_save_losses = [] if args.optim == 'adam': optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'adamax': optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'rmsprop': optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 elif args.optim == 'sgd': optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01
# NOTE(review): fragment of an MNIST demo script — the final `diffy(img, ...)`
# call is cut off mid-arguments; the rest of the loop body is outside this chunk.
# Visible behavior: load MNIST (note: `test` is built without train=False's
# transform peculiarity aside, it defaults to train split unless specified —
# here it omits train=True, so it is the test split; verify intent), build a
# small DNC, Adam optimizer, then iterate 2 * len(train_data_loader) steps
# feeding squeezed images with a fresh controller/read state but persistent
# memory. `train` also shadows the (possible) module-level name — review.
train = tv.datasets.MNIST('.', train=True, transform=tv.transforms.ToTensor()) test = tv.datasets.MNIST('.', transform=tv.transforms.ToTensor()) batch_size = 1 train_data_loader = T.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True) trainset = iter(train_data_loader) trainsize = len(train_data_loader) diffy = DNC(28, 128, num_layers=1, independent_linears=True) loss_fn = T.nn.MSELoss() optimizer = T.optim.Adam(diffy.parameters(), lr=0.0001, eps=1e-9, betas=[0.9, 0.98]) (controller_hidden, memory, read_vectors) = (None, None, None) ranges = 2 * trainsize for it in range(ranges): optimizer.zero_grad() img, true_out = next(trainset) img = T.squeeze(img, 1) output, (controller_hidden, memory, read_vectors) = diffy(img, (None, memory, None),
# NOTE(review): fragment — starts mid-constructor-kwargs; the model class being
# built is outside this chunk. Visible behavior: finish model construction,
# raise on unknown memory type, select an optimizer from args.optim
# (adam/adamax/rmsprop/sgd/adagrad/adadelta), then wrap the model in DNI with
# that optimizer — after which the outer training loop presumably must NOT call
# optimizer.step() itself for the 'dni' path; confirm against the loop code.
sparse_reads=args.sparse_reads, read_heads=args.read_heads, gpu_id=args.cuda, debug=args.visdom, batch_first=True, independent_linears=False ) else: raise Exception('Not recognized type of memory') print(rnn) last_save_losses = [] if args.optim == 'adam': optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'adamax': optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'rmsprop': optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 elif args.optim == 'sgd': optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 elif args.optim == 'adagrad': optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) elif args.optim == 'adadelta': optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) debug_enabled = rnn.debug rnn = DNI(rnn, hidden_size=args.nhid, optim=optimizer) if args.cuda != -1:
# NOTE(review): fragment of a copy-task training script — starts mid-kwargs of a
# model constructor and ends mid-loop; enclosing context not visible. Visible
# behavior: finish building an LSTM-controller model, move to GPU when
# args.cuda != -1, create an Adam optimizer, then per iteration draw a random
# sequence length in [1, sequence_max_length], generate a batch, run one forward
# pass with no initial state, and compute the loss. The commented-out
# input transpose suggests batch-first inputs — confirm with generate_data.
input_size=args.input_size, hidden_size=args.nhid, rnn_type='lstm', num_layers=args.nlayer, nr_cells=mem_slot, cell_size=mem_size, read_heads=read_heads, gpu_id=args.cuda ) if args.cuda != -1: rnn = rnn.cuda(args.cuda) last_save_losses = [] optimizer = optim.Adam(rnn.parameters(), lr=args.lr) for epoch in range(iterations + 1): llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) optimizer.zero_grad() random_length = np.random.randint(1, sequence_max_length + 1) input_data, target_output = generate_data(batch_size, random_length, args.input_size, args.cuda) # input_data = input_data.transpose(0, 1).contiguous() target_output = target_output.transpose(0, 1).contiguous() output, _ = rnn(input_data, None) output = output.transpose(0, 1) loss = criterion((output), target_output)
# NOTE(review): fragment — ends mid-`while`, the loop body is outside this chunk.
# Visible behavior: unpickle training labels, wrap `trainset` in a DataLoader,
# rebind `trainset` to an iterator, build a 2-layer DNC with MSE loss and Adam,
# then scan for the longest item and pad shorter ones.
# BUG(review): `trainset` is rebound to iter(...); the first for-loop exhausts
# that iterator, after which `len(trainset)` and `trainset[i]` (iterators are
# unsized/unsubscriptable) would raise TypeError. The padding pass appears to
# assume `trainset` is still a list/sequence — flag for the author, cannot fix
# safely without the surrounding code.
file = open('trainlabels', 'rb') trainlables = pickle.load(file) file.close() train_data_loader = T.utils.data.DataLoader(dataset=trainset, batch_size=1, shuffle=False) trainset = iter(train_data_loader) print('Defining model...') diffy = DNC(25, 128, num_layers=2, independent_linears=True) loss_fn = T.nn.MSELoss() optimizer = T.optim.Adam(diffy.parameters(), lr=0.0001, betas=[0.9, 0.98]) maxVal = 0 maxItem = [] print('Finding max...') for item in trainset: if maxVal < len(item): maxVal = len(item) maxItem = item print('Padding values...') for i in range(len(trainset)): if len(trainset[i]) < maxVal: while len(trainset[i]) < maxVal: