def main(story_limit=150, epoch_batches_count=64, epochs_count=1024,
         lr=1e-11, optim=1, starting_epoch=-1, bs=32):
    """Entry point: load the lexicon, build the DNC model, pick an optimizer
    and launch training.

    All former hard-coded hyperparameters are now keyword parameters with the
    original values as defaults, so existing ``main()`` calls are unchanged.

    Parameters:
        story_limit: maximum story length, forwarded to ``train()``.
        epoch_batches_count: number of batches per epoch.
        epochs_count: total number of epochs to run.
        lr: optimizer learning rate.
        optim: optimizer sentinel — ``None`` selects Adam, any other value
               selects Adadelta. (``load_model()`` may return a saved
               optimizer state here; see the commented-out line below.)
        starting_epoch: last completed epoch; training resumes at
               ``starting_epoch + 1``.
        bs: batch size.
    """
    pgd = PreGenData(bs)
    task_dir = os.path.dirname(abspath(__file__))
    processed_data_dir = join(task_dir, 'data', "processed")
    # Use a context manager so the pickle file handle is closed
    # deterministically instead of being leaked.
    with open(join(processed_data_dir, 'lexicon-dict.pkl'), 'rb') as f:
        lexicon_dictionary = pickle.load(f)
    # Vocabulary size drives both the input width (x) and output width (v_t).
    x = len(lexicon_dictionary)
    computer = DNC(x=x, v_t=x, bs=bs, W=64, L=64, R=32, h=256)
    # if load model
    # computer, optim, starting_epoch = load_model(computer)
    computer = computer.cuda()
    if optim is None:
        optimizer = torch.optim.Adam(computer.parameters(), lr=lr)
    else:
        print('use Adadelta optimizer with learning rate ', lr)
        optimizer = torch.optim.Adadelta(computer.parameters(), lr=lr)
    # starting with the epoch after the loaded one
    train(computer, optimizer, story_limit, bs, pgd, x,
          int(starting_epoch) + 1, epochs_count, epoch_batches_count)
elif args.optim == 'rmsprop': optimizer = optim.RMSprop(rnn.last_layer.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 elif args.optim == 'sgd': optimizer = optim.SGD(rnn.last_layer.parameters(), lr=args.lr) # 0.01 elif args.optim == 'adagrad': optimizer = optim.Adagrad(rnn.last_layer.parameters(), lr=args.lr) elif args.optim == 'adadelta': optimizer = optim.Adadelta(rnn.last_layer.parameters(), lr=args.lr) debug_enabled = hasattr(rnn, 'debug') and rnn.debug if args.cuda != -1: rnn = rnn.cuda(args.cuda) (chx, mhx, rv) = (None, None, None) for epoch in range(iterations + 1): llprint("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations)) optimizer.zero_grad() random_length = np.random.randint(1, sequence_max_length + 1) input_data, target_output = generate_data(batch_size, random_length, args.input_size, args.cuda) if debug_enabled: output, (chx, mhx, rv), v = rnn(input_data, (None, mhx, None), reset_experience=True, pass_through_memory=True)