class HyperdashCallback(Callback):
    """Keras callback that streams per-epoch training metrics to Hyperdash.

    Opens an Experiment named "Deep Weather" when training starts and closes
    it when training ends.  Each epoch it reports:
      * "progress"  — the loss improvement since the previous epoch, capped
                      at 0.1 to keep the chart readable;
      * "loss"      — the training loss, capped at 0.5;
      * "val_loss"  — the validation loss, capped at 0.5 (only when present).
    """

    exp = None  # hyperdash Experiment, created in on_train_begin
    last = 1    # previous epoch's loss; seeds the first "progress" delta

    def on_train_begin(self, logs=None):
        self.exp = Experiment("Deep Weather")

    def on_train_end(self, logs=None):
        self.exp.end()

    def on_epoch_end(self, epoch, logs=None):
        # Keras may pass logs=None; normalize so the key checks are safe.
        logs = logs or {}
        if 'loss' in logs:
            self.exp.metric("progress", min(0.1, self.last - logs["loss"]))
            self.last = logs["loss"]
            self.exp.metric("loss", min(0.5, logs["loss"]))
        # BUG FIX: the original indexed logs["val_loss"] unconditionally,
        # which raises KeyError when the model is trained without
        # validation data. Guard it like "loss" above.
        if 'val_loss' in logs:
            self.exp.metric("val_loss", min(0.5, logs["val_loss"]))
def run():
    """Evaluate a saved CDSSM model on the FEVER shared-task dev set.

    Loads the model checkpoint given by ``args.model``, runs one pass over
    the dataset, periodically streams running loss/accuracy to a Hyperdash
    experiment, prints a final sklearn classification report, and dumps the
    true/predicted labels with joblib.

    Relies on module-level names: args, test, claims_dict, cdssm,
    pytorch_data_loader, Logger, Experiment, accuracy_score,
    classification_report, joblib.
    """
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate
    DATA_SAMPLING = args.data_sampling
    NUM_EPOCHS = args.epochs
    MODEL = args.model

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # NOTE(review): time.localtime() returns a struct_time, so the log dir
    # name is its repr; time.strftime would be cleaner — left as-is to keep
    # existing log paths stable.
    logger = Logger('./logs/{}'.format(time.localtime()))

    print("Created model...")
    model = cdssm.CDSSM()
    model = model.cuda()
    model = model.to(device)
    if torch.cuda.device_count() > 0:
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
        model = nn.DataParallel(model)
    model.load_state_dict(torch.load(MODEL))

    print("Created dataset...")
    dataset = pytorch_data_loader.WikiDataset(
        test, claims_dict, data_sampling=DATA_SAMPLING,
        testFile="shared_task_dev.jsonl")
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=3,
                            shuffle=True,
                            collate_fn=pytorch_data_loader.PadCollate())

    # BUG FIX: for small datasets int(len/batch * 0.02) truncates to 0 and
    # the modulo below raises ZeroDivisionError — clamp to at least 1.
    OUTPUT_FREQ = max(1, int((len(dataset) / BATCH_SIZE) * 0.02))
    criterion = torch.nn.BCEWithLogitsLoss()

    parameters = {
        "batch size": BATCH_SIZE,
        "loss": criterion.__class__.__name__,
        "data batch size": DATA_SAMPLING,
        "data": args.data,
    }
    exp_params = {}
    exp = Experiment("CLSM V2")
    for key, value in parameters.items():
        exp_params[key] = exp.param(key, value)

    true = []
    pred = []
    print("Evaluating...")
    model.eval()
    test_running_accuracy = 0.0
    test_running_loss = 0.0
    num_batches = 0
    for batch_num, inputs in enumerate(dataloader):
        num_batches += 1
        claims_tensors, claims_text, evidences_tensors, evidences_text, labels = inputs
        # BUG FIX: the original called model(claims, evidences) — names that
        # are never defined in this scope (NameError). Use the tensors that
        # were actually unpacked from the batch.
        y_pred = model(claims_tensors, evidences_tensors)

        y = (labels).float()
        y_pred = y_pred.squeeze()
        y = y.squeeze()
        y = y.view(-1)
        y_pred = y_pred.view(-1)

        bin_acc = torch.sigmoid(y_pred).round()
        loss = criterion(y_pred, y)
        true.extend(y.tolist())
        pred.extend(bin_acc.tolist())

        accuracy = (y == bin_acc).float().mean()
        test_running_accuracy += accuracy.item()
        test_running_loss += loss.item()

        if batch_num % OUTPUT_FREQ == 0 and batch_num > 0:
            print("[{}]: {}".format(batch_num,
                                    test_running_accuracy / OUTPUT_FREQ))
            # 1. Log scalar values (scalar summary)
            info = {
                'test_loss': test_running_loss / OUTPUT_FREQ,
                'test_accuracy': test_running_accuracy / OUTPUT_FREQ,
            }
            for tag, value in info.items():
                exp.metric(tag, value, log=False)
                # logger.scalar_summary(tag, value, batch_num+1)
            # 2. Log values and gradients of the parameters (histogram summary)
            for tag, value in model.named_parameters():
                tag = tag.replace('.', '/')
                # logger.histo_summary(tag, value.data.cpu().numpy(), batch_num+1)
            test_running_loss = 0.0
            test_running_accuracy = 0.0

    print(true[0], pred[0])
    true = np.array(true).astype("int")
    pred = np.array(pred).astype("int")
    final_accuracy = accuracy_score(true, pred)
    print("Final accuracy: {}".format(final_accuracy))
    print(classification_report(true, pred))

    filename = "predicted_labels"
    for key, value in parameters.items():
        filename += "_{}-{}".format(key.replace(" ", "_"), value)
    joblib.dump({"true": true, "pred": pred}, filename)
if len(reward_avg) == 0: reward = reward_raw else: reward = reward_raw - np.mean(reward_avg) # update running mean reward_avg.append(reward_raw) if len(reward_avg) > REWARD_BUF: reward_avg.pop(0) rewards.append(reward) entropies.append(entropy[0]) log_probs.append(log_prob[0]) agent.update_parameters(rewards, log_probs, entropies, GAMMA) if i_episode % CHKP_FREQ == 0: torch.save( agent.model.state_dict(), os.path.join(exp_dir, 'reinforce-' + str(i_episode) + '.pkl')) # print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards))) exp.metric("episode", i_episode) exp.metric("rewards", np.mean(reward_raw_log)) del rewards del log_probs del entropies del state
def train_reconstruction(train_loader, test_loader, encoder, decoder, args):
    """Train an encoder/decoder pair to reconstruct input sentences.

    Streams the per-step loss to a Hyperdash experiment, periodically prints
    a reconstructed sample, runs evaluation, decays the learning rate, and
    saves model/vocabulary snapshots.  The experiment is always closed in the
    finally block, even if training raises.

    Uses pre-0.4 PyTorch idioms (Variable, tensor.data[0]).
    """
    exp = Experiment("Reconstruction Training")
    try:
        lr = args.lr
        encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
        decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)
        encoder.train()
        decoder.train()
        steps = 0
        for epoch in range(1, args.epochs+1):
            print("=======Epoch========")
            print(epoch)
            for batch in train_loader:
                feature = Variable(batch)
                if args.use_cuda:
                    # NOTE(review): moving the models to GPU every batch is
                    # redundant after the first iteration, but harmless.
                    encoder.cuda()
                    decoder.cuda()
                    feature = feature.cuda()
                encoder_opt.zero_grad()
                decoder_opt.zero_grad()
                h = encoder(feature)
                prob = decoder(h)
                reconstruction_loss = compute_cross_entropy(prob, feature)
                reconstruction_loss.backward()
                encoder_opt.step()
                decoder_opt.step()
                steps += 1
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                # Per-token loss: total loss normalized by sentence length.
                print("Loss: {}".format(reconstruction_loss.data[0] / args.sentence_len))
                exp.metric("Loss", reconstruction_loss.data[0] / args.sentence_len)
                # check reconstructed sentence
                if steps % args.log_interval == 0:
                    print("Test!!")
                    input_data = feature[0]
                    single_data = prob[0]
                    # argmax over vocabulary dimension -> predicted word ids
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = util.transform_id2word(input_data.data, train_loader.dataset.index2word, lang="en")
                    predict_sentence = util.transform_id2word(predict_index.data, train_loader.dataset.index2word, lang="en")
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)
                if steps % args.test_interval == 0:
                    eval_reconstruction(encoder, decoder, test_loader, args)
            if epoch % args.lr_decay_interval == 0:
                # decrease learning rate
                # NOTE(review): recreating the optimizers discards Adam's
                # moment estimates; presumably intentional — confirm.
                lr = lr / 5
                encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
                decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)
                encoder.train()
                decoder.train()
            if epoch % args.save_interval == 0:
                util.save_models(encoder, args.save_dir, "encoder", steps)
                util.save_models(decoder, args.save_dir, "decoder", steps)
        # finalization
        # save vocabulary
        with open("word2index", "wb") as w2i, open("index2word", "wb") as i2w:
            pickle.dump(train_loader.dataset.word2index, w2i)
            pickle.dump(train_loader.dataset.index2word, i2w)
        # save models
        util.save_models(encoder, args.save_dir, "encoder", "final")
        util.save_models(decoder, args.save_dir, "decoder", "final")
        print("Finish!!!")
    finally:
        exp.end()
optimizer.step() x_buf = [] y_buf = [] epi_x_old = epi_x loss_episode = loss.clone().cpu().data.numpy()[0] diff_episode = F.mse_loss(x_cat[:, :, :12], y_cat).clone().cpu().data.numpy()[0] loss.detach_() net.hidden[0].detach_() net.hidden[1].detach_() if exp is not None: exp.metric("loss episode", loss_episode) exp.metric("diff episode", diff_episode) exp.metric("epoch", epoch) loss_epoch += loss_episode diff_epoch += diff_episode x_buf.append(x) y_buf.append(y) printEpochLoss(epoch, epi_x_old, loss_epoch, diff_epoch) saveModel(state=net.state_dict(), epoch=epoch, epoch_len=epi_x_old, loss_epoch=loss_epoch, diff_epoch=diff_epoch,
def demo(args=None):
    """Run the bundled Hyperdash demo experiment ("Dogs vs. Cats").

    Requires an API key from the HYPERDASH_API_KEY environment variable or a
    hyperdash.json file; prints usage help and returns early if none is found.
    Otherwise prints a sample program and then executes a close variant of it.
    """
    from_file = get_api_key_from_file()
    from_env = get_api_key_from_env()
    # Environment variable takes precedence over the config file.
    api_key = from_env or from_file
    if not api_key:
        print("""
`hyperdash demo` requires a Hyperdash API key. Try setting your API key in the
HYPERDASH_API_KEY environment variable, or in a hyperdash.json file in the local
directory or your user's home directory with the following format:

{
    "api_key": "<YOUR_API_KEY>"
}
""")
        return
    print("""
Running the following program:

    from hyperdash import Experiment
    exp = Experiment("Dogs vs. Cats")

    # Parameters
    estimators = exp.param("Estimators", 500)
    epochs = exp.param("Epochs", 5)
    batch = exp.param("Batch Size", 64)

    for epoch in xrange(1, epochs + 1):
        accuracy = 1. - 1./epoch
        loss = float(epochs - epoch)/epochs
        print("Training model (epoch {})".format(epoch))
        time.sleep(1)

        # Metrics
        exp.metric("Accuracy", accuracy)
        exp.metric("Loss", loss)

    exp.end()
""")
    from hyperdash import Experiment
    exp = Experiment("Dogs vs. Cats")
    # Parameters
    estimators = exp.param("Estimators", 500)
    epochs = exp.param("Epochs", 5)
    batch = exp.param("Batch Size", 64)
    # NOTE(review): this loop is not identical to the program printed above —
    # it iterates xrange(epochs) (0-based) and divides loss by (epochs + 1)
    # instead of epochs; confirm whether the divergence is intentional.
    # xrange also implies Python 2.
    for epoch in xrange(epochs):
        print("Training model (epoch {})".format(epoch))
        accuracy = 1. - 1. / (epoch + 1)
        loss = float(epochs - epoch) / (epochs + 1)
        # Metrics
        exp.metric("Accuracy", accuracy)
        exp.metric("Loss", loss)
        time.sleep(1)
    exp.end()
} f.attrs['split'] = H5PYDataset.create_split_array(split_dict) def match_env(ev1, ev2): # set env1 (simulator) to that of env2 (real robot) ev1.env.env.set_state(ev2.env.env.model.data.qpos.ravel(), ev2.env.env.model.data.qvel.ravel()) i = 0 exp = Experiment("dataset pusher") for i in tqdm(range(max_steps)): exp.metric("episode", i) obs = env.reset() obs2 = env2.reset() match_env(env, env2) for j in range(episode_length): # env.render() # env2.render() if j % action_steps == 0: action = env.action_space.sample() new_obs, reward, done, info = env.step(action) new_obs2, reward2, done2, info2 = env2.step(action) # print (j, done, new_obs[0][0])
#Visualize output out = model.session.run(net_out, feed_dict={network: X, labels: Y}) out_ = [] for j in range(out.shape[1]): out_.append(out[0][j]) out = out_ out_log.append(out) train_acc, train_loss = model.session.run([acc, cost], feed_dict={ network: X, labels: Y }) if hyperdash: exp.metric("Accuracy", train_acc) exp.metric("Loss", train_loss) train_summary = model.session.run(merged, feed_dict={ network: X, labels: Y }) writer2.add_summary(train_summary, i) if i % 200 == 0: # Save model and acc/error curves os.chdir('/home/mpcr/Desktop/rodrigo/deepcontrol/saved_models') model.save(m_save + modelswitch[model_num].__name__) # Save model output throughout training
def test_experiment_handles_numpy_numbers(self):
    """Verify the SDK accepts every numpy scalar type in metric()/param().

    Feeds one default-constructed instance of each numpy integer/float type
    through an Experiment, then checks the captured server messages contain
    exactly the expected param and metric payloads, in order.
    """
    nums_to_test = [
        ("int_", np.int_()),
        ("intc", np.intc()),
        ("intp", np.intp()),
        ("int8", np.int8()),
        ("int16", np.int16()),
        ("int32", np.int32()),
        ("int64", np.int64()),
        ("uint8", np.uint8()),
        ("uint16", np.uint16()),
        ("uint32", np.uint32()),
        ("uint64", np.uint64()),
        ("float16", np.float16()),
        ("float32", np.float32()),
        ("float64", np.float64()),
    ]
    # Make sure the SDK doesn't choke and JSON serialization works
    exp = Experiment("MNIST")
    for name, num in nums_to_test:
        exp.metric("test_metric_{}".format(name), num)
        exp.param("test_param_{}".format(name), num)
    exp.end()

    # Test params match what is expected.
    # (The original built each dict imperatively and assigned
    # "is_internal" twice; comprehensions say the same thing once.)
    params_messages = [
        msg["payload"] for msg in server_sdk_messages
        if "params" in msg["payload"]
    ]
    expected_params = [
        {
            "params": {"test_param_{}".format(name): num},
            "is_internal": False,
        }
        for name, num in nums_to_test
    ]
    assert len(expected_params) == len(params_messages)
    for i, message in enumerate(params_messages):
        print(message)
        print(expected_params[i])
        assert message == expected_params[i]

    # Test metrics match what is expected
    metrics_messages = [
        msg["payload"] for msg in server_sdk_messages
        if "name" in msg["payload"]
    ]
    expected_metrics = [
        {
            "name": "test_metric_{}".format(name),
            "value": num,
            "is_internal": False,
        }
        for name, num in nums_to_test
    ]
    assert len(expected_metrics) == len(metrics_messages)
    for i, message in enumerate(metrics_messages):
        assert message == expected_metrics[i]
def test_experiment(self):
    """End-to-end smoke test of the Experiment API.

    Runs a small job with stdout patched, then checks: the param/metric
    messages sent to the fake server (values and order), the API headers,
    that nothing was printed containing "error", and that the log file was
    written to disk with the expected lines.
    """
    # Run a test job via the Experiment API
    # Make sure log file is where is supposed to be
    # look at decorator
    # verify run start/stop is sent
    with patch("sys.stdout", new=StringIO()) as faked_out:
        exp = Experiment("MNIST")
        exp.log("test print")
        exp.param("batch size", 32)
        for i in exp.iter(2):
            time.sleep(1)
            exp.metric("accuracy", i * 0.2)
        time.sleep(0.1)
        exp.end()
    # Test params match what is expected
    params_messages = []
    for msg in server_sdk_messages:
        payload = msg["payload"]
        if "params" in payload:
            params_messages.append(payload)
    expect_params = [
        {
            "params": {
                "batch size": 32,
            },
            "is_internal": False,
        },
        {
            # exp.iter() records its epoch count as an internal param.
            "params": {
                "hd_iter_0_epochs": 2,
            },
            "is_internal": True,
        },
    ]
    assert len(expect_params) == len(params_messages)
    for i, message in enumerate(params_messages):
        assert message == expect_params[i]
    # Test metrics match what is expected
    metrics_messages = []
    for msg in server_sdk_messages:
        payload = msg["payload"]
        if "name" in payload:
            metrics_messages.append(payload)
    # exp.iter() emits an internal hd_iter_0 metric alongside each
    # user-supplied "accuracy" metric.
    expect_metrics = [
        {
            "is_internal": True,
            "name": "hd_iter_0",
            "value": 0
        },
        {
            "is_internal": False,
            "name": "accuracy",
            "value": 0
        },
        {
            "is_internal": True,
            "name": "hd_iter_0",
            "value": 1
        },
        {
            "is_internal": False,
            "name": "accuracy",
            "value": 0.2
        },
    ]
    assert len(expect_metrics) == len(metrics_messages)
    for i, message in enumerate(metrics_messages):
        assert message == expect_metrics[i]
    captured_out = faked_out.getvalue()
    assert "error" not in captured_out
    # Make sure correct API name / version headers are sent
    assert server_sdk_headers[0][API_KEY_NAME] == API_NAME_EXPERIMENT
    assert server_sdk_headers[0][
        VERSION_KEY_NAME] == get_hyperdash_version()
    # Make sure logs were persisted
    expect_logs = [
        "{ batch size: 32 }",
        "test print",
        "| Iteration 0 of 1 |",
        "| accuracy: 0.000000 |",
    ]
    log_dir = get_hyperdash_logs_home_path_for_job("MNIST")
    # The newest file by mtime is the one this run just wrote.
    latest_log_file = max([
        os.path.join(log_dir, filename) for filename in os.listdir(log_dir)
    ], key=os.path.getmtime)
    with open(latest_log_file, "r") as log_file:
        data = log_file.read()
        for log in expect_logs:
            assert_in(log, data)
    os.remove(latest_log_file)
def main():
    """Train a classifier with checkpointing, logging, and Hyperdash metrics.

    Seeds RNGs, dynamically imports the model class named by the CLI args,
    sets up SGD + cross-entropy training, snapshots the scripts and args into
    a fresh result directory, then trains/validates for args.training_epoch
    epochs, saving the best checkpoint by validation accuracy.
    """
    args = parse_args()
    # set random seed
    #logger.info('> set random seed {}'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    # Set up Devices
    #logger.info('> set gpu device {}'.format(args.gpus))
    num_cuda_devices = utils.set_devices(args.gpus)
    # Load model
    #logger.info('> load model {}'.format(args.model_name))
    # Turn a file path like "models/net.py" into a module path "models.net"
    # so it can be imported with import_module.
    ext = os.path.splitext(args.model_file)[1]
    model_path = '.'.join(os.path.split(args.model_file)).replace(ext, '')
    model = import_module(model_path)
    model = getattr(model, args.model_name)(args.output_class)
    if num_cuda_devices > 0:
        model = torch.nn.DataParallel(model)
        model.cuda()
    logger.info('> set optimizer')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.initial_lr, momentum=args.lr_momentum)
    # Create result dir; also snapshot the scripts and args for reproducibility.
    result_dir = create_result_dir(args.model_name)
    fh_handler = logging.FileHandler(os.path.join(result_dir, "log"))
    fh_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(fh_handler)
    shutil.copy(args.model_file, os.path.join(result_dir, os.path.basename(args.model_file)))
    script_file_list = glob.glob('./*.py') + glob.glob('./*.sh')
    for file_name in script_file_list:
        shutil.copy(file_name, os.path.join(result_dir, os.path.basename(file_name)))
    with open(os.path.join(result_dir, 'args'), 'w') as fp:
        fp.write(json.dumps(vars(args)))
    print(json.dumps(vars(args), sort_keys=True, indent=4))
    # Create Dataset
    logger.info('> Creating DataSet')
    train_transform = partial(transforms.transform_f, random_angle=args.random_angle, expand_ratio=args.expand_ratio, crop_size=args.crop_size, train=True)
    train = getdataset.getCcoreDataset(args.train_json, train_transform, args.train_mode)
    # NOTE(review): the validation set is built from args.train_json with
    # train=True augmentation — presumably a val json / train=False was
    # intended; confirm before relying on val metrics.
    val_transform = partial(transforms.transform_f, random_angle=args.random_angle, expand_ratio=args.expand_ratio, crop_size=args.crop_size, train=True)
    val = getdataset.getCcoreDataset(args.train_json, val_transform, args.train_mode)
    # Create DataLoader
    logger.info('> create dataloader')
    train_loader = torch.utils.data.DataLoader(train, batch_size=args.batchsize, shuffle=True, num_workers=4)
    val_loader = torch.utils.data.DataLoader(val, batch_size=args.batchsize, shuffle=False, num_workers=4)
    # Training
    logger.info('> run training')
    best_prec = 0
    # Create Hyperdash Experiment
    logger.info('> Create Hyperdash Experiment {}'.format(
        args.experiment_name))
    exp = Experiment(args.experiment_name, api_key_getter=utils.get_api_key_from_env)
    for epoch in tqdm(range(args.training_epoch)):
        training_result = training(train_loader, model, criterion, optimizer)
        val_result = validate(val_loader, model, criterion)
        result_str = 'epoch : {} / {}\
main/loss : {:.3f}\
main/acc : {:.3f}\
val/loss : {:.3f}\
val/acc : {:.3f}'.format(epoch, args.training_epoch, training_result['loss'], training_result['acc'], val_result['loss'], val_result['acc'])
        logger.info(result_str)
        exp.log(result_str)
        prec1 = val_result['acc']
        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec
        best_prec = max(prec1, best_prec)
        if is_best:
            save_checkpoint(
                state={
                    'epoch': epoch + 1,
                    #'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=is_best,
                result_dir=result_dir)
        exp.metric('main/loss', training_result['loss'])
        exp.metric('val/loss', val_result['loss'])
    logger.info('> end training')
    exp.end()
def main():
    """Train a fully-connected VAE on duckietown images and plot the losses.

    Configuration is hard-coded (no argparse).  After each epoch the model
    state is checkpointed and train/test losses are streamed to Hyperdash;
    at the end the loss curves are plotted with matplotlib.
    """
    exp = Experiment("duckietown_vae")
    model_name = 'models/duckietown_vae_fc_model.pt'
    # changed configuration to this instead of argparse for easier interaction
    CUDA = True
    SEED = 1
    BATCH_SIZE = 32
    LOG_INTERVAL = 10
    EPOCHS = 25
    # connections through the autoencoder bottleneck
    # in the pytorch VAE example, this is 20
    ZDIMS = 100
    torch.manual_seed(SEED)
    if CUDA:
        torch.cuda.manual_seed(SEED)
    # DataLoader instances will load tensors directly into GPU memory
    kwargs = {'num_workers': 1, 'pin_memory': True} if CUDA else {}
    # Download or load downloaded MNIST dataset
    # shuffle data at every epoch
    #train_loader = torch.utils.data.DataLoader(
    #datasets.MNIST('data', train=True, download=True,
    #transform=transforms.ToTensor()),
    #batch_size=BATCH_SIZE, shuffle=True, **kwargs)
    # Same for test data
    #test_loader = torch.utils.data.DataLoader(
    #datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    #batch_size=BATCH_SIZE, shuffle=True, **kwargs)
    train_loader = load_dataset('images/train', BATCH_SIZE)
    test_loader = load_dataset('images/test', BATCH_SIZE)
    # 64x64 RGB images flattened into the FC encoder.
    vae = FC_VAE(input_size=64 * 64 * 3, HIDDEN_1=400, batch_size=BATCH_SIZE, NUM_Z=ZDIMS, learning_rate=1e-3, CUDA=CUDA)
    #vae = Conv_VAE(batchsize=BATCH_SIZE)
    vae.cuda()
    total_test_loss = []
    total_train_loss = []
    for epoch in range(1, EPOCHS + 1):
        train_loss = vae.train_model(epoch, train_loader, LOG_INTERVAL)
        total_train_loss.append(train_loss)
        exp.metric("train_loss", train_loss.item())
        test_loss = vae.test_model(epoch, test_loader)
        total_test_loss.append(test_loss)
        exp.metric("test_loss", test_loss.item())
        # Checkpoint after every epoch (same file, overwritten each time).
        torch.save(vae.state_dict(), model_name)
    # NOTE(review): exp.end() is never called, so the Hyperdash experiment is
    # left open when main() returns — confirm whether that is intended.
    # 64 sets of random ZDIMS-float vectors, i.e. 64 locations / MNIST
    # digits in latent space
    #sample = Variable(torch.randn(64, ZDIMS))
    #if CUDA:
    #sample = sample.cuda()
    #sample = vae.decode(sample).cpu()
    # save out as an 8x8 matrix of MNIST digits
    # this will give you a visual idea of how well latent space can generate things
    # that look like digits
    #save_image(sample.data.view(64, 1, 28, 28), 'results/sample_' + str(epoch) + '.png')
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    line, = ax.plot(range(EPOCHS), total_train_loss, color='blue', lw=2, label='Train')
    line, = ax.plot(range(EPOCHS), total_test_loss, color='red', lw=2, label='Test')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    ax.legend()
    plt.show()


#main()
target = Variable(torch.from_numpy(ds.next_real[epi, frame])).cuda() loss = loss_function(new_obs, target) losses += loss loss_epi += loss.clone().cpu().data.numpy()[0] diff_epi += F.mse_loss( tmp_var(ds.current_real[epi, frame]), tmp_var(ds.next_real[epi, frame])).clone().cpu().data.numpy()[0] losses.backward() optimizer.step() losses.detach_() del losses del loss env.net.hidden[0].detach_() env.net.hidden[1].detach_() env.net.zero_grad() env.net.zero_hidden() optimizer.zero_grad() exp.metric("loss episode", loss_epi) exp.metric("diff episode", diff_epi) exp.metric("epoch", epoch) saveModel(env.net.state_dict())
def train_reconstruction(train_loader, test_loader, encoder, decoder, args): exp = Experiment("Reconstruction Training") #vis = Visualizations() vis = visdom.Visdom(port=8098) try: lr = args.lr encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr) decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr) encoder.train() decoder.train() steps = 0 all_losses = [] for epoch in range(1, args.epochs + 1): epoch_losses = [] print("=======Epoch========") print(epoch) for batch in train_loader: feature = batch # Variable if args.use_cuda: encoder.cuda() decoder.cuda() feature = feature.cuda() encoder_opt.zero_grad() decoder_opt.zero_grad() h = encoder(feature) prob = decoder(h) reconstruction_loss = compute_cross_entropy(prob, feature) reconstruction_loss.backward() encoder_opt.step() decoder_opt.step() print("Epoch: {}".format(epoch)) print("Steps: {}".format(steps)) print("Loss: {}".format(reconstruction_loss.item() / args.sentence_len)) exp.metric("Loss", reconstruction_loss.item() / args.sentence_len) epoch_losses.append(reconstruction_loss.item()) # check reconstructed sentence if steps % args.log_interval == 0: print("Test!!") input_data = feature[0] single_data = prob[0] _, predict_index = torch.max(single_data, 1) input_sentence = transform_id2word( input_data.data, train_loader.dataset.index2word, lang="en") predict_sentence = transform_id2word( predict_index.data, train_loader.dataset.index2word, lang="en") print("Input Sentence:") print(input_sentence) print("Output Sentence:") print(predict_sentence) steps += 1 # Visualization data epoch_loss = sum(epoch_losses) / float(len(epoch_losses)) all_losses.append(epoch_loss) if epoch == 1: # vis.plot_loss(np.mean(epoch_losses), steps) win = vis.line(X=np.array((epoch, )), Y=np.array((epoch_loss, )), name="train_loss", opts=dict(xlabel='Epoch', ylabel='Loss', title='Train and Eval Loss')) else: vis.line(X=np.array((epoch, )), Y=np.array((epoch_loss, )), name="train_loss", update="append", win=win) 
#epoch_losses.clear() if epoch % args.test_interval == 0: eval_reconstruction(encoder, decoder, test_loader, args, vis, win, epoch) if epoch % args.lr_decay_interval == 0: # decrease learning rate lr = lr / 1.05 encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr) decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr) encoder.train() decoder.train() if epoch % args.save_interval == 0: save_models(encoder, args.save_dir, "encoder", steps) save_models(decoder, args.save_dir, "decoder", steps) if epoch % 20 == 0: # finalization # save vocabulary #with open("word2index", "wb") as w2i, open("index2word", "wb") as i2w: # pickle.dump(train_loader.dataset.word2index, w2i) # pickle.dump(train_loader.dataset.index2word, i2w) torch.save(train_loader.dataset.index2word, "/home/avshalom/ext/ae_cnn_code/index2word.pt") torch.save(train_loader.dataset.word2index, "/home/avshalom/ext/ae_cnn_code/word2index.pt") # save models #save_models(encoder, args.save_dir, "encoder", "final") #save_models(decoder, args.save_dir, "decoder", "final") torch.save( encoder, "/home/avshalom/ext/ae_cnn_code/encoder_lsize_%s_epoch_%s.pt" % (args.latent_size, epoch)) print("Finish!!!") finally: exp.end()
def calc_accuricy():
    # NOTE(review): despite its name ("accuracy"), this reports the SUM of
    # test-set losses under the metric label 'accuracy' — the metric name is
    # misleading; confirm intent.  Relies on module-level loader_test, net,
    # loss_func, and exp.
    error = 0
    for step, (batch_x, batch_y) in enumerate(loader_test):  # for each training step
        b_x = Variable(batch_x)
        b_y = Variable(batch_y)
        prediction = net(b_x)
        loss = loss_func(prediction, b_y)  # must be (1. nn output, 2. target)
        error += float(loss)
    exp.metric('accuracy', error)


# start training
for epoch in range(EPOCH):
    exp.metric('epoch', epoch)
    for step, (batch_x, batch_y) in enumerate(loader_train):  # for each training step
        b_x = Variable(batch_x)
        b_y = Variable(batch_y)
        prediction = net(b_x)  # input x and predict based on x
        loss = loss_func(prediction, b_y)  # must be (1. nn output, 2. target)
        # Only log losses below the threshold to keep the chart scale usable.
        if float(loss) < 15000:
            exp.metric('loss', float(loss))
        # NOTE(review): running a full test-set pass every training step is
        # very expensive — confirm this wasn't meant to be per-epoch.
        calc_accuricy()
        optimizer.zero_grad()  # clear gradients for next train
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()  # apply gradients
while not done: action = agent.choose_action(observation) observation_, reward, done, info = env.step(action) score += reward if not load_checkpoint: agent.store_transition(observation, action, reward, observation_, int(done)) agent.learn() observation = observation_ n_steps += 1 scores.append(score) steps_array.append(n_steps) avg_score = np.mean(scores[-100:]) exp.metric("epsilon", agent.epsilon) exp.metric("score", score) print('episode ', i, 'score: ', score, 'average score %.1f best score %.1f epsilon %.2f' % (avg_score, best_score, agent.epsilon), 'steps ', n_steps) if avg_score > best_score: if not load_checkpoint: agent.save_models() best_score = avg_score eps_history.append(agent.epsilon) plot_learning_curve(steps_array, scores, eps_history, figure_file) exp.end()
class NeuralNet(object):
    """Feed-forward network predicting a per-planet ship-allocation softmax.

    Builds its own tf.Graph/Session.  When ``training`` is True, metrics and
    params are streamed to a Hyperdash experiment; the hyperdash import is
    deferred so inference-only use has no dependency on it.
    """

    # NOTE(review): these size constants are not referenced below — the
    # layers are built with hard-coded widths 512/256/128/64/32/1, and
    # LAYER1_SIZE (522) does not match layer1's 512; confirm which is intended.
    LAYER1_SIZE = 522  # 12
    LAYER2_SIZE = 256  # 6
    LAYER3_SIZE = 128
    LAYER4_SIZE = 64
    LAYER5_SIZE = 32
    OUTPUT_SIZE = 1

    def __init__(self, name='nn-model', cached_model=None, seed=None, lr=1e-4, training=False):
        self.graph = tf.Graph()
        self.training = training
        if self.training:
            # Deferred import: hyperdash is only required for training runs.
            from hyperdash import Experiment
            self.exp = Experiment(name)
        with self.graph.as_default():
            if seed is not None:
                tf.set_random_seed(seed)
            self.session = tf.Session()
            self.features = tf.placeholder(dtype=tf.float32,
                                           name="input_features",
                                           shape=(None, PLANET_MAX_NUM, PER_PLANET_FEATURES))
            # target_distribution describes what the bot did in a real game.
            # For instance, if it sent 20% of the ships to the first planet and 15% of the ships to the second planet,
            # then expected_distribution = [0.2, 0.15 ...]
            self.target_distribution = tf.placeholder(
                dtype=tf.float32,
                name="target_distribution",
                shape=(None, PLANET_MAX_NUM))
            # Combine all the planets from all the frames together, so it's easier to share
            # the weights and biases between them in the network.
            flattened_frames = tf.reshape(self.features, [-1, PER_PLANET_FEATURES])
            layer1 = fully_connected(flattened_frames, 512)
            layer2 = fully_connected(layer1, 256)
            layer3 = fully_connected(layer2, 128)
            # Group back into frames
            layer4 = fully_connected(layer3, 64)
            layer5 = fully_connected(layer4, 32)
            # Final layer is linear; softmax is applied after regrouping.
            layer6 = fully_connected(layer5, 1, activation_fn=None)
            logits = tf.reshape(layer6, [-1, PLANET_MAX_NUM])
            self.prediction_normalized = tf.nn.softmax(logits)
            self.loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=self.target_distribution))
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=lr)  # returns Op
            self.train_op = self.optimizer.minimize(self.loss_op)
            # self.acc_op = tf.reduce_mean(tf.reduce_min(tf.cast(self.prediction_normalized, tf.float32), 1))
            # self.acc, self.update_acc_op = tf.metrics.mean_per_class_accuracy(self.target_distribution, self.prediction_normalized, 28)
            # multilabel_accuracy(self.prediction_normalized, self.target_distribution)
            self.saver = tf.train.Saver()
            if self.training:
                self.exp.param("lr", lr)
            if cached_model is None:
                self.session.run([
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                ])
            else:
                # Restore weights from a previously saved checkpoint.
                self.session.run(tf.local_variables_initializer())
                self.saver.restore(self.session, cached_model)

    def fit(self, input_data, expected_output_data):
        """Run one optimization step and return the batch training loss."""
        loss, _ = self.session.run(
            [self.loss_op, self.train_op],
            feed_dict={
                self.features: normalize_input(input_data),
                self.target_distribution: expected_output_data
            })
        if self.training:
            self.exp.metric("training_loss", loss)
        return loss

    def predict(self, input_data):
        """
        Given data from 1 frame, predict where the ships should be sent.

        :param input_data: numpy array of shape (PLANET_MAX_NUM, PER_PLANET_FEATURES)
        :return: 1-D numpy array of length (PLANET_MAX_NUM) describing percentage of ships that should be sent to each planet
        """
        return self.session.run(
            self.prediction_normalized,
            feed_dict={self.features: normalize_input(np.array([input_data]))})[0]

    def compute_loss(self, input_data, expected_output_data):
        """
        Compute loss on the input data without running any training.

        :param input_data: numpy array of shape (number of frames, PLANET_MAX_NUM, PER_PLANET_FEATURES)
        :param expected_output_data: numpy array of shape (number of frames, PLANET_MAX_NUM)
        :return: training loss on the input data
        """
        loss = self.session.run(self.loss_op,
                                feed_dict={
                                    self.features: normalize_input(input_data),
                                    self.target_distribution: expected_output_data
                                })
        if self.training:
            self.exp.metric("val_loss", loss)
        return loss

    def save(self, path):
        """
        Serializes this neural net to given path.

        :param path:
        """
        self.saver.save(self.session, path)
# idn_u_stars = idn_data["idn_u_stars"] # idn_u_preds, idn_f_preds = model.idn_predict(idn_t_stars, idn_x_stars) # idn_error_us = [ # np.linalg.norm(star - pred, 2) / np.linalg.norm(star, 2) # for star, pred in zip(idn_u_stars, idn_u_preds) # ] # for i, e in enumerate(idn_error_us): # model.logger.info(f"Data{i} Error u: {e:.3e}") sol_t_star = sol_data["sol_t_star"] sol_x_star = sol_data["sol_x_star"] sol_u_star = sol_data["sol_u_star"] model.train_solver(args.niter * 2, args.scipyopt) u_pred, f_pred = model.solver_predict(sol_t_star, sol_x_star) error_u = np.linalg.norm(sol_u_star - u_pred, 2) / np.linalg.norm( sol_u_star, 2) model.logger.info(f"Error u: {error_u:.3e}") exp.metric("Error u", error_u) sol_X_star = sol_data["sol_X_star"] sol_T = sol_data["sol_T"] sol_X = sol_data["sol_X"] sol_exact = sol_data["sol_exact"] U_pred = griddata(sol_X_star, u_pred.flatten(), (sol_T, sol_X), method="cubic") plt_saver(U_pred, sol_exact, sol_lb, sol_ub, figurename) exp.end()
loss += criterion_contrast(internals[0], internals[1 + answers[b]], 0) # add contrastive loss for the non-matching answers loss += criterion_contrast(internals[0], internals[1 + other_answers[0]], 1) loss += criterion_contrast(internals[0], internals[1 + other_answers[1]], 1) # Backward and optimize optimizer.zero_grad() loss.backward() optimizer.step() exp.metric("epoch", epoch) exp.metric("loss", loss.item()) if (i + 1) % 100 == 0: print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format( epoch + 1, EPOCHS, i + 1, total_step, loss.item())) # Save the model checkpoint torch.save( model.state_dict(), 'model-siam2-s{}-e{}-b{}-lr{}.ckpt'.format(SIZE, EPOCHS, BATCH, LEARNING_RATE)) # # Test the model # model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) # with torch.no_grad():
baseline = np.mean(rewards_raw[:k] + rewards_raw[k + 1:]) rewards.append(rewards_raw[k] - baseline) # calculate additional VAE loss # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) KLD = -0.5 * torch.sum(1 + torch.log(latent_stddev.pow(2)) - latent_mu.pow(2) - latent_stddev.pow(2)) # update model returns = torch.Tensor(rewards).to(device) returns = (returns - returns.mean()) / (returns.std() + eps) policy_loss = [] for log_prob, R in zip(log_probs, returns): policy_loss.append(-log_prob * R) optimizer.zero_grad() policy_loss_sum = torch.cat(policy_loss).sum() + KLD loss_copy = policy_loss_sum.detach().cpu().numpy().copy() policy_loss_sum.backward() optimizer.step() if i_episode % CHKP_FREQ == 0: torch.save( policy.state_dict(), os.path.join(exp_dir, 'reinforce-' + str(i_episode) + '.pkl')) exp.metric("episode", i_episode) exp.metric("rewards", np.mean(rewards_raw)) exp.metric("loss", float(loss_copy))
opt_dec.zero_grad() z, mean = encoder(inputs) z, mu, logvar = ca(z) recon_batch = decoder(z) loss = loss_function(recon_batch, inputs, mu, logvar) loss.backward() opt_enc.step() opt_ca.step() opt_dec.step() exp.metric('loss', loss.item()) end_t = time.time() if idx % 15 == 0: print('''[%d/%d][%d/%d] Loss: %.2f Time: %.2fs''' % (i, args.epochs, idx, len(loader), loss.item(), end_t - start_t)) e_time = 100 * len(loader) * (end_t - start_t) / 60 / 60 print(e_time, "h") if i % 3 == 0: show_plts_mono(inputs[0],recon_batch[0], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_1.png") show_plts_mono(inputs[1],recon_batch[1], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_2.png") show_plts_mono(inputs[2],recon_batch[2], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_3.png") torch.save(encoder.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/encoder_epoch{i}_{idx}.pth") torch.save(ca.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/ca_epoch{i}_{idx}.pth") torch.save(decoder.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/decoder_epoch{i}_{idx}.pth")
def train(train_list, test_list, lr, epoch, batchsize, insize, outsize,
          save_interval=10, weight_decay=5e-4, lr_step=10,
          model_name='resnet34', loss_name='focal_loss',
          metric_name='arc_margin', optim_name='adam', num_workers=4,
          print_freq=1e+6, debug=False):
    """Train a backbone + margin-product head and log the run to Hyperdash.

    Args:
        train_list, test_list: paths to the image/label list files consumed
            by the project ``Dataset`` class.
        lr: initial learning rate, decayed by x0.1 every ``lr_step`` epochs.
        epoch: total number of training epochs.
        batchsize: minibatch size for both loaders.
        insize, outsize: model input size and embedding dimension.
        save_interval: checkpoint the model every N epochs.
        weight_decay: L2 regularisation for the optimizer.
        lr_step: StepLR period (epochs).
        model_name: one of resnet18/34/50/101/152, shuffle, simplev1.
        loss_name: 'focal_loss' or anything else for CrossEntropyLoss.
        metric_name: 'add_margin', 'arc_margin', 'sphere', else plain Linear.
        optim_name: 'sgd' or 'adam'.
        num_workers: DataLoader worker count.
        print_freq: log training stats every N iterations.
        debug: forward debug flag to Dataset and force per-iteration logging.

    Side effects: writes checkpoints, ``train_config.json`` and
    ``history.csv`` under ``logs/<timestamp>/``; streams metrics to a
    Hyperdash experiment named after the timestamp. Returns None.
    """
    device = torch.device("cuda")

    # --- data -------------------------------------------------------------
    train_dataset = Dataset(train_list, mode='train', insize=insize, debug=debug)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=batchsize,
                                              shuffle=True,
                                              num_workers=num_workers)
    test_dataset = Dataset(test_list, mode='test', insize=insize, debug=debug)
    testloader = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=batchsize,
                                             shuffle=False,
                                             num_workers=num_workers)
    class_num = train_dataset.get_classnum()
    print('{} train iters per epoch:'.format(len(trainloader)))
    print('{} test iters per epoch:'.format(len(testloader)))

    # --- loss -------------------------------------------------------------
    if loss_name == 'focal_loss':
        criterion = FocalLoss(gamma=2)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # --- backbone ---------------------------------------------------------
    if model_name == 'resnet18':
        model = resnet_face18(insize, outsize)
    elif model_name == 'resnet34':
        model = resnet34(insize, outsize)
    elif model_name == 'resnet50':
        model = resnet50(insize, outsize)
    elif model_name == 'resnet101':
        model = resnet101(insize, outsize)
    elif model_name == 'resnet152':
        model = resnet152(insize, outsize)
    elif model_name == 'shuffle':
        model = ShuffleFaceNet(outsize)
    elif model_name == 'simplev1':
        model = CNNv1(insize, outsize, activation='relu', kernel_pattern='v1')
    else:
        raise ValueError('Invalid model name: {}'.format(model_name))

    # --- metric head ------------------------------------------------------
    if metric_name == 'add_margin':
        metric_fc = AddMarginProduct(outsize, class_num, s=30, m=0.35)
    elif metric_name == 'arc_margin':
        metric_fc = ArcMarginProduct(outsize, class_num, s=30, m=0.5,
                                     easy_margin=False)
    elif metric_name == 'sphere':
        metric_fc = SphereProduct(outsize, class_num, m=4)
    else:
        metric_fc = nn.Linear(outsize, class_num)

    # view_model(model, opt.input_shape)
    print(model)
    model.to(device)
    model = DataParallel(model)
    metric_fc.to(device)
    metric_fc = DataParallel(metric_fc)

    # --- optimizer / scheduler -------------------------------------------
    assert optim_name in ['sgd', 'adam']
    params = [{'params': model.parameters()},
              {'params': metric_fc.parameters()}]
    if optim_name == 'sgd':
        optimizer = torch.optim.SGD(params, lr=lr, weight_decay=weight_decay)
    else:
        optimizer = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)
    # NOTE(review): scheduler.step() is commented out in the loop below, so
    # this schedule is currently inert — confirm whether decay is intended.
    scheduler = StepLR(optimizer, step_size=lr_step, gamma=0.1)

    # --- run bookkeeping --------------------------------------------------
    start = time.time()
    training_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    hyperdash_exp = Experiment(training_id)
    checkpoints_dir = os.path.join('logs', training_id)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)
    logging_path = os.path.join(checkpoints_dir, 'history.csv')

    config = {
        'train_list': train_list,
        'test_list': test_list,
        'lr': lr,
        'epoch': epoch,
        'batchsize': batchsize,
        'insize': insize,
        'outsize': outsize,
        'save_interval': save_interval,
        'weight_decay': weight_decay,
        'lr_step': lr_step,
        'model_name': model_name,
        'loss_name': loss_name,
        'metric_name': metric_name,
        'optim_name': optim_name,
        'num_workers': num_workers,
        'debug': debug,
    }
    for k, v in config.items():
        hyperdash_exp.param(k, v, log=False)
    with open(os.path.join(checkpoints_dir, 'train_config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    with open(logging_path, 'w') as f:
        f.write('epoch,time_elapsed,train_loss,train_acc,test_loss,test_acc\n')

    prev_time = datetime.datetime.now()
    for i in range(epoch):
        # ----- train one epoch -----
        model.train()
        for ii, data in enumerate(tqdm(trainloader, disable=True)):
            data_input, label = data
            data_input = data_input.to(device)
            label = label.to(device).long()
            feature = model(data_input)
            output = metric_fc(feature, label)
            loss = criterion(output, label)
            pred_classes = np.argmax(output.data.cpu().numpy(), axis=1)
            acc = np.mean(
                (pred_classes == label.data.cpu().numpy()).astype(int))
            optimizer.zero_grad()
            loss.backward()
            #import pdb; pdb.set_trace()
            optimizer.step()
            #scheduler.step()
            iters = i * len(trainloader) + ii
            if iters % print_freq == 0 or debug:
                speed = print_freq / (time.time() - start)
                time_str = time.asctime(time.localtime(time.time()))
                print('{} train epoch {} iter {} {} iters/s loss {} acc {}'.
                      format(time_str, i, ii, speed, loss.item(), acc))
                start = time.time()

        # ----- evaluate -----
        # NOTE(review): test_loss / test_acc end up reflecting only the LAST
        # test batch (the loop overwrites them) — confirm this is intended.
        model.eval()
        for ii, data in enumerate(tqdm(testloader, disable=True)):
            data_input, label = data
            data_input = data_input.to(device)
            label = label.to(device).long()
            feature = model(data_input)
            output = metric_fc(feature, label)
            test_loss = criterion(output, label)
            output = np.argmax(output.data.cpu().numpy(), axis=1)
            test_acc = np.mean(
                (output == label.data.cpu().numpy()).astype(int))

        # ----- checkpoint -----
        # FIX: the original condition `i == epoch` was never true (i is in
        # range(epoch)), so the final epoch was never checkpointed.
        if i % save_interval == 0 or i == epoch - 1:
            save_model(model.module, checkpoints_dir, model_name, i)
            save_model(metric_fc.module, checkpoints_dir, metric_name, i)

        # ----- log epoch -----
        new_time = datetime.datetime.now()
        with open(logging_path, 'a') as f:
            f.write('{},{},{},{},{},{}\n'.format(
                i, (new_time - prev_time).total_seconds(),
                loss.item(), acc, test_loss.item(), test_acc))
        prev_time = datetime.datetime.now()
        hyperdash_exp.metric('train_loss', loss.item(), log=False)
        hyperdash_exp.metric('train_acc', acc, log=False)
        hyperdash_exp.metric('test_loss', test_loss.item(), log=False)
        hyperdash_exp.metric('test_acc', test_acc, log=False)

    hyperdash_exp.end()
    print('Finished {}'.format(training_id))
# digits.py from sklearn import svm, datasets from hyperdash import Experiment # Preprocess data digits = datasets.load_digits(100) test_cases = 50 X_train, y_train = digits.data[:-test_cases], digits.target[:-test_cases] X_test, y_test = digits.data[-test_cases:], digits.target[-test_cases:] # Create an experiment with a model name, then autostart exp = Experiment("Digits Classifier") # Record the value of hyperparameter gamma for this experiment gamma = exp.param("gamma", 0.1) # Param can record any basic type (Number, Boolean, String) classifer = svm.SVC(gamma=gamma) classifer.fit(X_train, y_train) # Record a numerical performance metric exp.metric("accuracy", classifer.score(X_test, y_test)) # Cleanup and mark that the experiment successfully completed exp.end()
loss.detach_() net.hidden[0].detach_() net.hidden[1].detach_() net.zero_grad() net.zero_hidden() optimizer.zero_grad() loss_buffer.zero_() del loss_concat x_buf = [] y_buf = [] epi_x_old = epi_x if exp is not None: exp.metric("loss episode la", loss_episode_la) exp.metric("loss episode lb", loss_episode_lb) exp.metric("loss episode cc", loss_episode_cc) exp.metric("diff episode", diff_episode) exp.metric("epoch", epoch) x_buf.append(x.squeeze(0)) y_buf.append(y.squeeze(0)) # Validation step loss_total_la = [] loss_total_lb = [] loss_total_cc = [] diff_total = []
class GAN(object):
    """DCGAN-style trainer: builds Generator/Discriminator, trains them
    adversarially, and logs to TensorBoard, CSV and (optionally) Hyperdash.

    All run artifacts go under ``log/<checkpoint_dir_name or timestamp>/``
    (tensorboard/, model/, csv/).
    """

    def __init__(self):
        # Resolve the run directory: reuse a named checkpoint dir if given,
        # otherwise create a fresh timestamped one.
        warnings.filterwarnings('ignore')
        self.start_time = time()
        self.args = get_args()
        if self.args.checkpoint_dir_name:
            dir_name = self.args.checkpoint_dir_name
        else:
            dir_name = datetime.datetime.now().strftime('%y%m%d%H%M%S')
        self.path_to_dir = Path(__file__).resolve().parents[1]
        self.path_to_dir = os.path.join(self.path_to_dir, *['log', dir_name])
        os.makedirs(self.path_to_dir, exist_ok=True)
        # tensorboard
        path_to_tensorboard = os.path.join(self.path_to_dir, 'tensorboard')
        os.makedirs(path_to_tensorboard, exist_ok=True)
        self.writer = SummaryWriter(path_to_tensorboard)
        # model saving
        os.makedirs(os.path.join(self.path_to_dir, 'model'), exist_ok=True)
        path_to_model = os.path.join(self.path_to_dir, *['model', 'model.tar'])
        # csv
        os.makedirs(os.path.join(self.path_to_dir, 'csv'), exist_ok=True)
        self.path_to_results_csv = os.path.join(self.path_to_dir,
                                                *['csv', 'results.csv'])
        path_to_args_csv = os.path.join(self.path_to_dir, *['csv', 'args.csv'])
        # Only a brand-new run dumps its args; resumed runs keep the old file.
        if not self.args.checkpoint_dir_name:
            with open(path_to_args_csv, 'a') as f:
                args_dict = vars(self.args)
                param_writer = csv.DictWriter(f, list(args_dict.keys()))
                param_writer.writeheader()
                param_writer.writerow(args_dict)
        # logging by hyperdash
        if not self.args.no_hyperdash:
            from hyperdash import Experiment
            self.exp = Experiment('Generation task on ' + self.args.dataset +
                                  ' dataset with GAN')
            # Register every CLI arg as a Hyperdash param via attribute
            # assignment (exec is used to build "self.args.<name> = ...").
            for key in vars(self.args).keys():
                exec("self.args.%s = self.exp.param('%s', self.args.%s)" %
                     (key, key, key))
        else:
            self.exp = None
        self.dataloader = get_dataloader(self.args.dataset,
                                         self.args.image_size,
                                         self.args.batch_size)
        # Peek at one batch to discover the image channel count.
        sample_data = self.dataloader.__iter__().__next__()[0]
        image_channels = sample_data.shape[1]
        # Fixed noise (reshaped to NCHW with 1x1 spatial dims) used for the
        # periodic sample grids, so progress is comparable across steps.
        z = torch.randn(self.args.batch_size, self.args.z_dim)
        self.sample_z = z.view(z.size(0), z.size(1), 1, 1)
        self.Generator = Generator(self.args.z_dim, image_channels,
                                   self.args.image_size)
        self.Generator_optimizer = optim.Adam(self.Generator.parameters(),
                                              lr=self.args.lr_Generator,
                                              betas=(self.args.beta1,
                                                     self.args.beta2))
        # Graphs are traced on CPU tensors before the modules move to device.
        self.writer.add_graph(self.Generator, self.sample_z)
        self.Generator.to(self.args.device)
        self.Discriminator = Discriminator(image_channels,
                                           self.args.image_size)
        self.Discriminator_optimizer = optim.Adam(
            self.Discriminator.parameters(),
            lr=self.args.lr_Discriminator,
            betas=(self.args.beta1, self.args.beta2))
        self.writer.add_graph(self.Discriminator, sample_data)
        self.Discriminator.to(self.args.device)
        self.BCELoss = nn.BCELoss()
        self.sample_z = self.sample_z.to(self.args.device)

    def train(self):
        """Run the adversarial training loop for ``n_epoch`` epochs.

        Per step: one Discriminator update on a real and a fake batch, then
        one Generator update; losses are streamed to TensorBoard/CSV/
        Hyperdash and a sample grid is plotted every 10 steps.
        """
        # NOTE(review): despite the name, train_hist holds only the LATEST
        # step's losses (it is overwritten, not appended) — the CSV carries
        # the actual history.
        self.train_hist = {}
        self.train_hist['Generator_loss'] = 0.0
        self.train_hist['Discriminator_loss'] = 0.0
        # real ---> y = 1
        # fake ---> y = 0
        self.y_real = torch.ones(self.args.batch_size, 1).to(self.args.device)
        self.y_fake = torch.zeros(self.args.batch_size, 1).to(self.args.device)
        self.Discriminator.train()
        global_step = 0
        # -----training -----
        for epoch in range(1, self.args.n_epoch + 1):
            self.Generator.train()
            for idx, (x, _) in enumerate(self.dataloader):
                # Drop the final partial batch: labels are sized batch_size.
                if idx == self.dataloader.dataset.__len__(
                ) // self.args.batch_size:
                    break
                z = torch.randn(self.args.batch_size, self.args.z_dim)
                z = z.view(z.size(0), z.size(1), 1, 1)
                z = z.to(self.args.device)
                x = x.to(self.args.device)
                # ----- update Discriminator -----
                # minimize: -{ log[D(x)] + log[1-D(G(z))] }
                self.Discriminator_optimizer.zero_grad()
                # real
                # ---> log[D(x)]
                Discriminator_real, _ = self.Discriminator(x)
                Discriminator_real_loss = self.BCELoss(Discriminator_real,
                                                       self.y_real)
                # fake
                # ---> log[1-D(G(z))]
                Discriminator_fake, _ = self.Discriminator(self.Generator(z))
                Discriminator_fake_loss = self.BCELoss(Discriminator_fake,
                                                       self.y_fake)
                Discriminator_loss = Discriminator_real_loss + Discriminator_fake_loss
                self.train_hist[
                    'Discriminator_loss'] = Discriminator_loss.item()
                Discriminator_loss.backward()
                self.Discriminator_optimizer.step()
                # ----- update Generator -----
                # As stated in the original paper,
                # we want to train the Generator
                # by minimizing log(1-D(G(z)))
                # in an effort to generate better fakes.
                # As mentioned, this was shown by Goodfellow
                # to not provide sufficient gradients,
                # especially early in the learning process.
                # As a fix, we instead wish to maximize log(D(G(z))).
                # ---> minimize: -log[D(G(z))]
                self.Generator_optimizer.zero_grad()
                Discriminator_fake, _ = self.Discriminator(self.Generator(z))
                Generator_loss = self.BCELoss(Discriminator_fake, self.y_real)
                self.train_hist['Generator_loss'] = Generator_loss.item()
                Generator_loss.backward()
                self.Generator_optimizer.step()
                # ----- logging by tensorboard, csv and hyperdash
                # tensorboard
                self.writer.add_scalar('loss/Generator_loss',
                                       Generator_loss.item(), global_step)
                self.writer.add_scalar('loss/Discriminator_loss',
                                       Discriminator_loss.item(), global_step)
                # csv
                with open(self.path_to_results_csv, 'a') as f:
                    result_writer = csv.DictWriter(
                        f, list(self.train_hist.keys()))
                    # Header only once, at the very first step of the run.
                    if epoch == 1 and idx == 0:
                        result_writer.writeheader()
                    result_writer.writerow(self.train_hist)
                # hyperdash
                if self.exp:
                    self.exp.metric('Generator loss', Generator_loss.item())
                    self.exp.metric('Discriminator loss',
                                    Discriminator_loss.item())
                if (idx % 10) == 0:
                    self._plot_sample(global_step)
                global_step += 1
        elapsed_time = time() - self.start_time
        print('\nTraining Finish, elapsed time ---> %f' % (elapsed_time))

    def _plot_sample(self, global_step):
        """Render a grid of images generated from the fixed ``sample_z``
        noise and push the figure to TensorBoard at ``global_step``."""
        with torch.no_grad():
            total_n_sample = min(self.args.n_sample, self.args.batch_size)
            image_frame_dim = int(np.floor(np.sqrt(total_n_sample)))
            samples = self.Generator(self.sample_z)
            # NCHW tensor -> NHWC numpy, rescaled from [-1, 1] to [0, 1]
            # (presumably the Generator ends in tanh — TODO confirm).
            samples = samples.cpu().data.numpy().transpose(0, 2, 3, 1)
            samples = (samples + 1) / 2
            fig = plt.figure(figsize=(24, 15))
            for i in range(image_frame_dim * image_frame_dim):
                # Grid is twice as wide as it is tall; images land in the
                # right half via the computed subplot index.
                ax = fig.add_subplot(
                    image_frame_dim, image_frame_dim * 2,
                    (int(i / image_frame_dim) + 1) * image_frame_dim + i + 1,
                    xticks=[], yticks=[])
                if samples[i].shape[2] == 3:
                    ax.imshow(samples[i])
                else:
                    ax.imshow(samples[i][:, :, 0], cmap='gray')
            self.writer.add_figure('sample images generated by GAN', fig,
                                   global_step)
def main():
    """Train and evaluate the CIFAR10 CNN, logging to TensorBoard, CSV and
    (optionally) Hyperdash.

    Artifacts go under ``log/<checkpoint_dir_name or timestamp>/``; passing
    ``--checkpoint-dir-name`` resumes from that directory's model.tar.
    """
    start_time = time()
    args = get_args()
    if args.checkpoint_dir_name:
        dir_name = args.checkpoint_dir_name
    else:
        dir_name = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    path_to_dir = Path(__file__).resolve().parents[1]
    path_to_dir = os.path.join(path_to_dir, *['log', dir_name])
    os.makedirs(path_to_dir, exist_ok=True)
    # tensorboard
    path_to_tensorboard = os.path.join(path_to_dir, 'tensorboard')
    os.makedirs(path_to_tensorboard, exist_ok=True)
    writer = SummaryWriter(path_to_tensorboard)
    # model saving
    os.makedirs(os.path.join(path_to_dir, 'model'), exist_ok=True)
    path_to_model = os.path.join(path_to_dir, *['model', 'model.tar'])
    # csv
    os.makedirs(os.path.join(path_to_dir, 'csv'), exist_ok=True)
    path_to_results_csv = os.path.join(path_to_dir, *['csv', 'results.csv'])
    path_to_args_csv = os.path.join(path_to_dir, *['csv', 'args.csv'])
    # Only a fresh run records its args; a resumed run keeps the old file.
    if not args.checkpoint_dir_name:
        with open(path_to_args_csv, 'a') as f:
            args_dict = vars(args)
            param_writer = csv.DictWriter(f, list(args_dict.keys()))
            param_writer.writeheader()
            param_writer.writerow(args_dict)
    # logging using hyperdash
    if not args.no_hyperdash:
        from hyperdash import Experiment
        exp = Experiment('Classification task on CIFAR10 dataset with CNN')
        # Register every CLI arg as a Hyperdash param (attribute assignment
        # built via exec, mirroring the original pattern).
        for key in vars(args).keys():
            exec("args.%s = exp.param('%s', args.%s)" % (key, key, key))
    else:
        exp = None
    path_to_dataset = os.path.join(
        Path(__file__).resolve().parents[2], 'datasets')
    os.makedirs(path_to_dataset, exist_ok=True)
    train_loader, eval_loader, classes = get_loader(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        path_to_dataset=path_to_dataset)
    # show some of the training images, for fun.
    dataiter = iter(train_loader)
    # FIX: builtin next() — the .next() method does not exist on modern
    # PyTorch DataLoader iterators (Python 2 idiom).
    images, labels = next(dataiter)
    img_grid = torchvision.utils.make_grid(images)
    matplotlib_imshow(img_grid)
    writer.add_image('four_CIFAR10_images', img_grid)
    # define a network, loss function and optimizer
    model = CNN()
    writer.add_graph(model, images)
    model = torch.nn.DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)
    start_epoch = 0
    # resume training
    if args.checkpoint_dir_name:
        print('\nLoading the model...')
        checkpoint = torch.load(path_to_model)
        # FIX: the original called model.state_dict(checkpoint[...]), which
        # does NOT restore weights — resumed runs silently restarted from
        # random initialization.
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    summary(model, input_size=(3, 32, 32))
    model.to(args.device)
    # train the network
    print('\n--------------------')
    print('Start training and evaluating the CNN')
    for epoch in range(start_epoch, args.n_epoch):
        start_time_per_epoch = time()
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, args.device, writer, epoch,
                                      classes)
        eval_loss, eval_acc = eval(eval_loader, model, criterion, args.device)
        elapsed_time_per_epoch = time() - start_time_per_epoch
        result_dict = {
            'epoch': epoch,
            'train_loss': train_loss,
            'eval_loss': eval_loss,
            'train_acc': train_acc,
            'eval_acc': eval_acc,
            'elapsed time': elapsed_time_per_epoch
        }
        with open(path_to_results_csv, 'a') as f:
            result_writer = csv.DictWriter(f, list(result_dict.keys()))
            if epoch == 0:
                result_writer.writeheader()
            result_writer.writerow(result_dict)
        # checkpoint
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, path_to_model)
        if exp:
            exp.metric('train loss', train_loss)
            exp.metric('eval loss', eval_loss)
            exp.metric('train acc', train_acc)
            exp.metric('eval acc', eval_acc)
        else:
            print(result_dict)
        writer.add_scalar('loss/train_loss', train_loss,
                          epoch * len(train_loader))
        writer.add_scalar('loss/eval_loss', eval_loss,
                          epoch * len(eval_loader))
        writer.add_scalar('acc/train_acc', train_acc,
                          epoch * len(train_loader))
        writer.add_scalar('acc/eval_acc', eval_acc,
                          epoch * len(eval_loader))
    elapsed_time = time() - start_time
    print('\nFinished Training, elapsed time ===> %f' % elapsed_time)
    if exp:
        exp.end()
    writer.close()
class BaseTrainer(_BaseTrainer):
    """ Base trainer to make pytorch training be easier.
    Args:
        data-augmentation (bool): Crop randomly and add random noise for data augmentation.
        epoch (int): Number of epochs to train.
        opt (str): Optimization method.
        gpu (bool): Use GPU.
        seed (str): Random seed to train.
        train (str): Path to training image-pose list file.
        val (str): Path to validation image-pose list file.
        batchsize (int): Learning minibatch size.
        out (str): Output directory.
        resume (str): Initialize the trainer from given file.
            The file name is 'epoch-{epoch number}.iter'.
        resume_model (str): Load model definition file to use for resuming training
            (it\'s necessary when you resume a training).
            The file name is 'epoch-{epoch number}.model'.
        resume_opt (str): Load optimization states from this file
            (it\'s necessary when you resume a training).
            The file name is 'epoch-{epoch number}.state'.
    """

    def __init__(self, **kwargs):
        self.data_augmentation = kwargs['data_augmentation']
        self.epoch = kwargs['epoch']
        self.gpu = (kwargs['gpu'] >= 0)
        self.opt = kwargs['opt']
        self.seed = kwargs['seed']
        self.train = kwargs['train']
        self.val = kwargs['val']
        self.batchsize = kwargs['batchsize']
        self.out = kwargs['out']
        self.resume = kwargs['resume']
        self.resume_model = kwargs['resume_model']
        self.resume_opt = kwargs['resume_opt']
        self.hyperdash = kwargs['hyperdash']
        # Optional Hyperdash experiment: created only when a name was given,
        # and seeded with every constructor argument as a param.
        if self.hyperdash:
            self.experiment = Experiment(self.hyperdash)
            for key, val in kwargs.items():
                self.experiment.param(key, val)
        # validate arguments.
        self._validate_arguments()
        self.lowest_loss = 0  # 0 means "no best model recorded yet"
        self.device = torch.device('cuda' if kwargs['gpu'] >= 0 else 'cpu')
        #self.experiment.log_multiple_params(kwargs)
        self.dataloader = torch.utils.data.DataLoader

    def _validate_arguments(self):
        """Fail fast on inconsistent or unusable settings."""
        if self.seed is not None and self.data_augmentation:
            raise NotSupportedError('It is not supported to fix random seed for data augmentation.')
        if self.gpu and not torch.cuda.is_available():
            raise GPUNotFoundError('GPU is not found.')
        #for path in (self.train, self.val):
        #    if not os.path.isfile(path):
        #        raise FileNotFoundError('{0} is not found.'.format(path))
        if self.opt not in ('MomentumSGD', 'Adam'):
            raise UnknownOptimizationMethodError(
                '{0} is unknown optimization method.'.format(self.opt))
        if self.resume is not None:
            for path in (self.resume, self.resume_model, self.resume_opt):
                if not os.path.isfile(path):
                    raise FileNotFoundError('{0} is not found.'.format(path))

    # TODO: make it acceptable multiple optimizer, or define out of this trainer.
    def _get_optimizer(self, model, **kwargs):
        """Build the optimizer named by ``self.opt`` for ``model``."""
        if self.opt == 'MomentumSGD':
            optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
        elif self.opt == "Adam":
            optimizer = optim.Adam(model.parameters())
        else:
            # Fallback for other torch.optim classes (currently unreachable:
            # _validate_arguments restricts opt to MomentumSGD/Adam).
            try:
                # FIX: pass model.parameters() (the original forgot them) and
                # catch AttributeError — that is what getattr raises; the old
                # handler also left `optimizer` unbound before `return`.
                optimizer = getattr(optim, self.opt)(model.parameters(),
                                                     **kwargs)
            except AttributeError:
                print("This optim is not available. See https://pytorch.org/docs/stable/optim.html")
                raise UnknownOptimizationMethodError(
                    '{0} is unknown optimization method.'.format(self.opt))
        return optimizer

    def forward(self, batch, model, criterion, isTest=False):
        """Move one (data, target) batch to the device, run the model and
        return the criterion loss.

        ``isTest`` is accepted because _train/_test pass it (the original
        signature lacked it, making every call a TypeError); it is currently
        unused by this base implementation.
        """
        data, target = map(lambda d: d.to(self.device), batch)
        output = model(data)
        loss = criterion(output, target)
        return loss

    def _train(self, model, optimizer, criterion, train_iter, logger,
               start_time, log_interval=10):
        """Run one training epoch; returns the mean per-batch loss (float)."""
        model.train()
        loss_sum = 0.0
        for iteration, batch in enumerate(tqdm(train_iter, desc='this epoch'), 1):
            optimizer.zero_grad()
            loss = self.forward(batch, model, criterion, isTest=False)
            # FIX: accumulate a Python float — summing tensors kept every
            # batch's autograd graph alive for the whole epoch.
            loss_sum += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
            optimizer.step()
            if self.hyperdash:
                # FIX: log the float loss; int() truncated sub-1.0 losses to 0.
                self.experiment.metric("loss", loss.item(), log=False)
            if iteration % log_interval == 0:
                # loss.data[0] is the deprecated pre-0.4 accessor; use item().
                log = 'elapsed_time: {0}, loss: {1}'.format(
                    time.time() - start_time, loss.item())
                logger.write(log)
        return loss_sum / len(train_iter)

    def _test(self, model, test_iter, criterion, logger, start_time):
        """Evaluate on ``test_iter``; returns the mean per-batch loss."""
        model.eval()
        test_loss = 0
        with torch.no_grad():  # no gradients needed for evaluation
            for batch in test_iter:
                loss = self.forward(batch, model, criterion, isTest=True)
                print('Test loss: {}'.format(loss.data))
                test_loss += loss.item()
        test_loss /= len(test_iter)
        log = 'elapsed_time: {0}, validation/loss: {1}'.format(
            time.time() - start_time, test_loss)
        if self.hyperdash:
            # FIX: test_loss is already a Python float; the original called
            # .cpu().data.numpy() on it, which raised AttributeError.
            self.experiment.metric('test_loss', test_loss)
        logger.write(log)
        return test_loss

    def _checkpoint(self, epoch, model, optimizer, logger):
        """Save periodic checkpoint files epoch-{N}.{iter,model,state}."""
        filename = os.path.join(self.out, 'epoch-{0}'.format(epoch + 1))
        torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()},
                   filename + '.iter')
        torch.save(model.state_dict(), filename + '.model')
        torch.save(optimizer.state_dict(), filename + '.state')

    def _best_checkpoint(self, epoch, model, optimizer, logger):
        """Save the current best model as best_model.{iter,model,state}."""
        filename = os.path.join(self.out, 'best_model')
        torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()},
                   filename + '.iter')
        torch.save(model.state_dict(), filename + '.model')
        torch.save(optimizer.state_dict(), filename + '.state')

    def fit(self, model, train_data, val_data, criterion):
        """ Execute training """
        # set random seed.
        if self.seed is not None:
            random.seed(self.seed)
            torch.manual_seed(self.seed)
            if self.gpu:
                torch.cuda.manual_seed(self.seed)
        # initialize model to train.
        if self.resume_model:
            model.load_state_dict(torch.load(self.resume_model))
        # prepare gpu.
        if self.gpu:
            model.cuda()
        # load the datasets.
        train_iter = self.dataloader(train_data, batch_size=self.batchsize,
                                     shuffle=True)
        val_iter = self.dataloader(val_data, batch_size=3, shuffle=False)
        # set up an optimizer.
        optimizer = self._get_optimizer(model)
        if self.resume_opt:
            optimizer.load_state_dict(torch.load(self.resume_opt))
        # set intervals.
        val_interval = 3
        # FIX: integer interval, floored at 1 — epoch/10 was a float and
        # could be < 1 for short runs, breaking the modulo check.
        resume_interval = max(1, self.epoch // 10)
        log_interval = 10
        # set logger and start epoch.
        logger = TrainLogger(self.out)
        start_epoch = 0
        if self.resume:
            resume = torch.load(self.resume)
            start_epoch = resume['epoch']
            logger.load_state_dict(resume['logger'])
        # start training.
        start_time = time.time()
        loss = 0
        for epoch in trange(start_epoch, self.epoch, initial=start_epoch,
                            total=self.epoch, desc=' total'):
            # FIX: the original passed (train_iter, log_interval, logger,
            # start_time), shifting every argument after train_iter by one
            # (logger received the int log_interval, etc.).
            self._train(model, optimizer, criterion, train_iter, logger,
                        start_time, log_interval)
            if (epoch) % val_interval == 0:
                loss = self._test(model, val_iter, criterion, logger,
                                  start_time)
            if self.lowest_loss == 0 or self.lowest_loss > loss:
                logger.write('Best model updated. loss: {} => {}'.format(
                    self.lowest_loss, loss))
                self._best_checkpoint(epoch, model, optimizer, logger)
                self.lowest_loss = loss
            if (epoch + 1) % resume_interval == 0:
                self._checkpoint(epoch, model, optimizer, logger)
        if self.hyperdash:
            self.experiment.end()

    @staticmethod
    def get_args():
        """Build and parse the command-line arguments for this trainer."""
        # arg definition
        parser = argparse.ArgumentParser(
            description='Training pose net for comparison \
            between chainer and pytorch about implementing DeepPose.')
        parser.add_argument(
            '--data-augmentation', '-a', action='store_true',
            help='Crop randomly and add random noise for data augmentation.')
        parser.add_argument(
            '--epoch', '-e', type=int, default=100,
            help='Number of epochs to train.')
        parser.add_argument(
            '--opt', '-o', type=str, default='Adam',
            choices=['MomentumSGD', 'Adam'], help='Optimization method.')
        parser.add_argument(
            '--gpu', '-g', type=int, default=0,
            help='GPU ID (negative value indicates CPU).')
        parser.add_argument(
            '--seed', '-s', type=int, help='Random seed to train.')
        parser.add_argument(
            '--train', type=str, default='data/train',
            help='Path to training image-pose list file.')
        parser.add_argument(
            '--val', type=str, default='data/test',
            help='Path to validation image-pose list file.')
        parser.add_argument(
            '--batchsize', type=int, default=32,
            help='Learning minibatch size.')
        parser.add_argument(
            '--out', default='result', help='Output directory')
        parser.add_argument(
            '--resume', default=None,
            help='Initialize the trainer from given file. \
            The file name is "epoch-{epoch number}.iter".')
        parser.add_argument(
            '--resume-model', type=str, default=None,
            help='Load model definition file to use for resuming training \
            (it\'s necessary when you resume a training). \
            The file name is "epoch-{epoch number}.mode"')
        parser.add_argument(
            '--resume-opt', type=str, default=None,
            help='Load optimization states from this file \
            (it\'s necessary when you resume a training). \
            The file name is "epoch-{epoch number}.state"')
        parser.add_argument(
            '--hyperdash', type=str, default=None,
            help='If you use hyperdash logging, enter here the name of experiment. Before using, you have to login to hyperdash with "hyperdash login --github". The default is None that means no logging with hyperdash')
        args = parser.parse_args()
        return args
# Set seeds seed(args.seed) state_dim = env.observation_space.shape action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy policy = DDPG(state_dim, action_dim, max_action, net_type="cnn") replay_buffer = utils.ReplayBuffer(args.replay_buffer_max_size) # Evaluate untrained policy evaluations = [evaluate_policy(env, policy)] exp.metric("rewards", evaluations[0]) total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 done = True episode_reward = None env_counter = 0 while total_timesteps < args.max_timesteps: if done: if total_timesteps != 0: print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (total_timesteps, episode_num, episode_timesteps, episode_reward))
next_sim = robot.observe() variable = data_to_var(next_sim, current_real, ds.action[epi, frame]) delta = double_squeeze(net.forward(variable)) next_real = to_var(next_sim).float() + delta robot.set(next_real.data.cpu().numpy()) target = to_var(ds.next_real[epi, frame], volatile=True) loss = loss_function(next_real, target) losses += loss diffs += F.mse_loss( to_var(next_sim).float(), to_var(ds.next_real[epi, frame])).clone().cpu().data.numpy()[0] exp.metric("loss episode", losses.cpu().data.numpy()[0]) exp.metric("diff episode", diffs) exp.metric("epoch", epoch) losses.backward() optimizer.step() del losses del loss save_model(net.state_dict()) robot.close()
class condGANTrainer(object):
    """Trainer for a conditional text-to-shape GAN.

    Loads frozen pretrained text (RNN) and image (CNN) encoders, builds a
    generator/discriminator pair, and runs adversarial training with
    DAMSM-style word/sentence matching losses, hyperdash metric logging,
    periodic image rendering, and per-epoch checkpointing.

    Fixes vs. the original:
      * optimizerG/optimizerD had their learning rates swapped
        (G used DISCRIMINATOR_LR and vice versa).
      * ``errD_total = cond_real_errD + cond_fake_errD / 2.`` divided only
        the fake term due to operator precedence; both terms are averaged now.
      * the DAMSM ``labels`` tensor was never moved to the compute device,
        crashing words_loss/sent_loss on CUDA.
    """

    def __init__(self, output_dir, data_loader, n_words, ixtoword):
        """
        Args:
            output_dir: root directory; 'Model' and 'Image' subdirs are
                created under it when cfg.TRAIN.FLAG is set.
            data_loader: iterable of (shape, cap, cap_len, cls_id, key)
                batches; len() gives the number of batches per epoch.
            n_words: vocabulary size for the RNN text encoder.
            ixtoword: index -> word mapping, used when rendering images.
        """
        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(output_dir, 'Model')
            self.image_dir = os.path.join(output_dir, 'Image')
            mkdir_p(self.model_dir)
            mkdir_p(self.image_dir)

        # torch.cuda.set_device(cfg.GPU_ID)
        cudnn.benchmark = True

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.max_epoch = cfg.TRAIN.MAX_EPOCH
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL
        self.n_words = n_words
        self.ixtoword = ixtoword
        self.data_loader = data_loader
        self.num_batches = len(self.data_loader)
        self.start_epoch = 0
        # Hyperdash experiment for metric streaming.
        self.exp = Experiment("t2s", capture_io=False,
                              api_key_getter=get_api_key_from_env)

    def build_models(self):
        """Build encoders, G/D networks and their optimizers.

        Side effects: sets self.netG/self.netD (DataParallel-wrapped),
        self.optimizerG/self.optimizerD and, when resuming from
        cfg.TRAIN.NET_G, self.start_epoch.

        Returns:
            [text_encoder, image_encoder] — both frozen and in eval mode.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Frozen pretrained image encoder (path derived from the text
        # encoder path by name substitution).
        image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
        img_encoder_path = cfg.TRAIN.NET_E.replace('text_encoder',
                                                   'image_encoder')
        state_dict = torch.load(img_encoder_path,
                                map_location=lambda storage, loc: storage)
        image_encoder.load_state_dict(state_dict)
        for p in image_encoder.parameters():
            p.requires_grad = False
        print('Load image encoder from:', img_encoder_path)
        image_encoder.eval()
        image_encoder = image_encoder.to(device)

        # Frozen pretrained text encoder.
        text_encoder = RNN_ENCODER(self.n_words,
                                   nhidden=cfg.TEXT.EMBEDDING_DIM)
        state_dict = torch.load(cfg.TRAIN.NET_E,
                                map_location=lambda storage, loc: storage)
        text_encoder.load_state_dict(state_dict)
        for p in text_encoder.parameters():
            p.requires_grad = False
        print('Load text encoder from:', cfg.TRAIN.NET_E)
        text_encoder.eval()
        text_encoder = text_encoder.to(device)

        netG = G_NET().to(device)
        netD = D_NET_32().to(device)
        self.netG = torch.nn.DataParallel(netG)
        self.netD = torch.nn.DataParallel(netD)

        # FIX: learning rates were swapped — G now uses GENERATOR_LR and
        # D uses DISCRIMINATOR_LR.
        self.optimizerG = optim.Adam(self.netG.parameters(),
                                     lr=cfg.TRAIN.GENERATOR_LR,
                                     betas=(0.5, 0.999))
        self.optimizerD = optim.Adam(self.netD.parameters(),
                                     lr=cfg.TRAIN.DISCRIMINATOR_LR,
                                     betas=(0.5, 0.999))

        epoch = 0
        if cfg.TRAIN.NET_G != '':
            # Resume: load G, parse the epoch number out of the filename
            # ("..._<epoch>.pth"), and load the matching D checkpoint.
            state_dict = torch.load(cfg.TRAIN.NET_G,
                                    map_location=lambda storage, loc: storage)
            self.netG.load_state_dict(state_dict)
            print('Load G from: ', cfg.TRAIN.NET_G)
            istart = cfg.TRAIN.NET_G.rfind('_') + 1
            iend = cfg.TRAIN.NET_G.rfind('.')
            epoch = int(cfg.TRAIN.NET_G[istart:iend]) + 1
            self.start_epoch = epoch
            state_dict = torch.load(cfg.TRAIN.NET_G.replace('G', 'D'),
                                    map_location=lambda storage, loc: storage)
            self.netD.load_state_dict(state_dict)
            print('Load D from: ', cfg.TRAIN.NET_G.replace('G', 'D'))
        return [text_encoder, image_encoder]

    def train(self):
        """Run the adversarial training loop for self.max_epoch epochs."""
        text_encoder, image_encoder = self.build_models()
        netG, netD = self.netG, self.netD
        optimizerG = self.optimizerG
        optimizerD = self.optimizerD
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        start_epoch = 0
        nz = cfg.GAN.Z_DIM
        noise = Variable(torch.FloatTensor(self.batch_size, nz)).to(device)
        # Kept (though unused) so the RNG consumption matches the original.
        fixed_noise = Variable(
            torch.FloatTensor(self.batch_size, nz).normal_(0, 1)).to(device)

        for epoch in range(start_epoch, self.max_epoch):
            epoch = epoch + self.start_epoch  # shift when resuming
            for step, data in enumerate(self.data_loader):
                start_t = time.time()

                #######################################################
                # (1) Prepare batch: sort by caption length (packed RNN)
                #######################################################
                shape, cap, cap_len, cls_id, key = data
                sorted_cap_lens, sorted_cap_indices = torch.sort(
                    cap_len, 0, True)
                shapes = shape[sorted_cap_indices].squeeze().to(device)
                captions = cap[sorted_cap_indices].squeeze().to(device)
                class_ids = cls_id[sorted_cap_indices].squeeze().numpy()

                hidden = text_encoder.init_hidden(self.batch_size)
                # words_embs: batch_size x nef x seq_len
                # sent_emb:   batch_size x nef
                words_embs, sent_emb = text_encoder(
                    captions, sorted_cap_lens, hidden)
                # Padding mask (token id 0); currently unused downstream.
                mask = (captions == 0)
                num_words = words_embs.size(2)
                if mask.size(1) > num_words:
                    mask = mask[:, :num_words]

                #######################################################
                # (2) Generate fake shapes
                #######################################################
                noise.data.normal_(0, 1)
                fake_shapes, mu, logvar = netG(noise, sent_emb)

                #######################################################
                # (3) Update D network
                #######################################################
                real_labels = torch.FloatTensor(
                    self.batch_size).fill_(1).to(device)
                fake_labels = torch.FloatTensor(
                    self.batch_size).fill_(0).to(device)
                netD.zero_grad()
                real_features = netD(shapes).to(device)
                cond_real_errD = nn.BCELoss()(real_features, real_labels)
                fake_features = netD(fake_shapes).to(device)
                cond_fake_errD = nn.BCELoss()(fake_features, fake_labels)
                # FIX: average BOTH terms; the original precedence divided
                # only cond_fake_errD by 2.
                errD_total = (cond_real_errD + cond_fake_errD) / 2.

                # D accuracy: real scores >= 0.5 and fake scores <= 0.5.
                d_real_acu = torch.ge(real_features.squeeze(), 0.5).float()
                d_fake_acu = torch.le(fake_features.squeeze(), 0.5).float()
                d_total_acu = torch.mean(torch.cat((d_real_acu, d_fake_acu), 0))

                # Only train D while it is not already too strong, to keep
                # the adversarial game balanced.
                if d_total_acu < 0.85:
                    errD_total.backward(retain_graph=True)
                    optimizerD.step()

                #######################################################
                # (4) Update G network: maximize log(D(G(z)))
                #######################################################
                netG.zero_grad()
                # FIX: labels must live on the same device as the encoder
                # outputs or words_loss/sent_loss fail on CUDA.
                labels = Variable(
                    torch.LongTensor(range(self.batch_size))).to(device)
                real_labels = torch.FloatTensor(
                    self.batch_size).fill_(1).to(device)
                real_features = netD(fake_shapes)
                cond_real_errG = nn.BCELoss()(real_features, real_labels)
                kl_loss = KL_loss(mu, logvar)
                errG_total = kl_loss + cond_real_errG

                # DAMSM word/sentence matching losses every 10 steps only —
                # the image encoder pass is expensive.
                if step % 10 == 0:
                    region_features, cnn_code = image_encoder(fake_shapes)
                    w_loss0, w_loss1, _ = words_loss(
                        region_features, words_embs, labels,
                        sorted_cap_lens, class_ids, self.batch_size)
                    w_loss = (w_loss0 + w_loss1) * cfg.TRAIN.SMOOTH.LAMBDA
                    s_loss0, s_loss1 = sent_loss(
                        cnn_code, sent_emb, labels, class_ids,
                        self.batch_size)
                    s_loss = (s_loss0 + s_loss1) * cfg.TRAIN.SMOOTH.LAMBDA
                    errG_total += s_loss + w_loss
                    self.exp.metric('s_loss', s_loss.item())
                    self.exp.metric('w_loss', w_loss.item())

                errG_total.backward()
                optimizerG.step()
                end_t = time.time()

                self.exp.metric('d_loss', errD_total.item())
                self.exp.metric('g_loss', errG_total.item())
                self.exp.metric('act', d_total_acu.item())
                print('''[%d/%d][%d/%d] Loss_D: %.2f Loss_G: %.2f Time: %.2fs'''
                      % (epoch, self.max_epoch, step, self.num_batches,
                         errD_total.item(), errG_total.item(),
                         end_t - start_t))

                if step % 500 == 0:
                    fullpath = '%s/lean_%d_%d.png' % (self.image_dir,
                                                      epoch, step)
                    build_images(fake_shapes, captions, self.ixtoword,
                                 fullpath)

            # NOTE(review): checkpoint filenames depend only on `epoch`, so
            # per-epoch saving is assumed here (the garbled original made the
            # save placement ambiguous) — confirm against training logs.
            torch.save(netG.state_dict(),
                       '%s/netG_epoch_%d.pth' % (self.model_dir, epoch))
            torch.save(netD.state_dict(),
                       '%s/netD_epoch_%d.pth' % (self.model_dir, epoch))
            print('Save G/Ds models.')